- Merge in OFED 1.5.3 from projects/ofed/head

author: jeff <jeff@FreeBSD.org> 2011-03-21 09:58:24 +0000
committer: jeff <jeff@FreeBSD.org> 2011-03-21 09:58:24 +0000
commit: 5115240a6cdc054f7eea804355742f97c74578d8 (patch)
tree: 3051c12f4ce44a65c025b72ec5821b35b2ec46be /sys
parent: 2d7d8c05e7404fbebf1f0fe24c13bc5bb58d2338 (diff)
download: FreeBSD-src-5115240a6cdc054f7eea804355742f97c74578d8.zip
FreeBSD-src-5115240a6cdc054f7eea804355742f97c74578d8.tar.gz
277 files changed, 104288 insertions, 0 deletions
diff --git a/sys/modules/Makefile b/sys/modules/Makefile
index 41ed4bf..0925a21 100644
--- a/sys/modules/Makefile
+++ b/sys/modules/Makefile
@@ -185,6 +185,9 @@ SUBDIR=	${_3dfx} \
 	mfi \
 	mii \
 	mlx \
+	mlx4 \
+	mlx4ib \
+	mlxen \
 	${_mly} \
 	mmc \
 	mmcsd \
@@ -195,6 +198,7 @@ SUBDIR=	${_3dfx} \
 	msdosfs_iconv \
 	${_mse} \
 	msk \
+	mthca \
 	mvs \
 	mwl \
 	mwlfw \
diff --git a/sys/modules/mlx4/Makefile b/sys/modules/mlx4/Makefile
new file mode 100644
index 0000000..8ea3340
--- /dev/null
+++ b/sys/modules/mlx4/Makefile
@@ -0,0 +1,14 @@
+# $FreeBSD$
+.PATH:  ${.CURDIR}/../../ofed/drivers/net/mlx4
+KMOD    = mlx4
+SRCS    = device_if.h bus_if.h pci_if.h vnode_if.h
+SRCS+=	alloc.c catas.c cmd.c cq.c eq.c fw.c icm.c intf.c main.c mcg.c mr.c
+SRCS+=	pd.c port.c profile.c qp.c reset.c sense.c srq.c xrcd.c
+
+CFLAGS+= -I${.CURDIR}/../../ofed/drivers/net/mlx4
+CFLAGS+= -I${.CURDIR}/../../ofed/include/
+CFLAGS+= -DINET6
+
+.include <bsd.kmod.mk>
+
+CFLAGS+= -Wno-cast-qual -Wno-pointer-arith -fms-extensions
diff --git a/sys/modules/mlx4ib/Makefile b/sys/modules/mlx4ib/Makefile
new file mode 100644
index 0000000..5fe01e7
--- /dev/null
+++ b/sys/modules/mlx4ib/Makefile
@@ -0,0 +1,11 @@
+# $FreeBSD$
+.PATH:  ${.CURDIR}/../../ofed/drivers/infiniband/hw/mlx4
+KMOD    = mlx4ib
+SRCS    = device_if.h bus_if.h pci_if.h vnode_if.h
+SRCS+=	ah.c cq.c doorbell.c mad.c main.c mr.c qp.c srq.c wc.c
+
+CFLAGS+= -I${.CURDIR}/../../ofed/include/ -DINET6
+
+.include <bsd.kmod.mk>
+
+CFLAGS+= -Wno-cast-qual -Wno-pointer-arith -fms-extensions
diff --git a/sys/modules/mlxen/Makefile b/sys/modules/mlxen/Makefile
new file mode 100644
index 0000000..b83b4a5
--- /dev/null
+++ b/sys/modules/mlxen/Makefile
@@ -0,0 +1,13 @@
+# $FreeBSD$
+.PATH:  ${.CURDIR}/../../ofed/drivers/net/mlx4
+KMOD    = mlxen
+SRCS    = device_if.h bus_if.h pci_if.h vnode_if.h
+SRCS	+= en_cq.c en_frag.c en_main.c en_netdev.c en_port.c en_resources.c
+SRCS	+= en_rx.c en_tx.c
+CFLAGS+= -I${.CURDIR}/../../ofed/drivers/net/mlx4
+CFLAGS+= -I${.CURDIR}/../../ofed/include/
+CFLAGS+= -DINET6
+
+.include <bsd.kmod.mk>
+
+CFLAGS+= -Wno-cast-qual -Wno-pointer-arith -fms-extensions
diff --git a/sys/modules/mthca/Makefile b/sys/modules/mthca/Makefile
new file mode 100644
index 0000000..de860fe
--- /dev/null
+++ b/sys/modules/mthca/Makefile
@@ -0,0 +1,15 @@
+# $FreeBSD$
+
+.PATH:  ${.CURDIR}/../../ofed/drivers/infiniband/hw/mthca
+KMOD    = mthca
+SRCS    = device_if.h bus_if.h pci_if.h vnode_if.h
+SRCS+=	mthca_allocator.c mthca_av.c mthca_catas.c mthca_cmd.c mthca_cq.c
+SRCS+=	mthca_eq.c mthca_mad.c mthca_main.c mthca_mcg.c mthca_memfree.c
+SRCS+=	mthca_mr.c mthca_pd.c mthca_profile.c mthca_provider.c mthca_qp.c
+SRCS+=	mthca_reset.c mthca_srq.c mthca_uar.c
+
+CFLAGS+= -I${.CURDIR}/../../ofed/include/ -DINET6
+
+.include <bsd.kmod.mk>
+
+CFLAGS+= -Wno-cast-qual -Wno-pointer-arith -fms-extensions
diff --git a/sys/ofed/drivers/infiniband/Kconfig b/sys/ofed/drivers/infiniband/Kconfig
new file mode 100644
index 0000000..0a2ef11
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/Kconfig
@@ -0,0 +1,66 @@
+menuconfig INFINIBAND
+	tristate "InfiniBand support"
+	depends on PCI || BROKEN
+	depends on HAS_IOMEM
+	---help---
+	  Core support for InfiniBand (IB).  Make sure to also select
+	  any protocols you wish to use as well as drivers for your
+	  InfiniBand hardware.
+
+if INFINIBAND
+
+config INFINIBAND_USER_MAD
+	tristate "InfiniBand userspace MAD support"
+	depends on INFINIBAND
+	---help---
+	  Userspace InfiniBand Management Datagram (MAD) support.  This
+	  is the kernel side of the userspace MAD support, which allows
+	  userspace processes to send and receive MADs. You will also
+	  need libibumad from <http://www.openib.org>.
+
+config INFINIBAND_USER_ACCESS
+	tristate "InfiniBand userspace access (verbs and CM)"
+	---help---
+	  Userspace InfiniBand access support.  This enables the
+	  kernel side of userspace verbs and the userspace
+	  communication manager (CM).  This allows userspace processes
+	  to set up connections and directly access InfiniBand
+	  hardware for fast-path operations.  You will also need
+	  libibverbs, libibcm and a hardware driver library from
+	  <http://www.openib.org>.
+
+config INFINIBAND_USER_MEM
+	bool
+	depends on INFINIBAND_USER_ACCESS != n
+	default y
+
+config INFINIBAND_ADDR_TRANS
+	bool
+	depends on INET
+	depends on !(INFINIBAND = y && IPV6 = m)
+	default y
+
+source "drivers/infiniband/hw/mthca/Kconfig"
+source "drivers/infiniband/hw/ipath/Kconfig"
+source "drivers/infiniband/hw/qib/Kconfig"
+source "drivers/infiniband/hw/ehca/Kconfig"
+source "drivers/infiniband/hw/amso1100/Kconfig"
+source "drivers/infiniband/hw/cxgb3/Kconfig"
+source "drivers/infiniband/hw/mlx4/Kconfig"
+source "drivers/infiniband/hw/nes/Kconfig"
+
+source "drivers/infiniband/ulp/ipoib/Kconfig"
+
+source "drivers/infiniband/ulp/srp/Kconfig"
+
+source "drivers/infiniband/ulp/srpt/Kconfig"
+
+source "drivers/infiniband/ulp/iser/Kconfig"
+
+source "drivers/infiniband/ulp/sdp/Kconfig"
+
+source "drivers/infiniband/ulp/qlgc_vnic/Kconfig"
+
+source "drivers/infiniband/util/Kconfig"
+
+endif # INFINIBAND
diff --git a/sys/ofed/drivers/infiniband/Makefile b/sys/ofed/drivers/infiniband/Makefile
new file mode 100644
index 0000000..ea5dbe0
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/Makefile
@@ -0,0 +1,17 @@
+obj-$(CONFIG_INFINIBAND)		+= core/
+obj-$(CONFIG_INFINIBAND_MTHCA)		+= hw/mthca/
+obj-$(CONFIG_INFINIBAND_IPATH)		+= hw/ipath/
+obj-$(CONFIG_INFINIBAND_QIB)		+= hw/qib/
+obj-$(CONFIG_INFINIBAND_EHCA)		+= hw/ehca/
+obj-$(CONFIG_INFINIBAND_AMSO1100)	+= hw/amso1100/
+obj-$(CONFIG_INFINIBAND_CXGB3)		+= hw/cxgb3/
+obj-$(CONFIG_INFINIBAND_NES)		+= hw/nes/
+obj-$(CONFIG_MLX4_INFINIBAND)		+= hw/mlx4/
+obj-$(CONFIG_INFINIBAND_NES)		+= hw/nes/
+obj-$(CONFIG_INFINIBAND_IPOIB)		+= ulp/ipoib/
+obj-$(CONFIG_INFINIBAND_SRP)		+= ulp/srp/
+obj-$(CONFIG_INFINIBAND_SRPT)		+= ulp/srpt/
+obj-$(CONFIG_INFINIBAND_ISER)		+= ulp/iser/
+obj-$(CONFIG_INFINIBAND_SDP)		+= ulp/sdp/
+obj-$(CONFIG_INFINIBAND_QLGC_VNIC)	+= ulp/qlgc_vnic/
+obj-$(CONFIG_INFINIBAND_MADEYE)		+= util/
diff --git a/sys/ofed/drivers/infiniband/core/Makefile b/sys/ofed/drivers/infiniband/core/Makefile
new file mode 100644
index 0000000..f646040
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/Makefile
@@ -0,0 +1,32 @@
+infiniband-$(CONFIG_INFINIBAND_ADDR_TRANS)	:= ib_addr.o rdma_cm.o
+user_access-$(CONFIG_INFINIBAND_ADDR_TRANS)	:= rdma_ucm.o
+
+obj-$(CONFIG_INFINIBAND) +=		ib_core.o ib_mad.o ib_sa.o \
+					ib_cm.o iw_cm.o $(infiniband-y)
+obj-$(CONFIG_INFINIBAND_USER_MAD) +=	ib_umad.o
+obj-$(CONFIG_INFINIBAND_USER_ACCESS) +=	ib_uverbs.o ib_ucm.o \
+					$(user_access-y)
+
+ib_core-y :=			packer.o ud_header.o verbs.o sysfs.o \
+				device.o fmr_pool.o cache.o
+ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
+
+ib_mad-y :=			mad.o smi.o agent.o mad_rmpp.o
+
+ib_sa-y :=			sa_query.o multicast.o notice.o local_sa.o
+
+ib_cm-y :=			cm.o
+
+iw_cm-y :=			iwcm.o
+
+rdma_cm-y :=			cma.o
+
+rdma_ucm-y :=			ucma.o
+
+ib_addr-y :=			addr.o
+
+ib_umad-y :=			user_mad.o
+
+ib_ucm-y :=			ucm.o
+
+ib_uverbs-y :=			uverbs_main.o uverbs_cmd.o uverbs_marshall.o
diff --git a/sys/ofed/drivers/infiniband/core/addr.c b/sys/ofed/drivers/infiniband/core/addr.c
new file mode 100644
index 0000000..cde4037
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/addr.c
@@ -0,0 +1,630 @@
+/*
+ * Copyright (c) 2005 Voltaire Inc.  All rights reserved.
+ * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved.
+ * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mutex.h>
+#include <linux/inetdevice.h>
+#include <linux/workqueue.h>
+#include <net/arp.h>
+#include <net/neighbour.h>
+#include <net/route.h>
+#include <net/netevent.h>
+#include <net/addrconf.h>
+#include <net/ip6_route.h>
+#include <rdma/ib_addr.h>
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("IB Address Translation");
+MODULE_LICENSE("Dual BSD/GPL");
+
+struct addr_req {	/* One address-resolution request tracked by this file. */
+	struct list_head list;			/* linkage on req_list, later on a local done list */
+	struct sockaddr_storage src_addr;	/* source address (copied from the caller if given) */
+	struct sockaddr_storage dst_addr;	/* copy of the destination address to resolve */
+	struct rdma_dev_addr *addr;		/* caller-supplied result, passed to addr_resolve() */
+	struct rdma_addr_client *client;	/* client whose refcount this request holds */
+	void *context;				/* opaque value handed back to callback */
+	void (*callback)(int status, struct sockaddr *src_addr,
+			 struct rdma_dev_addr *addr, void *context);
+	unsigned long timeout;			/* absolute deadline, in jiffies */
+	int status;				/* latest result; -ENODATA means retry later */
+};
+
+static void process_req(struct work_struct *work);
+
+static DEFINE_MUTEX(lock);	/* held around every req_list manipulation in this file */
+static LIST_HEAD(req_list);	/* queued requests; queue_req() inserts in timeout order */
+static struct delayed_work work;	/* bound to process_req() in addr_init() */
+static struct workqueue_struct *addr_wq;	/* "ib_addr" workqueue created in addr_init() */
+
+void rdma_addr_register_client(struct rdma_addr_client *client)
+{	/* Prepare a client for use: one reference held and a fresh completion. */
+	atomic_set(&client->refcount, 1);
+	init_completion(&client->comp);
+}
+EXPORT_SYMBOL(rdma_addr_register_client);
+
+static inline void put_client(struct rdma_addr_client *client)
+{	/* Drop one client reference; the drop that reaches zero signals client->comp. */
+	if (atomic_dec_and_test(&client->refcount))
+		complete(&client->comp);
+}
+
+void rdma_addr_unregister_client(struct rdma_addr_client *client)
+{	/* Release the registration reference, then block until every other one is gone. */
+	put_client(client);
+	wait_for_completion(&client->comp);	/* completed by the final put_client() */
+}
+EXPORT_SYMBOL(rdma_addr_unregister_client);
+
+#ifdef __linux__	/* Linux build: the interface is a struct net_device */
+int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev,
+		     const unsigned char *dst_dev_addr)
+{	/* Fill dev_addr from dev; dst_dev_addr may be NULL to leave the destination as is. */
+	dev_addr->dev_type = dev->type;
+	memcpy(dev_addr->src_dev_addr, dev->dev_addr, MAX_ADDR_LEN);
+	memcpy(dev_addr->broadcast, dev->broadcast, MAX_ADDR_LEN);
+	if (dst_dev_addr)
+		memcpy(dev_addr->dst_dev_addr, dst_dev_addr, MAX_ADDR_LEN);
+	dev_addr->bound_dev_if = dev->ifindex;
+	return 0;
+}
+#else	/* non-Linux build: the interface is a struct ifnet */
+int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct ifnet *dev,
+		     const unsigned char *dst_dev_addr)
+{	/* Same job as the variant above, but copies if_addrlen bytes per address; returns 0. */
+	if (dev->if_type == IFT_INFINIBAND)	/* translate if_type into an ARPHRD_* value */
+		dev_addr->dev_type = ARPHRD_INFINIBAND;
+	else if (dev->if_type == IFT_ETHER)
+		dev_addr->dev_type = ARPHRD_ETHER;
+	else
+		dev_addr->dev_type = 0;		/* any other interface type */
+	memcpy(dev_addr->src_dev_addr, IF_LLADDR(dev), dev->if_addrlen);
+	memcpy(dev_addr->broadcast, __DECONST(char *, dev->if_broadcastaddr),
+	    dev->if_addrlen);
+	if (dst_dev_addr)	/* destination is optional */
+		memcpy(dev_addr->dst_dev_addr, dst_dev_addr, dev->if_addrlen);
+	dev_addr->bound_dev_if = dev->if_index;
+	return 0;
+}
+#endif	/* __linux__ */
+EXPORT_SYMBOL(rdma_copy_addr);
+
+int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr)
+{	/* Locate the interface for addr (or the bound one) and copy its addresses into dev_addr. */
+	struct net_device *dev;
+	int ret = -EADDRNOTAVAIL;	/* result unless a branch below sets another */
+
+	if (dev_addr->bound_dev_if) {	/* already bound: look the interface up by index */
+		dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if);
+		if (!dev)
+			return -ENODEV;
+		ret = rdma_copy_addr(dev_addr, dev, NULL);
+		dev_put(dev);
+		return ret;
+	}
+
+	switch (addr->sa_family) {	/* other families fall out with the default ret */
+	case AF_INET:
+		dev = ip_dev_find(NULL,
+			((struct sockaddr_in *) addr)->sin_addr.s_addr);
+
+		if (!dev)
+			return ret;
+
+		ret = rdma_copy_addr(dev_addr, dev, NULL);
+		dev_put(dev);
+		break;
+
+#if defined(INET6)
+	case AF_INET6:
+#ifdef __linux__
+		read_lock(&dev_base_lock);
+		for_each_netdev(&init_net, dev) {	/* first device that passes ipv6_chk_addr() wins */
+			if (ipv6_chk_addr(&init_net,
+					  &((struct sockaddr_in6 *) addr)->sin6_addr,
+					  dev, 1)) {
+				ret = rdma_copy_addr(dev_addr, dev, NULL);
+				break;
+			}
+		}
+		read_unlock(&dev_base_lock);
+#else
+		{
+			struct sockaddr_in6 *sin6;
+			struct ifaddr *ifa;
+			in_port_t port;
+
+			sin6 = (struct sockaddr_in6 *)addr;
+			port = sin6->sin6_port;		/* saved so it can be put back below */
+			sin6->sin6_port = 0;		/* look up with the port cleared */
+			ifa = ifa_ifwithaddr(addr);
+			sin6->sin6_port = port;		/* restore the caller's value */
+			if (ifa == NULL) {
+				ret = -ENODEV;
+				break;
+			}
+			ret = rdma_copy_addr(dev_addr, ifa->ifa_ifp, NULL);
+			ifa_free(ifa);
+			break;
+		}
+#endif
+		break;
+#endif
+	}
+	return ret;
+}
+EXPORT_SYMBOL(rdma_translate_ip);
+
+static void set_timeout(unsigned long time)
+{	/* (Re)schedule the delayed work for the absolute jiffies value given in time. */
+	unsigned long delay;
+
+	cancel_delayed_work(&work);	/* replace any run that is already scheduled */
+
+	delay = time - jiffies;
+	if ((long)delay <= 0)	/* deadline is now or already past */
+		delay = 1;
+
+	queue_delayed_work(addr_wq, &work, delay);
+}
+
+static void queue_req(struct addr_req *req)
+{	/* Insert req into req_list by timeout and re-arm the work if req is now first. */
+	struct addr_req *temp_req;
+
+	mutex_lock(&lock);
+	list_for_each_entry_reverse(temp_req, &req_list, list) {	/* scan from the tail */
+		if (time_after_eq(req->timeout, temp_req->timeout))
+			break;	/* temp_req does not expire after req */
+	}
+
+	list_add(&req->list, &temp_req->list);	/* directly after where the scan stopped */
+
+	if (req_list.next == &req->list)	/* req became the head of the list */
+		set_timeout(req->timeout);
+	mutex_unlock(&lock);
+}
+
+#ifdef __linux__
+static int addr4_resolve(struct sockaddr_in *src_in,
+			 struct sockaddr_in *dst_in,
+			 struct rdma_dev_addr *addr)
+{	/* Linux-only IPv4 resolver. NOTE(review): nothing in this file calls it. */
+	__be32 src_ip = src_in->sin_addr.s_addr;
+	__be32 dst_ip = dst_in->sin_addr.s_addr;
+	struct flowi fl;
+	struct rtable *rt;
+	struct neighbour *neigh;
+	int ret;
+
+	memset(&fl, 0, sizeof fl);	/* flow key: destination, source, bound interface */
+	fl.nl_u.ip4_u.daddr = dst_ip;
+	fl.nl_u.ip4_u.saddr = src_ip;
+	fl.oif = addr->bound_dev_if;
+
+	ret = ip_route_output_key(&init_net, &rt, &fl);
+	if (ret)
+		goto out;
+
+	src_in->sin_family = AF_INET;	/* hand the route's source address back via src_in */
+	src_in->sin_addr.s_addr = rt->rt_src;
+
+	if (rt->idev->dev->flags & IFF_LOOPBACK) {	/* destination is reached via loopback */
+		ret = rdma_translate_ip((struct sockaddr *) dst_in, addr);
+		if (!ret)	/* destination hardware address equals the source one */
+			memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN);
+		goto put;
+	}
+
+	/* If the device does ARP internally, return 'done' */
+	if (rt->idev->dev->flags & IFF_NOARP) {
+		rdma_copy_addr(addr, rt->idev->dev, NULL);
+		goto put;
+	}
+
+	neigh = neigh_lookup(&arp_tbl, &rt->rt_gateway, rt->idev->dev);
+	if (!neigh || !(neigh->nud_state & NUD_VALID)) {	/* no usable neighbour entry yet */
+		neigh_event_send(rt->u.dst.neighbour, NULL);
+		ret = -ENODATA;
+		if (neigh)	/* only release a neighbour we actually got */
+			goto release;
+		goto put;
+	}
+
+	ret = rdma_copy_addr(addr, neigh->dev, neigh->ha);
+release:
+	neigh_release(neigh);
+put:
+	ip_rt_put(rt);
+out:
+	return ret;
+}
+
+#if defined(INET6)
+static int addr6_resolve(struct sockaddr_in6 *src_in,
+			 struct sockaddr_in6 *dst_in,
+			 struct rdma_dev_addr *addr)
+{	/* Linux-only IPv6 resolver. NOTE(review): nothing in this file calls it. */
+	struct flowi fl;
+	struct neighbour *neigh;
+	struct dst_entry *dst;
+	int ret;
+
+	memset(&fl, 0, sizeof fl);	/* flow key: destination, source, bound interface */
+	ipv6_addr_copy(&fl.fl6_dst, &dst_in->sin6_addr);
+	ipv6_addr_copy(&fl.fl6_src, &src_in->sin6_addr);
+	fl.oif = addr->bound_dev_if;
+
+	dst = ip6_route_output(&init_net, NULL, &fl);
+	if ((ret = dst->error))	/* the route entry carries the lookup error */
+		goto put;
+
+	if (ipv6_addr_any(&fl.fl6_src)) {	/* no source supplied: pick one and report it back */
+		ret = ipv6_dev_get_saddr(&init_net, ip6_dst_idev(dst)->dev,
+					 &fl.fl6_dst, 0, &fl.fl6_src);
+		if (ret)
+			goto put;
+
+		src_in->sin6_family = AF_INET6;
+		ipv6_addr_copy(&src_in->sin6_addr, &fl.fl6_src);
+	}
+
+	if (dst->dev->flags & IFF_LOOPBACK) {	/* destination is reached via loopback */
+		ret = rdma_translate_ip((struct sockaddr *) dst_in, addr);
+		if (!ret)	/* destination hardware address equals the source one */
+			memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN);
+		goto put;
+	}
+
+	/* If the device does ARP internally, return 'done' */
+	if (dst->dev->flags & IFF_NOARP) {
+		ret = rdma_copy_addr(addr, dst->dev, NULL);
+		goto put;
+	}
+	
+	neigh = dst->neighbour;
+	if (!neigh || !(neigh->nud_state & NUD_VALID)) {	/* no usable neighbour entry yet */
+		neigh_event_send(dst->neighbour, NULL);
+		ret = -ENODATA;
+		goto put;
+	}
+
+	ret = rdma_copy_addr(addr, dst->dev, neigh->ha);
+put:
+	dst_release(dst);
+	return ret;
+}
+#else	/* !INET6: IPv6 resolution always fails */
+static int addr6_resolve(struct sockaddr_in6 *src_in,
+			 struct sockaddr_in6 *dst_in,
+			 struct rdma_dev_addr *addr)
+{
+	return -EADDRNOTAVAIL;
+}
+#endif	/* INET6 */
+
+#else
+#include <netinet/if_ether.h>
+
+static int addr_resolve(struct sockaddr *src_in,
+			struct sockaddr *dst_in,
+			struct rdma_dev_addr *addr)
+{	/* Resolve dst_in into addr. Returns 0, -ENODATA on EWOULDBLOCK, or another negative errno. */
+	struct sockaddr_in *sin;
+	struct sockaddr_in6 *sin6;
+	struct ifaddr *ifa;
+	struct ifnet *ifp;
+	struct llentry *lle;
+	struct rtentry *rte;
+	in_port_t port;		/* only assigned when a source address is supplied */
+	u_char edst[MAX_ADDR_LEN];	/* receives the resolved link-layer address */
+	int multi;
+	int bcast;
+	int error;
+
+	/*
+	 * Determine whether the address is unicast, multicast, or broadcast
+	 * and whether the source interface is valid.
+	 */
+	multi = 0;
+	bcast = 0;
+	sin = NULL;
+	sin6 = NULL;
+	ifp = NULL;
+	rte = NULL;
+	switch (dst_in->sa_family) {
+	case AF_INET:
+		sin = (struct sockaddr_in *)dst_in;	/* classify the destination first */
+		if (sin->sin_addr.s_addr == INADDR_BROADCAST)
+			bcast = 1;
+		if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
+			multi = 1;
+		sin = (struct sockaddr_in *)src_in;	/* from here on sin is the source */
+		if (sin->sin_addr.s_addr != INADDR_ANY) {
+			/*
+			 * Address comparison fails if the port is set;
+			 * cache it here to be restored later.
+			 */
+			port = sin->sin_port;
+			sin->sin_port = 0;
+			memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
+		} else
+			src_in = NULL; 	/* no source given: skip the interface lookup below */
+		break;
+#ifdef INET6
+	case AF_INET6:
+		sin6 = (struct sockaddr_in6 *)dst_in;	/* classify the destination first */
+		if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
+			multi = 1;
+		sin6 = (struct sockaddr_in6 *)src_in;	/* from here on sin6 is the source */
+		if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
+			port = sin6->sin6_port;		/* same save/clear as the IPv4 case */
+			sin6->sin6_port = 0;
+		} else
+			src_in = NULL;	/* no source given: skip the interface lookup below */
+		break;
+#endif
+	default:
+		return -EINVAL;
+	}
+	/*
+	 * If we have a source address to use look it up first and verify
+	 * that it is a local interface.
+	 */
+	if (src_in) {
+		ifa = ifa_ifwithaddr(src_in);
+		if (sin)	/* put back the port that was cleared for the lookup */
+			sin->sin_port = port;
+		if (sin6)
+			sin6->sin6_port = port;
+		if (ifa == NULL)
+			return -ENETUNREACH;
+		ifp = ifa->ifa_ifp;
+		ifa_free(ifa);
+		if (bcast || multi)	/* interface already known: no route lookup, rte stays NULL */
+			goto mcast;
+	}
+	/*
+	 * Make sure the route exists and has a valid link.
+	 */
+	rte = rtalloc1(dst_in, 1, 0);
+	if (rte == NULL || rte->rt_ifp == NULL || !RT_LINK_IS_UP(rte->rt_ifp)) {
+		if (rte) 	/* release the route we were handed before failing */
+			RTFREE_LOCKED(rte);
+		return -EHOSTUNREACH;
+	}
+	/*
+	 * If it's not multicast or broadcast and the route doesn't match the
+	 * requested interface return unreachable.  Otherwise fetch the
+	 * correct interface pointer and unlock the route.
+	 */
+	if (multi || bcast) {
+		if (ifp == NULL)
+			ifp = rte->rt_ifp;
+		RTFREE_LOCKED(rte);	/* both paths below return without touching rte */
+	} else if (ifp && ifp != rte->rt_ifp) {
+		RTFREE_LOCKED(rte);
+		return -ENETUNREACH;
+	} else {
+		if (ifp == NULL)
+			ifp = rte->rt_ifp;
+		RT_UNLOCK(rte);		/* rte is still used below and released by RTFREE() */
+	}
+mcast:
+	if (bcast)
+		return rdma_copy_addr(addr, ifp, ifp->if_broadcastaddr);
+	if (multi) {
+		struct sockaddr *llsa;
+
+		error = ifp->if_resolvemulti(ifp, &llsa, dst_in);
+		if (error)
+			return -error;	/* negate the callee's error value */
+		error = rdma_copy_addr(addr, ifp,
+		    LLADDR((struct sockaddr_dl *)llsa));
+		free(llsa, M_IFMADDR);	/* llsa was produced by if_resolvemulti() above */
+		return error;
+	}
+	/*
+	 * Resolve the link local address.
+	 */
+	if (dst_in->sa_family == AF_INET)
+		error = arpresolve(ifp, rte, NULL, dst_in, edst, &lle);
+#ifdef INET6
+	else
+		error = nd6_storelladdr(ifp, NULL, dst_in, (u_char *)edst, &lle);
+#endif
+	RTFREE(rte);
+	if (error == 0)
+		return rdma_copy_addr(addr, ifp, edst);
+	if (error == EWOULDBLOCK)	/* reported as -ENODATA, which process_req() retries */
+		return -ENODATA;
+	return -error;
+}
+
+#endif
+
+static void process_req(struct work_struct *work)
+{	/* Work handler: retry unresolved requests, then run callbacks for finished ones. */
+	struct addr_req *req, *temp_req;
+	struct sockaddr *src_in, *dst_in;
+	struct list_head done_list;	/* requests to complete once lock is dropped */
+
+	INIT_LIST_HEAD(&done_list);
+
+	mutex_lock(&lock);
+	list_for_each_entry_safe(req, temp_req, &req_list, list) {
+		if (req->status == -ENODATA) {	/* still unresolved: try again */
+			src_in = (struct sockaddr *) &req->src_addr;
+			dst_in = (struct sockaddr *) &req->dst_addr;
+			req->status = addr_resolve(src_in, dst_in, req->addr);	/* NOTE(review): only defined in the non-__linux__ branch */
+			if (req->status && time_after_eq(jiffies, req->timeout))
+				req->status = -ETIMEDOUT;	/* failed and the deadline has passed */
+			else if (req->status == -ENODATA)
+				continue;	/* leave it queued for a later pass */
+		}
+		list_move_tail(&req->list, &done_list);
+	}
+
+	if (!list_empty(&req_list)) {	/* re-arm for the request now at the head */
+		req = list_entry(req_list.next, struct addr_req, list);
+		set_timeout(req->timeout);
+	}
+	mutex_unlock(&lock);
+
+	list_for_each_entry_safe(req, temp_req, &done_list, list) {	/* lock is no longer held here */
+		list_del(&req->list);
+		req->callback(req->status, (struct sockaddr *) &req->src_addr,
+			req->addr, req->context);
+		put_client(req->client);	/* pairs with atomic_inc() in rdma_resolve_ip() */
+		kfree(req);
+	}
+}
+
+int rdma_resolve_ip(struct rdma_addr_client *client,
+		    struct sockaddr *src_addr, struct sockaddr *dst_addr,
+		    struct rdma_dev_addr *addr, int timeout_ms,
+		    void (*callback)(int status, struct sockaddr *src_addr,
+				     struct rdma_dev_addr *addr, void *context),
+		    void *context)
+{	/* Queue a resolution of dst_addr. Returns 0 if queued, else a negative errno (no callback). */
+	struct sockaddr *src_in, *dst_in;
+	struct addr_req *req;
+	int ret = 0;
+
+	req = kzalloc(sizeof *req, GFP_KERNEL);
+	if (!req)
+		return -ENOMEM;
+
+	src_in = (struct sockaddr *) &req->src_addr;	/* views onto the request's own storage */
+	dst_in = (struct sockaddr *) &req->dst_addr;
+
+	if (src_addr) {
+		if (src_addr->sa_family != dst_addr->sa_family) {	/* families must agree */
+			ret = -EINVAL;
+			goto err;
+		}
+
+		memcpy(src_in, src_addr, ip_addr_size(src_addr));
+	} else {
+		src_in->sa_family = dst_addr->sa_family;	/* rest of src stays zeroed by kzalloc() */
+	}
+
+	memcpy(dst_in, dst_addr, ip_addr_size(dst_addr));
+	req->addr = addr;
+	req->callback = callback;
+	req->context = context;
+	req->client = client;
+	atomic_inc(&client->refcount);	/* dropped in process_req() or in the default case below */
+
+	req->status = addr_resolve(src_in, dst_in, addr);
+	switch (req->status) {
+	case 0:		/* resolved at once: queue with an immediate deadline */
+		req->timeout = jiffies;
+		queue_req(req);
+		break;
+	case -ENODATA:	/* not resolved yet: queue with the caller's timeout */
+		req->timeout = msecs_to_jiffies(timeout_ms) + jiffies;
+		queue_req(req);
+		break;
+	default:	/* hard failure: report it synchronously */
+		ret = req->status;
+		atomic_dec(&client->refcount);	/* undo the reference taken above */
+		goto err;
+	}
+	return ret;
+err:
+	kfree(req);
+	return ret;
+}
+EXPORT_SYMBOL(rdma_resolve_ip);
+
+void rdma_addr_cancel(struct rdma_dev_addr *addr)
+{	/* Mark the first queued request for addr as cancelled and schedule the work at once. */
+	struct addr_req *req, *temp_req;
+
+	mutex_lock(&lock);
+	list_for_each_entry_safe(req, temp_req, &req_list, list) {
+		if (req->addr == addr) {
+			req->status = -ECANCELED;	/* status its callback will receive */
+			req->timeout = jiffies;
+			list_move(&req->list, &req_list);	/* move to the head of req_list */
+			set_timeout(req->timeout);
+			break;	/* only the first match is cancelled */
+		}
+	}
+	mutex_unlock(&lock);
+}
+EXPORT_SYMBOL(rdma_addr_cancel);
+
+static int netevent_callback(struct notifier_block *self, unsigned long event,
+	void *ctx)
+{	/* Netevent notifier: on a neighbour update, schedule the work immediately. */
+	if (event == NETEVENT_NEIGH_UPDATE) {
+#ifdef __linux__
+		struct neighbour *neigh = ctx;
+
+		if (neigh->nud_state & NUD_VALID) {	/* only when the entry is valid */
+			set_timeout(jiffies);
+		}
+#else	/* ctx is not inspected on this path */
+		set_timeout(jiffies);
+#endif
+	}
+	return 0;	/* always 0, whatever the event */
+}
+
+static struct notifier_block nb = {	/* registered in addr_init(), removed in addr_cleanup() */
+	.notifier_call = netevent_callback
+};
+
+static int addr_init(void)
+{	/* Module init: set up the work item, the workqueue and the netevent notifier. */
+	INIT_DELAYED_WORK(&work, process_req);
+	addr_wq = create_singlethread_workqueue("ib_addr");
+	if (!addr_wq)
+		return -ENOMEM;
+
+	register_netevent_notifier(&nb);	/* return value is not checked */
+	return 0;
+}
+
+static void addr_cleanup(void)
+{	/* Module exit: undo addr_init() in reverse order. */
+	unregister_netevent_notifier(&nb);
+	destroy_workqueue(addr_wq);
+}
+
+module_init(addr_init);
+module_exit(addr_cleanup);
diff --git a/sys/ofed/drivers/infiniband/core/agent.c b/sys/ofed/drivers/infiniband/core/agent.c
new file mode 100644
index 0000000..91916a8
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/agent.c
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2004, 2005 Mellanox Technologies Ltd.  All rights reserved.
+ * Copyright (c) 2004, 2005 Infinicon Corporation.  All rights reserved.
+ * Copyright (c) 2004, 2005 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004, 2005 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004-2007 Voltaire Corporation.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include "agent.h"
+#include "smi.h"
+#include "mad_priv.h"
+
+#define SPFX "ib_agent: "
+
+struct ib_agent_port_private {	/* MAD agents registered for one device port. */
+	struct list_head port_list;	/* linkage on ib_agent_port_list */
+	struct ib_mad_agent *agent[2];	/* [0] SMI agent (NULL if not registered), [1] GSI agent */
+};
+
+static DEFINE_SPINLOCK(ib_agent_port_list_lock);
+static LIST_HEAD(ib_agent_port_list);
+
+static struct ib_agent_port_private *
+__ib_get_agent_port(struct ib_device *device, int port_num)
+{	/* Unlocked lookup; both callers in this file hold ib_agent_port_list_lock. */
+	struct ib_agent_port_private *entry;
+
+	list_for_each_entry(entry, &ib_agent_port_list, port_list) {
+		if (entry->agent[1]->device == device &&	/* matched via the GSI agent */
+		    entry->agent[1]->port_num == port_num)
+			return entry;
+	}
+	return NULL;	/* no entry for this device/port */
+}
+
+static struct ib_agent_port_private *
+ib_get_agent_port(struct ib_device *device, int port_num)
+{	/* Locked wrapper around __ib_get_agent_port(); returns NULL when nothing matches. */
+	struct ib_agent_port_private *entry;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ib_agent_port_list_lock, flags);
+	entry = __ib_get_agent_port(device, port_num);
+	spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
+	return entry;	/* returned after the lock has been released */
+}
+
+void agent_send_response(struct ib_mad *mad, struct ib_grh *grh,
+			 struct ib_wc *wc, struct ib_device *device,
+			 int port_num, int qpn)
+{	/* Post a copy of mad back to the sender described by wc/grh; failures are only logged. */
+	struct ib_agent_port_private *port_priv;
+	struct ib_mad_agent *agent;
+	struct ib_mad_send_buf *send_buf;
+	struct ib_ah *ah;
+	struct ib_mad_send_wr_private *mad_send_wr;
+
+	if (device->node_type == RDMA_NODE_IB_SWITCH)	/* switches are looked up under port 0 */
+		port_priv = ib_get_agent_port(device, 0);
+	else
+		port_priv = ib_get_agent_port(device, port_num);
+
+	if (!port_priv) {
+		printk(KERN_ERR SPFX "Unable to find port agent\n");
+		return;
+	}
+
+	agent = port_priv->agent[qpn];	/* NOTE(review): agent[0] may be NULL; presumably qpn 0 never occurs then - confirm */
+	ah = ib_create_ah_from_wc(agent->qp->pd, wc, grh, port_num);
+	if (IS_ERR(ah)) {
+		printk(KERN_ERR SPFX "ib_create_ah_from_wc error\n");
+		return;
+	}
+
+	send_buf = ib_create_send_mad(agent, wc->src_qp, wc->pkey_index, 0,
+				      IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
+				      GFP_KERNEL);
+	if (IS_ERR(send_buf)) {
+		printk(KERN_ERR SPFX "ib_create_send_mad error\n");
+		goto err1;
+	}
+
+	memcpy(send_buf->mad, mad, sizeof *mad);	/* the response is a copy of the caller's MAD */
+	send_buf->ah = ah;
+
+	if (device->node_type == RDMA_NODE_IB_SWITCH) {	/* switch: set the port in the send work request */
+		mad_send_wr = container_of(send_buf,
+					   struct ib_mad_send_wr_private,
+					   send_buf);
+		mad_send_wr->send_wr.wr.ud.port_num = port_num;
+	}
+
+	if (ib_post_send_mad(send_buf, NULL)) {
+		printk(KERN_ERR SPFX "ib_post_send_mad error\n");
+		goto err2;
+	}
+	return;	/* success: ah and send_buf are presumably freed by agent_send_handler() */
+err2:
+	ib_free_send_mad(send_buf);
+err1:
+	ib_destroy_ah(ah);
+}
+
+static void agent_send_handler(struct ib_mad_agent *mad_agent,
+			       struct ib_mad_send_wc *mad_send_wc)
+{	/* Send handler passed to ib_register_mad_agent(); frees the AH and the send buffer. */
+	ib_destroy_ah(mad_send_wc->send_buf->ah);	/* read ->ah before the buffer is freed */
+	ib_free_send_mad(mad_send_wc->send_buf);
+}
+
+int ib_agent_port_open(struct ib_device *device, int port_num)
+{	/* Register the send-only MAD agents for a port and list them; 0 or a negative errno. */
+	struct ib_agent_port_private *port_priv;
+	unsigned long flags;
+	int ret;
+
+	/* Create new device info */
+	port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL);
+	if (!port_priv) {
+		printk(KERN_ERR SPFX "No memory for ib_agent_port_private\n");
+		ret = -ENOMEM;
+		goto error1;
+	}
+
+	if (rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_INFINIBAND) {
+		/* Obtain send only MAD agent for SMI QP */
+		port_priv->agent[0] = ib_register_mad_agent(device, port_num,
+							    IB_QPT_SMI, NULL, 0,
+							    &agent_send_handler,
+							    NULL, NULL);
+		if (IS_ERR(port_priv->agent[0])) {
+			ret = PTR_ERR(port_priv->agent[0]);
+			goto error2;
+		}
+	}	/* otherwise agent[0] stays NULL from kzalloc() */
+
+	/* Obtain send only MAD agent for GSI QP */
+	port_priv->agent[1] = ib_register_mad_agent(device, port_num,
+						    IB_QPT_GSI, NULL, 0,
+						    &agent_send_handler,
+						    NULL, NULL);
+	if (IS_ERR(port_priv->agent[1])) {
+		ret = PTR_ERR(port_priv->agent[1]);
+		goto error3;
+	}
+
+	spin_lock_irqsave(&ib_agent_port_list_lock, flags);	/* publish the fully set up entry */
+	list_add_tail(&port_priv->port_list, &ib_agent_port_list);
+	spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
+
+	return 0;
+
+error3:
+	if (port_priv->agent[0])	/* the SMI agent exists only on InfiniBand link layers */
+		ib_unregister_mad_agent(port_priv->agent[0]);
+error2:
+	kfree(port_priv);
+error1:
+	return ret;
+}
+
+int ib_agent_port_close(struct ib_device *device, int port_num)
+{	/* Unlist and free a port's agents; returns -ENODEV if the port was never opened. */
+	struct ib_agent_port_private *port_priv;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ib_agent_port_list_lock, flags);
+	port_priv = __ib_get_agent_port(device, port_num);
+	if (port_priv == NULL) {
+		spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
+		printk(KERN_ERR SPFX "Port %d not found\n", port_num);
+		return -ENODEV;
+	}
+	list_del(&port_priv->port_list);	/* unlink while the lock is still held */
+	spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
+
+	ib_unregister_mad_agent(port_priv->agent[1]);
+	if (port_priv->agent[0])	/* the SMI agent is optional, see ib_agent_port_open() */
+		ib_unregister_mad_agent(port_priv->agent[0]);
+
+	kfree(port_priv);
+	return 0;
+}
diff --git a/sys/ofed/drivers/infiniband/core/agent.h b/sys/ofed/drivers/infiniband/core/agent.h
new file mode 100644
index 0000000..6669287
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/agent.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd.  All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation.  All rights reserved.
+ * Copyright (c) 2004 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __AGENT_H_
+#define __AGENT_H_
+
+#include <linux/err.h>
+#include <rdma/ib_mad.h>
+
+/*
+ * Set up agent state for one port: registers send-only MAD agents (an
+ * SMI agent only when the port's link layer is InfiniBand, a GSI agent
+ * always) and links the port into the agent port list.
+ * Returns 0 on success or a negative errno.
+ */
+int ib_agent_port_open(struct ib_device *device, int port_num);
+
+/*
+ * Undo ib_agent_port_open() for one port: unlinks the port's entry,
+ * unregisters its MAD agents and frees it.
+ * Returns 0 on success, or -ENODEV if the port has no entry.
+ */
+int ib_agent_port_close(struct ib_device *device, int port_num);
+
+/*
+ * NOTE(review): the implementation is not visible from this header;
+ * presumably sends @mad as a response to the MAD described by @wc/@grh
+ * on the given port and QP number -- confirm against agent.c.
+ */
+void agent_send_response(struct ib_mad *mad, struct ib_grh *grh,
+			 struct ib_wc *wc, struct ib_device *device,
+			 int port_num, int qpn);
+
+#endif	/* __AGENT_H_ */
diff --git a/sys/ofed/drivers/infiniband/core/cache.c b/sys/ofed/drivers/infiniband/core/cache.c
new file mode 100644
index 0000000..660bff5
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/cache.c
@@ -0,0 +1,398 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+
+#include <rdma/ib_cache.h>
+
+#include "core_priv.h"
+
+struct ib_pkey_cache {	/* cached copy of one port's P_Key table */
+	int             table_len;	/* number of entries in table[] */
+	u16             table[0];	/* entries; storage is allocated together with the header */
+};
+
+struct ib_gid_cache {	/* cached copy of one port's GID table */
+	int             table_len;	/* number of entries in table[] */
+	union ib_gid    table[0];	/* entries; storage is allocated together with the header */
+};
+
+struct ib_update_work {	/* deferred cache refresh request, freed by ib_cache_task() */
+	struct work_struct work;	/* work item handed to schedule_work() */
+	struct ib_device  *device;	/* device whose cache must be refreshed */
+	u8                 port_num;	/* port the triggering event referred to */
+};
+
+/* Lowest port number of @device: 0 for a switch, 1 for anything else. */
+static inline int start_port(struct ib_device *device)
+{
+	if (device->node_type == RDMA_NODE_IB_SWITCH)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Highest port number of @device: 0 for a switch, otherwise the
+ * device's physical port count.
+ */
+static inline int end_port(struct ib_device *device)
+{
+	if (device->node_type == RDMA_NODE_IB_SWITCH)
+		return 0;
+
+	return device->phys_port_cnt;
+}
+
+/*
+ * ib_get_cached_gid - read one entry of a port's cached GID table
+ * @device:   device to query
+ * @port_num: port number, within [start_port(), end_port()]
+ * @index:    index into the cached table
+ * @gid:      receives the cached GID on success
+ *
+ * Returns 0 on success, or -EINVAL when @port_num or @index is out of
+ * range or when no GID table is cached for the port.
+ */
+int ib_get_cached_gid(struct ib_device *device,
+		      u8                port_num,
+		      int               index,
+		      union ib_gid     *gid)
+{
+	struct ib_gid_cache *cache;
+	unsigned long flags;
+	int ret = 0;
+
+	if (port_num < start_port(device) || port_num > end_port(device))
+		return -EINVAL;
+
+	read_lock_irqsave(&device->cache.lock, flags);
+
+	cache = device->cache.gid_cache[port_num - start_port(device)];
+
+	/*
+	 * The slot is still NULL if ib_cache_update() has never managed
+	 * to populate it (it bails out on any query or allocation
+	 * failure), so check it before looking at table_len.
+	 */
+	if (!cache || index < 0 || index >= cache->table_len)
+		ret = -EINVAL;
+	else
+		*gid = cache->table[index];
+
+	read_unlock_irqrestore(&device->cache.lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_get_cached_gid);
+
+/*
+ * ib_find_cached_gid - locate a GID in the cached GID tables of a device
+ * @device:   device to search
+ * @gid:      GID to look for
+ * @port_num: receives the port number owning the GID; set to -1
+ *            (converted to u8) when the GID is not found
+ * @index:    optional; receives the table index of the GID, or -1
+ *            (converted to u16) when the GID is not found
+ *
+ * Every port's table is scanned in port order and the first match wins.
+ * Ports without a cached table are skipped.
+ *
+ * Returns 0 when the GID was found, -ENOENT otherwise.
+ */
+int ib_find_cached_gid(struct ib_device *device,
+		       union ib_gid	*gid,
+		       u8               *port_num,
+		       u16              *index)
+{
+	struct ib_gid_cache *cache;
+	unsigned long flags;
+	int p, i;
+	int ret = -ENOENT;
+
+	*port_num = -1;
+	if (index)
+		*index = -1;
+
+	read_lock_irqsave(&device->cache.lock, flags);
+
+	for (p = 0; p <= end_port(device) - start_port(device); ++p) {
+		cache = device->cache.gid_cache[p];
+		/*
+		 * The slot is still NULL if ib_cache_update() never
+		 * managed to populate it; there is nothing to search.
+		 */
+		if (!cache)
+			continue;
+		for (i = 0; i < cache->table_len; ++i) {
+			if (!memcmp(gid, &cache->table[i], sizeof *gid)) {
+				*port_num = p + start_port(device);
+				if (index)
+					*index = i;
+				ret = 0;
+				goto found;
+			}
+		}
+	}
+found:
+	read_unlock_irqrestore(&device->cache.lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_find_cached_gid);
+
+/*
+ * ib_get_cached_pkey - read one entry of a port's cached P_Key table
+ * @device:   device to query
+ * @port_num: port number, within [start_port(), end_port()]
+ * @index:    index into the cached table
+ * @pkey:     receives the cached P_Key on success
+ *
+ * Returns 0 on success, or -EINVAL when @port_num or @index is out of
+ * range or when no P_Key table is cached for the port.
+ */
+int ib_get_cached_pkey(struct ib_device *device,
+		       u8                port_num,
+		       int               index,
+		       u16              *pkey)
+{
+	struct ib_pkey_cache *cache;
+	unsigned long flags;
+	int ret = 0;
+
+	if (port_num < start_port(device) || port_num > end_port(device))
+		return -EINVAL;
+
+	read_lock_irqsave(&device->cache.lock, flags);
+
+	cache = device->cache.pkey_cache[port_num - start_port(device)];
+
+	/*
+	 * The slot is still NULL if ib_cache_update() has never managed
+	 * to populate it, so check it before looking at table_len.
+	 */
+	if (!cache || index < 0 || index >= cache->table_len)
+		ret = -EINVAL;
+	else
+		*pkey = cache->table[index];
+
+	read_unlock_irqrestore(&device->cache.lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_get_cached_pkey);
+
+/*
+ * ib_find_cached_pkey - locate a P_Key in a port's cached P_Key table
+ * @device:   device to search
+ * @port_num: port number, within [start_port(), end_port()]
+ * @pkey:     P_Key to look for; only the low 15 bits are compared
+ * @index:    receives the index of the first match, or -1 (converted
+ *            to u16) when there is none
+ *
+ * Returns 0 when a match was found, -EINVAL when @port_num is out of
+ * range, and -ENOENT when the table holds no match or no table is
+ * cached for the port.
+ */
+int ib_find_cached_pkey(struct ib_device *device,
+			u8                port_num,
+			u16               pkey,
+			u16              *index)
+{
+	struct ib_pkey_cache *cache;
+	unsigned long flags;
+	int i;
+	int ret = -ENOENT;
+
+	if (port_num < start_port(device) || port_num > end_port(device))
+		return -EINVAL;
+
+	read_lock_irqsave(&device->cache.lock, flags);
+
+	cache = device->cache.pkey_cache[port_num - start_port(device)];
+
+	*index = -1;
+
+	/*
+	 * The slot is still NULL if ib_cache_update() has never managed
+	 * to populate it; treat that like an empty table.
+	 */
+	for (i = 0; cache && i < cache->table_len; ++i)
+		if ((cache->table[i] & 0x7fff) == (pkey & 0x7fff)) {
+			*index = i;
+			ret = 0;
+			break;
+		}
+
+	read_unlock_irqrestore(&device->cache.lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_find_cached_pkey);
+
+/*
+ * ib_get_cached_lmc - read the cached LMC value of a port
+ * @device:   device to query
+ * @port_num: port number, within [start_port(), end_port()]
+ * @lmc:      receives the cached value
+ *
+ * Returns 0 on success, or -EINVAL when @port_num is out of range.
+ */
+int ib_get_cached_lmc(struct ib_device *device,
+		      u8                port_num,
+		      u8                *lmc)
+{
+	unsigned long irq_flags;
+
+	if (port_num < start_port(device) || port_num > end_port(device))
+		return -EINVAL;
+
+	read_lock_irqsave(&device->cache.lock, irq_flags);
+	*lmc = device->cache.lmc_cache[port_num - start_port(device)];
+	read_unlock_irqrestore(&device->cache.lock, irq_flags);
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_get_cached_lmc);
+
+/*
+ * Rebuild the cached P_Key table, GID table and LMC of one port.
+ *
+ * Fresh tables are allocated and filled by querying the device, then
+ * swapped in under the cache write lock; the tables they replace are
+ * freed afterwards.  If any allocation or query fails, the new tables
+ * are discarded and the cache is left exactly as it was.
+ */
+static void ib_cache_update(struct ib_device *device,
+			    u8                port)
+{
+	struct ib_port_attr  *attr;
+	struct ib_pkey_cache *new_pkeys = NULL, *stale_pkeys;
+	struct ib_gid_cache  *new_gids = NULL, *stale_gids;
+	int                   slot = port - start_port(device);
+	int                   idx;
+	int                   rc;
+
+	attr = kmalloc(sizeof *attr, GFP_KERNEL);
+	if (!attr)
+		return;
+
+	rc = ib_query_port(device, port, attr);
+	if (rc) {
+		printk(KERN_WARNING "ib_query_port failed (%d) for %s\n",
+		       rc, device->name);
+		goto discard;
+	}
+
+	/* Table sizes come from the port attributes just queried. */
+	new_pkeys = kmalloc(sizeof *new_pkeys + attr->pkey_tbl_len *
+			    sizeof *new_pkeys->table, GFP_KERNEL);
+	if (!new_pkeys)
+		goto discard;
+
+	new_pkeys->table_len = attr->pkey_tbl_len;
+
+	new_gids = kmalloc(sizeof *new_gids + attr->gid_tbl_len *
+			   sizeof *new_gids->table, GFP_KERNEL);
+	if (!new_gids)
+		goto discard;
+
+	new_gids->table_len = attr->gid_tbl_len;
+
+	for (idx = 0; idx < new_pkeys->table_len; ++idx) {
+		rc = ib_query_pkey(device, port, idx, &new_pkeys->table[idx]);
+		if (rc) {
+			printk(KERN_WARNING "ib_query_pkey failed (%d) for %s (index %d)\n",
+			       rc, device->name, idx);
+			goto discard;
+		}
+	}
+
+	for (idx = 0; idx < new_gids->table_len; ++idx) {
+		rc = ib_query_gid(device, port, idx, &new_gids->table[idx]);
+		if (rc) {
+			printk(KERN_WARNING "ib_query_gid failed (%d) for %s (index %d)\n",
+			       rc, device->name, idx);
+			goto discard;
+		}
+	}
+
+	/* Publish the new tables; readers take the same lock. */
+	write_lock_irq(&device->cache.lock);
+
+	stale_pkeys = device->cache.pkey_cache[slot];
+	stale_gids  = device->cache.gid_cache[slot];
+
+	device->cache.pkey_cache[slot] = new_pkeys;
+	device->cache.gid_cache[slot]  = new_gids;
+	device->cache.lmc_cache[slot]  = attr->lmc;
+
+	write_unlock_irq(&device->cache.lock);
+
+	kfree(stale_pkeys);
+	kfree(stale_gids);
+	kfree(attr);
+	return;
+
+discard:
+	kfree(new_pkeys);
+	kfree(new_gids);
+	kfree(attr);
+}
+
+/*
+ * Work handler queued by ib_cache_event(): refreshes the cache of the
+ * port recorded in the request, then frees the request.
+ */
+static void ib_cache_task(struct work_struct *_work)
+{
+	struct ib_update_work *request;
+
+	request = container_of(_work, struct ib_update_work, work);
+	ib_cache_update(request->device, request->port_num);
+	kfree(request);
+}
+
+/*
+ * Event handler: for every event type listed below, queue a deferred
+ * refresh of the affected port's cache.  The request is allocated with
+ * GFP_ATOMIC; if that allocation fails the event is silently dropped.
+ */
+static void ib_cache_event(struct ib_event_handler *handler,
+			   struct ib_event *event)
+{
+	struct ib_update_work *request;
+
+	switch (event->event) {
+	case IB_EVENT_PORT_ERR:
+	case IB_EVENT_PORT_ACTIVE:
+	case IB_EVENT_LID_CHANGE:
+	case IB_EVENT_PKEY_CHANGE:
+	case IB_EVENT_SM_CHANGE:
+	case IB_EVENT_CLIENT_REREGISTER:
+	case IB_EVENT_GID_CHANGE:
+		break;
+	default:
+		/* Not an event that affects the cached data. */
+		return;
+	}
+
+	request = kmalloc(sizeof *request, GFP_ATOMIC);
+	if (!request)
+		return;
+
+	INIT_WORK(&request->work, ib_cache_task);
+	request->device   = event->device;
+	request->port_num = event->element.port_num;
+	schedule_work(&request->work);
+}
+
+/*
+ * Client "add" callback: allocate the per-port cache arrays of @device,
+ * fill every port's slot through ib_cache_update() and register the
+ * event handler that keeps the cache fresh.
+ *
+ * On failure everything allocated here is released and the array
+ * pointers are reset to NULL.  That lets ib_cache_cleanup_one()
+ * recognise a device whose setup failed instead of freeing the same
+ * memory a second time.
+ */
+static void ib_cache_setup_one(struct ib_device *device)
+{
+	int p;
+
+	rwlock_init(&device->cache.lock);
+
+	device->cache.pkey_cache =
+		kmalloc(sizeof *device->cache.pkey_cache *
+			(end_port(device) - start_port(device) + 1), GFP_KERNEL);
+	device->cache.gid_cache =
+		kmalloc(sizeof *device->cache.gid_cache *
+			(end_port(device) - start_port(device) + 1), GFP_KERNEL);
+
+	device->cache.lmc_cache = kmalloc(sizeof *device->cache.lmc_cache *
+					  (end_port(device) -
+					   start_port(device) + 1),
+					  GFP_KERNEL);
+
+	if (!device->cache.pkey_cache || !device->cache.gid_cache ||
+	    !device->cache.lmc_cache) {
+		printk(KERN_WARNING "Couldn't allocate cache "
+		       "for %s\n", device->name);
+		goto err;
+	}
+
+	/* Slots start out NULL; ib_cache_update() fills them on success. */
+	for (p = 0; p <= end_port(device) - start_port(device); ++p) {
+		device->cache.pkey_cache[p] = NULL;
+		device->cache.gid_cache [p] = NULL;
+		ib_cache_update(device, p + start_port(device));
+	}
+
+	INIT_IB_EVENT_HANDLER(&device->cache.event_handler,
+			      device, ib_cache_event);
+	if (ib_register_event_handler(&device->cache.event_handler))
+		goto err_cache;
+
+	return;
+
+err_cache:
+	for (p = 0; p <= end_port(device) - start_port(device); ++p) {
+		kfree(device->cache.pkey_cache[p]);
+		kfree(device->cache.gid_cache[p]);
+	}
+
+err:
+	kfree(device->cache.pkey_cache);
+	kfree(device->cache.gid_cache);
+	kfree(device->cache.lmc_cache);
+
+	/* Leave no dangling pointers behind for the cleanup callback. */
+	device->cache.pkey_cache = NULL;
+	device->cache.gid_cache  = NULL;
+	device->cache.lmc_cache  = NULL;
+}
+
+/*
+ * Client "remove" callback: unregister the event handler, wait for any
+ * queued refresh work and free all cache memory of @device.
+ *
+ * NOTE(review): this assumes the core may invoke the remove callback
+ * even when the add callback failed (the add callback cannot report
+ * failure) -- confirm against the client registration code.
+ */
+static void ib_cache_cleanup_one(struct ib_device *device)
+{
+	int p;
+
+	/*
+	 * ib_cache_setup_one() failed: no handler was registered, no work
+	 * can be pending and every allocation has already been released.
+	 */
+	if (!device->cache.pkey_cache)
+		return;
+
+	ib_unregister_event_handler(&device->cache.event_handler);
+	flush_scheduled_work();
+
+	for (p = 0; p <= end_port(device) - start_port(device); ++p) {
+		kfree(device->cache.pkey_cache[p]);
+		kfree(device->cache.gid_cache[p]);
+	}
+
+	kfree(device->cache.pkey_cache);
+	kfree(device->cache.gid_cache);
+	kfree(device->cache.lmc_cache);
+}
+
+static struct ib_client cache_client = {	/* registered by ib_cache_setup() */
+	.name   = "cache",
+	.add    = ib_cache_setup_one,	/* builds the per-device cache */
+	.remove = ib_cache_cleanup_one	/* frees the per-device cache */
+};
+
+int __init ib_cache_setup(void)
+{
+	return ib_register_client(&cache_client);	/* result is passed straight back to the caller */
+}
+
+void __exit ib_cache_cleanup(void)
+{
+	ib_unregister_client(&cache_client);	/* undoes ib_cache_setup() */
+}
diff --git a/sys/ofed/drivers/infiniband/core/cm.c b/sys/ofed/drivers/infiniband/core/cm.c
new file mode 100644
index 0000000..24f8b12
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/cm.c
@@ -0,0 +1,3894 @@
+/*
+ * Copyright (c) 2004-2007 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004, 2005 Voltaire Corporation.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/dma-mapping.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/idr.h>
+#include <linux/interrupt.h>
+#include <linux/random.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/sysfs.h>
+#include <linux/workqueue.h>
+#include <linux/kdev_t.h>
+
+#include <rdma/ib_cache.h>
+#include <rdma/ib_cm.h>
+#include "cm_msgs.h"
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("InfiniBand CM");
+MODULE_LICENSE("Dual BSD/GPL");
+
+#define PFX    "ib_cm: "
+
+/*
+ * Limit CM message timeouts to something reasonable:
+ * 8 seconds per message, with up to 15 retries
+ */
+static int max_timeout = 21;	/* module parameter, mode 0644; NOTE(review): presumably an IB timeout exponent (4.096us * 2^n) -- confirm */
+module_param(max_timeout, int, 0644);
+MODULE_PARM_DESC(max_timeout, "Maximum IB CM per message timeout "
+                             "(default=21, or ~8 seconds)");
+
+static void cm_add_one(struct ib_device *device);
+static void cm_remove_one(struct ib_device *device);
+/* ib_client descriptor wiring up the two per-device callbacks forward-declared above. */
+static struct ib_client cm_client = {
+	.name   = "cm",
+	.add    = cm_add_one,
+	.remove = cm_remove_one
+};
+
+static struct ib_cm {	/* file-wide CM state; the single instance is `cm` */
+	spinlock_t lock;	/* taken around local_id_table accesses (cm_alloc_id, cm_free_id, cm_acquire_id) */
+	struct list_head device_list;	/* list of struct cm_device, linked through cm_device.list */
+	rwlock_t device_lock;	/* read-held while walking device_list */
+	struct rb_root listen_service_table;	/* cm_id_private nodes linked through service_node */
+	u64 listen_service_id;
+	/* struct rb_root peer_service_table; todo: fix peer to peer */
+	struct rb_root remote_qp_table;	/* cm_timewait_info keyed by remote QPN, then remote CA GUID */
+	struct rb_root remote_id_table;	/* cm_timewait_info keyed by remote comm ID, then remote CA GUID */
+	struct rb_root remote_sidr_table;	/* cm_id_private keyed by remote ID, then destination GID */
+	struct idr local_id_table;	/* maps idr ids to cm_id_private */
+	__be32 random_id_operand;	/* XORed with the idr id to form id.local_id */
+	struct list_head timewait_list;
+	struct workqueue_struct *wq;
+} cm;
+
+/* Counter indexes ordered by attribute ID */
+enum {
+	CM_REQ_COUNTER,
+	CM_MRA_COUNTER,
+	CM_REJ_COUNTER,
+	CM_REP_COUNTER,
+	CM_RTU_COUNTER,
+	CM_DREQ_COUNTER,
+	CM_DREP_COUNTER,
+	CM_SIDR_REQ_COUNTER,
+	CM_SIDR_REP_COUNTER,
+	CM_LAP_COUNTER,
+	CM_APR_COUNTER,
+	CM_ATTR_COUNT,	/* number of counters above; sizes cm_counter_group.counter[] */
+	CM_ATTR_ID_OFFSET = 0x0010,	/* NOTE(review): presumably the attribute ID that maps to index 0 -- confirm */
+};
+
+enum {	/* counter group indexes; same order as counter_group_names[] */
+	CM_XMIT,
+	CM_XMIT_RETRIES,
+	CM_RECV,
+	CM_RECV_DUPLICATES,
+	CM_COUNTER_GROUPS	/* number of groups; sizes cm_port.counter_group[] */
+};
+
+static char const counter_group_names[CM_COUNTER_GROUPS]	/* one name per counter group */
+				     [sizeof("cm_rx_duplicates")] = {	/* row width fits the longest name plus its NUL */
+	"cm_tx_msgs", "cm_tx_retries",
+	"cm_rx_msgs", "cm_rx_duplicates"
+};
+
+struct cm_counter_group {	/* one group of per-message-type counters */
+	struct kobject obj;
+	atomic_long_t counter[CM_ATTR_COUNT];	/* indexed by the CM_*_COUNTER values */
+};
+
+struct cm_counter_attribute {	/* attribute paired with a counter index */
+	struct attribute attr;
+	int index;	/* one of the CM_*_COUNTER values */
+};
+
+#define CM_COUNTER_ATTR(_name, _index) \
+struct cm_counter_attribute cm_##_name##_counter_attr = { \
+	.attr = { .name = __stringify(_name), .mode = 0444 }, \
+	.index = _index \
+}
+/* One read-only (mode 0444) attribute per counter; each expands to cm_<name>_counter_attr. */
+static CM_COUNTER_ATTR(req, CM_REQ_COUNTER);
+static CM_COUNTER_ATTR(mra, CM_MRA_COUNTER);
+static CM_COUNTER_ATTR(rej, CM_REJ_COUNTER);
+static CM_COUNTER_ATTR(rep, CM_REP_COUNTER);
+static CM_COUNTER_ATTR(rtu, CM_RTU_COUNTER);
+static CM_COUNTER_ATTR(dreq, CM_DREQ_COUNTER);
+static CM_COUNTER_ATTR(drep, CM_DREP_COUNTER);
+static CM_COUNTER_ATTR(sidr_req, CM_SIDR_REQ_COUNTER);
+static CM_COUNTER_ATTR(sidr_rep, CM_SIDR_REP_COUNTER);
+static CM_COUNTER_ATTR(lap, CM_LAP_COUNTER);
+static CM_COUNTER_ATTR(apr, CM_APR_COUNTER);
+
+static struct attribute *cm_counter_default_attrs[] = {	/* NULL-terminated list of the counter attributes */
+	&cm_req_counter_attr.attr,
+	&cm_mra_counter_attr.attr,
+	&cm_rej_counter_attr.attr,
+	&cm_rep_counter_attr.attr,
+	&cm_rtu_counter_attr.attr,
+	&cm_dreq_counter_attr.attr,
+	&cm_drep_counter_attr.attr,
+	&cm_sidr_req_counter_attr.attr,
+	&cm_sidr_rep_counter_attr.attr,
+	&cm_lap_counter_attr.attr,
+	&cm_apr_counter_attr.attr,
+	NULL
+};
+
+struct cm_port {	/* CM state of one device port */
+	struct cm_device *cm_dev;	/* owning device */
+	struct ib_mad_agent *mad_agent;	/* agent used to build and send CM MADs */
+	struct kobject port_obj;
+	u8 port_num;
+	struct cm_counter_group counter_group[CM_COUNTER_GROUPS];	/* indexed by CM_XMIT .. CM_RECV_DUPLICATES */
+};
+
+struct cm_device {	/* CM state of one IB device */
+	struct list_head list;	/* entry in cm.device_list */
+	struct ib_device *ib_device;
+	struct device *device;
+	u8 ack_delay;
+	struct cm_port *port[0];	/* indexed by port number - 1 (see cm_init_av_by_path) */
+};
+
+struct cm_av {	/* addressing information for one path */
+	struct cm_port *port;	/* local port the path uses */
+	union ib_gid dgid;	/* part of the lookup key in cm.remote_sidr_table */
+	struct ib_ah_attr ah_attr;	/* passed to ib_create_ah() when a message is allocated */
+	u16 pkey_index;
+	u8 timeout;	/* set to the path's packet_life_time + 1 by cm_init_av_by_path() */
+};
+
+struct cm_work {	/* unit of deferred CM processing */
+	struct delayed_work work;
+	struct list_head list;
+	struct cm_port *port;
+	struct ib_mad_recv_wc *mad_recv_wc;	/* Received MADs */
+	__be32 local_id;			/* Established / timewait */
+	__be32 remote_id;
+	struct ib_cm_event cm_event;
+	struct ib_sa_path_rec path[0];	/* optional trailing path records */
+};
+
+struct cm_timewait_info {	/* remote identifiers tracked in the remote QP / remote ID trees */
+	struct cm_work work;			/* Must be first. */
+	struct list_head list;
+	struct rb_node remote_qp_node;	/* node in cm.remote_qp_table */
+	struct rb_node remote_id_node;	/* node in cm.remote_id_table */
+	__be64 remote_ca_guid;
+	__be32 remote_qpn;
+	u8 inserted_remote_qp;	/* set to 1 by cm_insert_remote_qpn() on insertion */
+	u8 inserted_remote_id;	/* set to 1 by cm_insert_remote_id() on insertion */
+};
+
+struct cm_id_private {	/* private state behind a public struct ib_cm_id */
+	struct ib_cm_id	id;	/* public part; &id is what ib_create_cm_id() hands out */
+
+	struct rb_node service_node;	/* node in cm.listen_service_table */
+	struct rb_node sidr_id_node;	/* node in cm.remote_sidr_table */
+	spinlock_t lock;	/* Do not acquire inside cm.lock */
+	struct completion comp;	/* completed by cm_deref_id() when refcount reaches zero */
+	atomic_t refcount;	/* starts at 1; cm_get_id() and cm_alloc_msg() take references */
+
+	struct ib_mad_send_buf *msg;
+	struct cm_timewait_info *timewait_info;
+	/* todo: use alternate port on send failure */
+	struct cm_av av;
+	struct cm_av alt_av;
+	struct ib_cm_compare_data *compare_data;	/* optional private-data match used by the listen table */
+
+	void *private_data;	/* owned buffer; replaced through cm_set_private_data() */
+	__be64 tid;
+	__be32 local_qpn;
+	__be32 remote_qpn;
+	enum ib_qp_type qp_type;
+	__be32 sq_psn;
+	__be32 rq_psn;
+	int timeout_ms;
+	enum ib_mtu path_mtu;
+	__be16 pkey;
+	u8 private_data_len;	/* length of private_data in bytes */
+	u8 max_cm_retries;	/* copied into each send buffer's retries by cm_alloc_msg() */
+	u8 peer_to_peer;
+	u8 responder_resources;
+	u8 initiator_depth;
+	u8 retry_count;
+	u8 rnr_retry_count;
+	u8 service_timeout;
+	u8 target_ack_delay;
+
+	struct list_head work_list;
+	atomic_t work_count;	/* initialised to -1 by ib_create_cm_id() */
+};
+
+static void cm_work_handler(struct work_struct *work);
+
+/*
+ * Drop one reference on @cm_id_priv; whoever drops the last one signals
+ * the object's completion.
+ */
+static inline void cm_deref_id(struct cm_id_private *cm_id_priv)
+{
+	if (!atomic_dec_and_test(&cm_id_priv->refcount))
+		return;
+
+	complete(&cm_id_priv->comp);
+}
+
+/*
+ * Allocate a send MAD addressed along the primary path of @cm_id_priv.
+ *
+ * Creates an address handle from av.ah_attr and a send buffer on the
+ * port's MAD agent.  The buffer takes a reference on @cm_id_priv
+ * (stored in context[0]) and inherits its max_cm_retries.
+ *
+ * Returns 0 and stores the buffer in *@msg, or the negative errno
+ * reported by the failed allocation.
+ */
+static int cm_alloc_msg(struct cm_id_private *cm_id_priv,
+			struct ib_mad_send_buf **msg)
+{
+	struct ib_mad_agent *agent = cm_id_priv->av.port->mad_agent;
+	struct ib_mad_send_buf *send_buf;
+	struct ib_ah *addr_handle;
+
+	addr_handle = ib_create_ah(agent->qp->pd, &cm_id_priv->av.ah_attr);
+	if (IS_ERR(addr_handle))
+		return PTR_ERR(addr_handle);
+
+	send_buf = ib_create_send_mad(agent, cm_id_priv->id.remote_cm_qpn,
+				      cm_id_priv->av.pkey_index,
+				      0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
+				      GFP_ATOMIC);
+	if (IS_ERR(send_buf)) {
+		ib_destroy_ah(addr_handle);
+		return PTR_ERR(send_buf);
+	}
+
+	/* Timeout set by caller if response is expected. */
+	send_buf->ah = addr_handle;
+	send_buf->retries = cm_id_priv->max_cm_retries;
+
+	/* The buffer pins the cm_id until cm_free_msg() releases it. */
+	atomic_inc(&cm_id_priv->refcount);
+	send_buf->context[0] = cm_id_priv;
+
+	*msg = send_buf;
+	return 0;
+}
+
+/*
+ * Allocate a send MAD that answers the received MAD @mad_recv_wc.
+ *
+ * The address handle is derived from the receive completion and GRH,
+ * and the reply reuses the received P_Key index.  The buffer is not
+ * tied to any cm_id (context[0] is not set here).
+ *
+ * Returns 0 and stores the buffer in *@msg, or the negative errno
+ * reported by the failed allocation.
+ */
+static int cm_alloc_response_msg(struct cm_port *port,
+				 struct ib_mad_recv_wc *mad_recv_wc,
+				 struct ib_mad_send_buf **msg)
+{
+	struct ib_mad_send_buf *reply;
+	struct ib_ah *addr_handle;
+
+	addr_handle = ib_create_ah_from_wc(port->mad_agent->qp->pd,
+					   mad_recv_wc->wc,
+					   mad_recv_wc->recv_buf.grh,
+					   port->port_num);
+	if (IS_ERR(addr_handle))
+		return PTR_ERR(addr_handle);
+
+	reply = ib_create_send_mad(port->mad_agent, 1,
+				   mad_recv_wc->wc->pkey_index,
+				   0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
+				   GFP_ATOMIC);
+	if (IS_ERR(reply)) {
+		ib_destroy_ah(addr_handle);
+		return PTR_ERR(reply);
+	}
+
+	reply->ah = addr_handle;
+	*msg = reply;
+	return 0;
+}
+
+/*
+ * Release a send MAD: destroy its address handle, drop the cm_id
+ * reference it holds (if any) and free the buffer itself.
+ */
+static void cm_free_msg(struct ib_mad_send_buf *msg)
+{
+	struct cm_id_private *owner;
+
+	ib_destroy_ah(msg->ah);
+
+	owner = msg->context[0];
+	if (owner)
+		cm_deref_id(owner);
+
+	ib_free_send_mad(msg);
+}
+
+/*
+ * Duplicate a caller-supplied private data buffer.
+ *
+ * Returns NULL when there is nothing to copy (NULL pointer or zero
+ * length), ERR_PTR(-ENOMEM) when the allocation fails, and the new
+ * copy otherwise.
+ */
+static void * cm_copy_private_data(const void *private_data,
+				   u8 private_data_len)
+{
+	void *copy;
+
+	if (private_data == NULL || private_data_len == 0)
+		return NULL;
+
+	copy = kmemdup(private_data, private_data_len, GFP_KERNEL);
+
+	return copy ? copy : ERR_PTR(-ENOMEM);
+}
+
+/*
+ * Install a new private data buffer on @cm_id_priv.  A previously
+ * installed buffer is freed if it is non-NULL and has a non-zero
+ * recorded length.
+ */
+static void cm_set_private_data(struct cm_id_private *cm_id_priv,
+				 void *private_data, u8 private_data_len)
+{
+	void *previous = cm_id_priv->private_data;
+
+	if (previous && cm_id_priv->private_data_len)
+		kfree(previous);
+
+	cm_id_priv->private_data_len = private_data_len;
+	cm_id_priv->private_data = private_data;
+}
+
+/*
+ * Fill @av so that it addresses the sender of a received MAD: the
+ * port and P_Key index are taken over directly and the address handle
+ * attributes are derived from the work completion and GRH.
+ */
+static void cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc,
+				    struct ib_grh *grh, struct cm_av *av)
+{
+	struct ib_device *ibdev = port->cm_dev->ib_device;
+
+	av->port = port;
+	av->pkey_index = wc->pkey_index;
+	ib_init_ah_from_wc(ibdev, port->port_num, wc, grh, &av->ah_attr);
+}
+
+/*
+ * Fill @av from a path record.
+ *
+ * The local port is the first one, across all known CM devices, whose
+ * cached GID table contains the path's source GID.  The P_Key index is
+ * then looked up in that port's cached P_Key table.
+ *
+ * Returns 0 on success, -EINVAL when no port owns the source GID, or
+ * the error from the P_Key lookup.
+ */
+static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av)
+{
+	struct cm_device *dev;
+	struct cm_port *match = NULL;
+	unsigned long irq_flags;
+	u8 port_num;
+	int rc;
+
+	read_lock_irqsave(&cm.device_lock, irq_flags);
+	list_for_each_entry(dev, &cm.device_list, list) {
+		if (ib_find_cached_gid(dev->ib_device, &path->sgid,
+				       &port_num, NULL))
+			continue;
+
+		/* port[] is indexed from zero, port numbers from one. */
+		match = dev->port[port_num - 1];
+		break;
+	}
+	read_unlock_irqrestore(&cm.device_lock, irq_flags);
+
+	if (!match)
+		return -EINVAL;
+
+	/* A match was found, so dev still points at the owning device. */
+	rc = ib_find_cached_pkey(dev->ib_device, match->port_num,
+				 be16_to_cpu(path->pkey), &av->pkey_index);
+	if (rc)
+		return rc;
+
+	av->port = match;
+	ib_init_ah_from_path(dev->ib_device, match->port_num, path,
+			     &av->ah_attr);
+	av->timeout = path->packet_life_time + 1;
+	return 0;
+}
+
+/*
+ * Allocate a local communication ID for @cm_id_priv.
+ *
+ * An id at or above next_id is taken from cm.local_id_table under
+ * cm.lock; next_id then advances past it (wrapping with MAX_ID_MASK).
+ * While the idr reports -EAGAIN it is refilled with idr_pre_get() and
+ * the allocation retried.  On success id.local_id is set to the id
+ * XORed with cm.random_id_operand.
+ *
+ * Returns 0 on success or the negative errno from the idr.  On failure
+ * id.local_id is left untouched: no id was produced, so there is
+ * nothing valid to store.
+ */
+static int cm_alloc_id(struct cm_id_private *cm_id_priv)
+{
+	unsigned long flags;
+	int ret, id = 0;
+	static int next_id;
+
+	do {
+		spin_lock_irqsave(&cm.lock, flags);
+		ret = idr_get_new_above(&cm.local_id_table, cm_id_priv,
+					next_id, &id);
+		if (!ret)
+			next_id = ((unsigned) id + 1) & MAX_ID_MASK;
+		spin_unlock_irqrestore(&cm.lock, flags);
+	} while( (ret == -EAGAIN) && idr_pre_get(&cm.local_id_table, GFP_KERNEL) );
+
+	/* id is only meaningful when the allocation succeeded. */
+	if (ret)
+		return ret;
+
+	cm_id_priv->id.local_id = (__force __be32)id ^ cm.random_id_operand;
+	return 0;
+}
+
+/*
+ * Return a local communication ID to cm.local_id_table.  The idr id is
+ * recovered by XORing @local_id with cm.random_id_operand again.
+ */
+static void cm_free_id(__be32 local_id)
+{
+	int idr_id;
+
+	spin_lock_irq(&cm.lock);
+	idr_id = (__force int) (local_id ^ cm.random_id_operand);
+	idr_remove(&cm.local_id_table, idr_id);
+	spin_unlock_irq(&cm.lock);
+}
+
+/*
+ * Look up a cm_id by its local ID and check that it is paired with
+ * @remote_id.  On a match a reference is taken and the object
+ * returned; otherwise NULL is returned.  No locking is done here
+ * (cm_acquire_id() wraps this call in cm.lock).
+ */
+static struct cm_id_private * cm_get_id(__be32 local_id, __be32 remote_id)
+{
+	struct cm_id_private *found;
+
+	found = idr_find(&cm.local_id_table,
+			 (__force int) (local_id ^ cm.random_id_operand));
+	if (!found || found->id.remote_id != remote_id)
+		return NULL;
+
+	atomic_inc(&found->refcount);
+	return found;
+}
+
+/*
+ * Locked variant of cm_get_id(): performs the lookup with cm.lock held
+ * (interrupts disabled) and returns the referenced cm_id or NULL.
+ */
+static struct cm_id_private * cm_acquire_id(__be32 local_id, __be32 remote_id)
+{
+	struct cm_id_private *found;
+
+	spin_lock_irq(&cm.lock);
+	found = cm_get_id(local_id, remote_id);
+	spin_unlock_irq(&cm.lock);
+
+	return found;
+}
+
+/*
+ * dst[i] = src[i] & mask[i] for each of the IB_CM_COMPARE_SIZE bytes.
+ *
+ * The copy is done byte by byte on purpose: the buffers are plain u8
+ * arrays (callers pass on-stack u8 arrays) with no alignment guarantee,
+ * so reading them through an unsigned long pointer could fault on
+ * strict-alignment machines.  It also covers every byte even if
+ * IB_CM_COMPARE_SIZE is not a multiple of sizeof(unsigned long).
+ */
+static void cm_mask_copy(u8 *dst, u8 *src, u8 *mask)
+{
+	int i;
+
+	for (i = 0; i < IB_CM_COMPARE_SIZE; i++)
+		dst[i] = src[i] & mask[i];
+}
+
+/*
+ * Compare two compare-data descriptors, each side's data masked with
+ * the *other* side's mask.  Returns the memcmp() ordering of the two
+ * masked buffers, or 0 when either descriptor is absent.
+ */
+static int cm_compare_data(struct ib_cm_compare_data *src_data,
+			   struct ib_cm_compare_data *dst_data)
+{
+	u8 masked_src[IB_CM_COMPARE_SIZE];
+	u8 masked_dst[IB_CM_COMPARE_SIZE];
+
+	if (src_data == NULL || dst_data == NULL)
+		return 0;
+
+	cm_mask_copy(masked_src, src_data->data, dst_data->mask);
+	cm_mask_copy(masked_dst, dst_data->data, src_data->mask);
+
+	return memcmp(masked_src, masked_dst, IB_CM_COMPARE_SIZE);
+}
+
+/*
+ * Compare raw private data against a compare-data descriptor: the
+ * private data is masked with the descriptor's mask and compared with
+ * the descriptor's data.  Returns the memcmp() ordering, or 0 when
+ * there is no descriptor.
+ */
+static int cm_compare_private_data(u8 *private_data,
+				   struct ib_cm_compare_data *dst_data)
+{
+	u8 masked[IB_CM_COMPARE_SIZE];
+
+	if (dst_data == NULL)
+		return 0;
+
+	cm_mask_copy(masked, private_data, dst_data->mask);
+
+	return memcmp(masked, dst_data->data, IB_CM_COMPARE_SIZE);
+}
+
+/*
+ * Ordering helpers for big-endian values.  The endian annotation is
+ * stripped and the raw bits are compared; the byte order is irrelevant
+ * because the RB trees only need an ordering that is stable.
+ */
+static int be32_lt(__be32 a, __be32 b)
+{
+	u32 lhs = (__force u32) a;
+	u32 rhs = (__force u32) b;
+
+	return lhs < rhs;
+}
+
+/* a > b is the same as b < a. */
+static int be32_gt(__be32 a, __be32 b)
+{
+	return be32_lt(b, a);
+}
+
+static int be64_lt(__be64 a, __be64 b)
+{
+	u64 lhs = (__force u64) a;
+	u64 rhs = (__force u64) b;
+
+	return lhs < rhs;
+}
+
+/* a > b is the same as b < a. */
+static int be64_gt(__be64 a, __be64 b)
+{
+	return be64_lt(b, a);
+}
+
+/*
+ * Insert a listening cm_id into cm.listen_service_table.
+ *
+ * An existing entry conflicts with the new one when both are on the
+ * same device, their service IDs overlap under each other's masks and
+ * their compare data matches; the conflicting entry is then returned
+ * and nothing is inserted.  Otherwise the tree is ordered by device
+ * pointer, then service ID, then compare-data ordering, and NULL is
+ * returned after the insertion.
+ */
+static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)
+{
+	struct rb_node **link = &cm.listen_service_table.rb_node;
+	struct rb_node *parent = NULL;
+	struct cm_id_private *cur;
+	struct ib_device *device = cm_id_priv->id.device;
+	__be64 service_id = cm_id_priv->id.service_id;
+	__be64 service_mask = cm_id_priv->id.service_mask;
+	int data_cmp;
+
+	while (*link) {
+		parent = *link;
+		cur = rb_entry(parent, struct cm_id_private, service_node);
+		data_cmp = cm_compare_data(cm_id_priv->compare_data,
+					   cur->compare_data);
+
+		if ((cur->id.service_mask & service_id) ==
+		    (service_mask & cur->id.service_id) &&
+		    device == cur->id.device &&
+		    data_cmp == 0)
+			return cur;
+
+		if (device < cur->id.device)
+			link = &parent->rb_left;
+		else if (device > cur->id.device)
+			link = &parent->rb_right;
+		else if (be64_lt(service_id, cur->id.service_id))
+			link = &parent->rb_left;
+		else if (be64_gt(service_id, cur->id.service_id))
+			link = &parent->rb_right;
+		else if (data_cmp < 0)
+			link = &parent->rb_left;
+		else
+			link = &parent->rb_right;
+	}
+
+	rb_link_node(&cm_id_priv->service_node, parent, link);
+	rb_insert_color(&cm_id_priv->service_node, &cm.listen_service_table);
+	return NULL;
+}
+
+/*
+ * Find the listening cm_id that accepts a request for @service_id on
+ * @device carrying @private_data.
+ *
+ * An entry matches when it is on the same device, the service ID
+ * masked with the entry's mask equals the entry's service ID, and the
+ * private data satisfies the entry's compare data.  The descent uses
+ * the same ordering as cm_insert_listen().  Returns the match or NULL.
+ */
+static struct cm_id_private * cm_find_listen(struct ib_device *device,
+					     __be64 service_id,
+					     u8 *private_data)
+{
+	struct rb_node *node = cm.listen_service_table.rb_node;
+	struct cm_id_private *cur;
+	int data_cmp;
+
+	while (node) {
+		cur = rb_entry(node, struct cm_id_private, service_node);
+		data_cmp = cm_compare_private_data(private_data,
+						   cur->compare_data);
+
+		if ((cur->id.service_mask & service_id) ==
+		     cur->id.service_id &&
+		    cur->id.device == device &&
+		    data_cmp == 0)
+			return cur;
+
+		if (device < cur->id.device)
+			node = node->rb_left;
+		else if (device > cur->id.device)
+			node = node->rb_right;
+		else if (be64_lt(service_id, cur->id.service_id))
+			node = node->rb_left;
+		else if (be64_gt(service_id, cur->id.service_id))
+			node = node->rb_right;
+		else if (data_cmp < 0)
+			node = node->rb_left;
+		else
+			node = node->rb_right;
+	}
+
+	return NULL;
+}
+
+/*
+ * Insert @timewait_info into cm.remote_id_table, keyed by remote
+ * communication ID and then remote CA GUID.
+ *
+ * If an entry with the same key already exists it is returned and
+ * nothing is inserted.  Otherwise the entry is linked in, its
+ * inserted_remote_id flag is set and NULL is returned.
+ */
+static struct cm_timewait_info * cm_insert_remote_id(struct cm_timewait_info
+						     *timewait_info)
+{
+	struct rb_node **link = &cm.remote_id_table.rb_node;
+	struct rb_node *parent = NULL;
+	struct cm_timewait_info *cur;
+	__be64 guid = timewait_info->remote_ca_guid;
+	__be32 comm_id = timewait_info->work.remote_id;
+
+	while (*link) {
+		parent = *link;
+		cur = rb_entry(parent, struct cm_timewait_info,
+			       remote_id_node);
+
+		if (be32_lt(comm_id, cur->work.remote_id))
+			link = &parent->rb_left;
+		else if (be32_gt(comm_id, cur->work.remote_id))
+			link = &parent->rb_right;
+		else if (be64_lt(guid, cur->remote_ca_guid))
+			link = &parent->rb_left;
+		else if (be64_gt(guid, cur->remote_ca_guid))
+			link = &parent->rb_right;
+		else
+			return cur;
+	}
+
+	timewait_info->inserted_remote_id = 1;
+	rb_link_node(&timewait_info->remote_id_node, parent, link);
+	rb_insert_color(&timewait_info->remote_id_node, &cm.remote_id_table);
+	return NULL;
+}
+
+/*
+ * Look up cm.remote_id_table by remote communication ID and remote CA
+ * GUID, using the same key order as cm_insert_remote_id().  Returns
+ * the matching entry or NULL.
+ */
+static struct cm_timewait_info * cm_find_remote_id(__be64 remote_ca_guid,
+						   __be32 remote_id)
+{
+	struct rb_node *node = cm.remote_id_table.rb_node;
+	struct cm_timewait_info *cur;
+
+	while (node) {
+		cur = rb_entry(node, struct cm_timewait_info, remote_id_node);
+
+		if (be32_lt(remote_id, cur->work.remote_id))
+			node = node->rb_left;
+		else if (be32_gt(remote_id, cur->work.remote_id))
+			node = node->rb_right;
+		else if (be64_lt(remote_ca_guid, cur->remote_ca_guid))
+			node = node->rb_left;
+		else if (be64_gt(remote_ca_guid, cur->remote_ca_guid))
+			node = node->rb_right;
+		else
+			return cur;
+	}
+
+	return NULL;
+}
+
+/*
+ * Insert @timewait_info into cm.remote_qp_table, keyed by remote QPN
+ * and then remote CA GUID.
+ *
+ * If an entry with the same key already exists it is returned and
+ * nothing is inserted.  Otherwise the entry is linked in, its
+ * inserted_remote_qp flag is set and NULL is returned.
+ */
+static struct cm_timewait_info * cm_insert_remote_qpn(struct cm_timewait_info
+						      *timewait_info)
+{
+	struct rb_node **link = &cm.remote_qp_table.rb_node;
+	struct rb_node *parent = NULL;
+	struct cm_timewait_info *cur;
+	__be64 guid = timewait_info->remote_ca_guid;
+	__be32 qpn = timewait_info->remote_qpn;
+
+	while (*link) {
+		parent = *link;
+		cur = rb_entry(parent, struct cm_timewait_info,
+			       remote_qp_node);
+
+		if (be32_lt(qpn, cur->remote_qpn))
+			link = &parent->rb_left;
+		else if (be32_gt(qpn, cur->remote_qpn))
+			link = &parent->rb_right;
+		else if (be64_lt(guid, cur->remote_ca_guid))
+			link = &parent->rb_left;
+		else if (be64_gt(guid, cur->remote_ca_guid))
+			link = &parent->rb_right;
+		else
+			return cur;
+	}
+
+	timewait_info->inserted_remote_qp = 1;
+	rb_link_node(&timewait_info->remote_qp_node, parent, link);
+	rb_insert_color(&timewait_info->remote_qp_node, &cm.remote_qp_table);
+	return NULL;
+}
+
+/*
+ * Insert @cm_id_priv into cm.remote_sidr_table, keyed by remote ID and
+ * then by the destination GID of its primary address vector.
+ *
+ * If an entry with the same key already exists it is returned and
+ * nothing is inserted; otherwise the node is linked in and NULL is
+ * returned.
+ */
+static struct cm_id_private * cm_insert_remote_sidr(struct cm_id_private
+						    *cm_id_priv)
+{
+	struct rb_node **link = &cm.remote_sidr_table.rb_node;
+	struct rb_node *parent = NULL;
+	struct cm_id_private *cur;
+	union ib_gid *port_gid = &cm_id_priv->av.dgid;
+	__be32 remote_id = cm_id_priv->id.remote_id;
+	int gid_cmp;
+
+	while (*link) {
+		parent = *link;
+		cur = rb_entry(parent, struct cm_id_private, sidr_id_node);
+
+		if (be32_lt(remote_id, cur->id.remote_id)) {
+			link = &parent->rb_left;
+			continue;
+		}
+		if (be32_gt(remote_id, cur->id.remote_id)) {
+			link = &parent->rb_right;
+			continue;
+		}
+
+		/* Same remote ID: the GID breaks the tie. */
+		gid_cmp = memcmp(port_gid, &cur->av.dgid, sizeof *port_gid);
+		if (gid_cmp == 0)
+			return cur;
+
+		link = (gid_cmp < 0) ? &parent->rb_left : &parent->rb_right;
+	}
+
+	rb_link_node(&cm_id_priv->sidr_id_node, parent, link);
+	rb_insert_color(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table);
+	return NULL;
+}
+
+/*
+ * Answer a SIDR request on cm_id_priv with a SIDR reply that carries only
+ * the given status; every other reply parameter is left zeroed.  The return
+ * value of ib_send_cm_sidr_rep() is not examined.
+ */
+static void cm_reject_sidr_req(struct cm_id_private *cm_id_priv,
+			       enum ib_cm_sidr_status status)
+{
+	struct ib_cm_sidr_rep_param reply;
+
+	memset(&reply, 0, sizeof reply);
+	reply.status = status;
+
+	ib_send_cm_sidr_rep(&cm_id_priv->id, &reply);
+}
+
+/*
+ * Allocate a new communication identifier in the IB_CM_IDLE state.
+ *
+ * @device:     IB device the identifier is associated with
+ * @cm_handler: callback stored in the identifier
+ * @context:    opaque caller value stored in the identifier
+ *
+ * Returns a pointer to the embedded ib_cm_id on success.  A failure of the
+ * memory allocation or of cm_alloc_id() is reported as ERR_PTR(-ENOMEM).
+ */
+struct ib_cm_id *ib_create_cm_id(struct ib_device *device,
+				 ib_cm_handler cm_handler,
+				 void *context)
+{
+	struct cm_id_private *priv;
+
+	priv = kzalloc(sizeof *priv, GFP_KERNEL);
+	if (!priv)
+		return ERR_PTR(-ENOMEM);
+
+	priv->id.state = IB_CM_IDLE;
+	priv->id.device = device;
+	priv->id.cm_handler = cm_handler;
+	priv->id.context = context;
+	priv->id.remote_cm_qpn = 1;
+
+	if (cm_alloc_id(priv)) {
+		kfree(priv);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	spin_lock_init(&priv->lock);
+	init_completion(&priv->comp);
+	INIT_LIST_HEAD(&priv->work_list);
+	/* work_count starts at -1, refcount at 1. */
+	atomic_set(&priv->work_count, -1);
+	atomic_set(&priv->refcount, 1);
+
+	return &priv->id;
+}
+EXPORT_SYMBOL(ib_create_cm_id);
+
+/*
+ * Detach and return the first work item queued on cm_id_priv->work_list,
+ * or return NULL when the list is empty.  No locking is done here.
+ */
+static struct cm_work * cm_dequeue_work(struct cm_id_private *cm_id_priv)
+{
+	struct list_head *queue = &cm_id_priv->work_list;
+	struct cm_work *first;
+
+	if (list_empty(queue))
+		return NULL;
+
+	first = list_entry(queue->next, struct cm_work, list);
+	list_del(&first->list);
+
+	return first;
+}
+
+/*
+ * Free a work item.  If it still references a received MAD, that MAD is
+ * handed back with ib_free_recv_mad() first.
+ */
+static void cm_free_work(struct cm_work *work)
+{
+	struct ib_mad_recv_wc *rcvd = work->mad_recv_wc;
+
+	if (rcvd)
+		ib_free_recv_mad(rcvd);
+
+	kfree(work);
+}
+
+/*
+ * Convert an IBA timeout exponent to milliseconds.  The encoded time is
+ * 4.096us x 2^iba_time; this is approximated as 2^(iba_time - 8) ms, with
+ * the exponent clamped at zero so the result is never below 1 ms.
+ */
+static inline int cm_convert_to_ms(int iba_time)
+{
+	int exponent = max(iba_time - 8, 0);
+
+	return 1 << exponent;
+}
+
+/*
+ * Compute the local ACK timeout exponent from the CA ack delay and the
+ * packet life time:
+ *
+ *	4.096 x 2^ack_timeout = 4.096 x 2^ack_delay + 2 x 4.096 x 2^life_time
+ *
+ * The value is an exponent, so adding one doubles the timeout.  To keep the
+ * result small, take the larger of ack_delay and (life_time + 1) and round
+ * it up by one only when the smaller term is within 50% of it.  The result
+ * is capped at 31.
+ */
+static u8 cm_ack_timeout(u8 ca_ack_delay, u8 packet_life_time)
+{
+	int life = packet_life_time + 1;
+	int delay = ca_ack_delay;
+	int timeout;
+
+	if (life >= delay)
+		timeout = life + (delay >= (life - 1));
+	else
+		timeout = delay + (life >= (delay - 1));
+
+	return min(31, timeout);
+}
+
+/*
+ * Remove a timewait entry from whichever lookup trees it was inserted into.
+ * The inserted_remote_id / inserted_remote_qp flags record membership and
+ * are cleared once the node has been erased, so calling this again on the
+ * same entry does nothing.
+ */
+static void cm_cleanup_timewait(struct cm_timewait_info *timewait_info)
+{
+	struct cm_timewait_info *tw = timewait_info;
+
+	if (tw->inserted_remote_id) {
+		rb_erase(&tw->remote_id_node, &cm.remote_id_table);
+		tw->inserted_remote_id = 0;
+	}
+
+	if (tw->inserted_remote_qp) {
+		rb_erase(&tw->remote_qp_node, &cm.remote_qp_table);
+		tw->inserted_remote_qp = 0;
+	}
+}
+
+/*
+ * Allocate a zeroed timewait entry for the given local communication ID.
+ * Its embedded work item is set up to run cm_work_handler and to report an
+ * IB_CM_TIMEWAIT_EXIT event.
+ *
+ * Returns the new entry, or ERR_PTR(-ENOMEM) if the allocation fails.
+ */
+static struct cm_timewait_info * cm_create_timewait_info(__be32 local_id)
+{
+	struct cm_timewait_info *tw = kzalloc(sizeof *tw, GFP_KERNEL);
+
+	if (!tw)
+		return ERR_PTR(-ENOMEM);
+
+	tw->work.local_id = local_id;
+	INIT_DELAYED_WORK(&tw->work.work, cm_work_handler);
+	tw->work.cm_event.event = IB_CM_TIMEWAIT_EXIT;
+
+	return tw;
+}
+
+/*
+ * Move cm_id_priv into the IB_CM_TIMEWAIT state.
+ *
+ * Under cm.lock the timewait entry is removed from the lookup trees and
+ * appended to cm.timewait_list.  Its delayed work is then queued on cm.wq
+ * with a delay derived from av.timeout, and the cm_id's pointer to the
+ * entry is cleared.
+ */
+static void cm_enter_timewait(struct cm_id_private *cm_id_priv)
+{
+	struct cm_timewait_info *tw = cm_id_priv->timewait_info;
+	unsigned long flags;
+	int wait_ms;
+
+	spin_lock_irqsave(&cm.lock, flags);
+	cm_cleanup_timewait(tw);
+	list_add_tail(&tw->list, &cm.timewait_list);
+	spin_unlock_irqrestore(&cm.lock, flags);
+
+	/*
+	 * The user may destroy the cm_id before timewait expires.  To guard
+	 * against that, the cm_id is searched for again after exiting
+	 * timewait, before the user is notified of the exit.
+	 */
+	cm_id_priv->id.state = IB_CM_TIMEWAIT;
+	wait_ms = cm_convert_to_ms(cm_id_priv->av.timeout);
+	queue_delayed_work(cm.wq, &tw->work.work, msecs_to_jiffies(wait_ms));
+	cm_id_priv->timewait_info = NULL;
+}
+
+/*
+ * Put cm_id_priv back into IB_CM_IDLE.  If a timewait entry is attached,
+ * it is removed from the lookup trees under cm.lock, freed, and the
+ * pointer to it is cleared.
+ */
+static void cm_reset_to_idle(struct cm_id_private *cm_id_priv)
+{
+	unsigned long flags;
+
+	cm_id_priv->id.state = IB_CM_IDLE;
+
+	if (!cm_id_priv->timewait_info)
+		return;
+
+	spin_lock_irqsave(&cm.lock, flags);
+	cm_cleanup_timewait(cm_id_priv->timewait_info);
+	spin_unlock_irqrestore(&cm.lock, flags);
+
+	kfree(cm_id_priv->timewait_info);
+	cm_id_priv->timewait_info = NULL;
+}
+
+/*
+ * Destroy a cm_id.
+ *
+ * The current state, read under cm_id_priv->lock, selects the shutdown
+ * action: remove a listener from the listen table, cancel an outstanding
+ * MAD, send a REJ / SIDR reject / DREQ / DREP, or enter timewait.  Every
+ * case releases cm_id_priv->lock before leaving the switch.  Afterwards
+ * the function calls cm_free_id() and cm_deref_id(), waits on
+ * cm_id_priv->comp, frees any work still queued and finally frees the
+ * cm_id itself.
+ *
+ * @cm_id: identifier to destroy
+ * @err:   only examined in the IB_CM_REQ_RCVD state, where -ENOMEM resets
+ *         the cm_id to idle instead of sending a reject
+ */
+static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
+{
+	struct cm_id_private *cm_id_priv;
+	struct cm_work *work;
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+retest:
+	spin_lock_irq(&cm_id_priv->lock);
+	switch (cm_id->state) {
+	case IB_CM_LISTEN:
+		cm_id->state = IB_CM_IDLE;
+		spin_unlock_irq(&cm_id_priv->lock);
+		/* The listen table is protected by cm.lock, not the id lock. */
+		spin_lock_irq(&cm.lock);
+		rb_erase(&cm_id_priv->service_node, &cm.listen_service_table);
+		spin_unlock_irq(&cm.lock);
+		break;
+	case IB_CM_SIDR_REQ_SENT:
+		cm_id->state = IB_CM_IDLE;
+		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+		spin_unlock_irq(&cm_id_priv->lock);
+		break;
+	case IB_CM_SIDR_REQ_RCVD:
+		spin_unlock_irq(&cm_id_priv->lock);
+		cm_reject_sidr_req(cm_id_priv, IB_SIDR_REJECT);
+		break;
+	case IB_CM_REQ_SENT:
+		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+		spin_unlock_irq(&cm_id_priv->lock);
+		/* The REJ carries the local node GUID as its ARI. */
+		ib_send_cm_rej(cm_id, IB_CM_REJ_TIMEOUT,
+			       &cm_id_priv->id.device->node_guid,
+			       sizeof cm_id_priv->id.device->node_guid,
+			       NULL, 0);
+		break;
+	case IB_CM_REQ_RCVD:
+		if (err == -ENOMEM) {
+			/* Do not reject to allow future retries. */
+			cm_reset_to_idle(cm_id_priv);
+			spin_unlock_irq(&cm_id_priv->lock);
+		} else {
+			spin_unlock_irq(&cm_id_priv->lock);
+			ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED,
+				       NULL, 0, NULL, 0);
+		}
+		break;
+	case IB_CM_MRA_REQ_RCVD:
+	case IB_CM_REP_SENT:
+	case IB_CM_MRA_REP_RCVD:
+		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+		/* Fall through */
+	case IB_CM_MRA_REQ_SENT:
+	case IB_CM_REP_RCVD:
+	case IB_CM_MRA_REP_SENT:
+		spin_unlock_irq(&cm_id_priv->lock);
+		ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED,
+			       NULL, 0, NULL, 0);
+		break;
+	case IB_CM_ESTABLISHED:
+		spin_unlock_irq(&cm_id_priv->lock);
+		ib_send_cm_dreq(cm_id, NULL, 0);
+		/* Re-read the state under the lock and handle it again. */
+		goto retest;
+	case IB_CM_DREQ_SENT:
+		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+		cm_enter_timewait(cm_id_priv);
+		spin_unlock_irq(&cm_id_priv->lock);
+		break;
+	case IB_CM_DREQ_RCVD:
+		spin_unlock_irq(&cm_id_priv->lock);
+		ib_send_cm_drep(cm_id, NULL, 0);
+		break;
+	default:
+		spin_unlock_irq(&cm_id_priv->lock);
+		break;
+	}
+
+	cm_free_id(cm_id->local_id);
+	cm_deref_id(cm_id_priv);
+	wait_for_completion(&cm_id_priv->comp);
+	/* Discard whatever work items are still queued on the cm_id. */
+	while ((work = cm_dequeue_work(cm_id_priv)) != NULL)
+		cm_free_work(work);
+	kfree(cm_id_priv->compare_data);
+	kfree(cm_id_priv->private_data);
+	kfree(cm_id_priv);
+}
+
+/*
+ * Exported entry point for destroying a cm_id.  Calls cm_destroy_id() with
+ * an error value of 0.
+ */
+void ib_destroy_cm_id(struct ib_cm_id *cm_id)
+{
+	cm_destroy_id(cm_id, 0);
+}
+EXPORT_SYMBOL(ib_destroy_cm_id);
+
+/*
+ * Move an idle cm_id into the IB_CM_LISTEN state for a service ID.
+ *
+ * @cm_id:        identifier to listen on; must be in IB_CM_IDLE
+ * @service_id:   service ID to match, or IB_CM_ASSIGN_SERVICE_ID to have
+ *                one taken from cm.listen_service_id
+ * @service_mask: mask applied to service_id; zero selects an all-ones mask
+ * @compare_data: optional data/mask pair; a masked copy is stored in the
+ *                cm_id
+ *
+ * Returns 0 on success, -EINVAL for a bad service ID or a cm_id that is
+ * not idle, -ENOMEM if the compare data cannot be allocated, and -EBUSY
+ * when cm_insert_listen() reports an existing listener.
+ */
+int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
+		 struct ib_cm_compare_data *compare_data)
+{
+	struct cm_id_private *cm_id_priv, *conflict;
+	unsigned long flags;
+
+	if (!service_mask)
+		service_mask = ~cpu_to_be64(0);
+	service_id &= service_mask;
+	if ((service_id & IB_SERVICE_ID_AGN_MASK) == IB_CM_ASSIGN_SERVICE_ID &&
+	    (service_id != IB_CM_ASSIGN_SERVICE_ID))
+		return -EINVAL;
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	if (cm_id->state != IB_CM_IDLE)
+		return -EINVAL;
+
+	if (compare_data) {
+		cm_id_priv->compare_data = kzalloc(sizeof *compare_data,
+						   GFP_KERNEL);
+		if (!cm_id_priv->compare_data)
+			return -ENOMEM;
+		cm_mask_copy(cm_id_priv->compare_data->data,
+			     compare_data->data, compare_data->mask);
+		memcpy(cm_id_priv->compare_data->mask, compare_data->mask,
+		       IB_CM_COMPARE_SIZE);
+	}
+
+	cm_id->state = IB_CM_LISTEN;
+
+	spin_lock_irqsave(&cm.lock, flags);
+	if (service_id != IB_CM_ASSIGN_SERVICE_ID) {
+		cm_id->service_id = service_id;
+		cm_id->service_mask = service_mask;
+	} else {
+		/* Take the next service ID from the global counter. */
+		cm_id->service_id = cpu_to_be64(cm.listen_service_id++);
+		cm_id->service_mask = ~cpu_to_be64(0);
+	}
+	conflict = cm_insert_listen(cm_id_priv);
+	spin_unlock_irqrestore(&cm.lock, flags);
+
+	if (!conflict)
+		return 0;
+
+	/* Insert was refused: undo the state change and drop the copy. */
+	cm_id->state = IB_CM_IDLE;
+	kfree(cm_id_priv->compare_data);
+	cm_id_priv->compare_data = NULL;
+	return -EBUSY;
+}
+EXPORT_SYMBOL(ib_cm_listen);
+
+/*
+ * Build the 64-bit MAD transaction ID for a message sent on cm_id_priv.
+ * The upper 32 bits come from the MAD agent's hi_tid; the lower 32 bits
+ * are the local communication ID OR-ed with the message sequence shifted
+ * into bits 30 and up.  The result is returned in big-endian form.
+ */
+static __be64 cm_form_tid(struct cm_id_private *cm_id_priv,
+			  enum cm_msg_sequence msg_seq)
+{
+	u64 upper = ((u64) cm_id_priv->av.port->mad_agent->hi_tid) << 32;
+	u64 lower = (u64) ((__force u32)cm_id_priv->id.local_id |
+			   (msg_seq << 30));
+
+	return cpu_to_be64(upper | lower);
+}
+
+/*
+ * Fill in the MAD header fields shared by all CM messages: the caller's
+ * attribute ID and transaction ID, plus the fixed base version, CM
+ * management class, CM class version and SEND method.
+ */
+static void cm_format_mad_hdr(struct ib_mad_hdr *hdr,
+			      __be16 attr_id, __be64 tid)
+{
+	/* Per-message values supplied by the caller. */
+	hdr->attr_id = attr_id;
+	hdr->tid = tid;
+
+	/* Constants common to every CM MAD. */
+	hdr->base_version = IB_MGMT_BASE_VERSION;
+	hdr->mgmt_class = IB_MGMT_CLASS_CM;
+	hdr->class_version = IB_CM_CLASS_VERSION;
+	hdr->method = IB_MGMT_METHOD_SEND;
+}
+
+/*
+ * Fill in a REQ message from the caller's request parameters.
+ *
+ * @req_msg:    message buffer to fill in
+ * @cm_id_priv: connection the REQ is sent on; supplies the local
+ *              communication ID, the CA GUID and the port's ack delay
+ * @param:      connection parameters; primary_path is dereferenced
+ *              unconditionally, alternate_path only when non-NULL
+ *
+ * Both CM response timeouts are written as requested first and then
+ * overwritten with max_timeout (with a warning logged) when they exceed it.
+ * The private data length is not checked here.
+ */
+static void cm_format_req(struct cm_req_msg *req_msg,
+			  struct cm_id_private *cm_id_priv,
+			  struct ib_cm_req_param *param)
+{
+	struct ib_sa_path_rec *pri_path = param->primary_path;
+	struct ib_sa_path_rec *alt_path = param->alternate_path;
+
+	cm_format_mad_hdr(&req_msg->hdr, CM_REQ_ATTR_ID,
+			  cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_REQ));
+
+	req_msg->local_comm_id = cm_id_priv->id.local_id;
+	req_msg->service_id = param->service_id;
+	req_msg->local_ca_guid = cm_id_priv->id.device->node_guid;
+	cm_req_set_local_qpn(req_msg, cpu_to_be32(param->qp_num));
+	cm_req_set_resp_res(req_msg, param->responder_resources);
+	cm_req_set_init_depth(req_msg, param->initiator_depth);
+	cm_req_set_remote_resp_timeout(req_msg,
+				       param->remote_cm_response_timeout);
+	/* Clamp the remote response timeout to max_timeout. */
+       if (param->remote_cm_response_timeout > (u8) max_timeout) {
+               printk(KERN_WARNING PFX "req remote_cm_response_timeout %d > "
+                      "%d, decreasing\n", param->remote_cm_response_timeout,
+                      max_timeout);
+               cm_req_set_remote_resp_timeout(req_msg, (u8) max_timeout);
+       }
+	cm_req_set_qp_type(req_msg, param->qp_type);
+	cm_req_set_flow_ctrl(req_msg, param->flow_control);
+	cm_req_set_starting_psn(req_msg, cpu_to_be32(param->starting_psn));
+	cm_req_set_local_resp_timeout(req_msg,
+				      param->local_cm_response_timeout);
+	/* Clamp the local response timeout to max_timeout. */
+       if (param->local_cm_response_timeout > (u8) max_timeout) {
+               printk(KERN_WARNING PFX "req local_cm_response_timeout %d > "
+                      "%d, decreasing\n", param->local_cm_response_timeout,
+                      max_timeout);
+               cm_req_set_local_resp_timeout(req_msg, (u8) max_timeout);
+       }
+	cm_req_set_retry_count(req_msg, param->retry_count);
+	req_msg->pkey = param->primary_path->pkey;
+	cm_req_set_path_mtu(req_msg, param->primary_path->mtu);
+	cm_req_set_rnr_retry_count(req_msg, param->rnr_retry_count);
+	cm_req_set_max_cm_retries(req_msg, param->max_cm_retries);
+	cm_req_set_srq(req_msg, param->srq);
+
+	/* A hop limit of at most 1 is treated as a subnet-local path. */
+	if (pri_path->hop_limit <= 1) {
+		req_msg->primary_local_lid = pri_path->slid;
+		req_msg->primary_remote_lid = pri_path->dlid;
+	} else {
+		/* Work-around until there's a way to obtain remote LID info */
+		req_msg->primary_local_lid = IB_LID_PERMISSIVE;
+		req_msg->primary_remote_lid = IB_LID_PERMISSIVE;
+	}
+	req_msg->primary_local_gid = pri_path->sgid;
+	req_msg->primary_remote_gid = pri_path->dgid;
+	cm_req_set_primary_flow_label(req_msg, pri_path->flow_label);
+	cm_req_set_primary_packet_rate(req_msg, pri_path->rate);
+	req_msg->primary_traffic_class = pri_path->traffic_class;
+	req_msg->primary_hop_limit = pri_path->hop_limit;
+	cm_req_set_primary_sl(req_msg, pri_path->sl);
+	cm_req_set_primary_subnet_local(req_msg, (pri_path->hop_limit <= 1));
+	cm_req_set_primary_local_ack_timeout(req_msg,
+		cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay,
+			       pri_path->packet_life_time));
+
+	/* The alternate path fields are only written when one was given. */
+	if (alt_path) {
+		if (alt_path->hop_limit <= 1) {
+			req_msg->alt_local_lid = alt_path->slid;
+			req_msg->alt_remote_lid = alt_path->dlid;
+		} else {
+			req_msg->alt_local_lid = IB_LID_PERMISSIVE;
+			req_msg->alt_remote_lid = IB_LID_PERMISSIVE;
+		}
+		req_msg->alt_local_gid = alt_path->sgid;
+		req_msg->alt_remote_gid = alt_path->dgid;
+		cm_req_set_alt_flow_label(req_msg,
+					  alt_path->flow_label);
+		cm_req_set_alt_packet_rate(req_msg, alt_path->rate);
+		req_msg->alt_traffic_class = alt_path->traffic_class;
+		req_msg->alt_hop_limit = alt_path->hop_limit;
+		cm_req_set_alt_sl(req_msg, alt_path->sl);
+		cm_req_set_alt_subnet_local(req_msg, (alt_path->hop_limit <= 1));
+		cm_req_set_alt_local_ack_timeout(req_msg,
+			cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay,
+				       alt_path->packet_life_time));
+	}
+
+	if (param->private_data && param->private_data_len)
+		memcpy(req_msg->private_data, param->private_data,
+		       param->private_data_len);
+}
+
+/*
+ * Check the caller-supplied REQ parameters.
+ *
+ * Returns -EINVAL when peer-to-peer is requested, when there is no primary
+ * path, when the QP type is neither RC nor UC, when the private data is
+ * longer than IB_CM_REQ_PRIVATE_DATA_SIZE, or when an alternate path has a
+ * different pkey or MTU than the primary path.  Returns 0 otherwise.
+ */
+static int cm_validate_req_param(struct ib_cm_req_param *param)
+{
+	struct ib_sa_path_rec *pri = param->primary_path;
+	struct ib_sa_path_rec *alt = param->alternate_path;
+
+	/* peer-to-peer not supported */
+	if (param->peer_to_peer || !pri)
+		return -EINVAL;
+
+	switch (param->qp_type) {
+	case IB_QPT_RC:
+	case IB_QPT_UC:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (param->private_data &&
+	    param->private_data_len > IB_CM_REQ_PRIVATE_DATA_SIZE)
+		return -EINVAL;
+
+	/* An alternate path must share the primary path's pkey and MTU. */
+	if (alt && (alt->pkey != pri->pkey || alt->mtu != pri->mtu))
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Send a connection request (REQ) on an idle cm_id.
+ *
+ * @cm_id: identifier to connect; must be in IB_CM_IDLE
+ * @param: request parameters, checked by cm_validate_req_param()
+ *
+ * On success the cm_id is moved to IB_CM_REQ_SENT and 0 is returned.
+ * On failure a negative errno is returned and cm_id_priv->timewait_info is
+ * left NULL, so that later code which tests the pointer (for example
+ * cm_reset_to_idle()) never sees an ERR_PTR value or freed memory.
+ */
+int ib_send_cm_req(struct ib_cm_id *cm_id,
+		   struct ib_cm_req_param *param)
+{
+	struct cm_id_private *cm_id_priv;
+	struct cm_req_msg *req_msg;
+	unsigned long flags;
+	int ret;
+
+	ret = cm_validate_req_param(param);
+	if (ret)
+		return ret;
+
+	/* Verify that we're not in timewait. */
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id->state != IB_CM_IDLE) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		ret = -EINVAL;
+		goto out;
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv->
+							    id.local_id);
+	if (IS_ERR(cm_id_priv->timewait_info)) {
+		ret = PTR_ERR(cm_id_priv->timewait_info);
+		/* Do not leave an ERR_PTR value stored in the cm_id. */
+		cm_id_priv->timewait_info = NULL;
+		goto out;
+	}
+
+	ret = cm_init_av_by_path(param->primary_path, &cm_id_priv->av);
+	if (ret)
+		goto error1;
+	if (param->alternate_path) {
+		ret = cm_init_av_by_path(param->alternate_path,
+					 &cm_id_priv->alt_av);
+		if (ret)
+			goto error1;
+	}
+	cm_id->service_id = param->service_id;
+	cm_id->service_mask = ~cpu_to_be64(0);
+	/* Two packet life times plus the remote CM response timeout. */
+	cm_id_priv->timeout_ms = cm_convert_to_ms(
+				    param->primary_path->packet_life_time) * 2 +
+				 cm_convert_to_ms(
+				    param->remote_cm_response_timeout);
+	if (cm_id_priv->timeout_ms > cm_convert_to_ms(max_timeout)) {
+		printk(KERN_WARNING PFX "req timeout_ms %d > %d, decreasing\n",
+		       cm_id_priv->timeout_ms, cm_convert_to_ms(max_timeout));
+		cm_id_priv->timeout_ms = cm_convert_to_ms(max_timeout);
+	}
+	cm_id_priv->max_cm_retries = param->max_cm_retries;
+	cm_id_priv->initiator_depth = param->initiator_depth;
+	cm_id_priv->responder_resources = param->responder_resources;
+	cm_id_priv->retry_count = param->retry_count;
+	cm_id_priv->path_mtu = param->primary_path->mtu;
+	cm_id_priv->pkey = param->primary_path->pkey;
+	cm_id_priv->qp_type = param->qp_type;
+
+	ret = cm_alloc_msg(cm_id_priv, &cm_id_priv->msg);
+	if (ret)
+		goto error1;
+
+	req_msg = (struct cm_req_msg *) cm_id_priv->msg->mad;
+	cm_format_req(req_msg, cm_id_priv, param);
+	cm_id_priv->tid = req_msg->hdr.tid;
+	cm_id_priv->msg->timeout_ms = cm_id_priv->timeout_ms;
+	cm_id_priv->msg->context[1] = (void *) (unsigned long) IB_CM_REQ_SENT;
+
+	cm_id_priv->local_qpn = cm_req_get_local_qpn(req_msg);
+	cm_id_priv->rq_psn = cm_req_get_starting_psn(req_msg);
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	ret = ib_post_send_mad(cm_id_priv->msg, NULL);
+	if (ret) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		goto error2;
+	}
+	BUG_ON(cm_id->state != IB_CM_IDLE);
+	cm_id->state = IB_CM_REQ_SENT;
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return 0;
+
+error2:	cm_free_msg(cm_id_priv->msg);
+error1:	kfree(cm_id_priv->timewait_info);
+	/* Clear the pointer so it cannot be cleaned up or freed again. */
+	cm_id_priv->timewait_info = NULL;
+out:	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_req);
+
+/*
+ * Send a REJ in direct response to a received MAD, without a cm_id.
+ *
+ * The reply reuses the received transaction ID and swaps the received
+ * local/remote communication IDs.  Optional additional rejection info
+ * (ari) is copied in when both the pointer and the length are non-zero.
+ *
+ * Returns 0 on success or the error from allocating or posting the reply;
+ * the reply buffer is freed when posting fails.
+ */
+static int cm_issue_rej(struct cm_port *port,
+			struct ib_mad_recv_wc *mad_recv_wc,
+			enum ib_cm_rej_reason reason,
+			enum cm_msg_response msg_rejected,
+			void *ari, u8 ari_length)
+{
+	struct ib_mad_send_buf *reply = NULL;
+	struct cm_rej_msg *out, *in;
+	int ret;
+
+	ret = cm_alloc_response_msg(port, mad_recv_wc, &reply);
+	if (ret)
+		return ret;
+
+	/* We just need common CM header information.  Cast to any message. */
+	in = (struct cm_rej_msg *) mad_recv_wc->recv_buf.mad;
+	out = (struct cm_rej_msg *) reply->mad;
+
+	cm_format_mad_hdr(&out->hdr, CM_REJ_ATTR_ID, in->hdr.tid);
+	out->remote_comm_id = in->local_comm_id;
+	out->local_comm_id = in->remote_comm_id;
+	cm_rej_set_msg_rejected(out, msg_rejected);
+	out->reason = cpu_to_be16(reason);
+
+	if (ari && ari_length) {
+		cm_rej_set_reject_info_len(out, ari_length);
+		memcpy(out->ari, ari, ari_length);
+	}
+
+	ret = ib_post_send_mad(reply, NULL);
+	if (!ret)
+		return 0;
+
+	cm_free_msg(reply);
+	return ret;
+}
+
+/*
+ * Decide whether the local side is the active peer.  The side with the
+ * numerically larger CA GUID wins; when the GUIDs are equal, the side with
+ * the larger QPN wins.  Returns non-zero when the local side wins.
+ */
+static inline int cm_is_active_peer(__be64 local_ca_guid, __be64 remote_ca_guid,
+				    __be32 local_qpn, __be32 remote_qpn)
+{
+	if (be64_to_cpu(local_ca_guid) > be64_to_cpu(remote_ca_guid))
+		return 1;
+
+	if (local_ca_guid != remote_ca_guid)
+		return 0;
+
+	/* GUIDs are equal: the QPN breaks the tie. */
+	return be32_to_cpu(local_qpn) > be32_to_cpu(remote_qpn);
+}
+
+/*
+ * Build path records from a received REQ, seen from the receiver's side:
+ * the sender's "local" GID/LID become the destination and its "remote"
+ * GID/LID become the source.
+ *
+ * primary_path is always filled in.  alt_path is only written when the REQ
+ * carries a non-zero alternate local LID.  The packet life time is taken
+ * from the local ACK timeout and reduced by one unless it is already zero.
+ */
+static void cm_format_paths_from_req(struct cm_req_msg *req_msg,
+					    struct ib_sa_path_rec *primary_path,
+					    struct ib_sa_path_rec *alt_path)
+{
+	struct ib_sa_path_rec *pri = primary_path;
+	struct ib_sa_path_rec *alt = alt_path;
+
+	memset(pri, 0, sizeof *pri);
+
+	/* Addressing, swapped relative to the sender's view. */
+	pri->dgid = req_msg->primary_local_gid;
+	pri->sgid = req_msg->primary_remote_gid;
+	pri->dlid = req_msg->primary_local_lid;
+	pri->slid = req_msg->primary_remote_lid;
+
+	pri->flow_label = cm_req_get_primary_flow_label(req_msg);
+	pri->hop_limit = req_msg->primary_hop_limit;
+	pri->traffic_class = req_msg->primary_traffic_class;
+	pri->reversible = 1;
+	pri->pkey = req_msg->pkey;
+	pri->sl = cm_req_get_primary_sl(req_msg);
+	pri->mtu_selector = IB_SA_EQ;
+	pri->mtu = cm_req_get_path_mtu(req_msg);
+	pri->rate_selector = IB_SA_EQ;
+	pri->rate = cm_req_get_primary_packet_rate(req_msg);
+	pri->packet_life_time_selector = IB_SA_EQ;
+	pri->packet_life_time = cm_req_get_primary_local_ack_timeout(req_msg);
+	if (pri->packet_life_time > 0)
+		pri->packet_life_time--;
+
+	/* No alternate path in the REQ: leave alt_path untouched. */
+	if (!req_msg->alt_local_lid)
+		return;
+
+	memset(alt, 0, sizeof *alt);
+
+	alt->dgid = req_msg->alt_local_gid;
+	alt->sgid = req_msg->alt_remote_gid;
+	alt->dlid = req_msg->alt_local_lid;
+	alt->slid = req_msg->alt_remote_lid;
+
+	alt->flow_label = cm_req_get_alt_flow_label(req_msg);
+	alt->hop_limit = req_msg->alt_hop_limit;
+	alt->traffic_class = req_msg->alt_traffic_class;
+	alt->reversible = 1;
+	alt->pkey = req_msg->pkey;
+	alt->sl = cm_req_get_alt_sl(req_msg);
+	alt->mtu_selector = IB_SA_EQ;
+	alt->mtu = cm_req_get_path_mtu(req_msg);
+	alt->rate_selector = IB_SA_EQ;
+	alt->rate = cm_req_get_alt_packet_rate(req_msg);
+	alt->packet_life_time_selector = IB_SA_EQ;
+	alt->packet_life_time = cm_req_get_alt_local_ack_timeout(req_msg);
+	if (alt->packet_life_time > 0)
+		alt->packet_life_time--;
+}
+
+/*
+ * Fill in the "REQ received" event parameters in work->cm_event from the
+ * received REQ message.
+ *
+ * Values are reported from the receiver's point of view: the sender's
+ * initiator depth becomes our responder resources and vice versa, and the
+ * sender's local/remote response timeouts are swapped.  The event's
+ * private data points into the received MAD.
+ */
+static void cm_format_req_event(struct cm_work *work,
+				struct cm_id_private *cm_id_priv,
+				struct ib_cm_id *listen_id)
+{
+	struct cm_req_msg *req;
+	struct ib_cm_req_event_param *ev;
+
+	req = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
+	ev = &work->cm_event.param.req_rcvd;
+
+	ev->listen_id = listen_id;
+	ev->port = cm_id_priv->av.port->port_num;
+	ev->primary_path = &work->path[0];
+	/* An alternate path is reported only if the REQ carried one. */
+	ev->alternate_path = req->alt_local_lid ? &work->path[1] : NULL;
+	ev->remote_ca_guid = req->local_ca_guid;
+	ev->remote_qkey = be32_to_cpu(req->local_qkey);
+	ev->remote_qpn = be32_to_cpu(cm_req_get_local_qpn(req));
+	ev->qp_type = cm_req_get_qp_type(req);
+	ev->starting_psn = be32_to_cpu(cm_req_get_starting_psn(req));
+	ev->responder_resources = cm_req_get_init_depth(req);
+	ev->initiator_depth = cm_req_get_resp_res(req);
+	ev->local_cm_response_timeout = cm_req_get_remote_resp_timeout(req);
+	ev->flow_control = cm_req_get_flow_ctrl(req);
+	ev->remote_cm_response_timeout = cm_req_get_local_resp_timeout(req);
+	ev->retry_count = cm_req_get_retry_count(req);
+	ev->rnr_retry_count = cm_req_get_rnr_retry_count(req);
+	ev->srq = cm_req_get_srq(req);
+
+	work->cm_event.private_data = &req->private_data;
+}
+
+/*
+ * Deliver a work item to the cm_id's handler and then drain queued work.
+ *
+ * The handler is called for `work`, which is freed afterwards.  While the
+ * handler keeps returning 0 and decrementing work_count does not make it
+ * negative, the next queued work item is dequeued under cm_id_priv->lock,
+ * delivered and freed in the same way.  Finally cm_deref_id() is called,
+ * and if the last handler call returned non-zero the cm_id is destroyed
+ * with that value passed as the error.
+ */
+static void cm_process_work(struct cm_id_private *cm_id_priv,
+			    struct cm_work *work)
+{
+	int ret;
+
+	/* We will typically only have the current event to report. */
+	ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &work->cm_event);
+	cm_free_work(work);
+
+	/* A non-zero handler result stops the loop before the decrement. */
+	while (!ret && !atomic_add_negative(-1, &cm_id_priv->work_count)) {
+		spin_lock_irq(&cm_id_priv->lock);
+		work = cm_dequeue_work(cm_id_priv);
+		spin_unlock_irq(&cm_id_priv->lock);
+		/* work_count said there was more work, so the list must agree. */
+		BUG_ON(!work);
+		ret = cm_id_priv->id.cm_handler(&cm_id_priv->id,
+						&work->cm_event);
+		cm_free_work(work);
+	}
+	cm_deref_id(cm_id_priv);
+	if (ret)
+		cm_destroy_id(&cm_id_priv->id, ret);
+}
+
+/*
+ * Fill in an MRA message for cm_id_priv: header with the connection's
+ * transaction ID, the type of message being acknowledged, both
+ * communication IDs and the service timeout.  Private data is copied only
+ * when both the pointer and the length are non-zero; the length is not
+ * checked here.
+ */
+static void cm_format_mra(struct cm_mra_msg *mra_msg,
+			  struct cm_id_private *cm_id_priv,
+			  enum cm_msg_response msg_mraed, u8 service_timeout,
+			  const void *private_data, u8 private_data_len)
+{
+	cm_format_mad_hdr(&mra_msg->hdr, CM_MRA_ATTR_ID, cm_id_priv->tid);
+	cm_mra_set_msg_mraed(mra_msg, msg_mraed);
+	mra_msg->local_comm_id = cm_id_priv->id.local_id;
+	mra_msg->remote_comm_id = cm_id_priv->id.remote_id;
+	cm_mra_set_service_timeout(mra_msg, service_timeout);
+
+	if (!private_data || !private_data_len)
+		return;
+
+	memcpy(mra_msg->private_data, private_data, private_data_len);
+}
+
+/*
+ * Fill in a REJ message for cm_id_priv.
+ *
+ * The connection state selects which message type is marked as rejected
+ * (REQ, REP or "other").  In IB_CM_REQ_RCVD the local communication ID is
+ * sent as 0; in every other state the cm_id's local ID is used.  The ARI
+ * and the private data are each copied only when both their pointer and
+ * their length are non-zero; lengths are not checked here.
+ */
+static void cm_format_rej(struct cm_rej_msg *rej_msg,
+			  struct cm_id_private *cm_id_priv,
+			  enum ib_cm_rej_reason reason,
+			  void *ari,
+			  u8 ari_length,
+			  const void *private_data,
+			  u8 private_data_len)
+{
+	enum cm_msg_response rejected;
+	__be32 local_comm_id = cm_id_priv->id.local_id;
+
+	cm_format_mad_hdr(&rej_msg->hdr, CM_REJ_ATTR_ID, cm_id_priv->tid);
+	rej_msg->remote_comm_id = cm_id_priv->id.remote_id;
+
+	switch(cm_id_priv->id.state) {
+	case IB_CM_REQ_RCVD:
+		local_comm_id = 0;
+		rejected = CM_MSG_RESPONSE_REQ;
+		break;
+	case IB_CM_MRA_REQ_SENT:
+		rejected = CM_MSG_RESPONSE_REQ;
+		break;
+	case IB_CM_REP_RCVD:
+	case IB_CM_MRA_REP_SENT:
+		rejected = CM_MSG_RESPONSE_REP;
+		break;
+	default:
+		rejected = CM_MSG_RESPONSE_OTHER;
+		break;
+	}
+	rej_msg->local_comm_id = local_comm_id;
+	cm_rej_set_msg_rejected(rej_msg, rejected);
+
+	rej_msg->reason = cpu_to_be16(reason);
+	if (ari && ari_length) {
+		cm_rej_set_reject_info_len(rej_msg, ari_length);
+		memcpy(rej_msg->ari, ari, ari_length);
+	}
+
+	if (private_data && private_data_len)
+		memcpy(rej_msg->private_data, private_data, private_data_len);
+}
+
+/*
+ * Handle a REQ that duplicates one already seen for cm_id_priv.
+ *
+ * The duplicate counter for REQs is bumped.  Nothing is sent while the
+ * connection is still in IB_CM_REQ_RCVD.  Otherwise a response is built
+ * under cm_id_priv->lock: an MRA repeating the stored service timeout and
+ * private data in IB_CM_MRA_REQ_SENT, or a stale-connection REJ in
+ * IB_CM_TIMEWAIT.  Any other state, or a send failure, frees the response.
+ */
+static void cm_dup_req_handler(struct cm_work *work,
+			       struct cm_id_private *cm_id_priv)
+{
+	struct ib_mad_send_buf *reply = NULL;
+
+	atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+			counter[CM_REQ_COUNTER]);
+
+	/* Quick state check to discard duplicate REQs. */
+	if (cm_id_priv->id.state == IB_CM_REQ_RCVD)
+		return;
+
+	if (cm_alloc_response_msg(work->port, work->mad_recv_wc, &reply))
+		return;
+
+	spin_lock_irq(&cm_id_priv->lock);
+	switch (cm_id_priv->id.state) {
+	case IB_CM_MRA_REQ_SENT:
+		cm_format_mra((struct cm_mra_msg *) reply->mad, cm_id_priv,
+			      CM_MSG_RESPONSE_REQ, cm_id_priv->service_timeout,
+			      cm_id_priv->private_data,
+			      cm_id_priv->private_data_len);
+		break;
+	case IB_CM_TIMEWAIT:
+		cm_format_rej((struct cm_rej_msg *) reply->mad, cm_id_priv,
+			      IB_CM_REJ_STALE_CONN, NULL, 0, NULL, 0);
+		break;
+	default:
+		/* No response is defined for this state. */
+		spin_unlock_irq(&cm_id_priv->lock);
+		cm_free_msg(reply);
+		return;
+	}
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	if (ib_post_send_mad(reply, NULL))
+		cm_free_msg(reply);
+}
+
+/*
+ * Match a received REQ against existing connections and listeners.
+ *
+ * All checks run under cm.lock:
+ *  1. Insert the new timewait entry by remote ID.  A collision means a
+ *     duplicate REQ, which is passed to cm_dup_req_handler() if the
+ *     existing cm_id can still be found.
+ *  2. Insert it by remote QPN.  A collision means a stale connection and
+ *     a REJ with IB_CM_REJ_STALE_CONN is sent.
+ *  3. Look up a listener for the service ID.  If there is none, a REJ
+ *     with IB_CM_REJ_INVALID_SERVICE_ID is sent.
+ *
+ * On success the listener is returned with its refcount raised, and
+ * cm_id_priv has its refcount and work_count raised and its state set to
+ * IB_CM_REQ_RCVD.  In every failure case NULL is returned.  In cases 2
+ * and 3 the new entry is also removed from the trees again.
+ */
+static struct cm_id_private * cm_match_req(struct cm_work *work,
+					   struct cm_id_private *cm_id_priv)
+{
+	struct cm_id_private *listen_cm_id_priv, *cur_cm_id_priv;
+	struct cm_timewait_info *timewait_info;
+	struct cm_req_msg *req_msg;
+
+	req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
+
+	/* Check for possible duplicate REQ. */
+	spin_lock_irq(&cm.lock);
+	timewait_info = cm_insert_remote_id(cm_id_priv->timewait_info);
+	if (timewait_info) {
+		cur_cm_id_priv = cm_get_id(timewait_info->work.local_id,
+					   timewait_info->work.remote_id);
+		spin_unlock_irq(&cm.lock);
+		if (cur_cm_id_priv) {
+			cm_dup_req_handler(work, cur_cm_id_priv);
+			cm_deref_id(cur_cm_id_priv);
+		}
+		return NULL;
+	}
+
+	/* Check for stale connections. */
+	timewait_info = cm_insert_remote_qpn(cm_id_priv->timewait_info);
+	if (timewait_info) {
+		/* Undo the remote-ID insert done above. */
+		cm_cleanup_timewait(cm_id_priv->timewait_info);
+		spin_unlock_irq(&cm.lock);
+		cm_issue_rej(work->port, work->mad_recv_wc,
+			     IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REQ,
+			     NULL, 0);
+		return NULL;
+	}
+
+	/* Find matching listen request. */
+	listen_cm_id_priv = cm_find_listen(cm_id_priv->id.device,
+					   req_msg->service_id,
+					   req_msg->private_data);
+	if (!listen_cm_id_priv) {
+		/* Undo both tree inserts before rejecting. */
+		cm_cleanup_timewait(cm_id_priv->timewait_info);
+		spin_unlock_irq(&cm.lock);
+		cm_issue_rej(work->port, work->mad_recv_wc,
+			     IB_CM_REJ_INVALID_SERVICE_ID, CM_MSG_RESPONSE_REQ,
+			     NULL, 0);
+		/* listen_cm_id_priv is NULL here, so `out` returns NULL. */
+		goto out;
+	}
+	atomic_inc(&listen_cm_id_priv->refcount);
+	atomic_inc(&cm_id_priv->refcount);
+	cm_id_priv->id.state = IB_CM_REQ_RCVD;
+	atomic_inc(&cm_id_priv->work_count);
+	spin_unlock_irq(&cm.lock);
+out:
+	return listen_cm_id_priv;
+}
+
+/*
+ * Work-around for inter-subnet connections.  When a path in the REQ is not
+ * subnet-local and its LIDs are permissive, the LID/SL data in the REQ is
+ * replaced with the values from the work completion: the source LID and SL
+ * for the sender's local LID, and the DLID path bits for its remote LID.
+ * The primary and the alternate path are handled the same way.
+ */
+static void cm_process_routed_req(struct cm_req_msg *req_msg, struct ib_wc *wc)
+{
+	int routed;
+
+	routed = !cm_req_get_primary_subnet_local(req_msg);
+	if (routed && req_msg->primary_local_lid == IB_LID_PERMISSIVE) {
+		req_msg->primary_local_lid = cpu_to_be16(wc->slid);
+		cm_req_set_primary_sl(req_msg, wc->sl);
+	}
+	if (routed && req_msg->primary_remote_lid == IB_LID_PERMISSIVE)
+		req_msg->primary_remote_lid = cpu_to_be16(wc->dlid_path_bits);
+
+	routed = !cm_req_get_alt_subnet_local(req_msg);
+	if (routed && req_msg->alt_local_lid == IB_LID_PERMISSIVE) {
+		req_msg->alt_local_lid = cpu_to_be16(wc->slid);
+		cm_req_set_alt_sl(req_msg, wc->sl);
+	}
+	if (routed && req_msg->alt_remote_lid == IB_LID_PERMISSIVE)
+		req_msg->alt_remote_lid = cpu_to_be16(wc->dlid_path_bits);
+}
+
+/*
+ * Handle a received REQ.
+ *
+ * A new cm_id is created for the connection and given a timewait entry
+ * keyed by the sender's communication ID, CA GUID and QPN.  cm_match_req()
+ * then checks for duplicates and stale connections and finds a listener.
+ * On a match the new cm_id takes over the listener's handler and context,
+ * the address vectors are set up from the paths in the REQ, the connection
+ * parameters are recorded and the event is delivered via
+ * cm_process_work().
+ *
+ * Returns 0 once the event has been passed to cm_process_work().  On
+ * failure the new cm_id is destroyed and a negative errno is returned;
+ * -EINVAL is used when cm_match_req() finds no listener.
+ */
+static int cm_req_handler(struct cm_work *work)
+{
+	struct ib_cm_id *cm_id;
+	struct cm_id_private *cm_id_priv, *listen_cm_id_priv;
+	struct cm_req_msg *req_msg;
+	int ret;
+
+	req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
+
+	/* The handler and context are filled in once a listener is found. */
+	cm_id = ib_create_cm_id(work->port->cm_dev->ib_device, NULL, NULL);
+	if (IS_ERR(cm_id))
+		return PTR_ERR(cm_id);
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	cm_id_priv->id.remote_id = req_msg->local_comm_id;
+	cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
+				work->mad_recv_wc->recv_buf.grh,
+				&cm_id_priv->av);
+	cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv->
+							    id.local_id);
+	if (IS_ERR(cm_id_priv->timewait_info)) {
+		ret = PTR_ERR(cm_id_priv->timewait_info);
+		goto destroy;
+	}
+	cm_id_priv->timewait_info->work.remote_id = req_msg->local_comm_id;
+	cm_id_priv->timewait_info->remote_ca_guid = req_msg->local_ca_guid;
+	cm_id_priv->timewait_info->remote_qpn = cm_req_get_local_qpn(req_msg);
+
+	listen_cm_id_priv = cm_match_req(work, cm_id_priv);
+	if (!listen_cm_id_priv) {
+		ret = -EINVAL;
+		/* NOTE(review): the pointer is not cleared after this kfree(). */
+		kfree(cm_id_priv->timewait_info);
+		goto destroy;
+	}
+
+	cm_id_priv->id.cm_handler = listen_cm_id_priv->id.cm_handler;
+	cm_id_priv->id.context = listen_cm_id_priv->id.context;
+	cm_id_priv->id.service_id = req_msg->service_id;
+	cm_id_priv->id.service_mask = ~cpu_to_be64(0);
+
+	cm_process_routed_req(req_msg, work->mad_recv_wc->wc);
+	cm_format_paths_from_req(req_msg, &work->path[0], &work->path[1]);
+	ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av);
+	if (ret) {
+		/* Reject with the port's cached GID at index 0 as the ARI. */
+		ib_get_cached_gid(work->port->cm_dev->ib_device,
+				  work->port->port_num, 0, &work->path[0].sgid);
+		ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID,
+			       &work->path[0].sgid, sizeof work->path[0].sgid,
+			       NULL, 0);
+		goto rejected;
+	}
+	if (req_msg->alt_local_lid) {
+		ret = cm_init_av_by_path(&work->path[1], &cm_id_priv->alt_av);
+		if (ret) {
+			/* NOTE(review): the ARI is the primary sgid, path[0]. */
+			ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_ALT_GID,
+				       &work->path[0].sgid,
+				       sizeof work->path[0].sgid, NULL, 0);
+			goto rejected;
+		}
+	}
+	cm_id_priv->tid = req_msg->hdr.tid;
+	cm_id_priv->timeout_ms = cm_convert_to_ms(
+					cm_req_get_local_resp_timeout(req_msg));
+	/* Cap the timeout if the peer asked for more than max_timeout. */
+       if (cm_req_get_local_resp_timeout(req_msg) > (u8) max_timeout) {
+               printk(KERN_WARNING PFX "rcvd cm_local_resp_timeout %d > %d, "
+                      "decreasing used timeout_ms\n",
+                      cm_req_get_local_resp_timeout(req_msg), max_timeout);
+               cm_id_priv->timeout_ms = cm_convert_to_ms(max_timeout);
+       }
+
+	/* The sender's resp_res/init_depth are stored swapped (our view). */
+	cm_id_priv->max_cm_retries = cm_req_get_max_cm_retries(req_msg);
+	cm_id_priv->remote_qpn = cm_req_get_local_qpn(req_msg);
+	cm_id_priv->initiator_depth = cm_req_get_resp_res(req_msg);
+	cm_id_priv->responder_resources = cm_req_get_init_depth(req_msg);
+	cm_id_priv->path_mtu = cm_req_get_path_mtu(req_msg);
+	cm_id_priv->pkey = req_msg->pkey;
+	cm_id_priv->sq_psn = cm_req_get_starting_psn(req_msg);
+	cm_id_priv->retry_count = cm_req_get_retry_count(req_msg);
+	cm_id_priv->rnr_retry_count = cm_req_get_rnr_retry_count(req_msg);
+	cm_id_priv->qp_type = cm_req_get_qp_type(req_msg);
+
+	cm_format_req_event(work, cm_id_priv, &listen_cm_id_priv->id);
+	cm_process_work(cm_id_priv, work);
+	cm_deref_id(listen_cm_id_priv);
+	return 0;
+
+rejected:
+	/* Undo the refcount increment made in cm_match_req(). */
+	atomic_dec(&cm_id_priv->refcount);
+	cm_deref_id(listen_cm_id_priv);
+destroy:
+	ib_destroy_cm_id(cm_id);
+	return ret;
+}
+
+/*
+ * Build a REP message in @rep_msg from the connection state held in
+ * @cm_id_priv and the reply parameters supplied by the caller in @param.
+ * Private data is copied only when both a buffer and a non-zero length
+ * are provided.
+ */
+static void cm_format_rep(struct cm_rep_msg *rep_msg,
+			  struct cm_id_private *cm_id_priv,
+			  struct ib_cm_rep_param *param)
+{
+	struct ib_cm_id *id = &cm_id_priv->id;
+
+	cm_format_mad_hdr(&rep_msg->hdr, CM_REP_ATTR_ID, cm_id_priv->tid);
+
+	/* Fields that are stored directly. */
+	rep_msg->local_comm_id = id->local_id;
+	rep_msg->remote_comm_id = id->remote_id;
+	rep_msg->resp_resources = param->responder_resources;
+	rep_msg->initiator_depth = param->initiator_depth;
+	rep_msg->local_ca_guid = id->device->node_guid;
+
+	/* Fields that are stored through the cm_rep_set_*() accessors. */
+	cm_rep_set_local_qpn(rep_msg, cpu_to_be32(param->qp_num));
+	cm_rep_set_starting_psn(rep_msg, cpu_to_be32(param->starting_psn));
+	cm_rep_set_target_ack_delay(rep_msg,
+				    cm_id_priv->av.port->cm_dev->ack_delay);
+	cm_rep_set_failover(rep_msg, param->failover_accepted);
+	cm_rep_set_flow_ctrl(rep_msg, param->flow_control);
+	cm_rep_set_rnr_retry_count(rep_msg, param->rnr_retry_count);
+	cm_rep_set_srq(rep_msg, param->srq);
+
+	if (!param->private_data || !param->private_data_len)
+		return;
+
+	memcpy(rep_msg->private_data, param->private_data,
+	       param->private_data_len);
+}
+
+/*
+ * Send a REP for @cm_id using the values in @param.
+ *
+ * Returns -EINVAL when private data is supplied with a length above
+ * IB_CM_REP_PRIVATE_DATA_SIZE, or when the id is not in the
+ * IB_CM_REQ_RCVD or IB_CM_MRA_REQ_SENT state.  Errors from
+ * cm_alloc_msg() and ib_post_send_mad() are returned unchanged.
+ * On success the id moves to IB_CM_REP_SENT and 0 is returned.
+ */
+int ib_send_cm_rep(struct ib_cm_id *cm_id,
+		   struct ib_cm_rep_param *param)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_mad_send_buf *msg;
+	struct cm_rep_msg *rep_msg;
+	unsigned long flags;
+	int ret;
+
+	if (param->private_data &&
+	    param->private_data_len > IB_CM_REP_PRIVATE_DATA_SIZE)
+		return -EINVAL;
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	/* A REP may only follow a received REQ (optionally already MRA'd). */
+	if (cm_id->state != IB_CM_REQ_RCVD &&
+	    cm_id->state != IB_CM_MRA_REQ_SENT) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = cm_alloc_msg(cm_id_priv, &msg);
+	if (ret)
+		goto out;
+
+	rep_msg = (struct cm_rep_msg *) msg->mad;
+	cm_format_rep(rep_msg, cm_id_priv, param);
+	msg->timeout_ms = cm_id_priv->timeout_ms;
+	/* context[1] carries the CM state associated with this send. */
+	msg->context[1] = (void *) (unsigned long) IB_CM_REP_SENT;
+
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret) {
+		/* Drop the lock before releasing the unsent message. */
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		cm_free_msg(msg);
+		return ret;
+	}
+
+	/* The send is posted: record the new state and negotiated values. */
+	cm_id->state = IB_CM_REP_SENT;
+	cm_id_priv->msg = msg;
+	cm_id_priv->initiator_depth = param->initiator_depth;
+	cm_id_priv->responder_resources = param->responder_resources;
+	cm_id_priv->rq_psn = cm_rep_get_starting_psn(rep_msg);
+	cm_id_priv->local_qpn = cm_rep_get_local_qpn(rep_msg);
+
+out:	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_rep);
+
+/*
+ * Build an RTU message in @rtu_msg for the connection described by
+ * @cm_id_priv.  Private data is copied only when both a buffer and a
+ * non-zero length are provided.
+ */
+static void cm_format_rtu(struct cm_rtu_msg *rtu_msg,
+			  struct cm_id_private *cm_id_priv,
+			  const void *private_data,
+			  u8 private_data_len)
+{
+	struct ib_cm_id *id = &cm_id_priv->id;
+
+	cm_format_mad_hdr(&rtu_msg->hdr, CM_RTU_ATTR_ID, cm_id_priv->tid);
+	rtu_msg->local_comm_id = id->local_id;
+	rtu_msg->remote_comm_id = id->remote_id;
+
+	if (!private_data || !private_data_len)
+		return;
+
+	memcpy(rtu_msg->private_data, private_data, private_data_len);
+}
+
+/*
+ * Send an RTU for @cm_id with optional private data.
+ *
+ * Returns -EINVAL when private data is supplied with a length above
+ * IB_CM_RTU_PRIVATE_DATA_SIZE, or when the id is not in the
+ * IB_CM_REP_RCVD or IB_CM_MRA_REP_SENT state.  Errors from
+ * cm_copy_private_data(), cm_alloc_msg() and ib_post_send_mad() are
+ * returned unchanged.  On success the id moves to IB_CM_ESTABLISHED,
+ * the copied private data is handed to cm_set_private_data(), and 0 is
+ * returned.
+ */
+int ib_send_cm_rtu(struct ib_cm_id *cm_id,
+		   const void *private_data,
+		   u8 private_data_len)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_mad_send_buf *msg;
+	unsigned long flags;
+	void *data;
+	int ret;
+
+	if (private_data && private_data_len > IB_CM_RTU_PRIVATE_DATA_SIZE)
+		return -EINVAL;
+
+	/* Copy the private data before taking the lock. */
+	data = cm_copy_private_data(private_data, private_data_len);
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id->state != IB_CM_REP_RCVD &&
+	    cm_id->state != IB_CM_MRA_REP_SENT) {
+		ret = -EINVAL;
+		goto error;
+	}
+
+	ret = cm_alloc_msg(cm_id_priv, &msg);
+	if (ret)
+		goto error;
+
+	cm_format_rtu((struct cm_rtu_msg *) msg->mad, cm_id_priv,
+		      private_data, private_data_len);
+
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret) {
+		/* Unlock first, then release the message and the copy. */
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		cm_free_msg(msg);
+		kfree(data);
+		return ret;
+	}
+
+	cm_id->state = IB_CM_ESTABLISHED;
+	/* The copy is passed on to cm_set_private_data(); not freed here. */
+	cm_set_private_data(cm_id_priv, data, private_data_len);
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return 0;
+
+error:	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	kfree(data);
+	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_rtu);
+
+/*
+ * Populate the rep_rcvd event parameters of @work from the REP message
+ * held in its receive buffer.  The sender's initiator depth is reported
+ * as our responder resources and vice versa.
+ */
+static void cm_format_rep_event(struct cm_work *work)
+{
+	struct cm_rep_msg *rep;
+	struct ib_cm_rep_event_param *ev;
+
+	rep = (struct cm_rep_msg *)work->mad_recv_wc->recv_buf.mad;
+	ev = &work->cm_event.param.rep_rcvd;
+
+	/* Values taken straight from message fields. */
+	ev->remote_ca_guid = rep->local_ca_guid;
+	ev->remote_qkey = be32_to_cpu(rep->local_qkey);
+	ev->responder_resources = rep->initiator_depth;
+	ev->initiator_depth = rep->resp_resources;
+
+	/* Values extracted through the cm_rep_get_*() accessors. */
+	ev->remote_qpn = be32_to_cpu(cm_rep_get_local_qpn(rep));
+	ev->starting_psn = be32_to_cpu(cm_rep_get_starting_psn(rep));
+	ev->target_ack_delay = cm_rep_get_target_ack_delay(rep);
+	ev->failover_accepted = cm_rep_get_failover(rep);
+	ev->flow_control = cm_rep_get_flow_ctrl(rep);
+	ev->rnr_retry_count = cm_rep_get_rnr_retry_count(rep);
+	ev->srq = cm_rep_get_srq(rep);
+
+	work->cm_event.private_data = &rep->private_data;
+}
+
+/*
+ * Respond to a REP that cm_rep_handler() could not match as a first REP.
+ * The id is looked up by both comm ids; if it is found, the duplicate
+ * counter is bumped and the previously sent answer is repeated: an RTU
+ * when the id is IB_CM_ESTABLISHED, an MRA when it is IB_CM_MRA_REP_SENT.
+ * In any other state nothing is sent.
+ */
+static void cm_dup_rep_handler(struct cm_work *work)
+{
+	struct cm_id_private *cm_id_priv;
+	struct cm_rep_msg *rep_msg;
+	struct ib_mad_send_buf *msg = NULL;
+	int ret;
+
+	rep_msg = (struct cm_rep_msg *) work->mad_recv_wc->recv_buf.mad;
+	cm_id_priv = cm_acquire_id(rep_msg->remote_comm_id,
+				   rep_msg->local_comm_id);
+	if (!cm_id_priv)
+		return;
+
+	atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+			counter[CM_REP_COUNTER]);
+	ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg);
+	if (ret)
+		goto deref;
+
+	/* Format the response under the id lock, using the saved private data. */
+	spin_lock_irq(&cm_id_priv->lock);
+	if (cm_id_priv->id.state == IB_CM_ESTABLISHED)
+		cm_format_rtu((struct cm_rtu_msg *) msg->mad, cm_id_priv,
+			      cm_id_priv->private_data,
+			      cm_id_priv->private_data_len);
+	else if (cm_id_priv->id.state == IB_CM_MRA_REP_SENT)
+		cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
+			      CM_MSG_RESPONSE_REP, cm_id_priv->service_timeout,
+			      cm_id_priv->private_data,
+			      cm_id_priv->private_data_len);
+	else
+		goto unlock;
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret)
+		goto free;
+	/* Posted successfully: skip the free and only drop the reference. */
+	goto deref;
+
+unlock:	spin_unlock_irq(&cm_id_priv->lock);
+free:	cm_free_msg(msg);
+deref:	cm_deref_id(cm_id_priv);
+}
+
+/*
+ * Handle a received REP.
+ *
+ * The id is looked up by the REP's remote_comm_id with a remote id of 0;
+ * when that fails the REP is passed to cm_dup_rep_handler() and -EINVAL
+ * is returned.  The REP is accepted only in the IB_CM_REQ_SENT and
+ * IB_CM_MRA_REQ_RCVD states and only if its remote id and remote QPN can
+ * be inserted into the global tables.  On acceptance the id moves to
+ * IB_CM_REP_RCVD, the outstanding send is cancelled and the work is
+ * either processed or queued.  Returns 0 on acceptance, -EINVAL otherwise.
+ */
+static int cm_rep_handler(struct cm_work *work)
+{
+	struct cm_id_private *cm_id_priv;
+	struct cm_rep_msg *rep_msg;
+	int ret;
+
+	rep_msg = (struct cm_rep_msg *)work->mad_recv_wc->recv_buf.mad;
+	cm_id_priv = cm_acquire_id(rep_msg->remote_comm_id, 0);
+	if (!cm_id_priv) {
+		cm_dup_rep_handler(work);
+		return -EINVAL;
+	}
+
+	cm_format_rep_event(work);
+
+	spin_lock_irq(&cm_id_priv->lock);
+	switch (cm_id_priv->id.state) {
+	case IB_CM_REQ_SENT:
+	case IB_CM_MRA_REQ_RCVD:
+		break;
+	default:
+		spin_unlock_irq(&cm_id_priv->lock);
+		ret = -EINVAL;
+		goto error;
+	}
+
+	/* Record the peer's identity in the timewait info before inserting it. */
+	cm_id_priv->timewait_info->work.remote_id = rep_msg->local_comm_id;
+	cm_id_priv->timewait_info->remote_ca_guid = rep_msg->local_ca_guid;
+	cm_id_priv->timewait_info->remote_qpn = cm_rep_get_local_qpn(rep_msg);
+
+	/* cm.lock nests inside the per-id lock here. */
+	spin_lock(&cm.lock);
+	/* Check for duplicate REP. */
+	if (cm_insert_remote_id(cm_id_priv->timewait_info)) {
+		spin_unlock(&cm.lock);
+		spin_unlock_irq(&cm_id_priv->lock);
+		ret = -EINVAL;
+		goto error;
+	}
+	/* Check for a stale connection. */
+	if (cm_insert_remote_qpn(cm_id_priv->timewait_info)) {
+		/* Undo the remote-id insertion made just above. */
+		rb_erase(&cm_id_priv->timewait_info->remote_id_node,
+			 &cm.remote_id_table);
+		cm_id_priv->timewait_info->inserted_remote_id = 0;
+		spin_unlock(&cm.lock);
+		spin_unlock_irq(&cm_id_priv->lock);
+		/* Both locks are released before the REJ is issued. */
+		cm_issue_rej(work->port, work->mad_recv_wc,
+			     IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REP,
+			     NULL, 0);
+		ret = -EINVAL;
+		goto error;
+	}
+	spin_unlock(&cm.lock);
+
+	cm_id_priv->id.state = IB_CM_REP_RCVD;
+	cm_id_priv->id.remote_id = rep_msg->local_comm_id;
+	cm_id_priv->remote_qpn = cm_rep_get_local_qpn(rep_msg);
+	/* The peer's responder resources become our initiator depth and vice versa. */
+	cm_id_priv->initiator_depth = rep_msg->resp_resources;
+	cm_id_priv->responder_resources = rep_msg->initiator_depth;
+	cm_id_priv->sq_psn = cm_rep_get_starting_psn(rep_msg);
+	cm_id_priv->rnr_retry_count = cm_rep_get_rnr_retry_count(rep_msg);
+	cm_id_priv->target_ack_delay = cm_rep_get_target_ack_delay(rep_msg);
+	/* Recompute both AV timeouts now that the target ack delay is known. */
+	cm_id_priv->av.timeout =
+			cm_ack_timeout(cm_id_priv->target_ack_delay,
+				       cm_id_priv->av.timeout - 1);
+	cm_id_priv->alt_av.timeout =
+			cm_ack_timeout(cm_id_priv->target_ack_delay,
+				       cm_id_priv->alt_av.timeout - 1);
+
+	/* todo: handle peer_to_peer */
+
+	ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+	/*
+	 * When atomic_inc_and_test() reports false the work is appended to
+	 * work_list and only our reference is dropped below; otherwise the
+	 * work is processed here.
+	 */
+	ret = atomic_inc_and_test(&cm_id_priv->work_count);
+	if (!ret)
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	if (ret)
+		cm_process_work(cm_id_priv, work);
+	else
+		cm_deref_id(cm_id_priv);
+	return 0;
+
+error:
+	cm_deref_id(cm_id_priv);
+	return ret;
+}
+
+/*
+ * Handle an "established" work item identified by the local and remote
+ * ids stored in @work.  The work is accepted only when the id is already
+ * in the IB_CM_ESTABLISHED state; the outstanding send is then cancelled
+ * and the work is processed or queued.  Returns 0 on acceptance and
+ * -EINVAL when the id is not found or is in another state.
+ */
+static int cm_establish_handler(struct cm_work *work)
+{
+	struct cm_id_private *cm_id_priv;
+	int ret;
+
+	/* See comment in cm_establish about lookup. */
+	cm_id_priv = cm_acquire_id(work->local_id, work->remote_id);
+	if (!cm_id_priv)
+		return -EINVAL;
+
+	spin_lock_irq(&cm_id_priv->lock);
+	if (cm_id_priv->id.state != IB_CM_ESTABLISHED) {
+		spin_unlock_irq(&cm_id_priv->lock);
+		goto out;
+	}
+
+	ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+	/* Queue the work if atomic_inc_and_test() is false, else run it here. */
+	ret = atomic_inc_and_test(&cm_id_priv->work_count);
+	if (!ret)
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	if (ret)
+		cm_process_work(cm_id_priv, work);
+	else
+		cm_deref_id(cm_id_priv);
+	return 0;
+out:
+	cm_deref_id(cm_id_priv);
+	return -EINVAL;
+}
+
+/*
+ * Handle a received RTU.  The RTU is accepted only in the IB_CM_REP_SENT
+ * and IB_CM_MRA_REP_RCVD states; in any other state it is counted as a
+ * duplicate and dropped.  On acceptance the id moves to
+ * IB_CM_ESTABLISHED, the outstanding send is cancelled and the work is
+ * processed or queued.  Returns 0 on acceptance, -EINVAL otherwise.
+ */
+static int cm_rtu_handler(struct cm_work *work)
+{
+	struct cm_id_private *cm_id_priv;
+	struct cm_rtu_msg *rtu_msg;
+	int ret;
+
+	rtu_msg = (struct cm_rtu_msg *)work->mad_recv_wc->recv_buf.mad;
+	cm_id_priv = cm_acquire_id(rtu_msg->remote_comm_id,
+				   rtu_msg->local_comm_id);
+	if (!cm_id_priv)
+		return -EINVAL;
+
+	/* The event's private data points into the received MAD. */
+	work->cm_event.private_data = &rtu_msg->private_data;
+
+	spin_lock_irq(&cm_id_priv->lock);
+	if (cm_id_priv->id.state != IB_CM_REP_SENT &&
+	    cm_id_priv->id.state != IB_CM_MRA_REP_RCVD) {
+		spin_unlock_irq(&cm_id_priv->lock);
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_RTU_COUNTER]);
+		goto out;
+	}
+	cm_id_priv->id.state = IB_CM_ESTABLISHED;
+
+	ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+	/* Queue the work if atomic_inc_and_test() is false, else run it here. */
+	ret = atomic_inc_and_test(&cm_id_priv->work_count);
+	if (!ret)
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	if (ret)
+		cm_process_work(cm_id_priv, work);
+	else
+		cm_deref_id(cm_id_priv);
+	return 0;
+out:
+	cm_deref_id(cm_id_priv);
+	return -EINVAL;
+}
+
+/*
+ * Build a DREQ message in @dreq_msg for the connection described by
+ * @cm_id_priv.  The transaction id is formed with CM_MSG_SEQUENCE_DREQ.
+ * Private data is copied only when both a buffer and a non-zero length
+ * are provided.
+ */
+static void cm_format_dreq(struct cm_dreq_msg *dreq_msg,
+			  struct cm_id_private *cm_id_priv,
+			  const void *private_data,
+			  u8 private_data_len)
+{
+	struct ib_cm_id *id = &cm_id_priv->id;
+
+	cm_format_mad_hdr(&dreq_msg->hdr, CM_DREQ_ATTR_ID,
+			  cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_DREQ));
+	dreq_msg->local_comm_id = id->local_id;
+	dreq_msg->remote_comm_id = id->remote_id;
+	cm_dreq_set_remote_qpn(dreq_msg, cm_id_priv->remote_qpn);
+
+	if (!private_data || !private_data_len)
+		return;
+
+	memcpy(dreq_msg->private_data, private_data, private_data_len);
+}
+
+/*
+ * Send a DREQ for @cm_id with optional private data.
+ *
+ * Returns -EINVAL when private data is supplied with a length above
+ * IB_CM_DREQ_PRIVATE_DATA_SIZE, or when the id is not in the
+ * IB_CM_ESTABLISHED state.  If the message cannot be allocated or
+ * posted, cm_enter_timewait() is still called on the id and the error
+ * is returned.  On success the id moves to IB_CM_DREQ_SENT and 0 is
+ * returned.
+ */
+int ib_send_cm_dreq(struct ib_cm_id *cm_id,
+		    const void *private_data,
+		    u8 private_data_len)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_mad_send_buf *msg;
+	unsigned long flags;
+	int ret;
+
+	if (private_data && private_data_len > IB_CM_DREQ_PRIVATE_DATA_SIZE)
+		return -EINVAL;
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id->state != IB_CM_ESTABLISHED) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = cm_alloc_msg(cm_id_priv, &msg);
+	if (ret) {
+		/* No DREQ can be sent; enter timewait regardless. */
+		cm_enter_timewait(cm_id_priv);
+		goto out;
+	}
+
+	cm_format_dreq((struct cm_dreq_msg *) msg->mad, cm_id_priv,
+		       private_data, private_data_len);
+	msg->timeout_ms = cm_id_priv->timeout_ms;
+	/* context[1] carries the CM state associated with this send. */
+	msg->context[1] = (void *) (unsigned long) IB_CM_DREQ_SENT;
+
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret) {
+		/* Enter timewait under the lock, then unlock and free. */
+		cm_enter_timewait(cm_id_priv);
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		cm_free_msg(msg);
+		return ret;
+	}
+
+	cm_id->state = IB_CM_DREQ_SENT;
+	cm_id_priv->msg = msg;
+out:	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_dreq);
+
+/*
+ * Build a DREP message in @drep_msg for the connection described by
+ * @cm_id_priv, reusing the transaction id stored in the id.  Private
+ * data is copied only when both a buffer and a non-zero length are
+ * provided.
+ */
+static void cm_format_drep(struct cm_drep_msg *drep_msg,
+			  struct cm_id_private *cm_id_priv,
+			  const void *private_data,
+			  u8 private_data_len)
+{
+	struct ib_cm_id *id = &cm_id_priv->id;
+
+	cm_format_mad_hdr(&drep_msg->hdr, CM_DREP_ATTR_ID, cm_id_priv->tid);
+	drep_msg->local_comm_id = id->local_id;
+	drep_msg->remote_comm_id = id->remote_id;
+
+	if (!private_data || !private_data_len)
+		return;
+
+	memcpy(drep_msg->private_data, private_data, private_data_len);
+}
+
+/*
+ * Send a DREP for @cm_id with optional private data.
+ *
+ * Returns -EINVAL when private data is supplied with a length above
+ * IB_CM_DREP_PRIVATE_DATA_SIZE, or when the id is not in the
+ * IB_CM_DREQ_RCVD state.  Once the state check passes, the copied
+ * private data is stored on the id and cm_enter_timewait() is called
+ * before any attempt to send, so a later allocation or post failure
+ * returns an error without undoing those steps.
+ */
+int ib_send_cm_drep(struct ib_cm_id *cm_id,
+		    const void *private_data,
+		    u8 private_data_len)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_mad_send_buf *msg;
+	unsigned long flags;
+	void *data;
+	int ret;
+
+	if (private_data && private_data_len > IB_CM_DREP_PRIVATE_DATA_SIZE)
+		return -EINVAL;
+
+	data = cm_copy_private_data(private_data, private_data_len);
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id->state != IB_CM_DREQ_RCVD) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		kfree(data);
+		return -EINVAL;
+	}
+
+	/* From here on the copy is held by the id; it is not freed below. */
+	cm_set_private_data(cm_id_priv, data, private_data_len);
+	cm_enter_timewait(cm_id_priv);
+
+	ret = cm_alloc_msg(cm_id_priv, &msg);
+	if (ret)
+		goto out;
+
+	cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv,
+		       private_data, private_data_len);
+
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret) {
+		/* Drop the lock before releasing the unsent message. */
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		cm_free_msg(msg);
+		return ret;
+	}
+
+out:	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_drep);
+
+/*
+ * Answer a received DREQ with a DREP built purely from the incoming
+ * message: the transaction id is echoed and the two comm ids are
+ * swapped.  Returns 0 when the DREP was posted, otherwise the error
+ * from cm_alloc_response_msg() or ib_post_send_mad().
+ */
+static int cm_issue_drep(struct cm_port *port,
+			 struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_mad_send_buf *msg = NULL;
+	struct cm_dreq_msg *req;
+	struct cm_drep_msg *rep;
+	int err;
+
+	err = cm_alloc_response_msg(port, mad_recv_wc, &msg);
+	if (err)
+		return err;
+
+	req = (struct cm_dreq_msg *) mad_recv_wc->recv_buf.mad;
+	rep = (struct cm_drep_msg *) msg->mad;
+
+	cm_format_mad_hdr(&rep->hdr, CM_DREP_ATTR_ID, req->hdr.tid);
+	/* The sender's local id is our remote id, and the reverse. */
+	rep->local_comm_id = req->remote_comm_id;
+	rep->remote_comm_id = req->local_comm_id;
+
+	err = ib_post_send_mad(msg, NULL);
+	if (!err)
+		return 0;
+
+	cm_free_msg(msg);
+	return err;
+}
+
+/*
+ * Handle a received DREQ.
+ *
+ * If no matching id exists, the DREQ is counted as a duplicate and
+ * answered through cm_issue_drep().  If the id exists but the DREQ's
+ * remote QPN differs from our local QPN, it is dropped.  Otherwise the
+ * action depends on the id state:
+ *   - REP_SENT / DREQ_SENT: cancel the outstanding send, then accept;
+ *   - ESTABLISHED / MRA_REP_RCVD: accept;
+ *   - TIMEWAIT: count a duplicate and resend a DREP with the saved
+ *     private data;
+ *   - DREQ_RCVD: count a duplicate and drop;
+ *   - anything else: drop.
+ * Accepting moves the id to IB_CM_DREQ_RCVD, records the DREQ's tid and
+ * processes or queues the work.  Returns 0 on acceptance, -EINVAL
+ * otherwise.
+ */
+static int cm_dreq_handler(struct cm_work *work)
+{
+	struct cm_id_private *cm_id_priv;
+	struct cm_dreq_msg *dreq_msg;
+	struct ib_mad_send_buf *msg = NULL;
+	int ret;
+
+	dreq_msg = (struct cm_dreq_msg *)work->mad_recv_wc->recv_buf.mad;
+	cm_id_priv = cm_acquire_id(dreq_msg->remote_comm_id,
+				   dreq_msg->local_comm_id);
+	if (!cm_id_priv) {
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_DREQ_COUNTER]);
+		/* The result of cm_issue_drep() is not checked here. */
+		cm_issue_drep(work->port, work->mad_recv_wc);
+		return -EINVAL;
+	}
+
+	work->cm_event.private_data = &dreq_msg->private_data;
+
+	spin_lock_irq(&cm_id_priv->lock);
+	if (cm_id_priv->local_qpn != cm_dreq_get_remote_qpn(dreq_msg))
+		goto unlock;
+
+	switch (cm_id_priv->id.state) {
+	case IB_CM_REP_SENT:
+	case IB_CM_DREQ_SENT:
+		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+		break;
+	case IB_CM_ESTABLISHED:
+	case IB_CM_MRA_REP_RCVD:
+		break;
+	case IB_CM_TIMEWAIT:
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_DREQ_COUNTER]);
+		if (cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg))
+			goto unlock;
+
+		cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv,
+			       cm_id_priv->private_data,
+			       cm_id_priv->private_data_len);
+		/* The lock is released before the DREP is posted. */
+		spin_unlock_irq(&cm_id_priv->lock);
+
+		if (ib_post_send_mad(msg, NULL))
+			cm_free_msg(msg);
+		goto deref;
+	case IB_CM_DREQ_RCVD:
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_DREQ_COUNTER]);
+		goto unlock;
+	default:
+		goto unlock;
+	}
+	cm_id_priv->id.state = IB_CM_DREQ_RCVD;
+	/* Remember the DREQ's tid; cm_format_drep() uses cm_id_priv->tid. */
+	cm_id_priv->tid = dreq_msg->hdr.tid;
+	/* Queue the work if atomic_inc_and_test() is false, else run it here. */
+	ret = atomic_inc_and_test(&cm_id_priv->work_count);
+	if (!ret)
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	if (ret)
+		cm_process_work(cm_id_priv, work);
+	else
+		cm_deref_id(cm_id_priv);
+	return 0;
+
+unlock:	spin_unlock_irq(&cm_id_priv->lock);
+deref:	cm_deref_id(cm_id_priv);
+	return -EINVAL;
+}
+
+/*
+ * Handle a received DREP.  The DREP is accepted only in the
+ * IB_CM_DREQ_SENT and IB_CM_DREQ_RCVD states; cm_enter_timewait() is
+ * then called, the outstanding send is cancelled and the work is
+ * processed or queued.  Returns 0 on acceptance and -EINVAL when the id
+ * is not found or is in another state.
+ */
+static int cm_drep_handler(struct cm_work *work)
+{
+	struct cm_id_private *cm_id_priv;
+	struct cm_drep_msg *drep_msg;
+	int ret;
+
+	drep_msg = (struct cm_drep_msg *)work->mad_recv_wc->recv_buf.mad;
+	cm_id_priv = cm_acquire_id(drep_msg->remote_comm_id,
+				   drep_msg->local_comm_id);
+	if (!cm_id_priv)
+		return -EINVAL;
+
+	/* The event's private data points into the received MAD. */
+	work->cm_event.private_data = &drep_msg->private_data;
+
+	spin_lock_irq(&cm_id_priv->lock);
+	if (cm_id_priv->id.state != IB_CM_DREQ_SENT &&
+	    cm_id_priv->id.state != IB_CM_DREQ_RCVD) {
+		spin_unlock_irq(&cm_id_priv->lock);
+		goto out;
+	}
+	cm_enter_timewait(cm_id_priv);
+
+	ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+	/* Queue the work if atomic_inc_and_test() is false, else run it here. */
+	ret = atomic_inc_and_test(&cm_id_priv->work_count);
+	if (!ret)
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	if (ret)
+		cm_process_work(cm_id_priv, work);
+	else
+		cm_deref_id(cm_id_priv);
+	return 0;
+out:
+	cm_deref_id(cm_id_priv);
+	return -EINVAL;
+}
+
+/*
+ * Send a REJ for @cm_id with the given @reason, optional additional
+ * rejection information (@ari) and optional private data.
+ *
+ * Returns -EINVAL when the private data or ARI length exceeds its limit
+ * or when the id is in a state from which a REJ cannot be sent.  For the
+ * REQ and REP_RCVD families of states the id is reset to idle; for
+ * IB_CM_REP_SENT and IB_CM_MRA_REP_RCVD it enters timewait.  That state
+ * change is made even if the message could not be allocated, in which
+ * case the allocation error is returned.
+ */
+int ib_send_cm_rej(struct ib_cm_id *cm_id,
+		   enum ib_cm_rej_reason reason,
+		   void *ari,
+		   u8 ari_length,
+		   const void *private_data,
+		   u8 private_data_len)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_mad_send_buf *msg;
+	unsigned long flags;
+	int to_timewait;
+	int ret;
+
+	if (private_data && private_data_len > IB_CM_REJ_PRIVATE_DATA_SIZE)
+		return -EINVAL;
+	if (ari && ari_length > IB_CM_REJ_ARI_LENGTH)
+		return -EINVAL;
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+
+	/* Decide which state the id ends up in after the REJ. */
+	switch (cm_id->state) {
+	case IB_CM_REQ_SENT:
+	case IB_CM_MRA_REQ_RCVD:
+	case IB_CM_REQ_RCVD:
+	case IB_CM_MRA_REQ_SENT:
+	case IB_CM_REP_RCVD:
+	case IB_CM_MRA_REP_SENT:
+		to_timewait = 0;
+		break;
+	case IB_CM_REP_SENT:
+	case IB_CM_MRA_REP_RCVD:
+		to_timewait = 1;
+		break;
+	default:
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* The message is formatted while the id is still in its old state. */
+	ret = cm_alloc_msg(cm_id_priv, &msg);
+	if (!ret)
+		cm_format_rej((struct cm_rej_msg *) msg->mad,
+			      cm_id_priv, reason, ari, ari_length,
+			      private_data, private_data_len);
+
+	if (to_timewait)
+		cm_enter_timewait(cm_id_priv);
+	else
+		cm_reset_to_idle(cm_id_priv);
+
+	if (ret)
+		goto out;
+
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret)
+		cm_free_msg(msg);
+
+out:	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_rej);
+
+/*
+ * Populate the rej_rcvd event parameters of @work from the REJ message
+ * held in its receive buffer.  The ARI and private-data pointers refer
+ * directly into the received MAD.
+ */
+static void cm_format_rej_event(struct cm_work *work)
+{
+	struct cm_rej_msg *rej;
+	struct ib_cm_rej_event_param *ev;
+
+	rej = (struct cm_rej_msg *)work->mad_recv_wc->recv_buf.mad;
+	ev = &work->cm_event.param.rej_rcvd;
+
+	ev->reason = __be16_to_cpu(rej->reason);
+	ev->ari = rej->ari;
+	ev->ari_length = cm_rej_get_reject_info_len(rej);
+
+	work->cm_event.private_data = &rej->private_data;
+}
+
+/*
+ * Find the id that a received REJ refers to and return it with a
+ * reference held, or NULL when no matching id exists.
+ *
+ * The lookup depends on the REJ:
+ *   - reason IB_CM_REJ_TIMEOUT: the timewait entry is located through
+ *     cm_find_remote_id() using the first eight bytes of the ARI and the
+ *     sender's comm id, then mapped back to an id via cm.local_id_table;
+ *   - a rejected REQ: lookup by our comm id with a remote id of 0;
+ *   - anything else: lookup by both comm ids.
+ */
+static struct cm_id_private * cm_acquire_rejected_id(struct cm_rej_msg *rej_msg)
+{
+	struct cm_timewait_info *timewait_info;
+	struct cm_id_private *cm_id_priv;
+	__be32 remote_id;
+
+	remote_id = rej_msg->local_comm_id;
+
+	if (__be16_to_cpu(rej_msg->reason) == IB_CM_REJ_TIMEOUT) {
+		spin_lock_irq(&cm.lock);
+		/*
+		 * NOTE(review): the ARI is read as a __be64, presumably the
+		 * remote CA GUID -- confirm against cm_find_remote_id().
+		 */
+		timewait_info = cm_find_remote_id( *((__be64 *) rej_msg->ari),
+						  remote_id);
+		if (!timewait_info) {
+			spin_unlock_irq(&cm.lock);
+			return NULL;
+		}
+		/* The idr key is the local id XORed with cm.random_id_operand. */
+		cm_id_priv = idr_find(&cm.local_id_table, (__force int)
+				      (timewait_info->work.local_id ^
+				       cm.random_id_operand));
+		if (cm_id_priv) {
+			/* Take a reference only if the remote id still matches. */
+			if (cm_id_priv->id.remote_id == remote_id)
+				atomic_inc(&cm_id_priv->refcount);
+			else
+				cm_id_priv = NULL;
+		}
+		spin_unlock_irq(&cm.lock);
+	} else if (cm_rej_get_msg_rejected(rej_msg) == CM_MSG_RESPONSE_REQ)
+		cm_id_priv = cm_acquire_id(rej_msg->remote_comm_id, 0);
+	else
+		cm_id_priv = cm_acquire_id(rej_msg->remote_comm_id, remote_id);
+
+	return cm_id_priv;
+}
+
+/*
+ * Handle a received REJ.
+ *
+ * State handling:
+ *   - REQ_SENT / MRA_REQ_RCVD / REP_SENT / MRA_REP_RCVD: cancel the
+ *     outstanding send, then treat like REQ_RCVD;
+ *   - REQ_RCVD / MRA_REQ_SENT: enter timewait when the reason is
+ *     IB_CM_REJ_STALE_CONN, otherwise reset to idle;
+ *   - DREQ_SENT: cancel the outstanding send, then enter timewait;
+ *   - REP_RCVD / MRA_REP_SENT / ESTABLISHED: enter timewait;
+ *   - anything else: the REJ is dropped.
+ * Returns 0 when the REJ was accepted and -EINVAL otherwise.
+ */
+static int cm_rej_handler(struct cm_work *work)
+{
+	struct cm_id_private *cm_id_priv;
+	struct cm_rej_msg *rej_msg;
+	int ret;
+
+	rej_msg = (struct cm_rej_msg *)work->mad_recv_wc->recv_buf.mad;
+	cm_id_priv = cm_acquire_rejected_id(rej_msg);
+	if (!cm_id_priv)
+		return -EINVAL;
+
+	cm_format_rej_event(work);
+
+	spin_lock_irq(&cm_id_priv->lock);
+	switch (cm_id_priv->id.state) {
+	case IB_CM_REQ_SENT:
+	case IB_CM_MRA_REQ_RCVD:
+	case IB_CM_REP_SENT:
+	case IB_CM_MRA_REP_RCVD:
+		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+		/* fall through */
+	case IB_CM_REQ_RCVD:
+	case IB_CM_MRA_REQ_SENT:
+		if (__be16_to_cpu(rej_msg->reason) == IB_CM_REJ_STALE_CONN)
+			cm_enter_timewait(cm_id_priv);
+		else
+			cm_reset_to_idle(cm_id_priv);
+		break;
+	case IB_CM_DREQ_SENT:
+		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+		/* fall through */
+	case IB_CM_REP_RCVD:
+	case IB_CM_MRA_REP_SENT:
+	case IB_CM_ESTABLISHED:
+		cm_enter_timewait(cm_id_priv);
+		break;
+	default:
+		spin_unlock_irq(&cm_id_priv->lock);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* Queue the work if atomic_inc_and_test() is false, else run it here. */
+	ret = atomic_inc_and_test(&cm_id_priv->work_count);
+	if (!ret)
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	if (ret)
+		cm_process_work(cm_id_priv, work);
+	else
+		cm_deref_id(cm_id_priv);
+	return 0;
+out:
+	cm_deref_id(cm_id_priv);
+	return -EINVAL;
+}
+
+/*
+ * Send an MRA for @cm_id, or just record the MRA state when
+ * IB_CM_MRA_FLAG_DELAY is set in @service_timeout.
+ *
+ * Permitted when the id is in IB_CM_REQ_RCVD, IB_CM_REP_RCVD, or
+ * IB_CM_ESTABLISHED with lap_state IB_CM_LAP_RCVD; any other state
+ * yields -EINVAL.  -EINVAL is also returned when private data is
+ * supplied with a length above IB_CM_MRA_PRIVATE_DATA_SIZE.  On success
+ * the new state, lap_state, service timeout and a copy of the private
+ * data are stored on the id and 0 is returned.
+ */
+int ib_send_cm_mra(struct ib_cm_id *cm_id,
+		   u8 service_timeout,
+		   const void *private_data,
+		   u8 private_data_len)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_mad_send_buf *msg;
+	enum ib_cm_state cm_state;
+	enum ib_cm_lap_state lap_state;
+	enum cm_msg_response msg_response;
+	void *data;
+	unsigned long flags;
+	int ret;
+
+	if (private_data && private_data_len > IB_CM_MRA_PRIVATE_DATA_SIZE)
+		return -EINVAL;
+
+	data = cm_copy_private_data(private_data, private_data_len);
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	/* Work out the target state pair and which message is being MRA'd. */
+	switch(cm_id_priv->id.state) {
+	case IB_CM_REQ_RCVD:
+		cm_state = IB_CM_MRA_REQ_SENT;
+		lap_state = cm_id->lap_state;
+		msg_response = CM_MSG_RESPONSE_REQ;
+		break;
+	case IB_CM_REP_RCVD:
+		cm_state = IB_CM_MRA_REP_SENT;
+		lap_state = cm_id->lap_state;
+		msg_response = CM_MSG_RESPONSE_REP;
+		break;
+	case IB_CM_ESTABLISHED:
+		if (cm_id->lap_state == IB_CM_LAP_RCVD) {
+			cm_state = cm_id->state;
+			lap_state = IB_CM_MRA_LAP_SENT;
+			msg_response = CM_MSG_RESPONSE_OTHER;
+			break;
+		}
+		/* fall through: established without a pending LAP is invalid */
+	default:
+		ret = -EINVAL;
+		goto error1;
+	}
+
+	/* With IB_CM_MRA_FLAG_DELAY set, no MRA is sent at this point. */
+	if (!(service_timeout & IB_CM_MRA_FLAG_DELAY)) {
+		ret = cm_alloc_msg(cm_id_priv, &msg);
+		if (ret)
+			goto error1;
+
+		cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
+			      msg_response, service_timeout,
+			      private_data, private_data_len);
+		ret = ib_post_send_mad(msg, NULL);
+		if (ret)
+			goto error2;
+	}
+
+	cm_id->state = cm_state;
+	cm_id->lap_state = lap_state;
+	cm_id_priv->service_timeout = service_timeout;
+	/* The copy is passed on to cm_set_private_data(); not freed here. */
+	cm_set_private_data(cm_id_priv, data, private_data_len);
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return 0;
+
+	/* error1: no message to release. */
+error1:	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	kfree(data);
+	return ret;
+
+	/* error2: the message was allocated but could not be posted. */
+error2:	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	kfree(data);
+	cm_free_msg(msg);
+	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_mra);
+
+/*
+ * Find the id that a received MRA refers to and return the result of
+ * cm_acquire_id() for it.  An MRA for a REQ is looked up with a remote
+ * id of 0; an MRA for a REP or any other message is looked up with both
+ * comm ids.  An unrecognised "message MRAed" value yields NULL.
+ */
+static struct cm_id_private * cm_acquire_mraed_id(struct cm_mra_msg *mra_msg)
+{
+	int mraed = cm_mra_get_msg_mraed(mra_msg);
+
+	if (mraed == CM_MSG_RESPONSE_REQ)
+		return cm_acquire_id(mra_msg->remote_comm_id, 0);
+
+	if (mraed == CM_MSG_RESPONSE_REP || mraed == CM_MSG_RESPONSE_OTHER)
+		return cm_acquire_id(mra_msg->remote_comm_id,
+				     mra_msg->local_comm_id);
+
+	return NULL;
+}
+
+/*
+ * Handle a received MRA.
+ *
+ * The new timeout for the outstanding send is the MRA's service timeout
+ * plus the primary AV timeout, both converted to milliseconds and
+ * capped at cm_convert_to_ms(max_timeout).  The MRA is accepted only
+ * when it names the message we are waiting on (REQ in REQ_SENT, REP in
+ * REP_SENT, "other" in ESTABLISHED with lap_state LAP_SENT) and
+ * ib_modify_mad() succeeds.  Returns 0 on acceptance and -EINVAL
+ * otherwise.
+ */
+static int cm_mra_handler(struct cm_work *work)
+{
+	struct cm_id_private *cm_id_priv;
+	struct cm_mra_msg *mra_msg;
+	int timeout, ret;
+
+	mra_msg = (struct cm_mra_msg *)work->mad_recv_wc->recv_buf.mad;
+	cm_id_priv = cm_acquire_mraed_id(mra_msg);
+	if (!cm_id_priv)
+		return -EINVAL;
+
+	work->cm_event.private_data = &mra_msg->private_data;
+	work->cm_event.param.mra_rcvd.service_timeout =
+					cm_mra_get_service_timeout(mra_msg);
+	timeout = cm_convert_to_ms(cm_mra_get_service_timeout(mra_msg)) +
+		  cm_convert_to_ms(cm_id_priv->av.timeout);
+	/* Clamp the computed timeout to the configured maximum. */
+       if (timeout > cm_convert_to_ms(max_timeout)) {
+               printk(KERN_WARNING PFX "calculated mra timeout %d > %d, "
+                      "decreasing used timeout_ms\n", timeout,
+                      cm_convert_to_ms(max_timeout));
+               timeout = cm_convert_to_ms(max_timeout);
+       }
+
+	spin_lock_irq(&cm_id_priv->lock);
+	switch (cm_id_priv->id.state) {
+	case IB_CM_REQ_SENT:
+		if (cm_mra_get_msg_mraed(mra_msg) != CM_MSG_RESPONSE_REQ ||
+		    ib_modify_mad(cm_id_priv->av.port->mad_agent,
+				  cm_id_priv->msg, timeout))
+			goto out;
+		cm_id_priv->id.state = IB_CM_MRA_REQ_RCVD;
+		break;
+	case IB_CM_REP_SENT:
+		if (cm_mra_get_msg_mraed(mra_msg) != CM_MSG_RESPONSE_REP ||
+		    ib_modify_mad(cm_id_priv->av.port->mad_agent,
+				  cm_id_priv->msg, timeout))
+			goto out;
+		cm_id_priv->id.state = IB_CM_MRA_REP_RCVD;
+		break;
+	case IB_CM_ESTABLISHED:
+		if (cm_mra_get_msg_mraed(mra_msg) != CM_MSG_RESPONSE_OTHER ||
+		    cm_id_priv->id.lap_state != IB_CM_LAP_SENT ||
+		    ib_modify_mad(cm_id_priv->av.port->mad_agent,
+				  cm_id_priv->msg, timeout)) {
+			/* An MRA for an already-MRA'd LAP counts as a duplicate. */
+			if (cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD)
+				atomic_long_inc(&work->port->
+						counter_group[CM_RECV_DUPLICATES].
+						counter[CM_MRA_COUNTER]);
+			goto out;
+		}
+		cm_id_priv->id.lap_state = IB_CM_MRA_LAP_RCVD;
+		break;
+	case IB_CM_MRA_REQ_RCVD:
+	case IB_CM_MRA_REP_RCVD:
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_MRA_COUNTER]);
+		/* fall through */
+	default:
+		goto out;
+	}
+
+	/* Keep the state recorded in the pending send in step with the id. */
+	cm_id_priv->msg->context[1] = (void *) (unsigned long)
+				      cm_id_priv->id.state;
+	/* Queue the work if atomic_inc_and_test() is false, else run it here. */
+	ret = atomic_inc_and_test(&cm_id_priv->work_count);
+	if (!ret)
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	if (ret)
+		cm_process_work(cm_id_priv, work);
+	else
+		cm_deref_id(cm_id_priv);
+	return 0;
+out:
+	spin_unlock_irq(&cm_id_priv->lock);
+	cm_deref_id(cm_id_priv);
+	return -EINVAL;
+}
+
+/*
+ * Build a LAP message in @lap_msg proposing @alternate_path for the
+ * connection described by @cm_id_priv.  The transaction id is formed
+ * with CM_MSG_SEQUENCE_LAP.  Private data is copied only when both a
+ * buffer and a non-zero length are provided.
+ */
+static void cm_format_lap(struct cm_lap_msg *lap_msg,
+			  struct cm_id_private *cm_id_priv,
+			  struct ib_sa_path_rec *alternate_path,
+			  const void *private_data,
+			  u8 private_data_len)
+{
+	struct ib_cm_id *id = &cm_id_priv->id;
+
+	cm_format_mad_hdr(&lap_msg->hdr, CM_LAP_ATTR_ID,
+			  cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_LAP));
+
+	/* Fields that are stored directly. */
+	lap_msg->local_comm_id = id->local_id;
+	lap_msg->remote_comm_id = id->remote_id;
+	lap_msg->alt_local_lid = alternate_path->slid;
+	lap_msg->alt_remote_lid = alternate_path->dlid;
+	lap_msg->alt_local_gid = alternate_path->sgid;
+	lap_msg->alt_remote_gid = alternate_path->dgid;
+	lap_msg->alt_hop_limit = alternate_path->hop_limit;
+
+	/* Fields that are stored through the cm_lap_set_*() accessors. */
+	cm_lap_set_remote_qpn(lap_msg, cm_id_priv->remote_qpn);
+	/* todo: need remote CM response timeout */
+	cm_lap_set_remote_resp_timeout(lap_msg, 0x1F);
+	cm_lap_set_flow_label(lap_msg, alternate_path->flow_label);
+	cm_lap_set_traffic_class(lap_msg, alternate_path->traffic_class);
+	cm_lap_set_packet_rate(lap_msg, alternate_path->rate);
+	cm_lap_set_sl(lap_msg, alternate_path->sl);
+	cm_lap_set_subnet_local(lap_msg, 1); /* local only... */
+	cm_lap_set_local_ack_timeout(lap_msg,
+		cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay,
+			       alternate_path->packet_life_time));
+
+	if (!private_data || !private_data_len)
+		return;
+
+	memcpy(lap_msg->private_data, private_data, private_data_len);
+}
+
+/*
+ * Send a LAP for @cm_id proposing @alternate_path.
+ *
+ * Returns -EINVAL when private data is supplied with a length above
+ * IB_CM_LAP_PRIVATE_DATA_SIZE, or unless the id is IB_CM_ESTABLISHED
+ * with lap_state IB_CM_LAP_UNINIT or IB_CM_LAP_IDLE.  Errors from
+ * cm_init_av_by_path(), cm_alloc_msg() and ib_post_send_mad() are
+ * returned unchanged.  On success lap_state becomes IB_CM_LAP_SENT and
+ * 0 is returned.
+ */
+int ib_send_cm_lap(struct ib_cm_id *cm_id,
+		   struct ib_sa_path_rec *alternate_path,
+		   const void *private_data,
+		   u8 private_data_len)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_mad_send_buf *msg;
+	unsigned long flags;
+	int ret;
+
+	if (private_data && private_data_len > IB_CM_LAP_PRIVATE_DATA_SIZE)
+		return -EINVAL;
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id->state != IB_CM_ESTABLISHED ||
+	    (cm_id->lap_state != IB_CM_LAP_UNINIT &&
+	     cm_id->lap_state != IB_CM_LAP_IDLE)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* alt_av is set up before the message is allocated. */
+	ret = cm_init_av_by_path(alternate_path, &cm_id_priv->alt_av);
+	if (ret)
+		goto out;
+	cm_id_priv->alt_av.timeout =
+			cm_ack_timeout(cm_id_priv->target_ack_delay,
+				       cm_id_priv->alt_av.timeout - 1);
+
+	ret = cm_alloc_msg(cm_id_priv, &msg);
+	if (ret)
+		goto out;
+
+	cm_format_lap((struct cm_lap_msg *) msg->mad, cm_id_priv,
+		      alternate_path, private_data, private_data_len);
+	msg->timeout_ms = cm_id_priv->timeout_ms;
+	/* context[1] carries the CM state associated with this send. */
+	msg->context[1] = (void *) (unsigned long) IB_CM_ESTABLISHED;
+
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret) {
+		/* Drop the lock before releasing the unsent message. */
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		cm_free_msg(msg);
+		return ret;
+	}
+
+	cm_id->lap_state = IB_CM_LAP_SENT;
+	cm_id_priv->msg = msg;
+
+out:	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_lap);
+
+/*
+ * Build a path record in @path from a received LAP.  The record is
+ * zeroed first.  Source and destination are taken from the LAP's remote
+ * and local alternate addresses respectively, i.e. the path is expressed
+ * from the receiver's side.  The pkey and MTU come from @cm_id_priv.
+ * The packet life time is the LAP's local ack timeout, reduced by one
+ * when it is non-zero.
+ */
+static void cm_format_path_from_lap(struct cm_id_private *cm_id_priv,
+				    struct ib_sa_path_rec *path,
+				    struct cm_lap_msg *lap_msg)
+{
+	memset(path, 0, sizeof *path);
+
+	/* Addressing: the sender's "local" end is our destination. */
+	path->dgid = lap_msg->alt_local_gid;
+	path->dlid = lap_msg->alt_local_lid;
+	path->sgid = lap_msg->alt_remote_gid;
+	path->slid = lap_msg->alt_remote_lid;
+
+	/* Values carried in the LAP itself. */
+	path->flow_label = cm_lap_get_flow_label(lap_msg);
+	path->traffic_class = cm_lap_get_traffic_class(lap_msg);
+	path->hop_limit = lap_msg->alt_hop_limit;
+	path->sl = cm_lap_get_sl(lap_msg);
+	path->rate = cm_lap_get_packet_rate(lap_msg);
+	path->packet_life_time = cm_lap_get_local_ack_timeout(lap_msg);
+	if (path->packet_life_time > 0)
+		path->packet_life_time--;
+
+	/* Values inherited from the existing connection. */
+	path->pkey = cm_id_priv->pkey;
+	path->mtu = cm_id_priv->path_mtu;
+
+	/* Fixed attributes. */
+	path->reversible = 1;
+	path->mtu_selector = IB_SA_EQ;
+	path->rate_selector = IB_SA_EQ;
+	path->packet_life_time_selector = IB_SA_EQ;
+}
+
+/*
+ * Handle a received LAP (load alternate path) message.
+ *
+ * The LAP is considered only when the id is IB_CM_ESTABLISHED.  The
+ * action then depends on lap_state:
+ *   - LAP_UNINIT / LAP_IDLE: accept;
+ *   - MRA_LAP_SENT: count a duplicate and resend the MRA with the saved
+ *     service timeout and private data;
+ *   - LAP_RCVD: count a duplicate and drop;
+ *   - anything else: drop.
+ *
+ * Accepting requires that the proposed alternate path can be resolved by
+ * cm_init_av_by_path().  If it cannot, the LAP is dropped with -EINVAL
+ * before lap_state, tid or the primary AV are touched, so a later valid
+ * LAP is not mistaken for a duplicate and the consumer never sees an
+ * event for a path that alt_av does not describe.
+ *
+ * Returns 0 when the LAP was accepted and -EINVAL otherwise.
+ */
+static int cm_lap_handler(struct cm_work *work)
+{
+	struct cm_id_private *cm_id_priv;
+	struct cm_lap_msg *lap_msg;
+	struct ib_cm_lap_event_param *param;
+	struct ib_mad_send_buf *msg = NULL;
+	int ret;
+
+	/* todo: verify LAP request and send reject APR if invalid. */
+	lap_msg = (struct cm_lap_msg *)work->mad_recv_wc->recv_buf.mad;
+	cm_id_priv = cm_acquire_id(lap_msg->remote_comm_id,
+				   lap_msg->local_comm_id);
+	if (!cm_id_priv)
+		return -EINVAL;
+
+	/* The event's alternate path lives in the work item itself. */
+	param = &work->cm_event.param.lap_rcvd;
+	param->alternate_path = &work->path[0];
+	cm_format_path_from_lap(cm_id_priv, param->alternate_path, lap_msg);
+	work->cm_event.private_data = &lap_msg->private_data;
+
+	spin_lock_irq(&cm_id_priv->lock);
+	if (cm_id_priv->id.state != IB_CM_ESTABLISHED)
+		goto unlock;
+
+	switch (cm_id_priv->id.lap_state) {
+	case IB_CM_LAP_UNINIT:
+	case IB_CM_LAP_IDLE:
+		break;
+	case IB_CM_MRA_LAP_SENT:
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_LAP_COUNTER]);
+		if (cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg))
+			goto unlock;
+
+		cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
+			      CM_MSG_RESPONSE_OTHER,
+			      cm_id_priv->service_timeout,
+			      cm_id_priv->private_data,
+			      cm_id_priv->private_data_len);
+		/* The lock is released before the MRA is posted. */
+		spin_unlock_irq(&cm_id_priv->lock);
+
+		if (ib_post_send_mad(msg, NULL))
+			cm_free_msg(msg);
+		goto deref;
+	case IB_CM_LAP_RCVD:
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_LAP_COUNTER]);
+		goto unlock;
+	default:
+		goto unlock;
+	}
+
+	/*
+	 * Resolve the alternate path first and refuse the LAP if that
+	 * fails; nothing else on the id has been modified at this point.
+	 */
+	ret = cm_init_av_by_path(param->alternate_path, &cm_id_priv->alt_av);
+	if (ret)
+		goto unlock;
+
+	cm_id_priv->id.lap_state = IB_CM_LAP_RCVD;
+	/* Remember the LAP's tid; cm_format_apr() uses cm_id_priv->tid. */
+	cm_id_priv->tid = lap_msg->hdr.tid;
+	cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
+				work->mad_recv_wc->recv_buf.grh,
+				&cm_id_priv->av);
+	/* Queue the work if atomic_inc_and_test() is false, else run it here. */
+	ret = atomic_inc_and_test(&cm_id_priv->work_count);
+	if (!ret)
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	if (ret)
+		cm_process_work(cm_id_priv, work);
+	else
+		cm_deref_id(cm_id_priv);
+	return 0;
+
+unlock:	spin_unlock_irq(&cm_id_priv->lock);
+deref:	cm_deref_id(cm_id_priv);
+	return -EINVAL;
+}
+
+/*
+ * Fill in an APR message for the given cm_id.
+ *
+ * The MAD header reuses the transaction id stored in the cm_id.  The
+ * optional info and private data are copied only when both a pointer and a
+ * non-zero length are supplied; callers are expected to have validated the
+ * lengths against the message field sizes.
+ */
+static void cm_format_apr(struct cm_apr_msg *apr_msg,
+			  struct cm_id_private *cm_id_priv,
+			  enum ib_cm_apr_status status,
+			  void *info,
+			  u8 info_length,
+			  const void *private_data,
+			  u8 private_data_len)
+{
+	int have_info = (info != NULL) && (info_length != 0);
+	int have_pdata = (private_data != NULL) && (private_data_len != 0);
+
+	cm_format_mad_hdr(&apr_msg->hdr, CM_APR_ATTR_ID, cm_id_priv->tid);
+
+	apr_msg->local_comm_id = cm_id_priv->id.local_id;
+	apr_msg->remote_comm_id = cm_id_priv->id.remote_id;
+	apr_msg->ap_status = (u8) status;
+
+	if (have_info) {
+		apr_msg->info_length = info_length;
+		memcpy(apr_msg->info, info, info_length);
+	}
+
+	if (have_pdata)
+		memcpy(apr_msg->private_data, private_data, private_data_len);
+}
+
+/*
+ * Send an APR message on an established connection.
+ *
+ * The cm_id must be in state IB_CM_ESTABLISHED with lap_state either
+ * IB_CM_LAP_RCVD or IB_CM_MRA_LAP_SENT.  On a successful post the lap_state
+ * returns to IB_CM_LAP_IDLE.
+ *
+ * Returns 0 on success, -EINVAL for oversized info/private data or a wrong
+ * state, or the error returned by cm_alloc_msg() / ib_post_send_mad().
+ */
+int ib_send_cm_apr(struct ib_cm_id *cm_id,
+		   enum ib_cm_apr_status status,
+		   void *info,
+		   u8 info_length,
+		   const void *private_data,
+		   u8 private_data_len)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_mad_send_buf *msg;
+	unsigned long flags;
+	int ret;
+
+	/* Reject payloads that would not fit the fixed-size message fields. */
+	if ((private_data && private_data_len > IB_CM_APR_PRIVATE_DATA_SIZE) ||
+	    (info && info_length > IB_CM_APR_INFO_LENGTH))
+		return -EINVAL;
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id->state != IB_CM_ESTABLISHED ||
+	    (cm_id->lap_state != IB_CM_LAP_RCVD &&
+	     cm_id->lap_state != IB_CM_MRA_LAP_SENT)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = cm_alloc_msg(cm_id_priv, &msg);
+	if (ret)
+		goto out;
+
+	cm_format_apr((struct cm_apr_msg *) msg->mad, cm_id_priv, status,
+		      info, info_length, private_data, private_data_len);
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret) {
+		/* Drop the lock before freeing the unsent MAD. */
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		cm_free_msg(msg);
+		return ret;
+	}
+
+	cm_id->lap_state = IB_CM_LAP_IDLE;
+out:	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_apr);
+
+/*
+ * Handle a received APR message.
+ *
+ * Accepted only when the connection is ESTABLISHED and a LAP is outstanding
+ * (lap_state IB_CM_LAP_SENT or IB_CM_MRA_LAP_RCVD).  On acceptance the
+ * lap_state returns to IDLE, the pending send is cancelled and the event is
+ * delivered or queued.
+ *
+ * Returns 0 when the work item has been processed or queued, -EINVAL
+ * otherwise.
+ */
+static int cm_apr_handler(struct cm_work *work)
+{
+	struct cm_id_private *cm_id_priv;
+	struct cm_apr_msg *apr_msg;
+	int ret;
+
+	apr_msg = (struct cm_apr_msg *)work->mad_recv_wc->recv_buf.mad;
+	cm_id_priv = cm_acquire_id(apr_msg->remote_comm_id,
+				   apr_msg->local_comm_id);
+	if (!cm_id_priv)
+		return -EINVAL; /* Unmatched reply. */
+
+	/* Event parameters point into the received MAD; nothing is copied. */
+	work->cm_event.param.apr_rcvd.ap_status = apr_msg->ap_status;
+	work->cm_event.param.apr_rcvd.apr_info = &apr_msg->info;
+	work->cm_event.param.apr_rcvd.info_len = apr_msg->info_length;
+	work->cm_event.private_data = &apr_msg->private_data;
+
+	spin_lock_irq(&cm_id_priv->lock);
+	if (cm_id_priv->id.state != IB_CM_ESTABLISHED ||
+	    (cm_id_priv->id.lap_state != IB_CM_LAP_SENT &&
+	     cm_id_priv->id.lap_state != IB_CM_MRA_LAP_RCVD)) {
+		spin_unlock_irq(&cm_id_priv->lock);
+		goto out;
+	}
+	cm_id_priv->id.lap_state = IB_CM_LAP_IDLE;
+	/* Cancel the send recorded in cm_id_priv->msg and forget it. */
+	ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+	cm_id_priv->msg = NULL;
+
+	/* Non-zero: process the work here; zero: queue it on work_list. */
+	ret = atomic_inc_and_test(&cm_id_priv->work_count);
+	if (!ret)
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	if (ret)
+		cm_process_work(cm_id_priv, work);
+	else
+		cm_deref_id(cm_id_priv);
+	return 0;
+out:
+	cm_deref_id(cm_id_priv);
+	return -EINVAL;
+}
+
+/*
+ * Handle expiry of a timewait entry.
+ *
+ * Removes the entry from its list under cm.lock, then - if the cm_id it
+ * refers to is still in IB_CM_TIMEWAIT for the same remote QPN - moves the
+ * cm_id to IB_CM_IDLE and delivers or queues the event.
+ *
+ * Returns 0 when the work item has been processed or queued, -EINVAL
+ * otherwise.
+ */
+static int cm_timewait_handler(struct cm_work *work)
+{
+	struct cm_timewait_info *timewait_info;
+	struct cm_id_private *cm_id_priv;
+	int ret;
+
+	/*
+	 * NOTE(review): this cast presumably relies on the cm_work being the
+	 * first member of struct cm_timewait_info - confirm against the
+	 * structure definition.
+	 */
+	timewait_info = (struct cm_timewait_info *)work;
+	spin_lock_irq(&cm.lock);
+	list_del(&timewait_info->list);
+	spin_unlock_irq(&cm.lock);
+
+	cm_id_priv = cm_acquire_id(timewait_info->work.local_id,
+				   timewait_info->work.remote_id);
+	if (!cm_id_priv)
+		return -EINVAL;
+
+	spin_lock_irq(&cm_id_priv->lock);
+	/* Ignore the expiry if the cm_id has moved on or changed peer QP. */
+	if (cm_id_priv->id.state != IB_CM_TIMEWAIT ||
+	    cm_id_priv->remote_qpn != timewait_info->remote_qpn) {
+		spin_unlock_irq(&cm_id_priv->lock);
+		goto out;
+	}
+	cm_id_priv->id.state = IB_CM_IDLE;
+	/* Non-zero: process the work here; zero: queue it on work_list. */
+	ret = atomic_inc_and_test(&cm_id_priv->work_count);
+	if (!ret)
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	if (ret)
+		cm_process_work(cm_id_priv, work);
+	else
+		cm_deref_id(cm_id_priv);
+	return 0;
+out:
+	cm_deref_id(cm_id_priv);
+	return -EINVAL;
+}
+
+/*
+ * Fill in a SIDR REQ message from the caller's parameters.
+ *
+ * The request id is the cm_id's local id; pkey comes from the supplied path.
+ * Private data is copied only when both pointer and length are non-zero.
+ */
+static void cm_format_sidr_req(struct cm_sidr_req_msg *sidr_req_msg,
+			       struct cm_id_private *cm_id_priv,
+			       struct ib_cm_sidr_req_param *param)
+{
+	cm_format_mad_hdr(&sidr_req_msg->hdr, CM_SIDR_REQ_ATTR_ID,
+			  cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_SIDR));
+
+	sidr_req_msg->request_id = cm_id_priv->id.local_id;
+	sidr_req_msg->pkey = param->path->pkey;
+	sidr_req_msg->service_id = param->service_id;
+
+	if (param->private_data != NULL && param->private_data_len != 0) {
+		memcpy(sidr_req_msg->private_data, param->private_data,
+		       param->private_data_len);
+	}
+}
+
+/*
+ * Send a SIDR REQ on an idle cm_id.
+ *
+ * Initialises the address vector from param->path, records the service id,
+ * timeout (clamped to cm_convert_to_ms(max_timeout)) and retry count on the
+ * cm_id, then posts the request.  On success the cm_id moves to
+ * IB_CM_SIDR_REQ_SENT and keeps a pointer to the outstanding MAD.
+ *
+ * Returns 0 on success, -EINVAL for a missing path, oversized private data
+ * or a cm_id that is not idle, or the error from a failing helper.
+ */
+int ib_send_cm_sidr_req(struct ib_cm_id *cm_id,
+			struct ib_cm_sidr_req_param *param)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_mad_send_buf *msg;
+	unsigned long flags;
+	int ret;
+
+	if (!param->path)
+		return -EINVAL;
+	if (param->private_data &&
+	    param->private_data_len > IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE)
+		return -EINVAL;
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	ret = cm_init_av_by_path(param->path, &cm_id_priv->av);
+	if (ret)
+		return ret;
+
+	cm_id->service_id = param->service_id;
+	cm_id->service_mask = ~cpu_to_be64(0);
+
+	/* Clamp the requested timeout to the configured maximum. */
+	cm_id_priv->timeout_ms = param->timeout_ms;
+	if (cm_id_priv->timeout_ms > cm_convert_to_ms(max_timeout)) {
+		printk(KERN_WARNING PFX "sidr req timeout_ms %d > %d, "
+		       "decreasing used timeout_ms\n", param->timeout_ms,
+		       cm_convert_to_ms(max_timeout));
+		cm_id_priv->timeout_ms = cm_convert_to_ms(max_timeout);
+	}
+	cm_id_priv->max_cm_retries = param->max_cm_retries;
+
+	ret = cm_alloc_msg(cm_id_priv, &msg);
+	if (ret)
+		return ret;
+
+	cm_format_sidr_req((struct cm_sidr_req_msg *) msg->mad, cm_id_priv,
+			   param);
+	msg->timeout_ms = cm_id_priv->timeout_ms;
+	/* context[1] records the state this send belongs to. */
+	msg->context[1] = (void *) (unsigned long) IB_CM_SIDR_REQ_SENT;
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	ret = (cm_id->state == IB_CM_IDLE) ?
+	      ib_post_send_mad(msg, NULL) : -EINVAL;
+	if (ret) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		cm_free_msg(msg);
+		return ret;
+	}
+
+	cm_id->state = IB_CM_SIDR_REQ_SENT;
+	cm_id_priv->msg = msg;
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return 0;
+}
+EXPORT_SYMBOL(ib_send_cm_sidr_req);
+
+/*
+ * Populate the SIDR REQ event carried by a work item from the received MAD.
+ * The private data pointer refers into the MAD; nothing is copied.
+ */
+static void cm_format_sidr_req_event(struct cm_work *work,
+				     struct ib_cm_id *listen_id)
+{
+	struct cm_sidr_req_msg *req =
+		(struct cm_sidr_req_msg *) work->mad_recv_wc->recv_buf.mad;
+	struct ib_cm_sidr_req_event_param *ev =
+		&work->cm_event.param.sidr_req_rcvd;
+
+	ev->pkey = __be16_to_cpu(req->pkey);
+	ev->listen_id = listen_id;
+	ev->port = work->port->port_num;
+
+	work->cm_event.private_data = &req->private_data;
+}
+
+/*
+ * Handle a received SIDR REQ.
+ *
+ * Creates a new cm_id for the request, records the sender's addressing and
+ * request id, and inserts it into the remote SIDR table.  Duplicates are
+ * counted and dropped.  If no listener matches the service id the request is
+ * rejected with IB_SIDR_UNSUPPORTED; otherwise the listener's handler and
+ * context are inherited and the event is delivered.
+ *
+ * Returns 0 when the event was handed to cm_process_work(), the error from
+ * ib_create_cm_id() if creation failed, or -EINVAL after destroying the new
+ * cm_id on the duplicate / no-listener paths.
+ */
+static int cm_sidr_req_handler(struct cm_work *work)
+{
+	struct ib_cm_id *cm_id;
+	struct cm_id_private *cm_id_priv, *cur_cm_id_priv;
+	struct cm_sidr_req_msg *sidr_req_msg;
+	struct ib_wc *wc;
+
+	cm_id = ib_create_cm_id(work->port->cm_dev->ib_device, NULL, NULL);
+	if (IS_ERR(cm_id))
+		return PTR_ERR(cm_id);
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+
+	/* Record SGID/SLID and request ID for lookup. */
+	sidr_req_msg = (struct cm_sidr_req_msg *)
+				work->mad_recv_wc->recv_buf.mad;
+	wc = work->mad_recv_wc->wc;
+	/* The sender's LID is stored in the dgid's subnet_prefix field. */
+	cm_id_priv->av.dgid.global.subnet_prefix = cpu_to_be64(wc->slid);
+	cm_id_priv->av.dgid.global.interface_id = 0;
+	cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
+				work->mad_recv_wc->recv_buf.grh,
+				&cm_id_priv->av);
+	cm_id_priv->id.remote_id = sidr_req_msg->request_id;
+	cm_id_priv->tid = sidr_req_msg->hdr.tid;
+	atomic_inc(&cm_id_priv->work_count);
+
+	spin_lock_irq(&cm.lock);
+	/* A non-NULL return means an equivalent request is already tracked. */
+	cur_cm_id_priv = cm_insert_remote_sidr(cm_id_priv);
+	if (cur_cm_id_priv) {
+		spin_unlock_irq(&cm.lock);
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_SIDR_REQ_COUNTER]);
+		goto out; /* Duplicate message. */
+	}
+	cm_id_priv->id.state = IB_CM_SIDR_REQ_RCVD;
+	cur_cm_id_priv = cm_find_listen(cm_id->device,
+					sidr_req_msg->service_id,
+					sidr_req_msg->private_data);
+	if (!cur_cm_id_priv) {
+		spin_unlock_irq(&cm.lock);
+		cm_reject_sidr_req(cm_id_priv, IB_SIDR_UNSUPPORTED);
+		goto out; /* No match. */
+	}
+	/* Hold the listener while its handler and context are used below. */
+	atomic_inc(&cur_cm_id_priv->refcount);
+	spin_unlock_irq(&cm.lock);
+
+	cm_id_priv->id.cm_handler = cur_cm_id_priv->id.cm_handler;
+	cm_id_priv->id.context = cur_cm_id_priv->id.context;
+	cm_id_priv->id.service_id = sidr_req_msg->service_id;
+	cm_id_priv->id.service_mask = ~cpu_to_be64(0);
+
+	cm_format_sidr_req_event(work, &cur_cm_id_priv->id);
+	cm_process_work(cm_id_priv, work);
+	cm_deref_id(cur_cm_id_priv);
+	return 0;
+out:
+	ib_destroy_cm_id(&cm_id_priv->id);
+	return -EINVAL;
+}
+
+/*
+ * Fill in a SIDR REP message from the caller's parameters.
+ *
+ * The header reuses the transaction id stored in the cm_id and the request
+ * id is the cm_id's remote id.  When additional info is supplied, both the
+ * info bytes and info_length are written so the receiver (which reads
+ * info_length when building its event) sees how much of the info field is
+ * valid; this mirrors cm_format_apr().  Private data is copied only when
+ * both pointer and length are non-zero.  Callers are expected to have
+ * validated the lengths against the message field sizes.
+ */
+static void cm_format_sidr_rep(struct cm_sidr_rep_msg *sidr_rep_msg,
+			       struct cm_id_private *cm_id_priv,
+			       struct ib_cm_sidr_rep_param *param)
+{
+	cm_format_mad_hdr(&sidr_rep_msg->hdr, CM_SIDR_REP_ATTR_ID,
+			  cm_id_priv->tid);
+	sidr_rep_msg->request_id = cm_id_priv->id.remote_id;
+	sidr_rep_msg->status = param->status;
+	cm_sidr_rep_set_qpn(sidr_rep_msg, cpu_to_be32(param->qp_num));
+	sidr_rep_msg->service_id = cm_id_priv->id.service_id;
+	sidr_rep_msg->qkey = cpu_to_be32(param->qkey);
+
+	if (param->info && param->info_length) {
+		/* Without the length the peer cannot tell how much is valid. */
+		sidr_rep_msg->info_length = param->info_length;
+		memcpy(sidr_rep_msg->info, param->info, param->info_length);
+	}
+
+	if (param->private_data && param->private_data_len)
+		memcpy(sidr_rep_msg->private_data, param->private_data,
+		       param->private_data_len);
+}
+
+/*
+ * Send a SIDR REP for a cm_id in state IB_CM_SIDR_REQ_RCVD.
+ *
+ * On a successful post the cm_id returns to IB_CM_IDLE and is removed from
+ * the remote SIDR table.
+ *
+ * Returns 0 on success, -EINVAL for oversized info/private data or a wrong
+ * state, or the error returned by cm_alloc_msg() / ib_post_send_mad().
+ */
+int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id,
+			struct ib_cm_sidr_rep_param *param)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_mad_send_buf *msg;
+	unsigned long flags;
+	int ret;
+
+	/* Reject payloads that would not fit the fixed-size message fields. */
+	if ((param->info && param->info_length > IB_CM_SIDR_REP_INFO_LENGTH) ||
+	    (param->private_data &&
+	     param->private_data_len > IB_CM_SIDR_REP_PRIVATE_DATA_SIZE))
+		return -EINVAL;
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id->state != IB_CM_SIDR_REQ_RCVD) {
+		ret = -EINVAL;
+		goto error;
+	}
+
+	ret = cm_alloc_msg(cm_id_priv, &msg);
+	if (ret)
+		goto error;
+
+	cm_format_sidr_rep((struct cm_sidr_rep_msg *) msg->mad, cm_id_priv,
+			   param);
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret) {
+		/* Drop the lock before freeing the unsent MAD. */
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		cm_free_msg(msg);
+		return ret;
+	}
+	cm_id->state = IB_CM_IDLE;
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	/* The request has been answered; stop tracking it for duplicates. */
+	spin_lock_irqsave(&cm.lock, flags);
+	rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table);
+	spin_unlock_irqrestore(&cm.lock, flags);
+	return 0;
+
+error:	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_sidr_rep);
+
+/*
+ * Populate the SIDR REP event carried by a work item from the received MAD.
+ * qkey and qpn are converted to host byte order; the info and private data
+ * pointers refer into the MAD, nothing is copied.
+ */
+static void cm_format_sidr_rep_event(struct cm_work *work)
+{
+	struct cm_sidr_rep_msg *rep =
+		(struct cm_sidr_rep_msg *) work->mad_recv_wc->recv_buf.mad;
+	struct ib_cm_sidr_rep_event_param *ev =
+		&work->cm_event.param.sidr_rep_rcvd;
+
+	ev->status = rep->status;
+	ev->qkey = be32_to_cpu(rep->qkey);
+	ev->qpn = be32_to_cpu(cm_sidr_rep_get_qpn(rep));
+	ev->info = &rep->info;
+	ev->info_len = rep->info_length;
+
+	work->cm_event.private_data = &rep->private_data;
+}
+
+/*
+ * Handle a received SIDR REP.
+ *
+ * The cm_id is looked up by the request id echoed in the reply.  If it is
+ * still in IB_CM_SIDR_REQ_SENT it moves to IB_CM_IDLE, the outstanding
+ * request is cancelled and the event is delivered.
+ *
+ * Returns 0 when the event was handed to cm_process_work(), -EINVAL
+ * otherwise.
+ */
+static int cm_sidr_rep_handler(struct cm_work *work)
+{
+	struct cm_sidr_rep_msg *sidr_rep_msg;
+	struct cm_id_private *cm_id_priv;
+
+	sidr_rep_msg = (struct cm_sidr_rep_msg *)
+				work->mad_recv_wc->recv_buf.mad;
+	cm_id_priv = cm_acquire_id(sidr_rep_msg->request_id, 0);
+	if (!cm_id_priv)
+		return -EINVAL; /* Unmatched reply. */
+
+	spin_lock_irq(&cm_id_priv->lock);
+	if (cm_id_priv->id.state != IB_CM_SIDR_REQ_SENT) {
+		spin_unlock_irq(&cm_id_priv->lock);
+		goto out;
+	}
+	cm_id_priv->id.state = IB_CM_IDLE;
+	/* Stop retransmission of the request recorded in cm_id_priv->msg. */
+	ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	cm_format_sidr_rep_event(work);
+	cm_process_work(cm_id_priv, work);
+	return 0;
+out:
+	cm_deref_id(cm_id_priv);
+	return -EINVAL;
+}
+
+/*
+ * Report a failed send to the owner of the cm_id.
+ *
+ * msg->context[0] holds the cm_id and msg->context[1] the cm state the send
+ * was issued for.  The failure is reported only if this is still the cm_id's
+ * current message and the cm_id is still in that state; otherwise the MAD is
+ * simply freed.  The cm_id is destroyed if the handler returns non-zero.
+ */
+static void cm_process_send_error(struct ib_mad_send_buf *msg,
+				  enum ib_wc_status wc_status)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_cm_event cm_event;
+	enum ib_cm_state state;
+	int ret;
+
+	memset(&cm_event, 0, sizeof cm_event);
+	cm_id_priv = msg->context[0];
+
+	/* Discard old sends or ones without a response. */
+	spin_lock_irq(&cm_id_priv->lock);
+	state = (enum ib_cm_state) (unsigned long) msg->context[1];
+	if (msg != cm_id_priv->msg || state != cm_id_priv->id.state)
+		goto discard;
+
+	/* Map the state the send belonged to onto the matching error event. */
+	switch (state) {
+	case IB_CM_REQ_SENT:
+	case IB_CM_MRA_REQ_RCVD:
+		cm_reset_to_idle(cm_id_priv);
+		cm_event.event = IB_CM_REQ_ERROR;
+		break;
+	case IB_CM_REP_SENT:
+	case IB_CM_MRA_REP_RCVD:
+		cm_reset_to_idle(cm_id_priv);
+		cm_event.event = IB_CM_REP_ERROR;
+		break;
+	case IB_CM_DREQ_SENT:
+		cm_enter_timewait(cm_id_priv);
+		cm_event.event = IB_CM_DREQ_ERROR;
+		break;
+	case IB_CM_SIDR_REQ_SENT:
+		cm_id_priv->id.state = IB_CM_IDLE;
+		cm_event.event = IB_CM_SIDR_REQ_ERROR;
+		break;
+	default:
+		goto discard;
+	}
+	spin_unlock_irq(&cm_id_priv->lock);
+	cm_event.param.send_status = wc_status;
+
+	/* No other events can occur on the cm_id at this point. */
+	ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &cm_event);
+	cm_free_msg(msg);
+	if (ret)
+		ib_destroy_cm_id(&cm_id_priv->id);
+	return;
+discard:
+	spin_unlock_irq(&cm_id_priv->lock);
+	cm_free_msg(msg);
+}
+
+/*
+ * MAD send-completion callback.
+ *
+ * Updates the per-port transmit and retry counters for the message's
+ * attribute, then either frees the MAD or, for a failed send that belongs
+ * to a cm_id, reports the failure through cm_process_send_error().
+ */
+static void cm_send_handler(struct ib_mad_agent *mad_agent,
+			    struct ib_mad_send_wc *mad_send_wc)
+{
+	struct ib_mad_send_buf *msg = mad_send_wc->send_buf;
+	struct cm_port *port = mad_agent->context;
+	struct ib_mad_hdr *hdr = (struct ib_mad_hdr *) msg->mad;
+	u16 attr_index = be16_to_cpu(hdr->attr_id) - CM_ATTR_ID_OFFSET;
+	int failed;
+
+	/*
+	 * If the send was in response to a received message (context[0] is not
+	 * set to a cm_id), and is not a REJ, then it is a send that was
+	 * manually retried.
+	 */
+	if (!msg->context[0] && (attr_index != CM_REJ_COUNTER))
+		msg->retries = 1;
+
+	atomic_long_add(1 + msg->retries,
+			&port->counter_group[CM_XMIT].counter[attr_index]);
+	if (msg->retries) {
+		atomic_long_add(msg->retries,
+				&port->counter_group[CM_XMIT_RETRIES].
+				counter[attr_index]);
+	}
+
+	/* Success and flush completions need no error reporting. */
+	failed = mad_send_wc->status != IB_WC_SUCCESS &&
+		 mad_send_wc->status != IB_WC_WR_FLUSH_ERR;
+
+	if (failed && msg->context[0] && msg->context[1])
+		cm_process_send_error(msg, mad_send_wc->status);
+	else
+		cm_free_msg(msg);
+}
+
+/*
+ * Workqueue entry point: dispatch a work item to the handler for its event
+ * type.  A non-zero handler result (or an unknown event type) means the
+ * work item was not consumed, so it is freed here.
+ */
+static void cm_work_handler(struct work_struct *_work)
+{
+	struct cm_work *work = container_of(_work, struct cm_work, work.work);
+	int rc = -EINVAL;	/* result for unrecognised events */
+
+	switch (work->cm_event.event) {
+	/* Connection establishment. */
+	case IB_CM_REQ_RECEIVED:
+		rc = cm_req_handler(work);
+		break;
+	case IB_CM_MRA_RECEIVED:
+		rc = cm_mra_handler(work);
+		break;
+	case IB_CM_REJ_RECEIVED:
+		rc = cm_rej_handler(work);
+		break;
+	case IB_CM_REP_RECEIVED:
+		rc = cm_rep_handler(work);
+		break;
+	case IB_CM_RTU_RECEIVED:
+		rc = cm_rtu_handler(work);
+		break;
+	case IB_CM_USER_ESTABLISHED:
+		rc = cm_establish_handler(work);
+		break;
+	/* Disconnect. */
+	case IB_CM_DREQ_RECEIVED:
+		rc = cm_dreq_handler(work);
+		break;
+	case IB_CM_DREP_RECEIVED:
+		rc = cm_drep_handler(work);
+		break;
+	/* SIDR. */
+	case IB_CM_SIDR_REQ_RECEIVED:
+		rc = cm_sidr_req_handler(work);
+		break;
+	case IB_CM_SIDR_REP_RECEIVED:
+		rc = cm_sidr_rep_handler(work);
+		break;
+	/* Alternate path. */
+	case IB_CM_LAP_RECEIVED:
+		rc = cm_lap_handler(work);
+		break;
+	case IB_CM_APR_RECEIVED:
+		rc = cm_apr_handler(work);
+		break;
+	case IB_CM_TIMEWAIT_EXIT:
+		rc = cm_timewait_handler(work);
+		break;
+	default:
+		break;
+	}
+
+	if (rc != 0)
+		cm_free_work(work);
+}
+
+/*
+ * Move a cm_id to IB_CM_ESTABLISHED on behalf of the user and queue an
+ * IB_CM_USER_ESTABLISHED work item.
+ *
+ * Returns 0 on success, -ENOMEM if the work item cannot be allocated,
+ * -EISCONN if the cm_id is already established, or -EINVAL for any state
+ * other than IB_CM_REP_SENT / IB_CM_MRA_REP_RCVD.
+ */
+static int cm_establish(struct ib_cm_id *cm_id)
+{
+	struct cm_id_private *cm_id_priv;
+	struct cm_work *work;
+	unsigned long flags;
+	int ret = 0;
+
+	/* Allocated up front with GFP_ATOMIC, before the lock is taken. */
+	work = kmalloc(sizeof *work, GFP_ATOMIC);
+	if (!work)
+		return -ENOMEM;
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	switch (cm_id->state)
+	{
+	case IB_CM_REP_SENT:
+	case IB_CM_MRA_REP_RCVD:
+		cm_id->state = IB_CM_ESTABLISHED;
+		break;
+	case IB_CM_ESTABLISHED:
+		ret = -EISCONN;
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	if (ret) {
+		kfree(work);
+		goto out;
+	}
+
+	/*
+	 * The CM worker thread may try to destroy the cm_id before it
+	 * can execute this work item.  To prevent potential deadlock,
+	 * we need to find the cm_id once we're in the context of the
+	 * worker thread, rather than holding a reference on it.
+	 */
+	INIT_DELAYED_WORK(&work->work, cm_work_handler);
+	work->local_id = cm_id->local_id;
+	work->remote_id = cm_id->remote_id;
+	/* No received MAD is associated with a user-generated event. */
+	work->mad_recv_wc = NULL;
+	work->cm_event.event = IB_CM_USER_ESTABLISHED;
+	queue_delayed_work(cm.wq, &work->work, 0);
+out:
+	return ret;
+}
+
+/*
+ * Record a path migration: on an established connection with no LAP in
+ * progress, the alternate address vector becomes the primary one and
+ * lap_state is set to IB_CM_LAP_IDLE.
+ *
+ * Returns 0 on success, -EINVAL if the cm_id is in any other state.
+ */
+static int cm_migrate(struct ib_cm_id *cm_id)
+{
+	struct cm_id_private *cm_id_priv =
+		container_of(cm_id, struct cm_id_private, id);
+	unsigned long flags;
+	int ret = -EINVAL;
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id->state == IB_CM_ESTABLISHED) {
+		switch (cm_id->lap_state) {
+		case IB_CM_LAP_UNINIT:
+		case IB_CM_LAP_IDLE:
+			cm_id->lap_state = IB_CM_LAP_IDLE;
+			cm_id_priv->av = cm_id_priv->alt_av;
+			ret = 0;
+			break;
+		default:
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	return ret;
+}
+
+/*
+ * Notify the CM of a QP event.  IB_EVENT_COMM_EST and IB_EVENT_PATH_MIG are
+ * forwarded to cm_establish() and cm_migrate() respectively; every other
+ * event yields -EINVAL.
+ */
+int ib_cm_notify(struct ib_cm_id *cm_id, enum ib_event_type event)
+{
+	switch (event) {
+	case IB_EVENT_COMM_EST:
+		return cm_establish(cm_id);
+	case IB_EVENT_PATH_MIG:
+		return cm_migrate(cm_id);
+	default:
+		return -EINVAL;
+	}
+}
+EXPORT_SYMBOL(ib_cm_notify);
+
+/*
+ * MAD receive callback.
+ *
+ * Maps the MAD's attribute id to a CM event type, bumps the per-port
+ * receive counter, and queues a work item that owns the received MAD.
+ * MADs with an unrecognised attribute id, or for which no work item can be
+ * allocated, are freed immediately.
+ */
+static void cm_recv_handler(struct ib_mad_agent *mad_agent,
+			    struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct cm_port *port = mad_agent->context;
+	struct cm_work *work;
+	enum ib_cm_event_type event;
+	u16 attr_id;
+	/* Number of ib_sa_path_rec entries appended to the work item. */
+	int paths = 0;
+
+	/* attr_id is compared without byte-swapping against the constants. */
+	switch (mad_recv_wc->recv_buf.mad->mad_hdr.attr_id) {
+	case CM_REQ_ATTR_ID:
+		/* One path, plus a second when an alternate LID is present. */
+		paths = 1 + (((struct cm_req_msg *) mad_recv_wc->recv_buf.mad)->
+						    alt_local_lid != 0);
+		event = IB_CM_REQ_RECEIVED;
+		break;
+	case CM_MRA_ATTR_ID:
+		event = IB_CM_MRA_RECEIVED;
+		break;
+	case CM_REJ_ATTR_ID:
+		event = IB_CM_REJ_RECEIVED;
+		break;
+	case CM_REP_ATTR_ID:
+		event = IB_CM_REP_RECEIVED;
+		break;
+	case CM_RTU_ATTR_ID:
+		event = IB_CM_RTU_RECEIVED;
+		break;
+	case CM_DREQ_ATTR_ID:
+		event = IB_CM_DREQ_RECEIVED;
+		break;
+	case CM_DREP_ATTR_ID:
+		event = IB_CM_DREP_RECEIVED;
+		break;
+	case CM_SIDR_REQ_ATTR_ID:
+		event = IB_CM_SIDR_REQ_RECEIVED;
+		break;
+	case CM_SIDR_REP_ATTR_ID:
+		event = IB_CM_SIDR_REP_RECEIVED;
+		break;
+	case CM_LAP_ATTR_ID:
+		paths = 1;
+		event = IB_CM_LAP_RECEIVED;
+		break;
+	case CM_APR_ATTR_ID:
+		event = IB_CM_APR_RECEIVED;
+		break;
+	default:
+		ib_free_recv_mad(mad_recv_wc);
+		return;
+	}
+
+	/* Counters are indexed by the host-order attribute id offset. */
+	attr_id = be16_to_cpu(mad_recv_wc->recv_buf.mad->mad_hdr.attr_id);
+	atomic_long_inc(&port->counter_group[CM_RECV].
+			counter[attr_id - CM_ATTR_ID_OFFSET]);
+
+	work = kmalloc(sizeof *work + sizeof(struct ib_sa_path_rec) * paths,
+		       GFP_KERNEL);
+	if (!work) {
+		ib_free_recv_mad(mad_recv_wc);
+		return;
+	}
+
+	INIT_DELAYED_WORK(&work->work, cm_work_handler);
+	work->cm_event.event = event;
+	work->mad_recv_wc = mad_recv_wc;
+	work->port = port;
+	queue_delayed_work(cm.wq, &work->work, 0);
+}
+
+/*
+ * Fill in the QP attributes for the transition to INIT.
+ *
+ * Valid from any connecting state through IB_CM_ESTABLISHED.  Remote write
+ * access is always granted; remote read and atomic access are added when
+ * the connection has responder resources.
+ *
+ * Returns 0 on success or -EINVAL if the cm_id state does not allow it.
+ */
+static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv,
+				struct ib_qp_attr *qp_attr,
+				int *qp_attr_mask)
+{
+	unsigned long flags;
+	int access;
+	int ret = 0;
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	switch (cm_id_priv->id.state) {
+	case IB_CM_REQ_SENT:
+	case IB_CM_MRA_REQ_RCVD:
+	case IB_CM_REQ_RCVD:
+	case IB_CM_MRA_REQ_SENT:
+	case IB_CM_REP_RCVD:
+	case IB_CM_MRA_REP_SENT:
+	case IB_CM_REP_SENT:
+	case IB_CM_MRA_REP_RCVD:
+	case IB_CM_ESTABLISHED:
+		access = IB_ACCESS_REMOTE_WRITE;
+		if (cm_id_priv->responder_resources)
+			access |= IB_ACCESS_REMOTE_READ |
+				  IB_ACCESS_REMOTE_ATOMIC;
+
+		*qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS |
+				IB_QP_PKEY_INDEX | IB_QP_PORT;
+		qp_attr->qp_access_flags = access;
+		qp_attr->pkey_index = cm_id_priv->av.pkey_index;
+		qp_attr->port_num = cm_id_priv->av.port->port_num;
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	return ret;
+}
+
+/*
+ * Fill in the QP attributes for the transition to RTR.
+ *
+ * Always sets the address vector, path MTU, destination QPN and receive
+ * PSN.  RC connections additionally get max_dest_rd_atomic and a zero
+ * min_rnr_timer, and an alternate path is included when the alternate
+ * address vector has a non-zero DLID.
+ *
+ * Returns 0 on success or -EINVAL if the cm_id state does not allow it.
+ */
+static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv,
+			       struct ib_qp_attr *qp_attr,
+			       int *qp_attr_mask)
+{
+	unsigned long flags;
+	int mask;
+	int ret = 0;
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	switch (cm_id_priv->id.state) {
+	case IB_CM_REQ_RCVD:
+	case IB_CM_MRA_REQ_SENT:
+	case IB_CM_REP_RCVD:
+	case IB_CM_MRA_REP_SENT:
+	case IB_CM_REP_SENT:
+	case IB_CM_MRA_REP_RCVD:
+	case IB_CM_ESTABLISHED:
+		mask = IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU |
+		       IB_QP_DEST_QPN | IB_QP_RQ_PSN;
+		qp_attr->ah_attr = cm_id_priv->av.ah_attr;
+		qp_attr->path_mtu = cm_id_priv->path_mtu;
+		qp_attr->dest_qp_num = be32_to_cpu(cm_id_priv->remote_qpn);
+		qp_attr->rq_psn = be32_to_cpu(cm_id_priv->rq_psn);
+
+		if (cm_id_priv->qp_type == IB_QPT_RC) {
+			mask |= IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER;
+			qp_attr->max_dest_rd_atomic =
+					cm_id_priv->responder_resources;
+			qp_attr->min_rnr_timer = 0;
+		}
+
+		/* A non-zero alternate DLID means an alternate path exists. */
+		if (cm_id_priv->alt_av.ah_attr.dlid) {
+			mask |= IB_QP_ALT_PATH;
+			qp_attr->alt_port_num =
+					cm_id_priv->alt_av.port->port_num;
+			qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index;
+			qp_attr->alt_timeout = cm_id_priv->alt_av.timeout;
+			qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr;
+		}
+
+		*qp_attr_mask = mask;
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	return ret;
+}
+
+/*
+ * Fill in the QP attributes for the transition to RTS.
+ *
+ * When no LAP has taken place (lap_state is IB_CM_LAP_UNINIT) the send PSN
+ * is set, RC connections also get timeout/retry/rd_atomic values, and path
+ * migration is re-armed if an alternate path exists.  Otherwise only the
+ * alternate path is loaded and migration re-armed.
+ *
+ * Returns 0 on success or -EINVAL if the cm_id state does not allow it.
+ */
+static int cm_init_qp_rts_attr(struct cm_id_private *cm_id_priv,
+			       struct ib_qp_attr *qp_attr,
+			       int *qp_attr_mask)
+{
+	unsigned long flags;
+	int mask;
+	int ret = 0;
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	switch (cm_id_priv->id.state) {
+	/* Allow transition to RTS before sending REP */
+	case IB_CM_REQ_RCVD:
+	case IB_CM_MRA_REQ_SENT:
+
+	case IB_CM_REP_RCVD:
+	case IB_CM_MRA_REP_SENT:
+	case IB_CM_REP_SENT:
+	case IB_CM_MRA_REP_RCVD:
+	case IB_CM_ESTABLISHED:
+		if (cm_id_priv->id.lap_state != IB_CM_LAP_UNINIT) {
+			/* Only (re)load the alternate path. */
+			mask = IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE;
+			qp_attr->alt_port_num =
+					cm_id_priv->alt_av.port->port_num;
+			qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index;
+			qp_attr->alt_timeout = cm_id_priv->alt_av.timeout;
+			qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr;
+			qp_attr->path_mig_state = IB_MIG_REARM;
+			*qp_attr_mask = mask;
+			break;
+		}
+
+		mask = IB_QP_STATE | IB_QP_SQ_PSN;
+		qp_attr->sq_psn = be32_to_cpu(cm_id_priv->sq_psn);
+
+		if (cm_id_priv->qp_type == IB_QPT_RC) {
+			mask |= IB_QP_TIMEOUT | IB_QP_RETRY_CNT |
+				IB_QP_RNR_RETRY | IB_QP_MAX_QP_RD_ATOMIC;
+			qp_attr->timeout = cm_id_priv->av.timeout;
+			qp_attr->retry_cnt = cm_id_priv->retry_count;
+			qp_attr->rnr_retry = cm_id_priv->rnr_retry_count;
+			qp_attr->max_rd_atomic = cm_id_priv->initiator_depth;
+		}
+
+		if (cm_id_priv->alt_av.ah_attr.dlid) {
+			mask |= IB_QP_PATH_MIG_STATE;
+			qp_attr->path_mig_state = IB_MIG_REARM;
+		}
+
+		*qp_attr_mask = mask;
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	return ret;
+}
+
+/*
+ * Fill in qp_attr / qp_attr_mask for the QP state requested in
+ * qp_attr->qp_state (INIT, RTR or RTS).  Any other requested state, or a
+ * cm_id state the chosen helper rejects, yields -EINVAL.
+ */
+int ib_cm_init_qp_attr(struct ib_cm_id *cm_id,
+		       struct ib_qp_attr *qp_attr,
+		       int *qp_attr_mask)
+{
+	struct cm_id_private *cm_id_priv =
+		container_of(cm_id, struct cm_id_private, id);
+
+	switch (qp_attr->qp_state) {
+	case IB_QPS_INIT:
+		return cm_init_qp_init_attr(cm_id_priv, qp_attr, qp_attr_mask);
+	case IB_QPS_RTR:
+		return cm_init_qp_rtr_attr(cm_id_priv, qp_attr, qp_attr_mask);
+	case IB_QPS_RTS:
+		return cm_init_qp_rts_attr(cm_id_priv, qp_attr, qp_attr_mask);
+	default:
+		return -EINVAL;
+	}
+}
+EXPORT_SYMBOL(ib_cm_init_qp_attr);
+
+/*
+ * Cache the device's local CA ack delay in the cm_device.  If the device
+ * query fails the delay is recorded as 0, so acks will rely on packet life
+ * time alone.
+ */
+static void cm_get_ack_delay(struct cm_device *cm_dev)
+{
+	struct ib_device_attr attr;
+
+	if (ib_query_device(cm_dev->ib_device, &attr) == 0) {
+		cm_dev->ack_delay = attr.local_ca_ack_delay;
+		return;
+	}
+
+	cm_dev->ack_delay = 0;
+}
+
+/*
+ * sysfs show callback for one CM counter: prints the counter selected by
+ * the attribute's index within the kobject's counter group, followed by a
+ * newline.
+ */
+static ssize_t cm_show_counter(struct kobject *obj, struct attribute *attr,
+			       char *buf)
+{
+	struct cm_counter_group *group =
+		container_of(obj, struct cm_counter_group, obj);
+	struct cm_counter_attribute *cm_attr =
+		container_of(attr, struct cm_counter_attribute, attr);
+
+	return sprintf(buf, "%ld\n",
+		       atomic_long_read(&group->counter[cm_attr->index]));
+}
+
+/* sysfs operations for counter attributes; only a show method is set. */
+static struct sysfs_ops cm_counter_ops = {
+	.show = cm_show_counter
+};
+
+/* kobject type used for each per-port counter group. */
+static struct kobj_type cm_counter_obj_type = {
+	.sysfs_ops = &cm_counter_ops,
+	.default_attrs = cm_counter_default_attrs
+};
+
+/*
+ * kobject release callback for a port: frees the cm_port that embeds the
+ * kobject.
+ */
+static void cm_release_port_obj(struct kobject *obj)
+{
+	kfree(container_of(obj, struct cm_port, port_obj));
+}
+
+/* kobject type for a port; its release callback frees the cm_port. */
+static struct kobj_type cm_port_obj_type = {
+	.release = cm_release_port_obj
+};
+
+/* Device class under which the per-device CM devices are created. */
+struct class cm_class = {
+	.name    = "infiniband_cm",
+};
+EXPORT_SYMBOL(cm_class);
+
+/*
+ * Create the kobject hierarchy for a port: one kobject named after the port
+ * number under the CM device, with one child kobject per counter group.
+ *
+ * Returns 0 on success or the error from kobject_init_and_add().  On every
+ * failure path the port is released (directly, or through the port
+ * kobject's release callback), so the caller must not use it afterwards.
+ */
+static int cm_create_port_fs(struct cm_port *port)
+{
+	int i, ret;
+
+	ret = kobject_init_and_add(&port->port_obj, &cm_port_obj_type,
+				   &port->cm_dev->device->kobj,
+				   "%d", port->port_num);
+	if (ret) {
+		/* The port is freed directly here, not via kobject_put(). */
+		kfree(port);
+		return ret;
+	}
+
+	for (i = 0; i < CM_COUNTER_GROUPS; i++) {
+		ret = kobject_init_and_add(&port->counter_group[i].obj,
+					   &cm_counter_obj_type,
+					   &port->port_obj,
+					   "%s", counter_group_names[i]);
+		if (ret)
+			goto error;
+	}
+
+	return 0;
+
+error:
+	/* Drop the groups added before the failing one, then the port. */
+	while (i--)
+		kobject_put(&port->counter_group[i].obj);
+	kobject_put(&port->port_obj);
+	return ret;
+
+}
+
+/*
+ * Drop the references on a port's counter-group kobjects, in index order,
+ * and then on the port kobject itself.
+ */
+static void cm_remove_port_fs(struct cm_port *port)
+{
+	int group = 0;
+
+	while (group < CM_COUNTER_GROUPS) {
+		kobject_put(&port->counter_group[group].obj);
+		group++;
+	}
+
+	kobject_put(&port->port_obj);
+}
+
+/*
+ * Client "add" callback: set up CM state for a newly registered IB device.
+ *
+ * Devices whose transport is not RDMA_TRANSPORT_IB are ignored.  For every
+ * physical port a cm_port is allocated, its sysfs objects are created, a
+ * MAD agent is registered on the GSI QP and the port's CM capability bit is
+ * set.  If any step fails, everything done so far for this device is undone
+ * and the device is left without CM client data.
+ */
+static void cm_add_one(struct ib_device *ib_device)
+{
+	struct cm_device *cm_dev;
+	struct cm_port *port;
+	struct ib_mad_reg_req reg_req = {
+		.mgmt_class = IB_MGMT_CLASS_CM,
+		.mgmt_class_version = IB_CM_CLASS_VERSION
+	};
+	struct ib_port_modify port_modify = {
+		.set_port_cap_mask = IB_PORT_CM_SUP
+	};
+	unsigned long flags;
+	int ret;
+	u8 i;
+
+	if (rdma_node_get_transport(ib_device->node_type) != RDMA_TRANSPORT_IB)
+		return;
+
+	cm_dev = kzalloc(sizeof(*cm_dev) + sizeof(*port) *
+			 ib_device->phys_port_cnt, GFP_KERNEL);
+	if (!cm_dev)
+		return;
+
+	cm_dev->ib_device = ib_device;
+	cm_get_ack_delay(cm_dev);
+
+	cm_dev->device = device_create(&cm_class, &ib_device->dev,
+				       MKDEV(0, 0), NULL,
+				       "%s", ib_device->name);
+	/*
+	 * device_create() reports failure with an ERR_PTR() value on Linux,
+	 * so a NULL test alone would let an error pointer through to the
+	 * kobject code below.  Accept neither.
+	 */
+	if (!cm_dev->device || IS_ERR(cm_dev->device)) {
+		kfree(cm_dev);
+		return;
+	}
+
+	set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask);
+	/* Port numbers are 1-based; cm_dev->port[] is 0-based. */
+	for (i = 1; i <= ib_device->phys_port_cnt; i++) {
+		port = kzalloc(sizeof *port, GFP_KERNEL);
+		if (!port)
+			goto error1;
+
+		cm_dev->port[i-1] = port;
+		port->cm_dev = cm_dev;
+		port->port_num = i;
+
+		/* On failure cm_create_port_fs() has already released port. */
+		ret = cm_create_port_fs(port);
+		if (ret)
+			goto error1;
+
+		port->mad_agent = ib_register_mad_agent(ib_device, i,
+							IB_QPT_GSI,
+							&reg_req,
+							0,
+							cm_send_handler,
+							cm_recv_handler,
+							port);
+		if (IS_ERR(port->mad_agent))
+			goto error2;
+
+		ret = ib_modify_port(ib_device, i, 0, &port_modify);
+		if (ret)
+			goto error3;
+	}
+	ib_set_client_data(ib_device, &cm_client, cm_dev);
+
+	write_lock_irqsave(&cm.device_lock, flags);
+	list_add_tail(&cm_dev->list, &cm.device_list);
+	write_unlock_irqrestore(&cm.device_lock, flags);
+	return;
+
+	/* error3/error2 undo the partially set-up port at index i. */
+error3:
+	ib_unregister_mad_agent(port->mad_agent);
+error2:
+	cm_remove_port_fs(port);
+error1:
+	/* Tear down the fully set-up ports 1 .. i-1. */
+	port_modify.set_port_cap_mask = 0;
+	port_modify.clr_port_cap_mask = IB_PORT_CM_SUP;
+	while (--i) {
+		port = cm_dev->port[i-1];
+		ib_modify_port(ib_device, port->port_num, 0, &port_modify);
+		ib_unregister_mad_agent(port->mad_agent);
+		cm_remove_port_fs(port);
+	}
+	device_unregister(cm_dev->device);
+	kfree(cm_dev);
+}
+
+/*
+ * Client "remove" callback: tear down the CM state of an IB device.
+ *
+ * Does nothing if the device has no CM client data.  Otherwise the device
+ * is unlinked from cm.device_list and, for each port, the CM capability bit
+ * is cleared, the MAD agent unregistered, the CM workqueue flushed and the
+ * sysfs objects released.
+ */
+static void cm_remove_one(struct ib_device *ib_device)
+{
+	struct cm_device *cm_dev;
+	struct cm_port *port;
+	struct ib_port_modify port_modify = {
+		.clr_port_cap_mask = IB_PORT_CM_SUP
+	};
+	unsigned long flags;
+	int i;
+
+	cm_dev = ib_get_client_data(ib_device, &cm_client);
+	if (!cm_dev)
+		return;
+
+	write_lock_irqsave(&cm.device_lock, flags);
+	list_del(&cm_dev->list);
+	write_unlock_irqrestore(&cm.device_lock, flags);
+
+	/* Port numbers are 1-based; cm_dev->port[] is 0-based. */
+	for (i = 1; i <= ib_device->phys_port_cnt; i++) {
+		port = cm_dev->port[i-1];
+		ib_modify_port(ib_device, port->port_num, 0, &port_modify);
+		ib_unregister_mad_agent(port->mad_agent);
+		/* Flush queued work before the port's objects are released. */
+		flush_workqueue(cm.wq);
+		cm_remove_port_fs(port);
+	}
+	device_unregister(cm_dev->device);
+	kfree(cm_dev);
+}
+
+/*
+ * Module initialisation: reset the global CM state, register the device
+ * class, create the CM workqueue and register as an IB client.
+ *
+ * Returns 0 on success.  On failure everything set up so far is undone,
+ * including the local id table that idr_pre_get() preloaded, and a negative
+ * errno is returned (-ENOMEM for class or workqueue failures, otherwise the
+ * error from ib_register_client()).
+ */
+static int __init ib_cm_init(void)
+{
+	int ret;
+
+	memset(&cm, 0, sizeof cm);
+	INIT_LIST_HEAD(&cm.device_list);
+	rwlock_init(&cm.device_lock);
+	spin_lock_init(&cm.lock);
+	cm.listen_service_table = RB_ROOT;
+	cm.listen_service_id = be64_to_cpu(IB_CM_ASSIGN_SERVICE_ID);
+	cm.remote_id_table = RB_ROOT;
+	cm.remote_qp_table = RB_ROOT;
+	cm.remote_sidr_table = RB_ROOT;
+	idr_init(&cm.local_id_table);
+	get_random_bytes(&cm.random_id_operand, sizeof cm.random_id_operand);
+	idr_pre_get(&cm.local_id_table, GFP_KERNEL);
+	INIT_LIST_HEAD(&cm.timewait_list);
+
+	ret = class_register(&cm_class);
+	if (ret) {
+		/* Reported as -ENOMEM regardless of the underlying error. */
+		ret = -ENOMEM;
+		goto error0;
+	}
+
+	cm.wq = create_workqueue("ib_cm");
+	if (!cm.wq) {
+		ret = -ENOMEM;
+		goto error1;
+	}
+
+	ret = ib_register_client(&cm_client);
+	if (ret)
+		goto error2;
+
+	return 0;
+error2:
+	destroy_workqueue(cm.wq);
+error1:
+	class_unregister(&cm_class);
+error0:
+	/* Release the id table so a failed load does not leak it. */
+	idr_destroy(&cm.local_id_table);
+	return ret;
+}
+
+/*
+ * Module teardown: cancel pending timewait work, unregister the IB client,
+ * destroy the workqueue, free any timewait entries still listed, then
+ * unregister the class and destroy the local id table.
+ */
+static void __exit ib_cm_cleanup(void)
+{
+	struct cm_timewait_info *timewait_info, *tmp;
+
+	/* Cancel the delayed work of every entry still on the list. */
+	spin_lock_irq(&cm.lock);
+	list_for_each_entry(timewait_info, &cm.timewait_list, list)
+		cancel_delayed_work(&timewait_info->work.work);
+	spin_unlock_irq(&cm.lock);
+
+	ib_unregister_client(&cm_client);
+	destroy_workqueue(cm.wq);
+
+	/* The workqueue is gone; free the remaining entries without cm.lock. */
+	list_for_each_entry_safe(timewait_info, tmp, &cm.timewait_list, list) {
+		list_del(&timewait_info->list);
+		kfree(timewait_info);
+	}
+
+	class_unregister(&cm_class);
+	idr_destroy(&cm.local_id_table);
+}
+
+/* Module entry points; ib_cm_init is registered with SI_ORDER_SECOND. */
+module_init_order(ib_cm_init, SI_ORDER_SECOND);
+module_exit(ib_cm_cleanup);
+
diff --git a/sys/ofed/drivers/infiniband/core/cm_msgs.h b/sys/ofed/drivers/infiniband/core/cm_msgs.h
new file mode 100644
index 0000000..7e63c08
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/cm_msgs.h
@@ -0,0 +1,819 @@
+/*
+ * Copyright (c) 2004 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if !defined(CM_MSGS_H)
+#define CM_MSGS_H
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_cm.h>
+
+/*
+ * Parameters to routines below should be in network-byte order, and values
+ * are returned in network-byte order.
+ */
+
+/* CM class version carried by these messages. */
+#define IB_CM_CLASS_VERSION	2 /* IB specification 1.2 */
+
+/*
+ * Attribute IDs of the CM message types, already converted to big-endian
+ * with cpu_to_be16() so they can be compared with or stored in wire fields.
+ */
+#define CM_REQ_ATTR_ID		cpu_to_be16(0x0010)
+#define CM_MRA_ATTR_ID		cpu_to_be16(0x0011)
+#define CM_REJ_ATTR_ID		cpu_to_be16(0x0012)
+#define CM_REP_ATTR_ID		cpu_to_be16(0x0013)
+#define CM_RTU_ATTR_ID		cpu_to_be16(0x0014)
+#define CM_DREQ_ATTR_ID		cpu_to_be16(0x0015)
+#define CM_DREP_ATTR_ID		cpu_to_be16(0x0016)
+#define CM_SIDR_REQ_ATTR_ID	cpu_to_be16(0x0017)
+#define CM_SIDR_REP_ATTR_ID	cpu_to_be16(0x0018)
+#define CM_LAP_ATTR_ID		cpu_to_be16(0x0019)
+#define CM_APR_ATTR_ID		cpu_to_be16(0x001A)
+
+/*
+ * Message sequence identifiers (values 0..3 by declaration order).
+ * NOTE(review): the users of this enum are outside this header; presumably
+ * it tags which exchange (REQ, LAP, DREQ, SIDR) a message belongs to.
+ */
+enum cm_msg_sequence {
+	CM_MSG_SEQUENCE_REQ,
+	CM_MSG_SEQUENCE_LAP,
+	CM_MSG_SEQUENCE_DREQ,
+	CM_MSG_SEQUENCE_SIDR
+};
+
+/*
+ * Packed layout of a CM REQ message: the MAD header followed by the REQ
+ * fields.  Members named offsetNN / *_offsetNN pack several bit-fields into
+ * one word (NN is the member's byte offset after the MAD header); the
+ * comment above each one lists the sub-fields from most to least
+ * significant.  Use the cm_req_get_*()/cm_req_set_*() helpers to access them.
+ */
+struct cm_req_msg {
+	struct ib_mad_hdr hdr;
+
+	__be32 local_comm_id;
+	__be32 rsvd4;
+	__be64 service_id;
+	__be64 local_ca_guid;
+	__be32 rsvd24;
+	__be32 local_qkey;
+	/* local QPN:24, responder resources:8 */
+	__be32 offset32;
+	/* local EECN:24, initiator depth:8 */
+	__be32 offset36;
+	/*
+	 * remote EECN:24, remote CM response timeout:5,
+	 * transport service type:2, end-to-end flow control:1
+	 */
+	__be32 offset40;
+	/* starting PSN:24, local CM response timeout:5, retry count:3 */
+	__be32 offset44;
+	__be16 pkey;
+	/* path MTU:4, RDC exists:1, RNR retry count:3. */
+	u8 offset50;
+	/* max CM Retries:4, SRQ:1, rsvd:3 */
+	u8 offset51;
+
+	/* Primary path. */
+	__be16 primary_local_lid;
+	__be16 primary_remote_lid;
+	union ib_gid primary_local_gid;
+	union ib_gid primary_remote_gid;
+	/* flow label:20, rsvd:6, packet rate:6 */
+	__be32 primary_offset88;
+	u8 primary_traffic_class;
+	u8 primary_hop_limit;
+	/* SL:4, subnet local:1, rsvd:3 */
+	u8 primary_offset94;
+	/* local ACK timeout:5, rsvd:3 */
+	u8 primary_offset95;
+
+	/* Alternate path; same layout as the primary path fields. */
+	__be16 alt_local_lid;
+	__be16 alt_remote_lid;
+	union ib_gid alt_local_gid;
+	union ib_gid alt_remote_gid;
+	/* flow label:20, rsvd:6, packet rate:6 */
+	__be32 alt_offset132;
+	u8 alt_traffic_class;
+	u8 alt_hop_limit;
+	/* SL:4, subnet local:1, rsvd:3 */
+	u8 alt_offset138;
+	/* local ACK timeout:5, rsvd:3 */
+	u8 alt_offset139;
+
+	u8 private_data[IB_CM_REQ_PRIVATE_DATA_SIZE];
+
+} __attribute__ ((packed));
+
+/* Return the local QPN (upper 24 bits of offset32) as a big-endian value. */
+static inline __be32 cm_req_get_local_qpn(struct cm_req_msg *req_msg)
+{
+	u32 word = be32_to_cpu(req_msg->offset32);
+
+	return cpu_to_be32(word >> 8);
+}
+
+/* Store a big-endian QPN in the upper 24 bits of offset32; low byte is kept. */
+static inline void cm_req_set_local_qpn(struct cm_req_msg *req_msg, __be32 qpn)
+{
+	u32 low_byte = be32_to_cpu(req_msg->offset32) & 0x000000FF;
+	u32 qpn_bits = be32_to_cpu(qpn) << 8;
+
+	req_msg->offset32 = cpu_to_be32(qpn_bits | low_byte);
+}
+
+/* Return the responder resources value: the low byte of offset32. */
+static inline u8 cm_req_get_resp_res(struct cm_req_msg *req_msg)
+{
+	u32 word = be32_to_cpu(req_msg->offset32);
+
+	return (u8) word;
+}
+
+/* Replace the low byte of offset32 with resp_res; the local QPN is kept. */
+static inline void cm_req_set_resp_res(struct cm_req_msg *req_msg, u8 resp_res)
+{
+	u32 qpn_bits = be32_to_cpu(req_msg->offset32) & 0xFFFFFF00;
+
+	req_msg->offset32 = cpu_to_be32(qpn_bits | resp_res);
+}
+
+/* Return the initiator depth: the low byte of offset36. */
+static inline u8 cm_req_get_init_depth(struct cm_req_msg *req_msg)
+{
+	u32 word = be32_to_cpu(req_msg->offset36);
+
+	return (u8) word;
+}
+
+/* Replace the low byte of offset36 with init_depth; upper 24 bits are kept. */
+static inline void cm_req_set_init_depth(struct cm_req_msg *req_msg,
+					 u8 init_depth)
+{
+	u32 upper = be32_to_cpu(req_msg->offset36) & 0xFFFFFF00;
+
+	req_msg->offset36 = cpu_to_be32(upper | init_depth);
+}
+
+/* Return the 5-bit remote CM response timeout held in bits 7:3 of offset40. */
+static inline u8 cm_req_get_remote_resp_timeout(struct cm_req_msg *req_msg)
+{
+	return (u8) ((be32_to_cpu(req_msg->offset40) & 0xF8) >> 3);
+}
+
+/*
+ * Store the remote CM response timeout in bits 7:3 of offset40.
+ *
+ * Only the low 5 bits of resp_timeout are used, so an out-of-range value
+ * cannot spill into the remote EECN held in the upper 24 bits.  All other
+ * bits of the word are preserved.
+ */
+static inline void cm_req_set_remote_resp_timeout(struct cm_req_msg *req_msg,
+						  u8 resp_timeout)
+{
+	req_msg->offset40 = cpu_to_be32(((resp_timeout & 0x1F) << 3) |
+					 (be32_to_cpu(req_msg->offset40) &
+					  0xFFFFFF07));
+}
+
+/*
+ * Decode the 2-bit transport service type (bits 2:1 of offset40):
+ * 0 maps to IB_QPT_RC, 1 to IB_QPT_UC, any other value returns 0.
+ */
+static inline enum ib_qp_type cm_req_get_qp_type(struct cm_req_msg *req_msg)
+{
+	u32 word = be32_to_cpu(req_msg->offset40);
+	u8 transport_type = (u8) ((word >> 1) & 0x3);
+
+	if (transport_type == 0)
+		return IB_QPT_RC;
+	if (transport_type == 1)
+		return IB_QPT_UC;
+	return 0;
+}
+
+/*
+ * Encode the transport service type in bits 2:1 of offset40: IB_QPT_UC is
+ * written as 1, every other qp_type as 0.  Remaining bits are preserved.
+ */
+static inline void cm_req_set_qp_type(struct cm_req_msg *req_msg,
+				      enum ib_qp_type qp_type)
+{
+	u32 word = be32_to_cpu(req_msg->offset40) & 0xFFFFFFF9;
+
+	if (qp_type == IB_QPT_UC)
+		word |= 0x2;
+
+	req_msg->offset40 = cpu_to_be32(word);
+}
+
+/* Return the end-to-end flow control flag: bit 0 of offset40. */
+static inline u8 cm_req_get_flow_ctrl(struct cm_req_msg *req_msg)
+{
+	u32 word = be32_to_cpu(req_msg->offset40);
+
+	return (u8) (word & 0x1);
+}
+
+/* Set bit 0 of offset40 from the low bit of flow_ctrl; other bits are kept. */
+static inline void cm_req_set_flow_ctrl(struct cm_req_msg *req_msg,
+					u8 flow_ctrl)
+{
+	u32 others = be32_to_cpu(req_msg->offset40) & 0xFFFFFFFE;
+
+	req_msg->offset40 = cpu_to_be32(others | (flow_ctrl & 0x1));
+}
+
+/* Return the starting PSN (upper 24 bits of offset44) as a big-endian value. */
+static inline __be32 cm_req_get_starting_psn(struct cm_req_msg *req_msg)
+{
+	u32 word = be32_to_cpu(req_msg->offset44);
+
+	return cpu_to_be32(word >> 8);
+}
+
+/* Store a big-endian PSN in the upper 24 bits of offset44; low byte is kept. */
+static inline void cm_req_set_starting_psn(struct cm_req_msg *req_msg,
+					   __be32 starting_psn)
+{
+	u32 low_byte = be32_to_cpu(req_msg->offset44) & 0x000000FF;
+	u32 psn_bits = be32_to_cpu(starting_psn) << 8;
+
+	req_msg->offset44 = cpu_to_be32(psn_bits | low_byte);
+}
+
+/* Return the 5-bit local CM response timeout held in bits 7:3 of offset44. */
+static inline u8 cm_req_get_local_resp_timeout(struct cm_req_msg *req_msg)
+{
+	return (u8) ((be32_to_cpu(req_msg->offset44) & 0xF8) >> 3);
+}
+
+/*
+ * Store the local CM response timeout in bits 7:3 of offset44.
+ *
+ * Only the low 5 bits of resp_timeout are used, so an out-of-range value
+ * cannot spill into the starting PSN held in the upper 24 bits.  All other
+ * bits of the word are preserved.
+ */
+static inline void cm_req_set_local_resp_timeout(struct cm_req_msg *req_msg,
+						 u8 resp_timeout)
+{
+	req_msg->offset44 = cpu_to_be32(((resp_timeout & 0x1F) << 3) |
+			    (be32_to_cpu(req_msg->offset44) & 0xFFFFFF07));
+}
+
+/* Return the 3-bit retry count: bits 2:0 of offset44. */
+static inline u8 cm_req_get_retry_count(struct cm_req_msg *req_msg)
+{
+	u32 word = be32_to_cpu(req_msg->offset44);
+
+	return (u8) (word & 0x7);
+}
+
+/* Store the low 3 bits of retry_count in bits 2:0 of offset44. */
+static inline void cm_req_set_retry_count(struct cm_req_msg *req_msg,
+					  u8 retry_count)
+{
+	u32 others = be32_to_cpu(req_msg->offset44) & 0xFFFFFFF8;
+
+	req_msg->offset44 = cpu_to_be32(others | (retry_count & 0x7));
+}
+
+/* Return the path MTU code: the upper nibble of offset50. */
+static inline u8 cm_req_get_path_mtu(struct cm_req_msg *req_msg)
+{
+	return (u8) ((req_msg->offset50 & 0xF0) >> 4);
+}
+
+/* Store path_mtu in the upper nibble of offset50; the lower nibble is kept. */
+static inline void cm_req_set_path_mtu(struct cm_req_msg *req_msg, u8 path_mtu)
+{
+	u8 low_nibble = req_msg->offset50 & 0xF;
+
+	req_msg->offset50 = (u8) (low_nibble | (path_mtu << 4));
+}
+
+/* Return the RNR retry count: bits 2:0 of offset50. */
+static inline u8 cm_req_get_rnr_retry_count(struct cm_req_msg *req_msg)
+{
+	u8 byte = req_msg->offset50;
+
+	return byte & 0x7;
+}
+
+/* Store the low 3 bits of rnr_retry_count in bits 2:0 of offset50. */
+static inline void cm_req_set_rnr_retry_count(struct cm_req_msg *req_msg,
+					      u8 rnr_retry_count)
+{
+	u8 others = req_msg->offset50 & 0xF8;
+
+	req_msg->offset50 = (u8) (others | (rnr_retry_count & 0x7));
+}
+
+/* Return max CM retries: the upper nibble of offset51. */
+static inline u8 cm_req_get_max_cm_retries(struct cm_req_msg *req_msg)
+{
+	return (u8) ((req_msg->offset51 & 0xF0) >> 4);
+}
+
+/* Store retries in the upper nibble of offset51; the lower nibble is kept. */
+static inline void cm_req_set_max_cm_retries(struct cm_req_msg *req_msg,
+					     u8 retries)
+{
+	u8 low_nibble = req_msg->offset51 & 0xF;
+
+	req_msg->offset51 = (u8) (low_nibble | (retries << 4));
+}
+
+/* Return the SRQ flag: bit 3 of offset51. */
+static inline u8 cm_req_get_srq(struct cm_req_msg *req_msg)
+{
+	return (req_msg->offset51 >> 3) & 0x1;
+}
+
+/* Set bit 3 of offset51 from the low bit of srq; other bits are kept. */
+static inline void cm_req_set_srq(struct cm_req_msg *req_msg, u8 srq)
+{
+	u8 others = req_msg->offset51 & 0xF7;
+	u8 flag = (srq & 0x1) << 3;
+
+	req_msg->offset51 = (u8) (others | flag);
+}
+
+/* Return the primary flow label (upper 20 bits of primary_offset88), big-endian. */
+static inline __be32 cm_req_get_primary_flow_label(struct cm_req_msg *req_msg)
+{
+	u32 word = be32_to_cpu(req_msg->primary_offset88);
+
+	return cpu_to_be32(word >> 12);
+}
+
+/* Store a big-endian flow label in the upper 20 bits of primary_offset88. */
+static inline void cm_req_set_primary_flow_label(struct cm_req_msg *req_msg,
+						 __be32 flow_label)
+{
+	u32 low_bits = be32_to_cpu(req_msg->primary_offset88) & 0x00000FFF;
+	u32 label_bits = be32_to_cpu(flow_label) << 12;
+
+	req_msg->primary_offset88 = cpu_to_be32(low_bits | label_bits);
+}
+
+/* Return the primary packet rate: bits 5:0 of primary_offset88. */
+static inline u8 cm_req_get_primary_packet_rate(struct cm_req_msg *req_msg)
+{
+	u32 word = be32_to_cpu(req_msg->primary_offset88);
+
+	return (u8) (word & 0x3F);
+}
+
+/* Store the low 6 bits of rate in bits 5:0 of primary_offset88. */
+static inline void cm_req_set_primary_packet_rate(struct cm_req_msg *req_msg,
+						  u8 rate)
+{
+	u32 others = be32_to_cpu(req_msg->primary_offset88) & 0xFFFFFFC0;
+
+	req_msg->primary_offset88 = cpu_to_be32(others | (rate & 0x3F));
+}
+
+/* Return the primary path SL: the upper nibble of primary_offset94. */
+static inline u8 cm_req_get_primary_sl(struct cm_req_msg *req_msg)
+{
+	u8 byte = req_msg->primary_offset94;
+
+	return (u8) (byte >> 4);
+}
+
+/* Store sl in the upper nibble of primary_offset94; lower nibble is kept. */
+static inline void cm_req_set_primary_sl(struct cm_req_msg *req_msg, u8 sl)
+{
+	u8 low_nibble = req_msg->primary_offset94 & 0x0F;
+
+	req_msg->primary_offset94 = (u8) (low_nibble | (sl << 4));
+}
+
+/* Return the primary path subnet-local flag: bit 3 of primary_offset94. */
+static inline u8 cm_req_get_primary_subnet_local(struct cm_req_msg *req_msg)
+{
+	return (u8) ((req_msg->primary_offset94 >> 3) & 0x1);
+}
+
+/* Set bit 3 of primary_offset94 from the low bit of subnet_local. */
+static inline void cm_req_set_primary_subnet_local(struct cm_req_msg *req_msg,
+						   u8 subnet_local)
+{
+	u8 others = req_msg->primary_offset94 & 0xF7;
+	u8 flag = (subnet_local & 0x1) << 3;
+
+	req_msg->primary_offset94 = (u8) (others | flag);
+}
+
+/* Return the primary local ACK timeout: the upper 5 bits of primary_offset95. */
+static inline u8 cm_req_get_primary_local_ack_timeout(struct cm_req_msg *req_msg)
+{
+	u8 byte = req_msg->primary_offset95;
+
+	return (u8) (byte >> 3);
+}
+
+/* Store local_ack_timeout in the upper 5 bits of primary_offset95. */
+static inline void cm_req_set_primary_local_ack_timeout(struct cm_req_msg *req_msg,
+							u8 local_ack_timeout)
+{
+	u8 low_bits = req_msg->primary_offset95 & 0x07;
+
+	req_msg->primary_offset95 = (u8) (low_bits | (local_ack_timeout << 3));
+}
+
+/* Return the alternate flow label (upper 20 bits of alt_offset132), big-endian. */
+static inline __be32 cm_req_get_alt_flow_label(struct cm_req_msg *req_msg)
+{
+	u32 word = be32_to_cpu(req_msg->alt_offset132);
+
+	return cpu_to_be32(word >> 12);
+}
+
+/* Store a big-endian flow label in the upper 20 bits of alt_offset132. */
+static inline void cm_req_set_alt_flow_label(struct cm_req_msg *req_msg,
+					     __be32 flow_label)
+{
+	u32 low_bits = be32_to_cpu(req_msg->alt_offset132) & 0x00000FFF;
+	u32 label_bits = be32_to_cpu(flow_label) << 12;
+
+	req_msg->alt_offset132 = cpu_to_be32(low_bits | label_bits);
+}
+
+/* Return the alternate packet rate: bits 5:0 of alt_offset132. */
+static inline u8 cm_req_get_alt_packet_rate(struct cm_req_msg *req_msg)
+{
+	u32 word = be32_to_cpu(req_msg->alt_offset132);
+
+	return (u8) (word & 0x3F);
+}
+
+/* Store the low 6 bits of rate in bits 5:0 of alt_offset132. */
+static inline void cm_req_set_alt_packet_rate(struct cm_req_msg *req_msg,
+					      u8 rate)
+{
+	u32 others = be32_to_cpu(req_msg->alt_offset132) & 0xFFFFFFC0;
+
+	req_msg->alt_offset132 = cpu_to_be32(others | (rate & 0x3F));
+}
+
+/* Return the alternate path SL: the upper nibble of alt_offset138. */
+static inline u8 cm_req_get_alt_sl(struct cm_req_msg *req_msg)
+{
+	u8 byte = req_msg->alt_offset138;
+
+	return (u8) (byte >> 4);
+}
+
+/* Store sl in the upper nibble of alt_offset138; the lower nibble is kept. */
+static inline void cm_req_set_alt_sl(struct cm_req_msg *req_msg, u8 sl)
+{
+	u8 low_nibble = req_msg->alt_offset138 & 0x0F;
+
+	req_msg->alt_offset138 = (u8) (low_nibble | (sl << 4));
+}
+
+/* Return the alternate path subnet-local flag: bit 3 of alt_offset138. */
+static inline u8 cm_req_get_alt_subnet_local(struct cm_req_msg *req_msg)
+{
+	return (u8) ((req_msg->alt_offset138 >> 3) & 0x1);
+}
+
+/* Set bit 3 of alt_offset138 from the low bit of subnet_local. */
+static inline void cm_req_set_alt_subnet_local(struct cm_req_msg *req_msg,
+					       u8 subnet_local)
+{
+	u8 others = req_msg->alt_offset138 & 0xF7;
+	u8 flag = (subnet_local & 0x1) << 3;
+
+	req_msg->alt_offset138 = (u8) (others | flag);
+}
+
+/* Return the alternate local ACK timeout: the upper 5 bits of alt_offset139. */
+static inline u8 cm_req_get_alt_local_ack_timeout(struct cm_req_msg *req_msg)
+{
+	u8 byte = req_msg->alt_offset139;
+
+	return (u8) (byte >> 3);
+}
+
+/* Store local_ack_timeout in the upper 5 bits of alt_offset139. */
+static inline void cm_req_set_alt_local_ack_timeout(struct cm_req_msg *req_msg,
+						    u8 local_ack_timeout)
+{
+	u8 low_bits = req_msg->alt_offset139 & 0x07;
+
+	req_msg->alt_offset139 = (u8) (low_bits | (local_ack_timeout << 3));
+}
+
+/*
+ * Message REJected or MRAed: identifies which message a REJ or MRA refers
+ * to.  The values fit the 2-bit "message MRAed"/"message REJected" fields of
+ * struct cm_mra_msg and struct cm_rej_msg.
+ */
+enum cm_msg_response {
+	CM_MSG_RESPONSE_REQ = 0x0,
+	CM_MSG_RESPONSE_REP = 0x1,
+	CM_MSG_RESPONSE_OTHER = 0x2
+};
+
+/*
+ * Packed layout of a CM MRA message.  offset8 and offset9 are bit-packed
+ * bytes; use the cm_mra_get_*()/cm_mra_set_*() helpers to access them.
+ */
+ struct cm_mra_msg {
+	struct ib_mad_hdr hdr;
+
+	__be32 local_comm_id;
+	__be32 remote_comm_id;
+	/* message MRAed:2, rsvd:6 */
+	u8 offset8;
+	/* service timeout:5, rsvd:3 */
+	u8 offset9;
+
+	u8 private_data[IB_CM_MRA_PRIVATE_DATA_SIZE];
+
+} __attribute__ ((packed));
+
+/* Return the "message MRAed" code: the top 2 bits of offset8. */
+static inline u8 cm_mra_get_msg_mraed(struct cm_mra_msg *mra_msg)
+{
+	u8 byte = mra_msg->offset8;
+
+	return (u8) (byte >> 6);
+}
+
+/* Store msg in the top 2 bits of offset8; the low 6 bits are kept. */
+static inline void cm_mra_set_msg_mraed(struct cm_mra_msg *mra_msg, u8 msg)
+{
+	u8 low_bits = mra_msg->offset8 & 0x3F;
+
+	mra_msg->offset8 = (u8) (low_bits | (msg << 6));
+}
+
+/* Return the service timeout: the upper 5 bits of offset9. */
+static inline u8 cm_mra_get_service_timeout(struct cm_mra_msg *mra_msg)
+{
+	u8 byte = mra_msg->offset9;
+
+	return (u8) (byte >> 3);
+}
+
+/* Store service_timeout in the upper 5 bits of offset9; low 3 bits are kept. */
+static inline void cm_mra_set_service_timeout(struct cm_mra_msg *mra_msg,
+					      u8 service_timeout)
+{
+	u8 low_bits = mra_msg->offset9 & 0x07;
+
+	mra_msg->offset9 = (u8) (low_bits | (service_timeout << 3));
+}
+
+/*
+ * Packed layout of a CM REJ message.  offset8 and offset9 are bit-packed
+ * bytes; use the cm_rej_get_*()/cm_rej_set_*() helpers to access them.
+ */
+struct cm_rej_msg {
+	struct ib_mad_hdr hdr;
+
+	__be32 local_comm_id;
+	__be32 remote_comm_id;
+	/* message REJected:2, rsvd:6 */
+	u8 offset8;
+	/* reject info length:7, rsvd:1. */
+	u8 offset9;
+	__be16 reason;
+	u8 ari[IB_CM_REJ_ARI_LENGTH];
+
+	u8 private_data[IB_CM_REJ_PRIVATE_DATA_SIZE];
+
+} __attribute__ ((packed));
+
+/* Return the "message REJected" code: the top 2 bits of offset8. */
+static inline u8 cm_rej_get_msg_rejected(struct cm_rej_msg *rej_msg)
+{
+	u8 byte = rej_msg->offset8;
+
+	return (u8) (byte >> 6);
+}
+
+/* Store msg in the top 2 bits of offset8; the low 6 bits are kept. */
+static inline void cm_rej_set_msg_rejected(struct cm_rej_msg *rej_msg, u8 msg)
+{
+	u8 low_bits = rej_msg->offset8 & 0x3F;
+
+	rej_msg->offset8 = (u8) (low_bits | (msg << 6));
+}
+
+/* Return the reject info length: the upper 7 bits of offset9. */
+static inline u8 cm_rej_get_reject_info_len(struct cm_rej_msg *rej_msg)
+{
+	u8 byte = rej_msg->offset9;
+
+	return (u8) (byte >> 1);
+}
+
+/* Store len in the upper 7 bits of offset9; bit 0 is kept. */
+static inline void cm_rej_set_reject_info_len(struct cm_rej_msg *rej_msg,
+					      u8 len)
+{
+	u8 low_bit = rej_msg->offset9 & 0x1;
+
+	rej_msg->offset9 = (u8) (low_bit | (len << 1));
+}
+
+/*
+ * Packed layout of a CM REP message.  Members named offsetNN pack several
+ * bit-fields (listed most significant first in the comment above each);
+ * use the cm_rep_get_*()/cm_rep_set_*() helpers to access them.
+ */
+struct cm_rep_msg {
+	struct ib_mad_hdr hdr;
+
+	__be32 local_comm_id;
+	__be32 remote_comm_id;
+	__be32 local_qkey;
+	/* local QPN:24, rsvd:8 */
+	__be32 offset12;
+	/* local EECN:24, rsvd:8 */
+	__be32 offset16;
+	/* starting PSN:24 rsvd:8 */
+	__be32 offset20;
+	u8 resp_resources;
+	u8 initiator_depth;
+	/* target ACK delay:5, failover accepted:2, end-to-end flow control:1 */
+	u8 offset26;
+	/* RNR retry count:3, SRQ:1, rsvd:4 */
+	u8 offset27;
+	__be64 local_ca_guid;
+
+	u8 private_data[IB_CM_REP_PRIVATE_DATA_SIZE];
+
+} __attribute__ ((packed));
+
+/* Return the local QPN (upper 24 bits of offset12) as a big-endian value. */
+static inline __be32 cm_rep_get_local_qpn(struct cm_rep_msg *rep_msg)
+{
+	u32 word = be32_to_cpu(rep_msg->offset12);
+
+	return cpu_to_be32(word >> 8);
+}
+
+/* Store a big-endian QPN in the upper 24 bits of offset12; low byte is kept. */
+static inline void cm_rep_set_local_qpn(struct cm_rep_msg *rep_msg, __be32 qpn)
+{
+	u32 low_byte = be32_to_cpu(rep_msg->offset12) & 0x000000FF;
+	u32 qpn_bits = be32_to_cpu(qpn) << 8;
+
+	rep_msg->offset12 = cpu_to_be32(qpn_bits | low_byte);
+}
+
+/* Return the starting PSN (upper 24 bits of offset20) as a big-endian value. */
+static inline __be32 cm_rep_get_starting_psn(struct cm_rep_msg *rep_msg)
+{
+	u32 word = be32_to_cpu(rep_msg->offset20);
+
+	return cpu_to_be32(word >> 8);
+}
+
+/* Store a big-endian PSN in the upper 24 bits of offset20; low byte is kept. */
+static inline void cm_rep_set_starting_psn(struct cm_rep_msg *rep_msg,
+					   __be32 starting_psn)
+{
+	u32 low_byte = be32_to_cpu(rep_msg->offset20) & 0x000000FF;
+	u32 psn_bits = be32_to_cpu(starting_psn) << 8;
+
+	rep_msg->offset20 = cpu_to_be32(psn_bits | low_byte);
+}
+
+/* Return the target ACK delay: the upper 5 bits of offset26. */
+static inline u8 cm_rep_get_target_ack_delay(struct cm_rep_msg *rep_msg)
+{
+	u8 byte = rep_msg->offset26;
+
+	return (u8) (byte >> 3);
+}
+
+/* Store target_ack_delay in the upper 5 bits of offset26; low 3 bits kept. */
+static inline void cm_rep_set_target_ack_delay(struct cm_rep_msg *rep_msg,
+					       u8 target_ack_delay)
+{
+	u8 low_bits = rep_msg->offset26 & 0x07;
+
+	rep_msg->offset26 = (u8) (low_bits | (target_ack_delay << 3));
+}
+
+/* Return the 2-bit "failover accepted" code: bits 2:1 of offset26. */
+static inline u8 cm_rep_get_failover(struct cm_rep_msg *rep_msg)
+{
+	return (u8) ((rep_msg->offset26 >> 1) & 0x3);
+}
+
+/* Store the low 2 bits of failover in bits 2:1 of offset26. */
+static inline void cm_rep_set_failover(struct cm_rep_msg *rep_msg, u8 failover)
+{
+	u8 others = rep_msg->offset26 & 0xF9;
+	u8 code = (failover & 0x3) << 1;
+
+	rep_msg->offset26 = (u8) (others | code);
+}
+
+/* Return the end-to-end flow control flag: bit 0 of offset26. */
+static inline u8 cm_rep_get_flow_ctrl(struct cm_rep_msg *rep_msg)
+{
+	u8 byte = rep_msg->offset26;
+
+	return (u8) (byte & 0x01);
+}
+
+/* Set bit 0 of offset26 from the low bit of flow_ctrl; other bits are kept. */
+static inline void cm_rep_set_flow_ctrl(struct cm_rep_msg *rep_msg,
+					    u8 flow_ctrl)
+{
+	u8 others = rep_msg->offset26 & 0xFE;
+
+	rep_msg->offset26 = (u8) (others | (flow_ctrl & 0x1));
+}
+
+/* Return the RNR retry count: the top 3 bits of offset27. */
+static inline u8 cm_rep_get_rnr_retry_count(struct cm_rep_msg *rep_msg)
+{
+	u8 byte = rep_msg->offset27;
+
+	return (u8) (byte >> 5);
+}
+
+/* Store rnr_retry_count in the top 3 bits of offset27; low 5 bits are kept. */
+static inline void cm_rep_set_rnr_retry_count(struct cm_rep_msg *rep_msg,
+					      u8 rnr_retry_count)
+{
+	u8 low_bits = rep_msg->offset27 & 0x1F;
+
+	rep_msg->offset27 = (u8) (low_bits | (rnr_retry_count << 5));
+}
+
+/* Return the SRQ flag: bit 4 of offset27. */
+static inline u8 cm_rep_get_srq(struct cm_rep_msg *rep_msg)
+{
+	return (u8) ((rep_msg->offset27 & 0x10) >> 4);
+}
+
+/* Set bit 4 of offset27 from the low bit of srq; other bits are kept. */
+static inline void cm_rep_set_srq(struct cm_rep_msg *rep_msg, u8 srq)
+{
+	u8 others = rep_msg->offset27 & 0xEF;
+	u8 flag = (srq & 0x1) << 4;
+
+	rep_msg->offset27 = (u8) (others | flag);
+}
+
+/* Packed layout of a CM RTU message: comm IDs followed by private data. */
+struct cm_rtu_msg {
+	struct ib_mad_hdr hdr;
+
+	__be32 local_comm_id;
+	__be32 remote_comm_id;
+
+	u8 private_data[IB_CM_RTU_PRIVATE_DATA_SIZE];
+
+} __attribute__ ((packed));
+
+/*
+ * Packed layout of a CM DREQ message.  offset8 is bit-packed; use
+ * cm_dreq_get_remote_qpn()/cm_dreq_set_remote_qpn() to access it.
+ */
+struct cm_dreq_msg {
+	struct ib_mad_hdr hdr;
+
+	__be32 local_comm_id;
+	__be32 remote_comm_id;
+	/* remote QPN/EECN:24, rsvd:8 */
+	__be32 offset8;
+
+	u8 private_data[IB_CM_DREQ_PRIVATE_DATA_SIZE];
+
+} __attribute__ ((packed));
+
+/* Return the remote QPN (upper 24 bits of offset8) as a big-endian value. */
+static inline __be32 cm_dreq_get_remote_qpn(struct cm_dreq_msg *dreq_msg)
+{
+	u32 word = be32_to_cpu(dreq_msg->offset8);
+
+	return cpu_to_be32(word >> 8);
+}
+
+/* Store a big-endian QPN in the upper 24 bits of offset8; low byte is kept. */
+static inline void cm_dreq_set_remote_qpn(struct cm_dreq_msg *dreq_msg, __be32 qpn)
+{
+	u32 low_byte = be32_to_cpu(dreq_msg->offset8) & 0x000000FF;
+	u32 qpn_bits = be32_to_cpu(qpn) << 8;
+
+	dreq_msg->offset8 = cpu_to_be32(qpn_bits | low_byte);
+}
+
+/* Packed layout of a CM DREP message: comm IDs followed by private data. */
+struct cm_drep_msg {
+	struct ib_mad_hdr hdr;
+
+	__be32 local_comm_id;
+	__be32 remote_comm_id;
+
+	u8 private_data[IB_CM_DREP_PRIVATE_DATA_SIZE];
+
+} __attribute__ ((packed));
+
+/*
+ * Packed layout of a CM LAP message describing an alternate path.  Members
+ * named offsetNN pack several bit-fields (listed most significant first in
+ * the comment above each); use the cm_lap_get_*()/cm_lap_set_*() helpers.
+ */
+struct cm_lap_msg {
+	struct ib_mad_hdr hdr;
+
+	__be32 local_comm_id;
+	__be32 remote_comm_id;
+
+	__be32 rsvd8;
+	/* remote QPN/EECN:24, remote CM response timeout:5, rsvd:3 */
+	__be32 offset12;
+	__be32 rsvd16;
+
+	__be16 alt_local_lid;
+	__be16 alt_remote_lid;
+	union ib_gid alt_local_gid;
+	union ib_gid alt_remote_gid;
+	/* flow label:20, rsvd:4, traffic class:8 */
+	__be32 offset56;
+	u8 alt_hop_limit;
+	/* rsvd:2, packet rate:6 */
+	u8 offset61;
+	/* SL:4, subnet local:1, rsvd:3 */
+	u8 offset62;
+	/* local ACK timeout:5, rsvd:3 */
+	u8 offset63;
+
+	u8 private_data[IB_CM_LAP_PRIVATE_DATA_SIZE];
+} __attribute__  ((packed));
+
+/* Return the remote QPN (upper 24 bits of offset12) as a big-endian value. */
+static inline __be32 cm_lap_get_remote_qpn(struct cm_lap_msg *lap_msg)
+{
+	u32 word = be32_to_cpu(lap_msg->offset12);
+
+	return cpu_to_be32(word >> 8);
+}
+
+/* Store a big-endian QPN in the upper 24 bits of offset12; low byte is kept. */
+static inline void cm_lap_set_remote_qpn(struct cm_lap_msg *lap_msg, __be32 qpn)
+{
+	u32 low_byte = be32_to_cpu(lap_msg->offset12) & 0x000000FF;
+	u32 qpn_bits = be32_to_cpu(qpn) << 8;
+
+	lap_msg->offset12 = cpu_to_be32(qpn_bits | low_byte);
+}
+
+/* Return the 5-bit remote CM response timeout held in bits 7:3 of offset12. */
+static inline u8 cm_lap_get_remote_resp_timeout(struct cm_lap_msg *lap_msg)
+{
+	return (u8) ((be32_to_cpu(lap_msg->offset12) & 0xF8) >> 3);
+}
+
+/*
+ * Store the remote CM response timeout in bits 7:3 of offset12.
+ *
+ * Only the low 5 bits of resp_timeout are used, so an out-of-range value
+ * cannot spill into the remote QPN/EECN held in the upper 24 bits.  All
+ * other bits of the word are preserved.
+ */
+static inline void cm_lap_set_remote_resp_timeout(struct cm_lap_msg *lap_msg,
+						  u8 resp_timeout)
+{
+	lap_msg->offset12 = cpu_to_be32(((resp_timeout & 0x1F) << 3) |
+					 (be32_to_cpu(lap_msg->offset12) &
+					  0xFFFFFF07));
+}
+
+/* Return the flow label (upper 20 bits of offset56) as a big-endian value. */
+static inline __be32 cm_lap_get_flow_label(struct cm_lap_msg *lap_msg)
+{
+	u32 word = be32_to_cpu(lap_msg->offset56);
+
+	return cpu_to_be32(word >> 12);
+}
+
+/* Store a big-endian flow label in the upper 20 bits of offset56. */
+static inline void cm_lap_set_flow_label(struct cm_lap_msg *lap_msg,
+					 __be32 flow_label)
+{
+	u32 low_bits = be32_to_cpu(lap_msg->offset56) & 0x00000FFF;
+	u32 label_bits = be32_to_cpu(flow_label) << 12;
+
+	lap_msg->offset56 = cpu_to_be32(low_bits | label_bits);
+}
+
+/* Return the traffic class: the low byte of offset56. */
+static inline u8 cm_lap_get_traffic_class(struct cm_lap_msg *lap_msg)
+{
+	u32 word = be32_to_cpu(lap_msg->offset56);
+
+	return (u8) word;
+}
+
+/* Replace the low byte of offset56 with traffic_class; upper bits are kept. */
+static inline void cm_lap_set_traffic_class(struct cm_lap_msg *lap_msg,
+					    u8 traffic_class)
+{
+	u32 upper = be32_to_cpu(lap_msg->offset56) & 0xFFFFFF00;
+
+	lap_msg->offset56 = cpu_to_be32(upper | traffic_class);
+}
+
+/* Return the packet rate: bits 5:0 of offset61. */
+static inline u8 cm_lap_get_packet_rate(struct cm_lap_msg *lap_msg)
+{
+	u8 byte = lap_msg->offset61;
+
+	return byte & 0x3F;
+}
+
+/* Store the low 6 bits of packet_rate in offset61; the top 2 bits are kept. */
+static inline void cm_lap_set_packet_rate(struct cm_lap_msg *lap_msg,
+					  u8 packet_rate)
+{
+	u8 rsvd_bits = lap_msg->offset61 & 0xC0;
+
+	lap_msg->offset61 = rsvd_bits | (packet_rate & 0x3F);
+}
+
+/* Return the SL: the upper nibble of offset62. */
+static inline u8 cm_lap_get_sl(struct cm_lap_msg *lap_msg)
+{
+	u8 byte = lap_msg->offset62;
+
+	return byte >> 4;
+}
+
+/* Store sl in the upper nibble of offset62; the lower nibble is kept. */
+static inline void cm_lap_set_sl(struct cm_lap_msg *lap_msg, u8 sl)
+{
+	u8 low_nibble = lap_msg->offset62 & 0x0F;
+
+	lap_msg->offset62 = low_nibble | (sl << 4);
+}
+
+/* Return the subnet-local flag: bit 3 of offset62. */
+static inline u8 cm_lap_get_subnet_local(struct cm_lap_msg *lap_msg)
+{
+	return (lap_msg->offset62 >> 3) & 0x1;
+}
+
+/*
+ * Set the subnet-local flag (bit 3 of offset62) from the low bit of
+ * subnet_local.
+ *
+ * The bits that must survive (SL in the upper nibble and the reserved low
+ * bits) are taken from offset62 itself.  Taking them from offset61, the
+ * packet-rate byte, would overwrite the SL with unrelated data.
+ */
+static inline void cm_lap_set_subnet_local(struct cm_lap_msg *lap_msg,
+					   u8 subnet_local)
+{
+	lap_msg->offset62 = ((subnet_local & 0x1) << 3) |
+			     (lap_msg->offset62 & 0xF7);
+}
+
+/* Return the local ACK timeout: the upper 5 bits of offset63. */
+static inline u8 cm_lap_get_local_ack_timeout(struct cm_lap_msg *lap_msg)
+{
+	u8 byte = lap_msg->offset63;
+
+	return byte >> 3;
+}
+
+/* Store local_ack_timeout in the upper 5 bits of offset63; low 3 bits kept. */
+static inline void cm_lap_set_local_ack_timeout(struct cm_lap_msg *lap_msg,
+						u8 local_ack_timeout)
+{
+	u8 low_bits = lap_msg->offset63 & 0x07;
+
+	lap_msg->offset63 = low_bits | (local_ack_timeout << 3);
+}
+
+/*
+ * Packed layout of a CM APR message: comm IDs, a status byte, an info area
+ * whose length is carried in info_length, and private data.
+ */
+struct cm_apr_msg {
+	struct ib_mad_hdr hdr;
+
+	__be32 local_comm_id;
+	__be32 remote_comm_id;
+
+	u8 info_length;
+	u8 ap_status;
+	u8 info[IB_CM_APR_INFO_LENGTH];
+
+	u8 private_data[IB_CM_APR_PRIVATE_DATA_SIZE];
+} __attribute__ ((packed));
+
+/* Packed layout of a CM SIDR REQ message. */
+struct cm_sidr_req_msg {
+	struct ib_mad_hdr hdr;
+
+	__be32 request_id;
+	__be16 pkey;
+	__be16 rsvd;
+	__be64 service_id;
+
+	u8 private_data[IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE];
+} __attribute__ ((packed));
+
+/*
+ * Packed layout of a CM SIDR REP message.  offset8 is bit-packed; use
+ * cm_sidr_rep_get_qpn()/cm_sidr_rep_set_qpn() to access the QPN.
+ */
+struct cm_sidr_rep_msg {
+	struct ib_mad_hdr hdr;
+
+	__be32 request_id;
+	u8 status;
+	u8 info_length;
+	__be16 rsvd;
+	/* QPN:24, rsvd:8 */
+	__be32 offset8;
+	__be64 service_id;
+	__be32 qkey;
+	u8 info[IB_CM_SIDR_REP_INFO_LENGTH];
+
+	u8 private_data[IB_CM_SIDR_REP_PRIVATE_DATA_SIZE];
+} __attribute__ ((packed));
+
+/* Return the QPN (upper 24 bits of offset8) as a big-endian value. */
+static inline __be32 cm_sidr_rep_get_qpn(struct cm_sidr_rep_msg *sidr_rep_msg)
+{
+	u32 word = be32_to_cpu(sidr_rep_msg->offset8);
+
+	return cpu_to_be32(word >> 8);
+}
+
+/* Store a big-endian QPN in the upper 24 bits of offset8; low byte is kept. */
+static inline void cm_sidr_rep_set_qpn(struct cm_sidr_rep_msg *sidr_rep_msg,
+				       __be32 qpn)
+{
+	u32 low_byte = be32_to_cpu(sidr_rep_msg->offset8) & 0x000000FF;
+	u32 qpn_bits = be32_to_cpu(qpn) << 8;
+
+	sidr_rep_msg->offset8 = cpu_to_be32(qpn_bits | low_byte);
+}
+
+#endif /* CM_MSGS_H */
diff --git a/sys/ofed/drivers/infiniband/core/cma.c b/sys/ofed/drivers/infiniband/core/cma.c
new file mode 100644
index 0000000..c016451
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/cma.c
@@ -0,0 +1,3386 @@
+/*
+ * Copyright (c) 2005 Voltaire Inc.  All rights reserved.
+ * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved.
+ * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved.
+ * Copyright (c) 2005-2006 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/mutex.h>
+#include <linux/random.h>
+#include <linux/idr.h>
+#include <linux/inetdevice.h>
+
+#include <net/tcp.h>
+#include <net/ipv6.h>
+
+#include <rdma/rdma_cm.h>
+#include <rdma/rdma_cm_ib.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_cm.h>
+#include <rdma/ib_sa.h>
+#include <rdma/iw_cm.h>
+
+/* Module metadata. */
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("Generic RDMA CM Agent");
+MODULE_LICENSE("Dual BSD/GPL");
+
+/* Tunable: when > 0, limit the MTU to 1K (see the parameter description). */
+static int tavor_quirk = 0;
+module_param_named(tavor_quirk, tavor_quirk, int, 0644);
+MODULE_PARM_DESC(tavor_quirk, "Tavor performance quirk: limit MTU to 1K if > 0");
+
+/*
+ * Tunable: share port allocation between host TCP and RDMA.  Not static;
+ * NOTE(review): presumably referenced from another file - confirm.
+ */
+int unify_tcp_port_space = 0;
+module_param(unify_tcp_port_space, int, 0644);
+MODULE_PARM_DESC(unify_tcp_port_space, "Unify the host TCP and RDMA port "
+		 "space allocation (default=0)");
+
+/* Fixed defaults; their consumers are outside this part of the file. */
+#define CMA_CM_RESPONSE_TIMEOUT 20
+#define CMA_MAX_CM_RETRIES 15
+#define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24)
+#define IBOE_PACKET_LIFETIME 18
+
+/* Tunable: CM response timeout, defaulting to CMA_CM_RESPONSE_TIMEOUT. */
+static int cma_response_timeout = CMA_CM_RESPONSE_TIMEOUT;
+module_param_named(cma_response_timeout, cma_response_timeout, int, 0644);
+MODULE_PARM_DESC(cma_response_timeout, "CMA_CM_RESPONSE_TIMEOUT default=20");
+
+/* Tunable: default SL priority used with RoCE (valid values 0 - 7). */
+static int def_prec2sl = 3;
+module_param_named(def_prec2sl, def_prec2sl, int, 0644);
+MODULE_PARM_DESC(def_prec2sl, "Default value for SL priority with RoCE. Valid values 0 - 7");
+
+/* IB client callbacks, defined later in this file. */
+static void cma_add_one(struct ib_device *device);
+static void cma_remove_one(struct ib_device *device);
+
+/* IB client descriptor: routes device add/remove to the callbacks above. */
+static struct ib_client cma_client = {
+	.name   = "cma",
+	.add    = cma_add_one,
+	.remove = cma_remove_one
+};
+
+/* File-scope state shared by the CMA. */
+static struct ib_sa_client sa_client;
+static struct rdma_addr_client addr_client;
+static LIST_HEAD(dev_list);		/* struct cma_device entries */
+static LIST_HEAD(listen_any_list);
+static DEFINE_MUTEX(lock);
+static struct workqueue_struct *cma_wq;
+/* One idr per port space. */
+static DEFINE_IDR(sdp_ps);
+static DEFINE_IDR(tcp_ps);
+static DEFINE_IDR(udp_ps);
+static DEFINE_IDR(ipoib_ps);
+static int next_port;
+
+/*
+ * Per-ib_device bookkeeping.  refcount is raised by cma_attach_to_dev() and
+ * dropped by cma_deref_dev(), which completes comp when it reaches zero.
+ */
+struct cma_device {
+	struct list_head	list;		/* linkage on dev_list */
+	struct ib_device	*device;
+	struct completion	comp;
+	atomic_t		refcount;
+	struct list_head	id_list;	/* rdma_id_private.list entries */
+};
+
+/*
+ * States of an rdma_id_private.  rdma_create_id() starts an ID in CMA_IDLE;
+ * the state is read and changed under id_priv->lock via cma_comp(),
+ * cma_comp_exch() and cma_exch().
+ */
+enum cma_state {
+	CMA_IDLE,
+	CMA_ADDR_QUERY,
+	CMA_ADDR_RESOLVED,
+	CMA_ROUTE_QUERY,
+	CMA_ROUTE_RESOLVED,
+	CMA_CONNECT,
+	CMA_DISCONNECT,
+	CMA_ADDR_BOUND,
+	CMA_LISTEN,
+	CMA_DEVICE_REMOVAL,
+	CMA_DESTROYING
+};
+
+/*
+ * A bound port within one port-space idr.
+ * NOTE(review): owners presumably chains rdma_id_private.node - confirm.
+ */
+struct rdma_bind_list {
+	struct idr		*ps;
+	struct hlist_head	owners;
+	unsigned short		port;
+};
+
+/*
+ * Private state behind a public struct rdma_cm_id (embedded as the first
+ * member, "id").
+ *
+ * Device removal can occur at any time, so we need extra handling to
+ * serialize notifying the user of device removal with other callbacks.
+ * We do this by disabling removal notification while a callback is in process,
+ * and reporting it after the callback completes.
+ */
+struct rdma_id_private {
+	struct rdma_cm_id	id;
+
+	struct rdma_bind_list	*bind_list;
+	struct socket		*sock;
+	struct hlist_node	node;
+	struct list_head	list; /* listen_any_list or cma_device.list */
+	struct list_head	listen_list; /* per device listens */
+	struct cma_device	*cma_dev;
+	struct list_head	mc_list;
+
+	int			internal_id;
+	enum cma_state		state;	/* guarded by lock */
+	spinlock_t		lock;
+	struct mutex		qp_mutex;
+
+	struct completion	comp;	/* completed when refcount hits zero */
+	atomic_t		refcount;
+	struct mutex		handler_mutex;
+
+	int			backlog;
+	int			timeout_ms;
+	struct ib_sa_query	*query;
+	int			query_id;
+	union {
+		struct ib_cm_id	*ib;
+		struct iw_cm_id	*iw;
+	} cm_id;
+
+	u32			seq_num;
+	u32			qkey;
+	u32			qp_num;
+	u8			srq;
+	u8			tos;
+};
+
+/*
+ * One multicast membership of an ID.  Reference counted through mcref;
+ * release_mc() frees multicast.ib and the structure itself.
+ */
+struct cma_multicast {
+	struct rdma_id_private *id_priv;
+	union {
+		struct ib_sa_multicast *ib;
+	} multicast;
+	struct list_head	list;
+	void			*context;
+	struct sockaddr_storage	addr;
+	struct kref		mcref;
+};
+
+/*
+ * Deferred work carrying an event for an ID plus a state pair.
+ * NOTE(review): old_state/new_state presumably describe the transition to
+ * apply when the work runs - the handler is outside this view.
+ */
+struct cma_work {
+	struct work_struct	work;
+	struct rdma_id_private	*id;
+	enum cma_state		old_state;
+	enum cma_state		new_state;
+	struct rdma_cm_event	event;
+};
+
+/* Deferred work carrying an event for an ID, without a state pair. */
+struct cma_ndev_work {
+	struct work_struct	work;
+	struct rdma_id_private	*id;
+	struct rdma_cm_event	event;
+};
+
+/* Deferred work tying an ID to one of its multicast memberships. */
+struct iboe_mcast_work {
+	struct work_struct	 work;
+	struct rdma_id_private	*id;
+	struct cma_multicast	*mc;
+};
+
+/*
+ * IP address slot sized for IPv6; an IPv4 address occupies the last 32 bits
+ * (ip4.addr) after three padding words.
+ */
+union cma_ip_addr {
+	struct in6_addr ip6;
+	struct {
+		__be32 pad[3];
+		__be32 addr;
+	} ip4;
+};
+
+/* CMA header carrying the version, port and the two endpoint addresses. */
+struct cma_hdr {
+	u8 cma_version;
+	u8 ip_version;	/* IP version: 7:4 */
+	__be16 port;
+	union cma_ip_addr src_addr;
+	union cma_ip_addr dst_addr;
+};
+
+/* SDP hello header; carries the same port/address data as struct cma_hdr. */
+struct sdp_hh {
+	u8 bsdh[16];
+	u8 sdp_version; /* Major version: 7:4 */
+	u8 ip_version;	/* IP version: 7:4 */
+	u8 sdp_specific1[10];
+	__be16 port;
+	__be16 sdp_specific2;
+	union cma_ip_addr src_addr;
+	union cma_ip_addr dst_addr;
+};
+
+/* Leading part of an SDP header, up to and including the version byte. */
+struct sdp_hah {
+	u8 bsdh[16];
+	u8 sdp_version;
+};
+
+/* Protocol version constants for the headers above. */
+#define CMA_VERSION 0x00
+#define SDP_MAJ_VERSION 0x2
+
+/*
+ * Return non-zero when the ID is currently in state "comp".  The state is
+ * sampled under id_priv->lock.
+ */
+static int cma_comp(struct rdma_id_private *id_priv, enum cma_state comp)
+{
+	unsigned long irq_flags;
+	int matches;
+
+	spin_lock_irqsave(&id_priv->lock, irq_flags);
+	matches = (comp == id_priv->state);
+	spin_unlock_irqrestore(&id_priv->lock, irq_flags);
+
+	return matches;
+}
+
+/*
+ * Compare-and-exchange on the ID state, done under id_priv->lock: when the
+ * state equals "comp" it is replaced by "exch".  Returns non-zero when the
+ * exchange happened, zero when the state was left untouched.
+ */
+static int cma_comp_exch(struct rdma_id_private *id_priv,
+			 enum cma_state comp, enum cma_state exch)
+{
+	unsigned long irq_flags;
+	int swapped;
+
+	spin_lock_irqsave(&id_priv->lock, irq_flags);
+	swapped = (id_priv->state == comp);
+	if (swapped)
+		id_priv->state = exch;
+	spin_unlock_irqrestore(&id_priv->lock, irq_flags);
+
+	return swapped;
+}
+
+/*
+ * Unconditionally move the ID to state "exch" under id_priv->lock and
+ * return the state it was in before.
+ */
+static enum cma_state cma_exch(struct rdma_id_private *id_priv,
+			       enum cma_state exch)
+{
+	enum cma_state previous;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&id_priv->lock, irq_flags);
+	previous = id_priv->state;
+	id_priv->state = exch;
+	spin_unlock_irqrestore(&id_priv->lock, irq_flags);
+
+	return previous;
+}
+
+/* Return the IP version: the upper nibble of hdr->ip_version. */
+static inline u8 cma_get_ip_ver(struct cma_hdr *hdr)
+{
+	u8 byte = hdr->ip_version;
+
+	return byte >> 4;
+}
+
+/* Store ip_ver in the upper nibble of hdr->ip_version; lower nibble kept. */
+static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver)
+{
+	u8 low_nibble = hdr->ip_version & 0xF;
+
+	hdr->ip_version = low_nibble | (ip_ver << 4);
+}
+
+/* Return the SDP major version: the upper nibble of sdp_version. */
+static inline u8 sdp_get_majv(u8 sdp_version)
+{
+	return (sdp_version & 0xF0) >> 4;
+}
+
+/* Return the IP version: the upper nibble of hh->ip_version. */
+static inline u8 sdp_get_ip_ver(struct sdp_hh *hh)
+{
+	u8 byte = hh->ip_version;
+
+	return byte >> 4;
+}
+
+/* Store ip_ver in the upper nibble of hh->ip_version; lower nibble kept. */
+static inline void sdp_set_ip_ver(struct sdp_hh *hh, u8 ip_ver)
+{
+	u8 low_nibble = hh->ip_version & 0xF;
+
+	hh->ip_version = low_nibble | (ip_ver << 4);
+}
+
+/* Return 1 for the RDMA_PS_UDP and RDMA_PS_IPOIB port spaces, 0 otherwise. */
+static inline int cma_is_ud_ps(enum rdma_port_space ps)
+{
+	switch (ps) {
+	case RDMA_PS_UDP:
+	case RDMA_PS_IPOIB:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Bind an ID to a CMA device: take a device reference, record the device
+ * and its transport type in the ID, and queue the ID on the device's
+ * id_list.
+ */
+static void cma_attach_to_dev(struct rdma_id_private *id_priv,
+			      struct cma_device *cma_dev)
+{
+	struct rdma_cm_id *id = &id_priv->id;
+
+	atomic_inc(&cma_dev->refcount);
+
+	id_priv->cma_dev = cma_dev;
+	id->device = cma_dev->device;
+	id->route.addr.dev_addr.transport =
+		rdma_node_get_transport(cma_dev->device->node_type);
+
+	list_add_tail(&id_priv->list, &cma_dev->id_list);
+}
+
+/* Drop one device reference; the final drop completes cma_dev->comp. */
+static inline void cma_deref_dev(struct cma_device *cma_dev)
+{
+	if (!atomic_dec_and_test(&cma_dev->refcount))
+		return;
+
+	complete(&cma_dev->comp);
+}
+
+/*
+ * kref release callback for a cma_multicast: frees the attached
+ * multicast.ib object and then the cma_multicast that embeds the kref.
+ */
+static inline void release_mc(struct kref *kref)
+{
+	struct cma_multicast *mcast;
+
+	mcast = container_of(kref, struct cma_multicast, mcref);
+	kfree(mcast->multicast.ib);
+	kfree(mcast);
+}
+
+/*
+ * Undo cma_attach_to_dev(): unlink the ID from the device's list, drop the
+ * device reference and clear the ID's device pointer.
+ */
+static void cma_detach_from_dev(struct rdma_id_private *id_priv)
+{
+	struct cma_device *cma_dev = id_priv->cma_dev;
+
+	list_del(&id_priv->list);
+	cma_deref_dev(cma_dev);
+	id_priv->cma_dev = NULL;
+}
+
+/*
+ * Make sure the ID has a qkey.
+ *
+ * Nothing is done when one is already set.  Otherwise RDMA_PS_UDP gets
+ * RDMA_UDP_QKEY, RDMA_PS_IPOIB takes the qkey of the multicast member record
+ * looked up for the device address's MGID, and other port spaces are left
+ * alone.  Returns 0, or the error from the member record lookup.
+ */
+static int cma_set_qkey(struct rdma_id_private *id_priv)
+{
+	struct ib_sa_mcmember_rec rec;
+	int ret;
+
+	if (id_priv->qkey)
+		return 0;
+
+	if (id_priv->id.ps == RDMA_PS_UDP) {
+		id_priv->qkey = RDMA_UDP_QKEY;
+		return 0;
+	}
+
+	if (id_priv->id.ps != RDMA_PS_IPOIB)
+		return 0;
+
+	ib_addr_get_mgid(&id_priv->id.route.addr.dev_addr, &rec.mgid);
+	ret = ib_sa_get_mcmember_rec(id_priv->id.device,
+				     id_priv->id.port_num, &rec.mgid,
+				     &rec);
+	if (ret)
+		return ret;
+
+	id_priv->qkey = be32_to_cpu(rec.qkey);
+	return 0;
+}
+
+/*
+ * Find the CMA device owning the ID's source GID and attach the ID to it.
+ *
+ * For a non-InfiniBand device type the IBoE source GID is tried first; when
+ * no device has it, or for InfiniBand, the GID embedded in the source
+ * device address is looked up.  A successful lookup also fills in
+ * id_priv->id.port_num.
+ *
+ * Returns 0 on success, -ENODEV when dev_list is empty, otherwise the
+ * result of the last failed ib_find_cached_gid() call.
+ */
+static int cma_acquire_dev(struct rdma_id_private *id_priv)
+{
+	struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+	struct cma_device *cma_dev;
+	union ib_gid gid;
+	int ret = -ENODEV;
+
+	if (dev_addr->dev_type != ARPHRD_INFINIBAND) {
+		iboe_addr_get_sgid(dev_addr, &gid);
+		list_for_each_entry(cma_dev, &dev_list, list) {
+			ret = ib_find_cached_gid(cma_dev->device, &gid,
+						 &id_priv->id.port_num, NULL);
+			if (!ret)
+				goto out;
+		}
+	}
+
+	memcpy(&gid, dev_addr->src_dev_addr +
+	       rdma_addr_gid_offset(dev_addr), sizeof gid);
+	list_for_each_entry(cma_dev, &dev_list, list) {
+		ret = ib_find_cached_gid(cma_dev->device, &gid,
+					 &id_priv->id.port_num, NULL);
+		if (!ret)
+			break;
+	}
+
+out:
+	/* cma_dev is only meaningful when a lookup succeeded (ret == 0). */
+	if (!ret)
+		cma_attach_to_dev(id_priv, cma_dev);
+
+	return ret;
+}
+
+/*
+ * Drop one reference on id_priv; when the count reaches zero, signal
+ * id_priv->comp.
+ */
+static void cma_deref_id(struct rdma_id_private *id_priv)
+{
+	if (!atomic_dec_and_test(&id_priv->refcount))
+		return;
+
+	complete(&id_priv->comp);
+}
+
+/*
+ * Lock id_priv->handler_mutex and check that the id is in the given state.
+ *
+ * Returns 0 with handler_mutex still held when the state matches; the caller
+ * must unlock it.  Returns -EINVAL with the mutex released otherwise.
+ */
+static int cma_disable_callback(struct rdma_id_private *id_priv,
+			      enum cma_state state)
+{
+	mutex_lock(&id_priv->handler_mutex);
+	if (id_priv->state == state)
+		return 0;
+
+	mutex_unlock(&id_priv->handler_mutex);
+	return -EINVAL;
+}
+
+/* Return 1 when the id has both a device and a CM id set, 0 otherwise. */
+static int cma_has_cm_dev(struct rdma_id_private *id_priv)
+{
+	if (!id_priv->id.device)
+		return 0;
+
+	return id_priv->cm_id.ib ? 1 : 0;
+}
+
+/*
+ * Allocate and initialise a new RDMA CM id.
+ *
+ * @event_handler: callback stored in the id for event delivery
+ * @context:       opaque value stored in id.context
+ * @ps:            port space stored in id.ps
+ *
+ * The id starts in CMA_IDLE with a reference count of one and a random
+ * seq_num.  Returns the embedded rdma_cm_id, or ERR_PTR(-ENOMEM) when the
+ * allocation fails.
+ */
+struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler,
+				  void *context, enum rdma_port_space ps)
+{
+	struct rdma_id_private *priv = kzalloc(sizeof *priv, GFP_KERNEL);
+
+	if (priv == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	/* Values supplied by the caller. */
+	priv->id.ps = ps;
+	priv->id.context = context;
+	priv->id.event_handler = event_handler;
+	priv->state = CMA_IDLE;
+
+	/* Locks, completion and the initial reference. */
+	spin_lock_init(&priv->lock);
+	mutex_init(&priv->qp_mutex);
+	mutex_init(&priv->handler_mutex);
+	init_completion(&priv->comp);
+	atomic_set(&priv->refcount, 1);
+
+	INIT_LIST_HEAD(&priv->listen_list);
+	INIT_LIST_HEAD(&priv->mc_list);
+	get_random_bytes(&priv->seq_num, sizeof priv->seq_num);
+
+	return &priv->id;
+}
+EXPORT_SYMBOL(rdma_create_id);
+
+/*
+ * Drive a QP used with a datagram port space through INIT, RTR and RTS.
+ *
+ * The INIT attributes come from rdma_init_qp_attr(); the RTR step changes
+ * only the state, and the RTS step additionally sets the send PSN to 0.
+ * Returns 0 or the first error encountered.
+ */
+static int cma_init_ud_qp(struct rdma_id_private *id_priv, struct ib_qp *qp)
+{
+	struct ib_qp_attr attr;
+	int mask, err;
+
+	attr.qp_state = IB_QPS_INIT;
+	err = rdma_init_qp_attr(&id_priv->id, &attr, &mask);
+	if (!err)
+		err = ib_modify_qp(qp, &attr, mask);
+	if (err)
+		return err;
+
+	attr.qp_state = IB_QPS_RTR;
+	err = ib_modify_qp(qp, &attr, IB_QP_STATE);
+	if (err)
+		return err;
+
+	attr.qp_state = IB_QPS_RTS;
+	attr.sq_psn = 0;
+	return ib_modify_qp(qp, &attr, IB_QP_STATE | IB_QP_SQ_PSN);
+}
+
+/*
+ * Move a QP used with a connected port space to INIT using the attributes
+ * computed by rdma_init_qp_attr().  Returns 0 or the first error.
+ */
+static int cma_init_conn_qp(struct rdma_id_private *id_priv, struct ib_qp *qp)
+{
+	struct ib_qp_attr attr;
+	int mask;
+	int err;
+
+	attr.qp_state = IB_QPS_INIT;
+	err = rdma_init_qp_attr(&id_priv->id, &attr, &mask);
+	if (!err)
+		err = ib_modify_qp(qp, &attr, mask);
+
+	return err;
+}
+
+/*
+ * Create a QP on pd and associate it with id.
+ *
+ * The QP is initialised according to the id's port space (datagram or
+ * connected).  On success id->qp, qp_num and the srq flag are recorded.
+ *
+ * Returns 0 on success, -EINVAL when pd belongs to a different device than
+ * id, or the error from QP creation/initialisation (the QP is destroyed
+ * again if initialisation fails).
+ */
+int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd,
+		   struct ib_qp_init_attr *qp_init_attr)
+{
+	struct rdma_id_private *priv;
+	struct ib_qp *new_qp;
+	int err;
+
+	if (id->device != pd->device)
+		return -EINVAL;
+
+	priv = container_of(id, struct rdma_id_private, id);
+
+	new_qp = ib_create_qp(pd, qp_init_attr);
+	if (IS_ERR(new_qp))
+		return PTR_ERR(new_qp);
+
+	err = cma_is_ud_ps(priv->id.ps) ? cma_init_ud_qp(priv, new_qp) :
+					  cma_init_conn_qp(priv, new_qp);
+	if (err) {
+		ib_destroy_qp(new_qp);
+		return err;
+	}
+
+	id->qp = new_qp;
+	priv->qp_num = new_qp->qp_num;
+	priv->srq = (new_qp->srq != NULL);
+	return 0;
+}
+EXPORT_SYMBOL(rdma_create_qp);
+
+/*
+ * Destroy the QP attached to id and clear id->qp, holding qp_mutex for the
+ * duration.
+ */
+void rdma_destroy_qp(struct rdma_cm_id *id)
+{
+	struct rdma_id_private *priv =
+		container_of(id, struct rdma_id_private, id);
+
+	mutex_lock(&priv->qp_mutex);
+	ib_destroy_qp(id->qp);
+	id->qp = NULL;
+	mutex_unlock(&priv->qp_mutex);
+}
+EXPORT_SYMBOL(rdma_destroy_qp);
+
+/*
+ * Move the id's QP to INIT and then to RTR, under qp_mutex.
+ *
+ * When conn_param is supplied, its responder_resources value overrides
+ * max_dest_rd_atomic for the RTR transition.  Does nothing and returns 0
+ * when the id has no QP.  Returns 0 or the first error encountered.
+ */
+static int cma_modify_qp_rtr(struct rdma_id_private *id_priv,
+			     struct rdma_conn_param *conn_param)
+{
+	struct ib_qp_attr attr;
+	int mask;
+	int err = 0;
+
+	mutex_lock(&id_priv->qp_mutex);
+	if (id_priv->id.qp == NULL)
+		goto unlock;
+
+	/* Need to update QP attributes from default values. */
+	attr.qp_state = IB_QPS_INIT;
+	err = rdma_init_qp_attr(&id_priv->id, &attr, &mask);
+	if (err)
+		goto unlock;
+
+	err = ib_modify_qp(id_priv->id.qp, &attr, mask);
+	if (err)
+		goto unlock;
+
+	attr.qp_state = IB_QPS_RTR;
+	err = rdma_init_qp_attr(&id_priv->id, &attr, &mask);
+	if (err)
+		goto unlock;
+
+	if (conn_param != NULL)
+		attr.max_dest_rd_atomic = conn_param->responder_resources;
+	err = ib_modify_qp(id_priv->id.qp, &attr, mask);
+unlock:
+	mutex_unlock(&id_priv->qp_mutex);
+	return err;
+}
+
+/*
+ * Move the id's QP to RTS, under qp_mutex.
+ *
+ * When conn_param is supplied, its initiator_depth value overrides
+ * max_rd_atomic.  Does nothing and returns 0 when the id has no QP.
+ * Returns 0 or the first error encountered.
+ */
+static int cma_modify_qp_rts(struct rdma_id_private *id_priv,
+			     struct rdma_conn_param *conn_param)
+{
+	struct ib_qp_attr attr;
+	int mask;
+	int err = 0;
+
+	mutex_lock(&id_priv->qp_mutex);
+	if (id_priv->id.qp == NULL)
+		goto unlock;
+
+	attr.qp_state = IB_QPS_RTS;
+	err = rdma_init_qp_attr(&id_priv->id, &attr, &mask);
+	if (err)
+		goto unlock;
+
+	if (conn_param != NULL)
+		attr.max_rd_atomic = conn_param->initiator_depth;
+	err = ib_modify_qp(id_priv->id.qp, &attr, mask);
+unlock:
+	mutex_unlock(&id_priv->qp_mutex);
+	return err;
+}
+
+/*
+ * Move the id's QP to the error state, under qp_mutex.  Does nothing and
+ * returns 0 when the id has no QP; otherwise returns ib_modify_qp()'s result.
+ */
+static int cma_modify_qp_err(struct rdma_id_private *id_priv)
+{
+	struct ib_qp_attr attr;
+	int err = 0;
+
+	mutex_lock(&id_priv->qp_mutex);
+	if (id_priv->id.qp != NULL) {
+		attr.qp_state = IB_QPS_ERR;
+		err = ib_modify_qp(id_priv->id.qp, &attr, IB_QP_STATE);
+	}
+	mutex_unlock(&id_priv->qp_mutex);
+
+	return err;
+}
+
+/*
+ * Fill in QP attributes for an IB-transport id without going through the
+ * IB CM.
+ *
+ * The P_Key comes from the device address on an InfiniBand link layer and is
+ * 0xffff otherwise; its index is looked up in the cached P_Key table.  The
+ * mask always covers state, P_Key index and port; datagram port spaces add
+ * the Q_Key, the others add zeroed access flags.
+ *
+ * Returns 0 or the error from the P_Key lookup / cma_set_qkey().
+ */
+static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv,
+			       struct ib_qp_attr *qp_attr, int *qp_attr_mask)
+{
+	struct rdma_cm_id *id = &id_priv->id;
+	u16 pkey = 0xffff;
+	int ret;
+
+	if (rdma_port_get_link_layer(id->device, id->port_num) ==
+	    IB_LINK_LAYER_INFINIBAND)
+		pkey = ib_addr_get_pkey(&id->route.addr.dev_addr);
+
+	ret = ib_find_cached_pkey(id->device, id->port_num, pkey,
+				  &qp_attr->pkey_index);
+	if (ret)
+		return ret;
+
+	qp_attr->port_num = id->port_num;
+	*qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT;
+
+	if (!cma_is_ud_ps(id->ps)) {
+		qp_attr->qp_access_flags = 0;
+		*qp_attr_mask |= IB_QP_ACCESS_FLAGS;
+		return 0;
+	}
+
+	ret = cma_set_qkey(id_priv);
+	if (ret)
+		return ret;
+
+	qp_attr->qkey = id_priv->qkey;
+	*qp_attr_mask |= IB_QP_QKEY;
+	return 0;
+}
+
+/*
+ * Compute the QP attributes and attribute mask needed to move id's QP to the
+ * state requested in qp_attr->qp_state.
+ *
+ * IB transport: the IB CM supplies the attributes when the id has a CM id and
+ * is not a datagram port space; otherwise they are built locally.  For RTR
+ * the receive PSN is set to the id's seq_num.
+ * iWARP transport: the iWARP CM supplies the attributes when a CM id exists;
+ * otherwise only state and zeroed access flags are set.
+ *
+ * Returns 0, an error from the selected helper, or -ENOSYS for any other
+ * transport.
+ */
+int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr,
+		       int *qp_attr_mask)
+{
+	struct rdma_id_private *priv =
+		container_of(id, struct rdma_id_private, id);
+	int ret = 0;
+
+	switch (rdma_node_get_transport(id->device->node_type)) {
+	case RDMA_TRANSPORT_IB:
+		if (priv->cm_id.ib && !cma_is_ud_ps(id->ps))
+			ret = ib_cm_init_qp_attr(priv->cm_id.ib, qp_attr,
+						 qp_attr_mask);
+		else
+			ret = cma_ib_init_qp_attr(priv, qp_attr, qp_attr_mask);
+		if (qp_attr->qp_state == IB_QPS_RTR)
+			qp_attr->rq_psn = priv->seq_num;
+		break;
+	case RDMA_TRANSPORT_IWARP:
+		if (priv->cm_id.iw) {
+			ret = iw_cm_init_qp_attr(priv->cm_id.iw, qp_attr,
+						 qp_attr_mask);
+			break;
+		}
+		qp_attr->qp_access_flags = 0;
+		*qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS;
+		break;
+	default:
+		ret = -ENOSYS;
+		break;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(rdma_init_qp_attr);
+
+/*
+ * Return non-zero when addr is the "zero" address: for AF_INET whatever
+ * ipv4_is_zeronet() reports, for any other family an all-zero IPv6 address.
+ */
+static inline int cma_zero_addr(struct sockaddr *addr)
+{
+	struct in6_addr *ip6;
+
+	if (addr->sa_family == AF_INET)
+		return ipv4_is_zeronet(
+			((struct sockaddr_in *)addr)->sin_addr.s_addr);
+
+	/* Every other family is interpreted as a sockaddr_in6. */
+	ip6 = &((struct sockaddr_in6 *) addr)->sin6_addr;
+	return !(ip6->s6_addr32[0] | ip6->s6_addr32[1] |
+		 ip6->s6_addr32[2] | ip6->s6_addr32[3]);
+}
+
+/*
+ * Return non-zero when addr is a loopback address.  AF_INET uses the IPv4
+ * test; every other family is treated as IPv6.
+ */
+static inline int cma_loopback_addr(struct sockaddr *addr)
+{
+	if (addr->sa_family != AF_INET)
+		return ipv6_addr_loopback(
+			&((struct sockaddr_in6 *) addr)->sin6_addr);
+
+	return ipv4_is_loopback(
+		((struct sockaddr_in *) addr)->sin_addr.s_addr);
+}
+
+/*
+ * Return 1 when addr is either a zero address or a loopback address, 0
+ * otherwise.
+ */
+static inline int cma_any_addr(struct sockaddr *addr)
+{
+	if (cma_zero_addr(addr))
+		return 1;
+
+	return cma_loopback_addr(addr) ? 1 : 0;
+}
+
+/*
+ * Return the port stored in addr, in network byte order.  AF_INET reads
+ * sin_port; every other family is read as a sockaddr_in6.
+ */
+static inline __be16 cma_port(struct sockaddr *addr)
+{
+	if (addr->sa_family != AF_INET)
+		return ((struct sockaddr_in6 *) addr)->sin6_port;
+
+	return ((struct sockaddr_in *) addr)->sin_port;
+}
+
+/* Return 1 when addr carries port 0, 0 otherwise. */
+static inline int cma_any_port(struct sockaddr *addr)
+{
+	return cma_port(addr) == 0;
+}
+
+/*
+ * Pull the IP version, port and source/destination addresses out of the
+ * private-data header of an incoming request.
+ *
+ * hdr is read as a struct sdp_hh for RDMA_PS_SDP and as a struct cma_hdr for
+ * every other port space.  src and dst are set to point into hdr.
+ *
+ * Returns 0 on success, -EINVAL when the header version does not match or
+ * the IP version is neither 4 nor 6.
+ */
+static int cma_get_net_info(void *hdr, enum rdma_port_space ps,
+			    u8 *ip_ver, __be16 *port,
+			    union cma_ip_addr **src, union cma_ip_addr **dst)
+{
+	if (ps == RDMA_PS_SDP) {
+		struct sdp_hh *sdp = hdr;
+
+		if (sdp_get_majv(sdp->sdp_version) != SDP_MAJ_VERSION)
+			return -EINVAL;
+
+		*ip_ver	= sdp_get_ip_ver(sdp);
+		*port	= sdp->port;
+		*src	= &sdp->src_addr;
+		*dst	= &sdp->dst_addr;
+	} else {
+		struct cma_hdr *cma = hdr;
+
+		if (cma->cma_version != CMA_VERSION)
+			return -EINVAL;
+
+		*ip_ver	= cma_get_ip_ver(cma);
+		*port	= cma->port;
+		*src	= &cma->src_addr;
+		*dst	= &cma->dst_addr;
+	}
+
+	if (*ip_ver == 4 || *ip_ver == 6)
+		return 0;
+
+	return -EINVAL;
+}
+
+/*
+ * Fill in the source and destination socket addresses of a new id from the
+ * values carried in a request header.
+ *
+ * The new id's source address is the request's destination address combined
+ * with the listener's family and port; its destination address is the
+ * request's source address and port.  ip_ver values other than 4 and 6 leave
+ * addr untouched.
+ */
+static void cma_save_net_info(struct rdma_addr *addr,
+			      struct rdma_addr *listen_addr,
+			      u8 ip_ver, __be16 port,
+			      union cma_ip_addr *src, union cma_ip_addr *dst)
+{
+	struct sockaddr_in *lsn4, *local4, *remote4;
+	struct sockaddr_in6 *lsn6, *local6, *remote6;
+
+	if (ip_ver == 4) {
+		lsn4 = (struct sockaddr_in *) &listen_addr->src_addr;
+
+		local4 = (struct sockaddr_in *) &addr->src_addr;
+		local4->sin_family = lsn4->sin_family;
+		local4->sin_addr.s_addr = dst->ip4.addr;
+		local4->sin_port = lsn4->sin_port;
+
+		remote4 = (struct sockaddr_in *) &addr->dst_addr;
+		remote4->sin_family = lsn4->sin_family;
+		remote4->sin_addr.s_addr = src->ip4.addr;
+		remote4->sin_port = port;
+	} else if (ip_ver == 6) {
+		lsn6 = (struct sockaddr_in6 *) &listen_addr->src_addr;
+
+		local6 = (struct sockaddr_in6 *) &addr->src_addr;
+		local6->sin6_family = lsn6->sin6_family;
+		local6->sin6_addr = dst->ip6;
+		local6->sin6_port = lsn6->sin6_port;
+
+		remote6 = (struct sockaddr_in6 *) &addr->dst_addr;
+		remote6->sin6_family = lsn6->sin6_family;
+		remote6->sin6_addr = src->ip6;
+		remote6->sin6_port = port;
+	}
+}
+
+/*
+ * Return the offset applied to private data before it is handed to the
+ * user: 0 for RDMA_PS_SDP, sizeof(struct cma_hdr) for every other port space.
+ */
+static inline int cma_user_data_offset(enum rdma_port_space ps)
+{
+	if (ps == RDMA_PS_SDP)
+		return 0;
+
+	return sizeof(struct cma_hdr);
+}
+
+/*
+ * Cancel an outstanding SA query for the id.  Only acts when the port's link
+ * layer is InfiniBand and a query is recorded in id_priv->query.
+ */
+static void cma_cancel_route(struct rdma_id_private *id_priv)
+{
+	if (rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num) !=
+	    IB_LINK_LAYER_INFINIBAND)
+		return;
+
+	if (id_priv->query)
+		ib_sa_cancel_query(id_priv->query_id, id_priv->query);
+}
+
+/*
+ * Destroy every id queued on id_priv->listen_list.
+ *
+ * The global 'lock' mutex is held while the lists are edited and is released
+ * around each rdma_destroy_id() call, then re-acquired before the list is
+ * examined again.
+ */
+static void cma_cancel_listens(struct rdma_id_private *id_priv)
+{
+	struct rdma_id_private *dev_id_priv;
+
+	/*
+	 * Remove from listen_any_list to prevent added devices from spawning
+	 * additional listen requests.
+	 */
+	mutex_lock(&lock);
+	list_del(&id_priv->list);
+
+	/* Always work on the head entry; it is unlinked before being destroyed. */
+	while (!list_empty(&id_priv->listen_list)) {
+		dev_id_priv = list_entry(id_priv->listen_list.next,
+					 struct rdma_id_private, listen_list);
+		/* sync with device removal to avoid duplicate destruction */
+		list_del_init(&dev_id_priv->list);
+		list_del(&dev_id_priv->listen_list);
+		mutex_unlock(&lock);
+
+		/* Called without 'lock' held. */
+		rdma_destroy_id(&dev_id_priv->id);
+		mutex_lock(&lock);
+	}
+	mutex_unlock(&lock);
+}
+
+/*
+ * Cancel whatever asynchronous work is associated with the given state:
+ * address resolution for CMA_ADDR_QUERY, the route query for
+ * CMA_ROUTE_QUERY, and the per-device listens for a CMA_LISTEN id that is
+ * bound to an "any" address and has no device attached.  Other states need
+ * no action.
+ */
+static void cma_cancel_operation(struct rdma_id_private *id_priv,
+				 enum cma_state state)
+{
+	if (state == CMA_ADDR_QUERY) {
+		rdma_addr_cancel(&id_priv->id.route.addr.dev_addr);
+	} else if (state == CMA_ROUTE_QUERY) {
+		cma_cancel_route(id_priv);
+	} else if (state == CMA_LISTEN) {
+		if (!id_priv->cma_dev &&
+		    cma_any_addr((struct sockaddr *) &id_priv->id.route.addr.src_addr))
+			cma_cancel_listens(id_priv);
+	}
+}
+
+/*
+ * Release the port binding held by the id, if any.
+ *
+ * Under the global 'lock' the id is removed from its bind list; when the
+ * list has no owners left, the port is removed from the port-space idr and
+ * the bind list is freed.  Afterwards any socket held by the id is released.
+ */
+static void cma_release_port(struct rdma_id_private *id_priv)
+{
+	struct rdma_bind_list *bl = id_priv->bind_list;
+
+	if (bl == NULL)
+		return;
+
+	mutex_lock(&lock);
+	hlist_del(&id_priv->node);
+	if (hlist_empty(&bl->owners)) {
+		idr_remove(bl->ps, bl->port);
+		kfree(bl);
+	}
+	mutex_unlock(&lock);
+
+	if (id_priv->sock != NULL)
+		sock_release(id_priv->sock);
+}
+
+/*
+ * Drain id_priv->mc_list, releasing each multicast entry according to the
+ * port's link layer: InfiniBand entries free the SA multicast object and the
+ * entry itself; Ethernet entries drop a kref (release_mc frees on the last
+ * put).  Entries on any other link layer are only unlinked.
+ */
+static void cma_leave_mc_groups(struct rdma_id_private *id_priv)
+{
+	struct cma_multicast *mc;
+	int link_layer;
+
+	while (!list_empty(&id_priv->mc_list)) {
+		mc = list_entry(id_priv->mc_list.next,
+				struct cma_multicast, list);
+		list_del(&mc->list);
+
+		link_layer = rdma_port_get_link_layer(id_priv->cma_dev->device,
+						      id_priv->id.port_num);
+		if (link_layer == IB_LINK_LAYER_INFINIBAND) {
+			ib_sa_free_multicast(mc->multicast.ib);
+			kfree(mc);
+		} else if (link_layer == IB_LINK_LAYER_ETHERNET) {
+			kref_put(&mc->mcref, release_mc);
+		}
+	}
+}
+
+/*
+ * Destroy an RDMA CM id and free its memory.
+ *
+ * The id is switched to CMA_DESTROYING and whatever operation belonged to
+ * its previous state is cancelled.  If the id is attached to a device, the
+ * transport CM id is destroyed, multicast groups are left and the id is
+ * detached from the device.  The port binding is released, the caller's
+ * reference is dropped, and the function blocks on id_priv->comp before
+ * freeing the path records and the id itself.
+ */
+void rdma_destroy_id(struct rdma_cm_id *id)
+{
+	struct rdma_id_private *id_priv;
+	enum cma_state state;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	/* 'state' is the value the id held before the exchange. */
+	state = cma_exch(id_priv, CMA_DESTROYING);
+	cma_cancel_operation(id_priv, state);
+
+	mutex_lock(&lock);
+	if (id_priv->cma_dev) {
+		/* 'lock' is dropped while the CM id and multicast state go away. */
+		mutex_unlock(&lock);
+		switch (rdma_node_get_transport(id_priv->id.device->node_type)) {
+		case RDMA_TRANSPORT_IB:
+			/* cm_id may hold an ERR_PTR left by a failed listen. */
+			if (id_priv->cm_id.ib && !IS_ERR(id_priv->cm_id.ib))
+				ib_destroy_cm_id(id_priv->cm_id.ib);
+			break;
+		case RDMA_TRANSPORT_IWARP:
+			if (id_priv->cm_id.iw && !IS_ERR(id_priv->cm_id.iw))
+				iw_destroy_cm_id(id_priv->cm_id.iw);
+			break;
+		default:
+			break;
+		}
+		cma_leave_mc_groups(id_priv);
+		/* Re-take 'lock' for the device list manipulation. */
+		mutex_lock(&lock);
+		cma_detach_from_dev(id_priv);
+	}
+	mutex_unlock(&lock);
+
+	cma_release_port(id_priv);
+	/* Drop our own reference, then wait for the completion to be signalled. */
+	cma_deref_id(id_priv);
+	wait_for_completion(&id_priv->comp);
+
+	/*
+	 * For internal ids the context is the rdma_id_private that created
+	 * them (see cma_listen_on_dev()); release the reference taken there.
+	 */
+	if (id_priv->internal_id)
+		cma_deref_id(id_priv->id.context);
+
+	kfree(id_priv->id.route.path_rec);
+	kfree(id_priv);
+}
+EXPORT_SYMBOL(rdma_destroy_id);
+
+/*
+ * Finish the active side of a connection after a REP: move the QP to RTR and
+ * RTS, then send an RTU.
+ *
+ * If any step fails the QP is moved to the error state, a REJ with reason
+ * IB_CM_REJ_CONSUMER_DEFINED is sent, and the failing step's error is
+ * returned.  Returns 0 on success.
+ */
+static int cma_rep_recv(struct rdma_id_private *id_priv)
+{
+	int ret;
+
+	ret = cma_modify_qp_rtr(id_priv, NULL);
+	if (!ret)
+		ret = cma_modify_qp_rts(id_priv, NULL);
+	if (!ret)
+		ret = ib_send_cm_rtu(id_priv->cm_id.ib, NULL, 0);
+	if (!ret)
+		return 0;
+
+	cma_modify_qp_err(id_priv);
+	ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED,
+		       NULL, 0, NULL, 0);
+	return ret;
+}
+
+/*
+ * Validate the private data of a REP.  Only RDMA_PS_SDP ids are checked:
+ * data is read as a struct sdp_hah and its major version must equal
+ * SDP_MAJ_VERSION.  Returns 0 when acceptable, -EINVAL otherwise.
+ */
+static int cma_verify_rep(struct rdma_id_private *id_priv, void *data)
+{
+	struct sdp_hah *hah = data;
+
+	if (id_priv->id.ps != RDMA_PS_SDP)
+		return 0;
+
+	if (sdp_get_majv(hah->sdp_version) != SDP_MAJ_VERSION)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Copy the connection parameters of a received REP into event->param.conn.
+ * The private data length is always reported as
+ * IB_CM_REP_PRIVATE_DATA_SIZE.
+ */
+static void cma_set_rep_event_data(struct rdma_cm_event *event,
+				   struct ib_cm_rep_event_param *rep_data,
+				   void *private_data)
+{
+	struct rdma_conn_param *conn = &event->param.conn;
+
+	conn->private_data = private_data;
+	conn->private_data_len = IB_CM_REP_PRIVATE_DATA_SIZE;
+	conn->responder_resources = rep_data->responder_resources;
+	conn->initiator_depth = rep_data->initiator_depth;
+	conn->flow_control = rep_data->flow_control;
+	conn->rnr_retry_count = rep_data->rnr_retry_count;
+	conn->srq = rep_data->srq;
+	conn->qp_num = rep_data->remote_qpn;
+}
+
+/*
+ * IB CM event callback for an id that has a CM id attached.
+ *
+ * The event is only processed when the id is in the expected state
+ * (CMA_DISCONNECT for IB_CM_TIMEWAIT_EXIT, CMA_CONNECT for everything else);
+ * otherwise 0 is returned immediately.  The IB CM event is translated into
+ * an rdma_cm_event and delivered to the id's event handler with
+ * handler_mutex held.
+ *
+ * Returns 0 normally.  If the event handler returns non-zero, the id is
+ * destroyed here and that value is returned.
+ */
+static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
+{
+	struct rdma_id_private *id_priv = cm_id->context;
+	struct rdma_cm_event event;
+	int ret = 0;
+
+	/* On success cma_disable_callback() leaves handler_mutex locked. */
+	if ((ib_event->event != IB_CM_TIMEWAIT_EXIT &&
+		cma_disable_callback(id_priv, CMA_CONNECT)) ||
+	    (ib_event->event == IB_CM_TIMEWAIT_EXIT &&
+		cma_disable_callback(id_priv, CMA_DISCONNECT)))
+		return 0;
+
+	memset(&event, 0, sizeof event);
+	switch (ib_event->event) {
+	case IB_CM_REQ_ERROR:
+	case IB_CM_REP_ERROR:
+		event.event = RDMA_CM_EVENT_UNREACHABLE;
+		event.status = -ETIMEDOUT;
+		break;
+	case IB_CM_REP_RECEIVED:
+		event.status = cma_verify_rep(id_priv, ib_event->private_data);
+		if (event.status)
+			event.event = RDMA_CM_EVENT_CONNECT_ERROR;
+		else if (id_priv->id.qp && id_priv->id.ps != RDMA_PS_SDP) {
+			/* A QP is attached: complete the handshake here. */
+			event.status = cma_rep_recv(id_priv);
+			event.event = event.status ? RDMA_CM_EVENT_CONNECT_ERROR :
+						     RDMA_CM_EVENT_ESTABLISHED;
+		} else
+			/* No QP (or SDP): report the response to the user instead. */
+			event.event = RDMA_CM_EVENT_CONNECT_RESPONSE;
+		cma_set_rep_event_data(&event, &ib_event->param.rep_rcvd,
+				       ib_event->private_data);
+		break;
+	case IB_CM_RTU_RECEIVED:
+	case IB_CM_USER_ESTABLISHED:
+		event.event = RDMA_CM_EVENT_ESTABLISHED;
+		break;
+	case IB_CM_DREQ_ERROR:
+		event.status = -ETIMEDOUT; /* fall through */
+	case IB_CM_DREQ_RECEIVED:
+	case IB_CM_DREP_RECEIVED:
+		/* Report the disconnect only if this call made the state change. */
+		if (!cma_comp_exch(id_priv, CMA_CONNECT, CMA_DISCONNECT))
+			goto out;
+		event.event = RDMA_CM_EVENT_DISCONNECTED;
+		break;
+	case IB_CM_TIMEWAIT_EXIT:
+		event.event = RDMA_CM_EVENT_TIMEWAIT_EXIT;
+		break;
+	case IB_CM_MRA_RECEIVED:
+		/* ignore event */
+		goto out;
+	case IB_CM_REJ_RECEIVED:
+		cma_modify_qp_err(id_priv);
+		event.status = ib_event->param.rej_rcvd.reason;
+		event.event = RDMA_CM_EVENT_REJECTED;
+		event.param.conn.private_data = ib_event->private_data;
+		event.param.conn.private_data_len = IB_CM_REJ_PRIVATE_DATA_SIZE;
+		break;
+	default:
+		printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d\n",
+		       ib_event->event);
+		goto out;
+	}
+
+	ret = id_priv->id.event_handler(&id_priv->id, &event);
+	if (ret) {
+		/* Destroy the CM ID by returning a non-zero value. */
+		/* Clearing cm_id.ib keeps rdma_destroy_id() from destroying it too. */
+		id_priv->cm_id.ib = NULL;
+		cma_exch(id_priv, CMA_DESTROYING);
+		mutex_unlock(&id_priv->handler_mutex);
+		rdma_destroy_id(&id_priv->id);
+		return ret;
+	}
+out:
+	mutex_unlock(&id_priv->handler_mutex);
+	return ret;
+}
+
+/*
+ * Create the id that represents an incoming connection request received on
+ * listen_id.
+ *
+ * The new id inherits the listener's event handler, context and port space.
+ * Its addresses come from the request's private-data header and its path
+ * records from the request's primary (and optional alternate) path.
+ *
+ * Returns the new id in state CMA_CONNECT, or NULL on any failure (bad
+ * header, allocation failure, or address translation failure).
+ */
+static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
+					       struct ib_cm_event *ib_event)
+{
+	struct rdma_id_private *id_priv;
+	struct rdma_cm_id *id;
+	struct rdma_route *rt;
+	union cma_ip_addr *src, *dst;
+	__be16 port;
+	u8 ip_ver;
+	int ret;
+
+	/* Validate the header before allocating anything. */
+	if (cma_get_net_info(ib_event->private_data, listen_id->ps,
+			     &ip_ver, &port, &src, &dst))
+		goto err;
+
+	id = rdma_create_id(listen_id->event_handler, listen_id->context,
+			    listen_id->ps);
+	if (IS_ERR(id))
+		goto err;
+
+	cma_save_net_info(&id->route.addr, &listen_id->route.addr,
+			  ip_ver, port, src, dst);
+
+	rt = &id->route;
+	/* One record for the primary path, a second if an alternate is given. */
+	rt->num_paths = ib_event->param.req_rcvd.alternate_path ? 2 : 1;
+	rt->path_rec = kmalloc(sizeof *rt->path_rec * rt->num_paths,
+			       GFP_KERNEL);
+	if (!rt->path_rec)
+		goto destroy_id;
+
+	rt->path_rec[0] = *ib_event->param.req_rcvd.primary_path;
+	if (rt->num_paths == 2)
+		rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path;
+
+	if (cma_any_addr((struct sockaddr *) &rt->addr.src_addr)) {
+		/* No specific local IP: take SGID and P_Key from the primary path. */
+		rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND;
+		rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid);
+		ib_addr_set_pkey(&rt->addr.dev_addr, rt->path_rec[0].pkey);
+	} else {
+		ret = rdma_translate_ip((struct sockaddr *) &rt->addr.src_addr,
+					&rt->addr.dev_addr);
+		if (ret)
+			goto destroy_id;
+	}
+	rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid);
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	id_priv->state = CMA_CONNECT;
+	return id_priv;
+
+destroy_id:
+	/* rdma_destroy_id() also frees rt->path_rec. */
+	rdma_destroy_id(id);
+err:
+	return NULL;
+}
+
+/*
+ * Create the id that represents an incoming datagram (SIDR) request received
+ * on listen_id.
+ *
+ * The new id inherits the listener's event handler, context and port space;
+ * its addresses come from the request's private-data header.  Unless the
+ * resulting source address is an "any" address, it is translated into a
+ * device address.
+ *
+ * Returns the new id in state CMA_CONNECT, or NULL on any failure.
+ */
+static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id,
+					      struct ib_cm_event *ib_event)
+{
+	struct rdma_id_private *id_priv;
+	struct rdma_cm_id *id;
+	struct sockaddr *local;
+	union cma_ip_addr *src, *dst;
+	__be16 port;
+	u8 ip_ver;
+
+	id = rdma_create_id(listen_id->event_handler, listen_id->context,
+			    listen_id->ps);
+	if (IS_ERR(id))
+		return NULL;
+
+	if (cma_get_net_info(ib_event->private_data, listen_id->ps,
+			     &ip_ver, &port, &src, &dst))
+		goto destroy;
+
+	cma_save_net_info(&id->route.addr, &listen_id->route.addr,
+			  ip_ver, port, src, dst);
+
+	local = (struct sockaddr *) &id->route.addr.src_addr;
+	if (!cma_any_addr(local) &&
+	    rdma_translate_ip(local, &id->route.addr.dev_addr))
+		goto destroy;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	id_priv->state = CMA_CONNECT;
+	return id_priv;
+
+destroy:
+	rdma_destroy_id(id);
+	return NULL;
+}
+
+/*
+ * Copy the connection parameters of a received REQ into event->param.conn.
+ * The private data pointer is advanced by offset, and the reported length is
+ * IB_CM_REQ_PRIVATE_DATA_SIZE reduced by the same amount.
+ */
+static void cma_set_req_event_data(struct rdma_cm_event *event,
+				   struct ib_cm_req_event_param *req_data,
+				   void *private_data, int offset)
+{
+	struct rdma_conn_param *conn = &event->param.conn;
+
+	conn->private_data = private_data + offset;
+	conn->private_data_len = IB_CM_REQ_PRIVATE_DATA_SIZE - offset;
+	conn->responder_resources = req_data->responder_resources;
+	conn->initiator_depth = req_data->initiator_depth;
+	conn->flow_control = req_data->flow_control;
+	conn->retry_count = req_data->retry_count;
+	conn->rnr_retry_count = req_data->rnr_retry_count;
+	conn->srq = req_data->srq;
+	conn->qp_num = req_data->remote_qpn;
+}
+
+/*
+ * IB CM callback for a listening id: handles an incoming connection (or
+ * datagram) request.
+ *
+ * A new id is created for the request, bound to a device, given ownership of
+ * cm_id, and a RDMA_CM_EVENT_CONNECT_REQUEST is delivered to its event
+ * handler.
+ *
+ * Returns 0 when the request was handed to the user successfully;
+ * -ECONNABORTED when the listener is no longer in CMA_LISTEN; -ENOMEM when
+ * no new id could be created (the helpers return NULL for every failure, not
+ * only allocation failure); otherwise the error from cma_acquire_dev() or
+ * the user's event handler.
+ */
+static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
+{
+	struct rdma_id_private *listen_id, *conn_id;
+	struct rdma_cm_event event;
+	int offset, ret;
+
+	listen_id = cm_id->context;
+	/* On success the listener's handler_mutex stays locked until 'out'. */
+	if (cma_disable_callback(listen_id, CMA_LISTEN))
+		return -ECONNABORTED;
+
+	memset(&event, 0, sizeof event);
+	offset = cma_user_data_offset(listen_id->id.ps);
+	event.event = RDMA_CM_EVENT_CONNECT_REQUEST;
+	if (cma_is_ud_ps(listen_id->id.ps)) {
+		conn_id = cma_new_udp_id(&listen_id->id, ib_event);
+		event.param.ud.private_data = ib_event->private_data + offset;
+		event.param.ud.private_data_len =
+				IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset;
+	} else {
+		conn_id = cma_new_conn_id(&listen_id->id, ib_event);
+		cma_set_req_event_data(&event, &ib_event->param.req_rcvd,
+				       ib_event->private_data, offset);
+	}
+	if (!conn_id) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* Second handler_mutex taken while the listener's is already held. */
+	mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING);
+	mutex_lock(&lock);
+	ret = cma_acquire_dev(conn_id);
+	mutex_unlock(&lock);
+	if (ret)
+		goto release_conn_id;
+
+	/* From here on CM events for cm_id are routed to the new id. */
+	conn_id->cm_id.ib = cm_id;
+	cm_id->context = conn_id;
+	cm_id->cm_handler = cma_ib_handler;
+
+	ret = conn_id->id.event_handler(&conn_id->id, &event);
+	if (!ret) {
+		/*
+		 * Acquire mutex to prevent user executing rdma_destroy_id()
+		 * while we're accessing the cm_id.
+		 */
+		mutex_lock(&lock);
+		if (cma_comp(conn_id, CMA_CONNECT) &&
+		    !cma_is_ud_ps(conn_id->id.ps))
+			ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0);
+		mutex_unlock(&lock);
+		mutex_unlock(&conn_id->handler_mutex);
+		goto out;
+	}
+
+	/* Destroy the CM ID by returning a non-zero value. */
+	conn_id->cm_id.ib = NULL;
+
+release_conn_id:
+	cma_exch(conn_id, CMA_DESTROYING);
+	mutex_unlock(&conn_id->handler_mutex);
+	rdma_destroy_id(&conn_id->id);
+
+out:
+	mutex_unlock(&listen_id->handler_mutex);
+	return ret;
+}
+
+/*
+ * Build the big-endian service ID for a port space and address: the port
+ * space shifted left by 16 bits plus the host-order port number.
+ */
+static __be64 cma_get_service_id(enum rdma_port_space ps, struct sockaddr *addr)
+{
+	u64 svc = (u64) ps << 16;
+
+	svc += be16_to_cpu(cma_port(addr));
+	return cpu_to_be64(svc);
+}
+
+/*
+ * Fill in the private-data compare structure used when listening on a
+ * specific address.
+ *
+ * compare is zeroed first.  For AF_INET (and AF_INET6 when INET6 is defined)
+ * the data part receives the IP version and destination address, and the
+ * mask part selects exactly those fields.  The header layout used is struct
+ * sdp_hh for RDMA_PS_SDP and struct cma_hdr otherwise.  Other address
+ * families leave compare zeroed.
+ */
+static void cma_set_compare_data(enum rdma_port_space ps, struct sockaddr *addr,
+				 struct ib_cm_compare_data *compare)
+{
+	struct cma_hdr *cma_data, *cma_mask;
+	struct sdp_hh *sdp_data, *sdp_mask;
+	int is_sdp = (ps == RDMA_PS_SDP);
+
+	memset(compare, 0, sizeof *compare);
+
+	/* Both header views alias the same data/mask buffers. */
+	cma_data = (void *) compare->data;
+	cma_mask = (void *) compare->mask;
+	sdp_data = (void *) compare->data;
+	sdp_mask = (void *) compare->mask;
+
+	if (addr->sa_family == AF_INET) {
+		__be32 ip4_addr = ((struct sockaddr_in *) addr)->sin_addr.s_addr;
+
+		if (is_sdp) {
+			sdp_set_ip_ver(sdp_data, 4);
+			sdp_set_ip_ver(sdp_mask, 0xF);
+			sdp_data->dst_addr.ip4.addr = ip4_addr;
+			sdp_mask->dst_addr.ip4.addr = htonl(~0);
+		} else {
+			cma_set_ip_ver(cma_data, 4);
+			cma_set_ip_ver(cma_mask, 0xF);
+			cma_data->dst_addr.ip4.addr = ip4_addr;
+			cma_mask->dst_addr.ip4.addr = htonl(~0);
+		}
+		return;
+	}
+#ifdef INET6
+	if (addr->sa_family == AF_INET6) {
+		struct in6_addr ip6_addr = ((struct sockaddr_in6 *) addr)->sin6_addr;
+
+		if (is_sdp) {
+			sdp_set_ip_ver(sdp_data, 6);
+			sdp_set_ip_ver(sdp_mask, 0xF);
+			sdp_data->dst_addr.ip6 = ip6_addr;
+			memset(&sdp_mask->dst_addr.ip6, 0xFF,
+			       sizeof sdp_mask->dst_addr.ip6);
+		} else {
+			cma_set_ip_ver(cma_data, 6);
+			cma_set_ip_ver(cma_mask, 0xF);
+			cma_data->dst_addr.ip6 = ip6_addr;
+			memset(&cma_mask->dst_addr.ip6, 0xFF,
+			       sizeof cma_mask->dst_addr.ip6);
+		}
+	}
+#endif
+}
+
+/*
+ * iWARP CM event callback for a connected (non-listening) id.
+ *
+ * Events are processed only while the id is in CMA_CONNECT; otherwise 0 is
+ * returned immediately.  The iWARP event is translated into an rdma_cm_event
+ * (status and private data are copied through) and delivered to the id's
+ * event handler with handler_mutex held.
+ *
+ * Returns 0 normally.  If the event handler returns non-zero, the id is
+ * destroyed here and that value is returned.
+ */
+static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event)
+{
+	struct rdma_id_private *id_priv = iw_id->context;
+	struct rdma_cm_event event;
+	struct sockaddr_in *sin;
+	int ret = 0;
+
+	/* On success cma_disable_callback() leaves handler_mutex locked. */
+	if (cma_disable_callback(id_priv, CMA_CONNECT))
+		return 0;
+
+	memset(&event, 0, sizeof event);
+	switch (iw_event->event) {
+	case IW_CM_EVENT_CLOSE:
+		event.event = RDMA_CM_EVENT_DISCONNECTED;
+		break;
+	case IW_CM_EVENT_CONNECT_REPLY:
+		/* Record the addresses reported by the iWARP CM in the route. */
+		sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr;
+		*sin = iw_event->local_addr;
+		sin = (struct sockaddr_in *) &id_priv->id.route.addr.dst_addr;
+		*sin = iw_event->remote_addr;
+		/* Map the reply status onto the matching RDMA CM event. */
+		switch (iw_event->status) {
+		case 0:
+			event.event = RDMA_CM_EVENT_ESTABLISHED;
+			break;
+		case -ECONNRESET:
+		case -ECONNREFUSED:
+			event.event = RDMA_CM_EVENT_REJECTED;
+			break;
+		case -ETIMEDOUT:
+			event.event = RDMA_CM_EVENT_UNREACHABLE;
+			break;
+		default:
+			event.event = RDMA_CM_EVENT_CONNECT_ERROR;
+			break;
+		}
+		break;
+	case IW_CM_EVENT_ESTABLISHED:
+		event.event = RDMA_CM_EVENT_ESTABLISHED;
+		break;
+	default:
+		/* Any other event type is treated as a programming error. */
+		BUG_ON(1);
+	}
+
+	event.status = iw_event->status;
+	event.param.conn.private_data = iw_event->private_data;
+	event.param.conn.private_data_len = iw_event->private_data_len;
+	ret = id_priv->id.event_handler(&id_priv->id, &event);
+	if (ret) {
+		/* Destroy the CM ID by returning a non-zero value. */
+		/* Clearing cm_id.iw keeps rdma_destroy_id() from destroying it too. */
+		id_priv->cm_id.iw = NULL;
+		cma_exch(id_priv, CMA_DESTROYING);
+		mutex_unlock(&id_priv->handler_mutex);
+		rdma_destroy_id(&id_priv->id);
+		return ret;
+	}
+
+	mutex_unlock(&id_priv->handler_mutex);
+	return ret;
+}
+
+/*
+ * iWARP CM callback for a listening id: handles an incoming connection
+ * request.
+ *
+ * A new RDMA_PS_TCP id is created with the listener's event handler and
+ * context, bound to the device that owns the request's local address, given
+ * ownership of cm_id, and a RDMA_CM_EVENT_CONNECT_REQUEST is delivered to
+ * its event handler.
+ *
+ * Returns 0 on success; -ECONNABORTED when the listener is no longer in
+ * CMA_LISTEN; -ENOMEM when the new id cannot be created; -EADDRNOTAVAIL when
+ * no net device has the local address; otherwise the error from the failing
+ * step or from the user's event handler.  On every failure after creation
+ * the new id is destroyed.
+ */
+static int iw_conn_req_handler(struct iw_cm_id *cm_id,
+			       struct iw_cm_event *iw_event)
+{
+	struct rdma_cm_id *new_cm_id;
+	struct rdma_id_private *listen_id, *conn_id;
+	struct sockaddr_in *sin;
+	struct net_device *dev = NULL;
+	struct rdma_cm_event event;
+	int ret;
+	struct ib_device_attr attr;
+
+	listen_id = cm_id->context;
+	/* On success the listener's handler_mutex stays locked until 'out'. */
+	if (cma_disable_callback(listen_id, CMA_LISTEN))
+		return -ECONNABORTED;
+
+	/* Create a new RDMA id for the new IW CM ID */
+	new_cm_id = rdma_create_id(listen_id->id.event_handler,
+				   listen_id->id.context,
+				   RDMA_PS_TCP);
+	if (IS_ERR(new_cm_id)) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	conn_id = container_of(new_cm_id, struct rdma_id_private, id);
+	/* Second handler_mutex taken while the listener's is already held. */
+	mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING);
+	conn_id->state = CMA_CONNECT;
+
+	/* Look up the net device that owns the request's local IPv4 address. */
+	dev = ip_dev_find(NULL, iw_event->local_addr.sin_addr.s_addr);
+	if (!dev) {
+		ret = -EADDRNOTAVAIL;
+		mutex_unlock(&conn_id->handler_mutex);
+		rdma_destroy_id(new_cm_id);
+		goto out;
+	}
+	ret = rdma_copy_addr(&conn_id->id.route.addr.dev_addr, dev, NULL);
+	if (ret) {
+		mutex_unlock(&conn_id->handler_mutex);
+		rdma_destroy_id(new_cm_id);
+		goto out;
+	}
+
+	mutex_lock(&lock);
+	ret = cma_acquire_dev(conn_id);
+	mutex_unlock(&lock);
+	if (ret) {
+		mutex_unlock(&conn_id->handler_mutex);
+		rdma_destroy_id(new_cm_id);
+		goto out;
+	}
+
+	/* From here on CM events for cm_id are routed to the new id. */
+	conn_id->cm_id.iw = cm_id;
+	cm_id->context = conn_id;
+	cm_id->cm_handler = cma_iw_handler;
+
+	sin = (struct sockaddr_in *) &new_cm_id->route.addr.src_addr;
+	*sin = iw_event->local_addr;
+	sin = (struct sockaddr_in *) &new_cm_id->route.addr.dst_addr;
+	*sin = iw_event->remote_addr;
+
+	/* Device limits are reported to the user as the connection parameters. */
+	ret = ib_query_device(conn_id->id.device, &attr);
+	if (ret) {
+		mutex_unlock(&conn_id->handler_mutex);
+		rdma_destroy_id(new_cm_id);
+		goto out;
+	}
+
+	memset(&event, 0, sizeof event);
+	event.event = RDMA_CM_EVENT_CONNECT_REQUEST;
+	event.param.conn.private_data = iw_event->private_data;
+	event.param.conn.private_data_len = iw_event->private_data_len;
+	event.param.conn.initiator_depth = attr.max_qp_init_rd_atom;
+	event.param.conn.responder_resources = attr.max_qp_rd_atom;
+	ret = conn_id->id.event_handler(&conn_id->id, &event);
+	if (ret) {
+		/* User wants to destroy the CM ID */
+		conn_id->cm_id.iw = NULL;
+		cma_exch(conn_id, CMA_DESTROYING);
+		mutex_unlock(&conn_id->handler_mutex);
+		rdma_destroy_id(&conn_id->id);
+		goto out;
+	}
+
+	mutex_unlock(&conn_id->handler_mutex);
+
+out:
+	/* Release the reference obtained from ip_dev_find(), if any. */
+	if (dev)
+		dev_put(dev);
+	mutex_unlock(&listen_id->handler_mutex);
+	return ret;
+}
+
+/*
+ * Start listening on an IB device for the id's bound address.
+ *
+ * A CM id with cma_req_handler as callback is created and put into listen
+ * mode on the service ID derived from the port space and port.  When the
+ * bound address is specific (not an "any" address), private-data compare
+ * data restricts matches to that destination address.
+ *
+ * Returns 0 on success.  If CM id creation fails, the ERR_PTR is left in
+ * id_priv->cm_id.ib and its error returned; if the listen fails, the CM id
+ * is destroyed, the pointer cleared, and the error returned.
+ */
+static int cma_ib_listen(struct rdma_id_private *id_priv)
+{
+	struct ib_cm_compare_data compare_data;
+	struct ib_cm_compare_data *cmp = NULL;
+	struct sockaddr *addr;
+	__be64 svc_id;
+	int ret;
+
+	id_priv->cm_id.ib = ib_create_cm_id(id_priv->id.device, cma_req_handler,
+					    id_priv);
+	if (IS_ERR(id_priv->cm_id.ib))
+		return PTR_ERR(id_priv->cm_id.ib);
+
+	addr = (struct sockaddr *) &id_priv->id.route.addr.src_addr;
+	svc_id = cma_get_service_id(id_priv->id.ps, addr);
+	if (!cma_any_addr(addr)) {
+		cma_set_compare_data(id_priv->id.ps, addr, &compare_data);
+		cmp = &compare_data;
+	}
+
+	ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, cmp);
+	if (!ret)
+		return 0;
+
+	ib_destroy_cm_id(id_priv->cm_id.ib);
+	id_priv->cm_id.ib = NULL;
+	return ret;
+}
+
+/*
+ * Start listening on an iWARP device for the id's bound address.
+ *
+ * An iWARP CM id with iw_conn_req_handler as callback is created, its local
+ * address is set from the id's source address, and it is put into listen
+ * mode with the given backlog.
+ *
+ * Returns 0 on success.  If CM id creation fails, the ERR_PTR is left in
+ * id_priv->cm_id.iw and its error returned; if the listen fails, the CM id
+ * is destroyed, the pointer cleared, and the error returned.
+ */
+static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog)
+{
+	int ret;
+
+	id_priv->cm_id.iw = iw_create_cm_id(id_priv->id.device,
+					    iw_conn_req_handler,
+					    id_priv);
+	if (IS_ERR(id_priv->cm_id.iw))
+		return PTR_ERR(id_priv->cm_id.iw);
+
+	id_priv->cm_id.iw->local_addr =
+		*(struct sockaddr_in *) &id_priv->id.route.addr.src_addr;
+
+	ret = iw_cm_listen(id_priv->cm_id.iw, backlog);
+	if (!ret)
+		return 0;
+
+	iw_destroy_cm_id(id_priv->cm_id.iw);
+	id_priv->cm_id.iw = NULL;
+	return ret;
+}
+
+/*
+ * Event handler installed on ids whose context is another rdma_id_private.
+ * It rewrites the id's context and event handler to those of that
+ * rdma_id_private and then forwards the event to the handler just installed.
+ */
+static int cma_listen_handler(struct rdma_cm_id *id,
+			      struct rdma_cm_event *event)
+{
+	struct rdma_id_private *parent = id->context;
+
+	id->context = parent->id.context;
+	id->event_handler = parent->id.event_handler;
+
+	return id->event_handler(id, event);
+}
+
+/*
+ * Create an internal listening id on cma_dev on behalf of id_priv.
+ *
+ * The new id copies id_priv's source address and port space, is attached to
+ * cma_dev, linked on id_priv->listen_list, and holds a reference on id_priv.
+ * A failure to allocate the id is silent; a failure to listen is logged.
+ */
+static void cma_listen_on_dev(struct rdma_id_private *id_priv,
+			      struct cma_device *cma_dev)
+{
+	struct rdma_id_private *child;
+	struct rdma_cm_id *child_id;
+	int err;
+
+	child_id = rdma_create_id(cma_listen_handler, id_priv, id_priv->id.ps);
+	if (IS_ERR(child_id))
+		return;
+
+	child = container_of(child_id, struct rdma_id_private, id);
+	child->state = CMA_ADDR_BOUND;
+	memcpy(&child_id->route.addr.src_addr, &id_priv->id.route.addr.src_addr,
+	       ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr));
+
+	cma_attach_to_dev(child, cma_dev);
+	list_add_tail(&child->listen_list, &id_priv->listen_list);
+	/* Reference released by rdma_destroy_id() for internal ids. */
+	atomic_inc(&id_priv->refcount);
+	child->internal_id = 1;
+
+	err = rdma_listen(child_id, id_priv->backlog);
+	if (!err)
+		return;
+
+	printk(KERN_WARNING "RDMA CMA: cma_listen_on_dev, error %d, "
+	       "listening on device %s\n", err, cma_dev->device->name);
+}
+
+/*
+ * Register id_priv on listen_any_list and start a per-device listen on every
+ * device currently in dev_list, all under the global 'lock'.
+ */
+static void cma_listen_on_all(struct rdma_id_private *id_priv)
+{
+	struct cma_device *dev;
+
+	mutex_lock(&lock);
+	list_add_tail(&id_priv->list, &listen_any_list);
+	list_for_each_entry(dev, &dev_list, list) {
+		cma_listen_on_dev(id_priv, dev);
+	}
+	mutex_unlock(&lock);
+}
+
+/*
+ * Put an id into the listening state.
+ *
+ * An id still in CMA_IDLE is first bound to its source address with the
+ * family forced to AF_INET.  The id must then move from CMA_ADDR_BOUND to
+ * CMA_LISTEN.  With a device attached, the transport-specific listen is
+ * started; without one, a listen is started on every known device.
+ *
+ * Returns 0 on success, -EINVAL when the id is not in CMA_ADDR_BOUND,
+ * -ENOSYS for an unsupported transport, or the error from binding or the
+ * transport listen.  On a transport failure the backlog is cleared and the
+ * state is moved back to CMA_ADDR_BOUND.
+ */
+int rdma_listen(struct rdma_cm_id *id, int backlog)
+{
+	struct rdma_id_private *priv =
+		container_of(id, struct rdma_id_private, id);
+	struct sockaddr *src = (struct sockaddr *) &id->route.addr.src_addr;
+	int ret;
+
+	if (priv->state == CMA_IDLE) {
+		src->sa_family = AF_INET;
+		ret = rdma_bind_addr(id, src);
+		if (ret)
+			return ret;
+	}
+
+	if (!cma_comp_exch(priv, CMA_ADDR_BOUND, CMA_LISTEN))
+		return -EINVAL;
+
+	priv->backlog = backlog;
+	if (!id->device) {
+		cma_listen_on_all(priv);
+		return 0;
+	}
+
+	switch (rdma_node_get_transport(id->device->node_type)) {
+	case RDMA_TRANSPORT_IB:
+		ret = cma_ib_listen(priv);
+		break;
+	case RDMA_TRANSPORT_IWARP:
+		ret = cma_iw_listen(priv, backlog);
+		break;
+	default:
+		ret = -ENOSYS;
+		break;
+	}
+	if (!ret)
+		return 0;
+
+	priv->backlog = 0;
+	cma_comp_exch(priv, CMA_LISTEN, CMA_ADDR_BOUND);
+	return ret;
+}
+EXPORT_SYMBOL(rdma_listen);
+
+/*
+ * rdma_set_service_type - record the type of service for an ID
+ * @id: ID to update
+ * @tos: type-of-service value; stored truncated to 8 bits
+ */
+void rdma_set_service_type(struct rdma_cm_id *id, int tos)
+{
+	struct rdma_id_private *priv =
+		container_of(id, struct rdma_id_private, id);
+
+	priv->tos = (u8) tos;
+}
+EXPORT_SYMBOL(rdma_set_service_type);
+
+/*
+ * Completion callback handed to ib_sa_path_rec_get() by
+ * cma_query_ib_route().  @context is the cma_work prepared by the caller.
+ * On success the returned record is copied into the ID's route; on failure
+ * the work item is rewritten to report RDMA_CM_EVENT_ROUTE_ERROR and move
+ * the ID back to CMA_ADDR_RESOLVED.  Either way the work is queued.
+ */
+static void cma_query_handler(int status, struct ib_sa_path_rec *path_rec,
+			      void *context)
+{
+	struct cma_work *work = context;
+	struct rdma_route *route = &work->id->id.route;
+
+	if (status) {
+		work->old_state = CMA_ROUTE_QUERY;
+		work->new_state = CMA_ADDR_RESOLVED;
+		work->event.event = RDMA_CM_EVENT_ROUTE_ERROR;
+		work->event.status = status;
+	} else {
+		route->num_paths = 1;
+		*route->path_rec = *path_rec;
+	}
+
+	queue_work(cma_wq, &work->work);
+}
+
+/*
+ * Build a path record query from the ID's resolved addresses and submit it
+ * with ib_sa_path_rec_get(); cma_query_handler() receives the result with
+ * @work as its context.
+ *
+ * For an AF_INET source the ID's tos is sent as the QoS class; otherwise
+ * the source is read as a sockaddr_in6 and the traffic class is taken from
+ * its flow info.  Returns 0 if the query was issued, or the negative
+ * query id on failure.
+ */
+static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms,
+			      struct cma_work *work)
+{
+	struct rdma_addr *addr = &id_priv->id.route.addr;
+	struct rdma_dev_addr *dev_addr = &addr->dev_addr;
+	struct ib_sa_path_rec rec;
+	ib_sa_comp_mask mask;
+
+	memset(&rec, 0, sizeof(rec));
+	rdma_addr_get_sgid(dev_addr, &rec.sgid);
+	rdma_addr_get_dgid(dev_addr, &rec.dgid);
+	rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr));
+	rec.numb_path = 1;
+	rec.reversible = 1;
+	rec.service_id = cma_get_service_id(id_priv->id.ps,
+					    (struct sockaddr *) &addr->dst_addr);
+
+	mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID |
+	       IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH |
+	       IB_SA_PATH_REC_REVERSIBLE | IB_SA_PATH_REC_SERVICE_ID;
+
+	if (addr->src_addr.ss_family != AF_INET) {
+		struct sockaddr_in6 *sin6 =
+			(struct sockaddr_in6 *) &addr->src_addr;
+
+		rec.traffic_class = (u8) (be32_to_cpu(sin6->sin6_flowinfo) >> 20);
+		mask |= IB_SA_PATH_REC_TRAFFIC_CLASS;
+	} else {
+		rec.qos_class = cpu_to_be16((u16) id_priv->tos);
+		mask |= IB_SA_PATH_REC_QOS_CLASS;
+	}
+
+	if (tavor_quirk) {
+		rec.mtu_selector = IB_SA_LT;
+		rec.mtu = IB_MTU_2048;
+	}
+
+	id_priv->query_id = ib_sa_path_rec_get(&sa_client, id_priv->id.device,
+					       id_priv->id.port_num, &rec,
+					       mask, timeout_ms,
+					       GFP_KERNEL, cma_query_handler,
+					       work, &id_priv->query);
+
+	if (id_priv->query_id < 0)
+		return id_priv->query_id;
+	return 0;
+}
+
+/*
+ * Work function for cma_work items.  With handler_mutex held, the ID is
+ * moved from work->old_state to work->new_state; if that transition
+ * succeeds the user's event handler is called with work->event.  A
+ * non-zero handler return marks the ID CMA_DESTROYING and it is destroyed
+ * after the mutex and one ID reference have been dropped.  The work item
+ * is always freed.
+ */
+static void cma_work_handler(struct work_struct *_work)
+{
+	struct cma_work *work = container_of(_work, struct cma_work, work);
+	struct rdma_id_private *id_priv = work->id;
+	int destroy = 0;
+
+	mutex_lock(&id_priv->handler_mutex);
+	if (cma_comp_exch(id_priv, work->old_state, work->new_state) &&
+	    id_priv->id.event_handler(&id_priv->id, &work->event)) {
+		cma_exch(id_priv, CMA_DESTROYING);
+		destroy = 1;
+	}
+	mutex_unlock(&id_priv->handler_mutex);
+
+	cma_deref_id(id_priv);
+	if (destroy)
+		rdma_destroy_id(&id_priv->id);
+	kfree(work);
+}
+
+/*
+ * Work function for cma_ndev_work items.  With handler_mutex held, the
+ * user's event handler is called with work->event unless the ID is already
+ * in CMA_DESTROYING or CMA_DEVICE_REMOVAL.  A non-zero handler return
+ * marks the ID CMA_DESTROYING and it is destroyed after the mutex and one
+ * ID reference have been dropped.  The work item is always freed.
+ */
+static void cma_ndev_work_handler(struct work_struct *_work)
+{
+	struct cma_ndev_work *work = container_of(_work, struct cma_ndev_work, work);
+	struct rdma_id_private *id_priv = work->id;
+	int destroy = 0;
+
+	mutex_lock(&id_priv->handler_mutex);
+	if (id_priv->state != CMA_DESTROYING &&
+	    id_priv->state != CMA_DEVICE_REMOVAL &&
+	    id_priv->id.event_handler(&id_priv->id, &work->event)) {
+		cma_exch(id_priv, CMA_DESTROYING);
+		destroy = 1;
+	}
+	mutex_unlock(&id_priv->handler_mutex);
+
+	cma_deref_id(id_priv);
+	if (destroy)
+		rdma_destroy_id(&id_priv->id);
+	kfree(work);
+}
+
+/*
+ * Start an asynchronous path record query for an InfiniBand route.
+ *
+ * A work item preset to report RDMA_CM_EVENT_ROUTE_RESOLVED (and move the
+ * ID CMA_ROUTE_QUERY -> CMA_ROUTE_RESOLVED) and a single path record slot
+ * are allocated, then the query is issued.  If anything fails, both
+ * allocations are released and route->path_rec is left NULL.
+ *
+ * Returns 0 if the query was issued, -ENOMEM, or the query error.
+ */
+static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms)
+{
+	struct rdma_route *route = &id_priv->id.route;
+	struct cma_work *work;
+	int ret;
+
+	work = kzalloc(sizeof(*work), GFP_KERNEL);
+	if (work == NULL)
+		return -ENOMEM;
+
+	work->id = id_priv;
+	INIT_WORK(&work->work, cma_work_handler);
+	work->old_state = CMA_ROUTE_QUERY;
+	work->new_state = CMA_ROUTE_RESOLVED;
+	work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
+
+	route->path_rec = kmalloc(sizeof(*route->path_rec), GFP_KERNEL);
+	if (route->path_rec == NULL) {
+		kfree(work);
+		return -ENOMEM;
+	}
+
+	ret = cma_query_ib_route(id_priv, timeout_ms, work);
+	if (!ret)
+		return 0;
+
+	kfree(route->path_rec);
+	route->path_rec = NULL;
+	kfree(work);
+	return ret;
+}
+
+/*
+ * rdma_set_ib_paths - install caller-supplied path records on an ID
+ * @id: ID whose route is being set; must be in CMA_ADDR_RESOLVED
+ * @path_rec: array of path records to copy
+ * @num_paths: number of entries in @path_rec
+ *
+ * Moves the ID to CMA_ROUTE_RESOLVED and stores a private copy of the
+ * records, together with their count, in id->route.
+ *
+ * Returns 0 on success, -EINVAL if the ID is not in CMA_ADDR_RESOLVED, or
+ * -ENOMEM (with the state rolled back) if the copy cannot be allocated.
+ */
+int rdma_set_ib_paths(struct rdma_cm_id *id,
+		      struct ib_sa_path_rec *path_rec, int num_paths)
+{
+	struct rdma_id_private *id_priv;
+	int ret;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	if (!cma_comp_exch(id_priv, CMA_ADDR_RESOLVED, CMA_ROUTE_RESOLVED))
+		return -EINVAL;
+
+	id->route.path_rec = kmalloc(sizeof *path_rec * num_paths, GFP_KERNEL);
+	if (!id->route.path_rec) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	memcpy(id->route.path_rec, path_rec, sizeof *path_rec * num_paths);
+	/*
+	 * Record how many entries were installed; cma_connect_ib() uses
+	 * route.num_paths to decide whether an alternate path exists.
+	 */
+	id->route.num_paths = num_paths;
+	return 0;
+err:
+	cma_comp_exch(id_priv, CMA_ROUTE_RESOLVED, CMA_ADDR_RESOLVED);
+	return ret;
+}
+EXPORT_SYMBOL(rdma_set_ib_paths);
+
+/*
+ * Route resolution for iWARP: no query is issued here.  A work item is
+ * queued that moves the ID CMA_ROUTE_QUERY -> CMA_ROUTE_RESOLVED and
+ * delivers RDMA_CM_EVENT_ROUTE_RESOLVED from cma_work_handler().
+ * @timeout_ms is not used.
+ *
+ * Returns 0, or -ENOMEM if the work item cannot be allocated.
+ */
+static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms)
+{
+	struct cma_work *resolved = kzalloc(sizeof(*resolved), GFP_KERNEL);
+
+	if (resolved == NULL)
+		return -ENOMEM;
+
+	resolved->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
+	resolved->new_state = CMA_ROUTE_RESOLVED;
+	resolved->old_state = CMA_ROUTE_QUERY;
+	resolved->id = id_priv;
+	INIT_WORK(&resolved->work, cma_work_handler);
+
+	queue_work(cma_wq, &resolved->work);
+	return 0;
+}
+
+/*
+ * Return the service level to place in an IBoE path record: the low three
+ * bits of def_prec2sl.
+ * NOTE(review): the @tos argument is not consulted; every caller gets the
+ * same value regardless of the ID's type of service - confirm intended.
+ */
+static u8 tos_to_sl(u8 tos)
+{
+	return def_prec2sl & 7;
+}
+
+/*
+ * Resolve a route for an Ethernet link layer (IBoE) port without an SA
+ * query.
+ *
+ * A single path record is synthesized locally: the GIDs are derived from
+ * the source/destination MAC addresses and the bound net device's VLAN id,
+ * and MTU and rate are taken from that net device.  A work item is then
+ * queued to move the ID CMA_ROUTE_QUERY -> CMA_ROUTE_RESOLVED and deliver
+ * RDMA_CM_EVENT_ROUTE_RESOLVED.
+ *
+ * Returns 0 on success; -EINVAL if the source and destination address
+ * families differ or the device MTU maps to no IB MTU; -ENOMEM on
+ * allocation failure; -ENODEV if no bound net device can be found.  On
+ * failure route->path_rec is NULL and route->num_paths is 0.
+ */
+static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
+{
+	struct rdma_route *route = &id_priv->id.route;
+	struct rdma_addr *addr = &route->addr;
+	struct cma_work *work;
+	int ret;
+	struct sockaddr_in *src_addr = (struct sockaddr_in *)&route->addr.src_addr;
+	struct sockaddr_in *dst_addr = (struct sockaddr_in *)&route->addr.dst_addr;
+	struct net_device *ndev = NULL;
+	u16 vid;
+
+	if (src_addr->sin_family != dst_addr->sin_family)
+		return -EINVAL;
+
+	work = kzalloc(sizeof *work, GFP_KERNEL);
+	if (!work)
+		return -ENOMEM;
+
+	work->id = id_priv;
+	INIT_WORK(&work->work, cma_work_handler);
+
+	route->path_rec = kzalloc(sizeof *route->path_rec, GFP_KERNEL);
+	if (!route->path_rec) {
+		ret = -ENOMEM;
+		goto err1;
+	}
+
+	route->num_paths = 1;
+
+	if (addr->dev_addr.bound_dev_if)
+		ndev = dev_get_by_index(&init_net, addr->dev_addr.bound_dev_if);
+	if (!ndev) {
+		ret = -ENODEV;
+		goto err2;
+	}
+
+	vid = rdma_vlan_dev_vlan_id(ndev);
+
+	iboe_mac_vlan_to_ll(&route->path_rec->sgid, addr->dev_addr.src_dev_addr, vid);
+	iboe_mac_vlan_to_ll(&route->path_rec->dgid, addr->dev_addr.dst_dev_addr, vid);
+
+	route->path_rec->hop_limit = 1;
+	route->path_rec->reversible = 1;
+	route->path_rec->pkey = cpu_to_be16(0xffff);
+	route->path_rec->mtu_selector = IB_SA_EQ;
+	route->path_rec->sl = tos_to_sl(id_priv->tos);
+
+#ifdef __linux__
+	route->path_rec->mtu = iboe_get_mtu(ndev->mtu);
+#else
+	route->path_rec->mtu = iboe_get_mtu(ndev->if_mtu);
+#endif
+	route->path_rec->rate_selector = IB_SA_EQ;
+	route->path_rec->rate = iboe_get_rate(ndev);
+	/* Last use of ndev; drop the reference taken by dev_get_by_index(). */
+	dev_put(ndev);
+	route->path_rec->packet_life_time_selector = IB_SA_EQ;
+	route->path_rec->packet_life_time = IBOE_PACKET_LIFETIME;
+	if (!route->path_rec->mtu) {
+		ret = -EINVAL;
+		goto err2;
+	}
+
+	work->old_state = CMA_ROUTE_QUERY;
+	work->new_state = CMA_ROUTE_RESOLVED;
+	work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
+	work->event.status = 0;
+
+	queue_work(cma_wq, &work->work);
+
+	return 0;
+
+err2:
+	kfree(route->path_rec);
+	route->path_rec = NULL;
+	/* Keep the count in step with the (now absent) record array. */
+	route->num_paths = 0;
+err1:
+	kfree(work);
+	return ret;
+}
+
+/*
+ * rdma_resolve_route - resolve the route to the ID's destination
+ * @id: ID in CMA_ADDR_RESOLVED
+ * @timeout_ms: time to wait for resolution (passed to the IB and iWARP
+ *              resolvers)
+ *
+ * Moves the ID to CMA_ROUTE_QUERY, takes a reference on it and dispatches
+ * on transport and, for IB transport, on the port's link layer.  If the
+ * resolver fails, the state is restored to CMA_ADDR_RESOLVED and the
+ * reference is dropped.
+ *
+ * Returns 0 on success, -EINVAL for a wrong starting state, -ENOSYS for an
+ * unsupported transport or link layer, or the resolver's error.
+ */
+int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms)
+{
+	struct rdma_id_private *id_priv =
+		container_of(id, struct rdma_id_private, id);
+	int ret = -ENOSYS;
+
+	if (!cma_comp_exch(id_priv, CMA_ADDR_RESOLVED, CMA_ROUTE_QUERY))
+		return -EINVAL;
+
+	atomic_inc(&id_priv->refcount);
+	switch (rdma_node_get_transport(id->device->node_type)) {
+	case RDMA_TRANSPORT_IB:
+		switch (rdma_port_get_link_layer(id->device, id->port_num)) {
+		case IB_LINK_LAYER_INFINIBAND:
+			ret = cma_resolve_ib_route(id_priv, timeout_ms);
+			break;
+		case IB_LINK_LAYER_ETHERNET:
+			ret = cma_resolve_iboe_route(id_priv);
+			break;
+		default:
+			break;
+		}
+		break;
+	case RDMA_TRANSPORT_IWARP:
+		ret = cma_resolve_iw_route(id_priv, timeout_ms);
+		break;
+	default:
+		break;
+	}
+	if (!ret)
+		return 0;
+
+	cma_comp_exch(id_priv, CMA_ROUTE_QUERY, CMA_ADDR_RESOLVED);
+	cma_deref_id(id_priv);
+	return ret;
+}
+EXPORT_SYMBOL(rdma_resolve_route);
+
+/*
+ * Bind an ID to a local device and port for loopback use.
+ *
+ * With the global 'lock' held, the device list is scanned for the first
+ * port that ib_query_port() reports as IB_PORT_ACTIVE; if none is found,
+ * port 1 of the first device is used.  The cached GID and P_Key at index 0
+ * of that port are stored in the ID's device address, dev_type is set from
+ * the port's link layer, and the ID is attached to the device.
+ *
+ * Returns 0 on success, -ENODEV if no device is registered, or the error
+ * from the cached GID/P_Key lookup.
+ */
+static int cma_bind_loopback(struct rdma_id_private *id_priv)
+{
+	struct cma_device *cma_dev;
+	struct ib_port_attr port_attr;
+	union ib_gid gid;
+	u16 pkey;
+	int ret;
+	u8 p;
+
+	mutex_lock(&lock);
+	if (list_empty(&dev_list)) {
+		ret = -ENODEV;
+		goto out;
+	}
+	/* Prefer an active port; cma_dev and p stay set on the jump out. */
+	list_for_each_entry(cma_dev, &dev_list, list)
+		for (p = 1; p <= cma_dev->device->phys_port_cnt; ++p)
+			if (!ib_query_port(cma_dev->device, p, &port_attr) &&
+			    port_attr.state == IB_PORT_ACTIVE)
+				goto port_found;
+
+	/* No active port anywhere: fall back to the first device, port 1. */
+	p = 1;
+	cma_dev = list_entry(dev_list.next, struct cma_device, list);
+
+port_found:
+	ret = ib_get_cached_gid(cma_dev->device, p, 0, &gid);
+	if (ret)
+		goto out;
+
+	ret = ib_get_cached_pkey(cma_dev->device, p, 0, &pkey);
+	if (ret)
+		goto out;
+
+	id_priv->id.route.addr.dev_addr.dev_type =
+		(rdma_port_get_link_layer(cma_dev->device, p) == IB_LINK_LAYER_INFINIBAND) ?
+		ARPHRD_INFINIBAND : ARPHRD_ETHER;
+
+	rdma_addr_set_sgid(&id_priv->id.route.addr.dev_addr, &gid);
+	ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, pkey);
+	id_priv->id.port_num = p;
+	cma_attach_to_dev(id_priv, cma_dev);
+out:
+	mutex_unlock(&lock);
+	return ret;
+}
+
+/*
+ * Completion callback passed to rdma_resolve_ip() by rdma_resolve_addr();
+ * @context is the rdma_id_private being resolved.
+ *
+ * The ID is moved CMA_ADDR_QUERY -> CMA_ADDR_RESOLVED and, if resolution
+ * succeeded and no device is attached yet, a device is acquired.  On any
+ * failure the ID is moved back to CMA_ADDR_BOUND and the user sees
+ * RDMA_CM_EVENT_ADDR_ERROR; otherwise the resolved source address is
+ * stored and RDMA_CM_EVENT_ADDR_RESOLVED is delivered.  A non-zero return
+ * from the user's handler destroys the ID.  One ID reference is dropped on
+ * every path (presumably the one taken in rdma_resolve_addr()).
+ */
+static void addr_handler(int status, struct sockaddr *src_addr,
+			 struct rdma_dev_addr *dev_addr, void *context)
+{
+	struct rdma_id_private *id_priv = context;
+	struct rdma_cm_event event;
+
+	memset(&event, 0, sizeof event);
+	mutex_lock(&id_priv->handler_mutex);
+
+	/*
+	 * Grab mutex to block rdma_destroy_id() from removing the device while
+	 * we're trying to acquire it.
+	 */
+	mutex_lock(&lock);
+	if (!cma_comp_exch(id_priv, CMA_ADDR_QUERY, CMA_ADDR_RESOLVED)) {
+		/* The ID left CMA_ADDR_QUERY in the meantime; report nothing. */
+		mutex_unlock(&lock);
+		goto out;
+	}
+
+	if (!status && !id_priv->cma_dev)
+		status = cma_acquire_dev(id_priv);
+	mutex_unlock(&lock);
+
+	if (status) {
+		if (!cma_comp_exch(id_priv, CMA_ADDR_RESOLVED, CMA_ADDR_BOUND))
+			goto out;
+		event.event = RDMA_CM_EVENT_ADDR_ERROR;
+		event.status = status;
+	} else {
+		memcpy(&id_priv->id.route.addr.src_addr, src_addr,
+		       ip_addr_size(src_addr));
+		event.event = RDMA_CM_EVENT_ADDR_RESOLVED;
+	}
+
+	if (id_priv->id.event_handler(&id_priv->id, &event)) {
+		/* The user asked for destruction by returning non-zero. */
+		cma_exch(id_priv, CMA_DESTROYING);
+		mutex_unlock(&id_priv->handler_mutex);
+		cma_deref_id(id_priv);
+		rdma_destroy_id(&id_priv->id);
+		return;
+	}
+out:
+	mutex_unlock(&id_priv->handler_mutex);
+	cma_deref_id(id_priv);
+}
+
+/*
+ * Resolve an address locally (used by rdma_resolve_addr() when the
+ * destination is the "any" address).
+ *
+ * If the ID has no device yet, one is bound with cma_bind_loopback().  The
+ * destination GID is set equal to the source GID, and a zero source
+ * address is filled in from the destination address.  A work item is then
+ * queued to move the ID CMA_ADDR_QUERY -> CMA_ADDR_RESOLVED and deliver
+ * RDMA_CM_EVENT_ADDR_RESOLVED.
+ *
+ * Returns 0 on success, -ENOMEM, or the error from cma_bind_loopback().
+ */
+static int cma_resolve_loopback(struct rdma_id_private *id_priv)
+{
+	struct rdma_addr *addr = &id_priv->id.route.addr;
+	struct sockaddr *src = (struct sockaddr *) &addr->src_addr;
+	struct sockaddr *dst = (struct sockaddr *) &addr->dst_addr;
+	struct cma_work *work;
+	union ib_gid gid;
+	int ret;
+
+	work = kzalloc(sizeof(*work), GFP_KERNEL);
+	if (work == NULL)
+		return -ENOMEM;
+
+	if (!id_priv->cma_dev) {
+		ret = cma_bind_loopback(id_priv);
+		if (ret) {
+			kfree(work);
+			return ret;
+		}
+	}
+
+	rdma_addr_get_sgid(&addr->dev_addr, &gid);
+	rdma_addr_set_dgid(&addr->dev_addr, &gid);
+
+	if (cma_zero_addr(src)) {
+		src->sa_family = dst->sa_family;
+		if (src->sa_family == AF_INET) {
+			((struct sockaddr_in *) src)->sin_addr.s_addr =
+				((struct sockaddr_in *) dst)->sin_addr.s_addr;
+		} else {
+			ipv6_addr_copy(&((struct sockaddr_in6 *) src)->sin6_addr,
+				       &((struct sockaddr_in6 *) dst)->sin6_addr);
+		}
+	}
+
+	work->id = id_priv;
+	INIT_WORK(&work->work, cma_work_handler);
+	work->old_state = CMA_ADDR_QUERY;
+	work->new_state = CMA_ADDR_RESOLVED;
+	work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED;
+	queue_work(cma_wq, &work->work);
+	return 0;
+}
+
+/*
+ * Bind an ID before address resolution.  A caller-supplied source address
+ * with a non-zero family is bound as is.  Otherwise the ID's own source
+ * address is used, tagged with the destination's address family and, for
+ * AF_INET6, the destination's scope id.
+ */
+static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
+			 struct sockaddr *dst_addr)
+{
+	if (src_addr && src_addr->sa_family)
+		return rdma_bind_addr(id, src_addr);
+
+	src_addr = (struct sockaddr *) &id->route.addr.src_addr;
+	src_addr->sa_family = dst_addr->sa_family;
+	if (src_addr->sa_family == AF_INET6) {
+		struct sockaddr_in6 *src6 = (struct sockaddr_in6 *) src_addr;
+		struct sockaddr_in6 *dst6 = (struct sockaddr_in6 *) dst_addr;
+
+		src6->sin6_scope_id = dst6->sin6_scope_id;
+	}
+
+	return rdma_bind_addr(id, src_addr);
+}
+
+/*
+ * rdma_resolve_addr - resolve destination (and optional source) addresses
+ * @id: ID to resolve on; bound first if still in CMA_IDLE
+ * @src_addr: optional source address
+ * @dst_addr: destination address
+ * @timeout_ms: time to wait for resolution
+ *
+ * Moves the ID CMA_ADDR_BOUND -> CMA_ADDR_QUERY, takes a reference on it
+ * and stores the destination.  An "any" destination is resolved locally
+ * via cma_resolve_loopback(); anything else goes through
+ * rdma_resolve_ip() with addr_handler() as the completion callback.  If
+ * starting the resolution fails, the state and the reference are restored.
+ *
+ * Returns 0 on success, -EINVAL for a wrong state, or the bind/resolve
+ * error.
+ */
+int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
+		      struct sockaddr *dst_addr, int timeout_ms)
+{
+	struct rdma_id_private *id_priv =
+		container_of(id, struct rdma_id_private, id);
+	struct rdma_addr *addr = &id->route.addr;
+	int ret;
+
+	if (id_priv->state == CMA_IDLE) {
+		ret = cma_bind_addr(id, src_addr, dst_addr);
+		if (ret)
+			return ret;
+	}
+
+	if (!cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_ADDR_QUERY))
+		return -EINVAL;
+
+	atomic_inc(&id_priv->refcount);
+	memcpy(&addr->dst_addr, dst_addr, ip_addr_size(dst_addr));
+	if (!cma_any_addr(dst_addr))
+		ret = rdma_resolve_ip(&addr_client,
+				      (struct sockaddr *) &addr->src_addr,
+				      dst_addr, &addr->dev_addr,
+				      timeout_ms, addr_handler, id_priv);
+	else
+		ret = cma_resolve_loopback(id_priv);
+	if (!ret)
+		return 0;
+
+	cma_comp_exch(id_priv, CMA_ADDR_QUERY, CMA_ADDR_BOUND);
+	cma_deref_id(id_priv);
+	return ret;
+}
+EXPORT_SYMBOL(rdma_resolve_addr);
+
+/*
+ * Make id_priv an owner of bind_list: link it onto the list's owners and
+ * write the list's port, in network byte order, into the ID's source
+ * address (accessed through its sockaddr_in view).
+ */
+static void cma_bind_port(struct rdma_bind_list *bind_list,
+			  struct rdma_id_private *id_priv)
+{
+	struct sockaddr_in *src =
+		(struct sockaddr_in *) &id_priv->id.route.addr.src_addr;
+
+	src->sin_port = htons(bind_list->port);
+
+	id_priv->bind_list = bind_list;
+	hlist_add_head(&id_priv->node, &bind_list->owners);
+}
+
+/*
+ * Allocate a bind list for the specific port @snum in port space @ps and
+ * bind id_priv to it.
+ *
+ * Returns 0 on success, -ENOMEM, the idr allocation error, or
+ * -EADDRNOTAVAIL if the idr handed back a slot other than @snum.
+ */
+static int cma_alloc_port(struct idr *ps, struct rdma_id_private *id_priv,
+			  unsigned short snum)
+{
+	struct rdma_bind_list *bind_list;
+	int port, ret;
+
+	bind_list = kzalloc(sizeof(*bind_list), GFP_KERNEL);
+	if (bind_list == NULL)
+		return -ENOMEM;
+
+	/* Retry for as long as the idr asks for, and gets, more memory. */
+	for (;;) {
+		ret = idr_get_new_above(ps, bind_list, snum, &port);
+		if (ret != -EAGAIN || !idr_pre_get(ps, GFP_KERNEL))
+			break;
+	}
+	if (ret) {
+		kfree(bind_list);
+		return ret;
+	}
+
+	if (port != snum) {
+		idr_remove(ps, port);
+		kfree(bind_list);
+		return -EADDRNOTAVAIL;
+	}
+
+	bind_list->ps = ps;
+	bind_list->port = (unsigned short) port;
+	cma_bind_port(bind_list, id_priv);
+	return 0;
+}
+
+/*
+ * Allocate a bind list on the next free port of port space @ps, searching
+ * upward from the file-level next_port cursor, and bind id_priv to it.
+ *
+ * If the slot found lies above the local port range, the search restarts
+ * once from the bottom of the range; if it is still out of range the call
+ * fails.  On success next_port is advanced past the port just used,
+ * wrapping to the bottom of the range after the top port.
+ *
+ * Returns 0 on success, -ENOMEM, the idr allocation error, or
+ * -EADDRNOTAVAIL when no port in range is free.
+ */
+static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv)
+{
+	struct rdma_bind_list *bind_list;
+	int port, ret, low, high;
+
+	bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL);
+	if (!bind_list)
+		return -ENOMEM;
+
+retry:
+	/* FIXME: add proper port randomization per like inet_csk_get_port */
+	do {
+		ret = idr_get_new_above(ps, bind_list, next_port, &port);
+	} while ((ret == -EAGAIN) && idr_pre_get(ps, GFP_KERNEL));
+
+	if (ret)
+		goto err1;
+
+	inet_get_local_port_range(&low, &high);
+	if (port > high) {
+		/* Out of range: wrap once, unless we already started at low. */
+		if (next_port != low) {
+			idr_remove(ps, port);
+			next_port = low;
+			goto retry;
+		}
+		ret = -EADDRNOTAVAIL;
+		goto err2;
+	}
+
+	if (port == high)
+		next_port = low;
+	else
+		next_port = port + 1;
+
+	bind_list->ps = ps;
+	bind_list->port = (unsigned short) port;
+	cma_bind_port(bind_list, id_priv);
+	return 0;
+err2:
+	idr_remove(ps, port);
+err1:
+	kfree(bind_list);
+	return ret;
+}
+
+/*
+ * Bind id_priv to the specific port already stored in its source address.
+ *
+ * If nobody owns the port yet, a new bind list is allocated.  Otherwise
+ * the port may be shared only between distinct specific addresses: the
+ * request fails if either side is bound to the "any" address, or if an
+ * existing owner of the same address family has the same address.  Owners
+ * of a different address family are never considered equal.
+ *
+ * Returns 0 on success, -EADDRNOTAVAIL or -EADDRINUSE on a conflict,
+ * -EACCES (Linux only) for a privileged port without capability, or the
+ * error from cma_alloc_port().
+ */
+static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv)
+{
+	struct rdma_id_private *cur_id;
+	struct sockaddr *addr, *cur_addr;
+	struct rdma_bind_list *bind_list;
+	struct hlist_node *node;
+	unsigned short snum;
+
+	addr = (struct sockaddr *) &id_priv->id.route.addr.src_addr;
+	/* As in cma_bind_port(), the port is read via the sockaddr_in view. */
+	snum = ntohs(((struct sockaddr_in *) addr)->sin_port);
+#ifdef __linux__
+	if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
+		return -EACCES;
+#endif
+
+	bind_list = idr_find(ps, snum);
+	if (!bind_list)
+		return cma_alloc_port(ps, id_priv, snum);
+
+	/*
+	 * We don't support binding to any address if anyone is bound to
+	 * a specific address on the same port.
+	 */
+	if (cma_any_addr(addr))
+		return -EADDRNOTAVAIL;
+
+	hlist_for_each_entry(cur_id, node, &bind_list->owners, node) {
+		cur_addr = (struct sockaddr *) &cur_id->id.route.addr.src_addr;
+		if (cma_any_addr(cur_addr))
+			return -EADDRNOTAVAIL;
+
+		/* Addresses of different families cannot collide. */
+		if (cur_addr->sa_family != addr->sa_family)
+			continue;
+
+		/*
+		 * Compare the address field that belongs to the family; a
+		 * sockaddr_in view of an IPv6 address would compare the flow
+		 * info instead of the address.
+		 */
+		if (addr->sa_family == AF_INET) {
+			if (((struct sockaddr_in *) addr)->sin_addr.s_addr ==
+			    ((struct sockaddr_in *) cur_addr)->sin_addr.s_addr)
+				return -EADDRINUSE;
+		} else if (!memcmp(&((struct sockaddr_in6 *) addr)->sin6_addr,
+				   &((struct sockaddr_in6 *) cur_addr)->sin6_addr,
+				   sizeof(struct in6_addr))) {
+			return -EADDRINUSE;
+		}
+	}
+
+	cma_bind_port(bind_list, id_priv);
+	return 0;
+}
+
+/*
+ * Create a kernel TCP socket and bind it to the ID's source address, then
+ * read the socket's name back into that address.  On success the socket is
+ * kept in id_priv->sock; on any failure after creation it is released.
+ *
+ * Returns 0 on success or the error from socket creation, bind or
+ * getname.
+ */
+static int cma_get_tcp_port(struct rdma_id_private *id_priv)
+{
+	struct sockaddr *src =
+		(struct sockaddr *) &id_priv->id.route.addr.src_addr;
+	struct socket *sock;
+	int size;
+	int ret;
+
+	ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+	if (ret)
+		return ret;
+
+#ifdef __linux__
+	ret = sock->ops->bind(sock, src, ip_addr_size(src));
+#else
+	ret = -sobind(sock, src, curthread);
+#endif
+	if (!ret) {
+		size = ip_addr_size(src);
+		ret = sock_getname(sock, src, &size, 0);
+	}
+	if (ret) {
+		sock_release(sock);
+		return ret;
+	}
+
+	id_priv->sock = sock;
+	return 0;
+}
+
+/*
+ * Reserve a port for the ID in the idr that matches its port space.
+ *
+ * For RDMA_PS_TCP with unify_tcp_port_space set, a kernel TCP socket is
+ * bound first via cma_get_tcp_port().  Then, under the global 'lock', a
+ * wildcard port gets any free port and a specific port is claimed with
+ * cma_use_port().
+ *
+ * Returns 0 on success, -EPROTONOSUPPORT for an unknown port space, or
+ * the error from the socket bind / port allocation.
+ */
+static int cma_get_port(struct rdma_id_private *id_priv)
+{
+	struct sockaddr *src =
+		(struct sockaddr *) &id_priv->id.route.addr.src_addr;
+	struct idr *ps;
+	int ret;
+
+	switch (id_priv->id.ps) {
+	case RDMA_PS_SDP:
+		ps = &sdp_ps;
+		break;
+	case RDMA_PS_TCP:
+		ps = &tcp_ps;
+		if (unify_tcp_port_space) {
+			ret = cma_get_tcp_port(id_priv);
+			if (ret)
+				return ret;
+		}
+		break;
+	case RDMA_PS_UDP:
+		ps = &udp_ps;
+		break;
+	case RDMA_PS_IPOIB:
+		ps = &ipoib_ps;
+		break;
+	default:
+		return -EPROTONOSUPPORT;
+	}
+
+	mutex_lock(&lock);
+	ret = cma_any_port(src) ? cma_alloc_any_port(ps, id_priv) :
+				  cma_use_port(ps, id_priv);
+	mutex_unlock(&lock);
+
+	return ret;
+}
+
+/*
+ * Validate the scope of an IPv6 address and record it on the device
+ * address.
+ *
+ * Only does work when built with INET6 and @addr is AF_INET6: a
+ * link-local address with a zero scope id is rejected with -EINVAL;
+ * otherwise the scope id is stored in dev_addr->bound_dev_if.  Returns 0
+ * in every other case.
+ */
+static int cma_check_linklocal(struct rdma_dev_addr *dev_addr,
+			       struct sockaddr *addr)
+{
+#if defined(INET6)
+	struct sockaddr_in6 *sin6;
+
+	if (addr->sa_family != AF_INET6)
+		return 0;
+
+	sin6 = (struct sockaddr_in6 *) addr;
+	/* The two arms differ only in how link-local is tested. */
+#ifdef __linux__
+	if ((ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) &&
+#else
+	if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) &&
+#endif
+	    !sin6->sin6_scope_id)
+			return -EINVAL;
+
+	dev_addr->bound_dev_if = sin6->sin6_scope_id;
+#endif
+	return 0;
+}
+
+/*
+ * rdma_bind_addr - bind an ID to a local address
+ * @id: ID in CMA_IDLE
+ * @addr: AF_INET or AF_INET6 address to bind to
+ *
+ * Moves the ID CMA_IDLE -> CMA_ADDR_BOUND.  A specific (non-"any")
+ * address is translated to a device address and a device is acquired for
+ * it under the global 'lock'.  The address is then stored as the ID's
+ * source address and a port is reserved.  On failure any attached device
+ * is detached and the state returns to CMA_IDLE.
+ *
+ * Returns 0 on success, -EAFNOSUPPORT for other address families,
+ * -EINVAL for a wrong state, or the error of the failing step.
+ */
+int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
+{
+	struct rdma_id_private *id_priv;
+	int ret;
+
+	if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
+		return -EAFNOSUPPORT;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	if (!cma_comp_exch(id_priv, CMA_IDLE, CMA_ADDR_BOUND))
+		return -EINVAL;
+
+	ret = cma_check_linklocal(&id->route.addr.dev_addr, addr);
+	if (ret)
+		goto err1;
+
+	/* Only a specific address identifies a device to attach to. */
+	if (!cma_any_addr(addr)) {
+		ret = rdma_translate_ip(addr, &id->route.addr.dev_addr);
+		if (ret)
+			goto err1;
+
+		mutex_lock(&lock);
+		ret = cma_acquire_dev(id_priv);
+		mutex_unlock(&lock);
+		if (ret)
+			goto err1;
+	}
+
+	memcpy(&id->route.addr.src_addr, addr, ip_addr_size(addr));
+	ret = cma_get_port(id_priv);
+	if (ret)
+		goto err2;
+
+	return 0;
+err2:
+	if (id_priv->cma_dev) {
+		mutex_lock(&lock);
+		cma_detach_from_dev(id_priv);
+		mutex_unlock(&lock);
+	}
+err1:
+	cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_IDLE);
+	return ret;
+}
+EXPORT_SYMBOL(rdma_bind_addr);
+
+/*
+ * Fill in the connection header at @hdr from the route's source and
+ * destination addresses.
+ *
+ * For RDMA_PS_SDP the buffer is treated as a struct sdp_hh whose major
+ * version must already equal SDP_MAJ_VERSION; for every other port space
+ * it is a struct cma_hdr and the CMA version is written.  The IP version
+ * (4 for an AF_INET source, 6 otherwise), both addresses and the source
+ * port are stored.
+ *
+ * Returns 0 on success or -EINVAL on an SDP version mismatch.
+ */
+static int cma_format_hdr(void *hdr, enum rdma_port_space ps,
+			  struct rdma_route *route)
+{
+	struct cma_hdr *cma_hdr = hdr;
+	struct sdp_hh *sdp_hdr = hdr;
+	int is_sdp = (ps == RDMA_PS_SDP);
+
+	if (is_sdp && sdp_get_majv(sdp_hdr->sdp_version) != SDP_MAJ_VERSION)
+		return -EINVAL;
+
+	if (route->addr.src_addr.ss_family == AF_INET) {
+		struct sockaddr_in *src4, *dst4;
+
+		src4 = (struct sockaddr_in *) &route->addr.src_addr;
+		dst4 = (struct sockaddr_in *) &route->addr.dst_addr;
+
+		if (is_sdp) {
+			sdp_set_ip_ver(sdp_hdr, 4);
+			sdp_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr;
+			sdp_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr;
+			sdp_hdr->port = src4->sin_port;
+		} else {
+			cma_hdr->cma_version = CMA_VERSION;
+			cma_set_ip_ver(cma_hdr, 4);
+			cma_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr;
+			cma_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr;
+			cma_hdr->port = src4->sin_port;
+		}
+	} else {
+		struct sockaddr_in6 *src6, *dst6;
+
+		src6 = (struct sockaddr_in6 *) &route->addr.src_addr;
+		dst6 = (struct sockaddr_in6 *) &route->addr.dst_addr;
+
+		if (is_sdp) {
+			sdp_set_ip_ver(sdp_hdr, 6);
+			sdp_hdr->src_addr.ip6 = src6->sin6_addr;
+			sdp_hdr->dst_addr.ip6 = dst6->sin6_addr;
+			sdp_hdr->port = src6->sin6_port;
+		} else {
+			cma_hdr->cma_version = CMA_VERSION;
+			cma_set_ip_ver(cma_hdr, 6);
+			cma_hdr->src_addr.ip6 = src6->sin6_addr;
+			cma_hdr->dst_addr.ip6 = dst6->sin6_addr;
+			cma_hdr->port = src6->sin6_port;
+		}
+	}
+	return 0;
+}
+
+/*
+ * IB CM callback for the SIDR exchange started by cma_resolve_ib_udp();
+ * cm_id->context is the owning rdma_id_private.
+ *
+ * A SIDR request error is reported as RDMA_CM_EVENT_UNREACHABLE with
+ * -ETIMEDOUT.  A received reply is reported as UNREACHABLE when its
+ * status is not success or its Q_Key differs from the ID's, as ADDR_ERROR
+ * when the local Q_Key cannot be set, and otherwise as ESTABLISHED with
+ * the address handle attributes, QP number and Q_Key filled in.  Any other
+ * CM event is logged and ignored.
+ *
+ * Returns 0, or the non-zero value returned by the user's handler, in
+ * which case the ID has been destroyed here.
+ */
+static int cma_sidr_rep_handler(struct ib_cm_id *cm_id,
+				struct ib_cm_event *ib_event)
+{
+	struct rdma_id_private *id_priv = cm_id->context;
+	struct rdma_cm_event event;
+	struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd;
+	int ret = 0;
+
+	/*
+	 * NOTE(review): handler_mutex is released on every path below but
+	 * never taken in this function; presumably cma_disable_callback()
+	 * returns 0 with it held - confirm against its definition.
+	 */
+	if (cma_disable_callback(id_priv, CMA_CONNECT))
+		return 0;
+
+	memset(&event, 0, sizeof event);
+	switch (ib_event->event) {
+	case IB_CM_SIDR_REQ_ERROR:
+		event.event = RDMA_CM_EVENT_UNREACHABLE;
+		event.status = -ETIMEDOUT;
+		break;
+	case IB_CM_SIDR_REP_RECEIVED:
+		event.param.ud.private_data = ib_event->private_data;
+		event.param.ud.private_data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE;
+		if (rep->status != IB_SIDR_SUCCESS) {
+			event.event = RDMA_CM_EVENT_UNREACHABLE;
+			event.status = ib_event->param.sidr_rep_rcvd.status;
+			break;
+		}
+		ret = cma_set_qkey(id_priv);
+		if (ret) {
+			event.event = RDMA_CM_EVENT_ADDR_ERROR;
+			event.status = -EINVAL;
+			break;
+		}
+		if (id_priv->qkey != rep->qkey) {
+			event.event = RDMA_CM_EVENT_UNREACHABLE;
+			event.status = -EINVAL;
+			break;
+		}
+		ib_init_ah_from_path(id_priv->id.device, id_priv->id.port_num,
+				     id_priv->id.route.path_rec,
+				     &event.param.ud.ah_attr);
+		event.param.ud.qp_num = rep->qpn;
+		event.param.ud.qkey = rep->qkey;
+		event.event = RDMA_CM_EVENT_ESTABLISHED;
+		event.status = 0;
+		break;
+	default:
+		printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d\n",
+		       ib_event->event);
+		goto out;
+	}
+
+	ret = id_priv->id.event_handler(&id_priv->id, &event);
+	if (ret) {
+		/* Destroy the CM ID by returning a non-zero value. */
+		id_priv->cm_id.ib = NULL;
+		cma_exch(id_priv, CMA_DESTROYING);
+		mutex_unlock(&id_priv->handler_mutex);
+		rdma_destroy_id(&id_priv->id);
+		return ret;
+	}
+out:
+	mutex_unlock(&id_priv->handler_mutex);
+	return ret;
+}
+
+/*
+ * Start a SIDR request for a datagram (UD) port-space ID.
+ *
+ * The private data sent is a CMA header followed by the caller's private
+ * data.  An IB CM ID is created with cma_sidr_rep_handler() as its
+ * callback and the request is sent on the ID's first path record.
+ *
+ * id_priv->cm_id.ib is only written once a CM ID has really been created,
+ * and is reset to NULL if sending fails, so it never holds an ERR_PTR.
+ *
+ * Returns 0 on success, -ENOMEM, or the error from header formatting, CM
+ * ID creation or the send.
+ */
+static int cma_resolve_ib_udp(struct rdma_id_private *id_priv,
+			      struct rdma_conn_param *conn_param)
+{
+	struct ib_cm_sidr_req_param req;
+	struct rdma_route *route;
+	struct ib_cm_id *cm_id;
+	int ret;
+
+	req.private_data_len = sizeof(struct cma_hdr) +
+			       conn_param->private_data_len;
+	req.private_data = kzalloc(req.private_data_len, GFP_ATOMIC);
+	if (!req.private_data)
+		return -ENOMEM;
+
+	if (conn_param->private_data && conn_param->private_data_len)
+		memcpy((void *) req.private_data + sizeof(struct cma_hdr),
+		       conn_param->private_data, conn_param->private_data_len);
+
+	route = &id_priv->id.route;
+	ret = cma_format_hdr((void *) req.private_data, id_priv->id.ps, route);
+	if (ret)
+		goto out;
+
+	cm_id = ib_create_cm_id(id_priv->id.device, cma_sidr_rep_handler,
+				id_priv);
+	if (IS_ERR(cm_id)) {
+		/* Do not publish the error pointer on the ID. */
+		ret = PTR_ERR(cm_id);
+		goto out;
+	}
+	id_priv->cm_id.ib = cm_id;
+
+	req.path = route->path_rec;
+	req.service_id = cma_get_service_id(id_priv->id.ps,
+					    (struct sockaddr *) &route->addr.dst_addr);
+	req.timeout_ms = 1 << (cma_response_timeout - 8);
+	req.max_cm_retries = CMA_MAX_CM_RETRIES;
+
+	ret = ib_send_cm_sidr_req(id_priv->cm_id.ib, &req);
+	if (ret) {
+		ib_destroy_cm_id(id_priv->cm_id.ib);
+		id_priv->cm_id.ib = NULL;
+	}
+out:
+	kfree(req.private_data);
+	return ret;
+}
+
+/*
+ * Send an IB CM connection request (REQ) for a connected port-space ID.
+ *
+ * The private data is the port space's header (of cma_user_data_offset()
+ * bytes) followed by the caller's private data.  The first path record is
+ * the primary path; when the route holds exactly two records the second
+ * is offered as the alternate path.
+ *
+ * id_priv->cm_id.ib is only written once a CM ID has really been created,
+ * and is reset to NULL (with the CM ID destroyed) on any later failure, so
+ * it never holds an ERR_PTR.
+ *
+ * Returns 0 on success, -ENOMEM, or the error from CM ID creation, header
+ * formatting or the send.
+ */
+static int cma_connect_ib(struct rdma_id_private *id_priv,
+			  struct rdma_conn_param *conn_param)
+{
+	struct ib_cm_req_param req;
+	struct rdma_route *route;
+	struct ib_cm_id *cm_id;
+	void *private_data;
+	int offset, ret;
+
+	memset(&req, 0, sizeof req);
+	offset = cma_user_data_offset(id_priv->id.ps);
+	req.private_data_len = offset + conn_param->private_data_len;
+	private_data = kzalloc(req.private_data_len, GFP_ATOMIC);
+	if (!private_data)
+		return -ENOMEM;
+
+	if (conn_param->private_data && conn_param->private_data_len)
+		memcpy(private_data + offset, conn_param->private_data,
+		       conn_param->private_data_len);
+
+	cm_id = ib_create_cm_id(id_priv->id.device, cma_ib_handler, id_priv);
+	if (IS_ERR(cm_id)) {
+		/* Do not publish the error pointer on the ID. */
+		ret = PTR_ERR(cm_id);
+		goto out;
+	}
+	id_priv->cm_id.ib = cm_id;
+
+	route = &id_priv->id.route;
+	ret = cma_format_hdr(private_data, id_priv->id.ps, route);
+	if (ret)
+		goto out;
+	req.private_data = private_data;
+
+	req.primary_path = &route->path_rec[0];
+	if (route->num_paths == 2)
+		req.alternate_path = &route->path_rec[1];
+
+	req.service_id = cma_get_service_id(id_priv->id.ps,
+					    (struct sockaddr *) &route->addr.dst_addr);
+	req.qp_num = id_priv->qp_num;
+	req.qp_type = IB_QPT_RC;
+	req.starting_psn = id_priv->seq_num;
+	req.responder_resources = conn_param->responder_resources;
+	req.initiator_depth = conn_param->initiator_depth;
+	req.flow_control = conn_param->flow_control;
+	req.retry_count = conn_param->retry_count;
+	req.rnr_retry_count = conn_param->rnr_retry_count;
+	req.remote_cm_response_timeout = cma_response_timeout;
+	req.local_cm_response_timeout = cma_response_timeout;
+	req.max_cm_retries = CMA_MAX_CM_RETRIES;
+	req.srq = id_priv->srq ? 1 : 0;
+
+	ret = ib_send_cm_req(id_priv->cm_id.ib, &req);
+out:
+	if (ret && !IS_ERR(cm_id)) {
+		ib_destroy_cm_id(cm_id);
+		id_priv->cm_id.ib = NULL;
+	}
+
+	kfree(private_data);
+	return ret;
+}
+
+/*
+ * Start an iWARP connection for the ID.
+ *
+ * An iWARP CM ID is created with cma_iw_handler() as its callback and
+ * given the ID's source and destination addresses.  The QP is moved to
+ * RTR and the connect is issued with the caller's ORD/IRD and private
+ * data; the QP number comes from the ID when it owns a QP, otherwise from
+ * @conn_param.  If anything fails after creation the CM ID is destroyed
+ * and cleared from the ID.
+ *
+ * Returns 0 on success or the error of the failing step.
+ */
+static int cma_connect_iw(struct rdma_id_private *id_priv,
+			  struct rdma_conn_param *conn_param)
+{
+	struct rdma_addr *addr = &id_priv->id.route.addr;
+	struct iw_cm_conn_param iw_param;
+	struct iw_cm_id *cm_id;
+	int ret;
+
+	cm_id = iw_create_cm_id(id_priv->id.device, cma_iw_handler, id_priv);
+	if (IS_ERR(cm_id))
+		return PTR_ERR(cm_id);
+
+	id_priv->cm_id.iw = cm_id;
+	cm_id->local_addr = *(struct sockaddr_in *) &addr->src_addr;
+	cm_id->remote_addr = *(struct sockaddr_in *) &addr->dst_addr;
+
+	ret = cma_modify_qp_rtr(id_priv, conn_param);
+	if (!ret) {
+		iw_param.ord = conn_param->initiator_depth;
+		iw_param.ird = conn_param->responder_resources;
+		iw_param.private_data = conn_param->private_data;
+		iw_param.private_data_len = conn_param->private_data_len;
+		iw_param.qpn = id_priv->id.qp ? id_priv->qp_num :
+						conn_param->qp_num;
+		ret = iw_cm_connect(cm_id, &iw_param);
+	}
+
+	if (ret) {
+		iw_destroy_cm_id(cm_id);
+		id_priv->cm_id.iw = NULL;
+	}
+	return ret;
+}
+
+/*
+ * rdma_connect - initiate an active connection request
+ * @id: ID in CMA_ROUTE_RESOLVED
+ * @conn_param: connection parameters
+ *
+ * Moves the ID to CMA_CONNECT.  When the ID has no QP of its own, the QP
+ * number and SRQ flag are taken from @conn_param.  IB transport uses the
+ * SIDR path for datagram port spaces and a CM REQ otherwise; iWARP uses
+ * its own connect.  On failure the state returns to CMA_ROUTE_RESOLVED.
+ *
+ * Returns 0 on success, -EINVAL for a wrong state, -ENOSYS for an unknown
+ * transport, or the transport's error.
+ */
+int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
+{
+	struct rdma_id_private *id_priv =
+		container_of(id, struct rdma_id_private, id);
+	int ret;
+
+	if (!cma_comp_exch(id_priv, CMA_ROUTE_RESOLVED, CMA_CONNECT))
+		return -EINVAL;
+
+	if (!id->qp) {
+		id_priv->qp_num = conn_param->qp_num;
+		id_priv->srq = conn_param->srq;
+	}
+
+	switch (rdma_node_get_transport(id->device->node_type)) {
+	case RDMA_TRANSPORT_IB:
+		ret = cma_is_ud_ps(id->ps) ?
+			cma_resolve_ib_udp(id_priv, conn_param) :
+			cma_connect_ib(id_priv, conn_param);
+		break;
+	case RDMA_TRANSPORT_IWARP:
+		ret = cma_connect_iw(id_priv, conn_param);
+		break;
+	default:
+		ret = -ENOSYS;
+		break;
+	}
+	if (!ret)
+		return 0;
+
+	cma_comp_exch(id_priv, CMA_CONNECT, CMA_ROUTE_RESOLVED);
+	return ret;
+}
+EXPORT_SYMBOL(rdma_connect);
+
+/*
+ * Accept an IB connection request: move the QP to RTR and then RTS, and
+ * send a CM REP built from the ID's QP number and starting PSN plus the
+ * caller's connection parameters.
+ *
+ * Returns 0 on success or the error from the QP transitions or the send.
+ */
+static int cma_accept_ib(struct rdma_id_private *id_priv,
+			 struct rdma_conn_param *conn_param)
+{
+	struct ib_cm_rep_param rep;
+	int ret;
+
+	ret = cma_modify_qp_rtr(id_priv, conn_param);
+	if (ret)
+		return ret;
+
+	ret = cma_modify_qp_rts(id_priv, conn_param);
+	if (ret)
+		return ret;
+
+	memset(&rep, 0, sizeof(rep));
+	rep.qp_num = id_priv->qp_num;
+	rep.starting_psn = id_priv->seq_num;
+	rep.srq = id_priv->srq ? 1 : 0;
+	rep.failover_accepted = 0;
+
+	rep.private_data = conn_param->private_data;
+	rep.private_data_len = conn_param->private_data_len;
+	rep.responder_resources = conn_param->responder_resources;
+	rep.initiator_depth = conn_param->initiator_depth;
+	rep.flow_control = conn_param->flow_control;
+	rep.rnr_retry_count = conn_param->rnr_retry_count;
+
+	return ib_send_cm_rep(id_priv->cm_id.ib, &rep);
+}
+
+/*
+ * Accept an iWARP connection request: move the QP to RTR and call
+ * iw_cm_accept() with the caller's ORD/IRD and private data.  The QP
+ * number comes from the ID when it owns a QP, otherwise from @conn_param.
+ *
+ * @conn_param is required here; rdma_accept() allows callers to pass
+ * NULL, so that case is rejected instead of being dereferenced.
+ *
+ * Returns 0 on success, -EINVAL if @conn_param is NULL, or the error from
+ * the QP transition or the accept.
+ */
+static int cma_accept_iw(struct rdma_id_private *id_priv,
+		  struct rdma_conn_param *conn_param)
+{
+	struct iw_cm_conn_param iw_param;
+	int ret;
+
+	if (!conn_param)
+		return -EINVAL;
+
+	ret = cma_modify_qp_rtr(id_priv, conn_param);
+	if (ret)
+		return ret;
+
+	iw_param.ord = conn_param->initiator_depth;
+	iw_param.ird = conn_param->responder_resources;
+	iw_param.private_data = conn_param->private_data;
+	iw_param.private_data_len = conn_param->private_data_len;
+	if (id_priv->id.qp) {
+		iw_param.qpn = id_priv->qp_num;
+	} else
+		iw_param.qpn = conn_param->qp_num;
+
+	return iw_cm_accept(id_priv->cm_id.iw, &iw_param);
+}
+
+/*
+ * Send a SIDR reply with the given status and private data.  For
+ * IB_SIDR_SUCCESS the ID's Q_Key is set up first and the reply carries the
+ * ID's QP number and Q_Key.
+ *
+ * Returns 0 on success or the error from cma_set_qkey() or the send.
+ */
+static int cma_send_sidr_rep(struct rdma_id_private *id_priv,
+			     enum ib_cm_sidr_status status,
+			     const void *private_data, int private_data_len)
+{
+	struct ib_cm_sidr_rep_param rep;
+
+	memset(&rep, 0, sizeof(rep));
+	rep.status = status;
+	rep.private_data = private_data;
+	rep.private_data_len = private_data_len;
+
+	if (status == IB_SIDR_SUCCESS) {
+		int ret = cma_set_qkey(id_priv);
+
+		if (ret)
+			return ret;
+		rep.qp_num = id_priv->qp_num;
+		rep.qkey = id_priv->qkey;
+	}
+
+	return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep);
+}
+
+/*
+ * rdma_accept - accept a connection request
+ * @id: ID in CMA_CONNECT
+ * @conn_param: connection parameters; may be NULL
+ *
+ * When the ID has no QP of its own and @conn_param is given, the QP
+ * number and SRQ flag are taken from it.  On IB transport a datagram port
+ * space answers with a successful SIDR reply (without private data when
+ * @conn_param is NULL); a connected port space sends a REP when
+ * @conn_param is given and otherwise completes via cma_rep_recv().  iWARP
+ * uses cma_accept_iw().
+ *
+ * On failure the QP is moved to the error state and the request is
+ * rejected.  Returns 0 on success, -EINVAL for a wrong state, -ENOSYS for
+ * an unknown transport, or the transport's error.
+ */
+int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
+{
+	struct rdma_id_private *id_priv;
+	int ret;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	if (!cma_comp(id_priv, CMA_CONNECT))
+		return -EINVAL;
+
+	if (!id->qp && conn_param) {
+		id_priv->qp_num = conn_param->qp_num;
+		id_priv->srq = conn_param->srq;
+	}
+
+	switch (rdma_node_get_transport(id->device->node_type)) {
+	case RDMA_TRANSPORT_IB:
+		if (cma_is_ud_ps(id->ps)) {
+			/* conn_param is optional; never dereference NULL. */
+			if (conn_param)
+				ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS,
+							conn_param->private_data,
+							conn_param->private_data_len);
+			else
+				ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS,
+							NULL, 0);
+		} else if (conn_param)
+			ret = cma_accept_ib(id_priv, conn_param);
+		else
+			ret = cma_rep_recv(id_priv);
+		break;
+	case RDMA_TRANSPORT_IWARP:
+		ret = cma_accept_iw(id_priv, conn_param);
+		break;
+	default:
+		ret = -ENOSYS;
+		break;
+	}
+
+	if (ret)
+		goto reject;
+
+	return 0;
+reject:
+	cma_modify_qp_err(id_priv);
+	rdma_reject(id, NULL, 0);
+	return ret;
+}
+EXPORT_SYMBOL(rdma_accept);
+
+/*
+ * rdma_notify - forward an asynchronous QP event to the IB CM.
+ *
+ * Returns -EINVAL when the id has no CM device.  For an IB channel
+ * adapter the result of ib_cm_notify() is returned; every other node
+ * type is a no-op that returns 0.
+ */
+int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event)
+{
+	struct rdma_id_private *id_priv =
+		container_of(id, struct rdma_id_private, id);
+
+	if (!cma_has_cm_dev(id_priv))
+		return -EINVAL;
+
+	if (id->device->node_type == RDMA_NODE_IB_CA)
+		return ib_cm_notify(id_priv->cm_id.ib, event);
+
+	return 0;
+}
+EXPORT_SYMBOL(rdma_notify);
+
+/*
+ * rdma_reject - reject a pending request on id.
+ *
+ * Returns -EINVAL when the id has no CM device.  On IB a UD port space
+ * gets a SIDR reply with IB_SIDR_REJECT, anything else gets a
+ * consumer-defined CM REJ; on iWARP iw_cm_reject() is used.  Unknown
+ * transports yield -ENOSYS.  The optional private data is passed through
+ * unchanged.
+ */
+int rdma_reject(struct rdma_cm_id *id, const void *private_data,
+		u8 private_data_len)
+{
+	struct rdma_id_private *id_priv =
+		container_of(id, struct rdma_id_private, id);
+
+	if (!cma_has_cm_dev(id_priv))
+		return -EINVAL;
+
+	switch (rdma_node_get_transport(id->device->node_type)) {
+	case RDMA_TRANSPORT_IB:
+		if (cma_is_ud_ps(id->ps))
+			return cma_send_sidr_rep(id_priv, IB_SIDR_REJECT,
+						 private_data,
+						 private_data_len);
+
+		return ib_send_cm_rej(id_priv->cm_id.ib,
+				      IB_CM_REJ_CONSUMER_DEFINED, NULL, 0,
+				      private_data, private_data_len);
+	case RDMA_TRANSPORT_IWARP:
+		return iw_cm_reject(id_priv->cm_id.iw, private_data,
+				    private_data_len);
+	default:
+		return -ENOSYS;
+	}
+}
+EXPORT_SYMBOL(rdma_reject);
+
+/*
+ * rdma_disconnect - tear down the connection associated with id.
+ *
+ * Returns -EINVAL when the id has no CM device or the transport is
+ * unknown.  On IB the QP is first moved to the error state; if that
+ * fails its error is returned, otherwise a DREQ is attempted (falling
+ * back to a DREP) and 0 is returned regardless of the DREQ/DREP outcome.
+ * On iWARP the result of iw_cm_disconnect() is returned.
+ */
+int rdma_disconnect(struct rdma_cm_id *id)
+{
+	struct rdma_id_private *id_priv;
+	int err;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	if (!cma_has_cm_dev(id_priv))
+		return -EINVAL;
+
+	switch (rdma_node_get_transport(id->device->node_type)) {
+	case RDMA_TRANSPORT_IB:
+		err = cma_modify_qp_err(id_priv);
+		if (err)
+			return err;
+
+		/* Start a disconnect; if the DREQ is refused, answer one. */
+		if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0))
+			ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0);
+		return 0;
+	case RDMA_TRANSPORT_IWARP:
+		return iw_cm_disconnect(id_priv->cm_id.iw, 0);
+	default:
+		return -EINVAL;
+	}
+}
+EXPORT_SYMBOL(rdma_disconnect);
+
+/*
+ * Multicast join completion callback.
+ *
+ * Registered with ib_sa_join_multicast() and also called directly by
+ * iboe_mcast_work_handler().  multicast->context is the cma_multicast
+ * that requested the join; status is 0 on success.
+ *
+ * On success the id's QP (if any) is attached to the group and a
+ * RDMA_CM_EVENT_MULTICAST_JOIN event carrying the address handle
+ * attributes, QPN 0xFFFFFF and the group qkey is delivered.  Otherwise
+ * RDMA_CM_EVENT_MULTICAST_ERROR is delivered.  If the user's event
+ * handler returns non-zero the id is destroyed.
+ *
+ * Always returns 0.
+ */
+static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast)
+{
+	struct rdma_id_private *id_priv;
+	struct cma_multicast *mc = multicast->context;
+	struct rdma_cm_event event;
+	int ret;
+
+	id_priv = mc->id_priv;
+	/*
+	 * Only deliver the event while the id is in CMA_ADDR_BOUND or
+	 * CMA_ADDR_RESOLVED.
+	 * NOTE(review): presumably cma_disable_callback() returns 0 with
+	 * handler_mutex held when the state matches; the mutex_unlock()
+	 * calls below rely on that - confirm against its definition.
+	 */
+	if (cma_disable_callback(id_priv, CMA_ADDR_BOUND) &&
+	    cma_disable_callback(id_priv, CMA_ADDR_RESOLVED))
+		return 0;
+
+	/* A failed attach turns a successful join into an error event. */
+	mutex_lock(&id_priv->qp_mutex);
+	if (!status && id_priv->id.qp)
+		status = ib_attach_mcast(id_priv->id.qp, &multicast->rec.mgid,
+					 multicast->rec.mlid);
+	mutex_unlock(&id_priv->qp_mutex);
+
+	memset(&event, 0, sizeof event);
+	event.status = status;
+	/* Hand back the context the user passed to rdma_join_multicast(). */
+	event.param.ud.private_data = mc->context;
+	if (!status) {
+		event.event = RDMA_CM_EVENT_MULTICAST_JOIN;
+		ib_init_ah_from_mcmember(id_priv->id.device,
+					 id_priv->id.port_num, &multicast->rec,
+					 &event.param.ud.ah_attr);
+		event.param.ud.qp_num = 0xFFFFFF;
+		event.param.ud.qkey = be32_to_cpu(multicast->rec.qkey);
+	} else
+		event.event = RDMA_CM_EVENT_MULTICAST_ERROR;
+
+	ret = id_priv->id.event_handler(&id_priv->id, &event);
+	if (ret) {
+		/* Non-zero from the user's handler: destroy the id. */
+		cma_exch(id_priv, CMA_DESTROYING);
+		mutex_unlock(&id_priv->handler_mutex);
+		rdma_destroy_id(&id_priv->id);
+		return 0;
+	}
+
+	mutex_unlock(&id_priv->handler_mutex);
+	return 0;
+}
+
+/*
+ * Derive the multicast GID for addr and store it in *mgid.
+ *
+ *  - wildcard address: all-zero MGID
+ *  - IPv6 address whose first 32 bits, masked with 0xFFF0FFFF, equal
+ *    0xFF10A01B: already an SA assigned MGID, copied verbatim
+ *  - any other IPv6 / IPv4 address: mapped through ipv6_ib_mc_map() /
+ *    ip_ib_mc_map() using the device broadcast address; for RDMA_PS_UDP
+ *    byte 7 of the map is overwritten with the RDMA CM signature.  The
+ *    GID is the 16 bytes starting at offset 4 of the map.
+ */
+static void cma_set_mgid(struct rdma_id_private *id_priv,
+			 struct sockaddr *addr, union ib_gid *mgid)
+{
+	struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) addr;
+	struct sockaddr_in *sin = (struct sockaddr_in *) addr;
+	unsigned char mc_map[MAX_ADDR_LEN];
+	int is_v6;
+
+	if (cma_any_addr(addr)) {
+		memset(mgid, 0, sizeof *mgid);
+		return;
+	}
+
+	is_v6 = (addr->sa_family == AF_INET6);
+
+	if (is_v6 &&
+	    ((be32_to_cpu(sin6->sin6_addr.s6_addr32[0]) & 0xFFF0FFFF) ==
+	     0xFF10A01B)) {
+		/* IPv6 address is an SA assigned MGID. */
+		memcpy(mgid, &sin6->sin6_addr, sizeof *mgid);
+		return;
+	}
+
+	if (is_v6)
+		ipv6_ib_mc_map(&sin6->sin6_addr, dev_addr->broadcast, mc_map);
+	else
+		ip_ib_mc_map(sin->sin_addr.s_addr, dev_addr->broadcast, mc_map);
+
+	if (id_priv->id.ps == RDMA_PS_UDP)
+		mc_map[7] = 0x01;	/* Use RDMA CM signature */
+
+	*mgid = *(union ib_gid *) (mc_map + 4);
+}
+
+/*
+ * Start an SA multicast join for mc on an InfiniBand link layer.
+ *
+ * The member record is first filled from the record looked up for the
+ * MGID returned by ib_addr_get_mgid(), then the MGID, qkey (UDP port
+ * space only), port GID, pkey and join state are overridden.  The rate
+ * components are only included in the component mask for
+ * RDMA_PS_IPOIB.  cma_ib_mc_handler() is registered as the completion
+ * callback.
+ *
+ * Returns 0 once the join has been started, or a negative errno.
+ */
+static int cma_join_ib_multicast(struct rdma_id_private *id_priv,
+				 struct cma_multicast *mc)
+{
+	struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+	struct ib_sa_mcmember_rec rec;
+	ib_sa_comp_mask mask;
+	int err;
+
+	ib_addr_get_mgid(dev_addr, &rec.mgid);
+	err = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num,
+				     &rec.mgid, &rec);
+	if (err)
+		return err;
+
+	cma_set_mgid(id_priv, (struct sockaddr *) &mc->addr, &rec.mgid);
+	if (id_priv->id.ps == RDMA_PS_UDP)
+		rec.qkey = cpu_to_be32(RDMA_UDP_QKEY);
+	rdma_addr_get_sgid(dev_addr, &rec.port_gid);
+	rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr));
+	rec.join_state = 1;
+
+	mask = IB_SA_MCMEMBER_REC_MGID |
+	       IB_SA_MCMEMBER_REC_PORT_GID |
+	       IB_SA_MCMEMBER_REC_PKEY |
+	       IB_SA_MCMEMBER_REC_JOIN_STATE |
+	       IB_SA_MCMEMBER_REC_QKEY |
+	       IB_SA_MCMEMBER_REC_SL |
+	       IB_SA_MCMEMBER_REC_FLOW_LABEL |
+	       IB_SA_MCMEMBER_REC_TRAFFIC_CLASS;
+	if (id_priv->id.ps == RDMA_PS_IPOIB)
+		mask |= IB_SA_MCMEMBER_REC_RATE |
+			IB_SA_MCMEMBER_REC_RATE_SELECTOR;
+
+	mc->multicast.ib = ib_sa_join_multicast(&sa_client, id_priv->id.device,
+						id_priv->id.port_num, &rec,
+						mask, GFP_KERNEL,
+						cma_ib_mc_handler, mc);
+
+	return IS_ERR(mc->multicast.ib) ? PTR_ERR(mc->multicast.ib) : 0;
+}
+
+
+/*
+ * Deferred completion of an IBoE multicast join.
+ *
+ * Points the multicast object's context at its owning cma_multicast,
+ * reports success through cma_ib_mc_handler(), then drops the reference
+ * on mc taken when the work was queued and frees the work item.
+ */
+static void iboe_mcast_work_handler(struct work_struct *work)
+{
+	struct iboe_mcast_work *mw;
+	struct cma_multicast *mc;
+
+	mw = container_of(work, struct iboe_mcast_work, work);
+	mc = mw->mc;
+
+	mc->multicast.ib->context = mc;
+	cma_ib_mc_handler(0, mc->multicast.ib);
+
+	kref_put(&mc->mcref, release_mc);
+	kfree(mw);
+}
+
+/*
+ * Derive the multicast GID used on an Ethernet link layer.
+ *
+ *  - wildcard address: all-zero MGID
+ *  - IPv6: the address itself is the MGID
+ *  - IPv4: ff0e:0000:0000:0000:0000:ffff:<IPv4 address>, with the
+ *    address bytes stored exactly as they appear in sin_addr.s_addr
+ */
+static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid)
+{
+	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr;
+	struct sockaddr_in *sin = (struct sockaddr_in *)addr;
+
+	if (cma_any_addr(addr)) {
+		memset(mgid, 0, sizeof *mgid);
+		return;
+	}
+
+	if (addr->sa_family == AF_INET6) {
+		memcpy(mgid, &sin6->sin6_addr, sizeof *mgid);
+		return;
+	}
+
+	/* Zero everything, then set the bytes that are not zero. */
+	memset(mgid, 0, sizeof *mgid);
+	mgid->raw[0] = 0xff;
+	mgid->raw[1] = 0x0e;
+	mgid->raw[10] = 0xff;
+	mgid->raw[11] = 0xff;
+	memcpy(&mgid->raw[12], &sin->sin_addr.s_addr,
+	       sizeof sin->sin_addr.s_addr);
+}
+
+/*
+ * Join a multicast group on an Ethernet (IBoE) link layer.
+ *
+ * Nothing is sent to a subnet administrator here: a local
+ * ib_sa_multicast record is allocated and filled in, and a work item is
+ * queued on cma_wq so that iboe_mcast_work_handler() reports the join
+ * result to the user.
+ *
+ * Returns 0 once the work is queued, -EINVAL for a zero address or an
+ * unusable MTU, -ENODEV when the bound network device cannot be found,
+ * or -ENOMEM.  On failure both allocations made here are freed.
+ */
+static int cma_iboe_join_multicast(struct rdma_id_private *id_priv,
+				   struct cma_multicast *mc)
+{
+	struct iboe_mcast_work *work;
+	struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+	int err;
+	struct sockaddr *addr = (struct sockaddr *)&mc->addr;
+	struct net_device *ndev = NULL;
+
+	if (cma_zero_addr((struct sockaddr *)&mc->addr))
+		return -EINVAL;
+
+	work = kzalloc(sizeof *work, GFP_KERNEL);
+	if (!work)
+		return -ENOMEM;
+
+	mc->multicast.ib = kzalloc(sizeof(struct ib_sa_multicast), GFP_KERNEL);
+	if (!mc->multicast.ib) {
+		err = -ENOMEM;
+		goto out1;
+	}
+
+	cma_iboe_set_mgid(addr, &mc->multicast.ib->rec.mgid);
+
+	mc->multicast.ib->rec.pkey = cpu_to_be16(0xffff);
+	if (id_priv->id.ps == RDMA_PS_UDP)
+		mc->multicast.ib->rec.qkey = cpu_to_be32(RDMA_UDP_QKEY);
+
+	/* Rate and MTU come from the network device the id is bound to. */
+	if (dev_addr->bound_dev_if)
+		ndev = dev_get_by_index(&init_net, dev_addr->bound_dev_if);
+	if (!ndev) {
+		err = -ENODEV;
+		goto out2;
+	}
+
+	mc->multicast.ib->rec.rate = iboe_get_rate(ndev);
+	mc->multicast.ib->rec.hop_limit = 1;
+#ifdef __linux__
+	mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->mtu);
+#else
+	mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->if_mtu);
+#endif
+	/* ndev is not used past this point; drop the lookup reference. */
+	dev_put(ndev);
+	if (!mc->multicast.ib->rec.mtu) {
+		err = -EINVAL;
+		goto out2;
+	}
+	iboe_addr_get_sgid(dev_addr, &mc->multicast.ib->rec.port_gid);
+	work->id = id_priv;
+	work->mc = mc;
+	INIT_WORK(&work->work, iboe_mcast_work_handler);
+	/* Reference for the queued work; the work handler drops it. */
+	kref_get(&mc->mcref);
+	queue_work(cma_wq, &work->work);
+
+	return 0;
+
+out2:
+	kfree(mc->multicast.ib);
+out1:
+	kfree(work);
+	return err;
+}
+
+/*
+ * rdma_join_multicast - join the multicast group identified by addr.
+ *
+ * The id must be in CMA_ADDR_BOUND or CMA_ADDR_RESOLVED, otherwise
+ * -EINVAL is returned.  A cma_multicast tracking object is allocated,
+ * linked on the id's mc_list, and the join is started according to the
+ * port's link layer (InfiniBand: SA join, Ethernet: IBoE local join).
+ * context is handed back to the user in the resulting multicast event.
+ *
+ * Returns 0 when the join has been started, -ENOMEM on allocation
+ * failure, -ENOSYS for non-IB transports, -EINVAL for an unknown link
+ * layer, or the error from the link-layer specific join.  On failure the
+ * tracking object is unlinked and freed again.
+ */
+int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
+			void *context)
+{
+	struct rdma_id_private *id_priv;
+	struct cma_multicast *mc;
+	int ret;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	if (!cma_comp(id_priv, CMA_ADDR_BOUND) &&
+	    !cma_comp(id_priv, CMA_ADDR_RESOLVED))
+		return -EINVAL;
+
+	mc = kmalloc(sizeof *mc, GFP_KERNEL);
+	if (!mc)
+		return -ENOMEM;
+
+	memcpy(&mc->addr, addr, ip_addr_size(addr));
+	mc->context = context;
+	mc->id_priv = id_priv;
+
+	/*
+	 * Use the _irq variant here as well: the removal below and
+	 * rdma_leave_multicast() take this lock with interrupts disabled,
+	 * and all users of mc_list must agree on the locking discipline.
+	 */
+	spin_lock_irq(&id_priv->lock);
+	list_add(&mc->list, &id_priv->mc_list);
+	spin_unlock_irq(&id_priv->lock);
+
+	switch (rdma_node_get_transport(id->device->node_type)) {
+	case RDMA_TRANSPORT_IB:
+		switch (rdma_port_get_link_layer(id->device, id->port_num)) {
+		case IB_LINK_LAYER_INFINIBAND:
+			ret = cma_join_ib_multicast(id_priv, mc);
+			break;
+		case IB_LINK_LAYER_ETHERNET:
+			/* Only the IBoE path reference-counts mc. */
+			kref_init(&mc->mcref);
+			ret = cma_iboe_join_multicast(id_priv, mc);
+			break;
+		default:
+			ret = -EINVAL;
+		}
+		break;
+	default:
+		ret = -ENOSYS;
+		break;
+	}
+
+	if (ret) {
+		spin_lock_irq(&id_priv->lock);
+		list_del(&mc->list);
+		spin_unlock_irq(&id_priv->lock);
+		kfree(mc);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(rdma_join_multicast);
+
+/*
+ * rdma_leave_multicast - leave the multicast group identified by addr.
+ *
+ * Looks for the first entry on the id's mc_list whose stored address
+ * matches addr byte for byte, unlinks it, detaches the id's QP (if any)
+ * from the group and releases the entry according to the port's link
+ * layer.  Does nothing when no entry matches.
+ */
+void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr)
+{
+	struct rdma_id_private *id_priv;
+	struct cma_multicast *mc;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	spin_lock_irq(&id_priv->lock);
+	list_for_each_entry(mc, &id_priv->mc_list, list) {
+		if (!memcmp(&mc->addr, addr, ip_addr_size(addr))) {
+			/*
+			 * Unlink under the lock, then drop it before the
+			 * remaining teardown.  Iteration stops here (we
+			 * return below), so removing the entry mid-walk
+			 * is fine.
+			 */
+			list_del(&mc->list);
+			spin_unlock_irq(&id_priv->lock);
+
+			if (id->qp)
+				ib_detach_mcast(id->qp,
+						&mc->multicast.ib->rec.mgid,
+						mc->multicast.ib->rec.mlid);
+			if (rdma_node_get_transport(id_priv->cma_dev->device->node_type) == RDMA_TRANSPORT_IB) {
+				switch (rdma_port_get_link_layer(id->device, id->port_num)) {
+				case IB_LINK_LAYER_INFINIBAND:
+					ib_sa_free_multicast(mc->multicast.ib);
+					kfree(mc);
+					break;
+				case IB_LINK_LAYER_ETHERNET:
+					/* IBoE entries are reference counted. */
+					kref_put(&mc->mcref, release_mc);
+					break;
+				default:
+					break;
+				}
+			}
+			return;
+		}
+	}
+	spin_unlock_irq(&id_priv->lock);
+}
+EXPORT_SYMBOL(rdma_leave_multicast);
+
+/*
+ * Queue an RDMA_CM_EVENT_ADDR_CHANGE for id_priv if it is bound to ndev
+ * and the hardware address stored in the id no longer matches the
+ * device's current link-level address.
+ *
+ * Returns 0 when nothing needed to be done or the work was queued, and
+ * -ENOMEM when the work item could not be allocated.
+ */
+static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id_priv)
+{
+	struct rdma_dev_addr *dev_addr;
+	struct cma_ndev_work *work;
+
+	dev_addr = &id_priv->id.route.addr.dev_addr;
+
+	/*
+	 * The two preprocessor arms below open the same if-block using the
+	 * platform's field names; the block is closed after #endif.
+	 */
+#ifdef __linux__
+	if ((dev_addr->bound_dev_if == ndev->ifindex) &&
+	    memcmp(dev_addr->src_dev_addr, ndev->dev_addr, ndev->addr_len)) {
+		printk(KERN_INFO "RDMA CM addr change for ndev %s used by id %p\n",
+		       ndev->name, &id_priv->id);
+#else
+	if ((dev_addr->bound_dev_if == ndev->if_index) &&
+	    memcmp(dev_addr->src_dev_addr, IF_LLADDR(ndev), ndev->if_addrlen)) {
+		printk(KERN_INFO "RDMA CM addr change for ndev %s used by id %p\n",
+		       ndev->if_xname, &id_priv->id);
+#endif
+		work = kzalloc(sizeof *work, GFP_KERNEL);
+		if (!work)
+			return -ENOMEM;
+
+		INIT_WORK(&work->work, cma_ndev_work_handler);
+		work->id = id_priv;
+		work->event.event = RDMA_CM_EVENT_ADDR_CHANGE;
+		/*
+		 * Reference held on behalf of the queued work.
+		 * NOTE(review): presumably dropped by
+		 * cma_ndev_work_handler() - confirm.
+		 */
+		atomic_inc(&id_priv->refcount);
+		queue_work(cma_wq, &work->work);
+	}
+
+	return 0;
+}
+
+/*
+ * Network device notifier callback (installed through cma_nb).
+ *
+ * ctx is the net_device the event refers to.  On Linux only bonding
+ * failover events on a bonding master in init_net are handled; in the
+ * non-Linux build only NETDEV_DOWN and NETDEV_UNREGISTER are handled.
+ * For a handled event every id on every known cma device is checked with
+ * cma_netdev_change(); the walk stops at the first non-zero result.
+ *
+ * Returns NOTIFY_DONE for ignored events and when the list walk never
+ * runs cma_netdev_change(); otherwise the last value returned by
+ * cma_netdev_change() (0 on success, a negative errno on failure).
+ */
+static int cma_netdev_callback(struct notifier_block *self, unsigned long event,
+			       void *ctx)
+{
+	struct net_device *ndev = (struct net_device *)ctx;
+	struct cma_device *cma_dev;
+	struct rdma_id_private *id_priv;
+	int ret = NOTIFY_DONE;
+
+#ifdef __linux__
+	if (dev_net(ndev) != &init_net)
+		return NOTIFY_DONE;
+
+	if (event != NETDEV_BONDING_FAILOVER)
+		return NOTIFY_DONE;
+
+	if (!(ndev->flags & IFF_MASTER) || !(ndev->priv_flags & IFF_BONDING))
+		return NOTIFY_DONE;
+#else
+	if (event != NETDEV_DOWN && event != NETDEV_UNREGISTER)
+		return NOTIFY_DONE;
+#endif
+
+	/* The global lock protects dev_list and each device's id_list. */
+	mutex_lock(&lock);
+	list_for_each_entry(cma_dev, &dev_list, list)
+		list_for_each_entry(id_priv, &cma_dev->id_list, list) {
+			ret = cma_netdev_change(ndev, id_priv);
+			if (ret)
+				goto out;
+		}
+
+out:
+	mutex_unlock(&lock);
+	return ret;
+}
+
+/*
+ * Notifier block routing network device events to cma_netdev_callback();
+ * registered in cma_init() and unregistered in cma_cleanup().
+ */
+static struct notifier_block cma_nb = {
+	.notifier_call = cma_netdev_callback
+};
+
+/*
+ * IB client "add" callback: start tracking a newly registered device.
+ *
+ * Allocates the per-device cma_device, stores it as this client's data
+ * on the device, appends it to dev_list and lets every id on
+ * listen_any_list start listening on the new device.  An allocation
+ * failure is silent: the device is simply not tracked.
+ */
+static void cma_add_one(struct ib_device *device)
+{
+	struct rdma_id_private *listener;
+	struct cma_device *cma_dev;
+
+	cma_dev = kmalloc(sizeof *cma_dev, GFP_KERNEL);
+	if (cma_dev == NULL)
+		return;
+
+	INIT_LIST_HEAD(&cma_dev->id_list);
+	init_completion(&cma_dev->comp);
+	atomic_set(&cma_dev->refcount, 1);
+	cma_dev->device = device;
+	ib_set_client_data(device, &cma_client, cma_dev);
+
+	mutex_lock(&lock);
+	list_add_tail(&cma_dev->list, &dev_list);
+	list_for_each_entry(listener, &listen_any_list, list) {
+		cma_listen_on_dev(listener, cma_dev);
+	}
+	mutex_unlock(&lock);
+}
+
+/*
+ * Tell the user of id_priv that its device is going away.
+ *
+ * The id is switched to CMA_DEVICE_REMOVAL.  If it was already being
+ * destroyed nothing else happens.  Otherwise the operation pending in
+ * the previous state is cancelled and, unless the state changed again in
+ * the meantime, an RDMA_CM_EVENT_DEVICE_REMOVAL event is delivered with
+ * handler_mutex held.
+ *
+ * Returns the user handler's return value, or 0 when no event was
+ * delivered.
+ */
+static int cma_remove_id_dev(struct rdma_id_private *id_priv)
+{
+	struct rdma_cm_event event;
+	enum cma_state prev;
+	int ret = 0;
+
+	/* Record that we want to remove the device */
+	prev = cma_exch(id_priv, CMA_DEVICE_REMOVAL);
+	if (prev == CMA_DESTROYING)
+		return 0;
+
+	cma_cancel_operation(id_priv, prev);
+	mutex_lock(&id_priv->handler_mutex);
+
+	/* Skip the event if another callback started destruction. */
+	if (cma_comp(id_priv, CMA_DEVICE_REMOVAL)) {
+		memset(&event, 0, sizeof event);
+		event.event = RDMA_CM_EVENT_DEVICE_REMOVAL;
+		ret = id_priv->id.event_handler(&id_priv->id, &event);
+	}
+
+	mutex_unlock(&id_priv->handler_mutex);
+	return ret;
+}
+
+/*
+ * Detach every id from cma_dev, then wait until the device is unused.
+ *
+ * Each id is unlinked from the device's id_list and notified through
+ * cma_remove_id_dev(); ids flagged internal_id skip the notification.
+ * An id is destroyed here when it is internal or when the user's handler
+ * returned non-zero.
+ */
+static void cma_process_remove(struct cma_device *cma_dev)
+{
+	struct rdma_id_private *id_priv;
+	int ret;
+
+	mutex_lock(&lock);
+	while (!list_empty(&cma_dev->id_list)) {
+		id_priv = list_entry(cma_dev->id_list.next,
+				     struct rdma_id_private, list);
+
+		list_del(&id_priv->listen_list);
+		list_del_init(&id_priv->list);
+		/*
+		 * Hold a reference across the unlocked section below so
+		 * the id cannot go away while its handler is running.
+		 */
+		atomic_inc(&id_priv->refcount);
+		mutex_unlock(&lock);
+
+		/* Internal ids have no user to notify: always destroy. */
+		ret = id_priv->internal_id ? 1 : cma_remove_id_dev(id_priv);
+		cma_deref_id(id_priv);
+		if (ret)
+			rdma_destroy_id(&id_priv->id);
+
+		mutex_lock(&lock);
+	}
+	mutex_unlock(&lock);
+
+	/*
+	 * Drop the reference set up in cma_add_one().
+	 * NOTE(review): presumably cma_dev->comp is completed once the
+	 * device refcount reaches zero - confirm in cma_deref_dev().
+	 */
+	cma_deref_dev(cma_dev);
+	wait_for_completion(&cma_dev->comp);
+}
+
+/*
+ * IB client "remove" callback: stop tracking a device.
+ *
+ * Nothing is done when no cma_device is attached to the device.
+ * Otherwise the cma_device is unlinked from dev_list, all of its ids are
+ * processed by cma_process_remove(), and it is freed.
+ */
+static void cma_remove_one(struct ib_device *device)
+{
+	struct cma_device *cma_dev = ib_get_client_data(device, &cma_client);
+
+	if (cma_dev == NULL)
+		return;
+
+	mutex_lock(&lock);
+	list_del(&cma_dev->list);
+	mutex_unlock(&lock);
+
+	cma_process_remove(cma_dev);
+
+	kfree(cma_dev);
+}
+
+/*
+ * Module initialisation.
+ *
+ * Picks a random starting value for next_port inside the local port
+ * range, creates the "rdma_cm" workqueue, registers the SA client, the
+ * address client and the netdevice notifier, and finally registers the
+ * IB client.  If the IB client registration fails everything set up
+ * before it is undone in reverse order.
+ *
+ * Returns 0 on success, -ENOMEM when the workqueue cannot be created, or
+ * the error from ib_register_client().
+ */
+static int cma_init(void)
+{
+	int low, high, span, err;
+
+	get_random_bytes(&next_port, sizeof next_port);
+	inet_get_local_port_range(&low, &high);
+	span = (high - low) + 1;
+	next_port = ((unsigned int) next_port % span) + low;
+
+	cma_wq = create_singlethread_workqueue("rdma_cm");
+	if (cma_wq == NULL)
+		return -ENOMEM;
+
+	ib_sa_register_client(&sa_client);
+	rdma_addr_register_client(&addr_client);
+	register_netdevice_notifier(&cma_nb);
+
+	err = ib_register_client(&cma_client);
+	if (err == 0)
+		return 0;
+
+	/* Roll back in the reverse order of registration. */
+	unregister_netdevice_notifier(&cma_nb);
+	rdma_addr_unregister_client(&addr_client);
+	ib_sa_unregister_client(&sa_client);
+	destroy_workqueue(cma_wq);
+
+	return err;
+}
+
+/*
+ * Module teardown: unregister everything cma_init() registered, in the
+ * reverse order, destroy the workqueue and release the four port-space
+ * idr tables.
+ */
+static void cma_cleanup(void)
+{
+	ib_unregister_client(&cma_client);
+	unregister_netdevice_notifier(&cma_nb);
+	rdma_addr_unregister_client(&addr_client);
+	ib_sa_unregister_client(&sa_client);
+	destroy_workqueue(cma_wq);
+	idr_destroy(&sdp_ps);
+	idr_destroy(&tcp_ps);
+	idr_destroy(&udp_ps);
+	idr_destroy(&ipoib_ps);
+}
+
+/* Module entry and exit points. */
+module_init(cma_init);
+module_exit(cma_cleanup);
diff --git a/sys/ofed/drivers/infiniband/core/core_priv.h b/sys/ofed/drivers/infiniband/core/core_priv.h
new file mode 100644
index 0000000..05ac36e
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/core_priv.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _CORE_PRIV_H
+#define _CORE_PRIV_H
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+#include <rdma/ib_verbs.h>
+
+/*
+ * Per-device sysfs registration; called from ib_register_device() and
+ * ib_unregister_device() in device.c.
+ */
+int  ib_device_register_sysfs(struct ib_device *device);
+void ib_device_unregister_sysfs(struct ib_device *device);
+
+/*
+ * NOTE(review): presumably subsystem-wide sysfs setup/teardown run at
+ * core module load/unload - callers are not visible here, confirm.
+ */
+int  ib_sysfs_setup(void);
+void ib_sysfs_cleanup(void);
+
+/*
+ * NOTE(review): presumably setup/teardown of the core cache at module
+ * load/unload - callers are not visible here, confirm.
+ */
+int  ib_cache_setup(void);
+void ib_cache_cleanup(void);
+
+#endif /* _CORE_PRIV_H */
diff --git a/sys/ofed/drivers/infiniband/core/device.c b/sys/ofed/drivers/infiniband/core/device.c
new file mode 100644
index 0000000..9d34bb6
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/device.c
@@ -0,0 +1,754 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/workqueue.h>
+
+#include "core_priv.h"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("core kernel InfiniBand API");
+MODULE_LICENSE("Dual BSD/GPL");
+
+#ifdef __ia64__
+/* Workaround for a bug in the HP chipset that would cause a kernel
+   panic when DMA resources are exhausted. */
+int dma_map_sg_hp_wa = 0;
+#endif
+
+/*
+ * Per-(device, client) context record.  One is allocated by
+ * add_client_context() for each client on each device.
+ */
+struct ib_client_data {
+	struct list_head  list;		/* entry in device->client_data_list */
+	struct ib_client *client;	/* client this record belongs to */
+	void *            data;		/* value set via ib_set_client_data() */
+};
+
+/* Registered devices, linked through ib_device.core_list. */
+static LIST_HEAD(device_list);
+/* Registered clients, linked through ib_client.list. */
+static LIST_HEAD(client_list);
+
+/*
+ * device_mutex protects access to both device_list and client_list.
+ * There's no real point to using multiple locks or something fancier
+ * like an rwsem: we always access both lists, and we're always
+ * modifying one list or the other list.  In any case this is not a
+ * hot path so there's no point in trying to optimize.
+ */
+static DEFINE_MUTEX(device_mutex);
+
+/*
+ * Verify that a driver filled in every mandatory method of its
+ * ib_device.
+ *
+ * The table below records, for each mandatory method, its byte offset
+ * inside struct ib_device and its name.  The pointer stored at each
+ * offset is checked in table order; the first NULL one is logged and
+ * -EINVAL is returned.  Returns 0 when all methods are present.
+ */
+static int ib_device_check_mandatory(struct ib_device *device)
+{
+#define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device, x), #x }
+	static const struct {
+		size_t offset;
+		char  *name;
+	} mandatory_table[] = {
+		IB_MANDATORY_FUNC(query_device),
+		IB_MANDATORY_FUNC(query_port),
+		IB_MANDATORY_FUNC(query_pkey),
+		IB_MANDATORY_FUNC(query_gid),
+		IB_MANDATORY_FUNC(alloc_pd),
+		IB_MANDATORY_FUNC(dealloc_pd),
+		IB_MANDATORY_FUNC(create_ah),
+		IB_MANDATORY_FUNC(destroy_ah),
+		IB_MANDATORY_FUNC(create_qp),
+		IB_MANDATORY_FUNC(modify_qp),
+		IB_MANDATORY_FUNC(destroy_qp),
+		IB_MANDATORY_FUNC(post_send),
+		IB_MANDATORY_FUNC(post_recv),
+		IB_MANDATORY_FUNC(create_cq),
+		IB_MANDATORY_FUNC(destroy_cq),
+		IB_MANDATORY_FUNC(poll_cq),
+		IB_MANDATORY_FUNC(req_notify_cq),
+		IB_MANDATORY_FUNC(get_dma_mr),
+		IB_MANDATORY_FUNC(dereg_mr)
+	};
+	int idx;
+
+	for (idx = 0; idx < ARRAY_SIZE(mandatory_table); ++idx) {
+		void **slot = (void **) ((u_char *) device +
+					 mandatory_table[idx].offset);
+
+		if (*slot)
+			continue;
+
+		printk(KERN_WARNING "Device %s is missing mandatory function %s\n",
+		       device->name, mandatory_table[idx].name);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Find a registered device by name.
+ *
+ * Walks device_list and compares at most IB_DEVICE_NAME_MAX characters.
+ * Returns the first matching device, or NULL when there is none.
+ */
+static struct ib_device *__ib_device_get_by_name(const char *name)
+{
+	struct ib_device *cur;
+
+	list_for_each_entry(cur, &device_list, core_list) {
+		if (strncmp(name, cur->name, IB_DEVICE_NAME_MAX) == 0)
+			return cur;
+	}
+
+	return NULL;
+}
+
+
+/*
+ * Turn a device name template into a concrete, unused device name.
+ *
+ * name is used both as a sscanf() and as a snprintf() format taking a
+ * single int, and is overwritten in place with the chosen name.  The
+ * lowest index not already used by a registered device is picked;
+ * indices are tracked in a one-page bitmap, so at most PAGE_SIZE * 8 of
+ * them are considered.  ib_register_device() calls this with
+ * device_mutex held.
+ *
+ * Returns 0 on success, -ENOMEM when the bitmap page cannot be
+ * allocated, and -ENFILE when the generated name is already taken.
+ */
+static int alloc_name(char *name)
+{
+	unsigned long *inuse;
+	char buf[IB_DEVICE_NAME_MAX];
+	struct ib_device *device;
+	int i;
+
+	inuse = (unsigned long *) get_zeroed_page(GFP_KERNEL);
+	if (!inuse)
+		return -ENOMEM;
+
+	list_for_each_entry(device, &device_list, core_list) {
+		/* Extract the index if this device's name fits the template. */
+		if (!sscanf(device->name, name, &i))
+			continue;
+		if (i < 0 || i >= PAGE_SIZE * 8)
+			continue;
+		/*
+		 * Re-format and compare, so that a name which merely starts
+		 * like the template is not counted as using index i.
+		 */
+		snprintf(buf, sizeof buf, name, i);
+		if (!strncmp(buf, device->name, IB_DEVICE_NAME_MAX))
+			set_bit(i, inuse);
+	}
+
+	i = find_first_zero_bit(inuse, PAGE_SIZE * 8);
+	free_page((unsigned long) inuse);
+	snprintf(buf, sizeof buf, name, i);
+
+	/* Final check against an exact name collision. */
+	if (__ib_device_get_by_name(buf))
+		return -ENFILE;
+
+	strlcpy(name, buf, IB_DEVICE_NAME_MAX);
+	return 0;
+}
+
+/* First port number of a device: 0 for an IB switch, 1 otherwise. */
+static int start_port(struct ib_device *device)
+{
+	if (device->node_type == RDMA_NODE_IB_SWITCH)
+		return 0;
+
+	return 1;
+}
+
+
+/*
+ * Last port number of a device: 0 for an IB switch, otherwise the
+ * device's physical port count.
+ */
+static int end_port(struct ib_device *device)
+{
+	if (device->node_type == RDMA_NODE_IB_SWITCH)
+		return 0;
+
+	return device->phys_port_cnt;
+}
+
+/**
+ * ib_alloc_device - allocate an IB device struct
+ * @size:size of structure to allocate
+ *
+ * Low-level drivers call this to obtain a zero-filled &struct ib_device.
+ * @size covers the ib_device itself plus whatever private data the
+ * driver keeps behind it, so it must be at least
+ * sizeof(struct ib_device); a smaller value is a driver bug and trips
+ * BUG_ON().  Memory obtained here must be released with
+ * ib_dealloc_device().
+ */
+struct ib_device *ib_alloc_device(size_t size)
+{
+	struct ib_device *device;
+
+	BUG_ON(size < sizeof (struct ib_device));
+
+	device = kzalloc(size, GFP_KERNEL);
+	return device;
+}
+EXPORT_SYMBOL(ib_alloc_device);
+
+/**
+ * ib_dealloc_device - free an IB device struct
+ * @device:structure to free
+ *
+ * Releases a structure obtained from ib_alloc_device().  A device that
+ * was never registered is freed directly.  A device that has been
+ * registered must already be unregistered again (anything else is a
+ * bug); in that case the reference on its embedded kobject is dropped
+ * instead of freeing the memory here.
+ */
+void ib_dealloc_device(struct ib_device *device)
+{
+	if (device->reg_state != IB_DEV_UNINITIALIZED) {
+		BUG_ON(device->reg_state != IB_DEV_UNREGISTERED);
+		kobject_put(&device->dev.kobj);
+		return;
+	}
+
+	kfree(device);
+}
+EXPORT_SYMBOL(ib_dealloc_device);
+
+/*
+ * Create the per-client context record for client on device.
+ *
+ * The record starts with a NULL data pointer and is added to the
+ * device's client_data_list under client_data_lock.  Returns 0 on
+ * success; on allocation failure a warning is logged and -ENOMEM is
+ * returned.
+ */
+static int add_client_context(struct ib_device *device, struct ib_client *client)
+{
+	struct ib_client_data *ctx;
+	unsigned long irq_flags;
+
+	ctx = kmalloc(sizeof *ctx, GFP_KERNEL);
+	if (ctx == NULL) {
+		printk(KERN_WARNING "Couldn't allocate client context for %s/%s\n",
+		       device->name, client->name);
+		return -ENOMEM;
+	}
+
+	ctx->data   = NULL;
+	ctx->client = client;
+
+	spin_lock_irqsave(&device->client_data_lock, irq_flags);
+	list_add(&ctx->list, &device->client_data_list);
+	spin_unlock_irqrestore(&device->client_data_lock, irq_flags);
+
+	return 0;
+}
+
+/*
+ * Cache the P_Key and GID table lengths of every port of device.
+ *
+ * Allocates device->pkey_tbl_len and device->gid_tbl_len with one entry
+ * per port (start_port() .. end_port()) and fills them from
+ * ib_query_port().  On failure both arrays are freed again.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or the error
+ * reported by ib_query_port().
+ */
+static int read_port_table_lengths(struct ib_device *device)
+{
+	struct ib_port_attr *attr;
+	int nports, err = -ENOMEM;
+	u8 idx;
+
+	/* Scratch buffer reused for every port query. */
+	attr = kmalloc(sizeof *attr, GFP_KERNEL);
+	if (attr == NULL)
+		return err;
+
+	nports = end_port(device) - start_port(device) + 1;
+
+	device->pkey_tbl_len = kmalloc(sizeof *device->pkey_tbl_len * nports,
+				       GFP_KERNEL);
+	device->gid_tbl_len = kmalloc(sizeof *device->gid_tbl_len * nports,
+				      GFP_KERNEL);
+	if (!device->pkey_tbl_len || !device->gid_tbl_len)
+		goto fail;
+
+	for (idx = 0; idx < nports; ++idx) {
+		err = ib_query_port(device, idx + start_port(device), attr);
+		if (err)
+			goto fail;
+
+		device->pkey_tbl_len[idx] = attr->pkey_tbl_len;
+		device->gid_tbl_len[idx]  = attr->gid_tbl_len;
+	}
+
+	kfree(attr);
+	return 0;
+
+fail:
+	kfree(device->gid_tbl_len);
+	kfree(device->pkey_tbl_len);
+	kfree(attr);
+	return err;
+}
+
+/**
+ * ib_register_device - Register an IB device with IB core
+ * @device:Device to register
+ *
+ * Low-level drivers use ib_register_device() to register their
+ * devices with the IB core.  All registered clients will receive a
+ * callback for each device that is added. @device must be allocated
+ * with ib_alloc_device().
+ *
+ * A '%' in the device name marks it as a template that is expanded by
+ * alloc_name().  The whole registration runs under device_mutex.
+ *
+ * Returns 0 on success, -EINVAL when a mandatory method is missing, or
+ * the error from alloc_name(), read_port_table_lengths() or
+ * ib_device_register_sysfs().
+ */
+int ib_register_device(struct ib_device *device)
+{
+	int ret;
+
+	mutex_lock(&device_mutex);
+
+	if (strchr(device->name, '%')) {
+		ret = alloc_name(device->name);
+		if (ret)
+			goto out;
+	}
+
+	if (ib_device_check_mandatory(device)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	INIT_LIST_HEAD(&device->event_handler_list);
+	INIT_LIST_HEAD(&device->client_data_list);
+	spin_lock_init(&device->event_handler_lock);
+	spin_lock_init(&device->client_data_lock);
+	device->ib_uverbs_xrcd_table = RB_ROOT;
+	mutex_init(&device->xrcd_table_mutex);
+
+	ret = read_port_table_lengths(device);
+	if (ret) {
+		printk(KERN_WARNING "Couldn't create table lengths cache for device %s\n",
+		       device->name);
+		goto out;
+	}
+
+	ret = ib_device_register_sysfs(device);
+	if (ret) {
+		printk(KERN_WARNING "Couldn't register device %s with driver model\n",
+		       device->name);
+		/* Undo read_port_table_lengths(). */
+		kfree(device->gid_tbl_len);
+		kfree(device->pkey_tbl_len);
+		goto out;
+	}
+
+	list_add_tail(&device->core_list, &device_list);
+
+	device->reg_state = IB_DEV_REGISTERED;
+
+	{
+		struct ib_client *client;
+
+		/*
+		 * A client whose context record cannot be allocated is
+		 * skipped; registration of the device still succeeds.
+		 */
+		list_for_each_entry(client, &client_list, list)
+			if (client->add && !add_client_context(device, client))
+				client->add(device);
+	}
+
+ out:
+	mutex_unlock(&device_mutex);
+	return ret;
+}
+EXPORT_SYMBOL(ib_register_device);
+
+/**
+ * ib_unregister_device - Unregister an IB device
+ * @device:Device to unregister
+ *
+ * Unregister an IB device.  All clients will receive a remove callback.
+ *
+ * Clients are called in the reverse of their registration order.
+ * Afterwards the device is taken off the device list, its cached table
+ * lengths and client context records are freed, its sysfs entries are
+ * removed and its state becomes IB_DEV_UNREGISTERED.
+ */
+void ib_unregister_device(struct ib_device *device)
+{
+	struct ib_client *client;
+	struct ib_client_data *context, *tmp;
+	unsigned long flags;
+
+	mutex_lock(&device_mutex);
+
+	list_for_each_entry_reverse(client, &client_list, list)
+		if (client->remove)
+			client->remove(device);
+
+	list_del(&device->core_list);
+
+	kfree(device->gid_tbl_len);
+	kfree(device->pkey_tbl_len);
+
+	mutex_unlock(&device_mutex);
+
+	ib_device_unregister_sysfs(device);
+
+	/*
+	 * The records are freed without list_del(); the _safe iterator
+	 * fetches the next entry before the current one is freed.
+	 */
+	spin_lock_irqsave(&device->client_data_lock, flags);
+	list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
+		kfree(context);
+	spin_unlock_irqrestore(&device->client_data_lock, flags);
+
+	device->reg_state = IB_DEV_UNREGISTERED;
+}
+EXPORT_SYMBOL(ib_unregister_device);
+
+/**
+ * ib_register_client - Register an IB client
+ * @client:Client to register
+ *
+ * Upper level users of the IB drivers register a client to be told
+ * about IB devices coming and going.  A client's add method runs for
+ * every device that is registered later (clients are called in their
+ * registration order) and its remove method runs when a device goes
+ * away (clients are called in reverse registration order).  During this
+ * call the add method is also invoked once for each device that is
+ * already registered.  A device for which the client context cannot be
+ * allocated is skipped.
+ *
+ * Always returns 0.
+ */
+int ib_register_client(struct ib_client *client)
+{
+	struct ib_device *dev;
+
+	mutex_lock(&device_mutex);
+
+	list_add_tail(&client->list, &client_list);
+	list_for_each_entry(dev, &device_list, core_list) {
+		if (!client->add)
+			continue;
+		if (add_client_context(dev, client))
+			continue;
+		client->add(dev);
+	}
+
+	mutex_unlock(&device_mutex);
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_register_client);
+
+/**
+ * ib_unregister_client - Unregister an IB client
+ * @client:Client to unregister
+ *
+ * Removes a client registration made with ib_register_client().  For
+ * every IB device that is still registered the client's remove method
+ * (if any) is called and the client's context record on that device is
+ * unlinked and freed.  Finally the client is taken off the client list.
+ */
+void ib_unregister_client(struct ib_client *client)
+{
+	struct ib_client_data *ctx, *next;
+	struct ib_device *dev;
+	unsigned long irq_flags;
+
+	mutex_lock(&device_mutex);
+
+	list_for_each_entry(dev, &device_list, core_list) {
+		if (client->remove)
+			client->remove(dev);
+
+		spin_lock_irqsave(&dev->client_data_lock, irq_flags);
+		list_for_each_entry_safe(ctx, next, &dev->client_data_list, list) {
+			if (ctx->client != client)
+				continue;
+
+			list_del(&ctx->list);
+			kfree(ctx);
+		}
+		spin_unlock_irqrestore(&dev->client_data_lock, irq_flags);
+	}
+	list_del(&client->list);
+
+	mutex_unlock(&device_mutex);
+}
+EXPORT_SYMBOL(ib_unregister_client);
+
+/**
+ * ib_get_client_data - Get IB client context
+ * @device:Device to get context for
+ * @client:Client to get context for
+ *
+ * Returns the pointer most recently stored for @client on @device with
+ * ib_set_client_data(), or NULL when @client has no context record on
+ * @device.
+ */
+void *ib_get_client_data(struct ib_device *device, struct ib_client *client)
+{
+	struct ib_client_data *ctx;
+	unsigned long irq_flags;
+	void *data = NULL;
+
+	spin_lock_irqsave(&device->client_data_lock, irq_flags);
+	list_for_each_entry(ctx, &device->client_data_list, list) {
+		if (ctx->client != client)
+			continue;
+
+		data = ctx->data;
+		break;
+	}
+	spin_unlock_irqrestore(&device->client_data_lock, irq_flags);
+
+	return data;
+}
+EXPORT_SYMBOL(ib_get_client_data);
+
+/**
+ * ib_set_client_data - Set IB client context
+ * @device:Device to set context for
+ * @client:Client to set context for
+ * @data:Context to set
+ *
+ * Stores @data in the context record of @client on @device so that it
+ * can later be fetched with ib_get_client_data().  When @client has no
+ * context record on @device a warning is logged and nothing is stored.
+ */
+void ib_set_client_data(struct ib_device *device, struct ib_client *client,
+			void *data)
+{
+	struct ib_client_data *ctx;
+	unsigned long irq_flags;
+	int found = 0;
+
+	spin_lock_irqsave(&device->client_data_lock, irq_flags);
+	list_for_each_entry(ctx, &device->client_data_list, list) {
+		if (ctx->client == client) {
+			ctx->data = data;
+			found = 1;
+			break;
+		}
+	}
+
+	if (!found)
+		printk(KERN_WARNING "No client context found for %s/%s\n",
+		       device->name, client->name);
+
+	spin_unlock_irqrestore(&device->client_data_lock, irq_flags);
+}
+EXPORT_SYMBOL(ib_set_client_data);
+
+/**
+ * ib_register_event_handler - Register an IB event handler
+ * @event_handler:Handler to register
+ *
+ * Appends @event_handler to the event_handler_list of the device it
+ * names (@event_handler->device), under that device's
+ * event_handler_lock.  Registered handlers are called back when
+ * asynchronous IB events occur (as defined in chapter 11 of the
+ * InfiniBand Architecture Specification).  This callback may occur in
+ * interrupt context.
+ *
+ * Always returns 0.
+ */
+int ib_register_event_handler(struct ib_event_handler *event_handler)
+{
+	struct ib_device *dev = event_handler->device;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&dev->event_handler_lock, irq_flags);
+	list_add_tail(&event_handler->list, &dev->event_handler_list);
+	spin_unlock_irqrestore(&dev->event_handler_lock, irq_flags);
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_register_event_handler);
+
+/**
+ * ib_unregister_event_handler - Unregister an event handler
+ * @event_handler:Handler to unregister
+ *
+ * Unlinks a handler previously registered with
+ * ib_register_event_handler() from its device's handler list, under
+ * the device's event_handler_lock.
+ *
+ * Always returns 0.
+ */
+int ib_unregister_event_handler(struct ib_event_handler *event_handler)
+{
+	struct ib_device *dev = event_handler->device;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&dev->event_handler_lock, irq_flags);
+	list_del(&event_handler->list);
+	spin_unlock_irqrestore(&dev->event_handler_lock, irq_flags);
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_unregister_event_handler);
+
+/**
+ * ib_dispatch_event - Dispatch an asynchronous event
+ * @event:Event to dispatch
+ *
+ * Low-level drivers must call ib_dispatch_event() to dispatch the
+ * event to all registered event handlers when an asynchronous event
+ * occurs.  Every handler on @event->device's event_handler_list is
+ * invoked in list order while event_handler_lock is held.
+ */
+void ib_dispatch_event(struct ib_event *event)
+{
+	struct ib_device *dev = event->device;
+	struct ib_event_handler *cur;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&dev->event_handler_lock, irq_flags);
+
+	list_for_each_entry(cur, &dev->event_handler_list, list) {
+		cur->handler(cur, event);
+	}
+
+	spin_unlock_irqrestore(&dev->event_handler_lock, irq_flags);
+}
+EXPORT_SYMBOL(ib_dispatch_event);
+
+/**
+ * ib_query_device - Query IB device attributes
+ * @device:Device to query
+ * @device_attr:Structure the driver fills with the device attributes
+ *
+ * Forwards to the device's query_device method and returns its result
+ * unchanged.
+ */
+int ib_query_device(struct ib_device *device,
+		    struct ib_device_attr *device_attr)
+{
+	int status;
+
+	status = device->query_device(device, device_attr);
+
+	return status;
+}
+EXPORT_SYMBOL(ib_query_device);
+
+/**
+ * ib_query_port - Query IB port attributes
+ * @device:Device to query
+ * @port_num:Port number to query
+ * @port_attr:Structure the driver fills with the port attributes
+ *
+ * Returns -EINVAL if @port_num lies outside
+ * [start_port(@device), end_port(@device)]; otherwise returns the
+ * result of the device's query_port method.
+ */
+int ib_query_port(struct ib_device *device,
+		  u8 port_num,
+		  struct ib_port_attr *port_attr)
+{
+	if (port_num >= start_port(device) && port_num <= end_port(device))
+		return device->query_port(device, port_num, port_attr);
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL(ib_query_port);
+
+/**
+ * ib_query_gid - Get GID table entry
+ * @device:Device to query
+ * @port_num:Port number to query
+ * @index:GID table index to query
+ * @gid:Returned GID
+ *
+ * Forwards to the device's query_gid method and returns its result
+ * unchanged.  No range check is performed here on @port_num or @index.
+ */
+int ib_query_gid(struct ib_device *device,
+		 u8 port_num, int index, union ib_gid *gid)
+{
+	int status;
+
+	status = device->query_gid(device, port_num, index, gid);
+
+	return status;
+}
+EXPORT_SYMBOL(ib_query_gid);
+
+/**
+ * ib_query_pkey - Get P_Key table entry
+ * @device:Device to query
+ * @port_num:Port number to query
+ * @index:P_Key table index to query
+ * @pkey:Returned P_Key
+ *
+ * Forwards to the device's query_pkey method and returns its result
+ * unchanged.  No range check is performed here on @port_num or @index.
+ */
+int ib_query_pkey(struct ib_device *device,
+		  u8 port_num, u16 index, u16 *pkey)
+{
+	int status;
+
+	status = device->query_pkey(device, port_num, index, pkey);
+
+	return status;
+}
+EXPORT_SYMBOL(ib_query_pkey);
+
+/**
+ * ib_modify_device - Change IB device attributes
+ * @device:Device to modify
+ * @device_modify_mask:Mask of attributes to change
+ * @device_modify:New attribute values
+ *
+ * Forwards the request to the device's modify_device method, which
+ * applies the attributes selected by @device_modify_mask, and returns
+ * that method's result unchanged.
+ */
+int ib_modify_device(struct ib_device *device,
+		     int device_modify_mask,
+		     struct ib_device_modify *device_modify)
+{
+	int status;
+
+	status = device->modify_device(device, device_modify_mask,
+				       device_modify);
+
+	return status;
+}
+EXPORT_SYMBOL(ib_modify_device);
+
+/**
+ * ib_modify_port - Modifies the attributes for the specified port.
+ * @device: The device to modify.
+ * @port_num: The number of the port to modify.
+ * @port_modify_mask: Mask used to specify which attributes of the port
+ *   to change.
+ * @port_modify: New attribute values for the port.
+ *
+ * Returns -EINVAL if @port_num lies outside
+ * [start_port(@device), end_port(@device)]; otherwise returns the
+ * result of the device's modify_port method.
+ */
+int ib_modify_port(struct ib_device *device,
+		   u8 port_num, int port_modify_mask,
+		   struct ib_port_modify *port_modify)
+{
+	if (port_num >= start_port(device) && port_num <= end_port(device))
+		return device->modify_port(device, port_num,
+					   port_modify_mask, port_modify);
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL(ib_modify_port);
+
+/**
+ * ib_find_gid - Returns the port number and GID table index where
+ *   a specified GID value occurs.
+ * @device: The device to query.
+ * @gid: The GID value to search for.
+ * @port_num: The port number of the device where the GID value was found.
+ * @index: The index into the GID table where the GID was found.  This
+ *   parameter may be NULL.
+ *
+ * Scans the GID table of every port from start_port() to end_port(),
+ * fetching each entry with ib_query_gid().  Returns 0 on the first
+ * byte-for-byte match, the error from ib_query_gid() if a lookup
+ * fails, or -ENOENT if no entry matches.
+ */
+int ib_find_gid(struct ib_device *device, union ib_gid *gid,
+		u8 *port_num, u16 *index)
+{
+	union ib_gid entry;
+	int port, slot, err;
+
+	for (port = start_port(device); port <= end_port(device); ++port) {
+		for (slot = 0;
+		     slot < device->gid_tbl_len[port - start_port(device)];
+		     ++slot) {
+			err = ib_query_gid(device, port, slot, &entry);
+			if (err)
+				return err;
+			if (memcmp(&entry, gid, sizeof *gid) != 0)
+				continue;
+
+			*port_num = port;
+			if (index != NULL)
+				*index = slot;
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+EXPORT_SYMBOL(ib_find_gid);
+
+/**
+ * ib_find_pkey - Returns the PKey table index where a specified
+ *   PKey value occurs.
+ * @device: The device to query.
+ * @port_num: The port number of the device to search for the PKey.
+ * @pkey: The PKey value to search for.
+ * @index: The index into the PKey table where the PKey was found.
+ *
+ * Only the low 15 bits of @pkey and of each table entry are compared;
+ * bit 15 is ignored.  Returns 0 on the first match, the error from
+ * ib_query_pkey() if a lookup fails, or -ENOENT if no entry matches.
+ * @port_num is not range-checked here.
+ */
+int ib_find_pkey(struct ib_device *device,
+		 u8 port_num, u16 pkey, u16 *index)
+{
+	u16 entry;
+	int slot, err;
+
+	for (slot = 0;
+	     slot < device->pkey_tbl_len[port_num - start_port(device)];
+	     ++slot) {
+		err = ib_query_pkey(device, port_num, slot, &entry);
+		if (err)
+			return err;
+
+		/* Differ in any of the low 15 bits: not a match. */
+		if (((pkey ^ entry) & 0x7fff) != 0)
+			continue;
+
+		*index = slot;
+		return 0;
+	}
+
+	return -ENOENT;
+}
+EXPORT_SYMBOL(ib_find_pkey);
+
+/*
+ * Module initialisation: set up the InfiniBand sysfs support and then
+ * the P_Key/GID cache.
+ *
+ * Returns 0 on success or the error from whichever setup step failed.
+ * A failed ib_sysfs_setup() now aborts initialisation: previously its
+ * error was overwritten by the result of ib_cache_setup(), and
+ * ib_sysfs_cleanup() could later run for a class that was never set up.
+ * When ib_cache_setup() fails, the sysfs setup is undone before
+ * returning.
+ */
+static int __init ib_core_init(void)
+{
+	int ret;
+
+#ifdef __ia64__
+	if (ia64_platform_is("hpzx1"))
+		dma_map_sg_hp_wa = 1;
+#endif
+
+	ret = ib_sysfs_setup();
+	if (ret) {
+		printk(KERN_WARNING "Couldn't create InfiniBand device class\n");
+		return ret;
+	}
+
+	ret = ib_cache_setup();
+	if (ret) {
+		printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n");
+		ib_sysfs_cleanup();
+	}
+
+	return ret;
+}
+
+static void __exit ib_core_cleanup(void)
+{
+	ib_cache_cleanup();	/* torn down in the reverse of ib_core_init()'s setup order */
+	ib_sysfs_cleanup();
+	/* Make sure that any pending umem accounting work is done. */
+	flush_scheduled_work();
+}
+
+module_init(ib_core_init);
+module_exit(ib_core_cleanup);
diff --git a/sys/ofed/drivers/infiniband/core/fmr_pool.c b/sys/ofed/drivers/infiniband/core/fmr_pool.c
new file mode 100644
index 0000000..4507043
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/fmr_pool.c
@@ -0,0 +1,544 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/jhash.h>
+#include <linux/kthread.h>
+
+#include <rdma/ib_fmr_pool.h>
+
+#include "core_priv.h"
+
+#define PFX "fmr_pool: "
+
+enum {
+	IB_FMR_MAX_REMAPS = 32,	/* remap limit used when the device reports max_map_per_fmr == 0 */
+
+	IB_FMR_HASH_BITS  = 8,	/* log2 of the number of cache buckets */
+	IB_FMR_HASH_SIZE  = 1 << IB_FMR_HASH_BITS,	/* number of hlist heads in a pool's cache_bucket array */
+	IB_FMR_HASH_MASK  = IB_FMR_HASH_SIZE - 1	/* equals the mask ib_fmr_hash() applies to pick a bucket */
+};
+
+/*
+ * If an FMR is not in use, then the list member will point to either
+ * its pool's free_list (if the FMR can be mapped again; that is,
+ * remap_count < pool->max_remaps) or its pool's dirty_list (if the
+ * FMR needs to be unmapped before being remapped).  In either of
+ * these cases it is a bug if the ref_count is not 0.  In other words,
+ * if ref_count is > 0, then the list member must not be linked into
+ * either free_list or dirty_list.
+ *
+ * The cache_node member is used to link the FMR into a cache bucket
+ * (if caching is enabled).  This is independent of the reference
+ * count of the FMR.  When a valid FMR is released, its ref_count is
+ * decremented, and if ref_count reaches 0, the FMR is placed in
+ * either free_list or dirty_list as appropriate.  However, it is not
+ * removed from the cache and may be "revived" if a call to
+ * ib_fmr_pool_map_phys() occurs before the FMR is remapped.  In
+ * this case we just increment the ref_count and remove the FMR from
+ * free_list/dirty_list.
+ *
+ * Before we remap an FMR from free_list, we remove it from the cache
+ * (to prevent another user from obtaining a stale FMR).  When an FMR
+ * is released, we add it to the tail of the free list, so that our
+ * cache eviction policy is "least recently used."
+ *
+ * All manipulation of ref_count, list and cache_node is protected by
+ * pool_lock to maintain consistency.
+ */
+
+struct ib_fmr_pool {
+	spinlock_t                pool_lock;	/* guards ref_count, list and cache_node of pooled FMRs */
+
+	int                       pool_size;	/* number of FMRs successfully created for this pool */
+	int                       max_pages;	/* longest page list ib_fmr_pool_map_phys() accepts */
+	int			  max_remaps;	/* remap_count at which a released FMR goes to dirty_list */
+	int                       dirty_watermark;	/* dirty_len value that triggers a cleanup request */
+	int                       dirty_len;	/* FMRs put on dirty_list by unmap since the last batch release */
+	struct list_head          free_list;	/* unreferenced FMRs that may be mapped again */
+	struct list_head          dirty_list;	/* unreferenced FMRs that must be unmapped first */
+	struct hlist_head        *cache_bucket;	/* IB_FMR_HASH_SIZE heads, or NULL when caching is off */
+
+	void                     (*flush_function)(struct ib_fmr_pool *pool,
+						   void *              arg);	/* optional; called by the cleanup thread after a batch release */
+	void                     *flush_arg;	/* passed as arg to flush_function */
+
+	struct task_struct       *thread;	/* cleanup thread running ib_fmr_cleanup_thread() */
+
+	atomic_t                  req_ser;	/* incremented for each flush request */
+	atomic_t                  flush_ser;	/* incremented by the cleanup thread after each flush */
+
+	wait_queue_head_t         force_wait;	/* ib_flush_fmr_pool() waits here for flush_ser to catch up */
+};
+
+/*
+ * Hash the first page address of a page list to a cache bucket index
+ * in the range [0, IB_FMR_HASH_SIZE).
+ */
+static inline u32 ib_fmr_hash(u64 first_page)
+{
+	u32 lo = (u32) first_page;
+	u32 hi = (u32) (first_page >> 32);
+	u32 hash = jhash_2words(lo, hi, 0);
+
+	return hash & (IB_FMR_HASH_SIZE - 1);
+}
+
+/*
+ * Search the pool's FMR cache for an entry whose I/O virtual address,
+ * page count and page list all equal the request.  Returns the cached
+ * FMR, or NULL if there is no match or the pool has no cache.
+ *
+ * Caller must hold pool_lock.
+ */
+static inline struct ib_pool_fmr *ib_fmr_cache_lookup(struct ib_fmr_pool *pool,
+						      u64 *page_list,
+						      int  page_list_len,
+						      u64  io_virtual_address)
+{
+	struct ib_pool_fmr *candidate;
+	struct hlist_node *node;
+	struct hlist_head *head;
+
+	if (pool->cache_bucket == NULL)
+		return NULL;
+
+	/* Buckets are keyed by the first page of the list. */
+	head = &pool->cache_bucket[ib_fmr_hash(page_list[0])];
+
+	hlist_for_each_entry(candidate, node, head, cache_node) {
+		if (candidate->io_virtual_address != io_virtual_address)
+			continue;
+		if (candidate->page_list_len != page_list_len)
+			continue;
+		if (memcmp(page_list, candidate->page_list,
+			   page_list_len * sizeof *page_list) == 0)
+			return candidate;
+	}
+
+	return NULL;
+}
+
+/*
+ * Unmap every FMR currently on the pool's dirty list and move them to
+ * the free list.
+ *
+ * Under pool_lock each dirty FMR is removed from the cache, has its
+ * remap_count reset, and its underlying ib_fmr is queued on a private
+ * list; the dirty list itself is then detached and dirty_len cleared.
+ * ib_unmap_fmr() runs on the private list with the lock dropped, after
+ * which the detached FMRs are spliced onto free_list.  A failure from
+ * ib_unmap_fmr() is only logged.
+ */
+static void ib_fmr_batch_release(struct ib_fmr_pool *pool)
+{
+	int                 ret;
+	struct ib_pool_fmr *fmr;
+	LIST_HEAD(unmap_list);
+	LIST_HEAD(fmr_list);
+
+	spin_lock_irq(&pool->pool_lock);
+
+	list_for_each_entry(fmr, &pool->dirty_list, list) {
+		hlist_del_init(&fmr->cache_node);
+		fmr->remap_count = 0;
+		list_add_tail(&fmr->fmr->list, &fmr_list);
+
+#ifdef DEBUG
+		if (fmr->ref_count !=0) {
+			/* fmr is a pointer, so it is printed with %p. */
+			printk(KERN_WARNING PFX "Unmapping FMR %p with ref count %d\n",
+			       fmr, fmr->ref_count);
+		}
+#endif
+	}
+
+	list_splice_init(&pool->dirty_list, &unmap_list);
+	pool->dirty_len = 0;
+
+	spin_unlock_irq(&pool->pool_lock);
+
+	/* Nothing was dirty: skip the unmap call and the second lock round. */
+	if (list_empty(&unmap_list)) {
+		return;
+	}
+
+	ret = ib_unmap_fmr(&fmr_list);
+	if (ret)
+		printk(KERN_WARNING PFX "ib_unmap_fmr returned %d\n", ret);
+
+	spin_lock_irq(&pool->pool_lock);
+	list_splice(&unmap_list, &pool->free_list);
+	spin_unlock_irq(&pool->pool_lock);
+}
+
+/*
+ * True while more flushes have been requested (req_ser) than the
+ * cleanup thread has completed (flush_ser).
+ */
+static inline int ib_fmr_flush_pending(struct ib_fmr_pool *pool)
+{
+	return atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) < 0;
+}
+
+/*
+ * Body of the per-pool cleanup thread.
+ *
+ * Each pass services an outstanding flush request, if any: the dirty
+ * list is batch-released, flush_ser is advanced, waiters on force_wait
+ * are woken and the optional flush_function is called.  The thread
+ * then sleeps unless another request arrived meanwhile or it has been
+ * asked to stop.  Returns 0 once kthread_should_stop() is true.
+ */
+static int ib_fmr_cleanup_thread(void *pool_ptr)
+{
+	struct ib_fmr_pool *pool = pool_ptr;
+
+	for (;;) {
+		if (ib_fmr_flush_pending(pool)) {
+			ib_fmr_batch_release(pool);
+
+			atomic_inc(&pool->flush_ser);
+			wake_up_interruptible(&pool->force_wait);
+
+			if (pool->flush_function)
+				pool->flush_function(pool, pool->flush_arg);
+		}
+
+		/*
+		 * Mark ourselves sleeping before re-checking, so the
+		 * re-check and schedule() happen in that state.
+		 */
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (!ib_fmr_flush_pending(pool) && !kthread_should_stop())
+			schedule();
+		__set_current_state(TASK_RUNNING);
+
+		if (kthread_should_stop())
+			break;
+	}
+
+	return 0;
+}
+
+/**
+ * ib_create_fmr_pool - Create an FMR pool
+ * @pd:Protection domain for FMRs
+ * @params:FMR pool parameters
+ *
+ * Create a pool of FMRs.  Return value is pointer to new pool or
+ * an ERR_PTR()-encoded error code if creation failed.
+ *
+ * The pool gets params->pool_size FMRs, all placed on its free list,
+ * an optional lookup cache (when params->cache is set) and a cleanup
+ * thread running ib_fmr_cleanup_thread().  The per-FMR remap limit is
+ * the device's max_map_per_fmr attribute, or IB_FMR_MAX_REMAPS when
+ * the device reports 0.
+ *
+ * Errors: -EINVAL if @params is NULL; -ENOSYS if the device lacks any
+ * of the alloc_fmr, dealloc_fmr, map_phys_fmr or unmap_fmr methods;
+ * the error from ib_query_device() or from starting the thread;
+ * -ENOMEM if an allocation fails.  If creating any individual FMR
+ * fails, the partially built pool is destroyed with
+ * ib_destroy_fmr_pool() and -ENOMEM is returned, whatever error
+ * ib_alloc_fmr() reported.  @pd is dereferenced without a NULL check.
+ */
+struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd             *pd,
+				       struct ib_fmr_pool_param *params)
+{
+	struct ib_device   *device;
+	struct ib_fmr_pool *pool;
+	struct ib_device_attr *attr;
+	int i;
+	int ret;
+	int max_remaps;
+
+	if (!params)
+		return ERR_PTR(-EINVAL);
+
+	device = pd->device;
+	if (!device->alloc_fmr    || !device->dealloc_fmr  ||
+	    !device->map_phys_fmr || !device->unmap_fmr) {
+		printk(KERN_INFO PFX "Device %s does not support FMRs\n",
+		       device->name);
+		return ERR_PTR(-ENOSYS);
+	}
+
+	attr = kmalloc(sizeof *attr, GFP_KERNEL);
+	if (!attr) {
+		printk(KERN_WARNING PFX "couldn't allocate device attr struct\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	ret = ib_query_device(device, attr);
+	if (ret) {
+		printk(KERN_WARNING PFX "couldn't query device: %d\n", ret);
+		kfree(attr);
+		return ERR_PTR(ret);
+	}
+
+	if (!attr->max_map_per_fmr)
+		max_remaps = IB_FMR_MAX_REMAPS;
+	else
+		max_remaps = attr->max_map_per_fmr;
+
+	kfree(attr);	/* only max_map_per_fmr was needed */
+
+	pool = kmalloc(sizeof *pool, GFP_KERNEL);
+	if (!pool) {
+		printk(KERN_WARNING PFX "couldn't allocate pool struct\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	pool->cache_bucket   = NULL;	/* stays NULL unless params->cache is set */
+
+	pool->flush_function = params->flush_function;
+	pool->flush_arg      = params->flush_arg;
+
+	INIT_LIST_HEAD(&pool->free_list);
+	INIT_LIST_HEAD(&pool->dirty_list);
+
+	if (params->cache) {
+		pool->cache_bucket =
+			kmalloc(IB_FMR_HASH_SIZE * sizeof *pool->cache_bucket,
+				GFP_KERNEL);
+		if (!pool->cache_bucket) {
+			printk(KERN_WARNING PFX "Failed to allocate cache in pool\n");
+			ret = -ENOMEM;
+			goto out_free_pool;
+		}
+
+		for (i = 0; i < IB_FMR_HASH_SIZE; ++i)
+			INIT_HLIST_HEAD(pool->cache_bucket + i);
+	}
+
+	pool->pool_size       = 0;	/* counts FMRs actually created below */
+	pool->max_pages       = params->max_pages_per_fmr;
+	pool->max_remaps      = max_remaps;
+	pool->dirty_watermark = params->dirty_watermark;
+	pool->dirty_len       = 0;
+	spin_lock_init(&pool->pool_lock);
+	atomic_set(&pool->req_ser,   0);
+	atomic_set(&pool->flush_ser, 0);
+	init_waitqueue_head(&pool->force_wait);
+
+	pool->thread = kthread_run(ib_fmr_cleanup_thread,
+				   pool,
+				   "ib_fmr(%s)",
+				   device->name);
+	if (IS_ERR(pool->thread)) {
+		printk(KERN_WARNING PFX "couldn't start cleanup thread\n");
+		ret = PTR_ERR(pool->thread);
+		goto out_free_pool;
+	}
+
+	{
+		struct ib_pool_fmr *fmr;
+		struct ib_fmr_attr fmr_attr = {
+			.max_pages  = params->max_pages_per_fmr,
+			.max_maps   = pool->max_remaps,
+			.page_shift = params->page_shift
+		};
+		int bytes_per_fmr = sizeof *fmr;
+
+		if (pool->cache_bucket)	/* cached FMRs also store a copy of their page list */
+			bytes_per_fmr += params->max_pages_per_fmr * sizeof (u64);
+
+		for (i = 0; i < params->pool_size; ++i) {
+			fmr = kmalloc(bytes_per_fmr, GFP_KERNEL);
+			if (!fmr) {
+				printk(KERN_WARNING PFX "failed to allocate fmr "
+				       "struct for FMR %d\n", i);
+				goto out_fail;
+			}
+
+			fmr->pool             = pool;
+			fmr->remap_count      = 0;
+			fmr->ref_count        = 0;
+			INIT_HLIST_NODE(&fmr->cache_node);
+
+			fmr->fmr = ib_alloc_fmr(pd, params->access, &fmr_attr);
+			if (IS_ERR(fmr->fmr)) {
+				printk(KERN_WARNING PFX "fmr_create failed "
+				       "for FMR %d\n", i);
+				kfree(fmr);
+				goto out_fail;
+			}
+
+			list_add_tail(&fmr->list, &pool->free_list);
+			++pool->pool_size;
+		}
+	}
+
+	return pool;
+
+ out_free_pool:	/* reached before the thread runs and before any FMR exists */
+	kfree(pool->cache_bucket);
+	kfree(pool);
+
+	return ERR_PTR(ret);
+
+ out_fail:	/* thread is running and some FMRs may exist: full teardown */
+	ib_destroy_fmr_pool(pool);
+
+	return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL(ib_create_fmr_pool);
+
+/**
+ * ib_destroy_fmr_pool - Free FMR pool
+ * @pool:FMR pool to free
+ *
+ * Stops the pool's cleanup thread, releases whatever is on the dirty
+ * list, then unmaps (where remap_count is non-zero), deallocates and
+ * frees every FMR on the free list.  If fewer FMRs were found than
+ * pool_size records, a warning reports how many are still
+ * outstanding.  Finally the cache buckets and the pool itself are
+ * freed.
+ */
+void ib_destroy_fmr_pool(struct ib_fmr_pool *pool)
+{
+	struct ib_pool_fmr *cur, *next;
+	LIST_HEAD(fmr_list);
+	int released = 0;
+
+	kthread_stop(pool->thread);
+	ib_fmr_batch_release(pool);
+
+	list_for_each_entry_safe(cur, next, &pool->free_list, list) {
+		if (cur->remap_count != 0) {
+			/* Unmap this one FMR via a fresh single-entry list. */
+			INIT_LIST_HEAD(&fmr_list);
+			list_add_tail(&cur->fmr->list, &fmr_list);
+			ib_unmap_fmr(&fmr_list);
+		}
+		ib_dealloc_fmr(cur->fmr);
+		list_del(&cur->list);
+		kfree(cur);
+		released++;
+	}
+
+	if (pool->pool_size > released)
+		printk(KERN_WARNING PFX "pool still has %d regions registered\n",
+		       pool->pool_size - released);
+
+	kfree(pool->cache_bucket);
+	kfree(pool);
+}
+EXPORT_SYMBOL(ib_destroy_fmr_pool);
+
+/**
+ * ib_flush_fmr_pool - Invalidate all unmapped FMRs
+ * @pool:FMR pool to flush
+ *
+ * Ensure that all unmapped FMRs are fully invalidated.  Posts a flush
+ * request to the cleanup thread and waits on force_wait until the
+ * thread's flush serial number reaches the request.
+ *
+ * Returns 0 on success, or -EINTR if the wait returned non-zero.
+ */
+int ib_flush_fmr_pool(struct ib_fmr_pool *pool)
+{
+	struct ib_pool_fmr *cur, *tmp;
+	int target;
+	int interrupted;
+
+	/*
+	 * FMRs on free_list with a non-zero remap_count have been
+	 * mapped but were not yet dirty.  Move them to dirty_list so
+	 * the cleanup thread unmaps them as well.
+	 */
+	spin_lock_irq(&pool->pool_lock);
+	list_for_each_entry_safe(cur, tmp, &pool->free_list, list) {
+		if (cur->remap_count <= 0)
+			continue;
+		list_move(&cur->list, &pool->dirty_list);
+	}
+	spin_unlock_irq(&pool->pool_lock);
+
+	target = atomic_inc_return(&pool->req_ser);
+	wake_up_process(pool->thread);
+
+	interrupted = wait_event_interruptible(pool->force_wait,
+			atomic_read(&pool->flush_ser) - target >= 0);
+
+	return interrupted ? -EINTR : 0;
+}
+EXPORT_SYMBOL(ib_flush_fmr_pool);
+
+/**
+ * ib_fmr_pool_map_phys - Map a page list using an FMR from a pool
+ * @pool_handle:FMR pool to allocate FMR from
+ * @page_list:List of pages to map
+ * @list_len:Number of pages in @page_list
+ * @io_virtual_address:I/O virtual address for new FMR
+ *
+ * Map an FMR from an FMR pool.  If the pool has a cache and it already
+ * holds an FMR with the same page list and I/O virtual address, that
+ * FMR is reused and its reference count incremented.  Otherwise the
+ * first FMR on the free list is taken, removed from the cache and
+ * mapped with ib_map_phys_fmr().
+ *
+ * Returns the pool FMR on success, or an ERR_PTR(): -EINVAL if
+ * @list_len is less than 1 or greater than the pool's max_pages,
+ * -EAGAIN if the free list is empty, or the error returned by
+ * ib_map_phys_fmr() (in which case the FMR is put back at the head of
+ * the free list).
+ */
+struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle,
+					 u64                *page_list,
+					 int                 list_len,
+					 u64                 io_virtual_address)
+{
+	struct ib_fmr_pool *pool = pool_handle;
+	struct ib_pool_fmr *fmr;
+	unsigned long       flags;
+	int                 result;
+
+	if (list_len < 1 || list_len > pool->max_pages)
+		return ERR_PTR(-EINVAL);
+
+	spin_lock_irqsave(&pool->pool_lock, flags);
+	fmr = ib_fmr_cache_lookup(pool,
+				  page_list,
+				  list_len,
+				  io_virtual_address);
+	if (fmr) {
+		/* found in cache */
+		++fmr->ref_count;
+		if (fmr->ref_count == 1) {
+			list_del(&fmr->list);	/* was unreferenced: take it off free_list/dirty_list */
+		}
+
+		spin_unlock_irqrestore(&pool->pool_lock, flags);
+
+		return fmr;
+	}
+
+	if (list_empty(&pool->free_list)) {
+		spin_unlock_irqrestore(&pool->pool_lock, flags);
+		return ERR_PTR(-EAGAIN);
+	}
+
+	fmr = list_entry(pool->free_list.next, struct ib_pool_fmr, list);
+	list_del(&fmr->list);
+	hlist_del_init(&fmr->cache_node);	/* old mapping must not be found in the cache any more */
+	spin_unlock_irqrestore(&pool->pool_lock, flags);
+
+	result = ib_map_phys_fmr(fmr->fmr, page_list, list_len,
+				 io_virtual_address);
+
+	if (result) {
+		spin_lock_irqsave(&pool->pool_lock, flags);
+		list_add(&fmr->list, &pool->free_list);
+		spin_unlock_irqrestore(&pool->pool_lock, flags);
+
+		printk(KERN_WARNING PFX "fmr_map returns %d\n", result);
+
+		return ERR_PTR(result);
+	}
+
+	++fmr->remap_count;
+	fmr->ref_count = 1;
+
+	if (pool->cache_bucket) {
+		fmr->io_virtual_address = io_virtual_address;
+		fmr->page_list_len      = list_len;
+		memcpy(fmr->page_list, page_list, list_len * sizeof(*page_list));
+
+		spin_lock_irqsave(&pool->pool_lock, flags);
+		hlist_add_head(&fmr->cache_node,
+			       pool->cache_bucket + ib_fmr_hash(fmr->page_list[0]));
+		spin_unlock_irqrestore(&pool->pool_lock, flags);
+	}
+
+	return fmr;
+}
+EXPORT_SYMBOL(ib_fmr_pool_map_phys);
+
+/**
+ * ib_fmr_pool_unmap - Unmap FMR
+ * @fmr:FMR to unmap
+ *
+ * Drops one reference on @fmr.  When the count reaches zero the FMR
+ * goes to the tail of the pool's free list if it may still be
+ * remapped (remap_count < max_remaps), otherwise to the tail of the
+ * dirty list; once dirty_len reaches dirty_watermark the cleanup
+ * thread is asked to run.  The FMR mapping may remain valid until the
+ * FMR is reused (or until ib_flush_fmr_pool() is called).
+ *
+ * Always returns 0.
+ */
+int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr)
+{
+	struct ib_fmr_pool *pool = fmr->pool;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&pool->pool_lock, irq_flags);
+
+	if (--fmr->ref_count == 0) {
+		if (fmr->remap_count >= pool->max_remaps) {
+			list_add_tail(&fmr->list, &pool->dirty_list);
+			pool->dirty_len++;
+			if (pool->dirty_len >= pool->dirty_watermark) {
+				atomic_inc(&pool->req_ser);
+				wake_up_process(pool->thread);
+			}
+		} else {
+			list_add_tail(&fmr->list, &pool->free_list);
+		}
+	}
+
+#ifdef DEBUG
+	if (fmr->ref_count < 0)
+		printk(KERN_WARNING PFX "FMR %p has ref count %d < 0\n",
+		       fmr, fmr->ref_count);
+#endif
+
+	spin_unlock_irqrestore(&pool->pool_lock, irq_flags);
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_fmr_pool_unmap);
diff --git a/sys/ofed/drivers/infiniband/core/iwcm.c b/sys/ofed/drivers/infiniband/core/iwcm.c
new file mode 100644
index 0000000..625fec5
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/iwcm.c
@@ -0,0 +1,1025 @@
+/*
+ * Copyright (c) 2004, 2005 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004, 2005 Voltaire Corporation.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ * Copyright (c) 2005 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/idr.h>
+#include <linux/interrupt.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+#include <linux/completion.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_addr.h>
+
+#include "iwcm.h"
+
+MODULE_AUTHOR("Tom Tucker");
+MODULE_DESCRIPTION("iWARP CM");
+MODULE_LICENSE("Dual BSD/GPL");
+
+static struct workqueue_struct *iwcm_wq;
+struct iwcm_work {
+	struct work_struct work;
+	struct iwcm_id_private *cm_id;	/* owning cm_id; set in alloc_work_entries() */
+	struct list_head list;	/* initialised empty in alloc_work_entries(); NOTE(review): presumably links queued events - confirm */
+	struct iw_cm_event event;
+	struct list_head free_list;	/* link in the owner's work_free_list while unused */
+};
+
+/*
+ * The following services provide a mechanism for pre-allocating iwcm_work
+ * elements.  The design pre-allocates them  based on the cm_id type:
+ *	LISTENING IDS: 	Get enough elements preallocated to handle the
+ *			listen backlog.
+ *	ACTIVE IDS:	4: CONNECT_REPLY, ESTABLISHED, DISCONNECT, CLOSE
+ *	PASSIVE IDS:	3: ESTABLISHED, DISCONNECT, CLOSE
+ *
+ * Allocating them in connect and listen avoids having to deal
+ * with allocation failures on the event upcall from the provider (which
+ * is called in the interrupt context).
+ *
+ * One exception is when creating the cm_id for incoming connection requests.
+ * There are two cases:
+ * 1) in the event upcall, cm_event_handler(), for a listening cm_id.  If
+ *    the backlog is exceeded, then no more connection request events will
+ *    be processed.  cm_event_handler() returns -ENOMEM in this case.  It's up
+ *    to the provider to reject the connection request.
+ * 2) in the connection request workqueue handler, cm_conn_req_handler().
+ *    If work elements cannot be allocated for the new connect request cm_id,
+ *    then IWCM will call the provider reject method.  This is ok since
+ *    cm_conn_req_handler() runs in the workqueue thread context.
+ */
+
+/*
+ * Take one pre-allocated work element off cm_id_priv's free list.
+ * Returns NULL when the free list is empty.
+ */
+static struct iwcm_work *get_work(struct iwcm_id_private *cm_id_priv)
+{
+	struct list_head *first;
+
+	if (list_empty(&cm_id_priv->work_free_list))
+		return NULL;
+
+	first = cm_id_priv->work_free_list.next;
+	list_del_init(first);
+
+	return list_entry(first, struct iwcm_work, free_list);
+}
+
+/* Return a work element to the head of its owning cm_id's free list. */
+static void put_work(struct iwcm_work *work)
+{
+	struct iwcm_id_private *owner = work->cm_id;
+
+	list_add(&work->free_list, &owner->work_free_list);
+}
+
+/*
+ * Free every work element on cm_id_priv's free list.
+ *
+ * Each element is unlinked before it is freed so that work_free_list
+ * is a valid empty list afterwards.  Without the unlink the list head
+ * keeps pointing at freed memory, which the failure path of
+ * alloc_work_entries() leaves behind on a cm_id that stays in use.
+ */
+static void dealloc_work_entries(struct iwcm_id_private *cm_id_priv)
+{
+	struct list_head *e, *tmp;
+
+	list_for_each_safe(e, tmp, &cm_id_priv->work_free_list) {
+		list_del(e);
+		kfree(list_entry(e, struct iwcm_work, free_list));
+	}
+}
+
+/*
+ * Pre-allocate @count work elements for cm_id_priv and put them on its
+ * free list, which must be empty on entry (BUG otherwise).
+ *
+ * Returns 0 on success.  If an allocation fails, the elements already
+ * placed on the free list are released with dealloc_work_entries() and
+ * -ENOMEM is returned.
+ */
+static int alloc_work_entries(struct iwcm_id_private *cm_id_priv, int count)
+{
+	struct iwcm_work *entry;
+
+	BUG_ON(!list_empty(&cm_id_priv->work_free_list));
+
+	for (; count != 0; count--) {
+		entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+		if (entry == NULL) {
+			dealloc_work_entries(cm_id_priv);
+			return -ENOMEM;
+		}
+
+		entry->cm_id = cm_id_priv;
+		INIT_LIST_HEAD(&entry->list);
+		put_work(entry);
+	}
+
+	return 0;
+}
+
+/*
+ * Duplicate the private data attached to an event so the low level
+ * driver does not have to keep its buffer alive.  On success
+ * event->private_data is redirected to the copy (allocated with
+ * GFP_ATOMIC) and 0 is returned; if the copy cannot be allocated the
+ * event is left untouched and -ENOMEM is returned.
+ */
+static int copy_private_data(struct iw_cm_event *event)
+{
+	void *copy = kmemdup(event->private_data, event->private_data_len,
+			     GFP_ATOMIC);
+
+	if (copy == NULL)
+		return -ENOMEM;
+
+	event->private_data = copy;
+
+	return 0;
+}
+
+static void free_cm_id(struct iwcm_id_private *cm_id_priv)
+{
+	dealloc_work_entries(cm_id_priv);	/* release the unused pre-allocated work elements first */
+	kfree(cm_id_priv);	/* then the cm_id itself */
+}
+
+/*
+ * Release a reference on cm_id.  When the last reference is dropped,
+ * destroy_comp is completed so that a thread waiting on it can
+ * proceed, and 1 is returned; otherwise 0 is returned.  It is a BUG to
+ * call this with a zero refcount, or to drop the last reference while
+ * work_list is not empty.
+ */
+static int iwcm_deref_id(struct iwcm_id_private *cm_id_priv)
+{
+	BUG_ON(atomic_read(&cm_id_priv->refcount)==0);
+
+	if (!atomic_dec_and_test(&cm_id_priv->refcount))
+		return 0;
+
+	BUG_ON(!list_empty(&cm_id_priv->work_list));
+	complete(&cm_id_priv->destroy_comp);
+
+	return 1;
+}
+
+/* Take a reference on the private cm_id that embeds @cm_id. */
+static void add_ref(struct iw_cm_id *cm_id)
+{
+	struct iwcm_id_private *cm_id_priv =
+		container_of(cm_id, struct iwcm_id_private, id);
+
+	atomic_inc(&cm_id_priv->refcount);
+}
+
+/*
+ * Drop a reference on the private cm_id that embeds @cm_id.  If this
+ * was the last reference and IWCM_F_CALLBACK_DESTROY is set in its
+ * flags, the cm_id is freed here.
+ */
+static void rem_ref(struct iw_cm_id *cm_id)
+{
+	struct iwcm_id_private *cm_id_priv =
+		container_of(cm_id, struct iwcm_id_private, id);
+
+	if (!iwcm_deref_id(cm_id_priv))
+		return;
+	if (!test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags))
+		return;
+
+	BUG_ON(!list_empty(&cm_id_priv->work_list));
+	free_cm_id(cm_id_priv);
+}
+
+static int cm_event_handler(struct iw_cm_id *cm_id, struct iw_cm_event *event);
+
+/*
+ * Allocate and initialise a new iWARP cm_id for @device.
+ *
+ * The id starts in IW_CM_STATE_IDLE with a reference count of 1 and
+ * empty work lists; @cm_handler and @context are recorded in the id.
+ * Returns the embedded struct iw_cm_id, or ERR_PTR(-ENOMEM) if the
+ * allocation fails.
+ */
+struct iw_cm_id *iw_create_cm_id(struct ib_device *device,
+				 iw_cm_handler cm_handler,
+				 void *context)
+{
+	struct iwcm_id_private *priv;
+	struct iw_cm_id *id;
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (priv == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	/* Public part handed back to the caller. */
+	id = &priv->id;
+	id->device = device;
+	id->cm_handler = cm_handler;
+	id->context = context;
+	id->event_handler = cm_event_handler;
+	id->add_ref = add_ref;
+	id->rem_ref = rem_ref;
+
+	/* Private bookkeeping. */
+	priv->state = IW_CM_STATE_IDLE;
+	spin_lock_init(&priv->lock);
+	atomic_set(&priv->refcount, 1);
+	init_waitqueue_head(&priv->connect_wait);
+	init_completion(&priv->destroy_comp);
+	INIT_LIST_HEAD(&priv->work_list);
+	INIT_LIST_HEAD(&priv->work_free_list);
+
+	return id;
+}
+EXPORT_SYMBOL(iw_create_cm_id);
+
+
+/*
+ * Move @qp to the IB_QPS_ERR state.  Returns -EINVAL when @qp is NULL,
+ * otherwise the result of ib_modify_qp().
+ */
+static int iwcm_modify_qp_err(struct ib_qp *qp)
+{
+	struct ib_qp_attr attr;
+
+	if (qp == NULL)
+		return -EINVAL;
+
+	/* Only qp_state is set; IB_QP_STATE is the only mask bit passed. */
+	attr.qp_state = IB_QPS_ERR;
+
+	return ib_modify_qp(qp, &attr, IB_QP_STATE);
+}
+
+/*
+ * Move @qp to the IB_QPS_SQD state and return the result of
+ * ib_modify_qp().  This is really the RDMAC CLOSING state; it is most
+ * similar to the IB SQD QP state.  @qp must not be NULL (BUG
+ * otherwise).
+ */
+static int iwcm_modify_qp_sqd(struct ib_qp *qp)
+{
+	struct ib_qp_attr attr;
+
+	BUG_ON(qp == NULL);
+
+	/* Only qp_state is set; IB_QP_STATE is the only mask bit passed. */
+	attr.qp_state = IB_QPS_SQD;
+
+	return ib_modify_qp(qp, &attr, IB_QP_STATE);
+}
+
+/*
+ * CM_ID <-- CLOSING
+ *
+ * Block if a passive or active connection is currently being processed. Then
+ * process the event as follows:
+ * - If we are ESTABLISHED, move to CLOSING and modify the QP state
+ *   based on the abrupt flag
+ * - If the connection is already in the CLOSING or IDLE state, the peer is
+ *   disconnecting concurrently with us and we've already seen the
+ *   DISCONNECT event -- ignore the request and return 0
+ * - Disconnect on a listening endpoint returns -EINVAL
+ *
+ * @abrupt selects the QP transition: non-zero moves the QP to ERR,
+ * zero moves it to SQD.  The result of that transition is discarded.
+ *
+ * Returns 0, or -EINVAL for a listening endpoint or an ESTABLISHED
+ * cm_id that has no QP.  A cm_id in CONN_RECV is left unchanged and 0
+ * is returned.  CONN_SENT or any other state is a BUG.
+ */
+int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt)
+{
+	struct iwcm_id_private *cm_id_priv;
+	unsigned long flags;
+	int ret = 0;
+	struct ib_qp *qp = NULL;
+
+	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+	/* Wait if we're currently in a connect or accept downcall */
+	wait_event(cm_id_priv->connect_wait,
+		   !test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags));
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	switch (cm_id_priv->state) {
+	case IW_CM_STATE_ESTABLISHED:
+		cm_id_priv->state = IW_CM_STATE_CLOSING;
+
+		/* QP could be NULL for user-mode client */
+		if (cm_id_priv->qp)
+			qp = cm_id_priv->qp;
+		else
+			ret = -EINVAL;
+		break;
+	case IW_CM_STATE_LISTEN:
+		ret = -EINVAL;
+		break;
+	case IW_CM_STATE_CLOSING:
+		/* remote peer closed first */
+	case IW_CM_STATE_IDLE:
+		/* accept or connect returned !0 */
+		break;
+	case IW_CM_STATE_CONN_RECV:
+		/*
+		 * App called disconnect before/without calling accept after
+		 * connect_request event delivered.
+		 */
+		break;
+	case IW_CM_STATE_CONN_SENT:
+		/* Can only get here if wait above fails */
+	default:
+		BUG();
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	if (qp) {	/* QP is modified after the lock has been dropped */
+		if (abrupt)
+			ret = iwcm_modify_qp_err(qp);
+		else
+			ret = iwcm_modify_qp_sqd(qp);
+
+		/*
+		 * If both sides are disconnecting the QP could
+		 * already be in ERR or SQD states
+		 */
+		ret = 0;	/* deliberately overrides the modify result above */
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(iw_cm_disconnect);
+
+/*
+ * CM_ID <-- DESTROYING
+ *
+ * Clean up all resources associated with the connection and release
+ * the initial reference taken by iw_create_cm_id.
+ *
+ * The per-state provider downcalls (destroy_listen, QP modify, reject) are
+ * made with the cm_id lock dropped; the lock is re-taken afterwards so the
+ * QP reference can be released under it.
+ */
+static void destroy_cm_id(struct iw_cm_id *cm_id)
+{
+	struct iwcm_id_private *cm_id_priv;
+	unsigned long flags;
+	int ret;
+
+	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+	/*
+	 * Wait if we're currently in a connect or accept downcall. A
+	 * listening endpoint should never block here.
+	 */
+	wait_event(cm_id_priv->connect_wait,
+		   !test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags));
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	switch (cm_id_priv->state) {
+	case IW_CM_STATE_LISTEN:
+		cm_id_priv->state = IW_CM_STATE_DESTROYING;
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		/* destroy the listening endpoint */
+		/* NOTE(review): ret is stored here but never examined afterwards. */
+		ret = cm_id->device->iwcm->destroy_listen(cm_id);
+		spin_lock_irqsave(&cm_id_priv->lock, flags);
+		break;
+	case IW_CM_STATE_ESTABLISHED:
+		cm_id_priv->state = IW_CM_STATE_DESTROYING;
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		/* Abrupt close of the connection */
+		(void)iwcm_modify_qp_err(cm_id_priv->qp);
+		spin_lock_irqsave(&cm_id_priv->lock, flags);
+		break;
+	case IW_CM_STATE_IDLE:
+	case IW_CM_STATE_CLOSING:
+		/* No provider downcall is made for these states. */
+		cm_id_priv->state = IW_CM_STATE_DESTROYING;
+		break;
+	case IW_CM_STATE_CONN_RECV:
+		/*
+		 * App called destroy before/without calling accept after
+		 * receiving connection request event notification or
+		 * returned non zero from the event callback function.
+		 * In either case, must tell the provider to reject.
+		 */
+		cm_id_priv->state = IW_CM_STATE_DESTROYING;
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		cm_id->device->iwcm->reject(cm_id, NULL, 0);
+		spin_lock_irqsave(&cm_id_priv->lock, flags);
+		break;
+	case IW_CM_STATE_CONN_SENT:
+	case IW_CM_STATE_DESTROYING:
+	default:
+		BUG();
+		break;
+	}
+	/* Drop the QP reference held by this cm_id, if one is still attached. */
+	if (cm_id_priv->qp) {
+		cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp);
+		cm_id_priv->qp = NULL;
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	/* Release the reference; the return value is intentionally ignored. */
+	(void)iwcm_deref_id(cm_id_priv);
+}
+
+/*
+ * Application-thread entry point for tearing down a cm_id; it must not be
+ * called from the event thread.  After starting the teardown it sleeps
+ * on destroy_comp until every reference has been released, and only then
+ * frees the cm_id object.
+ */
+void iw_destroy_cm_id(struct iw_cm_id *cm_id)
+{
+	struct iwcm_id_private *priv =
+		container_of(cm_id, struct iwcm_id_private, id);
+
+	/* A cm_id already marked for callback-driven destroy must not get here. */
+	BUG_ON(test_bit(IWCM_F_CALLBACK_DESTROY, &priv->flags));
+
+	destroy_cm_id(cm_id);
+	wait_for_completion(&priv->destroy_comp);
+	free_cm_id(priv);
+}
+EXPORT_SYMBOL(iw_destroy_cm_id);
+
+/*
+ * CM_ID <-- LISTEN
+ *
+ * Start listening for connect requests. Generates one CONNECT_REQUEST
+ * event for each inbound connect request.
+ *
+ * Work entries for 'backlog' events are allocated up front.  Returns 0 on
+ * success, the alloc_work_entries() or provider create_listen() error on
+ * failure, or -EINVAL if the cm_id is not in the IDLE state.
+ */
+int iw_cm_listen(struct iw_cm_id *cm_id, int backlog)
+{
+	struct iwcm_id_private *cm_id_priv;
+	unsigned long flags;
+	int ret;
+
+	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+
+	ret = alloc_work_entries(cm_id_priv, backlog);
+	if (ret)
+		return ret;
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	switch (cm_id_priv->state) {
+	case IW_CM_STATE_IDLE:
+		cm_id_priv->state = IW_CM_STATE_LISTEN;
+		/* The provider downcall is made without the lock held. */
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		ret = cm_id->device->iwcm->create_listen(cm_id, backlog);
+		spin_lock_irqsave(&cm_id_priv->lock, flags);
+		/*
+		 * Roll the state back only after re-taking the lock, so
+		 * that every write to ->state is serialized by it.
+		 */
+		if (ret)
+			cm_id_priv->state = IW_CM_STATE_IDLE;
+		break;
+	default:
+		ret = -EINVAL;
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(iw_cm_listen);
+
+/*
+ * CM_ID <-- IDLE
+ *
+ * Rejects an inbound connection request. No events are generated.
+ *
+ * Only valid in the CONN_RECV state; any other state yields -EINVAL.
+ * IWCM_F_CONNECT_WAIT is held for the duration of the call and waiters
+ * on connect_wait are woken on every exit path.
+ */
+int iw_cm_reject(struct iw_cm_id *cm_id,
+		 const void *private_data,
+		 u8 private_data_len)
+{
+	struct iwcm_id_private *priv;
+	unsigned long irq_flags;
+	int rc;
+
+	priv = container_of(cm_id, struct iwcm_id_private, id);
+	set_bit(IWCM_F_CONNECT_WAIT, &priv->flags);
+
+	spin_lock_irqsave(&priv->lock, irq_flags);
+	if (cm_id_priv_state_ok_placeholder) {
+	}
+	spin_unlock_irqrestore(&priv->lock, irq_flags);
+	return rc;
+}
+EXPORT_SYMBOL(iw_cm_reject);
+
+/*
+ * CM_ID <-- ESTABLISHED
+ *
+ * Accepts an inbound connection request and generates an ESTABLISHED
+ * event. Callers of iw_cm_disconnect and iw_destroy_cm_id will block
+ * until the ESTABLISHED event is received from the provider.
+ *
+ * Returns 0 on success, -EINVAL if the cm_id is not in CONN_RECV or the
+ * QPN in iw_param does not resolve to a QP, or the provider's accept()
+ * error.  On every failure path IWCM_F_CONNECT_WAIT is cleared and the
+ * waiters on connect_wait are woken, so nobody is left blocked.
+ */
+int iw_cm_accept(struct iw_cm_id *cm_id,
+		 struct iw_cm_conn_param *iw_param)
+{
+	struct iwcm_id_private *cm_id_priv;
+	struct ib_qp *qp;
+	unsigned long flags;
+	int ret;
+
+	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+	set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id_priv->state != IW_CM_STATE_CONN_RECV) {
+		ret = -EINVAL;
+		goto err_unlock;
+	}
+	/* Get the ib_qp given the QPN */
+	qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn);
+	if (!qp) {
+		/*
+		 * CONNECT_WAIT must be dropped here as well, otherwise a
+		 * later disconnect/destroy would wait forever.
+		 */
+		ret = -EINVAL;
+		goto err_unlock;
+	}
+	cm_id->device->iwcm->add_ref(qp);
+	cm_id_priv->qp = qp;
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	ret = cm_id->device->iwcm->accept(cm_id, iw_param);
+	if (!ret)
+		return 0;
+
+	/* An error on accept precludes provider events */
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV);
+	cm_id_priv->state = IW_CM_STATE_IDLE;
+	if (cm_id_priv->qp) {
+		cm_id->device->iwcm->rem_ref(qp);
+		cm_id_priv->qp = NULL;
+	}
+err_unlock:
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+	wake_up_all(&cm_id_priv->connect_wait);
+
+	return ret;
+}
+EXPORT_SYMBOL(iw_cm_accept);
+
+/*
+ * Active Side: CM_ID <-- CONN_SENT
+ *
+ * If successful, results in the generation of a CONNECT_REPLY
+ * event. iw_cm_disconnect and iw_destroy_cm_id will block until the
+ * CONNECT_REPLY event is received from the provider.
+ *
+ * Returns 0 on success, the alloc_work_entries() error, -EINVAL if the
+ * cm_id is not IDLE or the QPN in iw_param does not resolve to a QP, or
+ * the provider's connect() error.  Once IWCM_F_CONNECT_WAIT has been set,
+ * every failure path clears it and wakes the waiters on connect_wait.
+ */
+int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
+{
+	struct iwcm_id_private *cm_id_priv;
+	int ret;
+	unsigned long flags;
+	struct ib_qp *qp;
+
+	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+
+	ret = alloc_work_entries(cm_id_priv, 4);
+	if (ret)
+		return ret;
+
+	set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+
+	if (cm_id_priv->state != IW_CM_STATE_IDLE) {
+		ret = -EINVAL;
+		goto err_unlock;
+	}
+
+	/* Get the ib_qp given the QPN */
+	qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn);
+	if (!qp) {
+		/*
+		 * CONNECT_WAIT must be dropped here as well, otherwise a
+		 * later disconnect/destroy would wait forever.
+		 */
+		ret = -EINVAL;
+		goto err_unlock;
+	}
+	cm_id->device->iwcm->add_ref(qp);
+	cm_id_priv->qp = qp;
+	cm_id_priv->state = IW_CM_STATE_CONN_SENT;
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	ret = cm_id->device->iwcm->connect(cm_id, iw_param);
+	if (!ret)
+		return 0;
+
+	/* Provider refused: undo the QP reference and state under the lock. */
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id_priv->qp) {
+		cm_id->device->iwcm->rem_ref(qp);
+		cm_id_priv->qp = NULL;
+	}
+	BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_SENT);
+	cm_id_priv->state = IW_CM_STATE_IDLE;
+err_unlock:
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+	wake_up_all(&cm_id_priv->connect_wait);
+
+	return ret;
+}
+EXPORT_SYMBOL(iw_cm_connect);
+
+/*
+ * Passive Side: new CM_ID <-- CONN_RECV
+ *
+ * Handles an inbound connect request. The function creates a new
+ * iw_cm_id to represent the new connection and inherits the client
+ * callback function and other attributes from the listening parent.
+ *
+ * The work item contains a pointer to the listen_cm_id and the event. The
+ * listen_cm_id contains the client cm_handler, context and
+ * device. These are copied when the device is cloned. The event
+ * contains the new four tuple.
+ *
+ * An error on the child should not affect the parent, so this
+ * function does not return a value.
+ *
+ * On every exit path the event's private data buffer is freed when
+ * private_data_len is non-zero.
+ */
+static void cm_conn_req_handler(struct iwcm_id_private *listen_id_priv,
+				struct iw_cm_event *iw_event)
+{
+	unsigned long flags;
+	struct iw_cm_id *cm_id;
+	struct iwcm_id_private *cm_id_priv;
+	int ret;
+
+	/*
+	 * The provider should never generate a connection request
+	 * event with a bad status.
+	 */
+	BUG_ON(iw_event->status);
+
+	/*
+	 * We could be destroying the listening id. If so, ignore this
+	 * upcall.
+	 */
+	spin_lock_irqsave(&listen_id_priv->lock, flags);
+	if (listen_id_priv->state != IW_CM_STATE_LISTEN) {
+		spin_unlock_irqrestore(&listen_id_priv->lock, flags);
+		goto out;
+	}
+	spin_unlock_irqrestore(&listen_id_priv->lock, flags);
+
+	/* The child inherits device, handler and context from the listener. */
+	cm_id = iw_create_cm_id(listen_id_priv->id.device,
+				listen_id_priv->id.cm_handler,
+				listen_id_priv->id.context);
+	/* If the cm_id could not be created, ignore the request */
+	if (IS_ERR(cm_id))
+		goto out;
+
+	/* Addressing and provider cookie come from the event itself. */
+	cm_id->provider_data = iw_event->provider_data;
+	cm_id->local_addr = iw_event->local_addr;
+	cm_id->remote_addr = iw_event->remote_addr;
+
+	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+	cm_id_priv->state = IW_CM_STATE_CONN_RECV;
+
+	ret = alloc_work_entries(cm_id_priv, 3);
+	if (ret) {
+		/* No room to queue events for the child: reject and discard it. */
+		iw_cm_reject(cm_id, NULL, 0);
+		iw_destroy_cm_id(cm_id);
+		goto out;
+	}
+
+	/* Call the client CM handler */
+	ret = cm_id->cm_handler(cm_id, iw_event);
+	if (ret) {
+		/*
+		 * A non-zero return from the client handler leads to the
+		 * child being rejected and destroyed here; it is freed
+		 * immediately only if no references remain.
+		 */
+		iw_cm_reject(cm_id, NULL, 0);
+		set_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
+		destroy_cm_id(cm_id);
+		if (atomic_read(&cm_id_priv->refcount)==0)
+			free_cm_id(cm_id_priv);
+	}
+
+out:
+	if (iw_event->private_data_len)
+		kfree(iw_event->private_data);
+}
+
+/*
+ * Passive Side: CM_ID <-- ESTABLISHED
+ *
+ * The provider generated an ESTABLISHED event which means that
+ * the MPA negotiation has completed successfully and we are now in MPA
+ * FPDU mode.
+ *
+ * This event can only be received in the CONN_RECV state. If the
+ * remote peer closed, the ESTABLISHED event would be received followed
+ * by the CLOSE event. If the app closes, it will block until we wake
+ * it up after processing this event.
+ *
+ * Returns the value returned by the client's cm_handler.
+ */
+static int cm_conn_est_handler(struct iwcm_id_private *cm_id_priv,
+			       struct iw_cm_event *iw_event)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+
+	/*
+	 * We clear the CONNECT_WAIT bit here to allow the callback
+	 * function to call iw_cm_disconnect. Calling iw_destroy_cm_id
+	 * from a callback handler is not allowed.
+	 */
+	clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+	BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV);
+	cm_id_priv->state = IW_CM_STATE_ESTABLISHED;
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	/* The client handler runs without the lock held. */
+	ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
+	/* Waiters are only woken after the client handler has returned. */
+	wake_up_all(&cm_id_priv->connect_wait);
+
+	return ret;
+}
+
+/*
+ * Active Side: CM_ID <-- ESTABLISHED
+ *
+ * The app has called connect and is waiting for the established event to
+ * post its requests to the server. This event will wake up anyone
+ * blocked in iw_cm_disconnect or iw_destroy_cm_id.
+ *
+ * On an ACCEPTED status the cm_id becomes ESTABLISHED and takes its
+ * addresses from the event; on any other status the QP reference is
+ * dropped and the cm_id returns to IDLE.  Returns the value returned by
+ * the client's cm_handler.
+ */
+static int cm_conn_rep_handler(struct iwcm_id_private *cm_id_priv,
+			       struct iw_cm_event *iw_event)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	/*
+	 * Clear the connect wait bit so a callback function calling
+	 * iw_cm_disconnect will not wait and deadlock this thread
+	 */
+	clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+	BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_SENT);
+	if (iw_event->status == IW_CM_EVENT_STATUS_ACCEPTED) {
+		cm_id_priv->id.local_addr = iw_event->local_addr;
+		cm_id_priv->id.remote_addr = iw_event->remote_addr;
+		cm_id_priv->state = IW_CM_STATE_ESTABLISHED;
+	} else {
+		/* REJECTED or RESET */
+		cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp);
+		cm_id_priv->qp = NULL;
+		cm_id_priv->state = IW_CM_STATE_IDLE;
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	/* The client handler runs without the lock held. */
+	ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
+
+	/* Private data is freed here, after the client handler has seen it. */
+	if (iw_event->private_data_len)
+		kfree(iw_event->private_data);
+
+	/* Wake up waiters on connect complete */
+	wake_up_all(&cm_id_priv->connect_wait);
+
+	return ret;
+}
+
+/*
+ * CM_ID <-- CLOSING
+ *
+ * A DISCONNECT event only has an effect on an ESTABLISHED cm_id, which is
+ * moved to CLOSING; in every other state the event is ignored.  The event
+ * payload itself is not inspected.
+ */
+static void cm_disconnect_handler(struct iwcm_id_private *cm_id_priv,
+				  struct iw_cm_event *iw_event)
+{
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&cm_id_priv->lock, irq_flags);
+	switch (cm_id_priv->state) {
+	case IW_CM_STATE_ESTABLISHED:
+		cm_id_priv->state = IW_CM_STATE_CLOSING;
+		break;
+	default:
+		break;
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, irq_flags);
+}
+
+/*
+ * CM_ID <-- IDLE
+ *
+ * If in the ESTABLISHED or CLOSING states, the QP will have been
+ * moved by the provider to the ERR state. Disassociate the CM_ID from
+ * the QP, move to IDLE, and remove the 'connected' reference.
+ *
+ * If in some other state, the cm_id was destroyed asynchronously.
+ * This is the last reference that will result in waking up
+ * the app thread blocked in iw_destroy_cm_id.
+ *
+ * Returns the client cm_handler's return value when the handler is
+ * called (ESTABLISHED/CLOSING), otherwise 0.
+ */
+static int cm_close_handler(struct iwcm_id_private *cm_id_priv,
+				  struct iw_cm_event *iw_event)
+{
+	unsigned long flags;
+	int ret = 0;
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+
+	/* The QP reference is released regardless of the current state. */
+	if (cm_id_priv->qp) {
+		cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp);
+		cm_id_priv->qp = NULL;
+	}
+	switch (cm_id_priv->state) {
+	case IW_CM_STATE_ESTABLISHED:
+	case IW_CM_STATE_CLOSING:
+		cm_id_priv->state = IW_CM_STATE_IDLE;
+		/* The client handler runs without the lock held. */
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
+		spin_lock_irqsave(&cm_id_priv->lock, flags);
+		break;
+	case IW_CM_STATE_DESTROYING:
+		/* Nothing to report to the client in this state. */
+		break;
+	default:
+		BUG();
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	return ret;
+}
+
+/*
+ * Dispatch one queued event to its state-machine handler.  Handlers that
+ * have no return value (connect request, disconnect) report 0; the others
+ * pass their handler's result through.  An unknown event type is a BUG.
+ */
+static int process_event(struct iwcm_id_private *cm_id_priv,
+			 struct iw_cm_event *iw_event)
+{
+	switch (iw_event->event) {
+	case IW_CM_EVENT_CONNECT_REQUEST:
+		cm_conn_req_handler(cm_id_priv, iw_event);
+		return 0;
+	case IW_CM_EVENT_CONNECT_REPLY:
+		return cm_conn_rep_handler(cm_id_priv, iw_event);
+	case IW_CM_EVENT_ESTABLISHED:
+		return cm_conn_est_handler(cm_id_priv, iw_event);
+	case IW_CM_EVENT_DISCONNECT:
+		cm_disconnect_handler(cm_id_priv, iw_event);
+		return 0;
+	case IW_CM_EVENT_CLOSE:
+		return cm_close_handler(cm_id_priv, iw_event);
+	default:
+		BUG();
+	}
+
+	return 0;
+}
+
+/*
+ * Process events on the work_list for the cm_id. If the callback
+ * function requests that the cm_id be deleted, a flag is set in the
+ * cm_id flags to indicate that when the last reference is
+ * removed, the cm_id is to be destroyed. This is necessary to
+ * distinguish between an object that will be destroyed by the app
+ * thread asleep on the destroy_comp list vs. an object destroyed
+ * here synchronously when the last reference is removed.
+ */
+static void cm_work_handler(struct work_struct *_work)
+{
+	struct iwcm_work *work = container_of(_work, struct iwcm_work, work);
+	struct iw_cm_event levent;
+	struct iwcm_id_private *cm_id_priv = work->cm_id;
+	unsigned long flags;
+	int empty;
+	int ret = 0;
+	int destroy_id;
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	empty = list_empty(&cm_id_priv->work_list);
+	while (!empty) {
+		work = list_entry(cm_id_priv->work_list.next,
+				  struct iwcm_work, list);
+		list_del_init(&work->list);
+		empty = list_empty(&cm_id_priv->work_list);
+		/*
+		 * Copy the event to the stack so the work entry can be
+		 * handed back via put_work() before the lock is dropped.
+		 */
+		levent = work->event;
+		put_work(work);
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+		/* A non-zero result asks for the cm_id to be destroyed. */
+		ret = process_event(cm_id_priv, &levent);
+		if (ret) {
+			set_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
+			destroy_cm_id(&cm_id_priv->id);
+		}
+		BUG_ON(atomic_read(&cm_id_priv->refcount)==0);
+		/* Sample the flag before the deref below. */
+		destroy_id = test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
+		/*
+		 * NOTE(review): a non-zero return from iwcm_deref_id()
+		 * presumably means the last reference was dropped -- its
+		 * definition is not visible here; confirm.
+		 */
+		if (iwcm_deref_id(cm_id_priv)) {
+			if (destroy_id) {
+				BUG_ON(!list_empty(&cm_id_priv->work_list));
+				free_cm_id(cm_id_priv);
+			}
+			return;
+		}
+		spin_lock_irqsave(&cm_id_priv->lock, flags);
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+}
+
+/*
+ * Provider upcall, invoked in interrupt context.  The event is not
+ * delivered here; it is copied into a work entry and appended to the
+ * per-cm_id work_list so that the iwcm_wq thread can run the client
+ * callback, which may downcall into the CM and/or block.  Only the
+ * first event on an empty work_list queues the work element on iwcm_wq.
+ *
+ * Every queued event holds a reference on the cm_id, so the cm_id cannot
+ * be deleted until the last posted event has been delivered and processed.
+ *
+ * Returns:
+ * 	      0	- the event was handled.
+ *	-ENOMEM	- the event was not handled due to lack of resources.
+ */
+static int cm_event_handler(struct iw_cm_id *cm_id,
+			     struct iw_cm_event *iw_event)
+{
+	struct iwcm_id_private *priv;
+	struct iwcm_work *entry;
+	unsigned long irq_flags;
+	int was_empty;
+	int rc = 0;
+
+	priv = container_of(cm_id, struct iwcm_id_private, id);
+
+	spin_lock_irqsave(&priv->lock, irq_flags);
+
+	entry = get_work(priv);
+	if (entry == NULL) {
+		rc = -ENOMEM;
+		goto unlock;
+	}
+
+	INIT_WORK(&entry->work, cm_work_handler);
+	entry->cm_id = priv;
+	entry->event = *iw_event;
+
+	/* Connect request/reply events carry private data that must be copied. */
+	if (entry->event.private_data_len &&
+	    (entry->event.event == IW_CM_EVENT_CONNECT_REQUEST ||
+	     entry->event.event == IW_CM_EVENT_CONNECT_REPLY)) {
+		rc = copy_private_data(&entry->event);
+		if (rc) {
+			put_work(entry);
+			goto unlock;
+		}
+	}
+
+	atomic_inc(&priv->refcount);
+
+	was_empty = list_empty(&priv->work_list);
+	list_add_tail(&entry->list, &priv->work_list);
+	if (was_empty)
+		queue_work(iwcm_wq, &entry->work);
+unlock:
+	spin_unlock_irqrestore(&priv->lock, irq_flags);
+	return rc;
+}
+
+/*
+ * Fill in the QP attributes and mask for the INIT/RTR transition.
+ * Succeeds (0) in the IDLE, CONN_SENT, CONN_RECV and ESTABLISHED states;
+ * any other cm_id state yields -EINVAL and leaves the outputs untouched.
+ */
+static int iwcm_init_qp_init_attr(struct iwcm_id_private *cm_id_priv,
+				  struct ib_qp_attr *qp_attr,
+				  int *qp_attr_mask)
+{
+	unsigned long irq_flags;
+	int rc = -EINVAL;
+
+	spin_lock_irqsave(&cm_id_priv->lock, irq_flags);
+	switch (cm_id_priv->state) {
+	case IW_CM_STATE_IDLE:
+	case IW_CM_STATE_CONN_SENT:
+	case IW_CM_STATE_CONN_RECV:
+	case IW_CM_STATE_ESTABLISHED:
+		*qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS;
+		qp_attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE |
+					   IB_ACCESS_REMOTE_READ;
+		rc = 0;
+		break;
+	default:
+		break;
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, irq_flags);
+
+	return rc;
+}
+
+/*
+ * Fill in the attribute mask for the RTS transition: no attributes are
+ * requested (mask 0) and qp_attr is not modified.  Succeeds (0) in the
+ * IDLE, CONN_SENT, CONN_RECV and ESTABLISHED states; any other cm_id
+ * state yields -EINVAL and leaves the mask untouched.
+ */
+static int iwcm_init_qp_rts_attr(struct iwcm_id_private *cm_id_priv,
+				  struct ib_qp_attr *qp_attr,
+				  int *qp_attr_mask)
+{
+	unsigned long irq_flags;
+	int rc = -EINVAL;
+
+	spin_lock_irqsave(&cm_id_priv->lock, irq_flags);
+	switch (cm_id_priv->state) {
+	case IW_CM_STATE_IDLE:
+	case IW_CM_STATE_CONN_SENT:
+	case IW_CM_STATE_CONN_RECV:
+	case IW_CM_STATE_ESTABLISHED:
+		*qp_attr_mask = 0;
+		rc = 0;
+		break;
+	default:
+		break;
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, irq_flags);
+
+	return rc;
+}
+
+/*
+ * Compute the QP attributes and mask needed to move a QP to the state
+ * requested in qp_attr->qp_state.  INIT and RTR share one helper, RTS has
+ * its own; any other requested state yields -EINVAL.
+ */
+int iw_cm_init_qp_attr(struct iw_cm_id *cm_id,
+		       struct ib_qp_attr *qp_attr,
+		       int *qp_attr_mask)
+{
+	struct iwcm_id_private *priv =
+		container_of(cm_id, struct iwcm_id_private, id);
+
+	switch (qp_attr->qp_state) {
+	case IB_QPS_INIT:
+	case IB_QPS_RTR:
+		return iwcm_init_qp_init_attr(priv, qp_attr, qp_attr_mask);
+	case IB_QPS_RTS:
+		return iwcm_init_qp_rts_attr(priv, qp_attr, qp_attr_mask);
+	default:
+		return -EINVAL;
+	}
+}
+EXPORT_SYMBOL(iw_cm_init_qp_attr);
+
+/*
+ * Module init: create the single-threaded workqueue on which queued CM
+ * events are processed.  Returns -ENOMEM if it cannot be created.
+ */
+static int __init iw_cm_init(void)
+{
+	iwcm_wq = create_singlethread_workqueue("iw_cm_wq");
+
+	return iwcm_wq ? 0 : -ENOMEM;
+}
+
+/* Module exit: tear down the workqueue created by iw_cm_init(). */
+static void __exit iw_cm_cleanup(void)
+{
+	destroy_workqueue(iwcm_wq);
+}
+
+module_init(iw_cm_init);
+module_exit(iw_cm_cleanup);
diff --git a/sys/ofed/drivers/infiniband/core/iwcm.h b/sys/ofed/drivers/infiniband/core/iwcm.h
new file mode 100644
index 0000000..3f6cc82
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/iwcm.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2005 Network Appliance, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef IWCM_H
+#define IWCM_H
+
+/* Connection-manager state of a cm_id (stored in iwcm_id_private.state). */
+enum iw_cm_state {
+	IW_CM_STATE_IDLE,             /* unbound, inactive */
+	IW_CM_STATE_LISTEN,           /* listen waiting for connect */
+	IW_CM_STATE_CONN_RECV,        /* inbound waiting for user accept */
+	IW_CM_STATE_CONN_SENT,        /* outbound waiting for peer accept */
+	IW_CM_STATE_ESTABLISHED,      /* established */
+	IW_CM_STATE_CLOSING,	      /* disconnect */
+	IW_CM_STATE_DESTROYING        /* object being deleted */
+};
+
+/*
+ * CM-private wrapper around the public iw_cm_id.  Code recovers it from
+ * an iw_cm_id pointer with container_of(..., id).
+ */
+struct iwcm_id_private {
+	struct iw_cm_id	id;		/* public part handed to clients */
+	enum iw_cm_state state;		/* current CM state */
+	unsigned long flags;		/* IWCM_F_* bits, used with set_bit/test_bit */
+	struct ib_qp *qp;		/* QP attached to this cm_id, or NULL */
+	struct completion destroy_comp;	/* waited on while destroying the cm_id */
+	wait_queue_head_t connect_wait;	/* waiters for IWCM_F_CONNECT_WAIT to clear */
+	struct list_head work_list;	/* events queued for the work handler */
+	spinlock_t lock;		/* taken around state, qp and work_list updates */
+	atomic_t refcount;		/* references held on this cm_id */
+	struct list_head work_free_list; /* NOTE(review): presumably spare work entries -- confirm */
+};
+
+/* Bit numbers (not masks) within iwcm_id_private.flags. */
+#define IWCM_F_CALLBACK_DESTROY   1
+#define IWCM_F_CONNECT_WAIT       2
+
+#endif /* IWCM_H */
diff --git a/sys/ofed/drivers/infiniband/core/local_sa.c b/sys/ofed/drivers/infiniband/core/local_sa.c
new file mode 100644
index 0000000..eb62c42
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/local_sa.c
@@ -0,0 +1,1273 @@
+/*
+ * Copyright (c) 2006 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/rbtree.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/miscdevice.h>
+#include <linux/random.h>
+
+#include <rdma/ib_cache.h>
+#include <rdma/ib_sa.h>
+#include "sa.h"
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("InfiniBand subnet administration caching");
+MODULE_LICENSE("Dual BSD/GPL");
+
+/*
+ * Tunable limits for the local SA cache.  The retry timer values are
+ * presumably in milliseconds (4000 is annotated as 4 sec).
+ */
+enum {
+	SA_DB_MAX_PATHS_PER_DEST = 0x7F,
+	SA_DB_MIN_RETRY_TIMER	 = 4000,  /*   4 sec */
+	SA_DB_MAX_RETRY_TIMER	 = 256000 /* 256 sec */
+};
+
+static int set_paths_per_dest(const char *val, struct kernel_param *kp);
+static unsigned long paths_per_dest = 0;
+module_param_call(paths_per_dest, set_paths_per_dest, param_get_ulong,
+		  &paths_per_dest, 0644);
+MODULE_PARM_DESC(paths_per_dest, "Maximum number of paths to retrieve "
+				 "to each destination (DGID).  Set to 0 "
+				 "to disable cache.");
+
+static int set_subscribe_inform_info(const char *val, struct kernel_param *kp);
+static char subscribe_inform_info = 1;
+module_param_call(subscribe_inform_info, set_subscribe_inform_info,
+		  param_get_bool, &subscribe_inform_info, 0644);
+MODULE_PARM_DESC(subscribe_inform_info,
+		 "Subscribe for SA InformInfo/Notice events.");
+
+static int do_refresh(const char *val, struct kernel_param *kp);
+module_param_call(refresh, do_refresh, NULL, NULL, 0200);
+
+static unsigned long retry_timer = SA_DB_MIN_RETRY_TIMER;
+
+enum sa_db_lookup_method {
+	SA_DB_LOOKUP_LEAST_USED,
+	SA_DB_LOOKUP_RANDOM
+};
+
+static int set_lookup_method(const char *val, struct kernel_param *kp);
+static int get_lookup_method(char *buf, struct kernel_param *kp);
+static unsigned long lookup_method;
+module_param_call(lookup_method, set_lookup_method, get_lookup_method,
+		  &lookup_method, 0644);
+MODULE_PARM_DESC(lookup_method, "Method used to return path records when "
+				"multiple paths exist to a given destination.");
+
+static void sa_db_add_dev(struct ib_device *device);
+static void sa_db_remove_dev(struct ib_device *device);
+
+static struct ib_client sa_db_client = {
+	.name   = "local_sa",
+	.add    = sa_db_add_dev,
+	.remove = sa_db_remove_dev
+};
+
+/* File-scope state shared by the whole module. */
+static LIST_HEAD(dev_list);
+static DEFINE_MUTEX(lock);
+/* Write-locked around every modification of a port's path tree. */
+static rwlock_t rwlock;
+static struct workqueue_struct *sa_wq;
+static struct ib_sa_client sa_client;
+
+enum sa_db_state {
+	SA_DB_IDLE,
+	SA_DB_REFRESH,
+	SA_DB_DESTROY
+};
+
+/*
+ * Per-port cache state.
+ * NOTE(review): 'paths' presumably holds ib_sa_attr_list nodes keyed by
+ * GID -- the code that passes it to insert_attr() is not visible here.
+ */
+struct sa_db_port {
+	struct sa_db_device	*dev;
+	struct ib_mad_agent	*agent;
+	/* Limit number of outstanding MADs to SA to reduce SA flooding */
+	struct ib_mad_send_buf	*msg;
+	u16			sm_lid;
+	u8			sm_sl;
+	struct ib_inform_info	*in_info;
+	struct ib_inform_info	*out_info;
+	struct rb_root		paths;
+	struct list_head	update_list;
+	unsigned long		update_id;
+	enum sa_db_state	state;
+	struct work_struct	work;
+	union ib_gid		gid;
+	int			port_num;
+};
+
+/*
+ * Per-device state.  'port' is a trailing variable-length array, so the
+ * structure must be allocated with room for the ports that follow
+ * (presumably port_count entries -- confirm at the allocation site).
+ */
+struct sa_db_device {
+	struct list_head	list;
+	struct ib_device	*device;
+	struct ib_event_handler event_handler;
+	int			start_port;
+	int			port_count;
+	struct sa_db_port	port[0];
+};
+
+struct ib_sa_iterator {
+	struct ib_sa_iterator	*next;
+};
+
+struct ib_sa_attr_iter {
+	struct ib_sa_iterator	*iter;
+	unsigned long		flags;
+};
+
+/*
+ * One rb-tree node per GID, heading a singly linked list of attributes.
+ * 'iter' is the list head (iter.next is the first attribute) and 'tail'
+ * points at the last element, or at 'iter' itself when the list is empty.
+ */
+struct ib_sa_attr_list {
+	struct ib_sa_iterator	iter;
+	struct ib_sa_iterator	*tail;
+	/* NOTE(review): int here, but compared against unsigned long update ids. */
+	int			update_id;
+	union ib_gid		gid;
+	struct rb_node		node;
+};
+
+/*
+ * A cached path record linked into an ib_sa_attr_list.  'iter' must stay
+ * the first member: list elements are freed through their
+ * ib_sa_iterator pointer.
+ */
+struct ib_path_rec_info {
+	struct ib_sa_iterator	iter; /* keep first */
+	struct ib_sa_path_rec	rec;
+	unsigned long		lookups;
+};
+
+/*
+ * Cursor over the attributes carried in a received SA MAD.  Allocated
+ * with attr_size extra bytes for the trailing attr_data[] buffer.
+ */
+struct ib_sa_mad_iter {
+	struct ib_mad_recv_wc	*recv_wc;	/* the MAD being walked */
+	struct ib_mad_recv_buf	*recv_buf;	/* current receive buffer */
+	int			attr_size;	/* bytes per attribute (64 for path records) */
+	int			attr_offset;	/* SA header attr_offset scaled by 8 */
+	int			data_offset;
+	int			data_left;	/* starts at mad_len - IB_MGMT_SA_HDR */
+	void			*attr;
+	u8			attr_data[0];
+};
+
+enum sa_update_type {
+	SA_UPDATE_FULL,
+	SA_UPDATE_ADD,
+	SA_UPDATE_REMOVE
+};
+
+struct update_info {
+	struct list_head	list;
+	union ib_gid		gid;
+	enum sa_update_type	type;
+};
+
+struct sa_path_request {
+	struct work_struct	work;
+	struct ib_sa_client	*client;
+	void			(*callback)(int, struct ib_sa_path_rec *, void *);
+	void			*context;
+	struct ib_sa_path_rec	path_rec;
+};
+
+static void process_updates(struct sa_db_port *port);
+
+/*
+ * Free every attribute linked behind the list head and reset the list
+ * to empty (tail pointing back at the embedded head).  The head itself
+ * is not freed.
+ */
+static void free_attr_list(struct ib_sa_attr_list *attr_list)
+{
+	struct ib_sa_iterator *victim;
+
+	while ((victim = attr_list->iter.next) != NULL) {
+		attr_list->iter.next = victim->next;
+		kfree(victim);
+	}
+	attr_list->tail = &attr_list->iter;
+}
+
+/*
+ * Unlink attr_list from the tree, free all attributes hanging off it and
+ * then free the list head itself.  The callers visible here hold rwlock
+ * for writing.
+ */
+static void remove_attr(struct rb_root *root, struct ib_sa_attr_list *attr_list)
+{
+	rb_erase(&attr_list->node, root);
+	free_attr_list(attr_list);
+	kfree(attr_list);
+}
+
+/* Empty the tree, freeing every attribute list, under the write lock. */
+static void remove_all_attrs(struct rb_root *root)
+{
+	struct rb_node *cur, *following;
+
+	write_lock_irq(&rwlock);
+	cur = rb_first(root);
+	while (cur) {
+		/* Fetch the successor first: remove_attr() frees the node. */
+		following = rb_next(cur);
+		remove_attr(root, rb_entry(cur, struct ib_sa_attr_list, node));
+		cur = following;
+	}
+	write_unlock_irq(&rwlock);
+}
+
+/*
+ * Under the write lock, drop every attribute list whose update_id differs
+ * from the given one; lists stamped with update_id are kept.
+ */
+static void remove_old_attrs(struct rb_root *root, unsigned long update_id)
+{
+	struct rb_node *cur, *following;
+	struct ib_sa_attr_list *entry;
+
+	write_lock_irq(&rwlock);
+	cur = rb_first(root);
+	while (cur) {
+		/* Fetch the successor first: remove_attr() frees the node. */
+		following = rb_next(cur);
+		entry = rb_entry(cur, struct ib_sa_attr_list, node);
+		if (entry->update_id != update_id)
+			remove_attr(root, entry);
+		cur = following;
+	}
+	write_unlock_irq(&rwlock);
+}
+
+/*
+ * Link attr_list into the tree, ordered by a memcmp() of the GIDs.
+ * Returns NULL when the node was inserted.  If a node with the same GID
+ * is already present nothing is inserted and that existing node is
+ * returned (a valid pointer, not an ERR_PTR).
+ */
+static struct ib_sa_attr_list *insert_attr_list(struct rb_root *root,
+						struct ib_sa_attr_list *attr_list)
+{
+	struct rb_node **slot = &root->rb_node;
+	struct rb_node *above = NULL;
+	struct ib_sa_attr_list *seen;
+	int order;
+
+	while (*slot != NULL) {
+		above = *slot;
+		seen = rb_entry(above, struct ib_sa_attr_list, node);
+		order = memcmp(&seen->gid, &attr_list->gid,
+			       sizeof attr_list->gid);
+		if (order == 0)
+			return seen;
+
+		/* Same descent direction as find_attr_list(). */
+		slot = (order < 0) ? &(*slot)->rb_left : &(*slot)->rb_right;
+	}
+
+	rb_link_node(&attr_list->node, above, slot);
+	rb_insert_color(&attr_list->node, root);
+	return NULL;
+}
+
+/*
+ * Look up the attribute list stored under the given raw GID.
+ * Returns the matching node, or NULL when the GID is not in the tree.
+ */
+static struct ib_sa_attr_list *find_attr_list(struct rb_root *root, u8 *gid)
+{
+	struct rb_node *cur;
+	struct ib_sa_attr_list *entry;
+	int order;
+
+	for (cur = root->rb_node; cur != NULL;
+	     cur = (order < 0) ? cur->rb_left : cur->rb_right) {
+		entry = rb_entry(cur, struct ib_sa_attr_list, node);
+		order = memcmp(&entry->gid, gid, sizeof entry->gid);
+		if (order == 0)
+			return entry;
+	}
+
+	return NULL;
+}
+
+/*
+ * Append 'iter' to the attribute list stored under 'key' (a raw GID),
+ * creating the list if it does not exist yet.
+ *
+ * A list left over from an earlier update (different update_id) is
+ * emptied and re-stamped before the new attribute is appended.
+ *
+ * The list head is allocated with GFP_KERNEL, so the write lock is
+ * dropped around the allocation.  If another context inserted a list for
+ * the same GID in that window, insert_attr_list() hands back that
+ * existing node; it is a valid pointer rather than an ERR_PTR, so it is
+ * reused and the fresh allocation is discarded.
+ *
+ * Returns 0 on success or -ENOMEM if the list head cannot be allocated.
+ * On failure 'iter' is not linked anywhere and stays owned by the caller.
+ */
+static int insert_attr(struct rb_root *root, unsigned long update_id, void *key,
+		       struct ib_sa_iterator *iter)
+{
+	struct ib_sa_attr_list *attr_list;
+	struct ib_sa_attr_list *new_list;
+
+	write_lock_irq(&rwlock);
+	attr_list = find_attr_list(root, key);
+	if (!attr_list) {
+		write_unlock_irq(&rwlock);
+		new_list = kmalloc(sizeof *new_list, GFP_KERNEL);
+		if (!new_list)
+			return -ENOMEM;
+
+		new_list->iter.next = NULL;
+		new_list->tail = &new_list->iter;
+		new_list->update_id = update_id;
+		memcpy(new_list->gid.raw, key, sizeof new_list->gid);
+
+		write_lock_irq(&rwlock);
+		attr_list = insert_attr_list(root, new_list);
+		if (attr_list) {
+			/* Lost the race: use the list that is already there. */
+			kfree(new_list);
+		} else {
+			attr_list = new_list;
+		}
+	}
+
+	/* Stale contents from a previous update are dropped first. */
+	if (attr_list->update_id != update_id) {
+		free_attr_list(attr_list);
+		attr_list->update_id = update_id;
+	}
+
+	attr_list->tail->next = iter;
+	iter->next = NULL;
+	attr_list->tail = iter;
+	write_unlock_irq(&rwlock);
+	return 0;
+}
+
+/*
+ * Allocate an iterator over the path records carried by @mad_recv_wc.
+ *
+ * The per-attribute stride is taken from the SA header's attr_offset field
+ * (multiplied by 8); the record length is fixed at 64 bytes.
+ *
+ * Returns ERR_PTR(-EINVAL) if the stride is smaller than the record length,
+ * ERR_PTR(-ENOMEM) if allocation fails, otherwise the new iterator, which
+ * the caller releases with ib_sa_iter_free().
+ */
+static struct ib_sa_mad_iter *ib_sa_iter_create(struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_sa_mad *sa_mad;
+	struct ib_sa_mad_iter *it;
+	int rec_len = 64;		/* path record length */
+	int stride;
+
+	sa_mad = (struct ib_sa_mad *) mad_recv_wc->recv_buf.mad;
+	stride = be16_to_cpu(sa_mad->sa_hdr.attr_offset) * 8;
+	if (stride < rec_len)
+		return ERR_PTR(-EINVAL);
+
+	/* rec_len extra bytes are allocated together with the iterator. */
+	it = kzalloc(sizeof *it + rec_len, GFP_KERNEL);
+	if (!it)
+		return ERR_PTR(-ENOMEM);
+
+	it->recv_wc = mad_recv_wc;
+	it->recv_buf = &mad_recv_wc->recv_buf;
+	it->data_left = mad_recv_wc->mad_len - IB_MGMT_SA_HDR;
+	it->attr_size = rec_len;
+	it->attr_offset = stride;
+	return it;
+}
+
+/*
+ * Release an iterator obtained from ib_sa_iter_create().  Only the iterator
+ * allocation itself is freed; the receive work completion it refers to is
+ * left untouched.
+ */
+static void ib_sa_iter_free(struct ib_sa_mad_iter *iter)
+{
+	kfree(iter);
+}
+
+/*
+ * Return a pointer to the next attribute of the response, or NULL when
+ * fewer than attr_offset bytes remain.
+ *
+ * An attribute that lies entirely inside one receive buffer is returned as
+ * a pointer into that buffer.  An attribute that straddles two buffers is
+ * assembled in iter->attr_data and a pointer to that copy is returned.
+ */
+static void *ib_sa_iter_next(struct ib_sa_mad_iter *iter)
+{
+	struct ib_sa_mad *mad;
+	/* offset != 0 means the first piece of a split attribute was copied */
+	int left, offset = 0;
+
+	while (iter->data_left >= iter->attr_offset) {
+		while (iter->data_offset < IB_MGMT_SA_DATA) {
+			mad = (struct ib_sa_mad *) iter->recv_buf->mad;
+
+			/* bytes remaining in the current buffer's data area */
+			left = IB_MGMT_SA_DATA - iter->data_offset;
+			if (left < iter->attr_size) {
+				/* copy first piece of the attribute */
+				iter->attr = &iter->attr_data;
+				memcpy(iter->attr,
+				       &mad->data[iter->data_offset], left);
+				offset = left;
+				/* leave the inner loop to advance to the next buffer */
+				break;
+			} else if (offset) {
+				/* copy the second piece of the attribute */
+				memcpy(iter->attr + offset, &mad->data[0],
+				       iter->attr_size - offset);
+				iter->data_offset = iter->attr_size - offset;
+				offset = 0;
+			} else {
+				/* attribute is contiguous: point into the buffer */
+				iter->attr = &mad->data[iter->data_offset];
+				iter->data_offset += iter->attr_size;
+			}
+
+			/* one attribute consumed: account for its full stride */
+			iter->data_left -= iter->attr_offset;
+			goto out;
+		}
+		/* current buffer exhausted: restart at the next one in the list */
+		iter->data_offset = 0;
+		iter->recv_buf = list_entry(iter->recv_buf->list.next,
+					    struct ib_mad_recv_buf, list);
+	}
+	iter->attr = NULL;
+out:
+	return iter->attr;
+}
+
+/*
+ * Copy path records from a received response and insert them into our cache.
+ * Path records in the MADs are in network order, packed, and may span
+ * multiple MAD buffers, just to make our life hard.
+ *
+ * @port:        port whose path cache is updated
+ * @mad_recv_wc: received response holding the records
+ * @type:        kind of update the response answers; for SA_UPDATE_FULL the
+ *               port's update_id is advanced first and lists that were not
+ *               refreshed are removed afterwards
+ *
+ * Processing stops silently at the first allocation or insertion failure.
+ */
+static void update_path_db(struct sa_db_port *port,
+			   struct ib_mad_recv_wc *mad_recv_wc,
+			   enum sa_update_type type)
+{
+	struct ib_sa_mad_iter *iter;
+	struct ib_path_rec_info *path_info;
+	void *attr;
+	int ret;
+
+	iter = ib_sa_iter_create(mad_recv_wc);
+	if (IS_ERR(iter))
+		return;
+
+	port->update_id += (type == SA_UPDATE_FULL);
+
+	/*
+	 * Zeroed allocation: get_next_path() reads and increments
+	 * path_info->lookups, which nothing below initialises.
+	 */
+	while ((attr = ib_sa_iter_next(iter)) &&
+	       (path_info = kzalloc(sizeof *path_info, GFP_KERNEL))) {
+
+		ib_sa_unpack_attr(&path_info->rec, attr, IB_SA_ATTR_PATH_REC);
+
+		ret = insert_attr(&port->paths, port->update_id,
+				  path_info->rec.dgid.raw, &path_info->iter);
+		if (ret) {
+			/* not linked into the cache, so still ours to free */
+			kfree(path_info);
+			break;
+		}
+	}
+	ib_sa_iter_free(iter);
+
+	if (type == SA_UPDATE_FULL)
+		remove_old_attrs(&port->paths, port->update_id);
+}
+
+/*
+ * Build a send MAD addressed to the subnet manager recorded in @port.
+ *
+ * An address handle is created from the port's sm_lid, sm_sl and port_num
+ * and attached to the message.  context[0] and context[1] are set to @port
+ * and @update, the timeout comes from retry_timer and retries is 0.
+ *
+ * Returns the message, or NULL if either the MAD buffer or the address
+ * handle could not be created (the specific error is not propagated).
+ */
+static struct ib_mad_send_buf *get_sa_msg(struct sa_db_port *port,
+					  struct update_info *update)
+{
+	struct ib_mad_send_buf *buf;
+	struct ib_ah_attr attr;
+
+	buf = ib_create_send_mad(port->agent, 1, 0, 0, IB_MGMT_SA_HDR,
+				 IB_MGMT_SA_DATA, GFP_KERNEL);
+	if (IS_ERR(buf))
+		return NULL;
+
+	memset(&attr, 0, sizeof attr);
+	attr.port_num = port->port_num;
+	attr.sl = port->sm_sl;
+	attr.dlid = port->sm_lid;
+
+	buf->ah = ib_create_ah(port->agent->qp->pd, &attr);
+	if (IS_ERR(buf->ah)) {
+		ib_free_send_mad(buf);
+		return NULL;
+	}
+
+	buf->context[0] = port;
+	buf->context[1] = update;
+	buf->retries = 0;
+	buf->timeout_ms = retry_timer;
+	return buf;
+}
+
+/*
+ * Compose a big-endian 64-bit transaction ID: @hi_tid forms the upper 32
+ * bits and a function-local atomic counter, incremented on every call,
+ * forms the lower 32 bits.
+ */
+static __be64 form_tid(u32 hi_tid)
+{
+	static atomic_t tid;
+	u32 low = (u32) atomic_inc_return(&tid);
+	u64 high = ((u64) hi_tid) << 32;
+
+	return cpu_to_be64(high | low);
+}
+
+/*
+ * Fill @msg with a SubnAdmGetTable request for path records.
+ *
+ * The request always selects on SGID (the port's GID) and NumbPath
+ * (paths_per_dest).  For an SA_UPDATE_ADD update it additionally selects on
+ * the DGID stored in @update.
+ */
+static void format_path_req(struct sa_db_port *port,
+			    struct update_info *update,
+			    struct ib_mad_send_buf *msg)
+{
+	struct ib_sa_mad *mad = msg->mad;
+	struct ib_sa_path_rec path_rec;
+
+	mad->mad_hdr.base_version  = IB_MGMT_BASE_VERSION;
+	mad->mad_hdr.mgmt_class	   = IB_MGMT_CLASS_SUBN_ADM;
+	mad->mad_hdr.class_version = IB_SA_CLASS_VERSION;
+	mad->mad_hdr.method	   = IB_SA_METHOD_GET_TABLE;
+	mad->mad_hdr.attr_id	   = cpu_to_be16(IB_SA_ATTR_PATH_REC);
+	mad->mad_hdr.tid	   = form_tid(msg->mad_agent->hi_tid);
+
+	mad->sa_hdr.comp_mask = IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_NUMB_PATH;
+
+	/*
+	 * Only a few fields are set below, but the whole record is handed to
+	 * ib_sa_pack_attr().  Clear it first so that no uninitialised stack
+	 * bytes end up in the outgoing MAD.
+	 */
+	memset(&path_rec, 0, sizeof path_rec);
+	path_rec.sgid = port->gid;
+	path_rec.numb_path = (u8) paths_per_dest;
+
+	if (update->type == SA_UPDATE_ADD) {
+		mad->sa_hdr.comp_mask |= IB_SA_PATH_REC_DGID;
+		memcpy(&path_rec.dgid, &update->gid, sizeof path_rec.dgid);
+	}
+
+	ib_sa_pack_attr(mad->data, &path_rec, IB_SA_ATTR_PATH_REC);
+}
+
+/*
+ * Build and post the path-record query for @update on @port.
+ *
+ * The message is stored in port->msg.  Returns 0 once the MAD has been
+ * posted, -ENOMEM if the message could not be built, or the error from
+ * ib_post_send_mad(); in the latter case the address handle and the
+ * message are released here.
+ */
+static int send_query(struct sa_db_port *port,
+		      struct update_info *update)
+{
+	int ret;
+
+	port->msg = get_sa_msg(port, update);
+	if (!port->msg)
+		return -ENOMEM;
+
+	format_path_req(port, update, port->msg);
+
+	ret = ib_post_send_mad(port->msg, NULL);
+	if (!ret)
+		return 0;
+
+	/* Posting failed: undo what get_sa_msg() set up. */
+	ib_destroy_ah(port->msg->ah);
+	ib_free_send_mad(port->msg);
+	return ret;
+}
+
+/*
+ * Queue an update of kind @type for @port and start processing if the port
+ * is idle.
+ *
+ * @gid may be NULL, in which case the update's gid field is left unset.
+ * The new entry is added at the head of port->update_list.  If the
+ * allocation fails nothing is queued, but an idle port is still moved to
+ * SA_DB_REFRESH and process_updates() is still run.
+ */
+static void add_update(struct sa_db_port *port, u8 *gid,
+		       enum sa_update_type type)
+{
+	struct update_info *entry = kmalloc(sizeof *entry, GFP_KERNEL);
+
+	if (entry) {
+		entry->type = type;
+		if (gid)
+			memcpy(&entry->gid, gid, sizeof entry->gid);
+		list_add(&entry->list, &port->update_list);
+	}
+
+	if (port->state != SA_DB_IDLE)
+		return;
+
+	port->state = SA_DB_REFRESH;
+	process_updates(port);
+}
+
+/*
+ * Unlink and free every pending update on port->update_list, leaving the
+ * list empty.
+ */
+static void clean_update_list(struct sa_db_port *port)
+{
+	struct list_head *head = &port->update_list;
+	struct update_info *entry;
+
+	while (!list_empty(head)) {
+		entry = list_entry(head->next, struct update_info, list);
+		list_del(&entry->list);
+		kfree(entry);
+	}
+}
+
+/*
+ * Callback registered with ib_sa_register_inform_info() for the GID
+ * in-service and out-of-service traps.
+ *
+ * @status: completion/registration status passed in by the SA layer
+ * @info:   the inform-info registration; info->context is the port
+ * @notice: the received notice, or NULL when the call only reports status
+ *
+ * - With a notice, an ADD (in-service trap) or REMOVE (any other trap)
+ *   update for the reported GID is queued.
+ * - Without a notice and with status -ENETRESET, the port's registration
+ *   pointer is cleared.
+ * - Otherwise the registration pointer is replaced by ERR_PTR(-EINVAL) when
+ *   status is non-zero, pending updates are dropped, the port goes idle and
+ *   its work item is queued.
+ *
+ * Returns 0 if the port is being destroyed or has no registration recorded
+ * for this trap, otherwise @status.
+ */
+static int notice_handler(int status, struct ib_inform_info *info,
+			  struct ib_sa_notice *notice)
+{
+	struct sa_db_port *port = info->context;
+	struct ib_sa_notice_data_gid *gid_data;
+	struct ib_inform_info **slot;
+	enum sa_update_type type;
+
+	if (info->trap_number == IB_SA_SM_TRAP_GID_IN_SERVICE) {
+		slot = &port->in_info;
+		type = SA_UPDATE_ADD;
+	} else {
+		slot = &port->out_info;
+		type = SA_UPDATE_REMOVE;
+	}
+
+	mutex_lock(&lock);
+	if (port->state == SA_DB_DESTROY || !*slot) {
+		mutex_unlock(&lock);
+		return 0;
+	}
+
+	if (notice) {
+		gid_data = (struct ib_sa_notice_data_gid *)
+			   &notice->data_details;
+		add_update(port, gid_data->gid, type);
+		mutex_unlock(&lock);
+		return status;
+	}
+
+	if (status == -ENETRESET) {
+		*slot = NULL;
+		mutex_unlock(&lock);
+		return status;
+	}
+
+	if (status)
+		*slot = ERR_PTR(-EINVAL);
+	port->state = SA_DB_IDLE;
+	clean_update_list(port);
+	mutex_unlock(&lock);
+
+	/* Queued only after the mutex has been released. */
+	queue_work(sa_wq, &port->work);
+	return status;
+}
+
+/*
+ * Register @port for GID-in-service traps, with notice_handler() as the
+ * callback.  The result (registration or ERR_PTR) is kept in port->in_info.
+ *
+ * Returns 0 on success or the error encoded in the returned pointer.
+ */
+static int reg_in_info(struct sa_db_port *port)
+{
+	port->in_info = ib_sa_register_inform_info(&sa_client,
+						   port->dev->device,
+						   port->port_num,
+						   IB_SA_SM_TRAP_GID_IN_SERVICE,
+						   GFP_KERNEL, notice_handler,
+						   port);
+
+	return IS_ERR(port->in_info) ? PTR_ERR(port->in_info) : 0;
+}
+
+/*
+ * Register @port for GID-out-of-service traps, with notice_handler() as the
+ * callback.  The result (registration or ERR_PTR) is kept in
+ * port->out_info.
+ *
+ * Returns 0 on success or the error encoded in the returned pointer.
+ */
+static int reg_out_info(struct sa_db_port *port)
+{
+	port->out_info = ib_sa_register_inform_info(&sa_client,
+						    port->dev->device,
+						    port->port_num,
+						    IB_SA_SM_TRAP_GID_OUT_OF_SERVICE,
+						    GFP_KERNEL, notice_handler,
+						    port);
+
+	return IS_ERR(port->out_info) ? PTR_ERR(port->out_info) : 0;
+}
+
+/*
+ * Drop both trap registrations of @port.
+ *
+ * A registration pointer is only passed to ib_sa_unregister_inform_info()
+ * when it is neither NULL nor an ERR_PTR (notice_handler() may have stored
+ * ERR_PTR(-EINVAL) there).  Both pointers are NULL afterwards.
+ */
+static void unsubscribe_port(struct sa_db_port *port)
+{
+	if (port->in_info && !IS_ERR(port->in_info))
+		ib_sa_unregister_inform_info(port->in_info);
+
+	if (port->out_info && !IS_ERR(port->out_info))
+		ib_sa_unregister_inform_info(port->out_info);
+
+	port->in_info = NULL;
+	port->out_info = NULL;
+}
+
+/*
+ * Return @port to an empty state: unsubscribe from traps, discard pending
+ * updates and remove every cached path.
+ */
+static void cleanup_port(struct sa_db_port *port)
+{
+	unsubscribe_port(port);
+	clean_update_list(port);
+	remove_all_attrs(&port->paths);
+}
+
+/*
+ * Refresh the subnet-manager LID and SL cached in @port from the device.
+ *
+ * Returns the error from ib_query_port(), -ENODATA if the port is not in
+ * the IB_PORT_ACTIVE state, or 0 after sm_lid and sm_sl were updated.
+ */
+static int update_port_info(struct sa_db_port *port)
+{
+	struct ib_port_attr attr;
+	int err;
+
+	err = ib_query_port(port->dev->device, port->port_num, &attr);
+	if (err)
+		return err;
+
+	if (attr.state != IB_PORT_ACTIVE)
+		return -ENODATA;
+
+	port->sm_sl = attr.sm_sl;
+	port->sm_lid = attr.sm_lid;
+	return 0;
+}
+
+/*
+ * Work through the pending updates of @port.
+ *
+ * If caching is disabled (paths_per_dest == 0) or the port information
+ * cannot be refreshed, the port is cleaned up and set idle.
+ *
+ * When trap subscription is enabled and a registration is missing, it is
+ * requested first; after a successful request the function returns without
+ * touching the port state (notice_handler() is the registered callback).
+ *
+ * REMOVE updates are applied directly to the cache.  Any other update
+ * results in a query being sent; once one has been posted the function
+ * returns and the update stays at the head of the list.  An update whose
+ * query could not be sent is dropped.  When the list has been drained the
+ * port is set idle.
+ */
+static void process_updates(struct sa_db_port *port)
+{
+	struct ib_sa_attr_list *stale;
+	struct update_info *pending;
+
+	if (!paths_per_dest || update_port_info(port)) {
+		cleanup_port(port);
+		port->state = SA_DB_IDLE;
+		return;
+	}
+
+	/* Event registration is an optimization, so ignore failures. */
+	if (!subscribe_inform_info) {
+		unsubscribe_port(port);
+	} else {
+		if (!port->out_info && !reg_out_info(port))
+			return;
+
+		if (!port->in_info && !reg_in_info(port))
+			return;
+	}
+
+	while (!list_empty(&port->update_list)) {
+		pending = list_entry(port->update_list.next,
+				     struct update_info, list);
+
+		if (pending->type != SA_UPDATE_REMOVE) {
+			/* Posted successfully: leave the update queued. */
+			if (!send_query(port, pending))
+				return;
+		} else {
+			write_lock_irq(&rwlock);
+			stale = find_attr_list(&port->paths,
+					       pending->gid.raw);
+			if (stale)
+				remove_attr(&port->paths, stale);
+			write_unlock_irq(&rwlock);
+		}
+
+		list_del(&pending->list);
+		kfree(pending);
+	}
+
+	port->state = SA_DB_IDLE;
+}
+
+/*
+ * Schedule a full refresh of the path cache of @port.
+ *
+ * Nothing happens for a port that is being destroyed.  If a refresh is
+ * already in progress, its pending updates are discarded and the MAD in
+ * port->msg is cancelled before the full update is queued.
+ */
+static void refresh_port_db(struct sa_db_port *port)
+{
+	switch (port->state) {
+	case SA_DB_DESTROY:
+		return;
+	case SA_DB_REFRESH:
+		clean_update_list(port);
+		ib_cancel_mad(port->agent, port->msg);
+		break;
+	default:
+		break;
+	}
+
+	add_update(port, NULL, SA_UPDATE_FULL);
+}
+
+/*
+ * Schedule a full refresh on each of the dev->port_count ports of @dev.
+ */
+static void refresh_dev_db(struct sa_db_device *dev)
+{
+	int idx = 0;
+
+	while (idx < dev->port_count) {
+		refresh_port_db(&dev->port[idx]);
+		idx++;
+	}
+}
+
+/*
+ * Schedule a full refresh on every device on dev_list.  The callers in this
+ * file hold the global mutex around this call.
+ */
+static void refresh_db(void)
+{
+	struct sa_db_device *entry;
+
+	list_for_each_entry(entry, &dev_list, list) {
+		refresh_dev_db(entry);
+	}
+}
+
+/*
+ * Parameter-set style handler: refresh all devices under the global mutex.
+ * @val and @kp are not used.  Always returns 0.
+ */
+static int do_refresh(const char *val, struct kernel_param *kp)
+{
+	mutex_lock(&lock);
+	refresh_db();
+	mutex_unlock(&lock);
+
+	return 0;
+}
+
+/*
+ * Format the available lookup methods into @buf, one per line, marking the
+ * currently selected one with '*'.  Returns the sprintf() result.
+ */
+static int get_lookup_method(char *buf, struct kernel_param *kp)
+{
+	char least_used_mark = ' ';
+	char random_mark = ' ';
+
+	if (lookup_method == SA_DB_LOOKUP_LEAST_USED)
+		least_used_mark = '*';
+	if (lookup_method == SA_DB_LOOKUP_RANDOM)
+		random_mark = '*';
+
+	return sprintf(buf,
+		       "%c %d round robin\n"
+		       "%c %d random",
+		       least_used_mark, SA_DB_LOOKUP_LEAST_USED,
+		       random_mark, SA_DB_LOOKUP_RANDOM);
+}
+
+/*
+ * Parse @val as an unsigned number (base auto-detected) and select it as
+ * the lookup method.
+ *
+ * Returns 0 when the value is SA_DB_LOOKUP_LEAST_USED or
+ * SA_DB_LOOKUP_RANDOM; any other value leaves the setting unchanged and
+ * yields -EINVAL.
+ */
+static int set_lookup_method(const char *val, struct kernel_param *kp)
+{
+	unsigned long requested = simple_strtoul(val, NULL, 0);
+
+	if (requested != SA_DB_LOOKUP_LEAST_USED &&
+	    requested != SA_DB_LOOKUP_RANDOM)
+		return -EINVAL;
+
+	lookup_method = requested;
+	return 0;
+}
+
+/*
+ * Set the paths_per_dest parameter from @val.
+ *
+ * The value is parsed with param_set_ulong(), clamped to
+ * SA_DB_MAX_PATHS_PER_DEST, and followed by a refresh of all devices.  All
+ * of this happens under the global mutex.  Returns the parse result; on a
+ * parse error no refresh is triggered.
+ */
+static int set_paths_per_dest(const char *val, struct kernel_param *kp)
+{
+	int ret;
+
+	mutex_lock(&lock);
+	ret = param_set_ulong(val, kp);
+	if (!ret) {
+		if (paths_per_dest > SA_DB_MAX_PATHS_PER_DEST)
+			paths_per_dest = SA_DB_MAX_PATHS_PER_DEST;
+		refresh_db();
+	}
+	mutex_unlock(&lock);
+
+	return ret;
+}
+
+/*
+ * Set the boolean subscribe_inform_info parameter from @val and, if it
+ * parsed, refresh all devices.  Returns the parse error or the result of
+ * do_refresh().
+ */
+static int set_subscribe_inform_info(const char *val, struct kernel_param *kp)
+{
+	int ret = param_set_bool(val, kp);
+
+	return ret ? ret : do_refresh(val, kp);
+}
+
+/*
+ * Work-queue handler for sa_db_port.work: refresh the owning port's cache
+ * while holding the global mutex.
+ */
+static void port_work_handler(struct work_struct *work)
+{
+	struct sa_db_port *owner = container_of(work, struct sa_db_port, work);
+
+	mutex_lock(&lock);
+	refresh_port_db(owner);
+	mutex_unlock(&lock);
+}
+
+/*
+ * Device event handler.  For the port-related events listed below the
+ * affected port's work item is queued on sa_wq; every other event is
+ * ignored.
+ */
+static void handle_event(struct ib_event_handler *event_handler,
+			 struct ib_event *event)
+{
+	struct sa_db_device *dev;
+	struct sa_db_port *port;
+
+	switch (event->event) {
+	case IB_EVENT_PORT_ERR:
+	case IB_EVENT_LID_CHANGE:
+	case IB_EVENT_SM_CHANGE:
+	case IB_EVENT_CLIENT_REREGISTER:
+	case IB_EVENT_PKEY_CHANGE:
+	case IB_EVENT_PORT_ACTIVE:
+		break;
+	default:
+		return;
+	}
+
+	dev = container_of(event_handler, struct sa_db_device, event_handler);
+	/* The port array is indexed relative to the device's first port. */
+	port = &dev->port[event->element.port_num - dev->start_port];
+	queue_work(sa_wq, &port->work);
+}
+
+/*
+ * End a path iteration: releases the read lock taken by
+ * ib_create_path_iter() and restores the interrupt flags saved in @iter.
+ */
+static void ib_free_path_iter(struct ib_sa_attr_iter *iter)
+{
+	read_unlock_irqrestore(&rwlock, iter->flags);
+}
+
+/*
+ * Start iterating over the cached paths to @dgid on @device / @port_num.
+ *
+ * Returns -ENODEV if the device has no client data for this client,
+ * -ENODATA if no paths are cached for @dgid, or 0 on success.
+ *
+ * On success the global rwlock is left read-held (flags saved in @iter)
+ * and the caller must finish with ib_free_path_iter().  On failure the
+ * lock is not held.
+ */
+static int ib_create_path_iter(struct ib_device *device, u8 port_num,
+			       union ib_gid *dgid, struct ib_sa_attr_iter *iter)
+{
+	struct sa_db_device *dev = ib_get_client_data(device, &sa_db_client);
+	struct ib_sa_attr_list *found;
+	struct sa_db_port *port;
+
+	if (!dev)
+		return -ENODEV;
+
+	port = &dev->port[port_num - dev->start_port];
+
+	read_lock_irqsave(&rwlock, iter->flags);
+	found = find_attr_list(&port->paths, dgid->raw);
+	if (found) {
+		/* Start at the list's own link; the first path follows it. */
+		iter->iter = &found->iter;
+		return 0;
+	}
+
+	ib_free_path_iter(iter);
+	return -ENODATA;
+}
+
+/*
+ * Advance @iter by one link and return the path record embedded in the
+ * ib_path_rec_info that contains it, or NULL at the end of the list.
+ */
+static struct ib_sa_path_rec *ib_get_next_path(struct ib_sa_attr_iter *iter)
+{
+	struct ib_path_rec_info *info;
+
+	iter->iter = iter->iter->next;
+	if (!iter->iter)
+		return NULL;
+
+	info = container_of(iter->iter, struct ib_path_rec_info, iter);
+	return &info->rec;
+}
+
+/*
+ * Check whether cached record @src satisfies request @dst for the
+ * components selected by @comp_mask.
+ *
+ * DGID and NumbPath are not compared here (handled by the caller).  Plain
+ * fields must be equal; for "reversible" a mismatch is only reported when
+ * the request asks for a reversible path and the cached one is not; MTU,
+ * rate and packet lifetime are delegated to ib_sa_check_selector().
+ *
+ * Returns 0 on a match and -EINVAL otherwise.
+ */
+static int cmp_rec(struct ib_sa_path_rec *src,
+		   struct ib_sa_path_rec *dst, ib_sa_comp_mask comp_mask)
+{
+	/* DGID check already done */
+	if (((comp_mask & IB_SA_PATH_REC_SGID) &&
+	     memcmp(&src->sgid, &dst->sgid, sizeof src->sgid)) ||
+	    ((comp_mask & IB_SA_PATH_REC_DLID) &&
+	     src->dlid != dst->dlid) ||
+	    ((comp_mask & IB_SA_PATH_REC_SLID) &&
+	     src->slid != dst->slid) ||
+	    ((comp_mask & IB_SA_PATH_REC_RAW_TRAFFIC) &&
+	     src->raw_traffic != dst->raw_traffic))
+		return -EINVAL;
+
+	if (((comp_mask & IB_SA_PATH_REC_FLOW_LABEL) &&
+	     src->flow_label != dst->flow_label) ||
+	    ((comp_mask & IB_SA_PATH_REC_HOP_LIMIT) &&
+	     src->hop_limit != dst->hop_limit) ||
+	    ((comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS) &&
+	     src->traffic_class != dst->traffic_class) ||
+	    ((comp_mask & IB_SA_PATH_REC_REVERSIBLE) &&
+	     dst->reversible && !src->reversible))
+		return -EINVAL;
+
+	/* Numb path check already done */
+	if (((comp_mask & IB_SA_PATH_REC_PKEY) &&
+	     src->pkey != dst->pkey) ||
+	    ((comp_mask & IB_SA_PATH_REC_SL) &&
+	     src->sl != dst->sl))
+		return -EINVAL;
+
+	if (ib_sa_check_selector(comp_mask, IB_SA_PATH_REC_MTU_SELECTOR,
+				 IB_SA_PATH_REC_MTU, dst->mtu_selector,
+				 src->mtu, dst->mtu) ||
+	    ib_sa_check_selector(comp_mask, IB_SA_PATH_REC_RATE_SELECTOR,
+				 IB_SA_PATH_REC_RATE, dst->rate_selector,
+				 src->rate, dst->rate) ||
+	    ib_sa_check_selector(comp_mask,
+				 IB_SA_PATH_REC_PACKET_LIFE_TIME_SELECTOR,
+				 IB_SA_PATH_REC_PACKET_LIFE_TIME,
+				 dst->packet_life_time_selector,
+				 src->packet_life_time, dst->packet_life_time))
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Walk all paths reachable through @iter and pick one of those matching
+ * @req_path / @comp_mask at random.
+ *
+ * Each time a matching path is found a fresh random number is drawn and
+ * the k-th match replaces the current choice when that number is divisible
+ * by k.  Returns NULL if no path matches.
+ */
+static struct ib_sa_path_rec *get_random_path(struct ib_sa_attr_iter *iter,
+					      struct ib_sa_path_rec *req_path,
+					      ib_sa_comp_mask comp_mask)
+{
+	struct ib_sa_path_rec *path, *chosen = NULL;
+	int num, matches = 0;
+
+	while ((path = ib_get_next_path(iter)) != NULL) {
+		if (cmp_rec(path, req_path, comp_mask))
+			continue;
+
+		get_random_bytes(&num, sizeof num);
+		matches++;
+		if ((num % matches) == 0)
+			chosen = path;
+	}
+
+	return chosen;
+}
+
+/*
+ * Walk all paths reachable through @iter and return the matching one with
+ * the smallest lookup counter (the first such path wins ties).  The chosen
+ * path's counter is incremented.  Returns NULL if no path matches.
+ *
+ * NOTE(review): the counter is modified while the caller,
+ * ib_sa_path_rec_get(), holds rwlock only for reading -- confirm that this
+ * is acceptable.
+ */
+static struct ib_sa_path_rec *get_next_path(struct ib_sa_attr_iter *iter,
+					    struct ib_sa_path_rec *req_path,
+					    ib_sa_comp_mask comp_mask)
+{
+	struct ib_path_rec_info *candidate, *least_used = NULL;
+	struct ib_sa_path_rec *path;
+	unsigned long fewest = ~0;
+
+	while ((path = ib_get_next_path(iter)) != NULL) {
+		if (cmp_rec(path, req_path, comp_mask))
+			continue;
+
+		candidate = container_of(iter->iter, struct ib_path_rec_info,
+					 iter);
+		if (candidate->lookups >= fewest)
+			continue;
+
+		fewest = candidate->lookups;
+		least_used = candidate;
+	}
+
+	if (!least_used)
+		return NULL;
+
+	least_used->lookups++;
+	return &least_used->rec;
+}
+
+/*
+ * Work-queue handler that delivers a cached path to the requester.
+ *
+ * Invokes the stored callback with status 0 and the copied path record,
+ * then drops the client reference taken by ib_sa_path_rec_get() and frees
+ * the request.
+ */
+static void report_path(struct work_struct *work)
+{
+	struct sa_path_request *request =
+		container_of(work, struct sa_path_request, work);
+
+	request->callback(0, &request->path_rec, request->context);
+	ib_sa_client_put(request->client);
+	kfree(request);
+}
+
+/**
+ * ib_sa_path_rec_get - Start a Path get query
+ * @client:SA client
+ * @device:device to send query on
+ * @port_num: port number to send query on
+ * @rec:Path Record to send in query
+ * @comp_mask:component mask to send in query
+ * @timeout_ms:time to wait for response
+ * @gfp_mask:GFP mask to use for internal allocations
+ * @callback:function called when query completes, times out or is
+ * canceled
+ * @context:opaque user context passed to callback
+ * @sa_query:query context, used to cancel query
+ *
+ * Look up a path.  If the local cache is enabled (paths_per_dest != 0),
+ * the request selects on DGID and asks for exactly one path, and a cached
+ * record matches, the callback is invoked from the work queue with status
+ * 0 and a copy of that record; in that case *@sa_query is set to
+ * ERR_PTR(-EEXIST) and 0 is returned.
+ *
+ * In every other case a Path Record Get query is sent to the SA via
+ * ib_sa_path_rec_query().  The callback function will then be called when
+ * the query completes (or fails); status is 0 for a successful response,
+ * -EINTR if the query is canceled, -ETIMEDOUT if the query timed out, or
+ * -EIO if an error occurred sending the query.  The resp parameter of the
+ * callback is only valid if status is 0.
+ *
+ * If the return value of ib_sa_path_rec_get() is negative, it is an
+ * error code.  Otherwise it is a query ID that can be used to cancel
+ * the query.
+ */
+int ib_sa_path_rec_get(struct ib_sa_client *client,
+		       struct ib_device *device, u8 port_num,
+		       struct ib_sa_path_rec *rec,
+		       ib_sa_comp_mask comp_mask,
+		       int timeout_ms, gfp_t gfp_mask,
+		       void (*callback)(int status,
+					struct ib_sa_path_rec *resp,
+					void *context),
+		       void *context,
+		       struct ib_sa_query **sa_query)
+{
+	struct ib_sa_path_rec *cached;
+	struct ib_sa_attr_iter iter;
+	struct sa_path_request *req;
+
+	/* The cache only serves single-path requests for a given DGID. */
+	if (!paths_per_dest ||
+	    !(comp_mask & IB_SA_PATH_REC_DGID) ||
+	    !(comp_mask & IB_SA_PATH_REC_NUMB_PATH) ||
+	    rec->numb_path != 1)
+		goto query_sa;
+
+	req = kmalloc(sizeof *req, gfp_mask);
+	if (!req)
+		goto query_sa;
+
+	if (ib_create_path_iter(device, port_num, &rec->dgid, &iter))
+		goto free_req;
+
+	cached = (lookup_method == SA_DB_LOOKUP_RANDOM) ?
+		 get_random_path(&iter, rec, comp_mask) :
+		 get_next_path(&iter, rec, comp_mask);
+	if (!cached)
+		goto free_iter;
+
+	/* Copy while the iterator (and thus the read lock) is still held. */
+	memcpy(&req->path_rec, cached, sizeof *cached);
+	ib_free_path_iter(&iter);
+
+	req->context = context;
+	req->callback = callback;
+	req->client = client;
+	INIT_WORK(&req->work, report_path);
+
+	ib_sa_client_get(client);
+	queue_work(sa_wq, &req->work);
+	*sa_query = ERR_PTR(-EEXIST);
+	return 0;
+
+free_iter:
+	ib_free_path_iter(&iter);
+free_req:
+	kfree(req);
+query_sa:
+	return ib_sa_path_rec_query(client, device, port_num, rec, comp_mask,
+				    timeout_ms, gfp_mask, callback, context,
+				    sa_query);
+}
+EXPORT_SYMBOL(ib_sa_path_rec_get);
+
+/*
+ * MAD receive callback for the cache's agent.
+ *
+ * The send buffer is recovered from the work completion's wr_id and the
+ * port/update from its context slots.  The response is applied to the path
+ * cache only when the port is not being destroyed and the update is still
+ * the first entry of the port's update list; the update type is sampled
+ * under the mutex and the cache is updated after dropping it.  The received
+ * MAD is always freed.
+ */
+static void recv_handler(struct ib_mad_agent *mad_agent,
+			 struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_mad_send_buf *sent;
+	struct update_info *update;
+	struct sa_db_port *port;
+	enum sa_update_type type;
+
+	sent = (struct ib_mad_send_buf *) (unsigned long) mad_recv_wc->wc->wr_id;
+	port = sent->context[0];
+	update = sent->context[1];
+
+	mutex_lock(&lock);
+	if (port->state != SA_DB_DESTROY &&
+	    update == list_entry(port->update_list.next,
+				 struct update_info, list)) {
+		type = update->type;
+		mutex_unlock(&lock);
+		update_path_db(mad_agent->context, mad_recv_wc, type);
+	} else {
+		mutex_unlock(&lock);
+	}
+
+	ib_free_recv_mad(mad_recv_wc);
+}
+
+/*
+ * MAD send-completion callback for the cache's agent.
+ *
+ * If the completed message belongs to the update at the head of the port's
+ * list, a response timeout is retried with a doubled timeout as long as
+ * the timeout is below SA_DB_MAX_RETRY_TIMER; when the repost succeeds the
+ * message stays in flight and nothing is released.  Otherwise the update
+ * is removed from the list and freed.  Unless the port is being destroyed,
+ * processing of the remaining updates is then resumed.  Finally the
+ * address handle and the message are released.
+ */
+static void send_handler(struct ib_mad_agent *agent,
+			 struct ib_mad_send_wc *mad_send_wc)
+{
+	struct ib_mad_send_buf *msg = mad_send_wc->send_buf;
+	struct sa_db_port *port = msg->context[0];
+	struct update_info *update = msg->context[1];
+
+	mutex_lock(&lock);
+	if (port->state != SA_DB_DESTROY) {
+		if (update == list_entry(port->update_list.next,
+					 struct update_info, list)) {
+
+			if (mad_send_wc->status == IB_WC_RESP_TIMEOUT_ERR &&
+			    msg->timeout_ms < SA_DB_MAX_RETRY_TIMER) {
+
+				msg->timeout_ms <<= 1;
+				if (!ib_post_send_mad(msg, NULL)) {
+					/* Reposted: msg is in flight again. */
+					mutex_unlock(&lock);
+					return;
+				}
+			}
+			list_del(&update->list);
+			kfree(update);
+		}
+		process_updates(port);
+	}
+	mutex_unlock(&lock);
+
+	ib_destroy_ah(msg->ah);
+	ib_free_send_mad(msg);
+}
+
+/*
+ * Initialise the sa_db_port slot of @dev that corresponds to @port_num:
+ * work item, empty path tree, empty update list, cached GID (index 0) and
+ * a GSI MAD agent with send_handler()/recv_handler() as callbacks.
+ *
+ * Returns 0 on success, the error from ib_get_cached_gid(), or the error
+ * encoded in the pointer returned by ib_register_mad_agent().
+ */
+static int init_port(struct sa_db_device *dev, int port_num)
+{
+	struct sa_db_port *port = &dev->port[port_num - dev->start_port];
+	int ret;
+
+	port->dev = dev;
+	port->port_num = port_num;
+	port->paths = RB_ROOT;
+	INIT_LIST_HEAD(&port->update_list);
+	INIT_WORK(&port->work, port_work_handler);
+
+	ret = ib_get_cached_gid(dev->device, port_num, 0, &port->gid);
+	if (ret)
+		return ret;
+
+	port->agent = ib_register_mad_agent(dev->device, port_num, IB_QPT_GSI,
+					    NULL, IB_MGMT_RMPP_VERSION,
+					    send_handler, recv_handler, port);
+	if (IS_ERR(port->agent))
+		return PTR_ERR(port->agent);
+
+	return 0;
+}
+
+/*
+ * Tear down @port.
+ *
+ * The state is switched to SA_DB_DESTROY under the mutex first; the
+ * handlers in this file check for that state and back off.  Then the MAD
+ * agent is unregistered, the port's subscriptions, updates and cached paths
+ * are released, and the work queue is flushed.
+ */
+static void destroy_port(struct sa_db_port *port)
+{
+	mutex_lock(&lock);
+	port->state = SA_DB_DESTROY;
+	mutex_unlock(&lock);
+
+	ib_unregister_mad_agent(port->agent);
+	cleanup_port(port);
+	flush_workqueue(sa_wq);
+}
+
+/*
+ * Client "add" callback: set up the path cache for @device.
+ *
+ * Devices whose transport is not InfiniBand are ignored.  A switch gets a
+ * single port numbered 0; any other node gets ports 1..phys_port_cnt.  The
+ * device structure and its port array are allocated in one block.  After
+ * all ports are initialised the device is published as client data, added
+ * to dev_list, refreshed, and its event handler is registered.
+ *
+ * If initialising a port fails, the ports initialised before it are
+ * destroyed and the device structure is freed.
+ */
+static void sa_db_add_dev(struct ib_device *device)
+{
+	struct sa_db_device *dev;
+	struct sa_db_port *port;
+	int first, last, count, i;
+
+	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+		return;
+
+	if (device->node_type == RDMA_NODE_IB_SWITCH) {
+		first = 0;
+		last = 0;
+	} else {
+		first = 1;
+		last = device->phys_port_cnt;
+	}
+	count = last - first + 1;
+
+	dev = kzalloc(sizeof *dev + count * sizeof *port, GFP_KERNEL);
+	if (!dev)
+		return;
+
+	dev->device = device;
+	dev->start_port = first;
+	dev->port_count = count;
+	for (i = 0; i < dev->port_count; i++) {
+		if (init_port(dev, first + i))
+			goto err;
+	}
+
+	ib_set_client_data(device, &sa_db_client, dev);
+
+	INIT_IB_EVENT_HANDLER(&dev->event_handler, device, handle_event);
+
+	mutex_lock(&lock);
+	list_add_tail(&dev->list, &dev_list);
+	refresh_dev_db(dev);
+	mutex_unlock(&lock);
+
+	ib_register_event_handler(&dev->event_handler);
+	return;
+
+err:
+	/* i indexes the port that failed; undo only the ones before it. */
+	while (i--)
+		destroy_port(&dev->port[i]);
+	kfree(dev);
+}
+
+/*
+ * Client "remove" callback: tear down the path cache of @device.
+ *
+ * Does nothing when the device has no client data for this client.
+ * Otherwise the event handler is unregistered and the work queue flushed
+ * before the ports are destroyed; the device is then unlinked from
+ * dev_list under the mutex and freed.
+ */
+static void sa_db_remove_dev(struct ib_device *device)
+{
+	struct sa_db_device *dev = ib_get_client_data(device, &sa_db_client);
+	int idx;
+
+	if (!dev)
+		return;
+
+	ib_unregister_event_handler(&dev->event_handler);
+	flush_workqueue(sa_wq);
+
+	idx = 0;
+	while (idx < dev->port_count) {
+		destroy_port(&dev->port[idx]);
+		idx++;
+	}
+
+	mutex_lock(&lock);
+	list_del(&dev->list);
+	mutex_unlock(&lock);
+
+	kfree(dev);
+}
+
+/*
+ * Initialise the local SA cache: the rwlock, the single-threaded
+ * "local_sa" work queue, and the SA and IB client registrations.
+ *
+ * Returns 0 on success, -ENOMEM if the work queue cannot be created, or
+ * the error from ib_register_client(); in the last case the SA client and
+ * the work queue are released again.
+ */
+int sa_db_init(void)
+{
+	int ret;
+
+	rwlock_init(&rwlock);
+
+	sa_wq = create_singlethread_workqueue("local_sa");
+	if (!sa_wq)
+		return -ENOMEM;
+
+	ib_sa_register_client(&sa_client);
+
+	ret = ib_register_client(&sa_db_client);
+	if (!ret)
+		return 0;
+
+	ib_sa_unregister_client(&sa_client);
+	destroy_workqueue(sa_wq);
+	return ret;
+}
+
+/*
+ * Undo sa_db_init(): unregister the IB client, then the SA client, then
+ * destroy the work queue.
+ */
+void sa_db_cleanup(void)
+{
+	ib_unregister_client(&sa_db_client);
+	ib_sa_unregister_client(&sa_client);
+	destroy_workqueue(sa_wq);
+}
diff --git a/sys/ofed/drivers/infiniband/core/mad.c b/sys/ofed/drivers/infiniband/core/mad.c
new file mode 100644
index 0000000..64e660c
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/mad.c
@@ -0,0 +1,3057 @@
+/*
+ * Copyright (c) 2004-2007 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies Ltd.  All rights reserved.
+ * Copyright (c) 2009 HNR Consulting. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/dma-mapping.h>
+#include <rdma/ib_cache.h>
+
+#include "mad_priv.h"
+#include "mad_rmpp.h"
+#include "smi.h"
+#include "agent.h"
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("kernel IB MAD API");
+MODULE_AUTHOR("Hal Rosenstock");
+MODULE_AUTHOR("Sean Hefty");
+
+/* Queue sizes; defaults come from the IB_MAD_QP_*_SIZE constants. */
+int mad_sendq_size = IB_MAD_QP_SEND_SIZE;
+int mad_recvq_size = IB_MAD_QP_RECV_SIZE;
+
+/* Exposed as module parameters send_queue_size / recv_queue_size (mode 0444). */
+module_param_named(send_queue_size, mad_sendq_size, int, 0444);
+MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests");
+module_param_named(recv_queue_size, mad_recvq_size, int, 0444);
+MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests");
+
+/* NOTE(review): slab cache; its users are outside this excerpt. */
+static struct kmem_cache *ib_mad_cache;
+
+/* List of ib_mad_port_private entries, linked through their port_list member. */
+static struct list_head ib_mad_port_list;
+/* NOTE(review): presumably a counter for agent IDs -- users not visible here. */
+static u32 ib_mad_client_id = 0;
+
+/* Port list lock */
+static spinlock_t ib_mad_port_list_lock;
+
+
+/* Forward declarations */
+static int method_in_use(struct ib_mad_mgmt_method_table **method,
+			 struct ib_mad_reg_req *mad_reg_req);
+static void remove_mad_reg_req(struct ib_mad_agent_private *priv);
+static struct ib_mad_agent_private *find_mad_agent(
+					struct ib_mad_port_private *port_priv,
+					struct ib_mad *mad);
+static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
+				    struct ib_mad_private *mad);
+static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv);
+static void timeout_sends(struct work_struct *work);
+static void local_completions(struct work_struct *work);
+static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req,
+			      struct ib_mad_agent_private *agent_priv,
+			      u8 mgmt_class);
+static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req,
+			   struct ib_mad_agent_private *agent_priv);
+
+/*
+ * Find the ib_mad_port_private entry for @device / @port_num on
+ * ib_mad_port_list.  Returns NULL when there is no such entry.
+ * The caller must hold ib_mad_port_list_lock.
+ */
+static inline struct ib_mad_port_private *
+__ib_get_mad_port(struct ib_device *device, int port_num)
+{
+	struct ib_mad_port_private *cur;
+
+	list_for_each_entry(cur, &ib_mad_port_list, port_list) {
+		if (cur->device != device)
+			continue;
+		if (cur->port_num != port_num)
+			continue;
+		return cur;
+	}
+
+	return NULL;
+}
+
+/*
+ * Locked variant of __ib_get_mad_port(): takes ib_mad_port_list_lock
+ * (saving and restoring the interrupt flags) around the lookup.  Returns
+ * the ib_mad_port_private entry for @device / @port_num, or NULL.
+ */
+static inline struct ib_mad_port_private *
+ib_get_mad_port(struct ib_device *device, int port_num)
+{
+	struct ib_mad_port_private *found;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&ib_mad_port_list_lock, irq_flags);
+	found = __ib_get_mad_port(device, port_num);
+	spin_unlock_irqrestore(&ib_mad_port_list_lock, irq_flags);
+
+	return found;
+}
+
+/*
+ * Map a management class to the value used internally: the directed-route
+ * subnet class is aliased to 0, every other class is returned unchanged.
+ */
+static inline u8 convert_mgmt_class(u8 mgmt_class)
+{
+	/* Alias IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE to 0 */
+	if (mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+		return 0;
+
+	return mgmt_class;
+}
+
+/*
+ * Translate a special QP type into an index: 0 for IB_QPT_SMI, 1 for
+ * IB_QPT_GSI, and -1 for any other QP type.
+ */
+static int get_spl_qp_index(enum ib_qp_type qp_type)
+{
+	if (qp_type == IB_QPT_SMI)
+		return 0;
+	if (qp_type == IB_QPT_GSI)
+		return 1;
+
+	return -1;
+}
+
+/*
+ * Offset of @mgmt_class from IB_MGMT_CLASS_VENDOR_RANGE2_START.  No range
+ * check is done here; see is_vendor_class().
+ */
+static int vendor_class_index(u8 mgmt_class)
+{
+	int index = mgmt_class - IB_MGMT_CLASS_VENDOR_RANGE2_START;
+
+	return index;
+}
+
+/*
+ * Return 1 if @mgmt_class lies within
+ * [IB_MGMT_CLASS_VENDOR_RANGE2_START, IB_MGMT_CLASS_VENDOR_RANGE2_END],
+ * and 0 otherwise.
+ */
+static int is_vendor_class(u8 mgmt_class)
+{
+	return (mgmt_class >= IB_MGMT_CLASS_VENDOR_RANGE2_START) &&
+	       (mgmt_class <= IB_MGMT_CLASS_VENDOR_RANGE2_END);
+}
+
+/*
+ * Return 1 if any of the three OUI bytes at @oui is non-zero, 0 if all
+ * three are zero.
+ */
+static int is_vendor_oui(char *oui)
+{
+	int idx;
+
+	for (idx = 0; idx < 3; idx++) {
+		if (oui[idx])
+			return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Check whether @mad_reg_req clashes with a registration in @vendor_class.
+ *
+ * The OUI slots are scanned for the first one whose OUI equals the
+ * request's and which has a method table.  The result of method_in_use()
+ * on that table decides the answer (1 if in use, 0 if not); no further
+ * slots are examined.  Returns 0 if no such slot exists.
+ */
+static int is_vendor_method_in_use(
+		struct ib_mad_mgmt_vendor_class *vendor_class,
+		struct ib_mad_reg_req *mad_reg_req)
+{
+	struct ib_mad_mgmt_method_table *table;
+	int slot;
+
+	for (slot = 0; slot < MAX_MGMT_OUI; slot++) {
+		if (memcmp(vendor_class->oui[slot], mad_reg_req->oui, 3))
+			continue;
+
+		table = vendor_class->method_table[slot];
+		if (!table)
+			continue;
+
+		return method_in_use(&table, mad_reg_req) ? 1 : 0;
+	}
+
+	return 0;
+}
+
+/*
+ * Return non-zero (1) if @mad is a response: its method has the
+ * IB_MGMT_METHOD_RESP bit set, its method is TrapRepress, or it is a BM
+ * class MAD whose attribute modifier has IB_BM_ATTR_MOD_RESP set.
+ * Returns 0 otherwise.
+ */
+int ib_response_mad(struct ib_mad *mad)
+{
+	if (mad->mad_hdr.method & IB_MGMT_METHOD_RESP)
+		return 1;
+
+	if (mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS)
+		return 1;
+
+	return (mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_BM) &&
+	       (mad->mad_hdr.attr_mod & IB_BM_ATTR_MOD_RESP);
+}
+EXPORT_SYMBOL(ib_response_mad);
+
+/*
+ * Timer-style callback: @data carries a pointer to the private MAD agent.
+ * The agent's timeout_work item is queued on the work queue of the port
+ * the agent's QP belongs to.
+ */
+static void timeout_callback(unsigned long data)
+{
+	struct ib_mad_agent_private *agent_priv;
+
+	agent_priv = (struct ib_mad_agent_private *) data;
+	queue_work(agent_priv->qp_info->port_priv->wq,
+		   &agent_priv->timeout_work);
+}
+
+/*
+ * ib_register_mad_agent - Register to send/receive MADs
+ *
+ * Validates the arguments, allocates a private agent structure plus a DMA
+ * memory region on the special QP's protection domain, optionally records
+ * a copy of the registration request, and links the agent into the port's
+ * agent list.
+ *
+ * Returns a pointer to the embedded ib_mad_agent on success, otherwise an
+ * ERR_PTR():
+ *   -EINVAL  invalid parameters, or a requested method is already in use
+ *   -ENODEV  no MAD port found for device/port_num
+ *   -ENOMEM  allocation (or DMA MR creation) failed
+ *   other    whatever add_nonoui_reg_req()/add_oui_reg_req() returned
+ */
+struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
+					   u8 port_num,
+					   enum ib_qp_type qp_type,
+					   struct ib_mad_reg_req *mad_reg_req,
+					   u8 rmpp_version,
+					   ib_mad_send_handler send_handler,
+					   ib_mad_recv_handler recv_handler,
+					   void *context)
+{
+	struct ib_mad_port_private *port_priv;
+	struct ib_mad_agent *ret = ERR_PTR(-EINVAL);
+	struct ib_mad_agent_private *mad_agent_priv;
+	struct ib_mad_reg_req *reg_req = NULL;
+	struct ib_mad_mgmt_class_table *class;
+	struct ib_mad_mgmt_vendor_class_table *vendor;
+	struct ib_mad_mgmt_vendor_class *vendor_class;
+	struct ib_mad_mgmt_method_table *method;
+	int ret2, qpn;
+	unsigned long flags;
+	u8 mgmt_class, vclass;
+
+	/* Validate parameters */
+	qpn = get_spl_qp_index(qp_type);
+	if (qpn == -1)
+		goto error1;
+
+	/* Only "no RMPP" (0) or the one supported RMPP version is accepted */
+	if (rmpp_version && rmpp_version != IB_MGMT_RMPP_VERSION)
+		goto error1;
+
+	/* Validate MAD registration request if supplied */
+	if (mad_reg_req) {
+		if (mad_reg_req->mgmt_class_version >= MAX_MGMT_VERSION)
+			goto error1;
+		/* A registration request is pointless without a receive handler */
+		if (!recv_handler)
+			goto error1;
+		if (mad_reg_req->mgmt_class >= MAX_MGMT_CLASS) {
+			/*
+			 * IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE is the only
+			 * one in this range currently allowed
+			 */
+			if (mad_reg_req->mgmt_class !=
+			    IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+				goto error1;
+		} else if (mad_reg_req->mgmt_class == 0) {
+			/*
+			 * Class 0 is reserved in IBA and is used for
+			 * aliasing of IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
+			 */
+			goto error1;
+		} else if (is_vendor_class(mad_reg_req->mgmt_class)) {
+			/*
+			 * If class is in "new" vendor range,
+			 * ensure supplied OUI is not zero
+			 */
+			if (!is_vendor_oui(mad_reg_req->oui))
+				goto error1;
+		}
+		/* Make sure class supplied is consistent with RMPP */
+		if (!ib_is_mad_class_rmpp(mad_reg_req->mgmt_class)) {
+			if (rmpp_version)
+				goto error1;
+		}
+		/* Make sure class supplied is consistent with QP type */
+		if (qp_type == IB_QPT_SMI) {
+			if ((mad_reg_req->mgmt_class !=
+					IB_MGMT_CLASS_SUBN_LID_ROUTED) &&
+			    (mad_reg_req->mgmt_class !=
+					IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE))
+				goto error1;
+		} else {
+			if ((mad_reg_req->mgmt_class ==
+					IB_MGMT_CLASS_SUBN_LID_ROUTED) ||
+			    (mad_reg_req->mgmt_class ==
+					IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE))
+				goto error1;
+		}
+	} else {
+		/* No registration request supplied */
+		if (!send_handler)
+			goto error1;
+	}
+
+	/* Validate device and port */
+	port_priv = ib_get_mad_port(device, port_num);
+	if (!port_priv) {
+		ret = ERR_PTR(-ENODEV);
+		goto error1;
+	}
+
+	/* Allocate structures */
+	mad_agent_priv = kzalloc(sizeof *mad_agent_priv, GFP_KERNEL);
+	if (!mad_agent_priv) {
+		ret = ERR_PTR(-ENOMEM);
+		goto error1;
+	}
+
+	mad_agent_priv->agent.mr = ib_get_dma_mr(port_priv->qp_info[qpn].qp->pd,
+						 IB_ACCESS_LOCAL_WRITE);
+	/* Any MR creation error is reported to the caller as -ENOMEM */
+	if (IS_ERR(mad_agent_priv->agent.mr)) {
+		ret = ERR_PTR(-ENOMEM);
+		goto error2;
+	}
+
+	if (mad_reg_req) {
+		reg_req = kmalloc(sizeof *reg_req, GFP_KERNEL);
+		if (!reg_req) {
+			ret = ERR_PTR(-ENOMEM);
+			goto error3;
+		}
+		/* Make a copy of the MAD registration request */
+		memcpy(reg_req, mad_reg_req, sizeof *reg_req);
+	}
+
+	/* Now, fill in the various structures */
+	mad_agent_priv->qp_info = &port_priv->qp_info[qpn];
+	mad_agent_priv->reg_req = reg_req;
+	mad_agent_priv->agent.rmpp_version = rmpp_version;
+	mad_agent_priv->agent.device = device;
+	mad_agent_priv->agent.recv_handler = recv_handler;
+	mad_agent_priv->agent.send_handler = send_handler;
+	mad_agent_priv->agent.context = context;
+	mad_agent_priv->agent.qp = port_priv->qp_info[qpn].qp;
+	mad_agent_priv->agent.port_num = port_num;
+	spin_lock_init(&mad_agent_priv->lock);
+	INIT_LIST_HEAD(&mad_agent_priv->send_list);
+	INIT_LIST_HEAD(&mad_agent_priv->wait_list);
+	INIT_LIST_HEAD(&mad_agent_priv->done_list);
+	INIT_LIST_HEAD(&mad_agent_priv->rmpp_list);
+	INIT_WORK(&mad_agent_priv->timeout_work, timeout_sends);
+	setup_timer(&mad_agent_priv->timeout_timer, timeout_callback,
+		    (unsigned long) mad_agent_priv);
+	INIT_LIST_HEAD(&mad_agent_priv->local_list);
+	INIT_WORK(&mad_agent_priv->local_work, local_completions);
+	/* Initial reference; dropped by unregister_mad_agent() */
+	atomic_set(&mad_agent_priv->refcount, 1);
+	init_completion(&mad_agent_priv->comp);
+
+	/* reg_lock covers the client id counter and the registration tables */
+	spin_lock_irqsave(&port_priv->reg_lock, flags);
+	mad_agent_priv->agent.hi_tid = ++ib_mad_client_id;
+
+	/*
+	 * Make sure MAD registration (if supplied)
+	 * is non overlapping with any existing ones
+	 */
+	if (mad_reg_req) {
+		mgmt_class = convert_mgmt_class(mad_reg_req->mgmt_class);
+		if (!is_vendor_class(mgmt_class)) {
+			class = port_priv->version[mad_reg_req->
+						   mgmt_class_version].class;
+			if (class) {
+				method = class->method_table[mgmt_class];
+				if (method) {
+					/* ret still holds -EINVAL here */
+					if (method_in_use(&method,
+							   mad_reg_req))
+						goto error4;
+				}
+			}
+			ret2 = add_nonoui_reg_req(mad_reg_req, mad_agent_priv,
+						  mgmt_class);
+		} else {
+			/* "New" vendor class range */
+			vendor = port_priv->version[mad_reg_req->
+						    mgmt_class_version].vendor;
+			if (vendor) {
+				vclass = vendor_class_index(mgmt_class);
+				vendor_class = vendor->vendor_class[vclass];
+				if (vendor_class) {
+					/* ret still holds -EINVAL here */
+					if (is_vendor_method_in_use(
+							vendor_class,
+							mad_reg_req))
+						goto error4;
+				}
+			}
+			ret2 = add_oui_reg_req(mad_reg_req, mad_agent_priv);
+		}
+		if (ret2) {
+			ret = ERR_PTR(ret2);
+			goto error4;
+		}
+	}
+
+	/* Add mad agent into port's agent list */
+	list_add_tail(&mad_agent_priv->agent_list, &port_priv->agent_list);
+	spin_unlock_irqrestore(&port_priv->reg_lock, flags);
+
+	return &mad_agent_priv->agent;
+
+	/* Unwind in reverse order of acquisition */
+error4:
+	spin_unlock_irqrestore(&port_priv->reg_lock, flags);
+	kfree(reg_req);
+error3:
+	ib_dereg_mr(mad_agent_priv->agent.mr);
+error2:
+	kfree(mad_agent_priv);
+error1:
+	return ret;
+}
+EXPORT_SYMBOL(ib_register_mad_agent);
+
+/*
+ * Non-zero when the snoop flags ask for send-side snooping.  Only
+ * IB_MAD_SNOOP_SEND_COMPLETIONS is tested; the POSTED_SENDS, RMPP_SENDS
+ * and RMPP_SEND_COMPLETIONS flags are deliberately not checked.
+ */
+static inline int is_snooping_sends(int mad_snoop_flags)
+{
+	return mad_snoop_flags & IB_MAD_SNOOP_SEND_COMPLETIONS;
+}
+
+/*
+ * Non-zero when the snoop flags ask for receive-side snooping.  Only
+ * IB_MAD_SNOOP_RECVS is tested; IB_MAD_SNOOP_RMPP_RECVS is deliberately
+ * not checked.
+ */
+static inline int is_snooping_recvs(int mad_snoop_flags)
+{
+	return mad_snoop_flags & IB_MAD_SNOOP_RECVS;
+}
+
+/*
+ * Install a snoop agent in the QP's snoop table.
+ *
+ * Reuses the first empty slot if there is one; otherwise grows the table
+ * by exactly one entry.  Bumps qp_info->snoop_count on success.
+ *
+ * Returns the slot index (>= 0) on success or -ENOMEM if the table could
+ * not be grown.
+ */
+static int register_snoop_agent(struct ib_mad_qp_info *qp_info,
+				struct ib_mad_snoop_private *mad_snoop_priv)
+{
+	struct ib_mad_snoop_private **new_snoop_table;
+	unsigned long flags;
+	int i;
+
+	spin_lock_irqsave(&qp_info->snoop_lock, flags);
+	/* Check for empty slot in array. */
+	for (i = 0; i < qp_info->snoop_table_size; i++)
+		if (!qp_info->snoop_table[i])
+			break;
+
+	if (i == qp_info->snoop_table_size) {
+		/* Grow table. */
+		/* snoop_lock is held here, so the allocation uses GFP_ATOMIC */
+		new_snoop_table = krealloc(qp_info->snoop_table,
+					   sizeof mad_snoop_priv *
+					   (qp_info->snoop_table_size + 1),
+					   GFP_ATOMIC);
+		if (!new_snoop_table) {
+			/* Old table is left untouched on failure */
+			i = -ENOMEM;
+			goto out;
+		}
+
+		qp_info->snoop_table = new_snoop_table;
+		qp_info->snoop_table_size++;
+	}
+	qp_info->snoop_table[i] = mad_snoop_priv;
+	atomic_inc(&qp_info->snoop_count);
+out:
+	spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+	return i;
+}
+
+/*
+ * ib_register_mad_snoop - Register to snoop sent and received MADs
+ *
+ * mad_snoop_flags selects what is snooped; a snoop_handler is required
+ * when send snooping is requested and a recv_handler when receive snooping
+ * is requested.
+ *
+ * Returns a pointer to the embedded ib_mad_agent on success, otherwise an
+ * ERR_PTR():
+ *   -EINVAL  a required handler is missing or qp_type is not a special QP
+ *   -ENODEV  no MAD port found for device/port_num
+ *   -ENOMEM  allocation of the agent or of a snoop table slot failed
+ */
+struct ib_mad_agent *ib_register_mad_snoop(struct ib_device *device,
+					   u8 port_num,
+					   enum ib_qp_type qp_type,
+					   int mad_snoop_flags,
+					   ib_mad_snoop_handler snoop_handler,
+					   ib_mad_recv_handler recv_handler,
+					   void *context)
+{
+	struct ib_mad_port_private *port_priv;
+	struct ib_mad_agent *ret;
+	struct ib_mad_snoop_private *mad_snoop_priv;
+	int qpn;
+
+	/* Validate parameters */
+	if ((is_snooping_sends(mad_snoop_flags) && !snoop_handler) ||
+	    (is_snooping_recvs(mad_snoop_flags) && !recv_handler)) {
+		ret = ERR_PTR(-EINVAL);
+		goto error1;
+	}
+	qpn = get_spl_qp_index(qp_type);
+	if (qpn == -1) {
+		ret = ERR_PTR(-EINVAL);
+		goto error1;
+	}
+	port_priv = ib_get_mad_port(device, port_num);
+	if (!port_priv) {
+		ret = ERR_PTR(-ENODEV);
+		goto error1;
+	}
+	/* Allocate structures */
+	mad_snoop_priv = kzalloc(sizeof *mad_snoop_priv, GFP_KERNEL);
+	if (!mad_snoop_priv) {
+		ret = ERR_PTR(-ENOMEM);
+		goto error1;
+	}
+
+	/* Now, fill in the various structures */
+	mad_snoop_priv->qp_info = &port_priv->qp_info[qpn];
+	mad_snoop_priv->agent.device = device;
+	mad_snoop_priv->agent.recv_handler = recv_handler;
+	mad_snoop_priv->agent.snoop_handler = snoop_handler;
+	mad_snoop_priv->agent.context = context;
+	mad_snoop_priv->agent.qp = port_priv->qp_info[qpn].qp;
+	mad_snoop_priv->agent.port_num = port_num;
+	mad_snoop_priv->mad_snoop_flags = mad_snoop_flags;
+	init_completion(&mad_snoop_priv->comp);
+	/*
+	 * Take the initial reference before the agent becomes visible in
+	 * the snoop table.  Once published, snoop_send()/snoop_recv() may
+	 * take and drop references concurrently; setting the count after
+	 * publication could overwrite such a reference and let the
+	 * completion fire while the agent is still registered.
+	 */
+	atomic_set(&mad_snoop_priv->refcount, 1);
+	mad_snoop_priv->snoop_index = register_snoop_agent(
+						&port_priv->qp_info[qpn],
+						mad_snoop_priv);
+	if (mad_snoop_priv->snoop_index < 0) {
+		/* Never published, so it can be freed directly */
+		ret = ERR_PTR(mad_snoop_priv->snoop_index);
+		goto error2;
+	}
+
+	return &mad_snoop_priv->agent;
+
+error2:
+	kfree(mad_snoop_priv);
+error1:
+	return ret;
+}
+EXPORT_SYMBOL(ib_register_mad_snoop);
+
+/* Drop one agent reference; dropping the last one signals agent->comp. */
+static inline void deref_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
+{
+	if (!atomic_dec_and_test(&mad_agent_priv->refcount))
+		return;
+
+	complete(&mad_agent_priv->comp);
+}
+
+/* Drop one snoop-agent reference; dropping the last one signals its comp. */
+static inline void deref_snoop_agent(struct ib_mad_snoop_private *mad_snoop_priv)
+{
+	if (!atomic_dec_and_test(&mad_snoop_priv->refcount))
+		return;
+
+	complete(&mad_snoop_priv->comp);
+}
+
+/*
+ * Tear down a regular (non-snoop) MAD agent and free it.
+ *
+ * Sequence: cancel outstanding sends, stop the timeout timer and its work
+ * item, unhook the agent from the registration tables and the port's agent
+ * list, flush the port workqueue, cancel RMPP receives, then drop the
+ * initial reference and wait until every other reference is gone before
+ * releasing memory.
+ */
+static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
+{
+	struct ib_mad_port_private *port_priv;
+	unsigned long flags;
+
+	/* Note that we could still be handling received MADs */
+
+	/*
+	 * Canceling all sends results in dropping received response
+	 * MADs, preventing us from queuing additional work
+	 */
+	cancel_mads(mad_agent_priv);
+	port_priv = mad_agent_priv->qp_info->port_priv;
+	del_timer_sync(&mad_agent_priv->timeout_timer);
+	cancel_work_sync(&mad_agent_priv->timeout_work);
+
+	/* Registration tables and agent list are protected by reg_lock */
+	spin_lock_irqsave(&port_priv->reg_lock, flags);
+	remove_mad_reg_req(mad_agent_priv);
+	list_del(&mad_agent_priv->agent_list);
+	spin_unlock_irqrestore(&port_priv->reg_lock, flags);
+
+	flush_workqueue(port_priv->wq);
+	ib_cancel_rmpp_recvs(mad_agent_priv);
+
+	/* Drop the initial reference, then wait for the last holder */
+	deref_mad_agent(mad_agent_priv);
+	wait_for_completion(&mad_agent_priv->comp);
+
+	kfree(mad_agent_priv->reg_req);
+	ib_dereg_mr(mad_agent_priv->agent.mr);
+	kfree(mad_agent_priv);
+}
+
+/*
+ * Tear down a snoop agent: clear its snoop table slot, drop the initial
+ * reference, wait for any remaining references to go away, then free it.
+ */
+static void unregister_mad_snoop(struct ib_mad_snoop_private *mad_snoop_priv)
+{
+	struct ib_mad_qp_info *qp = mad_snoop_priv->qp_info;
+	unsigned long irq_flags;
+
+	/* Unpublish the agent so no new snoop callbacks can pick it up */
+	spin_lock_irqsave(&qp->snoop_lock, irq_flags);
+	qp->snoop_table[mad_snoop_priv->snoop_index] = NULL;
+	atomic_dec(&qp->snoop_count);
+	spin_unlock_irqrestore(&qp->snoop_lock, irq_flags);
+
+	/* Release our reference and wait for in-flight users to finish */
+	deref_snoop_agent(mad_snoop_priv);
+	wait_for_completion(&mad_snoop_priv->comp);
+
+	kfree(mad_snoop_priv);
+}
+
+/*
+ * ib_unregister_mad_agent - Unregisters a client from using MAD services
+ *
+ * Dispatches to the regular or the snoop teardown path depending on the
+ * agent's hi_tid.  Always returns 0.
+ */
+int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent)
+{
+	/* If the TID is zero, the agent can only snoop. */
+	if (!mad_agent->hi_tid) {
+		unregister_mad_snoop(container_of(mad_agent,
+						  struct ib_mad_snoop_private,
+						  agent));
+		return 0;
+	}
+
+	unregister_mad_agent(container_of(mad_agent,
+					  struct ib_mad_agent_private,
+					  agent));
+	return 0;
+}
+EXPORT_SYMBOL(ib_unregister_mad_agent);
+
+/*
+ * Unlink a MAD list entry from the queue it is on and decrement that
+ * queue's count, all under the queue lock.  The entry must have a queue.
+ */
+static void dequeue_mad(struct ib_mad_list_head *mad_list)
+{
+	struct ib_mad_queue *queue = mad_list->mad_queue;
+	unsigned long irq_flags;
+
+	BUG_ON(!queue);
+
+	spin_lock_irqsave(&queue->lock, irq_flags);
+	list_del(&mad_list->list);
+	queue->count--;
+	spin_unlock_irqrestore(&queue->lock, irq_flags);
+}
+
+/*
+ * Deliver a send-side event to every snoop agent on this QP whose
+ * registered flags intersect mad_snoop_flags.
+ *
+ * The snoop table is walked under snoop_lock, but the lock is released
+ * around each handler call; a reference on the agent is held for the
+ * duration of the call.
+ */
+static void snoop_send(struct ib_mad_qp_info *qp_info,
+		       struct ib_mad_send_buf *send_buf,
+		       struct ib_mad_send_wc *mad_send_wc,
+		       int mad_snoop_flags)
+{
+	struct ib_mad_snoop_private *mad_snoop_priv;
+	unsigned long flags;
+	int i;
+
+	spin_lock_irqsave(&qp_info->snoop_lock, flags);
+	for (i = 0; i < qp_info->snoop_table_size; i++) {
+		mad_snoop_priv = qp_info->snoop_table[i];
+		/* Skip empty slots and agents not interested in this event */
+		if (!mad_snoop_priv ||
+		    !(mad_snoop_priv->mad_snoop_flags & mad_snoop_flags))
+			continue;
+
+		/* Hold a reference while the handler runs without the lock */
+		atomic_inc(&mad_snoop_priv->refcount);
+		spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+		mad_snoop_priv->agent.snoop_handler(&mad_snoop_priv->agent,
+						    send_buf, mad_send_wc);
+		deref_snoop_agent(mad_snoop_priv);
+		spin_lock_irqsave(&qp_info->snoop_lock, flags);
+	}
+	spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+}
+
+/*
+ * Deliver a received MAD to every snoop agent on this QP whose registered
+ * flags intersect mad_snoop_flags.
+ *
+ * The snoop table is walked under snoop_lock, but the lock is released
+ * around each recv_handler call; a reference on the agent is held for the
+ * duration of the call.
+ */
+static void snoop_recv(struct ib_mad_qp_info *qp_info,
+		       struct ib_mad_recv_wc *mad_recv_wc,
+		       int mad_snoop_flags)
+{
+	struct ib_mad_snoop_private *mad_snoop_priv;
+	unsigned long flags;
+	int i;
+
+	spin_lock_irqsave(&qp_info->snoop_lock, flags);
+	for (i = 0; i < qp_info->snoop_table_size; i++) {
+		mad_snoop_priv = qp_info->snoop_table[i];
+		/* Skip empty slots and agents not interested in this event */
+		if (!mad_snoop_priv ||
+		    !(mad_snoop_priv->mad_snoop_flags & mad_snoop_flags))
+			continue;
+
+		/* Hold a reference while the handler runs without the lock */
+		atomic_inc(&mad_snoop_priv->refcount);
+		spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+		mad_snoop_priv->agent.recv_handler(&mad_snoop_priv->agent,
+						   mad_recv_wc);
+		deref_snoop_agent(mad_snoop_priv);
+		spin_lock_irqsave(&qp_info->snoop_lock, flags);
+	}
+	spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+}
+
+/*
+ * Fill in *wc so that it describes a successful receive completion on
+ * 'qp', sourced from QP0, sized for one MAD plus a GRH.  All fields not
+ * set explicitly are zero.
+ */
+static void build_smp_wc(struct ib_qp *qp,
+			 u64 wr_id, u16 slid, u16 pkey_index, u8 port_num,
+			 struct ib_wc *wc)
+{
+	memset(wc, 0, sizeof *wc);
+
+	/* Completion identity and outcome */
+	wc->wr_id = wr_id;
+	wc->qp = qp;
+	wc->status = IB_WC_SUCCESS;
+	wc->opcode = IB_WC_RECV;
+	wc->byte_len = sizeof(struct ib_mad) + sizeof(struct ib_grh);
+
+	/* Addressing information */
+	wc->src_qp = IB_QP0;
+	wc->slid = slid;
+	wc->sl = 0;
+	wc->dlid_path_bits = 0;
+	wc->pkey_index = pkey_index;
+	wc->port_num = port_num;
+}
+
+/*
+ * Process an outgoing directed-route SMP before it is posted.
+ *
+ * Return 0 if SMP is to be sent
+ * Return 1 if SMP was consumed locally (whether or not solicited)
+ * Return < 0 if error
+ *
+ * When the SMP is consumed locally, a "local completion" record is queued
+ * on the agent's local_list and local_work is scheduled on the port
+ * workqueue; the record carries the send WR and, if there is one, the
+ * response MAD together with the agent that should receive it.
+ */
+static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
+				  struct ib_mad_send_wr_private *mad_send_wr)
+{
+	int ret = 0;
+	struct ib_smp *smp = mad_send_wr->send_buf.mad;
+	unsigned long flags;
+	struct ib_mad_local_private *local;
+	struct ib_mad_private *mad_priv;
+	struct ib_mad_port_private *port_priv;
+	struct ib_mad_agent_private *recv_mad_agent = NULL;
+	struct ib_device *device = mad_agent_priv->agent.device;
+	u8 port_num;
+	struct ib_wc mad_wc;
+	struct ib_send_wr *send_wr = &mad_send_wr->send_wr;
+
+	/* On a switch the port comes from the work request, not the agent */
+	if (device->node_type == RDMA_NODE_IB_SWITCH)
+		port_num = send_wr->wr.ud.port_num;
+	else
+		port_num = mad_agent_priv->agent.port_num;
+
+	/*
+	 * Directed route handling starts if the initial LID routed part of
+	 * a request or the ending LID routed part of a response is empty.
+	 * If we are at the start of the LID routed part, don't update the
+	 * hop_ptr or hop_cnt.  See section 14.2.2, Vol 1 IB spec.
+	 */
+	if ((ib_get_smp_direction(smp) ? smp->dr_dlid : smp->dr_slid) !=
+	     IB_LID_PERMISSIVE)
+		goto out;
+	if (smi_handle_dr_smp_send(smp, device->node_type, port_num) ==
+	     IB_SMI_DISCARD) {
+		ret = -EINVAL;
+		printk(KERN_ERR PFX "Invalid directed route\n");
+		goto out;
+	}
+
+	/* Check to post send on QP or process locally */
+	if (smi_check_local_smp(smp, device) == IB_SMI_DISCARD &&
+	    smi_check_local_returning_smp(smp, device) == IB_SMI_DISCARD)
+		goto out;
+
+	/* From here on the SMP is handled locally */
+	local = kmalloc(sizeof *local, GFP_ATOMIC);
+	if (!local) {
+		ret = -ENOMEM;
+		printk(KERN_ERR PFX "No memory for ib_mad_local_private\n");
+		goto out;
+	}
+	local->mad_priv = NULL;
+	local->recv_mad_agent = NULL;
+	mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_ATOMIC);
+	if (!mad_priv) {
+		ret = -ENOMEM;
+		printk(KERN_ERR PFX "No memory for local response MAD\n");
+		kfree(local);
+		goto out;
+	}
+
+	/* Synthesize the work completion handed to the driver */
+	build_smp_wc(mad_agent_priv->agent.qp,
+		     send_wr->wr_id, be16_to_cpu(smp->dr_slid),
+		     send_wr->wr.ud.pkey_index,
+		     send_wr->wr.ud.port_num, &mad_wc);
+
+	/* No GRH for DR SMP */
+	ret = device->process_mad(device, 0, port_num, &mad_wc, NULL,
+				  (struct ib_mad *)smp,
+				  (struct ib_mad *)&mad_priv->mad);
+	switch (ret)
+	{
+	case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY:
+		/* Driver produced a reply; deliver it only if it is a response
+		 * and the sending agent can receive */
+		if (ib_response_mad(&mad_priv->mad.mad) &&
+		    mad_agent_priv->agent.recv_handler) {
+			local->mad_priv = mad_priv;
+			local->recv_mad_agent = mad_agent_priv;
+			/*
+			 * Reference MAD agent until receive
+			 * side of local completion handled
+			 */
+			atomic_inc(&mad_agent_priv->refcount);
+		} else
+			kmem_cache_free(ib_mad_cache, mad_priv);
+		break;
+	case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED:
+		/* Nothing to deliver; only the send completion is generated */
+		kmem_cache_free(ib_mad_cache, mad_priv);
+		break;
+	case IB_MAD_RESULT_SUCCESS:
+		/* Treat like an incoming receive MAD */
+		port_priv = ib_get_mad_port(mad_agent_priv->agent.device,
+					    mad_agent_priv->agent.port_num);
+		if (port_priv) {
+			memcpy(&mad_priv->mad.mad, smp, sizeof(struct ib_mad));
+			/* NOTE(review): no refcount is taken here, unlike the
+			 * REPLY case; presumably find_mad_agent() returns a
+			 * referenced agent - confirm */
+			recv_mad_agent = find_mad_agent(port_priv,
+						        &mad_priv->mad.mad);
+		}
+		if (!port_priv || !recv_mad_agent) {
+			/*
+			 * No receiving agent so drop packet and
+			 * generate send completion.
+			 */
+			kmem_cache_free(ib_mad_cache, mad_priv);
+			break;
+		}
+		local->mad_priv = mad_priv;
+		local->recv_mad_agent = recv_mad_agent;
+		break;
+	default:
+		/* Any other driver result is reported as -EINVAL */
+		kmem_cache_free(ib_mad_cache, mad_priv);
+		kfree(local);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	local->mad_send_wr = mad_send_wr;
+	/* Reference MAD agent until send side of local completion handled */
+	atomic_inc(&mad_agent_priv->refcount);
+	/* Queue local completion to local list */
+	spin_lock_irqsave(&mad_agent_priv->lock, flags);
+	list_add_tail(&local->completion_list, &mad_agent_priv->local_list);
+	spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+	queue_work(mad_agent_priv->qp_info->port_priv->wq,
+		   &mad_agent_priv->local_work);
+	ret = 1;
+out:
+	return ret;
+}
+
+/*
+ * Compute how many pad bytes bring data_len up to a whole number of
+ * segments, where one segment is what remains of a MAD after hdr_len
+ * header bytes.  When data_len or the segment size is zero, the segment
+ * size itself is returned.
+ */
+static int get_pad_size(int hdr_len, int data_len)
+{
+	int room = sizeof(struct ib_mad) - hdr_len;
+	int tail;
+
+	if (!data_len || !room)
+		return room;
+
+	/* Bytes occupying the last, partially filled segment */
+	tail = data_len % room;
+	return tail ? room - tail : 0;
+}
+
+/* Unlink and free every RMPP segment on the send WR's rmpp_list. */
+static void free_send_rmpp_list(struct ib_mad_send_wr_private *mad_send_wr)
+{
+	struct ib_rmpp_segment *seg;
+	struct ib_rmpp_segment *next_seg;
+
+	/* The _safe iterator is needed because each entry is freed in turn */
+	list_for_each_entry_safe(seg, next_seg, &mad_send_wr->rmpp_list, list) {
+		list_del(&seg->list);
+		kfree(seg);
+	}
+}
+
+/*
+ * Allocate the list of RMPP data segments for a send buffer.
+ *
+ * Each segment carries seg_size (= MAD size minus header length) bytes;
+ * enough segments are allocated to hold data_len plus padding.  Segments
+ * are numbered from 1 via send_buf->seg_count.  The RMPP header in the
+ * MAD is initialised as an active DATA packet, and cur_seg/last_ack_seg
+ * are pointed at the first segment.
+ *
+ * Returns 0 on success or -ENOMEM, in which case every segment allocated
+ * so far has been freed.
+ */
+static int alloc_send_rmpp_list(struct ib_mad_send_wr_private *send_wr,
+				gfp_t gfp_mask)
+{
+	struct ib_mad_send_buf *send_buf = &send_wr->send_buf;
+	struct ib_rmpp_mad *rmpp_mad = send_buf->mad;
+	struct ib_rmpp_segment *seg = NULL;
+	int left, seg_size, pad;
+
+	send_buf->seg_size = sizeof (struct ib_mad) - send_buf->hdr_len;
+	seg_size = send_buf->seg_size;
+	pad = send_wr->pad;
+
+	/* Allocate data segments. */
+	for (left = send_buf->data_len + pad; left > 0; left -= seg_size) {
+		seg = kmalloc(sizeof (*seg) + seg_size, gfp_mask);
+		if (!seg) {
+			printk(KERN_ERR "alloc_send_rmpp_segs: RMPP mem "
+			       "alloc failed for len %zd, gfp %#x\n",
+			       sizeof (*seg) + seg_size, gfp_mask);
+			free_send_rmpp_list(send_wr);
+			return -ENOMEM;
+		}
+		seg->num = ++send_buf->seg_count;
+		list_add_tail(&seg->list, &send_wr->rmpp_list);
+	}
+
+	/* Zero any padding */
+	/* seg is the last segment allocated; the pad sits at its tail */
+	if (pad)
+		memset(seg->data + seg_size - pad, 0, pad);
+
+	rmpp_mad->rmpp_hdr.rmpp_version = send_wr->mad_agent_priv->
+					  agent.rmpp_version;
+	rmpp_mad->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_DATA;
+	ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+
+	/* Start both cursors at the first segment on the list */
+	send_wr->cur_seg = container_of(send_wr->rmpp_list.next,
+					struct ib_rmpp_segment, list);
+	send_wr->last_ack_seg = send_wr->cur_seg;
+	return 0;
+}
+
+/*
+ * ib_create_send_mad - Allocate and initialise a send buffer for a MAD
+ *
+ * One allocation holds the MAD (just the header when rmpp_active, a full
+ * MAD otherwise) immediately followed by the private send work request.
+ * For RMPP sends the payload lives in a separate list of segments.
+ *
+ * Returns the send buffer on success, ERR_PTR(-EINVAL) when the request
+ * does not fit the agent's RMPP capability or a single MAD, or
+ * ERR_PTR(-ENOMEM) on allocation failure.  A reference on the agent is
+ * taken on success.
+ */
+struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent,
+					    u32 remote_qpn, u16 pkey_index,
+					    int rmpp_active,
+					    int hdr_len, int data_len,
+					    gfp_t gfp_mask)
+{
+	struct ib_mad_agent_private *mad_agent_priv;
+	struct ib_mad_send_wr_private *mad_send_wr;
+	int pad, message_size, ret, size;
+	void *buf;
+
+	mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private,
+				      agent);
+	pad = get_pad_size(hdr_len, data_len);
+	message_size = hdr_len + data_len + pad;
+
+	/*
+	 * Reject RMPP or oversized messages on an agent without RMPP, and
+	 * oversized messages whenever RMPP is not active for this send.
+	 */
+	if ((!mad_agent->rmpp_version &&
+	     (rmpp_active || message_size > sizeof(struct ib_mad))) ||
+	    (!rmpp_active && message_size > sizeof(struct ib_mad)))
+		return ERR_PTR(-EINVAL);
+
+	size = rmpp_active ? hdr_len : sizeof(struct ib_mad);
+	buf = kzalloc(sizeof *mad_send_wr + size, gfp_mask);
+	if (!buf)
+		return ERR_PTR(-ENOMEM);
+
+	/* Layout: [MAD bytes (size)][struct ib_mad_send_wr_private] */
+	mad_send_wr = buf + size;
+	INIT_LIST_HEAD(&mad_send_wr->rmpp_list);
+	mad_send_wr->send_buf.mad = buf;
+	mad_send_wr->send_buf.hdr_len = hdr_len;
+	mad_send_wr->send_buf.data_len = data_len;
+	mad_send_wr->pad = pad;
+
+	mad_send_wr->mad_agent_priv = mad_agent_priv;
+	/* Two scatter/gather entries: header, then payload */
+	mad_send_wr->sg_list[0].length = hdr_len;
+	mad_send_wr->sg_list[0].lkey = mad_agent->mr->lkey;
+	mad_send_wr->sg_list[1].length = sizeof(struct ib_mad) - hdr_len;
+	mad_send_wr->sg_list[1].lkey = mad_agent->mr->lkey;
+
+	mad_send_wr->send_wr.wr_id = (unsigned long) mad_send_wr;
+	mad_send_wr->send_wr.sg_list = mad_send_wr->sg_list;
+	mad_send_wr->send_wr.num_sge = 2;
+	mad_send_wr->send_wr.opcode = IB_WR_SEND;
+	mad_send_wr->send_wr.send_flags = IB_SEND_SIGNALED;
+	mad_send_wr->send_wr.wr.ud.remote_qpn = remote_qpn;
+	mad_send_wr->send_wr.wr.ud.remote_qkey = IB_QP_SET_QKEY;
+	mad_send_wr->send_wr.wr.ud.pkey_index = pkey_index;
+
+	if (rmpp_active) {
+		ret = alloc_send_rmpp_list(mad_send_wr, gfp_mask);
+		if (ret) {
+			kfree(buf);
+			return ERR_PTR(ret);
+		}
+	}
+
+	mad_send_wr->send_buf.mad_agent = mad_agent;
+	/* The buffer pins the agent until ib_free_send_mad() */
+	atomic_inc(&mad_agent_priv->refcount);
+	return &mad_send_wr->send_buf;
+}
+EXPORT_SYMBOL(ib_create_send_mad);
+
+/*
+ * ib_get_mad_data_offset - Return the header length constant that applies
+ * to the given management class: SA, device (device mgmt / device adm /
+ * BIS), vendor (range 2), or the plain MAD header for everything else.
+ */
+int ib_get_mad_data_offset(u8 mgmt_class)
+{
+	if (mgmt_class == IB_MGMT_CLASS_SUBN_ADM)
+		return IB_MGMT_SA_HDR;
+
+	if (mgmt_class == IB_MGMT_CLASS_DEVICE_MGMT ||
+	    mgmt_class == IB_MGMT_CLASS_DEVICE_ADM ||
+	    mgmt_class == IB_MGMT_CLASS_BIS)
+		return IB_MGMT_DEVICE_HDR;
+
+	if (mgmt_class >= IB_MGMT_CLASS_VENDOR_RANGE2_START &&
+	    mgmt_class <= IB_MGMT_CLASS_VENDOR_RANGE2_END)
+		return IB_MGMT_VENDOR_HDR;
+
+	return IB_MGMT_MAD_HDR;
+}
+EXPORT_SYMBOL(ib_get_mad_data_offset);
+
+/*
+ * ib_is_mad_class_rmpp - Return 1 if the management class is one of the
+ * classes treated as RMPP-capable here (SA, device mgmt, device adm, BIS,
+ * vendor range 2), otherwise 0.
+ */
+int ib_is_mad_class_rmpp(u8 mgmt_class)
+{
+	return mgmt_class == IB_MGMT_CLASS_SUBN_ADM ||
+	       mgmt_class == IB_MGMT_CLASS_DEVICE_MGMT ||
+	       mgmt_class == IB_MGMT_CLASS_DEVICE_ADM ||
+	       mgmt_class == IB_MGMT_CLASS_BIS ||
+	       (mgmt_class >= IB_MGMT_CLASS_VENDOR_RANGE2_START &&
+		mgmt_class <= IB_MGMT_CLASS_VENDOR_RANGE2_END);
+}
+EXPORT_SYMBOL(ib_is_mad_class_rmpp);
+
+/*
+ * ib_get_rmpp_segment - Return the data area of RMPP segment seg_num
+ *
+ * The search starts at the send WR's cur_seg and walks forward or
+ * backward along the segment list depending on whether seg_num is above
+ * or below the current segment number.  cur_seg itself is the iterator,
+ * so it is left pointing at the segment that was found.
+ *
+ * NOTE(review): no check is made that seg_num exists; if the walk runs
+ * off the list, cur_seg would not refer to a real segment - confirm that
+ * callers only pass valid segment numbers.
+ */
+void *ib_get_rmpp_segment(struct ib_mad_send_buf *send_buf, int seg_num)
+{
+	struct ib_mad_send_wr_private *mad_send_wr;
+	struct list_head *list;
+
+	mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private,
+				   send_buf);
+	/* Iterate relative to the current segment rather than the list head */
+	list = &mad_send_wr->cur_seg->list;
+
+	if (mad_send_wr->cur_seg->num < seg_num) {
+		list_for_each_entry(mad_send_wr->cur_seg, list, list)
+			if (mad_send_wr->cur_seg->num == seg_num)
+				break;
+	} else if (mad_send_wr->cur_seg->num > seg_num) {
+		list_for_each_entry_reverse(mad_send_wr->cur_seg, list, list)
+			if (mad_send_wr->cur_seg->num == seg_num)
+				break;
+	}
+	return mad_send_wr->cur_seg->data;
+}
+EXPORT_SYMBOL(ib_get_rmpp_segment);
+
+/*
+ * Return the address of the payload to transmit: for a buffer without
+ * RMPP segments this is the data following the header inside the MAD,
+ * otherwise it is the data of the segment numbered mad_send_wr->seg_num.
+ */
+static inline void *ib_get_payload(struct ib_mad_send_wr_private *mad_send_wr)
+{
+	struct ib_mad_send_buf *sbuf = &mad_send_wr->send_buf;
+
+	if (!sbuf->seg_count)
+		return sbuf->mad + sbuf->hdr_len;
+
+	return ib_get_rmpp_segment(sbuf, mad_send_wr->seg_num);
+}
+
+/*
+ * ib_free_send_mad - Release a send buffer obtained from
+ * ib_create_send_mad(): frees its RMPP segments and the MAD allocation,
+ * then drops the agent reference the buffer was holding.
+ */
+void ib_free_send_mad(struct ib_mad_send_buf *send_buf)
+{
+	struct ib_mad_send_wr_private *wr;
+	struct ib_mad_agent_private *agent;
+
+	wr = container_of(send_buf, struct ib_mad_send_wr_private, send_buf);
+	/*
+	 * Resolve the agent first: send_buf lives in the same allocation
+	 * as send_buf->mad and must not be touched after the kfree().
+	 */
+	agent = container_of(send_buf->mad_agent,
+			     struct ib_mad_agent_private, agent);
+
+	free_send_rmpp_list(wr);
+	kfree(send_buf->mad);
+	deref_mad_agent(agent);
+}
+EXPORT_SYMBOL(ib_free_send_mad);
+
+/*
+ * Map a send WR's header and payload for DMA and hand it to the QP.
+ *
+ * If the send queue already holds max_active entries, the WR is parked on
+ * the QP's overflow list instead of being posted.  In both cases the send
+ * queue count is incremented on success.
+ *
+ * Returns 0 on success or the error from ib_post_send(), in which case
+ * both DMA mappings are undone.
+ */
+int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr)
+{
+	struct ib_mad_qp_info *qp_info;
+	struct list_head *list;
+	struct ib_send_wr *bad_send_wr;
+	struct ib_mad_agent *mad_agent;
+	struct ib_sge *sge;
+	unsigned long flags;
+	int ret;
+
+	/* Set WR ID to find mad_send_wr upon completion */
+	qp_info = mad_send_wr->mad_agent_priv->qp_info;
+	mad_send_wr->send_wr.wr_id = (unsigned long)&mad_send_wr->mad_list;
+	mad_send_wr->mad_list.mad_queue = &qp_info->send_queue;
+
+	mad_agent = mad_send_wr->send_buf.mad_agent;
+	sge = mad_send_wr->sg_list;
+	/* sge[0] covers the MAD header */
+	sge[0].addr = ib_dma_map_single(mad_agent->device,
+					mad_send_wr->send_buf.mad,
+					sge[0].length,
+					DMA_TO_DEVICE);
+	mad_send_wr->header_mapping = sge[0].addr;
+
+	/* sge[1] covers the payload (inline data or current RMPP segment) */
+	sge[1].addr = ib_dma_map_single(mad_agent->device,
+					ib_get_payload(mad_send_wr),
+					sge[1].length,
+					DMA_TO_DEVICE);
+	mad_send_wr->payload_mapping = sge[1].addr;
+
+	spin_lock_irqsave(&qp_info->send_queue.lock, flags);
+	if (qp_info->send_queue.count < qp_info->send_queue.max_active) {
+		ret = ib_post_send(mad_agent->qp, &mad_send_wr->send_wr,
+				   &bad_send_wr);
+		list = &qp_info->send_queue.list;
+	} else {
+		/* Queue is full: defer the WR on the overflow list */
+		ret = 0;
+		list = &qp_info->overflow_list;
+	}
+
+	if (!ret) {
+		qp_info->send_queue.count++;
+		list_add_tail(&mad_send_wr->mad_list.list, list);
+	}
+	spin_unlock_irqrestore(&qp_info->send_queue.lock, flags);
+	/* Posting failed: release both DMA mappings */
+	if (ret) {
+		ib_dma_unmap_single(mad_agent->device,
+				    mad_send_wr->header_mapping,
+				    sge[0].length, DMA_TO_DEVICE);
+		ib_dma_unmap_single(mad_agent->device,
+				    mad_send_wr->payload_mapping,
+				    sge[1].length, DMA_TO_DEVICE);
+	}
+	return ret;
+}
+
+/*
+ * ib_post_send_mad - Posts MAD(s) to the send queue of the QP associated
+ *  with the registered client
+ *
+ * Walks the chain of send buffers linked through ->next and posts each
+ * one.  Processing stops at the first failure; if bad_send_buf is
+ * non-NULL it is set to the buffer that failed.
+ *
+ * Returns 0 when every buffer was accepted, or a negative error code
+ * (also -EINVAL when send_buf is NULL on entry).
+ */
+int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
+		     struct ib_mad_send_buf **bad_send_buf)
+{
+	struct ib_mad_agent_private *mad_agent_priv;
+	struct ib_mad_send_buf *next_send_buf;
+	struct ib_mad_send_wr_private *mad_send_wr;
+	unsigned long flags;
+	int ret = -EINVAL;
+
+	/* Walk list of send WRs and post each on send list */
+	for (; send_buf; send_buf = next_send_buf) {
+
+		mad_send_wr = container_of(send_buf,
+					   struct ib_mad_send_wr_private,
+					   send_buf);
+		mad_agent_priv = mad_send_wr->mad_agent_priv;
+
+		/*
+		 * A send handler is mandatory; a receive handler is needed
+		 * as well when a timeout is set.
+		 */
+		if (!send_buf->mad_agent->send_handler ||
+		    (send_buf->timeout_ms &&
+		     !send_buf->mad_agent->recv_handler)) {
+			ret = -EINVAL;
+			goto error;
+		}
+
+		/* An RMPP agent may only send classes that support RMPP */
+		if (!ib_is_mad_class_rmpp(((struct ib_mad_hdr *) send_buf->mad)->mgmt_class)) {
+			if (mad_agent_priv->agent.rmpp_version) {
+				ret = -EINVAL;
+				goto error;
+			}
+		}
+
+		/*
+		 * Save pointer to next work request to post in case the
+		 * current one completes, and the user modifies the work
+		 * request associated with the completion
+		 */
+		next_send_buf = send_buf->next;
+		mad_send_wr->send_wr.wr.ud.ah = send_buf->ah;
+
+		if (((struct ib_mad_hdr *) send_buf->mad)->mgmt_class ==
+		    IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+			ret = handle_outgoing_dr_smp(mad_agent_priv,
+						     mad_send_wr);
+			if (ret < 0)		/* error */
+				goto error;
+			else if (ret == 1)	/* locally consumed */
+				continue;
+		}
+
+		mad_send_wr->tid = ((struct ib_mad_hdr *) send_buf->mad)->tid;
+		/* Timeout will be updated after send completes */
+		mad_send_wr->timeout = msecs_to_jiffies(send_buf->timeout_ms);
+		mad_send_wr->max_retries = send_buf->retries;
+		mad_send_wr->retries_left = send_buf->retries;
+		send_buf->retries = 0;
+		/* Reference for work request to QP + response */
+		mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0);
+		mad_send_wr->status = IB_WC_SUCCESS;
+
+		/* Reference MAD agent until send completes */
+		atomic_inc(&mad_agent_priv->refcount);
+		spin_lock_irqsave(&mad_agent_priv->lock, flags);
+		list_add_tail(&mad_send_wr->agent_list,
+			      &mad_agent_priv->send_list);
+		spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+		/* RMPP agents go through the RMPP layer unless it consumed the MAD */
+		if (mad_agent_priv->agent.rmpp_version) {
+			ret = ib_send_rmpp_mad(mad_send_wr);
+			if (ret >= 0 && ret != IB_RMPP_RESULT_CONSUMED)
+				ret = ib_send_mad(mad_send_wr);
+		} else
+			ret = ib_send_mad(mad_send_wr);
+		if (ret < 0) {
+			/* Fail send request */
+			/* Undo the send_list insertion and the agent reference */
+			spin_lock_irqsave(&mad_agent_priv->lock, flags);
+			list_del(&mad_send_wr->agent_list);
+			spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+			atomic_dec(&mad_agent_priv->refcount);
+			goto error;
+		}
+	}
+	return 0;
+error:
+	if (bad_send_buf)
+		*bad_send_buf = send_buf;
+	return ret;
+}
+EXPORT_SYMBOL(ib_post_send_mad);
+
+/*
+ * ib_free_recv_mad - Returns data buffers used to receive
+ *  a MAD to the access layer
+ *
+ * Every receive buffer chained on mad_recv_wc->rmpp_list is mapped back to
+ * its enclosing ib_mad_private and returned to the MAD cache.
+ */
+void ib_free_recv_mad(struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_mad_recv_buf *cur, *next;
+	struct ib_mad_private_header *hdr;
+	struct ib_mad_private *mad;
+	struct list_head pending;
+
+	/* Detach the whole chain first, then free from the private list */
+	INIT_LIST_HEAD(&pending);
+	list_splice_init(&mad_recv_wc->rmpp_list, &pending);
+
+	list_for_each_entry_safe(cur, next, &pending, list) {
+		/* recv_buf -> recv_wc -> private header -> ib_mad_private */
+		mad_recv_wc = container_of(cur, struct ib_mad_recv_wc,
+					   recv_buf);
+		hdr = container_of(mad_recv_wc, struct ib_mad_private_header,
+				   recv_wc);
+		mad = container_of(hdr, struct ib_mad_private, header);
+		kmem_cache_free(ib_mad_cache, mad);
+	}
+}
+EXPORT_SYMBOL(ib_free_recv_mad);
+
+/*
+ * ib_redirect_mad_qp - Placeholder: QP redirection is not supported, so
+ * every call fails with ERR_PTR(-EINVAL) and the arguments are ignored.
+ */
+struct ib_mad_agent *ib_redirect_mad_qp(struct ib_qp *qp,
+					u8 rmpp_version,
+					ib_mad_send_handler send_handler,
+					ib_mad_recv_handler recv_handler,
+					void *context)
+{
+	/* XXX: for now */
+	return ERR_PTR(-EINVAL);
+}
+EXPORT_SYMBOL(ib_redirect_mad_qp);
+
+/*
+ * ib_process_mad_wc - Placeholder: logs that the function is not
+ * implemented and returns 0 without looking at its arguments.
+ */
+int ib_process_mad_wc(struct ib_mad_agent *mad_agent,
+		      struct ib_wc *wc)
+{
+	printk(KERN_ERR PFX "ib_process_mad_wc() not implemented yet\n");
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_process_mad_wc);
+
+/*
+ * Check whether any method selected in the request's method_mask already
+ * has an agent in the given method table.  Returns -EINVAL (after logging
+ * the offending method) if so, 0 otherwise.
+ */
+static int method_in_use(struct ib_mad_mgmt_method_table **method,
+			 struct ib_mad_reg_req *mad_reg_req)
+{
+	int bit;
+
+	bit = find_first_bit(mad_reg_req->method_mask, IB_MGMT_MAX_METHODS);
+	while (bit < IB_MGMT_MAX_METHODS) {
+		if ((*method)->agent[bit]) {
+			printk(KERN_ERR PFX "Method %d already in use\n", bit);
+			return -EINVAL;
+		}
+		bit = find_next_bit(mad_reg_req->method_mask,
+				    IB_MGMT_MAX_METHODS, 1+bit);
+	}
+
+	return 0;
+}
+
+/*
+ * Allocate a zeroed management method table and store it in *method.
+ * Returns 0 on success; on failure *method is NULL and -ENOMEM is
+ * returned.
+ */
+static int allocate_method_table(struct ib_mad_mgmt_method_table **method)
+{
+	*method = kzalloc(sizeof **method, GFP_ATOMIC);
+	if (*method)
+		return 0;
+
+	printk(KERN_ERR PFX "No memory for "
+	       "ib_mad_mgmt_method_table\n");
+	return -ENOMEM;
+}
+
+/*
+ * Return 1 if at least one method slot in the table still has an agent,
+ * 0 if the table is completely empty.
+ */
+static int check_method_table(struct ib_mad_mgmt_method_table *method)
+{
+	int idx = 0;
+
+	while (idx < IB_MGMT_MAX_METHODS) {
+		if (method->agent[idx])
+			return 1;
+		idx++;
+	}
+	return 0;
+}
+
+/*
+ * Return 1 if the class table still references at least one method
+ * table, 0 if every entry is NULL.
+ */
+static int check_class_table(struct ib_mad_mgmt_class_table *class)
+{
+	int idx = 0;
+
+	while (idx < MAX_MGMT_CLASS) {
+		if (class->method_table[idx])
+			return 1;
+		idx++;
+	}
+	return 0;
+}
+
+/*
+ * Return 1 if the vendor class still has a method table for at least one
+ * OUI slot, 0 if every slot is NULL.
+ */
+static int check_vendor_class(struct ib_mad_mgmt_vendor_class *vendor_class)
+{
+	int idx = 0;
+
+	while (idx < MAX_MGMT_OUI) {
+		if (vendor_class->method_table[idx])
+			return 1;
+		idx++;
+	}
+	return 0;
+}
+
+/*
+ * Look up a 3-byte OUI in the vendor class.  Returns the index of the
+ * first matching slot, or -1 if no slot matches.
+ */
+static int find_vendor_oui(struct ib_mad_mgmt_vendor_class *vendor_class,
+			   char *oui)
+{
+	int slot;
+
+	for (slot = 0; slot < MAX_MGMT_OUI; slot++) {
+		/* Is there matching OUI for this vendor class ? */
+		if (memcmp(vendor_class->oui[slot], oui, 3) == 0)
+			return slot;
+	}
+
+	return -1;
+}
+
+/*
+ * Return 1 if the vendor table still references at least one vendor
+ * class, 0 if every entry is NULL.
+ */
+static int check_vendor_table(struct ib_mad_mgmt_vendor_class_table *vendor)
+{
+	int idx = 0;
+
+	while (idx < MAX_MGMT_VENDOR_RANGE2) {
+		if (vendor->vendor_class[idx])
+			return 1;
+		idx++;
+	}
+
+	return 0;
+}
+
+/* Clear every method slot in the table that points at the given agent. */
+static void remove_methods_mad_agent(struct ib_mad_mgmt_method_table *method,
+				     struct ib_mad_agent_private *agent)
+{
+	int idx;
+
+	for (idx = 0; idx < IB_MGMT_MAX_METHODS; idx++) {
+		if (method->agent[idx] != agent)
+			continue;
+		method->agent[idx] = NULL;
+	}
+}
+
+/*
+ * Record a registration for a non-vendor (no OUI) management class.
+ *
+ * Allocates the per-version class table and the per-class method table
+ * on demand, verifies that none of the requested methods is taken, and
+ * then points every requested method slot at agent_priv.
+ *
+ * Returns 0 on success, -ENOMEM if a table could not be allocated, or
+ * -EINVAL if a requested method is already in use.
+ */
+static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req,
+			      struct ib_mad_agent_private *agent_priv,
+			      u8 mgmt_class)
+{
+	struct ib_mad_port_private *port_priv;
+	struct ib_mad_mgmt_class_table **class;
+	struct ib_mad_mgmt_method_table **method;
+	int i, ret;
+
+	port_priv = agent_priv->qp_info->port_priv;
+	class = &port_priv->version[mad_reg_req->mgmt_class_version].class;
+	if (!*class) {
+		/* Allocate management class table for "new" class version */
+		*class = kzalloc(sizeof **class, GFP_ATOMIC);
+		if (!*class) {
+			printk(KERN_ERR PFX "No memory for "
+			       "ib_mad_mgmt_class_table\n");
+			ret = -ENOMEM;
+			goto error1;
+		}
+
+		/* Allocate method table for this management class */
+		method = &(*class)->method_table[mgmt_class];
+		if ((ret = allocate_method_table(method)))
+			goto error2;
+	} else {
+		method = &(*class)->method_table[mgmt_class];
+		if (!*method) {
+			/* Allocate method table for this management class */
+			if ((ret = allocate_method_table(method)))
+				goto error1;
+		}
+	}
+
+	/* Now, make sure methods are not already in use */
+	if (method_in_use(method, mad_reg_req))
+		goto error3;
+
+	/* Finally, add in methods being registered */
+	for (i = find_first_bit(mad_reg_req->method_mask,
+				IB_MGMT_MAX_METHODS);
+	     i < IB_MGMT_MAX_METHODS;
+	     i = find_next_bit(mad_reg_req->method_mask, IB_MGMT_MAX_METHODS,
+			       1+i)) {
+		(*method)->agent[i] = agent_priv;
+	}
+	return 0;
+
+error3:
+	/* Remove any methods for this mad agent */
+	remove_methods_mad_agent(*method, agent_priv);
+	/* Now, check to see if there are any methods in use */
+	if (!check_method_table(*method)) {
+		/* If not, release management method table */
+		kfree(*method);
+		*method = NULL;
+	}
+	ret = -EINVAL;
+	goto error1;
+error2:
+	/* Only reached when the class table was allocated by this call */
+	kfree(*class);
+	*class = NULL;
+error1:
+	return ret;
+}
+
+/*
+ * Register agent_priv for the methods set in mad_reg_req->method_mask in the
+ * vendor (OUI-carrying) class tables of the agent's port.
+ *
+ * The vendor class table, the vendor class entry and the per-OUI method
+ * table are allocated here when missing.  The locals "vendor" and
+ * "vendor_class" stay NULL unless this call allocated them, so the error
+ * path only releases what this call created.
+ *
+ * Returns 0 on success.  ret starts out as -ENOMEM and keeps that value for
+ * the allocation failures handled here and for the "All OUI slots in use"
+ * case; a failing allocate_method_table() supplies its own result, and a
+ * method conflict yields -EINVAL.
+ */
+static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req,
+			   struct ib_mad_agent_private *agent_priv)
+{
+	struct ib_mad_port_private *port_priv;
+	struct ib_mad_mgmt_vendor_class_table **vendor_table;
+	struct ib_mad_mgmt_vendor_class_table *vendor = NULL;
+	struct ib_mad_mgmt_vendor_class *vendor_class = NULL;
+	struct ib_mad_mgmt_method_table **method;
+	int i, ret = -ENOMEM;
+	u8 vclass;
+
+	/* "New" vendor (with OUI) class */
+	vclass = vendor_class_index(mad_reg_req->mgmt_class);
+	port_priv = agent_priv->qp_info->port_priv;
+	vendor_table = &port_priv->version[
+				mad_reg_req->mgmt_class_version].vendor;
+	if (!*vendor_table) {
+		/* Allocate mgmt vendor class table for "new" class version */
+		vendor = kzalloc(sizeof *vendor, GFP_ATOMIC);
+		if (!vendor) {
+			printk(KERN_ERR PFX "No memory for "
+			       "ib_mad_mgmt_vendor_class_table\n");
+			goto error1;
+		}
+
+		*vendor_table = vendor;
+	}
+	if (!(*vendor_table)->vendor_class[vclass]) {
+		/* Allocate table for this management vendor class */
+		vendor_class = kzalloc(sizeof *vendor_class, GFP_ATOMIC);
+		if (!vendor_class) {
+			printk(KERN_ERR PFX "No memory for "
+			       "ib_mad_mgmt_vendor_class\n");
+			goto error2;
+		}
+
+		(*vendor_table)->vendor_class[vclass] = vendor_class;
+	}
+	/* First pass: reuse the slot that already holds this OUI, if any */
+	for (i = 0; i < MAX_MGMT_OUI; i++) {
+		/* Is there matching OUI for this vendor class ? */
+		if (!memcmp((*vendor_table)->vendor_class[vclass]->oui[i],
+			    mad_reg_req->oui, 3)) {
+			method = &(*vendor_table)->vendor_class[
+						vclass]->method_table[i];
+			BUG_ON(!*method);
+			goto check_in_use;
+		}
+	}
+	/* Second pass: claim the first slot is_vendor_oui() reports as free */
+	for (i = 0; i < MAX_MGMT_OUI; i++) {
+		/* OUI slot available ? */
+		if (!is_vendor_oui((*vendor_table)->vendor_class[
+				vclass]->oui[i])) {
+			method = &(*vendor_table)->vendor_class[
+				vclass]->method_table[i];
+			BUG_ON(*method);
+			/* Allocate method table for this OUI */
+			if ((ret = allocate_method_table(method)))
+				goto error3;
+			memcpy((*vendor_table)->vendor_class[vclass]->oui[i],
+			       mad_reg_req->oui, 3);
+			goto check_in_use;
+		}
+	}
+	/* ret still holds its initial -ENOMEM on this path */
+	printk(KERN_ERR PFX "All OUI slots in use\n");
+	goto error3;
+
+check_in_use:
+	/* Now, make sure methods are not already in use */
+	if (method_in_use(method, mad_reg_req))
+		goto error4;
+
+	/* Finally, add in methods being registered */
+	for (i = find_first_bit(mad_reg_req->method_mask,
+				IB_MGMT_MAX_METHODS);
+	     i < IB_MGMT_MAX_METHODS;
+	     i = find_next_bit(mad_reg_req->method_mask, IB_MGMT_MAX_METHODS,
+			       1+i)) {
+		(*method)->agent[i] = agent_priv;
+	}
+	return 0;
+
+error4:
+	/* Remove any methods for this mad agent */
+	remove_methods_mad_agent(*method, agent_priv);
+	/* Now, check to see if there are any methods in use */
+	if (!check_method_table(*method)) {
+		/* If not, release management method table */
+		kfree(*method);
+		*method = NULL;
+	}
+	ret = -EINVAL;
+error3:
+	/* Non-NULL only if the vendor class was allocated by this call */
+	if (vendor_class) {
+		(*vendor_table)->vendor_class[vclass] = NULL;
+		kfree(vendor_class);
+	}
+error2:
+	/* Non-NULL only if the vendor class table was allocated by this call */
+	if (vendor) {
+		*vendor_table = NULL;
+		kfree(vendor);
+	}
+error1:
+	return ret;
+}
+
+/*
+ * Undo the registration described by agent_priv->reg_req.
+ *
+ * The agent is cleared from the method table it was registered in, first in
+ * the plain class table and then, for vendor classes, in the per-OUI vendor
+ * tables.  Each table that becomes empty as a result is freed and its slot
+ * in the parent table (or in port_priv->version[]) is reset to NULL.
+ * Does nothing when the agent was created without a registration request.
+ */
+static void remove_mad_reg_req(struct ib_mad_agent_private *agent_priv)
+{
+	struct ib_mad_port_private *port_priv;
+	struct ib_mad_mgmt_class_table *class;
+	struct ib_mad_mgmt_method_table *method;
+	struct ib_mad_mgmt_vendor_class_table *vendor;
+	struct ib_mad_mgmt_vendor_class *vendor_class;
+	int index;
+	u8 mgmt_class;
+
+	/*
+	 * Was MAD registration request supplied
+	 * with original registration ?
+	 */
+	if (!agent_priv->reg_req) {
+		goto out;
+	}
+
+	port_priv = agent_priv->qp_info->port_priv;
+	mgmt_class = convert_mgmt_class(agent_priv->reg_req->mgmt_class);
+	class = port_priv->version[
+			agent_priv->reg_req->mgmt_class_version].class;
+	if (!class)
+		goto vendor_check;
+
+	method = class->method_table[mgmt_class];
+	if (method) {
+		/* Remove any methods for this mad agent */
+		remove_methods_mad_agent(method, agent_priv);
+		/* Now, check to see if there are any methods still in use */
+		if (!check_method_table(method)) {
+			/* If not, release management method table */
+			 kfree(method);
+			 class->method_table[mgmt_class] = NULL;
+			 /* Any management classes left ? */
+			if (!check_class_table(class)) {
+				/* If not, release management class table */
+				kfree(class);
+				port_priv->version[
+					agent_priv->reg_req->
+					mgmt_class_version].class = NULL;
+			}
+		}
+	}
+
+vendor_check:
+	/* mgmt_class still holds the convert_mgmt_class() result here */
+	if (!is_vendor_class(mgmt_class))
+		goto out;
+
+	/* normalize mgmt_class to vendor range 2 */
+	mgmt_class = vendor_class_index(agent_priv->reg_req->mgmt_class);
+	vendor = port_priv->version[
+			agent_priv->reg_req->mgmt_class_version].vendor;
+
+	if (!vendor)
+		goto out;
+
+	vendor_class = vendor->vendor_class[mgmt_class];
+	if (vendor_class) {
+		index = find_vendor_oui(vendor_class, agent_priv->reg_req->oui);
+		if (index < 0)
+			goto out;
+		method = vendor_class->method_table[index];
+		if (method) {
+			/* Remove any methods for this mad agent */
+			remove_methods_mad_agent(method, agent_priv);
+			/*
+			 * Now, check to see if there are
+			 * any methods still in use
+			 */
+			if (!check_method_table(method)) {
+				/* If not, release management method table */
+				kfree(method);
+				vendor_class->method_table[index] = NULL;
+				/* Zero the OUI bytes of the released slot */
+				memset(vendor_class->oui[index], 0, 3);
+				/* Any OUIs left ? */
+				if (!check_vendor_class(vendor_class)) {
+					/* If not, release vendor class table */
+					kfree(vendor_class);
+					vendor->vendor_class[mgmt_class] = NULL;
+					/* Any other vendor classes left ? */
+					if (!check_vendor_table(vendor)) {
+						kfree(vendor);
+						port_priv->version[
+							agent_priv->reg_req->
+							mgmt_class_version].
+							vendor = NULL;
+					}
+				}
+			}
+		}
+	}
+
+out:
+	return;
+}
+
+/*
+ * Find the agent that should receive an incoming MAD on this port.
+ *
+ * Responses are routed by the high 32 bits of the transaction ID, matched
+ * against each registered agent's hi_tid.  Everything else is routed by
+ * class version, management class and method and, for vendor classes, by
+ * OUI as well.
+ *
+ * The lookup runs under port_priv->reg_lock.  On success the agent's
+ * refcount has been incremented and the agent is returned.  NULL is
+ * returned when no agent matches, when the MAD carries a class version or
+ * management class outside the routing tables, or when the matching agent
+ * has no recv_handler.
+ */
+static struct ib_mad_agent_private *
+find_mad_agent(struct ib_mad_port_private *port_priv,
+	       struct ib_mad *mad)
+{
+	struct ib_mad_agent_private *mad_agent = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&port_priv->reg_lock, flags);
+	if (ib_response_mad(mad)) {
+		u32 hi_tid;
+		struct ib_mad_agent_private *entry;
+
+		/*
+		 * Routing is based on high 32 bits of transaction ID
+		 * of MAD.
+		 */
+		hi_tid = be64_to_cpu(mad->mad_hdr.tid) >> 32;
+		list_for_each_entry(entry, &port_priv->agent_list, agent_list) {
+			if (entry->agent.hi_tid == hi_tid) {
+				mad_agent = entry;
+				break;
+			}
+		}
+	} else {
+		struct ib_mad_mgmt_class_table *class;
+		struct ib_mad_mgmt_method_table *method;
+		struct ib_mad_mgmt_vendor_class_table *vendor;
+		struct ib_mad_mgmt_vendor_class *vendor_class;
+		struct ib_vendor_mad *vendor_mad;
+		int index;
+		int class_index;
+
+		/*
+		 * Routing is based on version, class, and method
+		 * For "newer" vendor MADs, also based on OUI
+		 */
+		if (mad->mad_hdr.class_version >= MAX_MGMT_VERSION)
+			goto out;
+		if (!is_vendor_class(mad->mad_hdr.mgmt_class)) {
+			class = port_priv->version[
+					mad->mad_hdr.class_version].class;
+			if (!class)
+				goto out;
+			/*
+			 * mgmt_class is taken from the received MAD and is
+			 * not otherwise range-checked on this path, while
+			 * method_table[] is walked elsewhere with
+			 * MAX_MGMT_CLASS as its bound.  Drop MADs whose class
+			 * would index past the end of the table instead of
+			 * reading out of bounds.
+			 */
+			class_index = convert_mgmt_class(
+						mad->mad_hdr.mgmt_class);
+			if (class_index < 0 || class_index >= MAX_MGMT_CLASS)
+				goto out;
+			method = class->method_table[class_index];
+			if (method)
+				mad_agent = method->agent[mad->mad_hdr.method &
+							  ~IB_MGMT_METHOD_RESP];
+		} else {
+			vendor = port_priv->version[
+					mad->mad_hdr.class_version].vendor;
+			if (!vendor)
+				goto out;
+			vendor_class = vendor->vendor_class[vendor_class_index(
+						mad->mad_hdr.mgmt_class)];
+			if (!vendor_class)
+				goto out;
+			/* Find matching OUI */
+			vendor_mad = (struct ib_vendor_mad *)mad;
+			index = find_vendor_oui(vendor_class, vendor_mad->oui);
+			if (index == -1)
+				goto out;
+			method = vendor_class->method_table[index];
+			if (method) {
+				mad_agent = method->agent[mad->mad_hdr.method &
+							  ~IB_MGMT_METHOD_RESP];
+			}
+		}
+	}
+
+	if (mad_agent) {
+		/* Only hand out agents that can actually take the MAD */
+		if (mad_agent->agent.recv_handler)
+			atomic_inc(&mad_agent->refcount);
+		else {
+			printk(KERN_NOTICE PFX "No receive handler for client "
+			       "%p on port %d\n",
+			       &mad_agent->agent, port_priv->port_num);
+			mad_agent = NULL;
+		}
+	}
+out:
+	spin_unlock_irqrestore(&port_priv->reg_lock, flags);
+
+	return mad_agent;
+}
+
+/*
+ * Sanity-check a received MAD against the QP it arrived on.
+ * Returns 1 when the MAD is acceptable, 0 when it must be dropped: an
+ * unknown base version, a subnet management class on a QP other than 0,
+ * or any other class on QP0.
+ */
+static int validate_mad(struct ib_mad *mad, u32 qp_num)
+{
+	int is_smi;
+
+	/* Make sure MAD base version is understood */
+	if (mad->mad_hdr.base_version != IB_MGMT_BASE_VERSION) {
+		printk(KERN_ERR PFX "MAD received with unsupported base "
+		       "version %d\n", mad->mad_hdr.base_version);
+		return 0;
+	}
+
+	is_smi = (mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED) ||
+		 (mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE);
+
+	/* SMI packets belong on QP0 only; GSI packets must not use QP0 */
+	if (is_smi)
+		return qp_num == 0;
+
+	return qp_num != 0;
+}
+
+/*
+ * Returns nonzero when the MAD should be treated as a data MAD: the agent
+ * does not use RMPP, or the MAD's RMPP header is not flagged active, or
+ * its RMPP type is DATA.
+ */
+static int is_data_mad(struct ib_mad_agent_private *mad_agent_priv,
+		       struct ib_mad_hdr *mad_hdr)
+{
+	struct ib_rmpp_mad *rmpp = (struct ib_rmpp_mad *)mad_hdr;
+
+	if (!mad_agent_priv->agent.rmpp_version)
+		return 1;
+
+	if (!(ib_get_rmpp_flags(&rmpp->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE))
+		return 1;
+
+	return rmpp->rmpp_hdr.rmpp_type == IB_MGMT_RMPP_TYPE_DATA;
+}
+
+/*
+ * Returns nonzero when the received MAD carries the same management class
+ * as the MAD held by the send work request.
+ */
+static inline int rcv_has_same_class(struct ib_mad_send_wr_private *wr,
+				     struct ib_mad_recv_wc *rwc)
+{
+	struct ib_mad *sent = (struct ib_mad *)(wr->send_buf.mad);
+
+	return sent->mad_hdr.mgmt_class ==
+		rwc->recv_buf.mad->mad_hdr.mgmt_class;
+}
+
+/*
+ * Compare the addressing of a received MAD with that of a send work
+ * request.  Returns nonzero when they are considered to match and 0
+ * otherwise, including every case where the comparison cannot be made
+ * (both MADs are requests or both responses, the AH cannot be queried,
+ * only one side carries a GRH, or a cached LMC/GID lookup fails).
+ */
+static inline int rcv_has_same_gid(struct ib_mad_agent_private *mad_agent_priv,
+				   struct ib_mad_send_wr_private *wr,
+				   struct ib_mad_recv_wc *rwc )
+{
+	struct ib_device *device = mad_agent_priv->agent.device;
+	u8 port_num = mad_agent_priv->agent.port_num;
+	struct ib_ah_attr attr;
+	union ib_gid sgid;
+	u8 sent_is_resp, rcvd_is_resp;
+	u8 lmc;
+	int ah_has_grh;
+
+	sent_is_resp = ib_response_mad((struct ib_mad *)wr->send_buf.mad);
+	rcvd_is_resp = ib_response_mad(rwc->recv_buf.mad);
+
+	/* both requests, or both responses. GIDs different */
+	if (sent_is_resp == rcvd_is_resp)
+		return 0;
+
+	/* Assume not equal, to avoid false positives. */
+	if (ib_query_ah(wr->send_buf.ah, &attr))
+		return 0;
+
+	/* one has GID, other does not.  Assume different */
+	ah_has_grh = !!(attr.ah_flags & IB_AH_GRH);
+	if (ah_has_grh != !!(rwc->wc->wc_flags & IB_WC_GRH))
+		return 0;
+
+	if (!sent_is_resp && rcvd_is_resp) {
+		/* is request/response. */
+		if (ah_has_grh) {
+			if (ib_get_cached_gid(device, port_num,
+					      attr.grh.sgid_index, &sgid))
+				return 0;
+			return !memcmp(sgid.raw, rwc->recv_buf.grh->dgid.raw,
+				       16);
+		}
+
+		if (ib_get_cached_lmc(device, port_num, &lmc))
+			return 0;
+		return (!lmc || !((attr.src_path_bits ^
+				   rwc->wc->dlid_path_bits) &
+				  ((1 << lmc) - 1)));
+	}
+
+	if (ah_has_grh)
+		return !memcmp(attr.grh.dgid.raw, rwc->recv_buf.grh->sgid.raw,
+			       16);
+
+	return attr.dlid == rwc->wc->slid;
+}
+
+/* Returns 1 for the directed-route subnet management class, else 0. */
+static inline int is_direct(u8 class)
+{
+	if (class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+		return 1;
+	return 0;
+}
+
+/*
+ * Find the send work request that a received MAD answers.
+ *
+ * The agent's wait_list is searched first, then its send_list.  A request
+ * matches when its TID and management class equal those of the received
+ * MAD and, unless the class is directed route, rcv_has_same_gid() accepts
+ * it.  On the send_list the request must additionally be a data MAD with a
+ * nonzero timeout.
+ *
+ * Returns the first matching request if its status is still
+ * IB_WC_SUCCESS, and NULL if it is not or if nothing matches.
+ */
+struct ib_mad_send_wr_private*
+ib_find_send_mad(struct ib_mad_agent_private *mad_agent_priv,
+		 struct ib_mad_recv_wc *wc)
+{
+	struct ib_mad_send_wr_private *wr;
+	struct ib_mad *rcvd;
+
+	rcvd = (struct ib_mad *)wc->recv_buf.mad;
+
+	list_for_each_entry(wr, &mad_agent_priv->wait_list, agent_list) {
+		if (wr->tid != rcvd->mad_hdr.tid)
+			continue;
+		if (!rcv_has_same_class(wr, wc))
+			continue;
+		/*
+		 * Don't check GID for direct routed MADs.
+		 * These might have permissive LIDs.
+		 */
+		if (!is_direct(wc->recv_buf.mad->mad_hdr.mgmt_class) &&
+		    !rcv_has_same_gid(mad_agent_priv, wr, wc))
+			continue;
+
+		return (wr->status == IB_WC_SUCCESS) ? wr : NULL;
+	}
+
+	/*
+	 * It's possible to receive the response before we've
+	 * been notified that the send has completed
+	 */
+	list_for_each_entry(wr, &mad_agent_priv->send_list, agent_list) {
+		if (!is_data_mad(mad_agent_priv, wr->send_buf.mad))
+			continue;
+		if (wr->tid != rcvd->mad_hdr.tid)
+			continue;
+		if (!wr->timeout)
+			continue;
+		if (!rcv_has_same_class(wr, wc))
+			continue;
+		/*
+		 * Don't check GID for direct routed MADs.
+		 * These might have permissive LIDs.
+		 */
+		if (!is_direct(wc->recv_buf.mad->mad_hdr.mgmt_class) &&
+		    !rcv_has_same_gid(mad_agent_priv, wr, wc))
+			continue;
+
+		/* Verify request has not been canceled */
+		return (wr->status == IB_WC_SUCCESS) ? wr : NULL;
+	}
+
+	return NULL;
+}
+
+/*
+ * Clear the timeout of a send work request and, when exactly one
+ * reference is left, move it to the tail of its agent's done_list.
+ */
+void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr)
+{
+	mad_send_wr->timeout = 0;
+
+	if (mad_send_wr->refcount != 1)
+		return;
+
+	list_move_tail(&mad_send_wr->agent_list,
+		       &mad_send_wr->mad_agent_priv->done_list);
+}
+
+/*
+ * Deliver a received MAD to its agent.
+ *
+ * For RMPP-enabled agents the MAD is first passed through
+ * ib_process_rmpp_recv_wc(); when that returns NULL nothing is delivered.
+ * A response MAD is matched against the agent's outstanding sends: with no
+ * match it is freed, otherwise the agent's recv_handler runs and the
+ * matching send is then completed with IB_WC_SUCCESS.  Any other MAD goes
+ * straight to recv_handler.
+ *
+ * Every path releases one reference on mad_agent_priv, either through
+ * deref_mad_agent() or, on the matched-response path, with atomic_dec().
+ */
+static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
+				 struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_mad_send_wr_private *mad_send_wr;
+	struct ib_mad_send_wc mad_send_wc;
+	unsigned long flags;
+
+	/* Start an rmpp_list holding just this receive buffer */
+	INIT_LIST_HEAD(&mad_recv_wc->rmpp_list);
+	list_add(&mad_recv_wc->recv_buf.list, &mad_recv_wc->rmpp_list);
+	if (mad_agent_priv->agent.rmpp_version) {
+		mad_recv_wc = ib_process_rmpp_recv_wc(mad_agent_priv,
+						      mad_recv_wc);
+		if (!mad_recv_wc) {
+			deref_mad_agent(mad_agent_priv);
+			return;
+		}
+	}
+
+	/* Complete corresponding request */
+	if (ib_response_mad(mad_recv_wc->recv_buf.mad)) {
+		spin_lock_irqsave(&mad_agent_priv->lock, flags);
+		mad_send_wr = ib_find_send_mad(mad_agent_priv, mad_recv_wc);
+		if (!mad_send_wr) {
+			/* No matching request: discard the response */
+			spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+			ib_free_recv_mad(mad_recv_wc);
+			deref_mad_agent(mad_agent_priv);
+			return;
+		}
+		ib_mark_mad_done(mad_send_wr);
+		spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+		/* Defined behavior is to complete response before request */
+		mad_recv_wc->wc->wr_id = (unsigned long) &mad_send_wr->send_buf;
+		mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent,
+						   mad_recv_wc);
+		/* Plain decrement here; the other paths use deref_mad_agent() */
+		atomic_dec(&mad_agent_priv->refcount);
+
+		mad_send_wc.status = IB_WC_SUCCESS;
+		mad_send_wc.vendor_err = 0;
+		mad_send_wc.send_buf = &mad_send_wr->send_buf;
+		ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc);
+	} else {
+		mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent,
+						   mad_recv_wc);
+		deref_mad_agent(mad_agent_priv);
+	}
+}
+
+/*
+ * Handle a receive work completion for a MAD QP.
+ *
+ * The completed buffer is dequeued and DMA-unmapped, wrapped in a receive
+ * work completion, offered to snoopers and validated.  Directed-route SMPs
+ * go through the SMI checks and may be discarded or forwarded (switches).
+ * The device's process_mad hook, when present, may consume the MAD or
+ * produce a reply.  Otherwise the MAD is handed to the agent selected by
+ * find_mad_agent().
+ *
+ * Whatever the outcome, ib_mad_post_receive_mads() is called at the end
+ * with either the spare "response" buffer or, when none was allocated, the
+ * received buffer itself.
+ */
+static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv,
+				     struct ib_wc *wc)
+{
+	struct ib_mad_qp_info *qp_info;
+	struct ib_mad_private_header *mad_priv_hdr;
+	struct ib_mad_private *recv, *response = NULL;
+	struct ib_mad_list_head *mad_list;
+	struct ib_mad_agent_private *mad_agent;
+	int port_num;
+
+	/* wr_id carries the pointer to the list head of the posted buffer */
+	mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id;
+	qp_info = mad_list->mad_queue->qp_info;
+	dequeue_mad(mad_list);
+
+	mad_priv_hdr = container_of(mad_list, struct ib_mad_private_header,
+				    mad_list);
+	recv = container_of(mad_priv_hdr, struct ib_mad_private, header);
+	ib_dma_unmap_single(port_priv->device,
+			    recv->header.mapping,
+			    sizeof(struct ib_mad_private) -
+			      sizeof(struct ib_mad_private_header),
+			    DMA_FROM_DEVICE);
+
+	/* Setup MAD receive work completion from "normal" work completion */
+	recv->header.wc = *wc;
+	recv->header.recv_wc.wc = &recv->header.wc;
+	recv->header.recv_wc.mad_len = sizeof(struct ib_mad);
+	recv->header.recv_wc.recv_buf.mad = &recv->mad.mad;
+	recv->header.recv_wc.recv_buf.grh = &recv->grh;
+
+	if (atomic_read(&qp_info->snoop_count))
+		snoop_recv(qp_info, &recv->header.recv_wc, IB_MAD_SNOOP_RECVS);
+
+	/* Validate MAD */
+	if (!validate_mad(&recv->mad.mad, qp_info->qp->qp_num))
+		goto out;
+
+	/* Spare buffer: used for replies and as the next receive buffer */
+	response = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL);
+	if (!response) {
+		printk(KERN_ERR PFX "ib_mad_recv_done_handler no memory "
+		       "for response buffer\n");
+		goto out;
+	}
+
+	/* On a switch use the port from the completion, else our own port */
+	if (port_priv->device->node_type == RDMA_NODE_IB_SWITCH)
+		port_num = wc->port_num;
+	else
+		port_num = port_priv->port_num;
+
+	if (recv->mad.mad.mad_hdr.mgmt_class ==
+	    IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+		enum smi_forward_action retsmi;
+
+		if (smi_handle_dr_smp_recv(&recv->mad.smp,
+					   port_priv->device->node_type,
+					   port_num,
+					   port_priv->device->phys_port_cnt) ==
+					   IB_SMI_DISCARD)
+			goto out;
+
+		retsmi = smi_check_forward_dr_smp(&recv->mad.smp);
+		if (retsmi == IB_SMI_LOCAL)
+			goto local;
+
+		if (retsmi == IB_SMI_SEND) { /* don't forward */
+			if (smi_handle_dr_smp_send(&recv->mad.smp,
+						   port_priv->device->node_type,
+						   port_num) == IB_SMI_DISCARD)
+				goto out;
+
+			if (smi_check_local_smp(&recv->mad.smp, port_priv->device) == IB_SMI_DISCARD)
+				goto out;
+		} else if (port_priv->device->node_type == RDMA_NODE_IB_SWITCH) {
+			/* forward case for switches */
+			memcpy(response, recv, sizeof(*response));
+			/* Re-point the copied header at the copy's own storage */
+			response->header.recv_wc.wc = &response->header.wc;
+			response->header.recv_wc.recv_buf.mad = &response->mad.mad;
+			response->header.recv_wc.recv_buf.grh = &response->grh;
+
+			agent_send_response(&response->mad.mad,
+					    &response->grh, wc,
+					    port_priv->device,
+					    smi_get_fwd_port(&recv->mad.smp),
+					    qp_info->qp->qp_num);
+
+			goto out;
+		}
+	}
+
+local:
+	/* Give driver "right of first refusal" on incoming MAD */
+	if (port_priv->device->process_mad) {
+		int ret;
+
+		ret = port_priv->device->process_mad(port_priv->device, 0,
+						     port_priv->port_num,
+						     wc, &recv->grh,
+						     &recv->mad.mad,
+						     &response->mad.mad);
+		if (ret & IB_MAD_RESULT_SUCCESS) {
+			if (ret & IB_MAD_RESULT_CONSUMED)
+				goto out;
+			if (ret & IB_MAD_RESULT_REPLY) {
+				agent_send_response(&response->mad.mad,
+						    &recv->grh, wc,
+						    port_priv->device,
+						    port_num,
+						    qp_info->qp->qp_num);
+				goto out;
+			}
+		}
+	}
+
+	mad_agent = find_mad_agent(port_priv, &recv->mad.mad);
+	if (mad_agent) {
+		ib_mad_complete_recv(mad_agent, &recv->header.recv_wc);
+		/*
+		 * recv is freed up in error cases in ib_mad_complete_recv
+		 * or via recv_handler in ib_mad_complete_recv()
+		 */
+		recv = NULL;
+	}
+
+out:
+	/* Post another receive request for this QP */
+	if (response) {
+		ib_mad_post_receive_mads(qp_info, response);
+		/* recv is still ours unless it was handed to an agent above */
+		if (recv)
+			kmem_cache_free(ib_mad_cache, recv);
+	} else
+		ib_mad_post_receive_mads(qp_info, recv);
+}
+
+/*
+ * Re-arm or stop the agent's timeout timer after the wait_list changed.
+ * With an empty wait_list the timer is deleted.  Otherwise, if the first
+ * waiting request expires before the currently recorded agent timeout,
+ * the recorded timeout and the timer are moved to that earlier time.
+ */
+static void adjust_timeout(struct ib_mad_agent_private *mad_agent_priv)
+{
+	struct ib_mad_send_wr_private *first_wr;
+
+	if (list_empty(&mad_agent_priv->wait_list)) {
+		del_timer(&mad_agent_priv->timeout_timer);
+		return;
+	}
+
+	first_wr = list_entry(mad_agent_priv->wait_list.next,
+			      struct ib_mad_send_wr_private,
+			      agent_list);
+
+	if (!time_after(mad_agent_priv->timeout, first_wr->timeout))
+		return;
+
+	mad_agent_priv->timeout = first_wr->timeout;
+	mod_timer(&mad_agent_priv->timeout_timer, first_wr->timeout);
+}
+
+/*
+ * Move a send work request onto its agent's wait_list.
+ *
+ * On entry mad_send_wr->timeout holds a relative timeout in jiffies; it is
+ * converted to an absolute expiry by adding the current jiffies value.
+ * With a nonzero relative timeout the request is inserted after the last
+ * entry that does not expire later than it (searching from the tail);
+ * with a zero timeout it is inserted at the head.  If the request ends up
+ * first in the list, the agent's timer is reprogrammed to its expiry.
+ */
+static void wait_for_response(struct ib_mad_send_wr_private *mad_send_wr)
+{
+	struct ib_mad_agent_private *mad_agent_priv;
+	struct ib_mad_send_wr_private *temp_mad_send_wr;
+	struct list_head *list_item;
+	unsigned long delay;
+
+	mad_agent_priv = mad_send_wr->mad_agent_priv;
+	/* Unlink from whichever agent list currently holds the request */
+	list_del(&mad_send_wr->agent_list);
+
+	/* Remember the relative value before making the timeout absolute */
+	delay = mad_send_wr->timeout;
+	mad_send_wr->timeout += jiffies;
+
+	if (delay) {
+		/*
+		 * Walk backwards and stop at the first entry we expire
+		 * after.  NOTE(review): if no entry qualifies, list_item is
+		 * presumably left at the list head by list_for_each_prev(),
+		 * making the insert below a head insert - confirm against
+		 * the macro definition.
+		 */
+		list_for_each_prev(list_item, &mad_agent_priv->wait_list) {
+			temp_mad_send_wr = list_entry(list_item,
+						struct ib_mad_send_wr_private,
+						agent_list);
+			if (time_after(mad_send_wr->timeout,
+				       temp_mad_send_wr->timeout))
+				break;
+		}
+	} else
+		list_item = &mad_agent_priv->wait_list;
+	list_add(&mad_send_wr->agent_list, list_item);
+
+	/* Reschedule a work item if we have a shorter timeout */
+	if (mad_agent_priv->wait_list.next == &mad_send_wr->agent_list)
+		mod_timer(&mad_agent_priv->timeout_timer,
+			  mad_send_wr->timeout);
+}
+
+/*
+ * Give a send work request a new relative timeout (in milliseconds) and
+ * re-queue it through wait_for_response().
+ */
+void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr,
+			  int timeout_ms)
+{
+	unsigned long ticks = msecs_to_jiffies(timeout_ms);
+
+	mad_send_wr->timeout = ticks;
+	wait_for_response(mad_send_wr);
+}
+
+/*
+ * Process a send work completion
+ *
+ * Runs under mad_agent_priv->lock until the request is either left pending
+ * or removed from the agent.  RMPP-enabled agents get first look through
+ * ib_process_rmpp_send_wc(), which may consume the completion entirely.
+ * The first failing status is recorded in the work request.  When the last
+ * reference is dropped the request is unlinked, the agent timer is
+ * adjusted, and the completion is reported through ib_rmpp_send_handler()
+ * or the agent's send_handler; the agent reference taken at send time is
+ * then released.
+ */
+void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr,
+			     struct ib_mad_send_wc *mad_send_wc)
+{
+	struct ib_mad_agent_private	*mad_agent_priv;
+	unsigned long			flags;
+	int				ret;
+
+	mad_agent_priv = mad_send_wr->mad_agent_priv;
+	spin_lock_irqsave(&mad_agent_priv->lock, flags);
+	if (mad_agent_priv->agent.rmpp_version) {
+		ret = ib_process_rmpp_send_wc(mad_send_wr, mad_send_wc);
+		if (ret == IB_RMPP_RESULT_CONSUMED)
+			goto done;
+	} else
+		ret = IB_RMPP_RESULT_UNHANDLED;
+
+	/*
+	 * Record the first error only.  The comparison yields 0 or 1, so one
+	 * extra reference is dropped exactly when a timeout was pending.
+	 */
+	if (mad_send_wc->status != IB_WC_SUCCESS &&
+	    mad_send_wr->status == IB_WC_SUCCESS) {
+		mad_send_wr->status = mad_send_wc->status;
+		mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
+	}
+
+	/* Other references remain: the request is not finished yet */
+	if (--mad_send_wr->refcount > 0) {
+		if (mad_send_wr->refcount == 1 && mad_send_wr->timeout &&
+		    mad_send_wr->status == IB_WC_SUCCESS) {
+			wait_for_response(mad_send_wr);
+		}
+		goto done;
+	}
+
+	/* Remove send from MAD agent and notify client of completion */
+	list_del(&mad_send_wr->agent_list);
+	adjust_timeout(mad_agent_priv);
+	spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+	/* Report the recorded failure rather than this completion's status */
+	if (mad_send_wr->status != IB_WC_SUCCESS )
+		mad_send_wc->status = mad_send_wr->status;
+	if (ret == IB_RMPP_RESULT_INTERNAL)
+		ib_rmpp_send_handler(mad_send_wc);
+	else
+		mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+						   mad_send_wc);
+
+	/* Release reference on agent taken when sending */
+	deref_mad_agent(mad_agent_priv);
+	return;
+done:
+	spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+}
+
+/*
+ * Handle a send work completion for a MAD QP.
+ *
+ * The completed request's buffers are DMA-unmapped and it is removed from
+ * the send queue.  If the queue was over its max_active limit, the first
+ * request on the QP's overflow list is moved to the send queue and posted
+ * after the completed one has been reported via
+ * ib_mad_complete_send_wr().  If that post fails, the moved request is
+ * itself completed with IB_WC_LOC_QP_OP_ERR by looping back to "retry".
+ */
+static void ib_mad_send_done_handler(struct ib_mad_port_private *port_priv,
+				     struct ib_wc *wc)
+{
+	struct ib_mad_send_wr_private	*mad_send_wr, *queued_send_wr;
+	struct ib_mad_list_head		*mad_list;
+	struct ib_mad_qp_info		*qp_info;
+	struct ib_mad_queue		*send_queue;
+	struct ib_send_wr		*bad_send_wr;
+	struct ib_mad_send_wc		mad_send_wc;
+	unsigned long flags;
+	int ret;
+
+	/* wr_id carries the pointer to the request's list head */
+	mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id;
+	mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private,
+				   mad_list);
+	send_queue = mad_list->mad_queue;
+	qp_info = send_queue->qp_info;
+
+retry:
+	ib_dma_unmap_single(mad_send_wr->send_buf.mad_agent->device,
+			    mad_send_wr->header_mapping,
+			    mad_send_wr->sg_list[0].length, DMA_TO_DEVICE);
+	ib_dma_unmap_single(mad_send_wr->send_buf.mad_agent->device,
+			    mad_send_wr->payload_mapping,
+			    mad_send_wr->sg_list[1].length, DMA_TO_DEVICE);
+	queued_send_wr = NULL;
+	spin_lock_irqsave(&send_queue->lock, flags);
+	list_del(&mad_list->list);
+
+	/* Move queued send to the send queue */
+	if (send_queue->count-- > send_queue->max_active) {
+		/*
+		 * mad_list is reused for the moved request, so on a later
+		 * "retry" pass it refers to that request's list head.
+		 */
+		mad_list = container_of(qp_info->overflow_list.next,
+					struct ib_mad_list_head, list);
+		queued_send_wr = container_of(mad_list,
+					struct ib_mad_send_wr_private,
+					mad_list);
+		list_move_tail(&mad_list->list, &send_queue->list);
+	}
+	spin_unlock_irqrestore(&send_queue->lock, flags);
+
+	mad_send_wc.send_buf = &mad_send_wr->send_buf;
+	mad_send_wc.status = wc->status;
+	mad_send_wc.vendor_err = wc->vendor_err;
+	if (atomic_read(&qp_info->snoop_count))
+		snoop_send(qp_info, &mad_send_wr->send_buf, &mad_send_wc,
+			   IB_MAD_SNOOP_SEND_COMPLETIONS);
+	ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc);
+
+	if (queued_send_wr) {
+		ret = ib_post_send(qp_info->qp, &queued_send_wr->send_wr,
+				   &bad_send_wr);
+		if (ret) {
+			printk(KERN_ERR PFX "ib_post_send failed: %d\n", ret);
+			/* Fail the request that could not be posted */
+			mad_send_wr = queued_send_wr;
+			wc->status = IB_WC_LOC_QP_OP_ERR;
+			goto retry;
+		}
+	}
+}
+
+/*
+ * Set the retry flag on every send work request currently on the QP's
+ * send queue, holding the send queue lock while walking the list.
+ */
+static void mark_sends_for_retry(struct ib_mad_qp_info *qp_info)
+{
+	struct ib_mad_list_head *entry;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&qp_info->send_queue.lock, irq_flags);
+	list_for_each_entry(entry, &qp_info->send_queue.list, list)
+		container_of(entry, struct ib_mad_send_wr_private,
+			     mad_list)->retry = 1;
+	spin_unlock_irqrestore(&qp_info->send_queue.lock, irq_flags);
+}
+
+/*
+ * Handle a work completion that finished with an error status.
+ *
+ * Receive errors are ignored here.  A flushed send that is marked for
+ * retry is reposted, and is completed through ib_mad_send_done_handler()
+ * only if the repost fails; a flushed send not marked for retry is
+ * completed right away.  For any other send error an attempt is made to
+ * move the QP from SQE back to RTS (marking the queued sends for retry on
+ * success) and the offending send is then completed.
+ */
+static void mad_error_handler(struct ib_mad_port_private *port_priv,
+			      struct ib_wc *wc)
+{
+	struct ib_mad_list_head *mad_list;
+	struct ib_mad_qp_info *qp_info;
+	struct ib_mad_send_wr_private *mad_send_wr;
+	struct ib_send_wr *bad_send_wr;
+	struct ib_qp_attr *attr;
+	int ret;
+
+	/* Determine if failure was a send or receive */
+	mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id;
+	qp_info = mad_list->mad_queue->qp_info;
+
+	/*
+	 * Receive errors indicate that the QP has entered the error
+	 * state - error handling/shutdown code will cleanup
+	 */
+	if (mad_list->mad_queue == &qp_info->recv_queue)
+		return;
+
+	/*
+	 * Send errors will transition the QP to SQE - move
+	 * QP to RTS and repost flushed work requests
+	 */
+	mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private,
+				   mad_list);
+
+	if (wc->status == IB_WC_WR_FLUSH_ERR) {
+		if (mad_send_wr->retry) {
+			/* Repost send */
+			mad_send_wr->retry = 0;
+			ret = ib_post_send(qp_info->qp, &mad_send_wr->send_wr,
+					   &bad_send_wr);
+			if (!ret)
+				return;
+		}
+		ib_mad_send_done_handler(port_priv, wc);
+		return;
+	}
+
+	/* Transition QP to RTS and fail offending send */
+	attr = kmalloc(sizeof *attr, GFP_KERNEL);
+	if (attr) {
+		attr->qp_state = IB_QPS_RTS;
+		attr->cur_qp_state = IB_QPS_SQE;
+		ret = ib_modify_qp(qp_info->qp, attr,
+				   IB_QP_STATE | IB_QP_CUR_STATE);
+		kfree(attr);
+		if (ret)
+			printk(KERN_ERR PFX "mad_error_handler - "
+			       "ib_modify_qp to RTS : %d\n", ret);
+		else
+			mark_sends_for_retry(qp_info);
+	}
+	ib_mad_send_done_handler(port_priv, wc);
+}
+
+/*
+ * IB MAD completion callback
+ *
+ * Work item that re-arms CQ notification and then drains the port's CQ one
+ * completion at a time, dispatching successful sends and receives to their
+ * handlers and everything else to mad_error_handler().
+ */
+static void ib_mad_completion_handler(struct work_struct *work)
+{
+	struct ib_mad_port_private *port_priv =
+		container_of(work, struct ib_mad_port_private, work);
+	struct ib_wc wc;
+
+	ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP);
+
+	while (ib_poll_cq(port_priv->cq, 1, &wc) == 1) {
+		if (wc.status != IB_WC_SUCCESS) {
+			mad_error_handler(port_priv, &wc);
+			continue;
+		}
+
+		if (wc.opcode == IB_WC_SEND)
+			ib_mad_send_done_handler(port_priv, &wc);
+		else if (wc.opcode == IB_WC_RECV)
+			ib_mad_recv_done_handler(port_priv, &wc);
+		else
+			BUG_ON(1);
+	}
+}
+
+/*
+ * Cancel every outstanding send of an agent.
+ *
+ * Requests on the send_list that are still in IB_WC_SUCCESS state are
+ * marked IB_WC_WR_FLUSH_ERR (dropping one reference if a timeout was
+ * pending).  The whole wait_list is detached under the agent lock and each
+ * detached request is then reported to the agent's send_handler with
+ * IB_WC_WR_FLUSH_ERR, releasing one agent reference per request.
+ */
+static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv)
+{
+	struct ib_mad_send_wr_private *wr, *next_wr;
+	struct ib_mad_send_wc flush_wc;
+	struct list_head cancel_list;
+	unsigned long flags;
+
+	INIT_LIST_HEAD(&cancel_list);
+
+	spin_lock_irqsave(&mad_agent_priv->lock, flags);
+	list_for_each_entry_safe(wr, next_wr,
+				 &mad_agent_priv->send_list, agent_list) {
+		if (wr->status != IB_WC_SUCCESS)
+			continue;
+		wr->status = IB_WC_WR_FLUSH_ERR;
+		if (wr->timeout > 0)
+			wr->refcount--;
+	}
+
+	/* Empty wait list to prevent receives from finding a request */
+	list_splice_init(&mad_agent_priv->wait_list, &cancel_list);
+	spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+	/* Report all cancelled requests */
+	flush_wc.status = IB_WC_WR_FLUSH_ERR;
+	flush_wc.vendor_err = 0;
+
+	list_for_each_entry_safe(wr, next_wr, &cancel_list, agent_list) {
+		flush_wc.send_buf = &wr->send_buf;
+		list_del(&wr->agent_list);
+		mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+						   &flush_wc);
+		atomic_dec(&mad_agent_priv->refcount);
+	}
+}
+
+/*
+ * Locate the send work request that owns send_buf.
+ * The agent's wait_list is searched first; on the send_list only requests
+ * that is_data_mad() accepts are considered.  Returns NULL if not found.
+ */
+static struct ib_mad_send_wr_private*
+find_send_wr(struct ib_mad_agent_private *mad_agent_priv,
+	     struct ib_mad_send_buf *send_buf)
+{
+	struct ib_mad_send_wr_private *wr;
+
+	list_for_each_entry(wr, &mad_agent_priv->wait_list, agent_list) {
+		if (&wr->send_buf != send_buf)
+			continue;
+		return wr;
+	}
+
+	list_for_each_entry(wr, &mad_agent_priv->send_list, agent_list) {
+		if (!is_data_mad(mad_agent_priv, wr->send_buf.mad))
+			continue;
+		if (&wr->send_buf != send_buf)
+			continue;
+		return wr;
+	}
+
+	return NULL;
+}
+
+/*
+ * Change the timeout of an outstanding send, or cancel it when timeout_ms
+ * is 0.
+ *
+ * Returns 0 on success, or -EINVAL when find_send_wr() cannot locate
+ * send_buf or its work request is no longer in IB_WC_SUCCESS state.
+ */
+int ib_modify_mad(struct ib_mad_agent *mad_agent,
+		  struct ib_mad_send_buf *send_buf, u32 timeout_ms)
+{
+	struct ib_mad_agent_private *mad_agent_priv;
+	struct ib_mad_send_wr_private *mad_send_wr;
+	unsigned long flags;
+	int active;
+
+	mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private,
+				      agent);
+	spin_lock_irqsave(&mad_agent_priv->lock, flags);
+	mad_send_wr = find_send_wr(mad_agent_priv, send_buf);
+	if (!mad_send_wr || mad_send_wr->status != IB_WC_SUCCESS) {
+		spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+		return -EINVAL;
+	}
+
+	/*
+	 * Sampled before the cancel branch below can change refcount:
+	 * "active" means no timeout is set or more than one reference is held.
+	 */
+	active = (!mad_send_wr->timeout || mad_send_wr->refcount > 1);
+	if (!timeout_ms) {
+		/* Cancel: mark flushed; drop a reference if a timeout was set */
+		mad_send_wr->status = IB_WC_WR_FLUSH_ERR;
+		mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
+	}
+
+	mad_send_wr->send_buf.timeout_ms = timeout_ms;
+	/* Active: just store the new timeout; otherwise re-queue the request */
+	if (active)
+		mad_send_wr->timeout = msecs_to_jiffies(timeout_ms);
+	else
+		ib_reset_mad_timeout(mad_send_wr, timeout_ms);
+
+	spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+	return 0;
+}
+EXPORT_SYMBOL(ib_modify_mad);
+
+/*
+ * Cancel an outstanding send by asking ib_modify_mad() for a timeout of
+ * zero.  The result of ib_modify_mad() is not reported to the caller.
+ */
+void ib_cancel_mad(struct ib_mad_agent *mad_agent,
+		   struct ib_mad_send_buf *send_buf)
+{
+	u32 cancel_timeout_ms = 0;
+
+	ib_modify_mad(mad_agent, send_buf, cancel_timeout_ms);
+}
+EXPORT_SYMBOL(ib_cancel_mad);
+
+/*
+ * Work item that completes locally handled MADs queued on an agent's
+ * local_list.
+ *
+ * Each entry is removed under the agent lock, which is dropped while the
+ * entry is processed and re-taken before the next list check.  If the
+ * entry carries a received MAD (mad_priv) and a receiving agent, a work
+ * completion is built and the receiving agent's recv_handler is called
+ * before the send is completed.  The sending agent's send_handler is then
+ * called with IB_WC_SUCCESS and one reference on the sending agent is
+ * dropped.
+ */
+static void local_completions(struct work_struct *work)
+{
+	struct ib_mad_agent_private *mad_agent_priv;
+	struct ib_mad_local_private *local;
+	struct ib_mad_agent_private *recv_mad_agent;
+	unsigned long flags;
+	int free_mad;
+	struct ib_wc wc;
+	struct ib_mad_send_wc mad_send_wc;
+
+	mad_agent_priv =
+		container_of(work, struct ib_mad_agent_private, local_work);
+
+	spin_lock_irqsave(&mad_agent_priv->lock, flags);
+	while (!list_empty(&mad_agent_priv->local_list)) {
+		local = list_entry(mad_agent_priv->local_list.next,
+				   struct ib_mad_local_private,
+				   completion_list);
+		list_del(&local->completion_list);
+		/* Handlers below run without the agent lock held */
+		spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+		free_mad = 0;
+		if (local->mad_priv) {
+			recv_mad_agent = local->recv_mad_agent;
+			if (!recv_mad_agent) {
+				printk(KERN_ERR PFX "No receive MAD agent for local completion\n");
+				/* Nobody will take mad_priv: free it below */
+				free_mad = 1;
+				goto local_send_completion;
+			}
+
+			/*
+			 * Defined behavior is to complete response
+			 * before request
+			 */
+			build_smp_wc(recv_mad_agent->agent.qp,
+				     (unsigned long) local->mad_send_wr,
+				     be16_to_cpu(IB_LID_PERMISSIVE),
+				     0, recv_mad_agent->agent.port_num, &wc);
+
+			local->mad_priv->header.recv_wc.wc = &wc;
+			local->mad_priv->header.recv_wc.mad_len =
+						sizeof(struct ib_mad);
+			INIT_LIST_HEAD(&local->mad_priv->header.recv_wc.rmpp_list);
+			list_add(&local->mad_priv->header.recv_wc.recv_buf.list,
+				 &local->mad_priv->header.recv_wc.rmpp_list);
+			/* No GRH is supplied for a locally completed MAD */
+			local->mad_priv->header.recv_wc.recv_buf.grh = NULL;
+			local->mad_priv->header.recv_wc.recv_buf.mad =
+						&local->mad_priv->mad.mad;
+			if (atomic_read(&recv_mad_agent->qp_info->snoop_count))
+				snoop_recv(recv_mad_agent->qp_info,
+					  &local->mad_priv->header.recv_wc,
+					   IB_MAD_SNOOP_RECVS);
+			recv_mad_agent->agent.recv_handler(
+						&recv_mad_agent->agent,
+						&local->mad_priv->header.recv_wc);
+			/* Drop the receiving agent's reference under its lock */
+			spin_lock_irqsave(&recv_mad_agent->lock, flags);
+			atomic_dec(&recv_mad_agent->refcount);
+			spin_unlock_irqrestore(&recv_mad_agent->lock, flags);
+		}
+
+local_send_completion:
+		/* Complete send */
+		mad_send_wc.status = IB_WC_SUCCESS;
+		mad_send_wc.vendor_err = 0;
+		mad_send_wc.send_buf = &local->mad_send_wr->send_buf;
+		if (atomic_read(&mad_agent_priv->qp_info->snoop_count))
+			snoop_send(mad_agent_priv->qp_info,
+				   &local->mad_send_wr->send_buf,
+				   &mad_send_wc, IB_MAD_SNOOP_SEND_COMPLETIONS);
+		mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+						   &mad_send_wc);
+
+		/* Re-take the lock; it stays held for the next loop test */
+		spin_lock_irqsave(&mad_agent_priv->lock, flags);
+		atomic_dec(&mad_agent_priv->refcount);
+		if (free_mad)
+			kmem_cache_free(ib_mad_cache, local->mad_priv);
+		kfree(local);
+	}
+	spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+}
+
+/*
+ * Resend a MAD whose response timed out.
+ *
+ * Consumes one retry, re-arms the timeout from send_buf.timeout_ms and
+ * sends the MAD again (through ib_retry_rmpp() first when the agent uses
+ * RMPP).  On success the work request is put back on the agent's
+ * send_list with one more reference.
+ *
+ * Returns 0 on success, -ETIMEDOUT when no retries are left, -ECOMM when
+ * ib_retry_rmpp() reports anything other than UNHANDLED or CONSUMED, or
+ * the error from ib_send_mad().
+ */
+static int retry_send(struct ib_mad_send_wr_private *mad_send_wr)
+{
+	int status;
+
+	if (mad_send_wr->retries_left == 0)
+		return -ETIMEDOUT;
+
+	--mad_send_wr->retries_left;
+	++mad_send_wr->send_buf.retries;
+	mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms);
+
+	if (!mad_send_wr->mad_agent_priv->agent.rmpp_version) {
+		status = ib_send_mad(mad_send_wr);
+	} else {
+		int rmpp_result = ib_retry_rmpp(mad_send_wr);
+
+		if (rmpp_result == IB_RMPP_RESULT_UNHANDLED)
+			status = ib_send_mad(mad_send_wr);
+		else if (rmpp_result == IB_RMPP_RESULT_CONSUMED)
+			status = 0;
+		else
+			status = -ECOMM;
+	}
+
+	if (status)
+		return status;
+
+	/* Sent again: account for it and track it as an active send. */
+	mad_send_wr->refcount++;
+	list_add_tail(&mad_send_wr->agent_list,
+		      &mad_send_wr->mad_agent_priv->send_list);
+	return 0;
+}
+
+/*
+ * Work handler for an agent's timeout_work: processes the agent's
+ * wait_list.
+ *
+ * The walk stops at the first entry whose timeout is still in the future,
+ * after re-arming timeout_timer for it.  Expired entries are removed; if
+ * their status is IB_WC_SUCCESS and retry_send() succeeds they are resent,
+ * otherwise the send is completed to the agent's send_handler with
+ * IB_WC_RESP_TIMEOUT_ERR (or the stored non-success status).
+ */
+static void timeout_sends(struct work_struct *work)
+{
+	struct ib_mad_agent_private *mad_agent_priv;
+	struct ib_mad_send_wr_private *mad_send_wr;
+	struct ib_mad_send_wc mad_send_wc;
+	unsigned long flags;
+
+	mad_agent_priv = container_of(work, struct ib_mad_agent_private,
+				      timeout_work);
+	mad_send_wc.vendor_err = 0;
+
+	spin_lock_irqsave(&mad_agent_priv->lock, flags);
+	while (!list_empty(&mad_agent_priv->wait_list)) {
+		mad_send_wr = list_entry(mad_agent_priv->wait_list.next,
+					 struct ib_mad_send_wr_private,
+					 agent_list);
+
+		/*
+		 * Head entry not yet due: re-arm the timer and stop.
+		 * NOTE(review): this assumes wait_list is ordered by
+		 * timeout - confirm against the code that inserts into it.
+		 */
+		if (time_after(mad_send_wr->timeout, jiffies)) {
+			mod_timer(&mad_agent_priv->timeout_timer,
+				  mad_send_wr->timeout);
+			break;
+		}
+
+		list_del(&mad_send_wr->agent_list);
+		/* A successful retry puts the request back on send_list. */
+		if (mad_send_wr->status == IB_WC_SUCCESS &&
+		    !retry_send(mad_send_wr))
+			continue;
+
+		/* The send handler is called without the agent lock held. */
+		spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+		if (mad_send_wr->status == IB_WC_SUCCESS)
+			mad_send_wc.status = IB_WC_RESP_TIMEOUT_ERR;
+		else
+			mad_send_wc.status = mad_send_wr->status;
+		mad_send_wc.send_buf = &mad_send_wr->send_buf;
+		mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+						   &mad_send_wc);
+
+		atomic_dec(&mad_agent_priv->refcount);
+		spin_lock_irqsave(&mad_agent_priv->lock, flags);
+	}
+	spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+}
+
+/*
+ * CQ completion callback (installed by ib_mad_port_open() through
+ * ib_create_cq()); cq->cq_context is the owning ib_mad_port_private.
+ *
+ * Defers processing to the port workqueue.  The work is queued only while
+ * port_priv->port_list is non-empty, i.e. before ib_mad_port_close() or
+ * the ib_mad_port_open() error path has done list_del_init() on it; the
+ * check is made under ib_mad_port_list_lock.
+ */
+static void ib_mad_thread_completion_handler(struct ib_cq *cq, void *arg)
+{
+	struct ib_mad_port_private *port_priv = cq->cq_context;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ib_mad_port_list_lock, flags);
+	if (!list_empty(&port_priv->port_list))
+		queue_work(port_priv->wq, &port_priv->work);
+	spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+}
+
+/*
+ * Allocate receive MADs and post receive WRs for them
+ *
+ * @qp_info: QP whose receive queue is to be filled.
+ * @mad:     optional buffer to use for the first post; when NULL a buffer
+ *           is allocated from ib_mad_cache.  The buffer is consumed: on
+ *           any failure after it has been picked up it is freed here.
+ *
+ * Buffers are posted until recv_queue->count reaches
+ * recv_queue->max_active or an error occurs.
+ *
+ * Returns 0 on success, -ENOMEM when a buffer cannot be allocated or DMA
+ * mapped, or the error returned by ib_post_recv().
+ */
+static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
+				    struct ib_mad_private *mad)
+{
+	unsigned long flags;
+	int post, ret;
+	struct ib_mad_private *mad_priv;
+	struct ib_sge sg_list;
+	struct ib_recv_wr recv_wr, *bad_recv_wr;
+	struct ib_mad_queue *recv_queue = &qp_info->recv_queue;
+
+	/* Initialize common scatter list fields */
+	sg_list.length = sizeof *mad_priv - sizeof mad_priv->header;
+	sg_list.lkey = (*qp_info->port_priv->mr).lkey;
+
+	/* Initialize common receive WR fields */
+	recv_wr.next = NULL;
+	recv_wr.sg_list = &sg_list;
+	recv_wr.num_sge = 1;
+
+	do {
+		/* Allocate and map receive buffer */
+		if (mad) {
+			mad_priv = mad;
+			mad = NULL;
+		} else {
+			mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL);
+			if (!mad_priv) {
+				printk(KERN_ERR PFX "No memory for receive buffer\n");
+				ret = -ENOMEM;
+				break;
+			}
+		}
+		/* Only the part after the private header is visible to HW. */
+		sg_list.addr = ib_dma_map_single(qp_info->port_priv->device,
+						 &mad_priv->grh,
+						 sizeof *mad_priv -
+						   sizeof mad_priv->header,
+						 DMA_FROM_DEVICE);
+		/*
+		 * A failed mapping must never be handed to the QP as a
+		 * receive address; give the buffer back and stop posting.
+		 */
+		if (ib_dma_mapping_error(qp_info->port_priv->device,
+					 sg_list.addr)) {
+			printk(KERN_ERR PFX "Couldn't DMA map receive buffer\n");
+			kmem_cache_free(ib_mad_cache, mad_priv);
+			ret = -ENOMEM;
+			break;
+		}
+		mad_priv->header.mapping = sg_list.addr;
+		recv_wr.wr_id = (unsigned long)&mad_priv->header.mad_list;
+		mad_priv->header.mad_list.mad_queue = recv_queue;
+
+		/* Post receive WR */
+		spin_lock_irqsave(&recv_queue->lock, flags);
+		post = (++recv_queue->count < recv_queue->max_active);
+		list_add_tail(&mad_priv->header.mad_list.list, &recv_queue->list);
+		spin_unlock_irqrestore(&recv_queue->lock, flags);
+		ret = ib_post_recv(qp_info->qp, &recv_wr, &bad_recv_wr);
+		if (ret) {
+			/* Undo the queue accounting done just above. */
+			spin_lock_irqsave(&recv_queue->lock, flags);
+			list_del(&mad_priv->header.mad_list.list);
+			recv_queue->count--;
+			spin_unlock_irqrestore(&recv_queue->lock, flags);
+			ib_dma_unmap_single(qp_info->port_priv->device,
+					    mad_priv->header.mapping,
+					    sizeof *mad_priv -
+					      sizeof mad_priv->header,
+					    DMA_FROM_DEVICE);
+			kmem_cache_free(ib_mad_cache, mad_priv);
+			printk(KERN_ERR PFX "ib_post_recv failed: %d\n", ret);
+			break;
+		}
+	} while (post);
+
+	return ret;
+}
+
+/*
+ * Release every receive MAD still linked on a QP's receive queue.
+ *
+ * Does nothing when the QP was never created.  Each buffer is unlinked,
+ * DMA-unmapped and returned to ib_mad_cache; the queue count is then
+ * reset to zero.
+ */
+static void cleanup_recv_queue(struct ib_mad_qp_info *qp_info)
+{
+	struct ib_mad_list_head *entry;
+	struct ib_mad_private_header *hdr;
+	struct ib_mad_private *buf;
+
+	if (qp_info->qp == NULL)
+		return;
+
+	for (;;) {
+		if (list_empty(&qp_info->recv_queue.list))
+			break;
+
+		/* Walk back from the list node to the enclosing buffer. */
+		entry = list_entry(qp_info->recv_queue.list.next,
+				   struct ib_mad_list_head, list);
+		hdr = container_of(entry, struct ib_mad_private_header,
+				   mad_list);
+		buf = container_of(hdr, struct ib_mad_private, header);
+
+		list_del(&entry->list);
+
+		ib_dma_unmap_single(qp_info->port_priv->device,
+				    buf->header.mapping,
+				    sizeof(struct ib_mad_private) -
+				      sizeof(struct ib_mad_private_header),
+				    DMA_FROM_DEVICE);
+		kmem_cache_free(ib_mad_cache, buf);
+	}
+
+	qp_info->recv_queue.count = 0;
+}
+
+/*
+ * Start the port
+ *
+ * Moves every created MAD QP of the port through INIT, RTR and RTS, arms
+ * the port CQ for the next completion and fills each QP's receive queue.
+ *
+ * Returns 0 on success, -ENOMEM if the temporary ib_qp_attr cannot be
+ * allocated, or the error of the first step that failed.  Nothing done
+ * before the failure is undone here.
+ */
+static int ib_mad_port_start(struct ib_mad_port_private *port_priv)
+{
+	int ret, i;
+	struct ib_qp_attr *attr;
+	struct ib_qp *qp;
+
+	attr = kmalloc(sizeof *attr, GFP_KERNEL);
+	if (!attr) {
+		printk(KERN_ERR PFX "Couldn't kmalloc ib_qp_attr\n");
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < IB_MAD_QPS_CORE; i++) {
+		qp = port_priv->qp_info[i].qp;
+		/* Slot without a QP (see has_smi in ib_mad_port_open()). */
+		if (!qp)
+			continue;
+
+		/*
+		 * PKey index for QP1 is irrelevant but
+		 * one is needed for the Reset to Init transition
+		 */
+		attr->qp_state = IB_QPS_INIT;
+		attr->pkey_index = 0;
+		/* QP number 0 gets a zero qkey, any other QP IB_QP1_QKEY. */
+		attr->qkey = (qp->qp_num == 0) ? 0 : IB_QP1_QKEY;
+		ret = ib_modify_qp(qp, attr, IB_QP_STATE |
+					     IB_QP_PKEY_INDEX | IB_QP_QKEY);
+		if (ret) {
+			printk(KERN_ERR PFX "Couldn't change QP%d state to "
+			       "INIT: %d\n", i, ret);
+			goto out;
+		}
+
+		attr->qp_state = IB_QPS_RTR;
+		ret = ib_modify_qp(qp, attr, IB_QP_STATE);
+		if (ret) {
+			printk(KERN_ERR PFX "Couldn't change QP%d state to "
+			       "RTR: %d\n", i, ret);
+			goto out;
+		}
+
+		attr->qp_state = IB_QPS_RTS;
+		attr->sq_psn = IB_MAD_SEND_Q_PSN;
+		ret = ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_SQ_PSN);
+		if (ret) {
+			printk(KERN_ERR PFX "Couldn't change QP%d state to "
+			       "RTS: %d\n", i, ret);
+			goto out;
+		}
+	}
+
+	/* Ask for a callback on the next completion of the port CQ. */
+	ret = ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP);
+	if (ret) {
+		printk(KERN_ERR PFX "Failed to request completion "
+		       "notification: %d\n", ret);
+		goto out;
+	}
+
+	/* Post the initial set of receive buffers on every created QP. */
+	for (i = 0; i < IB_MAD_QPS_CORE; i++) {
+		if (!port_priv->qp_info[i].qp)
+			continue;
+
+		ret = ib_mad_post_receive_mads(&port_priv->qp_info[i], NULL);
+		if (ret) {
+			printk(KERN_ERR PFX "Couldn't post receive WRs\n");
+			goto out;
+		}
+	}
+out:
+	/* attr is scratch space only; released on success and failure. */
+	kfree(attr);
+	return ret;
+}
+
+/*
+ * Asynchronous event callback installed on every MAD QP by
+ * create_mad_qp().  The event is only logged; no recovery is attempted.
+ */
+static void qp_event_handler(struct ib_event *event, void *qp_context)
+{
+	struct ib_mad_qp_info *info = qp_context;
+
+	printk(KERN_ERR PFX "Fatal error (%d) on MAD QP (%d)\n",
+	       event->event, info->qp->qp_num);
+}
+
+/*
+ * Put a send or receive queue into its empty state and bind it to the
+ * QP info structure that owns it.  max_active is left untouched.
+ */
+static void init_mad_queue(struct ib_mad_qp_info *qp_info,
+			   struct ib_mad_queue *mad_queue)
+{
+	INIT_LIST_HEAD(&mad_queue->list);
+	spin_lock_init(&mad_queue->lock);
+	mad_queue->count = 0;
+	mad_queue->qp_info = qp_info;
+}
+
+/*
+ * Initialise the software state of one MAD QP slot of a port: the back
+ * pointer to the port, both work queues, the overflow list and the
+ * (initially empty) snoop table.  The QP itself is not created here.
+ */
+static void init_mad_qp(struct ib_mad_port_private *port_priv,
+			struct ib_mad_qp_info *qp_info)
+{
+	qp_info->port_priv = port_priv;
+
+	/* Work queues and overflow list start out empty. */
+	init_mad_queue(qp_info, &qp_info->send_queue);
+	init_mad_queue(qp_info, &qp_info->recv_queue);
+	INIT_LIST_HEAD(&qp_info->overflow_list);
+
+	/* No snoop agents are registered yet. */
+	qp_info->snoop_table_size = 0;
+	qp_info->snoop_table = NULL;
+	atomic_set(&qp_info->snoop_count, 0);
+	spin_lock_init(&qp_info->snoop_lock);
+}
+
+/*
+ * Create the special QP of the given type for a port.
+ *
+ * The QP sends and receives on the port's single CQ, signals every send
+ * WR and is sized from mad_sendq_size / mad_recvq_size.
+ *
+ * Returns 0 on success or the error encoded in the pointer returned by
+ * ib_create_qp(); in the failure case qp_info->qp is left holding that
+ * error pointer.
+ */
+static int create_mad_qp(struct ib_mad_qp_info *qp_info,
+			 enum ib_qp_type qp_type)
+{
+	struct ib_qp_init_attr	init_attr;
+
+	memset(&init_attr, 0, sizeof init_attr);
+	init_attr.qp_type = qp_type;
+	init_attr.port_num = qp_info->port_priv->port_num;
+	init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
+	init_attr.send_cq = qp_info->port_priv->cq;
+	init_attr.recv_cq = qp_info->port_priv->cq;
+	init_attr.cap.max_send_wr = mad_sendq_size;
+	init_attr.cap.max_send_sge = IB_MAD_SEND_REQ_MAX_SG;
+	init_attr.cap.max_recv_wr = mad_recvq_size;
+	init_attr.cap.max_recv_sge = IB_MAD_RECV_REQ_MAX_SG;
+	init_attr.event_handler = qp_event_handler;
+	init_attr.qp_context = qp_info;
+
+	qp_info->qp = ib_create_qp(qp_info->port_priv->pd, &init_attr);
+	if (IS_ERR(qp_info->qp)) {
+		printk(KERN_ERR PFX "Couldn't create ib_mad QP%d\n",
+		       get_spl_qp_index(qp_type));
+		return PTR_ERR(qp_info->qp);
+	}
+
+	/* Use minimum queue sizes unless the CQ is resized */
+	qp_info->send_queue.max_active = mad_sendq_size;
+	qp_info->recv_queue.max_active = mad_recvq_size;
+	return 0;
+}
+
+/*
+ * Destroy a MAD QP and release its snoop table.  A slot whose QP was
+ * never created is left alone.
+ */
+static void destroy_mad_qp(struct ib_mad_qp_info *qp_info)
+{
+	if (qp_info->qp) {
+		ib_destroy_qp(qp_info->qp);
+		kfree(qp_info->snoop_table);
+	}
+}
+
+/*
+ * Open the port
+ * Create the QP, PD, MR, and CQ if needed
+ *
+ * Allocates the per-port state, creates its CQ, PD, DMA MR, QP(s) and
+ * workqueue, adds the port to ib_mad_port_list and starts it with
+ * ib_mad_port_start().
+ *
+ * Returns 0 on success or a negative errno; on failure everything created
+ * so far is released in reverse order through the errorN labels.
+ */
+static int ib_mad_port_open(struct ib_device *device,
+			    int port_num)
+{
+	int ret, cq_size;
+	struct ib_mad_port_private *port_priv;
+	unsigned long flags;
+	/* Room for "ib_mad" plus three digits; snprintf() below truncates. */
+	char name[sizeof "ib_mad123"];
+	int has_smi;
+
+	/* Create new device info */
+	port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL);
+	if (!port_priv) {
+		printk(KERN_ERR PFX "No memory for ib_mad_port_private\n");
+		return -ENOMEM;
+	}
+
+	port_priv->device = device;
+	port_priv->port_num = port_num;
+	spin_lock_init(&port_priv->reg_lock);
+	INIT_LIST_HEAD(&port_priv->agent_list);
+	init_mad_qp(port_priv, &port_priv->qp_info[0]);
+	init_mad_qp(port_priv, &port_priv->qp_info[1]);
+
+	/*
+	 * One CQ serves the sends and receives of every QP of the port.  It
+	 * is sized for one QP and doubled when the SMI QP is created too,
+	 * which is only done on an InfiniBand link layer.
+	 */
+	cq_size = mad_sendq_size + mad_recvq_size;
+	has_smi = rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_INFINIBAND;
+	if (has_smi)
+		cq_size *= 2;
+
+	port_priv->cq = ib_create_cq(port_priv->device,
+				     ib_mad_thread_completion_handler,
+				     NULL, port_priv, cq_size, 0);
+	if (IS_ERR(port_priv->cq)) {
+		printk(KERN_ERR PFX "Couldn't create ib_mad CQ\n");
+		ret = PTR_ERR(port_priv->cq);
+		goto error3;
+	}
+
+	port_priv->pd = ib_alloc_pd(device);
+	if (IS_ERR(port_priv->pd)) {
+		printk(KERN_ERR PFX "Couldn't create ib_mad PD\n");
+		ret = PTR_ERR(port_priv->pd);
+		goto error4;
+	}
+
+	port_priv->mr = ib_get_dma_mr(port_priv->pd, IB_ACCESS_LOCAL_WRITE);
+	if (IS_ERR(port_priv->mr)) {
+		printk(KERN_ERR PFX "Couldn't get ib_mad DMA MR\n");
+		ret = PTR_ERR(port_priv->mr);
+		goto error5;
+	}
+
+	/* qp_info[0] is the SMI QP (optional), qp_info[1] the GSI QP. */
+	if (has_smi) {
+		ret = create_mad_qp(&port_priv->qp_info[0], IB_QPT_SMI);
+		if (ret)
+			goto error6;
+	}
+	ret = create_mad_qp(&port_priv->qp_info[1], IB_QPT_GSI);
+	if (ret)
+		goto error7;
+
+	snprintf(name, sizeof name, "ib_mad%d", port_num);
+	port_priv->wq = create_singlethread_workqueue(name);
+	if (!port_priv->wq) {
+		ret = -ENOMEM;
+		goto error8;
+	}
+	INIT_WORK(&port_priv->work, ib_mad_completion_handler);
+
+	/*
+	 * The port goes on the global list before it is started:
+	 * ib_mad_thread_completion_handler() only queues work for ports
+	 * that are on the list.
+	 */
+	spin_lock_irqsave(&ib_mad_port_list_lock, flags);
+	list_add_tail(&port_priv->port_list, &ib_mad_port_list);
+	spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+
+	ret = ib_mad_port_start(port_priv);
+	if (ret) {
+		printk(KERN_ERR PFX "Couldn't start port\n");
+		goto error9;
+	}
+
+	return 0;
+
+	/* Unwind in reverse order of construction. */
+error9:
+	spin_lock_irqsave(&ib_mad_port_list_lock, flags);
+	list_del_init(&port_priv->port_list);
+	spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+
+	destroy_workqueue(port_priv->wq);
+error8:
+	destroy_mad_qp(&port_priv->qp_info[1]);
+error7:
+	/* No-op when the SMI QP was not created (qp is NULL). */
+	destroy_mad_qp(&port_priv->qp_info[0]);
+error6:
+	ib_dereg_mr(port_priv->mr);
+error5:
+	ib_dealloc_pd(port_priv->pd);
+error4:
+	ib_destroy_cq(port_priv->cq);
+	/* Free receive buffers ib_mad_port_start() may have posted. */
+	cleanup_recv_queue(&port_priv->qp_info[1]);
+	cleanup_recv_queue(&port_priv->qp_info[0]);
+error3:
+	kfree(port_priv);
+
+	return ret;
+}
+
+/*
+ * Close the port
+ * If there are no classes using the port, free the port
+ * resources (CQ, MR, PD, QP) and remove the port's info structure
+ *
+ * Returns 0 on success or -ENODEV when no open port matches
+ * (device, port_num).
+ */
+static int ib_mad_port_close(struct ib_device *device, int port_num)
+{
+	struct ib_mad_port_private *port_priv;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ib_mad_port_list_lock, flags);
+	port_priv = __ib_get_mad_port(device, port_num);
+	if (port_priv == NULL) {
+		spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+		printk(KERN_ERR PFX "Port %d not found\n", port_num);
+		return -ENODEV;
+	}
+	/*
+	 * list_del_init() leaves port_list empty, which makes
+	 * ib_mad_thread_completion_handler() stop queueing work.
+	 */
+	list_del_init(&port_priv->port_list);
+	spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+
+	/* Same teardown order as the error path of ib_mad_port_open(). */
+	destroy_workqueue(port_priv->wq);
+	destroy_mad_qp(&port_priv->qp_info[1]);
+	destroy_mad_qp(&port_priv->qp_info[0]);
+	ib_dereg_mr(port_priv->mr);
+	ib_dealloc_pd(port_priv->pd);
+	ib_destroy_cq(port_priv->cq);
+	cleanup_recv_queue(&port_priv->qp_info[1]);
+	cleanup_recv_queue(&port_priv->qp_info[0]);
+	/* XXX: Handle deallocation of MAD registration tables */
+
+	kfree(port_priv);
+
+	return 0;
+}
+
+/*
+ * Client "add" callback: open the MAD port and the agent port for every
+ * port of an IB-transport device.
+ *
+ * A switch exposes only port 0; any other node type uses ports
+ * 1..phys_port_cnt.  If any open fails, every port opened so far is
+ * closed again and the function returns without reporting the error.
+ */
+static void ib_mad_init_device(struct ib_device *device)
+{
+	int first_port, last_port, port;
+
+	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+		return;
+
+	if (device->node_type != RDMA_NODE_IB_SWITCH) {
+		first_port = 1;
+		last_port = device->phys_port_cnt;
+	} else {
+		first_port = 0;
+		last_port = 0;
+	}
+
+	for (port = first_port; port <= last_port; port++) {
+		if (ib_mad_port_open(device, port)) {
+			printk(KERN_ERR PFX "Couldn't open %s port %d\n",
+			       device->name, port);
+			goto error;
+		}
+		if (ib_agent_port_open(device, port)) {
+			printk(KERN_ERR PFX "Couldn't open %s port %d "
+			       "for agents\n",
+			       device->name, port);
+			goto error_agent;
+		}
+	}
+	return;
+
+error_agent:
+	/* The MAD port of the failing index was opened; close it first. */
+	if (ib_mad_port_close(device, port))
+		printk(KERN_ERR PFX "Couldn't close %s port %d\n",
+		       device->name, port);
+
+error:
+	/* Close all fully opened ports, highest index first. */
+	for (port--; port >= first_port; port--) {
+		if (ib_agent_port_close(device, port))
+			printk(KERN_ERR PFX "Couldn't close %s port %d "
+			       "for agents\n",
+			       device->name, port);
+		if (ib_mad_port_close(device, port))
+			printk(KERN_ERR PFX "Couldn't close %s port %d\n",
+			       device->name, port);
+	}
+}
+
+/*
+ * Client "remove" callback: close the agent port and the MAD port of
+ * every port of the device.
+ *
+ * ib_mad_init_device() opens nothing for devices whose transport is not
+ * RDMA_TRANSPORT_IB, so such devices are skipped here as well; otherwise
+ * every close would fail and log a spurious error.
+ *
+ * A switch exposes only port 0; any other node type uses ports
+ * 1..phys_port_cnt.  Close failures are logged and otherwise ignored.
+ */
+static void ib_mad_remove_device(struct ib_device *device)
+{
+	int i, num_ports, cur_port;
+
+	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+		return;
+
+	if (device->node_type == RDMA_NODE_IB_SWITCH) {
+		num_ports = 1;
+		cur_port = 0;
+	} else {
+		num_ports = device->phys_port_cnt;
+		cur_port = 1;
+	}
+	for (i = 0; i < num_ports; i++, cur_port++) {
+		if (ib_agent_port_close(device, cur_port))
+			printk(KERN_ERR PFX "Couldn't close %s port %d "
+			       "for agents\n",
+			       device->name, cur_port);
+		if (ib_mad_port_close(device, cur_port))
+			printk(KERN_ERR PFX "Couldn't close %s port %d\n",
+			       device->name, cur_port);
+	}
+}
+
+/*
+ * IB client descriptor for the MAD layer; registered in
+ * ib_mad_init_module() and unregistered in ib_mad_cleanup_module().
+ */
+static struct ib_client mad_client = {
+	.name   = "mad",
+	.add = ib_mad_init_device,
+	.remove = ib_mad_remove_device
+};
+
+/*
+ * Module entry point.
+ *
+ * Clamps mad_sendq_size and mad_recvq_size to the range
+ * [IB_MAD_QP_MIN_SIZE, IB_MAD_QP_MAX_SIZE], creates the ib_mad_cache slab
+ * used for MAD buffers and registers the MAD layer as an IB client.
+ *
+ * Returns 0 on success, -ENOMEM if the cache cannot be created, or
+ * -EINVAL if client registration fails (the cache is destroyed again).
+ */
+static int __init ib_mad_init_module(void)
+{
+	mad_sendq_size = min(mad_sendq_size, IB_MAD_QP_MAX_SIZE);
+	mad_sendq_size = max(mad_sendq_size, IB_MAD_QP_MIN_SIZE);
+
+	mad_recvq_size = min(mad_recvq_size, IB_MAD_QP_MAX_SIZE);
+	mad_recvq_size = max(mad_recvq_size, IB_MAD_QP_MIN_SIZE);
+
+	spin_lock_init(&ib_mad_port_list_lock);
+
+	ib_mad_cache = kmem_cache_create("ib_mad",
+					 sizeof(struct ib_mad_private),
+					 0,
+					 SLAB_HWCACHE_ALIGN,
+					 NULL);
+	if (ib_mad_cache == NULL) {
+		printk(KERN_ERR PFX "Couldn't create ib_mad cache\n");
+		return -ENOMEM;
+	}
+
+	INIT_LIST_HEAD(&ib_mad_port_list);
+
+	if (ib_register_client(&mad_client) == 0)
+		return 0;
+
+	printk(KERN_ERR PFX "Couldn't register ib_mad client\n");
+	kmem_cache_destroy(ib_mad_cache);
+	return -EINVAL;
+}
+
+/*
+ * Module exit point: undoes ib_mad_init_module() in reverse order.  The
+ * client is unregistered before the buffer cache is destroyed.
+ */
+static void __exit ib_mad_cleanup_module(void)
+{
+	ib_unregister_client(&mad_client);
+	kmem_cache_destroy(ib_mad_cache);
+}
+
+/* Register the module's load and unload entry points. */
+module_init(ib_mad_init_module);
+module_exit(ib_mad_cleanup_module);
+
diff --git a/sys/ofed/drivers/infiniband/core/mad_priv.h b/sys/ofed/drivers/infiniband/core/mad_priv.h
new file mode 100644
index 0000000..8b4df0a
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/mad_priv.h
@@ -0,0 +1,231 @@
+/*
+ * Copyright (c) 2004, 2005, Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2009 HNR Consulting. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __IB_MAD_PRIV_H__
+#define __IB_MAD_PRIV_H__
+
+#include <linux/completion.h>
+#include <linux/err.h>
+#include <linux/workqueue.h>
+#include <rdma/ib_mad.h>
+#include <rdma/ib_smi.h>
+
+
+/* Prefix put in front of every printk() message of the MAD layer */
+#define PFX "ib_mad: "
+
+#define IB_MAD_QPS_CORE		2 /* Always QP0 and QP1 as a minimum */
+
+/* QP and CQ parameters */
+#define IB_MAD_QP_SEND_SIZE	128
+#define IB_MAD_QP_RECV_SIZE	512
+/* Bounds the send/receive queue sizes are clamped to at module init */
+#define IB_MAD_QP_MIN_SIZE	64
+#define IB_MAD_QP_MAX_SIZE	8192
+/* Scatter/gather entries requested per send / receive work request */
+#define IB_MAD_SEND_REQ_MAX_SG	2
+#define IB_MAD_RECV_REQ_MAX_SG	1
+
+/* Send queue PSN programmed when a MAD QP is moved to RTS */
+#define IB_MAD_SEND_Q_PSN	0
+
+/* Registration table sizes */
+#define MAX_MGMT_CLASS		80
+#define MAX_MGMT_VERSION	8
+#define MAX_MGMT_OUI		8
+#define MAX_MGMT_VENDOR_RANGE2	(IB_MGMT_CLASS_VENDOR_RANGE2_END - \
+				IB_MGMT_CLASS_VENDOR_RANGE2_START + 1)
+
+/*
+ * List linkage embedded in every queued MAD: links the MAD into an
+ * ib_mad_queue's list and records which queue that is.
+ */
+struct ib_mad_list_head {
+	struct list_head list;
+	struct ib_mad_queue *mad_queue;
+};
+
+/*
+ * Software-only bookkeeping placed in front of a receive buffer.  This
+ * part is excluded from the DMA mapping of the buffer.  The structure is
+ * packed.
+ */
+struct ib_mad_private_header {
+	struct ib_mad_list_head mad_list;	/* linkage on the recv queue */
+	struct ib_mad_recv_wc recv_wc;		/* passed to recv handlers */
+	struct ib_wc wc;
+	u64 mapping;				/* DMA address of the buffer */
+} __attribute__ ((packed));
+
+/*
+ * One MAD buffer as allocated from ib_mad_cache.  Receive buffers are DMA
+ * mapped starting at grh, for the size of the structure minus the header.
+ * The structure is packed.
+ */
+struct ib_mad_private {
+	struct ib_mad_private_header header;
+	struct ib_grh grh;
+	/* The same MAD bytes viewed as a plain, RMPP or SMP MAD */
+	union {
+		struct ib_mad mad;
+		struct ib_rmpp_mad rmpp_mad;
+		struct ib_smp smp;
+	} mad;
+} __attribute__ ((packed));
+
+/*
+ * A list-linked, numbered chunk with variable-length trailing data.
+ * NOTE(review): presumably one RMPP segment of an outgoing message, with
+ * num as its segment number - confirm against the code that allocates it.
+ */
+struct ib_rmpp_segment {
+	struct list_head list;
+	u32 num;
+	u8 data[0];	/* storage follows the structure */
+};
+
+/*
+ * Private state of a registered MAD agent; wraps the public
+ * struct ib_mad_agent handed out to the client.
+ */
+struct ib_mad_agent_private {
+	struct list_head agent_list;
+	struct ib_mad_agent agent;
+	struct ib_mad_reg_req *reg_req;
+	struct ib_mad_qp_info *qp_info;
+
+	/* Taken around accesses to the lists below */
+	spinlock_t lock;
+	struct list_head send_list;	/* sends currently posted */
+	struct list_head wait_list;	/* sends waiting for timeout/response */
+	struct list_head done_list;
+	struct work_struct timeout_work;	/* runs timeout_sends() */
+	struct timer_list timeout_timer;	/* re-armed by timeout_sends() */
+	unsigned long timeout;
+	struct list_head local_list;	/* ib_mad_local_private entries */
+	struct work_struct local_work;		/* runs local_completions() */
+	struct list_head rmpp_list;	/* RMPP receives in progress */
+
+	atomic_t refcount;
+	struct completion comp;
+};
+
+/*
+ * Private state of a snooping agent.
+ * NOTE(review): snoop_index presumably is this agent's slot in
+ * ib_mad_qp_info.snoop_table and mad_snoop_flags selects which events it
+ * sees - confirm against the snoop registration code.
+ */
+struct ib_mad_snoop_private {
+	struct ib_mad_agent agent;
+	struct ib_mad_qp_info *qp_info;
+	int snoop_index;
+	int mad_snoop_flags;
+	atomic_t refcount;
+	struct completion comp;
+};
+
+/*
+ * Private state of one MAD send request; wraps the public
+ * struct ib_mad_send_buf given to the client.
+ */
+struct ib_mad_send_wr_private {
+	struct ib_mad_list_head mad_list;
+	struct list_head agent_list;	/* on one of the agent's lists */
+	struct ib_mad_agent_private *mad_agent_priv;
+	struct ib_mad_send_buf send_buf;
+	u64 header_mapping;
+	u64 payload_mapping;
+	struct ib_send_wr send_wr;
+	struct ib_sge sg_list[IB_MAD_SEND_REQ_MAX_SG];
+	__be64 tid;
+	unsigned long timeout;		/* in jiffies */
+	int max_retries;
+	int retries_left;		/* decremented by retry_send() */
+	int retry;
+	int refcount;
+	enum ib_wc_status status;	/* reported on completion if an error */
+
+	/* RMPP control */
+	struct list_head rmpp_list;
+	struct ib_rmpp_segment *last_ack_seg;
+	struct ib_rmpp_segment *cur_seg;
+	int last_ack;
+	int seg_num;
+	int newwin;
+	int pad;
+};
+
+/*
+ * One locally completed send, queued on an agent's local_list and
+ * processed by local_completions().
+ */
+struct ib_mad_local_private {
+	struct list_head completion_list;	/* linkage on local_list */
+	/* MAD to deliver to recv_mad_agent; may be NULL */
+	struct ib_mad_private *mad_priv;
+	/* Receiving agent; NULL makes local_completions() free mad_priv */
+	struct ib_mad_agent_private *recv_mad_agent;
+	/* The send request whose completion is to be reported */
+	struct ib_mad_send_wr_private *mad_send_wr;
+};
+
+/* One agent pointer per method, IB_MGMT_MAX_METHODS entries */
+struct ib_mad_mgmt_method_table {
+	struct ib_mad_agent_private *agent[IB_MGMT_MAX_METHODS];
+};
+
+/* One method table pointer per management class, MAX_MGMT_CLASS entries */
+struct ib_mad_mgmt_class_table {
+	struct ib_mad_mgmt_method_table *method_table[MAX_MGMT_CLASS];
+};
+
+/*
+ * Up to MAX_MGMT_OUI three-byte OUIs of one vendor class, each paired by
+ * index with its own method table.
+ */
+struct ib_mad_mgmt_vendor_class {
+	u8	oui[MAX_MGMT_OUI][3];
+	struct ib_mad_mgmt_method_table *method_table[MAX_MGMT_OUI];
+};
+
+/* One vendor class pointer per class of the "vendor range 2" window */
+struct ib_mad_mgmt_vendor_class_table {
+	struct ib_mad_mgmt_vendor_class *vendor_class[MAX_MGMT_VENDOR_RANGE2];
+};
+
+/*
+ * Registration tables of one management version: a class table and a
+ * vendor class table.
+ */
+struct ib_mad_mgmt_version_table {
+	struct ib_mad_mgmt_class_table *class;
+	struct ib_mad_mgmt_vendor_class_table *vendor;
+};
+
+/* Software view of a QP's send or receive work queue */
+struct ib_mad_queue {
+	spinlock_t lock;		/* taken around list and count updates */
+	struct list_head list;		/* ib_mad_list_head entries */
+	int count;			/* entries currently accounted */
+	int max_active;			/* posting stops at this count */
+	struct ib_mad_qp_info *qp_info;	/* owning QP */
+};
+
+/* Per-QP state of a MAD port (one slot per entry of qp_info[]) */
+struct ib_mad_qp_info {
+	struct ib_mad_port_private *port_priv;	/* owning port */
+	struct ib_qp *qp;		/* NULL when the QP was not created */
+	struct ib_mad_queue send_queue;
+	struct ib_mad_queue recv_queue;
+	struct list_head overflow_list;
+	spinlock_t snoop_lock;
+	struct ib_mad_snoop_private **snoop_table;	/* kfree()d with the QP */
+	int snoop_table_size;
+	atomic_t snoop_count;		/* non-zero enables snoop callbacks */
+};
+
+/*
+ * Per-port state of the MAD layer, created by ib_mad_port_open() and
+ * freed by ib_mad_port_close().
+ */
+struct ib_mad_port_private {
+	struct list_head port_list;	/* linkage on ib_mad_port_list */
+	struct ib_device *device;
+	int port_num;
+	struct ib_cq *cq;		/* shared by all QPs of the port */
+	struct ib_pd *pd;
+	struct ib_mr *mr;		/* DMA MR; its lkey is used for posts */
+
+	spinlock_t reg_lock;
+	struct ib_mad_mgmt_version_table version[MAX_MGMT_VERSION];
+	struct list_head agent_list;
+	struct workqueue_struct *wq;	/* single-threaded, "ib_mad<port>" */
+	struct work_struct work;	/* queued from the CQ callback */
+	struct ib_mad_qp_info qp_info[IB_MAD_QPS_CORE];
+};
+
+/*
+ * MAD-layer functions declared for every file that includes this header.
+ */
+
+/* Post a send request; a non-zero return is treated as failure by callers */
+int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr);
+
+/* NOTE(review): presumably looks up the send matching a received MAD */
+struct ib_mad_send_wr_private *
+ib_find_send_mad(struct ib_mad_agent_private *mad_agent_priv,
+		 struct ib_mad_recv_wc *mad_recv_wc);
+
+void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr,
+			     struct ib_mad_send_wc *mad_send_wc);
+
+void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr);
+
+/* timeout_ms is given in milliseconds */
+void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr,
+			  int timeout_ms);
+
+#endif	/* __IB_MAD_PRIV_H__ */
diff --git a/sys/ofed/drivers/infiniband/core/mad_rmpp.c b/sys/ofed/drivers/infiniband/core/mad_rmpp.c
new file mode 100644
index 0000000..4e0f282
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/mad_rmpp.c
@@ -0,0 +1,951 @@
+/*
+ * Copyright (c) 2005 Intel Inc. All rights reserved.
+ * Copyright (c) 2005-2006 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "mad_priv.h"
+#include "mad_rmpp.h"
+
+/* Life-cycle state of an RMPP receive (struct mad_rmpp_recv) */
+enum rmpp_state {
+	RMPP_STATE_ACTIVE,	/* initial state set at creation */
+	RMPP_STATE_TIMEOUT,	/* set by recv_timeout_handler() */
+	RMPP_STATE_COMPLETE,	/* rmpp_wc is not freed when canceling */
+	RMPP_STATE_CANCELING	/* set by ib_cancel_rmpp_recvs() */
+};
+
+/* State of one RMPP receive in progress, linked on agent->rmpp_list */
+struct mad_rmpp_recv {
+	struct ib_mad_agent_private *agent;
+	struct list_head list;		/* linkage on agent->rmpp_list */
+	struct delayed_work timeout_work;	/* recv_timeout_handler() */
+	struct delayed_work cleanup_work;	/* recv_cleanup_handler() */
+	struct completion comp;		/* completed when refcount hits 0 */
+	enum rmpp_state state;
+	spinlock_t lock;		/* held while an ACK is filled in */
+	atomic_t refcount;
+
+	struct ib_ah *ah;		/* used for replies; destroyed with us */
+	struct ib_mad_recv_wc *rmpp_wc;
+	struct ib_mad_recv_buf *cur_seg_buf;
+	int last_ack;			/* seg_num of the last ACK formatted */
+	int seg_num;
+	int newwin;
+	int repwin;
+
+	/* Identification, copied from the first MAD received */
+	__be64 tid;
+	u32 src_qp;
+	u16 slid;
+	u8 mgmt_class;
+	u8 class_version;
+	u8 method;
+};
+
+/*
+ * Drop one reference on an RMPP receive; the final put signals
+ * rmpp_recv->comp.
+ */
+static inline void deref_rmpp_recv(struct mad_rmpp_recv *rmpp_recv)
+{
+	if (!atomic_dec_and_test(&rmpp_recv->refcount))
+		return;
+
+	complete(&rmpp_recv->comp);
+}
+
+/*
+ * Tear down an RMPP receive: drop the caller's reference, wait until all
+ * other references are gone, then destroy the address handle and free
+ * the structure.  The entry is not removed from any list here.
+ */
+static void destroy_rmpp_recv(struct mad_rmpp_recv *rmpp_recv)
+{
+	deref_rmpp_recv(rmpp_recv);
+	/* Blocks until the last reference holder has called deref. */
+	wait_for_completion(&rmpp_recv->comp);
+	ib_destroy_ah(rmpp_recv->ah);
+	kfree(rmpp_recv);
+}
+
+/*
+ * Cancel every RMPP receive of an agent.
+ *
+ * 1. Under the agent lock, mark each receive CANCELING; the received MAD
+ *    of a receive that is not COMPLETE is freed at this point.
+ * 2. Cancel the pending timeout and cleanup work of each receive.
+ * 3. Flush the port workqueue.
+ * 4. Unlink and destroy each receive.
+ */
+void ib_cancel_rmpp_recvs(struct ib_mad_agent_private *agent)
+{
+	struct mad_rmpp_recv *rmpp_recv, *temp_rmpp_recv;
+	unsigned long flags;
+
+	spin_lock_irqsave(&agent->lock, flags);
+	list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) {
+		if (rmpp_recv->state != RMPP_STATE_COMPLETE)
+			ib_free_recv_mad(rmpp_recv->rmpp_wc);
+		rmpp_recv->state = RMPP_STATE_CANCELING;
+	}
+	spin_unlock_irqrestore(&agent->lock, flags);
+
+	/*
+	 * These two walks run without the agent lock.  The handlers return
+	 * early for CANCELING (cleanup) or non-ACTIVE (timeout) entries and
+	 * so leave the list alone.
+	 */
+	list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) {
+		cancel_delayed_work(&rmpp_recv->timeout_work);
+		cancel_delayed_work(&rmpp_recv->cleanup_work);
+	}
+
+	flush_workqueue(agent->qp_info->port_priv->wq);
+
+	list_for_each_entry_safe(rmpp_recv, temp_rmpp_recv,
+				 &agent->rmpp_list, list) {
+		list_del(&rmpp_recv->list);
+		destroy_rmpp_recv(rmpp_recv);
+	}
+}
+
+/*
+ * Fill in an RMPP ACK in msg for the received MAD data.
+ *
+ * The first msg->hdr_len bytes are copied from the received MAD, the
+ * response bit of the method is toggled and the RMPP type is set to ACK
+ * with the ACTIVE flag.  The segment number and new window are taken from
+ * rmpp_recv under its lock, and last_ack is updated to that segment
+ * number.
+ */
+static void format_ack(struct ib_mad_send_buf *msg,
+		       struct ib_rmpp_mad *data,
+		       struct mad_rmpp_recv *rmpp_recv)
+{
+	struct ib_rmpp_mad *ack = msg->mad;
+	unsigned long flags;
+
+	memcpy(ack, &data->mad_hdr, msg->hdr_len);
+
+	ack->mad_hdr.method ^= IB_MGMT_METHOD_RESP;
+	ack->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_ACK;
+	ib_set_rmpp_flags(&ack->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+
+	/* Read seg_num and newwin together so the ACK is consistent. */
+	spin_lock_irqsave(&rmpp_recv->lock, flags);
+	rmpp_recv->last_ack = rmpp_recv->seg_num;
+	ack->rmpp_hdr.seg_num = cpu_to_be32(rmpp_recv->seg_num);
+	ack->rmpp_hdr.paylen_newwin = cpu_to_be32(rmpp_recv->newwin);
+	spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+}
+
+/*
+ * Send an RMPP ACK for the given receive, using the receive's own address
+ * handle.  Failure to allocate or post the ACK is silent; a message that
+ * could not be posted is freed.
+ */
+static void ack_recv(struct mad_rmpp_recv *rmpp_recv,
+		     struct ib_mad_recv_wc *recv_wc)
+{
+	struct ib_mad_send_buf *ack;
+	int hdr_len;
+
+	hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class);
+	ack = ib_create_send_mad(&rmpp_recv->agent->agent, recv_wc->wc->src_qp,
+				 recv_wc->wc->pkey_index, 1, hdr_len,
+				 0, GFP_KERNEL);
+	if (IS_ERR(ack))
+		return;
+
+	format_ack(ack, (struct ib_rmpp_mad *) recv_wc->recv_buf.mad, rmpp_recv);
+	ack->ah = rmpp_recv->ah;
+
+	if (ib_post_send_mad(ack, NULL))
+		ib_free_send_mad(ack);
+}
+
+/*
+ * Allocate a header-only send MAD addressed back to the sender of
+ * recv_wc, together with a fresh address handle.
+ *
+ * The handle is stored in both msg->ah and msg->context[0]; that is how
+ * ib_rmpp_send_handler() recognises that it has to destroy it.
+ *
+ * Returns the message, or an ERR_PTR value if either the address handle
+ * or the message could not be created.
+ */
+static struct ib_mad_send_buf *alloc_response_msg(struct ib_mad_agent *agent,
+						  struct ib_mad_recv_wc *recv_wc)
+{
+	struct ib_mad_send_buf *reply;
+	struct ib_ah *reply_ah;
+	int hdr_len;
+
+	reply_ah = ib_create_ah_from_wc(agent->qp->pd, recv_wc->wc,
+					recv_wc->recv_buf.grh, agent->port_num);
+	if (IS_ERR(reply_ah))
+		return (void *) reply_ah;
+
+	hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class);
+	reply = ib_create_send_mad(agent, recv_wc->wc->src_qp,
+				   recv_wc->wc->pkey_index, 1,
+				   hdr_len, 0, GFP_KERNEL);
+	if (IS_ERR(reply)) {
+		ib_destroy_ah(reply_ah);
+		return reply;
+	}
+
+	reply->ah = reply_ah;
+	reply->context[0] = reply_ah;
+	return reply;
+}
+
+/*
+ * Reply to a received RMPP MAD with a copy of its header in which the
+ * response bit of the method is toggled, the ACTIVE flag is set, the
+ * segment number is 0 and the new window is 1.
+ *
+ * Allocation failures are silent; if the reply cannot be posted its
+ * address handle and buffer are released here.
+ */
+static void ack_ds_ack(struct ib_mad_agent_private *agent,
+		       struct ib_mad_recv_wc *recv_wc)
+{
+	struct ib_mad_send_buf *reply;
+	struct ib_rmpp_mad *hdr;
+
+	reply = alloc_response_msg(&agent->agent, recv_wc);
+	if (IS_ERR(reply))
+		return;
+
+	hdr = reply->mad;
+	memcpy(hdr, recv_wc->recv_buf.mad, reply->hdr_len);
+
+	hdr->mad_hdr.method ^= IB_MGMT_METHOD_RESP;
+	ib_set_rmpp_flags(&hdr->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+	hdr->rmpp_hdr.seg_num = 0;
+	hdr->rmpp_hdr.paylen_newwin = cpu_to_be32(1);
+
+	if (ib_post_send_mad(reply, NULL) == 0)
+		return;
+
+	ib_destroy_ah(reply->ah);
+	ib_free_send_mad(reply);
+}
+
+/*
+ * Send completion handling for messages generated by the RMPP code: free
+ * the send buffer and, when context[0] holds the same pointer as the
+ * buffer's address handle (as set up by alloc_response_msg()), destroy
+ * that handle first.
+ */
+void ib_rmpp_send_handler(struct ib_mad_send_wc *mad_send_wc)
+{
+	struct ib_mad_send_buf *buf = mad_send_wc->send_buf;
+
+	if (buf->context[0] == buf->ah)
+		ib_destroy_ah(buf->ah);
+
+	ib_free_send_mad(mad_send_wc->send_buf);
+}
+
+/*
+ * Reply to a received RMPP MAD with an ABORT carrying rmpp_status.
+ *
+ * The reply is a copy of the received header with the response bit of
+ * the method toggled, RMPP version IB_MGMT_RMPP_VERSION, type ABORT, the
+ * ACTIVE flag set and segment number and window cleared.  Allocation
+ * failures are silent; if the reply cannot be posted its address handle
+ * and buffer are released here.
+ */
+static void nack_recv(struct ib_mad_agent_private *agent,
+		      struct ib_mad_recv_wc *recv_wc, u8 rmpp_status)
+{
+	struct ib_mad_send_buf *reply;
+	struct ib_rmpp_mad *hdr;
+
+	reply = alloc_response_msg(&agent->agent, recv_wc);
+	if (IS_ERR(reply))
+		return;
+
+	hdr = reply->mad;
+	memcpy(hdr, recv_wc->recv_buf.mad, reply->hdr_len);
+
+	hdr->mad_hdr.method ^= IB_MGMT_METHOD_RESP;
+	hdr->rmpp_hdr.rmpp_version = IB_MGMT_RMPP_VERSION;
+	hdr->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_ABORT;
+	ib_set_rmpp_flags(&hdr->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+	hdr->rmpp_hdr.rmpp_status = rmpp_status;
+	hdr->rmpp_hdr.seg_num = 0;
+	hdr->rmpp_hdr.paylen_newwin = 0;
+
+	if (ib_post_send_mad(reply, NULL) == 0)
+		return;
+
+	ib_destroy_ah(reply->ah);
+	ib_free_send_mad(reply);
+}
+
+/*
+ * Delayed-work handler for mad_rmpp_recv.timeout_work.
+ *
+ * Acts only on a receive that is still ACTIVE: marks it TIMEOUT, unlinks
+ * it from the agent's list, sends an ABORT with status
+ * IB_MGMT_RMPP_STATUS_T2L to the sender, destroys the receive and frees
+ * the MADs received so far.
+ */
+static void recv_timeout_handler(struct work_struct *work)
+{
+	struct mad_rmpp_recv *rmpp_recv =
+		container_of(work, struct mad_rmpp_recv, timeout_work.work);
+	struct ib_mad_recv_wc *rmpp_wc;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rmpp_recv->agent->lock, flags);
+	if (rmpp_recv->state != RMPP_STATE_ACTIVE) {
+		spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags);
+		return;
+	}
+	rmpp_recv->state = RMPP_STATE_TIMEOUT;
+	list_del(&rmpp_recv->list);
+	spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags);
+
+	/* Save rmpp_wc: rmpp_recv is freed by destroy_rmpp_recv() below. */
+	rmpp_wc = rmpp_recv->rmpp_wc;
+	nack_recv(rmpp_recv->agent, rmpp_wc, IB_MGMT_RMPP_STATUS_T2L);
+	destroy_rmpp_recv(rmpp_recv);
+	ib_free_recv_mad(rmpp_wc);
+}
+
+/*
+ * Delayed-work handler that retires a receive context some time after its
+ * transfer completed (queued by complete_rmpp).  Unless the context is
+ * being canceled, it is unlinked from its list under the agent lock and
+ * then destroyed.
+ */
+static void recv_cleanup_handler(struct work_struct *work)
+{
+	struct mad_rmpp_recv *rmpp_recv;
+	unsigned long flags;
+	int canceling;
+
+	rmpp_recv = container_of(work, struct mad_rmpp_recv, cleanup_work.work);
+
+	spin_lock_irqsave(&rmpp_recv->agent->lock, flags);
+	canceling = (rmpp_recv->state == RMPP_STATE_CANCELING);
+	if (!canceling)
+		list_del(&rmpp_recv->list);
+	spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags);
+
+	if (!canceling)
+		destroy_rmpp_recv(rmpp_recv);
+}
+
+/*
+ * Allocate and initialise the reassembly context for an RMPP receive whose
+ * first segment is @mad_recv_wc.  An address handle is created from the
+ * work completion, and the values later used to match further segments
+ * (tid, src_qp, slid, mgmt_class, class_version, method) are copied from
+ * the first segment.  The context starts in the ACTIVE state with a
+ * reference count of one.
+ *
+ * Returns the new context, or NULL if the allocation or the address handle
+ * creation fails.
+ */
+static struct mad_rmpp_recv *
+create_rmpp_recv(struct ib_mad_agent_private *agent,
+		 struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_mad_hdr *first_hdr = &mad_recv_wc->recv_buf.mad->mad_hdr;
+	struct mad_rmpp_recv *ctx;
+
+	ctx = kmalloc(sizeof *ctx, GFP_KERNEL);
+	if (!ctx)
+		return NULL;
+
+	ctx->ah = ib_create_ah_from_wc(agent->agent.qp->pd,
+				       mad_recv_wc->wc,
+				       mad_recv_wc->recv_buf.grh,
+				       agent->agent.port_num);
+	if (IS_ERR(ctx->ah)) {
+		kfree(ctx);
+		return NULL;
+	}
+
+	/* Bookkeeping: owner, synchronisation objects and deferred work. */
+	ctx->agent = agent;
+	init_completion(&ctx->comp);
+	INIT_DELAYED_WORK(&ctx->timeout_work, recv_timeout_handler);
+	INIT_DELAYED_WORK(&ctx->cleanup_work, recv_cleanup_handler);
+	spin_lock_init(&ctx->lock);
+	ctx->state = RMPP_STATE_ACTIVE;
+	atomic_set(&ctx->refcount, 1);
+
+	/* Reassembly state: the first segment is the only one so far. */
+	ctx->rmpp_wc = mad_recv_wc;
+	ctx->cur_seg_buf = &mad_recv_wc->recv_buf;
+	ctx->newwin = 1;
+	ctx->seg_num = 1;
+	ctx->last_ack = 0;
+	ctx->repwin = 1;
+
+	/* Matching key for subsequent segments. */
+	ctx->tid = first_hdr->tid;
+	ctx->src_qp = mad_recv_wc->wc->src_qp;
+	ctx->slid = mad_recv_wc->wc->slid;
+	ctx->mgmt_class = first_hdr->mgmt_class;
+	ctx->class_version = first_hdr->class_version;
+	ctx->method  = first_hdr->method;
+
+	return ctx;
+}
+
+/*
+ * Search the agent's rmpp_list for the receive context that @mad_recv_wc
+ * belongs to.  A context matches when tid, src_qp, slid, mgmt_class,
+ * class_version and method all agree.  Returns the context or NULL.
+ *
+ * No lock is taken here; the callers visible in this file hold
+ * agent->lock around the call.
+ */
+static struct mad_rmpp_recv *
+find_rmpp_recv(struct ib_mad_agent_private *agent,
+	       struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_mad_hdr *hdr = &mad_recv_wc->recv_buf.mad->mad_hdr;
+	struct mad_rmpp_recv *cur;
+
+	list_for_each_entry(cur, &agent->rmpp_list, list) {
+		if (cur->tid != hdr->tid)
+			continue;
+		if (cur->src_qp != mad_recv_wc->wc->src_qp)
+			continue;
+		if (cur->slid != mad_recv_wc->wc->slid)
+			continue;
+		if (cur->mgmt_class != hdr->mgmt_class)
+			continue;
+		if (cur->class_version != hdr->class_version)
+			continue;
+		if (cur->method != hdr->method)
+			continue;
+		return cur;
+	}
+	return NULL;
+}
+
+/*
+ * Look up the receive context matching @mad_recv_wc and, if one exists,
+ * take a reference on it.  Lookup and reference are done under
+ * agent->lock.  Returns the referenced context or NULL.
+ */
+static struct mad_rmpp_recv *
+acquire_rmpp_recv(struct ib_mad_agent_private *agent,
+		  struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct mad_rmpp_recv *found;
+	unsigned long flags;
+
+	spin_lock_irqsave(&agent->lock, flags);
+	found = find_rmpp_recv(agent, mad_recv_wc);
+	if (found != NULL)
+		atomic_inc(&found->refcount);
+	spin_unlock_irqrestore(&agent->lock, flags);
+
+	return found;
+}
+
+/*
+ * Append @rmpp_recv to the agent's rmpp_list unless a context matching the
+ * same transfer is already on it.  Returns the existing context when there
+ * is one (nothing is inserted), otherwise NULL.  No lock is taken here;
+ * start_rmpp calls this with agent->lock held.
+ */
+static struct mad_rmpp_recv *
+insert_rmpp_recv(struct ib_mad_agent_private *agent,
+		 struct mad_rmpp_recv *rmpp_recv)
+{
+	struct mad_rmpp_recv *existing;
+
+	existing = find_rmpp_recv(agent, rmpp_recv->rmpp_wc);
+	if (existing)
+		return existing;
+
+	list_add_tail(&rmpp_recv->list, &agent->rmpp_list);
+	return NULL;
+}
+
+/* Non-zero when the RMPP header of @seg has the LAST flag set. */
+static inline int get_last_flag(struct ib_mad_recv_buf *seg)
+{
+	struct ib_rmpp_mad *seg_mad = (struct ib_rmpp_mad *) seg->mad;
+
+	return ib_get_rmpp_flags(&seg_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_LAST;
+}
+
+/* Segment number from the RMPP header of @seg, in CPU byte order. */
+static inline int get_seg_num(struct ib_mad_recv_buf *seg)
+{
+	struct ib_rmpp_mad *seg_mad = (struct ib_rmpp_mad *) seg->mad;
+
+	return be32_to_cpu(seg_mad->rmpp_hdr.seg_num);
+}
+
+/*
+ * Return the segment linked after @seg on the list headed by @rmpp_list,
+ * or NULL when @seg is the last entry.
+ */
+static inline struct ib_mad_recv_buf * get_next_seg(struct list_head *rmpp_list,
+						    struct ib_mad_recv_buf *seg)
+{
+	struct list_head *following = seg->list.next;
+
+	if (following != rmpp_list)
+		return container_of(following, struct ib_mad_recv_buf, list);
+
+	return NULL;
+}
+
+/*
+ * Number of segments by which the receive window is advanced: one eighth
+ * of the receive queue's max_active, but never less than 1.
+ */
+static inline int window_size(struct ib_mad_agent_private *agent)
+{
+	return max(agent->qp_info->recv_queue.max_active >> 3, 1);
+}
+
+/*
+ * Find the segment after which a new segment numbered @seg_num should be
+ * linked.  The list is walked from its tail; the first entry with a
+ * smaller segment number is returned.  NULL is returned when @seg_num is
+ * already on the list or when no entry has a smaller number.
+ */
+static struct ib_mad_recv_buf * find_seg_location(struct list_head *rmpp_list,
+						  int seg_num)
+{
+	struct ib_mad_recv_buf *cur;
+
+	list_for_each_entry_reverse(cur, rmpp_list, list) {
+		int cur_num = get_seg_num(cur);
+
+		if (cur_num == seg_num)
+			return NULL;	/* duplicate segment */
+		if (cur_num < seg_num)
+			return cur;
+	}
+	return NULL;
+}
+
+/*
+ * Advance the in-order position of @rmpp_recv.  Starting at @new_buf, each
+ * list entry whose segment number is exactly one more than the current
+ * seg_num becomes the new cur_seg_buf; the walk stops at the first gap or
+ * at the end of the list.
+ */
+static void update_seg_num(struct mad_rmpp_recv *rmpp_recv,
+			   struct ib_mad_recv_buf *new_buf)
+{
+	struct list_head *segments = &rmpp_recv->rmpp_wc->rmpp_list;
+	struct ib_mad_recv_buf *seg;
+
+	for (seg = new_buf; seg; seg = get_next_seg(segments, seg)) {
+		if (get_seg_num(seg) != rmpp_recv->seg_num + 1)
+			break;
+		rmpp_recv->cur_seg_buf = seg;
+		rmpp_recv->seg_num++;
+	}
+}
+
+/*
+ * Compute the length of the reassembled MAD: the class-specific header
+ * plus seg_num data areas, minus the padding of the current segment.  The
+ * padding is derived from that segment's paylen_newwin field and is
+ * treated as zero when it falls outside 0..IB_MGMT_RMPP_DATA.
+ */
+static inline int get_mad_len(struct mad_rmpp_recv *rmpp_recv)
+{
+	struct ib_rmpp_mad *cur_mad;
+	int hdr_size;
+	int data_size;
+	int pad;
+
+	cur_mad = (struct ib_rmpp_mad *)rmpp_recv->cur_seg_buf->mad;
+	hdr_size = ib_get_mad_data_offset(cur_mad->mad_hdr.mgmt_class);
+	data_size = sizeof(struct ib_rmpp_mad) - hdr_size;
+
+	pad = IB_MGMT_RMPP_DATA - be32_to_cpu(cur_mad->rmpp_hdr.paylen_newwin);
+	if (pad < 0 || pad > IB_MGMT_RMPP_DATA)
+		pad = 0;
+
+	return hdr_size + rmpp_recv->seg_num * data_size - pad;
+}
+
+/*
+ * Finish a receive whose last segment has arrived: ACK it, cancel the
+ * receive timeout if more than one segment was received, record the total
+ * MAD length in the work completion and schedule the deferred cleanup of
+ * the receive context.  Returns the reassembled work completion.
+ */
+static struct ib_mad_recv_wc * complete_rmpp(struct mad_rmpp_recv *rmpp_recv)
+{
+	struct ib_mad_recv_wc *rmpp_wc;
+
+	ack_recv(rmpp_recv, rmpp_recv->rmpp_wc);
+	/* start_rmpp only queues the timeout for multi-segment transfers */
+	if (rmpp_recv->seg_num > 1)
+		cancel_delayed_work(&rmpp_recv->timeout_work);
+
+	rmpp_wc = rmpp_recv->rmpp_wc;
+	rmpp_wc->mad_len = get_mad_len(rmpp_recv);
+	/* 10 seconds until we can find the packet lifetime */
+	queue_delayed_work(rmpp_recv->agent->qp_info->port_priv->wq,
+			   &rmpp_recv->cleanup_work, msecs_to_jiffies(10000));
+	return rmpp_wc;
+}
+
+/*
+ * Process a DATA segment that belongs to a receive already in progress
+ * (also reached from start_rmpp for a duplicate first segment).
+ *
+ * The segment is freed and NULL is returned when no matching receive
+ * exists, the receive has timed out, the segment lies beyond the current
+ * window, or find_seg_location() yields no insertion point.  A segment at
+ * or below last_ack, or one arriving after completion, is re-ACKed and
+ * freed.  Otherwise the segment is linked into the reassembly list; if it
+ * extends the in-order run up to a segment with the LAST flag, the
+ * transfer is completed and its work completion returned, and if it fills
+ * the window, the window is enlarged and an ACK sent.
+ *
+ * Returns the completed work completion, or NULL if the transfer is not
+ * yet complete or the segment was dropped.
+ */
+static struct ib_mad_recv_wc *
+continue_rmpp(struct ib_mad_agent_private *agent,
+	      struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct mad_rmpp_recv *rmpp_recv;
+	struct ib_mad_recv_buf *prev_buf;
+	struct ib_mad_recv_wc *done_wc;
+	int seg_num;
+	unsigned long flags;
+
+	/* Takes a reference that every exit path below must drop. */
+	rmpp_recv = acquire_rmpp_recv(agent, mad_recv_wc);
+	if (!rmpp_recv)
+		goto drop1;
+
+	seg_num = get_seg_num(&mad_recv_wc->recv_buf);
+
+	spin_lock_irqsave(&rmpp_recv->lock, flags);
+	if ((rmpp_recv->state == RMPP_STATE_TIMEOUT) ||
+	    (seg_num > rmpp_recv->newwin))
+		goto drop3;
+
+	/* Already acknowledged or transfer finished: just repeat the ACK. */
+	if ((seg_num <= rmpp_recv->last_ack) ||
+	    (rmpp_recv->state == RMPP_STATE_COMPLETE)) {
+		spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+		ack_recv(rmpp_recv, mad_recv_wc);
+		goto drop2;
+	}
+
+	prev_buf = find_seg_location(&rmpp_recv->rmpp_wc->rmpp_list, seg_num);
+	if (!prev_buf)
+		goto drop3;
+
+	done_wc = NULL;
+	list_add(&mad_recv_wc->recv_buf.list, &prev_buf->list);
+	/* Only a segment linked right after cur_seg_buf extends the run. */
+	if (rmpp_recv->cur_seg_buf == prev_buf) {
+		update_seg_num(rmpp_recv, &mad_recv_wc->recv_buf);
+		if (get_last_flag(rmpp_recv->cur_seg_buf)) {
+			rmpp_recv->state = RMPP_STATE_COMPLETE;
+			spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+			done_wc = complete_rmpp(rmpp_recv);
+			goto out;
+		} else if (rmpp_recv->seg_num == rmpp_recv->newwin) {
+			rmpp_recv->newwin += window_size(agent);
+			spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+			ack_recv(rmpp_recv, mad_recv_wc);
+			goto out;
+		}
+	}
+	spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+out:
+	deref_rmpp_recv(rmpp_recv);
+	return done_wc;
+
+	/* Drop paths: each label undoes one more step, then frees the MAD. */
+drop3:	spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+drop2:	deref_rmpp_recv(rmpp_recv);
+drop1:	ib_free_recv_mad(mad_recv_wc);
+	return NULL;
+}
+
+/*
+ * Process the first DATA segment of a transfer.  A new receive context is
+ * created and inserted into the agent's list.  If a context for the same
+ * transfer already exists, the new one is destroyed and the segment is
+ * handed to continue_rmpp() as a duplicate.
+ *
+ * A single-segment transfer (LAST flag set) is completed immediately and
+ * @mad_recv_wc is returned.  Otherwise the receive timeout is queued, the
+ * window is enlarged, an ACK is sent and NULL is returned.  NULL is also
+ * returned, after freeing the MAD, when the context cannot be created.
+ */
+static struct ib_mad_recv_wc *
+start_rmpp(struct ib_mad_agent_private *agent,
+	   struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct mad_rmpp_recv *rmpp_recv;
+	unsigned long flags;
+
+	rmpp_recv = create_rmpp_recv(agent, mad_recv_wc);
+	if (!rmpp_recv) {
+		ib_free_recv_mad(mad_recv_wc);
+		return NULL;
+	}
+
+	spin_lock_irqsave(&agent->lock, flags);
+	if (insert_rmpp_recv(agent, rmpp_recv)) {
+		spin_unlock_irqrestore(&agent->lock, flags);
+		/* duplicate first MAD */
+		destroy_rmpp_recv(rmpp_recv);
+		return continue_rmpp(agent, mad_recv_wc);
+	}
+	/* Extra reference for this function; dropped at the end. */
+	atomic_inc(&rmpp_recv->refcount);
+
+	if (get_last_flag(&mad_recv_wc->recv_buf)) {
+		rmpp_recv->state = RMPP_STATE_COMPLETE;
+		spin_unlock_irqrestore(&agent->lock, flags);
+		complete_rmpp(rmpp_recv);
+	} else {
+		spin_unlock_irqrestore(&agent->lock, flags);
+		/* 40 seconds until we can find the packet lifetimes */
+		queue_delayed_work(agent->qp_info->port_priv->wq,
+				   &rmpp_recv->timeout_work,
+				   msecs_to_jiffies(40000));
+		rmpp_recv->newwin += window_size(agent);
+		ack_recv(rmpp_recv, mad_recv_wc);
+		mad_recv_wc = NULL;
+	}
+	deref_rmpp_recv(rmpp_recv);
+	return mad_recv_wc;
+}
+
+/*
+ * Prepare and send the next segment of an RMPP transfer.  seg_num is
+ * advanced and written into the header.  The FIRST flag and the total
+ * payload length are set on segment 1; the LAST flag and the payload
+ * length of the final segment are set on the last one (which overrides
+ * the first-segment length for a single-segment transfer).  Every other
+ * segment carries a payload length of 0.
+ *
+ * When the caller's timeout_ms is zero or larger than 2000, the send
+ * timeout is set to 2 seconds.  Returns the result of ib_send_mad().
+ */
+static int send_next_seg(struct ib_mad_send_wr_private *mad_send_wr)
+{
+	struct ib_rmpp_mad *rmpp_mad = mad_send_wr->send_buf.mad;
+	u32 paylen = 0;
+	int timeout;
+
+	ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+	mad_send_wr->seg_num++;
+	rmpp_mad->rmpp_hdr.seg_num = cpu_to_be32(mad_send_wr->seg_num);
+
+	if (mad_send_wr->seg_num == 1) {
+		rmpp_mad->rmpp_hdr.rmpp_rtime_flags |= IB_MGMT_RMPP_FLAG_FIRST;
+		paylen = mad_send_wr->send_buf.seg_count * IB_MGMT_RMPP_DATA -
+			 mad_send_wr->pad;
+	}
+
+	if (mad_send_wr->seg_num == mad_send_wr->send_buf.seg_count) {
+		rmpp_mad->rmpp_hdr.rmpp_rtime_flags |= IB_MGMT_RMPP_FLAG_LAST;
+		paylen = IB_MGMT_RMPP_DATA - mad_send_wr->pad;
+	}
+
+	rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(paylen);
+
+	/* 2 seconds for an ACK until we can find the packet lifetime */
+	timeout = mad_send_wr->send_buf.timeout_ms;
+	if (timeout == 0 || timeout > 2000)
+		mad_send_wr->timeout = msecs_to_jiffies(2000);
+
+	return ib_send_mad(mad_send_wr);
+}
+
+/*
+ * Abort the outstanding send that @mad_recv_wc refers to, if any.  Nothing
+ * happens when no send matches or when the send is already done (all
+ * segments acknowledged, timeout cleared, or status not IB_WC_SUCCESS).
+ * Otherwise the send is marked done under agent->lock and then completed
+ * with status IB_WC_REM_ABORT_ERR and @rmpp_status as the vendor error.
+ */
+static void abort_send(struct ib_mad_agent_private *agent,
+		       struct ib_mad_recv_wc *mad_recv_wc, u8 rmpp_status)
+{
+	struct ib_mad_send_wr_private *send_wr;
+	struct ib_mad_send_wc wc;
+	unsigned long flags;
+
+	spin_lock_irqsave(&agent->lock, flags);
+	send_wr = ib_find_send_mad(agent, mad_recv_wc);
+
+	/* Unmatched send, or send is already done */
+	if (!send_wr ||
+	    send_wr->last_ack == send_wr->send_buf.seg_count ||
+	    !send_wr->timeout ||
+	    send_wr->status != IB_WC_SUCCESS) {
+		spin_unlock_irqrestore(&agent->lock, flags);
+		return;
+	}
+
+	ib_mark_mad_done(send_wr);
+	spin_unlock_irqrestore(&agent->lock, flags);
+
+	wc.status = IB_WC_REM_ABORT_ERR;
+	wc.vendor_err = rmpp_status;
+	wc.send_buf = &send_wr->send_buf;
+	ib_mad_complete_send_wr(send_wr, &wc);
+}
+
+/*
+ * Record @seg_num as the last acknowledged segment of @wr and move
+ * wr->last_ack_seg forward to the segment whose num equals @seg_num.
+ */
+static inline void adjust_last_ack(struct ib_mad_send_wr_private *wr,
+				   int seg_num)
+{
+	struct list_head *list;
+
+	wr->last_ack = seg_num;
+	/*
+	 * The walk starts at the entry following the previous last_ack_seg
+	 * and uses wr->last_ack_seg itself as the loop cursor.
+	 */
+	list = &wr->last_ack_seg->list;
+	list_for_each_entry(wr->last_ack_seg, list, list)
+		if (wr->last_ack_seg->num == seg_num)
+			break;
+}
+
+/*
+ * Handle an ACK that matches no outstanding send (called with seg_num 0
+ * from process_rmpp_ack, under agent->lock).  If a completed receive
+ * matches the ACK, @newwin is stored as its reply window.
+ */
+static void process_ds_ack(struct ib_mad_agent_private *agent,
+			   struct ib_mad_recv_wc *mad_recv_wc, int newwin)
+{
+	struct mad_rmpp_recv *done_recv = find_rmpp_recv(agent, mad_recv_wc);
+
+	if (!done_recv || done_recv->state != RMPP_STATE_COMPLETE)
+		return;
+
+	done_recv->repwin = newwin;
+}
+
+/*
+ * Process a received RMPP ACK.
+ *
+ * An ACK with a non-zero status, or whose new window is smaller than its
+ * segment number, aborts the matching send and is answered with an ABORT.
+ * Otherwise the matching send is looked up under agent->lock:
+ *  - no match: an ACK for segment 0 may update the reply window of a
+ *    completed receive (process_ds_ack); anything else is ignored;
+ *  - every segment already acknowledged while the timeout is still set:
+ *    the ACK is a repeat and is answered via ack_ds_ack();
+ *  - send already done: ignored;
+ *  - segment number beyond the segment count or the current window: the
+ *    send is aborted with status S2B;
+ *  - old ACK (smaller window or segment number): ignored;
+ *  - otherwise last_ack and the window are updated; if that completes the
+ *    transfer the send is either completed (no response expected) or its
+ *    timeout is reset and the ACK answered, and if more segments may be
+ *    sent the next one is sent.
+ */
+static void process_rmpp_ack(struct ib_mad_agent_private *agent,
+			     struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_mad_send_wr_private *mad_send_wr;
+	struct ib_rmpp_mad *rmpp_mad;
+	unsigned long flags;
+	int seg_num, newwin, ret;
+
+	rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad;
+	if (rmpp_mad->rmpp_hdr.rmpp_status) {
+		abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+		nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+		return;
+	}
+
+	seg_num = be32_to_cpu(rmpp_mad->rmpp_hdr.seg_num);
+	newwin = be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin);
+	if (newwin < seg_num) {
+		abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_W2S);
+		nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_W2S);
+		return;
+	}
+
+	spin_lock_irqsave(&agent->lock, flags);
+	mad_send_wr = ib_find_send_mad(agent, mad_recv_wc);
+	if (!mad_send_wr) {
+		if (!seg_num)
+			process_ds_ack(agent, mad_recv_wc, newwin);
+		goto out;	/* Unmatched or DS RMPP ACK */
+	}
+
+	if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) &&
+	    (mad_send_wr->timeout)) {
+		spin_unlock_irqrestore(&agent->lock, flags);
+		ack_ds_ack(agent, mad_recv_wc);
+		return;		/* Repeated ACK for DS RMPP transaction */
+	}
+
+	if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) ||
+	    (!mad_send_wr->timeout) || (mad_send_wr->status != IB_WC_SUCCESS))
+		goto out;	/* Send is already done */
+
+	/* abort_send() takes agent->lock itself, so drop it first. */
+	if (seg_num > mad_send_wr->send_buf.seg_count ||
+	    seg_num > mad_send_wr->newwin) {
+		spin_unlock_irqrestore(&agent->lock, flags);
+		abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_S2B);
+		nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_S2B);
+		return;
+	}
+
+	if (newwin < mad_send_wr->newwin || seg_num < mad_send_wr->last_ack)
+		goto out;	/* Old ACK */
+
+	/* Progress was made: restore the full retry budget. */
+	if (seg_num > mad_send_wr->last_ack) {
+		adjust_last_ack(mad_send_wr, seg_num);
+		mad_send_wr->retries_left = mad_send_wr->max_retries;
+	}
+	mad_send_wr->newwin = newwin;
+	if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) {
+		/* If no response is expected, the ACK completes the send */
+		if (!mad_send_wr->send_buf.timeout_ms) {
+			struct ib_mad_send_wc wc;
+
+			ib_mark_mad_done(mad_send_wr);
+			spin_unlock_irqrestore(&agent->lock, flags);
+
+			wc.status = IB_WC_SUCCESS;
+			wc.vendor_err = 0;
+			wc.send_buf = &mad_send_wr->send_buf;
+			ib_mad_complete_send_wr(mad_send_wr, &wc);
+			return;
+		}
+		if (mad_send_wr->refcount == 1)
+			ib_reset_mad_timeout(mad_send_wr,
+					     mad_send_wr->send_buf.timeout_ms);
+		spin_unlock_irqrestore(&agent->lock, flags);
+		ack_ds_ack(agent, mad_recv_wc);
+		return;
+	} else if (mad_send_wr->refcount == 1 &&
+		   mad_send_wr->seg_num < mad_send_wr->newwin &&
+		   mad_send_wr->seg_num < mad_send_wr->send_buf.seg_count) {
+		/* Send failure will just result in a timeout/retry */
+		ret = send_next_seg(mad_send_wr);
+		if (ret)
+			goto out;
+
+		mad_send_wr->refcount++;
+		list_move_tail(&mad_send_wr->agent_list,
+			      &mad_send_wr->mad_agent_priv->send_list);
+	}
+out:
+	spin_unlock_irqrestore(&agent->lock, flags);
+}
+
+/*
+ * Dispatch a received RMPP DATA segment.  Segment 1 carrying the FIRST
+ * flag starts a new receive; any other segment without the FIRST flag
+ * continues an existing one.  A non-zero RMPP status, or a FIRST flag that
+ * disagrees with the segment number, is answered with an ABORT
+ * (BAD_STATUS or BAD_SEG respectively) and the MAD is freed.
+ *
+ * Returns whatever start_rmpp()/continue_rmpp() return, or NULL when the
+ * segment was rejected.
+ */
+static struct ib_mad_recv_wc *
+process_rmpp_data(struct ib_mad_agent_private *agent,
+		  struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_rmpp_hdr *rmpp_hdr;
+	u8 rmpp_status = IB_MGMT_RMPP_STATUS_BAD_SEG;
+	int is_seg_one, has_first_flag;
+
+	rmpp_hdr = &((struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad)->rmpp_hdr;
+
+	if (rmpp_hdr->rmpp_status) {
+		rmpp_status = IB_MGMT_RMPP_STATUS_BAD_STATUS;
+	} else {
+		is_seg_one = (rmpp_hdr->seg_num == cpu_to_be32(1));
+		has_first_flag = (ib_get_rmpp_flags(rmpp_hdr) &
+				  IB_MGMT_RMPP_FLAG_FIRST) != 0;
+
+		if (is_seg_one && has_first_flag)
+			return start_rmpp(agent, mad_recv_wc);
+		if (!is_seg_one && !has_first_flag)
+			return continue_rmpp(agent, mad_recv_wc);
+		/* FIRST flag and segment number disagree: BAD_SEG */
+	}
+
+	nack_recv(agent, mad_recv_wc, rmpp_status);
+	ib_free_recv_mad(mad_recv_wc);
+	return NULL;
+}
+
+/*
+ * Process a received RMPP STOP.  With status RESX the matching send is
+ * aborted with that status.  Any other status aborts the send with
+ * BAD_STATUS and is answered with an ABORT carrying BAD_STATUS.
+ */
+static void process_rmpp_stop(struct ib_mad_agent_private *agent,
+			      struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_rmpp_hdr *hdr;
+
+	hdr = &((struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad)->rmpp_hdr;
+
+	if (hdr->rmpp_status == IB_MGMT_RMPP_STATUS_RESX) {
+		abort_send(agent, mad_recv_wc, hdr->rmpp_status);
+		return;
+	}
+
+	abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+	nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+}
+
+/*
+ * Process a received RMPP ABORT.  A status within
+ * [IB_MGMT_RMPP_STATUS_ABORT_MIN, IB_MGMT_RMPP_STATUS_ABORT_MAX] aborts
+ * the matching send with that status.  A status outside the range aborts
+ * the send with BAD_STATUS and is answered with an ABORT carrying
+ * BAD_STATUS.
+ */
+static void process_rmpp_abort(struct ib_mad_agent_private *agent,
+			       struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_rmpp_hdr *hdr;
+
+	hdr = &((struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad)->rmpp_hdr;
+
+	if (hdr->rmpp_status >= IB_MGMT_RMPP_STATUS_ABORT_MIN &&
+	    hdr->rmpp_status <= IB_MGMT_RMPP_STATUS_ABORT_MAX) {
+		abort_send(agent, mad_recv_wc, hdr->rmpp_status);
+		return;
+	}
+
+	abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+	nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+}
+
+/*
+ * Entry point for received MADs that may use RMPP.
+ *
+ * A MAD without the RMPP ACTIVE flag is returned unchanged.  DATA
+ * segments are passed to process_rmpp_data(), whose result is returned.
+ * Every other MAD is handled here and then freed, and NULL is returned:
+ * ACK, STOP and ABORT go to their handlers, while an unsupported RMPP
+ * version (status UNV) or an unknown RMPP type (status BADT) aborts the
+ * matching send and is answered with an ABORT.
+ */
+struct ib_mad_recv_wc *
+ib_process_rmpp_recv_wc(struct ib_mad_agent_private *agent,
+			struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_rmpp_hdr *hdr;
+
+	hdr = &((struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad)->rmpp_hdr;
+	if (!(hdr->rmpp_rtime_flags & IB_MGMT_RMPP_FLAG_ACTIVE))
+		return mad_recv_wc;
+
+	if (hdr->rmpp_version == IB_MGMT_RMPP_VERSION) {
+		switch (hdr->rmpp_type) {
+		case IB_MGMT_RMPP_TYPE_DATA:
+			return process_rmpp_data(agent, mad_recv_wc);
+		case IB_MGMT_RMPP_TYPE_ACK:
+			process_rmpp_ack(agent, mad_recv_wc);
+			break;
+		case IB_MGMT_RMPP_TYPE_STOP:
+			process_rmpp_stop(agent, mad_recv_wc);
+			break;
+		case IB_MGMT_RMPP_TYPE_ABORT:
+			process_rmpp_abort(agent, mad_recv_wc);
+			break;
+		default:
+			abort_send(agent, mad_recv_wc,
+				   IB_MGMT_RMPP_STATUS_BADT);
+			nack_recv(agent, mad_recv_wc,
+				  IB_MGMT_RMPP_STATUS_BADT);
+			break;
+		}
+	} else {
+		abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_UNV);
+		nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_UNV);
+	}
+
+	ib_free_recv_mad(mad_recv_wc);
+	return NULL;
+}
+
+/*
+ * Choose the initial send window for @mad_send_wr.  The default is 1.
+ * For a response MAD, the agent's receive list is searched under
+ * agent->lock for a non-response receive with the same tid, mgmt_class
+ * and class_version whose slid equals the dlid of the send's address
+ * handle; the repwin of the first such receive is used.
+ */
+static int init_newwin(struct ib_mad_send_wr_private *mad_send_wr)
+{
+	struct ib_mad_agent_private *agent = mad_send_wr->mad_agent_priv;
+	struct ib_mad_hdr *mad_hdr = mad_send_wr->send_buf.mad;
+	struct mad_rmpp_recv *req;
+	struct ib_ah_attr ah_attr;
+	unsigned long flags;
+	int window = 1;
+
+	if (!(mad_hdr->method & IB_MGMT_METHOD_RESP))
+		return window;
+
+	spin_lock_irqsave(&agent->lock, flags);
+	list_for_each_entry(req, &agent->rmpp_list, list) {
+		int candidate = req->tid == mad_hdr->tid &&
+				req->mgmt_class == mad_hdr->mgmt_class &&
+				req->class_version == mad_hdr->class_version &&
+				!(req->method & IB_MGMT_METHOD_RESP);
+
+		if (!candidate)
+			continue;
+
+		/* Skip the entry if the AH attributes cannot be read. */
+		if (ib_query_ah(mad_send_wr->send_buf.ah, &ah_attr))
+			continue;
+
+		if (req->slid == ah_attr.dlid) {
+			window = req->repwin;
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&agent->lock, flags);
+
+	return window;
+}
+
+/*
+ * Start sending a MAD through RMPP.
+ *
+ * Returns IB_RMPP_RESULT_UNHANDLED when RMPP is not active on the MAD,
+ * IB_RMPP_RESULT_INTERNAL (after setting seg_num to 1) for an RMPP MAD
+ * that is not of type DATA, IB_RMPP_RESULT_CONSUMED when the first
+ * segment was sent, or the error returned by send_next_seg().
+ */
+int ib_send_rmpp_mad(struct ib_mad_send_wr_private *mad_send_wr)
+{
+	struct ib_rmpp_mad *rmpp_mad = mad_send_wr->send_buf.mad;
+	int err;
+
+	if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
+	      IB_MGMT_RMPP_FLAG_ACTIVE))
+		return IB_RMPP_RESULT_UNHANDLED;
+
+	if (rmpp_mad->rmpp_hdr.rmpp_type != IB_MGMT_RMPP_TYPE_DATA) {
+		mad_send_wr->seg_num = 1;
+		return IB_RMPP_RESULT_INTERNAL;
+	}
+
+	mad_send_wr->newwin = init_newwin(mad_send_wr);
+
+	/* We need to wait for the final ACK even if there isn't a response */
+	if (mad_send_wr->timeout == 0)
+		mad_send_wr->refcount++;
+
+	err = send_next_seg(mad_send_wr);
+	return err ? err : IB_RMPP_RESULT_CONSUMED;
+}
+
+/*
+ * Process the send completion of an RMPP segment.
+ *
+ * Returns IB_RMPP_RESULT_UNHANDLED when RMPP is not active on the MAD,
+ * IB_RMPP_RESULT_INTERNAL for ACK/STOP/ABORT MADs and
+ * IB_RMPP_RESULT_CONSUMED when the next segment was sent.
+ * IB_RMPP_RESULT_PROCESSED is returned in every other case; if sending
+ * the next segment failed, mad_send_wc->status is set to
+ * IB_WC_GENERAL_ERR first.
+ */
+int ib_process_rmpp_send_wc(struct ib_mad_send_wr_private *mad_send_wr,
+			    struct ib_mad_send_wc *mad_send_wc)
+{
+	struct ib_rmpp_mad *rmpp_mad = mad_send_wr->send_buf.mad;
+
+	/* RMPP not active */
+	if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
+	      IB_MGMT_RMPP_FLAG_ACTIVE))
+		return IB_RMPP_RESULT_UNHANDLED;
+
+	/* ACK, STOP, or ABORT */
+	if (rmpp_mad->rmpp_hdr.rmpp_type != IB_MGMT_RMPP_TYPE_DATA)
+		return IB_RMPP_RESULT_INTERNAL;
+
+	/* Canceled or send error, or response received */
+	if (mad_send_wc->status != IB_WC_SUCCESS ||
+	    mad_send_wr->status != IB_WC_SUCCESS ||
+	    !mad_send_wr->timeout)
+		return IB_RMPP_RESULT_PROCESSED;
+
+	/* Send done */
+	if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) {
+		mad_send_wr->timeout =
+			msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms);
+		return IB_RMPP_RESULT_PROCESSED;
+	}
+
+	/* Wait for ACK */
+	if (mad_send_wr->seg_num == mad_send_wr->newwin ||
+	    mad_send_wr->seg_num == mad_send_wr->send_buf.seg_count)
+		return IB_RMPP_RESULT_PROCESSED;
+
+	if (!send_next_seg(mad_send_wr))
+		return IB_RMPP_RESULT_CONSUMED;
+
+	mad_send_wc->status = IB_WC_GENERAL_ERR;
+	return IB_RMPP_RESULT_PROCESSED;
+}
+
+/*
+ * Retry an RMPP send from the last acknowledged segment.
+ *
+ * Returns IB_RMPP_RESULT_UNHANDLED when RMPP is not active on the MAD,
+ * IB_RMPP_RESULT_PROCESSED when every segment is already acknowledged or
+ * the resend failed, and IB_RMPP_RESULT_CONSUMED when the segment after
+ * last_ack was sent.
+ */
+int ib_retry_rmpp(struct ib_mad_send_wr_private *mad_send_wr)
+{
+	struct ib_rmpp_mad *rmpp_mad = mad_send_wr->send_buf.mad;
+
+	/* RMPP not active */
+	if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
+	      IB_MGMT_RMPP_FLAG_ACTIVE))
+		return IB_RMPP_RESULT_UNHANDLED;
+
+	if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count)
+		return IB_RMPP_RESULT_PROCESSED;
+
+	/* Rewind to the last acknowledged segment. */
+	mad_send_wr->seg_num = mad_send_wr->last_ack;
+	mad_send_wr->cur_seg = mad_send_wr->last_ack_seg;
+
+	if (send_next_seg(mad_send_wr))
+		return IB_RMPP_RESULT_PROCESSED;
+
+	return IB_RMPP_RESULT_CONSUMED;
+}
diff --git a/sys/ofed/drivers/infiniband/core/mad_rmpp.h b/sys/ofed/drivers/infiniband/core/mad_rmpp.h
new file mode 100644
index 0000000..3d336bf
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/mad_rmpp.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2005 Intel Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __MAD_RMPP_H__
+#define __MAD_RMPP_H__
+
+/* Result codes returned by the RMPP send-path entry points. */
+enum {
+	IB_RMPP_RESULT_PROCESSED,	/* examined by RMPP; no segment was sent */
+	IB_RMPP_RESULT_CONSUMED,	/* RMPP sent the next segment itself */
+	IB_RMPP_RESULT_INTERNAL,	/* RMPP MAD that is not DATA (ACK, STOP, ABORT) */
+	IB_RMPP_RESULT_UNHANDLED	/* RMPP is not active on this MAD */
+};
+
+/* Start an RMPP send; returns an IB_RMPP_RESULT_* code or a send error. */
+int ib_send_rmpp_mad(struct ib_mad_send_wr_private *mad_send_wr);
+
+/*
+ * Process a received MAD.  Returns the work completion to deliver (the
+ * input itself when RMPP is not active, or a reassembled message), or
+ * NULL when RMPP kept or freed the MAD.
+ */
+struct ib_mad_recv_wc *
+ib_process_rmpp_recv_wc(struct ib_mad_agent_private *agent,
+			struct ib_mad_recv_wc *mad_recv_wc);
+
+/* Process a send completion; returns an IB_RMPP_RESULT_* code. */
+int ib_process_rmpp_send_wc(struct ib_mad_send_wr_private *mad_send_wr,
+			    struct ib_mad_send_wc *mad_send_wc);
+
+/* Send-completion handler for MADs generated by the RMPP layer itself. */
+void ib_rmpp_send_handler(struct ib_mad_send_wc *mad_send_wc);
+
+/* NOTE(review): presumably cancels the agent's pending RMPP receives - confirm. */
+void ib_cancel_rmpp_recvs(struct ib_mad_agent_private *agent);
+
+/* Resend from the last acknowledged segment; returns an IB_RMPP_RESULT_* code. */
+int ib_retry_rmpp(struct ib_mad_send_wr_private *mad_send_wr);
+
+#endif	/* __MAD_RMPP_H__ */
diff --git a/sys/ofed/drivers/infiniband/core/multicast.c b/sys/ofed/drivers/infiniband/core/multicast.c
new file mode 100644
index 0000000..f8d7ef8
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/multicast.c
@@ -0,0 +1,868 @@
+/*
+ * Copyright (c) 2006 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/bitops.h>
+#include <linux/random.h>
+
+#include <rdma/ib_cache.h>
+#include "sa.h"
+
+/* Callbacks referenced by mcast_client below. */
+static void mcast_add_one(struct ib_device *device);
+static void mcast_remove_one(struct ib_device *device);
+
+/* Client descriptor for this module: its name and add/remove callbacks. */
+static struct ib_client mcast_client = {
+	.name   = "ib_multicast",
+	.add    = mcast_add_one,
+	.remove = mcast_remove_one
+};
+
+static struct ib_sa_client	sa_client;	/* used for the SA queries in send_join/send_leave */
+static struct workqueue_struct	*mcast_wq;	/* runs group->work (see queue_join) */
+static union ib_gid mgid0;	/* static storage, so initially all zero */
+
+struct mcast_device;
+
+/* Per-port multicast state. */
+struct mcast_port {
+	struct mcast_device	*dev;		/* device this port belongs to */
+	spinlock_t		lock;		/* held while erasing groups from table */
+	struct rb_root		table;		/* groups on this port, ordered by MGID */
+	atomic_t		refcount;	/* see deref_port() */
+	struct completion	comp;		/* completed when refcount drops to zero */
+	u8			port_num;	/* port number passed to SA queries */
+};
+
+/* Per-device multicast state, followed by one mcast_port entry per port. */
+struct mcast_device {
+	struct ib_device	*device;	/* device passed to SA queries */
+	struct ib_event_handler	event_handler;
+	int			start_port;	/* NOTE(review): presumably first port number - confirm */
+	int			end_port;	/* NOTE(review): presumably last port number - confirm */
+	struct mcast_port	port[0];	/* trailing array of ports */
+};
+
+/* State of an individual member (struct mcast_member.state). */
+enum mcast_state {
+	MCAST_JOINING,
+	MCAST_MEMBER,	/* set by join_group() */
+	MCAST_ERROR,
+};
+
+/* State of a group (struct mcast_group.state). */
+enum mcast_group_state {
+	MCAST_IDLE,	/* queue_join() queues the group work from this state */
+	MCAST_BUSY,	/* set by queue_join() when it queues the group work */
+	MCAST_GROUP_ERROR,
+	MCAST_PKEY_EVENT
+};
+
+/* NOTE(review): presumably the pkey_index value meaning "no valid index" - confirm. */
+enum {
+	MCAST_INVALID_PKEY_INDEX = 0xFFFF
+};
+
+struct mcast_member;
+
+/* A multicast group known on one port, shared by all of its local members. */
+struct mcast_group {
+	struct ib_sa_mcmember_rec rec;		/* group record; rec.mgid is the tree key */
+	struct rb_node		node;		/* link in port->table */
+	struct mcast_port	*port;		/* owning port */
+	spinlock_t		lock;		/* held while changing pending_list and state */
+	struct work_struct	work;		/* queued on mcast_wq by queue_join() */
+	struct list_head	pending_list;	/* members added by queue_join() */
+	struct list_head	active_list;	/* members moved here by join_group() */
+	struct mcast_member	*last_join;	/* member of the most recent send_join() */
+	int			members[3];	/* member count per join-state bit */
+	atomic_t		refcount;	/* dropped by release_group() */
+	enum mcast_group_state	state;
+	struct ib_sa_query	*query;		/* query handle stored by the SA query call */
+	int			query_id;	/* id returned by the SA query call */
+	u16			pkey_index;
+	u8			leave_state;	/* join states passed to the last send_leave() */
+	int			retries;
+};
+
+/* One local member of a multicast group. */
+struct mcast_member {
+	struct ib_sa_multicast	multicast;	/* rec and comp_mask are used by send_join() */
+	struct ib_sa_client	*client;
+	struct mcast_group	*group;		/* group this member belongs to */
+	struct list_head	list;		/* link on the group's pending or active list */
+	enum mcast_state	state;
+	atomic_t		refcount;	/* see deref_member() */
+	struct completion	comp;		/* completed when refcount drops to zero */
+};
+
+/*
+ * Callbacks passed to ib_sa_mcmember_rec_query() by send_join() and
+ * send_leave(); in both cases the context argument is the mcast_group.
+ */
+static void join_handler(int status, struct ib_sa_mcmember_rec *rec,
+			 void *context);
+static void leave_handler(int status, struct ib_sa_mcmember_rec *rec,
+			  void *context);
+
+/*
+ * Look up the group with the given MGID in the port's tree.  Returns the
+ * group, or NULL when no group with that MGID is present.
+ */
+static struct mcast_group *mcast_find(struct mcast_port *port,
+				      union ib_gid *mgid)
+{
+	struct rb_node *cur = port->table.rb_node;
+
+	while (cur) {
+		struct mcast_group *candidate;
+		int cmp;
+
+		candidate = rb_entry(cur, struct mcast_group, node);
+		cmp = memcmp(mgid->raw, candidate->rec.mgid.raw, sizeof *mgid);
+
+		if (cmp < 0)
+			cur = cur->rb_left;
+		else if (cmp > 0)
+			cur = cur->rb_right;
+		else
+			return candidate;
+	}
+	return NULL;
+}
+
+/*
+ * Insert @group into the port's tree, ordered by MGID.  If a group with
+ * the same MGID already exists and @allow_duplicates is zero, nothing is
+ * inserted and the existing group is returned.  With @allow_duplicates
+ * set, an equal MGID is placed in the left subtree.  Returns NULL when
+ * the group was inserted.
+ */
+static struct mcast_group *mcast_insert(struct mcast_port *port,
+					struct mcast_group *group,
+					int allow_duplicates)
+{
+	struct rb_node **link = &port->table.rb_node;
+	struct rb_node *parent = NULL;
+
+	while (*link) {
+		struct mcast_group *cur_group;
+		int cmp;
+
+		parent = *link;
+		cur_group = rb_entry(parent, struct mcast_group, node);
+		cmp = memcmp(group->rec.mgid.raw, cur_group->rec.mgid.raw,
+			     sizeof group->rec.mgid);
+
+		if (cmp == 0 && !allow_duplicates)
+			return cur_group;
+
+		/* Equal keys (duplicates allowed) descend to the left. */
+		if (cmp > 0)
+			link = &parent->rb_right;
+		else
+			link = &parent->rb_left;
+	}
+
+	rb_link_node(&group->node, parent, link);
+	rb_insert_color(&group->node, &port->table);
+	return NULL;
+}
+
+/* Drop a reference on @port; the last reference completes port->comp. */
+static void deref_port(struct mcast_port *port)
+{
+	if (!atomic_dec_and_test(&port->refcount))
+		return;
+
+	complete(&port->comp);
+}
+
+/*
+ * Drop a reference on @group.  When the last reference goes away the
+ * group is erased from the port's tree (under port->lock) and freed, and
+ * a reference on the port is dropped.
+ */
+static void release_group(struct mcast_group *group)
+{
+	struct mcast_port *port = group->port;
+	unsigned long flags;
+	int last_ref;
+
+	spin_lock_irqsave(&port->lock, flags);
+	last_ref = atomic_dec_and_test(&group->refcount);
+	if (last_ref)
+		rb_erase(&group->node, &port->table);
+	spin_unlock_irqrestore(&port->lock, flags);
+
+	if (!last_ref)
+		return;
+
+	kfree(group);
+	deref_port(port);
+}
+
+/* Drop a reference on @member; the last reference completes member->comp. */
+static void deref_member(struct mcast_member *member)
+{
+	if (!atomic_dec_and_test(&member->refcount))
+		return;
+
+	complete(&member->comp);
+}
+
+/*
+ * Add @member to the pending list of its group.  If the group is idle it
+ * is marked busy, a group reference is taken and the group's work item is
+ * queued on mcast_wq.  All of this is done under group->lock.
+ */
+static void queue_join(struct mcast_member *member)
+{
+	struct mcast_group *grp = member->group;
+	unsigned long flags;
+
+	spin_lock_irqsave(&grp->lock, flags);
+	list_add_tail(&member->list, &grp->pending_list);
+	if (grp->state != MCAST_IDLE)
+		goto unlock;
+
+	grp->state = MCAST_BUSY;
+	atomic_inc(&grp->refcount);
+	queue_work(mcast_wq, &grp->work);
+unlock:
+	spin_unlock_irqrestore(&grp->lock, flags);
+}
+
+/*
+ * A multicast group has three types of members: full member, non member,
+ * and send only member.  The group keeps one counter per type.  Add @inc
+ * to the counter of every type whose bit is set in the low three bits of
+ * @join_state.
+ */
+static void adjust_membership(struct mcast_group *group, u8 join_state, int inc)
+{
+	int bit;
+
+	for (bit = 0; bit < 3; bit++) {
+		if ((join_state >> bit) & 0x1)
+			group->members[bit] += inc;
+	}
+}
+
+/*
+ * Work out which join states the group should leave: those it still
+ * holds with the SA (bits set in rec.join_state) but for which no local
+ * member remains (member counter is zero).  Returns them as a bit mask.
+ */
+static u8 get_leave_state(struct mcast_group *group)
+{
+	u8 unused_states = 0;
+	int bit;
+
+	for (bit = 0; bit < 3; bit++) {
+		if (group->members[bit])
+			continue;
+		unused_states |= (0x1 << bit);
+	}
+
+	return unused_states & group->rec.join_state;
+}
+
+/*
+ * Check the fields of @src against @dst for every component selected in
+ * @comp_mask.  Plain fields must be equal; MTU, rate and packet life time
+ * are checked through ib_sa_check_selector() using the selector stored in
+ * @dst.
+ *
+ * Returns 0 when every selected field passes, -EINVAL at the first field
+ * that does not.
+ */
+static int cmp_rec(struct ib_sa_mcmember_rec *src,
+		   struct ib_sa_mcmember_rec *dst, ib_sa_comp_mask comp_mask)
+{
+	/* MGID must already match */
+
+	if (comp_mask & IB_SA_MCMEMBER_REC_PORT_GID &&
+	    memcmp(&src->port_gid, &dst->port_gid, sizeof src->port_gid))
+		return -EINVAL;
+	if (comp_mask & IB_SA_MCMEMBER_REC_QKEY && src->qkey != dst->qkey)
+		return -EINVAL;
+	if (comp_mask & IB_SA_MCMEMBER_REC_MLID && src->mlid != dst->mlid)
+		return -EINVAL;
+	if (ib_sa_check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR,
+				 IB_SA_MCMEMBER_REC_MTU, dst->mtu_selector,
+				 src->mtu, dst->mtu))
+		return -EINVAL;
+	if (comp_mask & IB_SA_MCMEMBER_REC_TRAFFIC_CLASS &&
+	    src->traffic_class != dst->traffic_class)
+		return -EINVAL;
+	if (comp_mask & IB_SA_MCMEMBER_REC_PKEY && src->pkey != dst->pkey)
+		return -EINVAL;
+	if (ib_sa_check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR,
+				 IB_SA_MCMEMBER_REC_RATE, dst->rate_selector,
+				 src->rate, dst->rate))
+		return -EINVAL;
+	if (ib_sa_check_selector(comp_mask,
+				 IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR,
+				 IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME,
+				 dst->packet_life_time_selector,
+				 src->packet_life_time, dst->packet_life_time))
+		return -EINVAL;
+	if (comp_mask & IB_SA_MCMEMBER_REC_SL && src->sl != dst->sl)
+		return -EINVAL;
+	if (comp_mask & IB_SA_MCMEMBER_REC_FLOW_LABEL &&
+	    src->flow_label != dst->flow_label)
+		return -EINVAL;
+	if (comp_mask & IB_SA_MCMEMBER_REC_HOP_LIMIT &&
+	    src->hop_limit != dst->hop_limit)
+		return -EINVAL;
+	if (comp_mask & IB_SA_MCMEMBER_REC_SCOPE && src->scope != dst->scope)
+		return -EINVAL;
+
+	/* join_state checked separately, proxy_join ignored */
+
+	return 0;
+}
+
+/*
+ * Issue an SA MCMemberRecord SET query for member's join request.
+ *
+ * The member is remembered in group->last_join so that a failure reported
+ * to join_handler() can be matched against the request that caused it.
+ *
+ * Returns 0 when the query was issued, otherwise the negative error code
+ * from ib_sa_mcmember_rec_query().
+ */
+static int send_join(struct mcast_group *group, struct mcast_member *member)
+{
+	struct mcast_port *port = group->port;
+	int ret;
+
+	group->last_join = member;
+	ret = ib_sa_mcmember_rec_query(&sa_client, port->dev->device,
+				       port->port_num, IB_MGMT_METHOD_SET,
+				       &member->multicast.rec,
+				       member->multicast.comp_mask,
+				       3000, GFP_KERNEL, join_handler, group,
+				       &group->query);
+
+	/*
+	 * The group must not be written once the query has been issued:
+	 * join_handler() may run before we get here, and the work it starts
+	 * can end with the group being released.  The query id is therefore
+	 * no longer stored in group->query_id.
+	 * NOTE(review): query_id is not read anywhere in the visible part of
+	 * this file - confirm there is no reader before dropping the field.
+	 */
+	return (ret < 0) ? ret : 0;
+}
+
+/*
+ * Issue an SA MCMemberRecord DELETE query that leaves the join states in
+ * leave_state for this group.
+ *
+ * leave_state is saved in group->leave_state so that leave_handler() can
+ * repeat the request.
+ *
+ * Returns 0 when the query was issued, otherwise the negative error code
+ * from ib_sa_mcmember_rec_query().
+ */
+static int send_leave(struct mcast_group *group, u8 leave_state)
+{
+	struct mcast_port *port = group->port;
+	struct ib_sa_mcmember_rec rec;
+	int ret;
+
+	/* Work on a copy so the group's own record keeps its join state. */
+	rec = group->rec;
+	rec.join_state = leave_state;
+	group->leave_state = leave_state;
+
+	ret = ib_sa_mcmember_rec_query(&sa_client, port->dev->device,
+				       port->port_num, IB_SA_METHOD_DELETE, &rec,
+				       IB_SA_MCMEMBER_REC_MGID     |
+				       IB_SA_MCMEMBER_REC_PORT_GID |
+				       IB_SA_MCMEMBER_REC_JOIN_STATE,
+				       3000, GFP_KERNEL, leave_handler,
+				       group, &group->query);
+
+	/*
+	 * The group must not be written once the query has been issued:
+	 * leave_handler() may already have run mcast_work_handler(), which
+	 * can drop the last reference and free the group.  The query id is
+	 * therefore no longer stored in group->query_id.
+	 * NOTE(review): query_id is not read anywhere in the visible part of
+	 * this file - confirm there is no reader before dropping the field.
+	 */
+	return (ret < 0) ? ret : 0;
+}
+
+/*
+ * Make member an active member of group for the given join state: bump the
+ * per-state counters, merge the state into the group's record, give the
+ * member a copy of that record carrying only its own join state, and move
+ * the member to the active list.
+ */
+static void join_group(struct mcast_group *group, struct mcast_member *member,
+		       u8 join_state)
+{
+	struct ib_sa_mcmember_rec *member_rec = &member->multicast.rec;
+
+	adjust_membership(group, join_state, 1);
+	group->rec.join_state |= join_state;
+
+	*member_rec = group->rec;
+	member_rec->join_state = join_state;
+
+	member->state = MCAST_MEMBER;
+	list_move(&member->list, &group->active_list);
+}
+
+/*
+ * Unlink member from whatever group list it is on and report status through
+ * its callback.  Returns the callback's return value.
+ */
+static int fail_join(struct mcast_group *group, struct mcast_member *member,
+		     int status)
+{
+	struct ib_sa_multicast *multicast = &member->multicast;
+
+	spin_lock_irq(&group->lock);
+	list_del_init(&member->list);
+	spin_unlock_irq(&group->lock);
+
+	return multicast->callback(status, multicast);
+}
+
+/*
+ * React to an event that was flagged in group->state.
+ *
+ * For MCAST_PKEY_EVENT the group's P_Key is looked up again; if the lookup
+ * succeeds and yields the pkey_index already cached in the group, nothing is
+ * torn down.  In every other case each member on the active list is
+ * unlinked, marked MCAST_ERROR and called back with -ENETRESET, and the
+ * group's recorded join state is cleared.  The group is always put back in
+ * MCAST_BUSY before the lock is dropped.
+ */
+static void process_group_error(struct mcast_group *group)
+{
+	struct mcast_member *member;
+	int ret = 0;
+	u16 pkey_index;
+
+	/* The P_Key lookup is done before group->lock is taken. */
+	if (group->state == MCAST_PKEY_EVENT)
+		ret = ib_find_pkey(group->port->dev->device,
+				   group->port->port_num,
+				   be16_to_cpu(group->rec.pkey), &pkey_index);
+
+	spin_lock_irq(&group->lock);
+	/* P_Key still sits at the cached index: leave the members alone. */
+	if (group->state == MCAST_PKEY_EVENT && !ret &&
+	    group->pkey_index == pkey_index)
+		goto out;
+
+	while (!list_empty(&group->active_list)) {
+		member = list_entry(group->active_list.next,
+				    struct mcast_member, list);
+		/* Extra reference for the callback made without the lock. */
+		atomic_inc(&member->refcount);
+		list_del_init(&member->list);
+		adjust_membership(group, member->multicast.rec.join_state, -1);
+		member->state = MCAST_ERROR;
+		spin_unlock_irq(&group->lock);
+
+		ret = member->multicast.callback(-ENETRESET,
+						 &member->multicast);
+		deref_member(member);
+		/* A non-zero return means the member is freed on the spot. */
+		if (ret)
+			ib_sa_free_multicast(&member->multicast);
+		spin_lock_irq(&group->lock);
+	}
+
+	/* Every active member was dropped; forget the recorded join state. */
+	group->rec.join_state = 0;
+out:
+	group->state = MCAST_BUSY;
+	spin_unlock_irq(&group->lock);
+}
+
+/*
+ * Work routine driving a group's join/leave processing.
+ *
+ * Loops while join requests are pending or an event has moved the group out
+ * of MCAST_BUSY.  Events are handed to process_group_error().  A pending
+ * request whose join state the group already holds is completed locally
+ * (after cmp_rec() has validated it); any other request triggers
+ * send_join() and, if the query was issued, the routine returns.  Once
+ * nothing is pending, join states without members are left through
+ * send_leave(); otherwise the group is marked MCAST_IDLE and the group
+ * reference is released.
+ */
+static void mcast_work_handler(struct work_struct *work)
+{
+	struct mcast_group *group;
+	struct mcast_member *member;
+	struct ib_sa_multicast *multicast;
+	int status, ret;
+	u8 join_state;
+
+	group = container_of(work, typeof(*group), work);
+retest:
+	spin_lock_irq(&group->lock);
+	while (!list_empty(&group->pending_list) ||
+	       (group->state != MCAST_BUSY)) {
+
+		/* An event changed the state: handle it, then start over. */
+		if (group->state != MCAST_BUSY) {
+			spin_unlock_irq(&group->lock);
+			process_group_error(group);
+			goto retest;
+		}
+
+		member = list_entry(group->pending_list.next,
+				    struct mcast_member, list);
+		multicast = &member->multicast;
+		join_state = multicast->rec.join_state;
+		/* Extra reference, dropped by deref_member() below. */
+		atomic_inc(&member->refcount);
+
+		/* Every requested join-state bit is already held by the group. */
+		if (join_state == (group->rec.join_state & join_state)) {
+			status = cmp_rec(&group->rec, &multicast->rec,
+					 multicast->comp_mask);
+			if (!status)
+				join_group(group, member, join_state);
+			else
+				list_del_init(&member->list);
+			spin_unlock_irq(&group->lock);
+			ret = multicast->callback(status, multicast);
+		} else {
+			spin_unlock_irq(&group->lock);
+			status = send_join(group, member);
+			if (!status) {
+				/* Query issued; join_handler() re-enters this routine. */
+				deref_member(member);
+				return;
+			}
+			ret = fail_join(group, member, status);
+		}
+
+		deref_member(member);
+		/* A non-zero callback return means the member is freed here. */
+		if (ret)
+			ib_sa_free_multicast(&member->multicast);
+		spin_lock_irq(&group->lock);
+	}
+
+	join_state = get_leave_state(group);
+	if (join_state) {
+		group->rec.join_state &= ~join_state;
+		spin_unlock_irq(&group->lock);
+		/* If the leave could not be issued, re-evaluate the group. */
+		if (send_leave(group, join_state))
+			goto retest;
+	} else {
+		group->state = MCAST_IDLE;
+		spin_unlock_irq(&group->lock);
+		release_group(group);
+	}
+}
+
+/*
+ * Fail a join request if it is still active - at the head of the pending queue.
+ *
+ * The member at the head of pending_list is failed only if it is the one
+ * recorded in group->last_join by send_join().  It is then unlinked and its
+ * callback is invoked with status; a non-zero callback return causes the
+ * member to be freed here.
+ */
+static void process_join_error(struct mcast_group *group, int status)
+{
+	struct mcast_member *member;
+	int ret;
+
+	spin_lock_irq(&group->lock);
+	member = list_entry(group->pending_list.next,
+			    struct mcast_member, list);
+	if (group->last_join == member) {
+		/* Extra reference for the callback made without the lock. */
+		atomic_inc(&member->refcount);
+		list_del_init(&member->list);
+		spin_unlock_irq(&group->lock);
+		ret = member->multicast.callback(status, &member->multicast);
+		deref_member(member);
+		if (ret)
+			ib_sa_free_multicast(&member->multicast);
+	} else
+		spin_unlock_irq(&group->lock);
+}
+
+/*
+ * Completion callback for the query issued by send_join().
+ *
+ * On failure the request that caused the query is failed through
+ * process_join_error().  On success the record returned by the SA becomes
+ * the group's record and the P_Key index is cached if the group does not
+ * have one yet.  In both cases the group's work routine is then run
+ * directly.
+ *
+ * The port's table is keyed by the group's MGID, so whenever the record
+ * from the SA carries a different MGID than the one the group was inserted
+ * under (for instance an SA-assigned MGID replacing a requested MGID of
+ * zero) the group is removed from the table and inserted again under its
+ * new key.
+ */
+static void join_handler(int status, struct ib_sa_mcmember_rec *rec,
+			 void *context)
+{
+	struct mcast_group *group = context;
+	u16 pkey_index = MCAST_INVALID_PKEY_INDEX;
+	int mgid_changed;
+	int is_mgid0;
+
+	if (status)
+		process_join_error(group, status);
+	else {
+		/* pkey_index keeps its invalid default if the lookup fails. */
+		ib_find_pkey(group->port->dev->device, group->port->port_num,
+			     be16_to_cpu(rec->pkey), &pkey_index);
+
+		spin_lock_irq(&group->port->lock);
+		/* Compare against the old key before it is overwritten. */
+		mgid_changed = memcmp(&rec->mgid, &group->rec.mgid,
+				      sizeof group->rec.mgid);
+		group->rec = *rec;
+		if (group->state == MCAST_BUSY &&
+		    group->pkey_index == MCAST_INVALID_PKEY_INDEX)
+			group->pkey_index = pkey_index;
+		if (mgid_changed) {
+			rb_erase(&group->node, &group->port->table);
+			is_mgid0 = !memcmp(&mgid0, &group->rec.mgid,
+					   sizeof mgid0);
+			/*
+			 * NOTE(review): the return value is ignored, as it
+			 * was before; confirm how mcast_insert() behaves if
+			 * another group already uses the new MGID.
+			 */
+			mcast_insert(group->port, group, is_mgid0);
+		}
+		spin_unlock_irq(&group->port->lock);
+	}
+	mcast_work_handler(&group->work);
+}
+
+/*
+ * Completion callback for the query issued by send_leave().  A failed leave
+ * is sent again while group->retries is positive; each successfully
+ * re-issued leave consumes one retry.  In all other cases the group's work
+ * routine is run.
+ */
+static void leave_handler(int status, struct ib_sa_mcmember_rec *rec,
+			  void *context)
+{
+	struct mcast_group *group = context;
+
+	if (status && (group->retries > 0)) {
+		if (!send_leave(group, group->leave_state)) {
+			group->retries--;
+			return;
+		}
+	}
+
+	mcast_work_handler(&group->work);
+}
+
+/*
+ * Return the group for mgid on port with its reference count raised,
+ * creating the group if necessary.
+ *
+ * No lookup is done for the all-zero MGID (mgid0): such a request always
+ * allocates a new group, and is_mgid0 is passed on to mcast_insert().  For
+ * any other MGID an existing group is reused.  If another group with the
+ * same key was inserted while the lock was dropped for the allocation, the
+ * new allocation is freed and the existing group is used.  A newly inserted
+ * group takes a reference on the port.
+ *
+ * Returns NULL if the allocation fails.
+ */
+static struct mcast_group *acquire_group(struct mcast_port *port,
+					 union ib_gid *mgid, gfp_t gfp_mask)
+{
+	struct mcast_group *group, *cur_group;
+	unsigned long flags;
+	int is_mgid0;
+
+	is_mgid0 = !memcmp(&mgid0, mgid, sizeof mgid0);
+	if (!is_mgid0) {
+		spin_lock_irqsave(&port->lock, flags);
+		group = mcast_find(port, mgid);
+		if (group)
+			goto found;
+		spin_unlock_irqrestore(&port->lock, flags);
+	}
+
+	/* Allocate without holding port->lock. */
+	group = kzalloc(sizeof *group, gfp_mask);
+	if (!group)
+		return NULL;
+
+	group->retries = 3;
+	group->port = port;
+	group->rec.mgid = *mgid;
+	group->pkey_index = MCAST_INVALID_PKEY_INDEX;
+	INIT_LIST_HEAD(&group->pending_list);
+	INIT_LIST_HEAD(&group->active_list);
+	INIT_WORK(&group->work, mcast_work_handler);
+	spin_lock_init(&group->lock);
+
+	spin_lock_irqsave(&port->lock, flags);
+	cur_group = mcast_insert(port, group, is_mgid0);
+	if (cur_group) {
+		/* Someone else inserted the group first; use theirs. */
+		kfree(group);
+		group = cur_group;
+	} else
+		atomic_inc(&port->refcount);
+found:
+	atomic_inc(&group->refcount);
+	spin_unlock_irqrestore(&port->lock, flags);
+	return group;
+}
+
+/*
+ * We serialize all join requests to a single group to make our lives much
+ * easier.  Otherwise, two users could try to join the same group
+ * simultaneously, with different configurations, one could leave while the
+ * join is in progress, etc., which makes locking around error recovery
+ * difficult.
+ *
+ * ib_sa_join_multicast - queue a request to join a multicast group.
+ * @client: SA client making the request; a reference is taken on it.
+ * @device: device to join through.
+ * @port_num: port of @device to join through.
+ * @rec: requested member record; copied into the new member.
+ * @comp_mask: component mask stored with the request.
+ * @gfp_mask: allocation flags for the member and, if needed, the group.
+ * @callback: invoked with the status of the request; stored in the member.
+ * @context: opaque value stored in the returned structure.
+ *
+ * Returns the multicast structure embedded in the new member, or an
+ * ERR_PTR(): -ENODEV if @device has no client data for this module, -ENOMEM
+ * if the member or the group cannot be allocated.
+ */
+struct ib_sa_multicast *
+ib_sa_join_multicast(struct ib_sa_client *client,
+		     struct ib_device *device, u8 port_num,
+		     struct ib_sa_mcmember_rec *rec,
+		     ib_sa_comp_mask comp_mask, gfp_t gfp_mask,
+		     int (*callback)(int status,
+				     struct ib_sa_multicast *multicast),
+		     void *context)
+{
+	struct mcast_device *dev;
+	struct mcast_member *member;
+	struct ib_sa_multicast *multicast;
+	int ret;
+
+	dev = ib_get_client_data(device, &mcast_client);
+	if (!dev)
+		return ERR_PTR(-ENODEV);
+
+	member = kmalloc(sizeof *member, gfp_mask);
+	if (!member)
+		return ERR_PTR(-ENOMEM);
+
+	ib_sa_client_get(client);
+	member->client = client;
+	member->multicast.rec = *rec;
+	member->multicast.comp_mask = comp_mask;
+	member->multicast.callback = callback;
+	member->multicast.context = context;
+	init_completion(&member->comp);
+	/* Initial reference; dropped in ib_sa_free_multicast(). */
+	atomic_set(&member->refcount, 1);
+	member->state = MCAST_JOINING;
+
+	/* Ports are stored starting at index 0 for dev->start_port. */
+	member->group = acquire_group(&dev->port[port_num - dev->start_port],
+				      &rec->mgid, gfp_mask);
+	if (!member->group) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	/*
+	 * The user will get the multicast structure in their callback.  They
+	 * could then free the multicast structure before we can return from
+	 * this routine.  So we save the pointer to return before queuing
+	 * any callback.
+	 */
+	multicast = &member->multicast;
+	queue_join(member);
+	return multicast;
+
+err:
+	ib_sa_client_put(client);
+	kfree(member);
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(ib_sa_join_multicast);
+
+/*
+ * ib_sa_free_multicast - release a structure returned by
+ * ib_sa_join_multicast().
+ *
+ * The member is unlinked from its group; if it had become a group member
+ * its join state is subtracted from the group's counters.  An idle group is
+ * marked busy and its work is queued; otherwise the member's group reference
+ * is dropped right away.  The routine then drops the member's initial
+ * reference, waits until all other references are gone, releases the SA
+ * client and frees the member.
+ */
+void ib_sa_free_multicast(struct ib_sa_multicast *multicast)
+{
+	struct mcast_member *member;
+	struct mcast_group *group;
+
+	member = container_of(multicast, struct mcast_member, multicast);
+	group = member->group;
+
+	spin_lock_irq(&group->lock);
+	if (member->state == MCAST_MEMBER)
+		adjust_membership(group, multicast->rec.join_state, -1);
+
+	list_del_init(&member->list);
+
+	if (group->state == MCAST_IDLE) {
+		group->state = MCAST_BUSY;
+		spin_unlock_irq(&group->lock);
+		/* Continue to hold reference on group until callback */
+		queue_work(mcast_wq, &group->work);
+	} else {
+		spin_unlock_irq(&group->lock);
+		release_group(group);
+	}
+
+	/* Drop the initial reference, then wait for any remaining holders. */
+	deref_member(member);
+	wait_for_completion(&member->comp);
+	ib_sa_client_put(member->client);
+	kfree(member);
+}
+EXPORT_SYMBOL(ib_sa_free_multicast);
+
+/*
+ * ib_sa_get_mcmember_rec - copy the locally stored member record of a group.
+ * @device: device the group was joined on.
+ * @port_num: port of @device.
+ * @mgid: MGID identifying the group.
+ * @rec: filled with a copy of the group's record on success.
+ *
+ * Returns 0 on success, -ENODEV if @device has no client data for this
+ * module, or -EADDRNOTAVAIL if no group with @mgid exists on the port.
+ */
+int ib_sa_get_mcmember_rec(struct ib_device *device, u8 port_num,
+			   union ib_gid *mgid, struct ib_sa_mcmember_rec *rec)
+{
+	struct mcast_device *dev;
+	struct mcast_port *port;
+	struct mcast_group *found;
+	unsigned long flags;
+	int ret = -EADDRNOTAVAIL;
+
+	dev = ib_get_client_data(device, &mcast_client);
+	if (!dev)
+		return -ENODEV;
+
+	port = &dev->port[port_num - dev->start_port];
+
+	spin_lock_irqsave(&port->lock, flags);
+	found = mcast_find(port, mgid);
+	if (found) {
+		*rec = found->rec;
+		ret = 0;
+	}
+	spin_unlock_irqrestore(&port->lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_sa_get_mcmember_rec);
+
+/*
+ * ib_init_ah_from_mcmember - fill address handle attributes from a
+ * multicast member record.
+ * @device: device whose GID cache is searched for the record's port GID.
+ * @port_num: port number stored in the attributes.
+ * @rec: member record supplying MLID, SL, rate and the GRH fields.
+ * @ah_attr: attributes to initialise; cleared first.
+ *
+ * Returns 0 on success or the error from ib_find_cached_gid(), in which
+ * case @ah_attr is left untouched.
+ */
+int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num,
+			     struct ib_sa_mcmember_rec *rec,
+			     struct ib_ah_attr *ah_attr)
+{
+	u16 sgid_index;
+	u8 gid_port;
+	int err;
+
+	err = ib_find_cached_gid(device, &rec->port_gid, &gid_port,
+				 &sgid_index);
+	if (err)
+		return err;
+
+	memset(ah_attr, 0, sizeof *ah_attr);
+
+	/* Local route fields. */
+	ah_attr->port_num = port_num;
+	ah_attr->dlid = be16_to_cpu(rec->mlid);
+	ah_attr->sl = rec->sl;
+	ah_attr->static_rate = rec->rate;
+
+	/* Global route header. */
+	ah_attr->ah_flags = IB_AH_GRH;
+	ah_attr->grh.dgid = rec->mgid;
+	ah_attr->grh.sgid_index = (u8) sgid_index;
+	ah_attr->grh.flow_label = be32_to_cpu(rec->flow_label);
+	ah_attr->grh.hop_limit = rec->hop_limit;
+	ah_attr->grh.traffic_class = rec->traffic_class;
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_init_ah_from_mcmember);
+
+/*
+ * Flag every group in the port's table with the given event state.  A group
+ * that is idle gets an extra reference and has its work queued.  A group
+ * already in MCAST_GROUP_ERROR keeps that state.
+ */
+static void mcast_groups_event(struct mcast_port *port,
+			       enum mcast_group_state state)
+{
+	struct mcast_group *group;
+	struct rb_node *node;
+	unsigned long flags;
+
+	spin_lock_irqsave(&port->lock, flags);
+	for (node = rb_first(&port->table); node; node = rb_next(node)) {
+		group = rb_entry(node, struct mcast_group, node);
+		/* group->lock nests inside port->lock here. */
+		spin_lock(&group->lock);
+		if (group->state == MCAST_IDLE) {
+			atomic_inc(&group->refcount);
+			queue_work(mcast_wq, &group->work);
+		}
+		if (group->state != MCAST_GROUP_ERROR)
+			group->state = state;
+		spin_unlock(&group->lock);
+	}
+	spin_unlock_irqrestore(&port->lock, flags);
+}
+
+/*
+ * Asynchronous event handler.  Events on ports whose link layer is not
+ * InfiniBand are ignored.  Port error, LID change, SM change and client
+ * re-register events flag the port's groups with MCAST_GROUP_ERROR; a P_Key
+ * change flags them with MCAST_PKEY_EVENT.  Other events are ignored.
+ */
+static void mcast_event_handler(struct ib_event_handler *handler,
+				struct ib_event *event)
+{
+	struct mcast_device *dev;
+	enum mcast_group_state new_state;
+
+	dev = container_of(handler, struct mcast_device, event_handler);
+	if (rdma_port_get_link_layer(dev->device, event->element.port_num) !=
+	    IB_LINK_LAYER_INFINIBAND)
+		return;
+
+	switch (event->event) {
+	case IB_EVENT_PORT_ERR:
+	case IB_EVENT_LID_CHANGE:
+	case IB_EVENT_SM_CHANGE:
+	case IB_EVENT_CLIENT_REREGISTER:
+		new_state = MCAST_GROUP_ERROR;
+		break;
+	case IB_EVENT_PKEY_CHANGE:
+		new_state = MCAST_PKEY_EVENT;
+		break;
+	default:
+		return;
+	}
+
+	mcast_groups_event(&dev->port[event->element.port_num -
+				      dev->start_port], new_state);
+}
+
+/*
+ * Client "add" callback.  For a device with IB transport, allocates the
+ * per-device state, initialises every port whose link layer is InfiniBand,
+ * stores the state as client data and registers the event handler.  Nothing
+ * is registered if the allocation fails or no port qualifies.
+ */
+static void mcast_add_one(struct ib_device *device)
+{
+	struct mcast_device *dev;
+	struct mcast_port *port;
+	int i;
+	int count = 0;
+
+	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+		return;
+
+	/* Device structure followed by one mcast_port per physical port. */
+	dev = kmalloc(sizeof *dev + device->phys_port_cnt * sizeof *port,
+		      GFP_KERNEL);
+	if (!dev)
+		return;
+
+	/* A switch uses port 0 only; other nodes use 1..phys_port_cnt. */
+	if (device->node_type == RDMA_NODE_IB_SWITCH)
+		dev->start_port = dev->end_port = 0;
+	else {
+		dev->start_port = 1;
+		dev->end_port = device->phys_port_cnt;
+	}
+
+	for (i = 0; i <= dev->end_port - dev->start_port; i++) {
+		/* Skipped ports keep their uninitialised kmalloc() contents. */
+		if (rdma_port_get_link_layer(device, dev->start_port + i) !=
+		    IB_LINK_LAYER_INFINIBAND)
+			continue;
+		port = &dev->port[i];
+		port->dev = dev;
+		port->port_num = dev->start_port + i;
+		spin_lock_init(&port->lock);
+		port->table = RB_ROOT;
+		init_completion(&port->comp);
+		/* Initial reference; dropped in mcast_remove_one(). */
+		atomic_set(&port->refcount, 1);
+		++count;
+	}
+
+	if (!count) {
+		kfree(dev);
+		return;
+	}
+
+	dev->device = device;
+	ib_set_client_data(device, &mcast_client, dev);
+
+	INIT_IB_EVENT_HANDLER(&dev->event_handler, device, mcast_event_handler);
+	ib_register_event_handler(&dev->event_handler);
+}
+
+/*
+ * Client "remove" callback.  Unregisters the event handler, flushes the
+ * multicast workqueue, then for every InfiniBand port drops the initial
+ * port reference and waits for the port's completion before the per-device
+ * state is freed.
+ */
+static void mcast_remove_one(struct ib_device *device)
+{
+	struct mcast_device *dev;
+	struct mcast_port *port;
+	int i;
+
+	dev = ib_get_client_data(device, &mcast_client);
+	if (!dev)
+		return;
+
+	ib_unregister_event_handler(&dev->event_handler);
+	flush_workqueue(mcast_wq);
+
+	for (i = 0; i <= dev->end_port - dev->start_port; i++) {
+		if (rdma_port_get_link_layer(device, dev->start_port + i) !=
+		    IB_LINK_LAYER_INFINIBAND)
+			continue;
+
+		port = &dev->port[i];
+		deref_port(port);
+		wait_for_completion(&port->comp);
+	}
+
+	kfree(dev);
+}
+
+/*
+ * Set up the multicast module: create its single-threaded workqueue,
+ * register the SA client and register the IB client.  Returns 0 on success,
+ * -ENOMEM if the workqueue cannot be created, or the error from
+ * ib_register_client() after undoing the earlier steps.
+ */
+int mcast_init(void)
+{
+	int ret;
+
+	mcast_wq = create_singlethread_workqueue("ib_mcast");
+	if (!mcast_wq)
+		return -ENOMEM;
+
+	ib_sa_register_client(&sa_client);
+
+	ret = ib_register_client(&mcast_client);
+	if (!ret)
+		return 0;
+
+	/* Registration failed: unwind in reverse order. */
+	ib_sa_unregister_client(&sa_client);
+	destroy_workqueue(mcast_wq);
+	return ret;
+}
+
+/*
+ * Tear down what mcast_init() set up, in reverse order: unregister the IB
+ * client, unregister the SA client and destroy the workqueue.
+ */
+void mcast_cleanup(void)
+{
+	ib_unregister_client(&mcast_client);
+	ib_sa_unregister_client(&sa_client);
+	destroy_workqueue(mcast_wq);
+}
diff --git a/sys/ofed/drivers/infiniband/core/notice.c b/sys/ofed/drivers/infiniband/core/notice.c
new file mode 100644
index 0000000..4a8d98f
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/notice.c
@@ -0,0 +1,749 @@
+/*
+ * Copyright (c) 2006 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/bitops.h>
+#include <linux/random.h>
+
+#include "sa.h"
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("InfiniBand InformInfo & Notice event handling");
+MODULE_LICENSE("Dual BSD/GPL");
+
+/* Device add/remove callbacks, defined near the end of this file. */
+static void inform_add_one(struct ib_device *device);
+static void inform_remove_one(struct ib_device *device);
+
+/* IB client through which this module learns about devices. */
+static struct ib_client inform_client = {
+	.name   = "ib_notice",
+	.add    = inform_add_one,
+	.remove = inform_remove_one
+};
+
+/* SA client used for the InformInfo queries sent by this file. */
+static struct ib_sa_client	sa_client;
+/* Workqueue on which group work is queued; created in notice_init(). */
+static struct workqueue_struct	*inform_wq;
+
+struct inform_device;
+
+/*
+ * Per-port state.
+ *
+ * table is an rb-tree of inform_group entries keyed by trap number and is
+ * accessed under lock.  refcount starts at 1 and is raised for every group
+ * inserted in the table; comp is completed when it drops to zero.
+ */
+struct inform_port {
+	struct inform_device	*dev;
+	spinlock_t		lock;
+	struct rb_root		table;
+	atomic_t		refcount;
+	struct completion	comp;
+	u8			port_num;
+};
+
+/*
+ * Per-device client data.  port[] is allocated together with the structure
+ * and is indexed by (port number - start_port).
+ */
+struct inform_device {
+	struct ib_device	*device;
+	struct ib_event_handler	event_handler;
+	int			start_port;
+	int			end_port;
+	struct inform_port	port[0];
+};
+
+/*
+ * States shared by three different fields in this file:
+ *  - inform_group.state uses INFORM_IDLE, INFORM_BUSY and INFORM_ERROR;
+ *  - inform_group.join_state uses INFORM_IDLE and INFORM_MEMBER;
+ *  - inform_member.state uses INFORM_REGISTERING, INFORM_MEMBER and
+ *    INFORM_ERROR.
+ */
+enum inform_state {
+	INFORM_IDLE,
+	INFORM_REGISTERING,
+	INFORM_MEMBER,
+	INFORM_BUSY,
+	INFORM_ERROR
+};
+
+struct inform_member;
+
+/*
+ * All registrations for one trap number on one port.
+ *
+ * trap_number is the key of node in the port's table.  pending_list holds
+ * members waiting to be registered, active_list holds registered members
+ * and notice_list holds notices queued by notice_dispatch().  last_join is
+ * the member for which send_reg() last issued a query.  members counts the
+ * members that joined through join_group().  The lists, members and state
+ * are accessed under lock.  query and query_id are filled in when an
+ * InformInfo query is sent.
+ */
+struct inform_group {
+	u16			trap_number;
+	struct rb_node		node;
+	struct inform_port	*port;
+	spinlock_t		lock;
+	struct work_struct	work;
+	struct list_head	pending_list;
+	struct list_head	active_list;
+	struct list_head	notice_list;
+	struct inform_member	*last_join;
+	int			members;
+	enum inform_state	join_state; /* State relative to SA */
+	atomic_t		refcount;
+	enum inform_state	state;
+	struct ib_sa_query	*query;
+	int			query_id;
+};
+
+/*
+ * One registration made through ib_sa_register_inform_info().
+ *
+ * info is the structure handed back to the user; the member is recovered
+ * from it with container_of().  list links the member on one of the group's
+ * lists.  refcount starts at 1 and comp is completed when it reaches zero,
+ * which ib_sa_unregister_inform_info() waits for before freeing the member.
+ */
+struct inform_member {
+	struct ib_inform_info	info;
+	struct ib_sa_client	*client;
+	struct inform_group	*group;
+	struct list_head	list;
+	enum inform_state	state;
+	atomic_t		refcount;
+	struct completion	comp;
+};
+
+/* A copy of a received notice, queued on a group's notice_list. */
+struct inform_notice {
+	struct list_head	list;
+	struct ib_sa_notice	notice;
+};
+
+static void reg_handler(int status, struct ib_sa_inform *inform,
+			 void *context);
+static void unreg_handler(int status, struct ib_sa_inform *inform,
+			  void *context);
+
+/*
+ * Look up the group for trap_number in the port's rb-tree.  Returns the
+ * group, or NULL if the port has none for that trap number.
+ */
+static struct inform_group *inform_find(struct inform_port *port,
+					u16 trap_number)
+{
+	struct rb_node *cur;
+	struct inform_group *candidate;
+
+	for (cur = port->table.rb_node; cur; ) {
+		candidate = rb_entry(cur, struct inform_group, node);
+		if (trap_number == candidate->trap_number)
+			return candidate;
+
+		if (trap_number < candidate->trap_number)
+			cur = cur->rb_left;
+		else
+			cur = cur->rb_right;
+	}
+
+	return NULL;
+}
+
+/*
+ * Insert group into the port's rb-tree, keyed by trap number.  Returns NULL
+ * when the group was inserted; if a group with the same trap number is
+ * already present, that group is returned and nothing is inserted.
+ */
+static struct inform_group *inform_insert(struct inform_port *port,
+					  struct inform_group *group)
+{
+	struct rb_node **slot = &port->table.rb_node;
+	struct rb_node *parent = NULL;
+	struct inform_group *existing;
+
+	while (*slot) {
+		parent = *slot;
+		existing = rb_entry(parent, struct inform_group, node);
+
+		if (group->trap_number == existing->trap_number)
+			return existing;
+
+		if (group->trap_number < existing->trap_number)
+			slot = &parent->rb_left;
+		else
+			slot = &parent->rb_right;
+	}
+
+	rb_link_node(&group->node, parent, slot);
+	rb_insert_color(&group->node, &port->table);
+	return NULL;
+}
+
+/* Drop a port reference; the last one completes port->comp. */
+static void deref_port(struct inform_port *port)
+{
+	if (!atomic_dec_and_test(&port->refcount))
+		return;
+
+	complete(&port->comp);
+}
+
+/*
+ * Drop a group reference.  When the last reference goes away the group is
+ * removed from its port's table under the port lock, freed, and the port
+ * reference it held is dropped.
+ */
+static void release_group(struct inform_group *group)
+{
+	struct inform_port *port = group->port;
+	unsigned long flags;
+
+	spin_lock_irqsave(&port->lock, flags);
+	if (!atomic_dec_and_test(&group->refcount)) {
+		spin_unlock_irqrestore(&port->lock, flags);
+		return;
+	}
+
+	rb_erase(&group->node, &port->table);
+	spin_unlock_irqrestore(&port->lock, flags);
+
+	kfree(group);
+	deref_port(port);
+}
+
+/* Drop a member reference; the last one completes member->comp. */
+static void deref_member(struct inform_member *member)
+{
+	if (!atomic_dec_and_test(&member->refcount))
+		return;
+
+	complete(&member->comp);
+}
+
+/*
+ * Queue a registration request on its group and start the group's work
+ * if the group is idle.
+ *
+ * Requests are appended to the tail of pending_list so that they are served
+ * in arrival order and a registration that is in flight stays at the head
+ * of the list, which is where process_join_error() expects to find it.
+ */
+static void queue_reg(struct inform_member *member)
+{
+	struct inform_group *group = member->group;
+	unsigned long flags;
+
+	spin_lock_irqsave(&group->lock, flags);
+	list_add_tail(&member->list, &group->pending_list);
+	if (group->state == INFORM_IDLE) {
+		group->state = INFORM_BUSY;
+		/* Reference for the queued work; released by the work routine. */
+		atomic_inc(&group->refcount);
+		queue_work(inform_wq, &group->work);
+	}
+	spin_unlock_irqrestore(&group->lock, flags);
+}
+
+/*
+ * Send an InformInfo subscription for member's trap number to the SA.
+ *
+ * The member is remembered in group->last_join so that a failure reported
+ * to reg_handler() can be matched against the request that caused it.
+ *
+ * Returns 0 when the query was issued, otherwise the negative error code
+ * from ib_sa_informinfo_query().
+ */
+static int send_reg(struct inform_group *group, struct inform_member *member)
+{
+	struct inform_port *port = group->port;
+	struct ib_sa_inform inform;
+	int ret;
+
+	memset(&inform, 0, sizeof inform);
+	inform.lid_range_begin = cpu_to_be16(0xFFFF);
+	inform.is_generic = 1;
+	inform.subscribe = 1;
+	inform.type = cpu_to_be16(IB_SA_EVENT_TYPE_ALL);
+	inform.trap.generic.trap_num = cpu_to_be16(member->info.trap_number);
+	inform.trap.generic.resp_time = 19;
+	inform.trap.generic.producer_type =
+				cpu_to_be32(IB_SA_EVENT_PRODUCER_TYPE_ALL);
+
+	group->last_join = member;
+	ret = ib_sa_informinfo_query(&sa_client, port->dev->device,
+				     port->port_num, &inform, 3000, GFP_KERNEL,
+				     reg_handler, group, &group->query);
+
+	/*
+	 * The group must not be written once the query has been issued:
+	 * reg_handler() may run before we get here, and the work it starts
+	 * can end with the group being released.  The query id used to be
+	 * stored in group->query_id, which nothing in this file reads.
+	 */
+	return (ret < 0) ? ret : 0;
+}
+
+/*
+ * Send an InformInfo request for the group's trap number with the subscribe
+ * flag left clear, i.e. an unsubscription.
+ *
+ * Returns 0 when the query was issued, otherwise the negative error code
+ * from ib_sa_informinfo_query().
+ */
+static int send_unreg(struct inform_group *group)
+{
+	struct inform_port *port = group->port;
+	struct ib_sa_inform inform;
+	int ret;
+
+	memset(&inform, 0, sizeof inform);
+	inform.lid_range_begin = cpu_to_be16(0xFFFF);
+	inform.is_generic = 1;
+	inform.type = cpu_to_be16(IB_SA_EVENT_TYPE_ALL);
+	inform.trap.generic.trap_num = cpu_to_be16(group->trap_number);
+	inform.trap.generic.qpn = IB_QP1;
+	inform.trap.generic.resp_time = 19;
+	inform.trap.generic.producer_type =
+				cpu_to_be32(IB_SA_EVENT_PRODUCER_TYPE_ALL);
+
+	ret = ib_sa_informinfo_query(&sa_client, port->dev->device,
+				     port->port_num, &inform, 3000, GFP_KERNEL,
+				     unreg_handler, group, &group->query);
+
+	/*
+	 * The group must not be written once the query has been issued:
+	 * unreg_handler() may already have run inform_work_handler(), which
+	 * can drop the last reference and free the group.  The query id used
+	 * to be stored in group->query_id, which nothing in this file reads.
+	 */
+	return (ret < 0) ? ret : 0;
+}
+
+/*
+ * Turn a pending member into an active one: count it, move it to the
+ * group's active list and mark it INFORM_MEMBER.
+ */
+static void join_group(struct inform_group *group, struct inform_member *member)
+{
+	group->members++;
+	list_move(&member->list, &group->active_list);
+	member->state = INFORM_MEMBER;
+}
+
+/*
+ * Unlink member from whatever group list it is on and report status through
+ * its callback, with no notice attached.  Returns the callback's return
+ * value.
+ */
+static int fail_join(struct inform_group *group, struct inform_member *member,
+		     int status)
+{
+	struct ib_inform_info *info = &member->info;
+
+	spin_lock_irq(&group->lock);
+	list_del_init(&member->list);
+	spin_unlock_irq(&group->lock);
+
+	return info->callback(status, info, NULL);
+}
+
+/*
+ * Handle a group in INFORM_ERROR: every member on the active list is
+ * unlinked, marked INFORM_ERROR and called back with -ENETRESET.  Afterwards
+ * the group's SA state is reset to INFORM_IDLE and the group goes back to
+ * INFORM_BUSY.
+ */
+static void process_group_error(struct inform_group *group)
+{
+	struct inform_member *member;
+	int ret;
+
+	spin_lock_irq(&group->lock);
+	while (!list_empty(&group->active_list)) {
+		member = list_entry(group->active_list.next,
+				    struct inform_member, list);
+		/* Extra reference for the callback made without the lock. */
+		atomic_inc(&member->refcount);
+		list_del_init(&member->list);
+		group->members--;
+		member->state = INFORM_ERROR;
+		spin_unlock_irq(&group->lock);
+
+		ret = member->info.callback(-ENETRESET, &member->info, NULL);
+		deref_member(member);
+		/* A non-zero return means the member is unregistered here. */
+		if (ret)
+			ib_sa_unregister_inform_info(&member->info);
+		spin_lock_irq(&group->lock);
+	}
+
+	group->join_state = INFORM_IDLE;
+	group->state = INFORM_BUSY;
+	spin_unlock_irq(&group->lock);
+}
+
+/*
+ * Report a notice to all active subscribers.  We use a temporary list to
+ * handle unsubscription requests while the notice is being reported, which
+ * avoids holding the group lock while in the user's callback.
+ *
+ * Each member is moved from the temporary list back to active_list before
+ * its callback runs.  A non-zero callback return unregisters the member.
+ */
+static void process_notice(struct inform_group *group,
+			   struct inform_notice *info_notice)
+{
+	struct inform_member *member;
+	struct list_head list;
+	int ret;
+
+	INIT_LIST_HEAD(&list);
+
+	spin_lock_irq(&group->lock);
+	/* Move all active members to the temporary list in one step. */
+	list_splice_init(&group->active_list, &list);
+	while (!list_empty(&list)) {
+
+		member = list_entry(list.next, struct inform_member, list);
+		/* Extra reference for the callback made without the lock. */
+		atomic_inc(&member->refcount);
+		list_move(&member->list, &group->active_list);
+		spin_unlock_irq(&group->lock);
+
+		ret = member->info.callback(0, &member->info,
+					    &info_notice->notice);
+		deref_member(member);
+		if (ret)
+			ib_sa_unregister_inform_info(&member->info);
+		spin_lock_irq(&group->lock);
+	}
+	spin_unlock_irq(&group->lock);
+}
+
+/*
+ * Work routine driving a group's registration processing.
+ *
+ * Loops while registrations are pending, notices are queued, or the group
+ * is in INFORM_ERROR.  Errors are handled first, then queued notices, then
+ * pending registrations.  A pending member is joined locally if the group
+ * is already registered with the SA; otherwise send_reg() is called and, if
+ * the query was issued, the routine returns.  Once nothing is left, a group
+ * that is registered with the SA but has no members is unregistered through
+ * send_unreg(); otherwise the group is marked INFORM_IDLE and the group
+ * reference is released.
+ */
+static void inform_work_handler(struct work_struct *work)
+{
+	struct inform_group *group;
+	struct inform_member *member;
+	struct ib_inform_info *info;
+	struct inform_notice *info_notice;
+	int status, ret;
+
+	group = container_of(work, typeof(*group), work);
+retest:
+	spin_lock_irq(&group->lock);
+	while (!list_empty(&group->pending_list) ||
+	       !list_empty(&group->notice_list) ||
+	       (group->state == INFORM_ERROR)) {
+
+		if (group->state == INFORM_ERROR) {
+			spin_unlock_irq(&group->lock);
+			process_group_error(group);
+			goto retest;
+		}
+
+		if (!list_empty(&group->notice_list)) {
+			/* Take the notice at the head of the queue. */
+			info_notice = list_entry(group->notice_list.next,
+						 struct inform_notice, list);
+			list_del(&info_notice->list);
+			spin_unlock_irq(&group->lock);
+			process_notice(group, info_notice);
+			kfree(info_notice);
+			goto retest;
+		}
+
+		member = list_entry(group->pending_list.next,
+				    struct inform_member, list);
+		info = &member->info;
+		/* Extra reference, dropped by deref_member() below. */
+		atomic_inc(&member->refcount);
+
+		if (group->join_state == INFORM_MEMBER) {
+			/* Already registered with the SA: complete locally. */
+			join_group(group, member);
+			spin_unlock_irq(&group->lock);
+			ret = info->callback(0, info, NULL);
+		} else {
+			spin_unlock_irq(&group->lock);
+			status = send_reg(group, member);
+			if (!status) {
+				/* Query issued; reg_handler() re-enters this routine. */
+				deref_member(member);
+				return;
+			}
+			ret = fail_join(group, member, status);
+		}
+
+		deref_member(member);
+		/* A non-zero callback return means the member is unregistered. */
+		if (ret)
+			ib_sa_unregister_inform_info(&member->info);
+		spin_lock_irq(&group->lock);
+	}
+
+	if (!group->members && (group->join_state == INFORM_MEMBER)) {
+		group->join_state = INFORM_IDLE;
+		spin_unlock_irq(&group->lock);
+		/* If the request could not be issued, re-evaluate the group. */
+		if (send_unreg(group))
+			goto retest;
+	} else {
+		group->state = INFORM_IDLE;
+		spin_unlock_irq(&group->lock);
+		release_group(group);
+	}
+}
+
+/*
+ * Fail a join request if it is still active - at the head of the pending queue.
+ *
+ * The member at the head of pending_list is failed only if it is the one
+ * recorded in group->last_join by send_reg().  It is then unlinked and its
+ * callback is invoked with status; a non-zero callback return causes the
+ * member to be unregistered here.
+ */
+static void process_join_error(struct inform_group *group, int status)
+{
+	struct inform_member *member;
+	int ret;
+
+	spin_lock_irq(&group->lock);
+	member = list_entry(group->pending_list.next,
+			    struct inform_member, list);
+	if (group->last_join == member) {
+		/* Extra reference for the callback made without the lock. */
+		atomic_inc(&member->refcount);
+		list_del_init(&member->list);
+		spin_unlock_irq(&group->lock);
+		ret = member->info.callback(status, &member->info, NULL);
+		deref_member(member);
+		if (ret)
+			ib_sa_unregister_inform_info(&member->info);
+	} else
+		spin_unlock_irq(&group->lock);
+}
+
+/*
+ * Completion callback for the query issued by send_reg().  On success the
+ * group is recorded as registered with the SA; on failure the request that
+ * caused the query is failed.  The group's work routine is run afterwards
+ * in both cases.
+ */
+static void reg_handler(int status, struct ib_sa_inform *inform, void *context)
+{
+	struct inform_group *group = context;
+
+	if (!status)
+		group->join_state = INFORM_MEMBER;
+	else
+		process_join_error(group, status);
+
+	inform_work_handler(&group->work);
+}
+
+/*
+ * Completion callback for the query issued by send_unreg().  The status and
+ * the returned record are not examined; the group's work routine is simply
+ * run again.
+ */
+static void unreg_handler(int status, struct ib_sa_inform *rec, void *context)
+{
+	struct inform_group *group = context;
+
+	inform_work_handler(&group->work);
+}
+
+/*
+ * Hand a received notice to the group registered for its trap number.
+ *
+ * A copy of the notice is appended to the tail of the group's notice_list,
+ * so queued notices are reported in the order in which they arrived (the
+ * work routine consumes the list from the head).  If the group is idle, the
+ * work routine is run directly from here with the group reference taken
+ * below; otherwise that reference is dropped and the already active work
+ * routine picks the notice up.
+ *
+ * Returns 0 if the notice was queued or if nobody is registered for it, and
+ * -ENOMEM if the copy cannot be allocated.
+ */
+int notice_dispatch(struct ib_device *device, u8 port_num,
+		    struct ib_sa_notice *notice)
+{
+	struct inform_device *dev;
+	struct inform_port *port;
+	struct inform_group *group;
+	struct inform_notice *info_notice;
+
+	dev = ib_get_client_data(device, &inform_client);
+	if (!dev)
+		return 0; /* No one to give notice to. */
+
+	port = &dev->port[port_num - dev->start_port];
+	spin_lock_irq(&port->lock);
+	group = inform_find(port, __be16_to_cpu(notice->trap.
+						generic.trap_num));
+	if (!group) {
+		spin_unlock_irq(&port->lock);
+		return 0;
+	}
+
+	/* Keep the group alive once the port lock is dropped. */
+	atomic_inc(&group->refcount);
+	spin_unlock_irq(&port->lock);
+
+	info_notice = kmalloc(sizeof *info_notice, GFP_KERNEL);
+	if (!info_notice) {
+		release_group(group);
+		return -ENOMEM;
+	}
+
+	info_notice->notice = *notice;
+
+	spin_lock_irq(&group->lock);
+	list_add_tail(&info_notice->list, &group->notice_list);
+	if (group->state == INFORM_IDLE) {
+		group->state = INFORM_BUSY;
+		spin_unlock_irq(&group->lock);
+		/* The work routine releases the reference taken above. */
+		inform_work_handler(&group->work);
+	} else {
+		spin_unlock_irq(&group->lock);
+		release_group(group);
+	}
+
+	return 0;
+}
+
+/*
+ * Return the group for trap_number on port with its reference count raised,
+ * creating the group if it does not exist yet.
+ *
+ * The allocation is done without the port lock held.  If another group for
+ * the same trap number was inserted in the meantime, the new allocation is
+ * freed and the existing group is used.  A newly inserted group takes a
+ * reference on the port.
+ *
+ * Returns NULL if the allocation fails.
+ */
+static struct inform_group *acquire_group(struct inform_port *port,
+					  u16 trap_number, gfp_t gfp_mask)
+{
+	struct inform_group *group, *cur_group;
+	unsigned long flags;
+
+	spin_lock_irqsave(&port->lock, flags);
+	group = inform_find(port, trap_number);
+	if (group)
+		goto found;
+	spin_unlock_irqrestore(&port->lock, flags);
+
+	group = kzalloc(sizeof *group, gfp_mask);
+	if (!group)
+		return NULL;
+
+	group->port = port;
+	group->trap_number = trap_number;
+	INIT_LIST_HEAD(&group->pending_list);
+	INIT_LIST_HEAD(&group->active_list);
+	INIT_LIST_HEAD(&group->notice_list);
+	INIT_WORK(&group->work, inform_work_handler);
+	spin_lock_init(&group->lock);
+
+	spin_lock_irqsave(&port->lock, flags);
+	cur_group = inform_insert(port, group);
+	if (cur_group) {
+		/* Someone else inserted the group first; use theirs. */
+		kfree(group);
+		group = cur_group;
+	} else
+		atomic_inc(&port->refcount);
+found:
+	atomic_inc(&group->refcount);
+	spin_unlock_irqrestore(&port->lock, flags);
+	return group;
+}
+
+/*
+ * We serialize all join requests to a single group to make our lives much
+ * easier.  Otherwise, two users could try to join the same group
+ * simultaneously, with different configurations, one could leave while the
+ * join is in progress, etc., which makes locking around error recovery
+ * difficult.
+ *
+ * ib_sa_register_inform_info - queue a registration for a trap number.
+ * @client: SA client making the request; a reference is taken on it.
+ * @device: device to register through.
+ * @port_num: port of @device to register through.
+ * @trap_number: trap number to register for.
+ * @gfp_mask: allocation flags for the member and, if needed, the group.
+ * @callback: stored in the member; invoked with the registration status
+ *	and, later, with notices.
+ * @context: opaque value stored in the returned structure.
+ *
+ * Returns the info structure embedded in the new member, or an ERR_PTR():
+ * -ENODEV if @device has no client data for this module, -ENOMEM if the
+ * member or the group cannot be allocated.
+ */
+struct ib_inform_info *
+ib_sa_register_inform_info(struct ib_sa_client *client,
+			   struct ib_device *device, u8 port_num,
+			   u16 trap_number, gfp_t gfp_mask,
+			   int (*callback)(int status,
+					   struct ib_inform_info *info,
+					   struct ib_sa_notice *notice),
+			   void *context)
+{
+	struct inform_device *dev;
+	struct inform_member *member;
+	struct ib_inform_info *info;
+	int ret;
+
+	dev = ib_get_client_data(device, &inform_client);
+	if (!dev)
+		return ERR_PTR(-ENODEV);
+
+	member = kzalloc(sizeof *member, gfp_mask);
+	if (!member)
+		return ERR_PTR(-ENOMEM);
+
+	ib_sa_client_get(client);
+	member->client = client;
+	member->info.trap_number = trap_number;
+	member->info.callback = callback;
+	member->info.context = context;
+	init_completion(&member->comp);
+	/* Initial reference; dropped in ib_sa_unregister_inform_info(). */
+	atomic_set(&member->refcount, 1);
+	member->state = INFORM_REGISTERING;
+
+	/* Ports are stored starting at index 0 for dev->start_port. */
+	member->group = acquire_group(&dev->port[port_num - dev->start_port],
+				      trap_number, gfp_mask);
+	if (!member->group) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	/*
+	 * The user will get the info structure in their callback.  They
+	 * could then free the info structure before we can return from
+	 * this routine.  So we save the pointer to return before queuing
+	 * any callback.
+	 */
+	info = &member->info;
+	queue_reg(member);
+	return info;
+
+err:
+	ib_sa_client_put(member->client);
+	kfree(member);
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(ib_sa_register_inform_info);
+
+/*
+ * ib_sa_unregister_inform_info - release a structure returned by
+ * ib_sa_register_inform_info().
+ *
+ * The member is unlinked from its group; if it had become a group member
+ * the group's member count is decremented.  An idle group is marked busy
+ * and its work is queued; otherwise the member's group reference is dropped
+ * right away.  The routine then drops the member's initial reference, waits
+ * until all other references are gone, releases the SA client and frees the
+ * member.
+ */
+void ib_sa_unregister_inform_info(struct ib_inform_info *info)
+{
+	struct inform_member *member;
+	struct inform_group *group;
+
+	member = container_of(info, struct inform_member, info);
+	group = member->group;
+
+	spin_lock_irq(&group->lock);
+	if (member->state == INFORM_MEMBER)
+		group->members--;
+
+	list_del_init(&member->list);
+
+	if (group->state == INFORM_IDLE) {
+		group->state = INFORM_BUSY;
+		spin_unlock_irq(&group->lock);
+		/* Continue to hold reference on group until callback */
+		queue_work(inform_wq, &group->work);
+	} else {
+		spin_unlock_irq(&group->lock);
+		release_group(group);
+	}
+
+	/* Drop the initial reference, then wait for any remaining holders. */
+	deref_member(member);
+	wait_for_completion(&member->comp);
+	ib_sa_client_put(member->client);
+	kfree(member);
+}
+EXPORT_SYMBOL(ib_sa_unregister_inform_info);
+
+/*
+ * Put every group in the port's table into INFORM_ERROR.  A group that is
+ * idle gets an extra reference and has its work queued.
+ */
+static void inform_groups_lost(struct inform_port *port)
+{
+	struct inform_group *group;
+	struct rb_node *node;
+	unsigned long flags;
+
+	spin_lock_irqsave(&port->lock, flags);
+	for (node = rb_first(&port->table); node; node = rb_next(node)) {
+		group = rb_entry(node, struct inform_group, node);
+		/* group->lock nests inside port->lock here. */
+		spin_lock(&group->lock);
+		if (group->state == INFORM_IDLE) {
+			atomic_inc(&group->refcount);
+			queue_work(inform_wq, &group->work);
+		}
+		group->state = INFORM_ERROR;
+		spin_unlock(&group->lock);
+	}
+	spin_unlock_irqrestore(&port->lock, flags);
+}
+
+/*
+ * Asynchronous event handler.  Port error, LID change, SM change and client
+ * re-register events mark all groups of the affected port as lost; every
+ * other event is ignored.
+ */
+static void inform_event_handler(struct ib_event_handler *handler,
+				struct ib_event *event)
+{
+	struct inform_device *dev;
+
+	dev = container_of(handler, struct inform_device, event_handler);
+
+	switch (event->event) {
+	case IB_EVENT_PORT_ERR:
+	case IB_EVENT_LID_CHANGE:
+	case IB_EVENT_SM_CHANGE:
+	case IB_EVENT_CLIENT_REREGISTER:
+		break;
+	default:
+		return;
+	}
+
+	inform_groups_lost(&dev->port[event->element.port_num -
+				      dev->start_port]);
+}
+
+/*
+ * Client "add" callback.  For a device with IB transport, allocates the
+ * per-device state, initialises every port, stores the state as client data
+ * and registers the event handler.  Nothing is registered if the allocation
+ * fails.
+ */
+static void inform_add_one(struct ib_device *device)
+{
+	struct inform_device *dev;
+	struct inform_port *port;
+	int i;
+
+	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+		return;
+
+	/* Device structure followed by one inform_port per physical port. */
+	dev = kmalloc(sizeof *dev + device->phys_port_cnt * sizeof *port,
+		      GFP_KERNEL);
+	if (!dev)
+		return;
+
+	/* A switch uses port 0 only; other nodes use 1..phys_port_cnt. */
+	if (device->node_type == RDMA_NODE_IB_SWITCH)
+		dev->start_port = dev->end_port = 0;
+	else {
+		dev->start_port = 1;
+		dev->end_port = device->phys_port_cnt;
+	}
+
+	for (i = 0; i <= dev->end_port - dev->start_port; i++) {
+		port = &dev->port[i];
+		port->dev = dev;
+		port->port_num = dev->start_port + i;
+		spin_lock_init(&port->lock);
+		port->table = RB_ROOT;
+		init_completion(&port->comp);
+		/* Initial reference; dropped in inform_remove_one(). */
+		atomic_set(&port->refcount, 1);
+	}
+
+	dev->device = device;
+	ib_set_client_data(device, &inform_client, dev);
+
+	INIT_IB_EVENT_HANDLER(&dev->event_handler, device, inform_event_handler);
+	ib_register_event_handler(&dev->event_handler);
+}
+
+/*
+ * Client "remove" callback.  Unregisters the event handler, flushes the
+ * inform workqueue, then for every port drops the initial port reference
+ * and waits for the port's completion before the per-device state is freed.
+ */
+static void inform_remove_one(struct ib_device *device)
+{
+	struct inform_device *dev;
+	struct inform_port *port;
+	int nports;
+	int i;
+
+	dev = ib_get_client_data(device, &inform_client);
+	if (!dev)
+		return;
+
+	ib_unregister_event_handler(&dev->event_handler);
+	flush_workqueue(inform_wq);
+
+	nports = dev->end_port - dev->start_port + 1;
+	for (i = 0; i < nports; i++) {
+		port = &dev->port[i];
+		deref_port(port);
+		wait_for_completion(&port->comp);
+	}
+
+	kfree(dev);
+}
+
+/*
+ * Set up the notice module: create its single-threaded workqueue, register
+ * the SA client and register the IB client.  Returns 0 on success, -ENOMEM
+ * if the workqueue cannot be created, or the error from
+ * ib_register_client() after undoing the earlier steps.
+ */
+int notice_init(void)
+{
+	int ret;
+
+	inform_wq = create_singlethread_workqueue("ib_inform");
+	if (!inform_wq)
+		return -ENOMEM;
+
+	ib_sa_register_client(&sa_client);
+
+	ret = ib_register_client(&inform_client);
+	if (!ret)
+		return 0;
+
+	/* Registration failed: unwind in reverse order. */
+	ib_sa_unregister_client(&sa_client);
+	destroy_workqueue(inform_wq);
+	return ret;
+}
+
+/*
+ * Tear down what notice_init() set up, in reverse order: unregister the IB
+ * client, unregister the SA client and destroy the workqueue.
+ */
+void notice_cleanup(void)
+{
+	ib_unregister_client(&inform_client);
+	ib_sa_unregister_client(&sa_client);
+	destroy_workqueue(inform_wq);
+}
diff --git a/sys/ofed/drivers/infiniband/core/packer.c b/sys/ofed/drivers/infiniband/core/packer.c
new file mode 100644
index 0000000..019bd4b
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/packer.c
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/string.h>
+
+#include <rdma/ib_pack.h>
+
+/*
+ * Fetch the unsigned field of @size bytes located @offset bytes into
+ * @structure.  Multi-byte fields are read as big-endian values and returned
+ * in CPU byte order.  Any size other than 1, 2, 4 or 8 logs a warning and
+ * yields 0.
+ */
+static u64 value_read(int offset, int size, void *structure)
+{
+	void *field = structure + offset;
+
+	if (size == 1)
+		return *(u8 *) field;
+	if (size == 2)
+		return be16_to_cpup((__be16 *) field);
+	if (size == 4)
+		return be32_to_cpup((__be32 *) field);
+	if (size == 8)
+		return be64_to_cpup((__be64 *) field);
+
+	printk(KERN_WARNING "Field size %d bits not handled\n", size * 8);
+	return 0;
+}
+
+/**
+ * ib_pack - Pack a structure into a buffer
+ * @desc:Array of structure field descriptions
+ * @desc_len:Number of entries in @desc
+ * @structure:Structure to pack from
+ * @buf:Buffer to pack into
+ *
+ * Walks @desc and stores every described field of @structure into @buf.
+ * A field lives at 32-bit word offset_words of @buf, offset_bits into
+ * that word.  Fields of up to 32 and up to 64 bits are merged into the
+ * buffer with a big-endian read-modify-write; wider fields are copied
+ * bytewise.  Entries with a zero struct_size_bytes are packed as zeroes.
+ */
+void ib_pack(const struct ib_field        *desc,
+	     int                           desc_len,
+	     void                         *structure,
+	     void                         *buf)
+{
+	int i;
+
+	for (i = 0; i < desc_len; ++i) {
+		const struct ib_field *f = &desc[i];
+		void *dst;
+
+		if (f->size_bits <= 32) {
+			int shift = 32 - f->offset_bits - f->size_bits;
+			__be32 *addr = (__be32 *) buf + f->offset_words;
+			__be32 mask =
+				cpu_to_be32(((1ull << f->size_bits) - 1) << shift);
+			u32 val = 0;
+
+			if (f->struct_size_bytes)
+				val = value_read(f->struct_offset_bytes,
+						 f->struct_size_bytes,
+						 structure) << shift;
+
+			/* Replace only this field's bits in the word. */
+			*addr = (*addr & ~mask) | (cpu_to_be32(val) & mask);
+			continue;
+		}
+
+		if (f->size_bits <= 64) {
+			int shift = 64 - f->offset_bits - f->size_bits;
+			/* offset_words counts 32-bit words even here. */
+			__be64 *addr =
+				(__be64 *) ((__be32 *) buf + f->offset_words);
+			__be64 mask =
+				cpu_to_be64((~0ull >> (64 - f->size_bits)) << shift);
+			u64 val = 0;
+
+			if (f->struct_size_bytes)
+				val = value_read(f->struct_offset_bytes,
+						 f->struct_size_bytes,
+						 structure) << shift;
+
+			*addr = (*addr & ~mask) | (cpu_to_be64(val) & mask);
+			continue;
+		}
+
+		/* Wider than 64 bits: only whole bytes can be copied. */
+		if (f->offset_bits % 8 || f->size_bits % 8) {
+			printk(KERN_WARNING "Structure field %s of size %d "
+			       "bits is not byte-aligned\n",
+			       f->field_name, f->size_bits);
+		}
+
+		dst = buf + f->offset_words * 4 + f->offset_bits / 8;
+		if (f->struct_size_bytes)
+			memcpy(dst, structure + f->struct_offset_bytes,
+			       f->size_bits / 8);
+		else
+			memset(dst, 0, f->size_bits / 8);
+	}
+}
+EXPORT_SYMBOL(ib_pack);
+
+/*
+ * Store @val into the field of @size bytes located @offset bytes into
+ * @structure.  Multi-byte fields are written big-endian.  Any size other
+ * than 1, 2, 4 or 8 bytes logs a warning and writes nothing.
+ */
+static void value_write(int offset, int size, u64 val, void *structure)
+{
+	void *field = structure + offset;
+	int bits = size * 8;
+
+	if (bits == 8)
+		*(u8 *) field = val;
+	else if (bits == 16)
+		*(__be16 *) field = cpu_to_be16(val);
+	else if (bits == 32)
+		*(__be32 *) field = cpu_to_be32(val);
+	else if (bits == 64)
+		*(__be64 *) field = cpu_to_be64(val);
+	else
+		printk(KERN_WARNING "Field size %d bits not handled\n", bits);
+}
+
+/**
+ * ib_unpack - Unpack a buffer into a structure
+ * @desc:Array of structure field descriptions
+ * @desc_len:Number of entries in @desc
+ * @buf:Buffer to unpack from
+ * @structure:Structure to unpack into
+ *
+ * ib_unpack() unpacks a list of structure fields from a buffer,
+ * controlled by the array of fields in @desc.  Entries whose
+ * struct_size_bytes is zero have no destination in @structure and
+ * are skipped.
+ */
+void ib_unpack(const struct ib_field        *desc,
+	       int                           desc_len,
+	       void                         *buf,
+	       void                         *structure)
+{
+	int i;
+
+	for (i = 0; i < desc_len; ++i) {
+		if (!desc[i].struct_size_bytes)
+			continue;
+
+		if (desc[i].size_bits <= 32) {
+			int shift;
+			u32  val;
+			u32  mask;
+			__be32 *addr;
+
+			shift = 32 - desc[i].offset_bits - desc[i].size_bits;
+			mask = ((1ull << desc[i].size_bits) - 1) << shift;
+			addr = (__be32 *) buf + desc[i].offset_words;
+			val = (be32_to_cpup(addr) & mask) >> shift;
+			value_write(desc[i].struct_offset_bytes,
+				    desc[i].struct_size_bytes,
+				    val,
+				    structure);
+		} else if (desc[i].size_bits <= 64) {
+			int shift;
+			u64  val;
+			u64  mask;
+			__be64 *addr;
+
+			shift = 64 - desc[i].offset_bits - desc[i].size_bits;
+			mask = (~0ull >> (64 - desc[i].size_bits)) << shift;
+			/*
+			 * offset_words counts 32-bit words, so step through
+			 * the buffer as __be32 before viewing the location as
+			 * a 64-bit quantity (same addressing as ib_pack()).
+			 */
+			addr = (__be64 *) ((__be32 *) buf + desc[i].offset_words);
+			val = (be64_to_cpup(addr) & mask) >> shift;
+			value_write(desc[i].struct_offset_bytes,
+				    desc[i].struct_size_bytes,
+				    val,
+				    structure);
+		} else {
+			/* Wider than 64 bits: only whole bytes can be copied. */
+			if (desc[i].offset_bits % 8 ||
+			    desc[i].size_bits   % 8) {
+				printk(KERN_WARNING "Structure field %s of size %d "
+				       "bits is not byte-aligned\n",
+				       desc[i].field_name, desc[i].size_bits);
+			}
+
+			memcpy(structure + desc[i].struct_offset_bytes,
+			       buf + desc[i].offset_words * 4 +
+			       desc[i].offset_bits / 8,
+			       desc[i].size_bits / 8);
+		}
+	}
+}
+EXPORT_SYMBOL(ib_unpack);
diff --git a/sys/ofed/drivers/infiniband/core/sa.h b/sys/ofed/drivers/infiniband/core/sa.h
new file mode 100644
index 0000000..b8abdd7
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/sa.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc.  All rights reserved.
+ * Copyright (c) 2006 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef SA_H
+#define SA_H
+
+#include <rdma/ib_sa.h>
+
+/* Take a reference on @client by incrementing its users count. */
+static inline void ib_sa_client_get(struct ib_sa_client *client)
+{
+	atomic_inc(&client->users);
+}
+
+/*
+ * Drop a reference on @client.  Whoever drops the users count to zero
+ * signals client->comp.
+ */
+static inline void ib_sa_client_put(struct ib_sa_client *client)
+{
+	if (!atomic_dec_and_test(&client->users))
+		return;
+
+	complete(&client->comp);
+}
+
+int ib_sa_check_selector(ib_sa_comp_mask comp_mask,
+			 ib_sa_comp_mask selector_mask,
+			 ib_sa_comp_mask value_mask,
+			 u8 selector, u8 src_value, u8 dst_value);
+
+int ib_sa_pack_attr(void *dst, void *src, int attr_id);
+
+int ib_sa_unpack_attr(void *dst, void *src, int attr_id);
+
+int ib_sa_path_rec_query(struct ib_sa_client *client,
+			 struct ib_device *device, u8 port_num,
+			 struct ib_sa_path_rec *rec,
+			 ib_sa_comp_mask comp_mask,
+			 int timeout_ms, gfp_t gfp_mask,
+			 void (*callback)(int status,
+					  struct ib_sa_path_rec *resp,
+					  void *context),
+			 void *context,
+			 struct ib_sa_query **sa_query);
+
+int sa_db_init(void);
+void sa_db_cleanup(void);
+
+int ib_sa_mcmember_rec_query(struct ib_sa_client *client,
+			     struct ib_device *device, u8 port_num,
+			     u8 method,
+			     struct ib_sa_mcmember_rec *rec,
+			     ib_sa_comp_mask comp_mask,
+			     int timeout_ms, gfp_t gfp_mask,
+			     void (*callback)(int status,
+					      struct ib_sa_mcmember_rec *resp,
+					      void *context),
+			     void *context,
+			     struct ib_sa_query **sa_query);
+
+int mcast_init(void);
+void mcast_cleanup(void);
+
+int ib_sa_informinfo_query(struct ib_sa_client *client,
+			   struct ib_device *device, u8 port_num,
+			   struct ib_sa_inform *rec,
+			   int timeout_ms, gfp_t gfp_mask,
+			   void (*callback)(int status,
+					    struct ib_sa_inform *resp,
+					    void *context),
+			   void *context,
+			   struct ib_sa_query **sa_query);
+
+int notice_dispatch(struct ib_device *device, u8 port_num,
+		    struct ib_sa_notice *notice);
+
+int notice_init(void);
+void notice_cleanup(void);
+
+#endif /* SA_H */
diff --git a/sys/ofed/drivers/infiniband/core/sa_query.c b/sys/ofed/drivers/infiniband/core/sa_query.c
new file mode 100644
index 0000000..0fc1c0e
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/sa_query.c
@@ -0,0 +1,1480 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc.  All rights reserved.
+ * Copyright (c) 2006 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/random.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/dma-mapping.h>
+#include <linux/kref.h>
+#include <linux/idr.h>
+#include <linux/workqueue.h>
+
+#include <rdma/ib_pack.h>
+#include <rdma/ib_cache.h>
+#include "sa.h"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("InfiniBand subnet administration query support");
+MODULE_LICENSE("Dual BSD/GPL");
+
+/*
+ * Reference-counted address handle used to reach the subnet manager.
+ * Built by update_sm_ah() and released through free_sm_ah() when the
+ * last kref is dropped.
+ */
+struct ib_sa_sm_ah {
+	/* Address handle created with the port's sm_lid and sm_sl. */
+	struct ib_ah        *ah;
+	struct kref          ref;
+	/* Index looked up for IB_DEFAULT_PKEY_FULL; preset to 0. */
+	u16		     pkey_index;
+	/* (1 << lmc) - 1 for the port at the time the handle was built. */
+	u8		     src_path_mask;
+};
+
+/* Per-port SA state; one entry per port in struct ib_sa_device. */
+struct ib_sa_port {
+	/* MAD agent used to allocate and send SA query MADs. */
+	struct ib_mad_agent *agent;
+	/* Presumably the agent receiving notices; not used in this part of the file. */
+	struct ib_mad_agent *notice_agent;
+	/* Current SM address handle, or NULL; protected by ah_lock. */
+	struct ib_sa_sm_ah  *sm_ah;
+	/* Work item whose handler is update_sm_ah(). */
+	struct work_struct   update_task;
+	spinlock_t           ah_lock;
+	u8                   port_num;
+	struct ib_device    *device;
+};
+
+/*
+ * Per-device SA state stored as sa_client data.  port[] is indexed by
+ * (port_num - start_port).
+ */
+struct ib_sa_device {
+	int                     start_port, end_port;
+	/* Handler registered for ib_sa_event(); used there to recover this struct. */
+	struct ib_event_handler event_handler;
+	struct ib_sa_port port[0];
+};
+
+/*
+ * Generic part of an outstanding SA query.  Embedded in the per-record
+ * query structures, which recover themselves with container_of().
+ */
+struct ib_sa_query {
+	/* Completion hook taking (query, status, response MAD); may be NULL. */
+	void (*callback)(struct ib_sa_query *, int, struct ib_sa_mad *);
+	/* Frees the structure this query is embedded in. */
+	void (*release)(struct ib_sa_query *);
+	struct ib_sa_client    *client;
+	struct ib_sa_port      *port;
+	struct ib_mad_send_buf *mad_buf;
+	/* SM address handle reference held from alloc_mad() to free_mad(). */
+	struct ib_sa_sm_ah     *sm_ah;
+	/* Id under which the query is stored in query_idr (set by send_mad()). */
+	int			id;
+};
+
+/* Service record query: caller's callback and context plus the generic query. */
+struct ib_sa_service_query {
+	void (*callback)(int, struct ib_sa_service_rec *, void *);
+	void *context;
+	struct ib_sa_query sa_query;
+};
+
+/*
+ * Path record query: caller's callback and context plus the generic query.
+ * ib_sa_path_rec_callback() and ib_sa_path_rec_release() get back to this
+ * structure from sa_query with container_of().
+ */
+struct ib_sa_path_query {
+	void (*callback)(int, struct ib_sa_path_rec *, void *);
+	void *context;
+	struct ib_sa_query sa_query;
+};
+
+/* Multicast member record query: caller's callback and context plus the generic query. */
+struct ib_sa_mcmember_query {
+	void (*callback)(int, struct ib_sa_mcmember_rec *, void *);
+	void *context;
+	struct ib_sa_query sa_query;
+};
+
+/* InformInfo query: caller's callback and context plus the generic query. */
+struct ib_sa_inform_query {
+	void (*callback)(int, struct ib_sa_inform *, void *);
+	void *context;
+	struct ib_sa_query sa_query;
+};
+
+static void ib_sa_add_one(struct ib_device *device);
+static void ib_sa_remove_one(struct ib_device *device);
+
+/*
+ * IB core client for this file; it is also the key under which the
+ * per-device struct ib_sa_device is looked up with ib_get_client_data().
+ */
+static struct ib_client sa_client = {
+	.name   = "sa",
+	.add    = ib_sa_add_one,
+	.remove = ib_sa_remove_one
+};
+
+/*
+ * idr_lock guards query_idr, which maps query ids to struct ib_sa_query.
+ * NOTE(review): neither spinlock here has a static initializer; they are
+ * presumably set up with spin_lock_init() during module init -- confirm.
+ */
+static spinlock_t idr_lock;
+static DEFINE_IDR(query_idr);
+
+/* tid_lock guards tid, the low 32 bits of the next MAD transaction id. */
+static spinlock_t tid_lock;
+static u32 tid;
+
+/*
+ * Fills in the structure-side members of an ib_field entry (offset, size
+ * and a printable name) for one member of struct ib_sa_path_rec.
+ */
+#define PATH_REC_FIELD(field) \
+	.struct_offset_bytes = offsetof(struct ib_sa_path_rec, field),		\
+	.struct_size_bytes   = sizeof ((struct ib_sa_path_rec *) 0)->field,	\
+	.field_name          = "sa_path_rec:" #field
+
+/*
+ * Packed layout of struct ib_sa_path_rec for ib_pack()/ib_unpack().
+ * offset_words counts 32-bit words from the start of the buffer,
+ * offset_bits is the bit offset inside that word counted from its most
+ * significant bit, and size_bits is the field width.  The entries cover
+ * words 0-15 (64 bytes).  RESERVED (defined elsewhere) presumably leaves
+ * struct_size_bytes at 0, which ib_pack() zero-fills and ib_unpack() skips.
+ */
+static const struct ib_field path_rec_table[] = {
+	{ PATH_REC_FIELD(service_id),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 64 },
+	{ PATH_REC_FIELD(dgid),
+	  .offset_words = 2,
+	  .offset_bits  = 0,
+	  .size_bits    = 128 },
+	{ PATH_REC_FIELD(sgid),
+	  .offset_words = 6,
+	  .offset_bits  = 0,
+	  .size_bits    = 128 },
+	{ PATH_REC_FIELD(dlid),
+	  .offset_words = 10,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ PATH_REC_FIELD(slid),
+	  .offset_words = 10,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 },
+	{ PATH_REC_FIELD(raw_traffic),
+	  .offset_words = 11,
+	  .offset_bits  = 0,
+	  .size_bits    = 1 },
+	{ RESERVED,
+	  .offset_words = 11,
+	  .offset_bits  = 1,
+	  .size_bits    = 3 },
+	{ PATH_REC_FIELD(flow_label),
+	  .offset_words = 11,
+	  .offset_bits  = 4,
+	  .size_bits    = 20 },
+	{ PATH_REC_FIELD(hop_limit),
+	  .offset_words = 11,
+	  .offset_bits  = 24,
+	  .size_bits    = 8 },
+	{ PATH_REC_FIELD(traffic_class),
+	  .offset_words = 12,
+	  .offset_bits  = 0,
+	  .size_bits    = 8 },
+	{ PATH_REC_FIELD(reversible),
+	  .offset_words = 12,
+	  .offset_bits  = 8,
+	  .size_bits    = 1 },
+	{ PATH_REC_FIELD(numb_path),
+	  .offset_words = 12,
+	  .offset_bits  = 9,
+	  .size_bits    = 7 },
+	{ PATH_REC_FIELD(pkey),
+	  .offset_words = 12,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 },
+	{ PATH_REC_FIELD(qos_class),
+	  .offset_words = 13,
+	  .offset_bits  = 0,
+	  .size_bits    = 12 },
+	{ PATH_REC_FIELD(sl),
+	  .offset_words = 13,
+	  .offset_bits  = 12,
+	  .size_bits    = 4 },
+	{ PATH_REC_FIELD(mtu_selector),
+	  .offset_words = 13,
+	  .offset_bits  = 16,
+	  .size_bits    = 2 },
+	{ PATH_REC_FIELD(mtu),
+	  .offset_words = 13,
+	  .offset_bits  = 18,
+	  .size_bits    = 6 },
+	{ PATH_REC_FIELD(rate_selector),
+	  .offset_words = 13,
+	  .offset_bits  = 24,
+	  .size_bits    = 2 },
+	{ PATH_REC_FIELD(rate),
+	  .offset_words = 13,
+	  .offset_bits  = 26,
+	  .size_bits    = 6 },
+	{ PATH_REC_FIELD(packet_life_time_selector),
+	  .offset_words = 14,
+	  .offset_bits  = 0,
+	  .size_bits    = 2 },
+	{ PATH_REC_FIELD(packet_life_time),
+	  .offset_words = 14,
+	  .offset_bits  = 2,
+	  .size_bits    = 6 },
+	{ PATH_REC_FIELD(preference),
+	  .offset_words = 14,
+	  .offset_bits  = 8,
+	  .size_bits    = 8 },
+	{ RESERVED,
+	  .offset_words = 14,
+	  .offset_bits  = 16,
+	  .size_bits    = 48 },
+};
+
+/*
+ * Fills in the structure-side members of an ib_field entry (offset, size
+ * and a printable name) for one member of struct ib_sa_mcmember_rec.
+ */
+#define MCMEMBER_REC_FIELD(field) \
+	.struct_offset_bytes = offsetof(struct ib_sa_mcmember_rec, field),	\
+	.struct_size_bytes   = sizeof ((struct ib_sa_mcmember_rec *) 0)->field,	\
+	.field_name          = "sa_mcmember_rec:" #field
+
+/*
+ * Packed layout of struct ib_sa_mcmember_rec for ib_pack()/ib_unpack().
+ * offset_words is in 32-bit words, offset_bits is counted from the most
+ * significant bit of that word.  The entries cover words 0-12 (52 bytes).
+ */
+static const struct ib_field mcmember_rec_table[] = {
+	{ MCMEMBER_REC_FIELD(mgid),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 128 },
+	{ MCMEMBER_REC_FIELD(port_gid),
+	  .offset_words = 4,
+	  .offset_bits  = 0,
+	  .size_bits    = 128 },
+	{ MCMEMBER_REC_FIELD(qkey),
+	  .offset_words = 8,
+	  .offset_bits  = 0,
+	  .size_bits    = 32 },
+	{ MCMEMBER_REC_FIELD(mlid),
+	  .offset_words = 9,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ MCMEMBER_REC_FIELD(mtu_selector),
+	  .offset_words = 9,
+	  .offset_bits  = 16,
+	  .size_bits    = 2 },
+	{ MCMEMBER_REC_FIELD(mtu),
+	  .offset_words = 9,
+	  .offset_bits  = 18,
+	  .size_bits    = 6 },
+	{ MCMEMBER_REC_FIELD(traffic_class),
+	  .offset_words = 9,
+	  .offset_bits  = 24,
+	  .size_bits    = 8 },
+	{ MCMEMBER_REC_FIELD(pkey),
+	  .offset_words = 10,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ MCMEMBER_REC_FIELD(rate_selector),
+	  .offset_words = 10,
+	  .offset_bits  = 16,
+	  .size_bits    = 2 },
+	{ MCMEMBER_REC_FIELD(rate),
+	  .offset_words = 10,
+	  .offset_bits  = 18,
+	  .size_bits    = 6 },
+	{ MCMEMBER_REC_FIELD(packet_life_time_selector),
+	  .offset_words = 10,
+	  .offset_bits  = 24,
+	  .size_bits    = 2 },
+	{ MCMEMBER_REC_FIELD(packet_life_time),
+	  .offset_words = 10,
+	  .offset_bits  = 26,
+	  .size_bits    = 6 },
+	{ MCMEMBER_REC_FIELD(sl),
+	  .offset_words = 11,
+	  .offset_bits  = 0,
+	  .size_bits    = 4 },
+	{ MCMEMBER_REC_FIELD(flow_label),
+	  .offset_words = 11,
+	  .offset_bits  = 4,
+	  .size_bits    = 20 },
+	{ MCMEMBER_REC_FIELD(hop_limit),
+	  .offset_words = 11,
+	  .offset_bits  = 24,
+	  .size_bits    = 8 },
+	{ MCMEMBER_REC_FIELD(scope),
+	  .offset_words = 12,
+	  .offset_bits  = 0,
+	  .size_bits    = 4 },
+	{ MCMEMBER_REC_FIELD(join_state),
+	  .offset_words = 12,
+	  .offset_bits  = 4,
+	  .size_bits    = 4 },
+	{ MCMEMBER_REC_FIELD(proxy_join),
+	  .offset_words = 12,
+	  .offset_bits  = 8,
+	  .size_bits    = 1 },
+	{ RESERVED,
+	  .offset_words = 12,
+	  .offset_bits  = 9,
+	  .size_bits    = 23 },
+};
+
+/*
+ * Fills in the structure-side members of an ib_field entry (offset, size
+ * and a printable name) for one member of struct ib_sa_service_rec.
+ */
+#define SERVICE_REC_FIELD(field) \
+	.struct_offset_bytes = offsetof(struct ib_sa_service_rec, field),	\
+	.struct_size_bytes   = sizeof ((struct ib_sa_service_rec *) 0)->field,	\
+	.field_name          = "sa_service_rec:" #field
+
+/*
+ * Packed layout of struct ib_sa_service_rec for ib_pack()/ib_unpack().
+ * offset_words is in 32-bit words.  Every entry wider than 64 bits is
+ * byte-aligned and therefore copied bytewise by the packer.  The entries
+ * cover words 0-43 (176 bytes); bits 16-31 of word 6 are not described.
+ */
+static const struct ib_field service_rec_table[] = {
+	{ SERVICE_REC_FIELD(id),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 64 },
+	{ SERVICE_REC_FIELD(gid),
+	  .offset_words = 2,
+	  .offset_bits  = 0,
+	  .size_bits    = 128 },
+	{ SERVICE_REC_FIELD(pkey),
+	  .offset_words = 6,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ SERVICE_REC_FIELD(lease),
+	  .offset_words = 7,
+	  .offset_bits  = 0,
+	  .size_bits    = 32 },
+	{ SERVICE_REC_FIELD(key),
+	  .offset_words = 8,
+	  .offset_bits  = 0,
+	  .size_bits    = 128 },
+	{ SERVICE_REC_FIELD(name),
+	  .offset_words = 12,
+	  .offset_bits  = 0,
+	  .size_bits    = 64*8 },
+	{ SERVICE_REC_FIELD(data8),
+	  .offset_words = 28,
+	  .offset_bits  = 0,
+	  .size_bits    = 16*8 },
+	{ SERVICE_REC_FIELD(data16),
+	  .offset_words = 32,
+	  .offset_bits  = 0,
+	  .size_bits    = 8*16 },
+	{ SERVICE_REC_FIELD(data32),
+	  .offset_words = 36,
+	  .offset_bits  = 0,
+	  .size_bits    = 4*32 },
+	{ SERVICE_REC_FIELD(data64),
+	  .offset_words = 40,
+	  .offset_bits  = 0,
+	  .size_bits    = 2*64 },
+};
+
+/*
+ * Fills in the structure-side members of an ib_field entry (offset, size
+ * and a printable name) for one member of struct ib_sa_inform.
+ */
+#define INFORM_FIELD(field) \
+	.struct_offset_bytes = offsetof(struct ib_sa_inform, field), \
+	.struct_size_bytes   = sizeof ((struct ib_sa_inform *) 0)->field, \
+	.field_name          = "sa_inform:" #field
+
+/*
+ * Packed layout of struct ib_sa_inform for ib_pack()/ib_unpack().
+ * offset_words is in 32-bit words, offset_bits is counted from the most
+ * significant bit of that word.  Only the trap.generic members of the
+ * trap field are described.  The entries cover words 0-8 (36 bytes).
+ */
+static const struct ib_field inform_table[] = {
+	{ INFORM_FIELD(gid),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 128 },
+	{ INFORM_FIELD(lid_range_begin),
+	  .offset_words = 4,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ INFORM_FIELD(lid_range_end),
+	  .offset_words = 4,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 },
+	{ RESERVED,
+	  .offset_words = 5,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ INFORM_FIELD(is_generic),
+	  .offset_words = 5,
+	  .offset_bits  = 16,
+	  .size_bits    = 8 },
+	{ INFORM_FIELD(subscribe),
+	  .offset_words = 5,
+	  .offset_bits  = 24,
+	  .size_bits    = 8 },
+	{ INFORM_FIELD(type),
+	  .offset_words = 6,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ INFORM_FIELD(trap.generic.trap_num),
+	  .offset_words = 6,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 },
+	{ INFORM_FIELD(trap.generic.qpn),
+	  .offset_words = 7,
+	  .offset_bits  = 0,
+	  .size_bits    = 24 },
+	{ RESERVED,
+	  .offset_words = 7,
+	  .offset_bits  = 24,
+	  .size_bits    = 3 },
+	{ INFORM_FIELD(trap.generic.resp_time),
+	  .offset_words = 7,
+	  .offset_bits  = 27,
+	  .size_bits    = 5 },
+	{ RESERVED,
+	  .offset_words = 8,
+	  .offset_bits  = 0,
+	  .size_bits    = 8 },
+	{ INFORM_FIELD(trap.generic.producer_type),
+	  .offset_words = 8,
+	  .offset_bits  = 8,
+	  .size_bits    = 24 },
+};
+
+/*
+ * Fills in the structure-side members of an ib_field entry (offset, size
+ * and a printable name) for one member of struct ib_sa_notice.
+ */
+#define NOTICE_FIELD(field) \
+	.struct_offset_bytes = offsetof(struct ib_sa_notice, field), \
+	.struct_size_bytes   = sizeof ((struct ib_sa_notice *) 0)->field, \
+	.field_name          = "sa_notice:" #field
+
+/*
+ * Packed layout of struct ib_sa_notice for ib_pack()/ib_unpack().
+ * offset_words is in 32-bit words, offset_bits is counted from the most
+ * significant bit of that word.  data_details (432 bits = 54 bytes) starts
+ * at byte 10 and ends right before issuer_gid at word 16.  The entries
+ * cover words 0-19 (80 bytes).
+ */
+static const struct ib_field notice_table[] = {
+	{ NOTICE_FIELD(is_generic),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 1 },
+	{ NOTICE_FIELD(type),
+	  .offset_words = 0,
+	  .offset_bits  = 1,
+	  .size_bits    = 7 },
+	{ NOTICE_FIELD(trap.generic.producer_type),
+	  .offset_words = 0,
+	  .offset_bits  = 8,
+	  .size_bits    = 24 },
+	{ NOTICE_FIELD(trap.generic.trap_num),
+	  .offset_words = 1,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ NOTICE_FIELD(issuer_lid),
+	  .offset_words = 1,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 },
+	{ NOTICE_FIELD(notice_toggle),
+	  .offset_words = 2,
+	  .offset_bits  = 0,
+	  .size_bits    = 1 },
+	{ NOTICE_FIELD(notice_count),
+	  .offset_words = 2,
+	  .offset_bits  = 1,
+	  .size_bits    = 15 },
+	{ NOTICE_FIELD(data_details),
+	  .offset_words = 2,
+	  .offset_bits  = 16,
+	  .size_bits    = 432 },
+	{ NOTICE_FIELD(issuer_gid),
+	  .offset_words = 16,
+	  .offset_bits  = 0,
+	  .size_bits    = 128 },
+};
+
+/*
+ * Check @src_value against @dst_value under the given @selector.
+ *
+ * The check only applies when @comp_mask has bits in common with both
+ * @selector_mask and @value_mask; otherwise 0 is returned.  Returns
+ * non-zero when the relation requested by @selector does not hold:
+ * IB_SA_GT fails for src <= dst, IB_SA_LT for src >= dst and IB_SA_EQ for
+ * src != dst.  Any other selector returns 0.
+ */
+int ib_sa_check_selector(ib_sa_comp_mask comp_mask,
+			 ib_sa_comp_mask selector_mask,
+			 ib_sa_comp_mask value_mask,
+			 u8 selector, u8 src_value, u8 dst_value)
+{
+	int have_selector = (comp_mask & selector_mask) != 0;
+	int have_value = (comp_mask & value_mask) != 0;
+
+	if (!(have_selector && have_value))
+		return 0;
+
+	if (selector == IB_SA_GT)
+		return src_value <= dst_value;
+	if (selector == IB_SA_LT)
+		return src_value >= dst_value;
+	if (selector == IB_SA_EQ)
+		return src_value != dst_value;
+
+	return 0;
+}
+
+/*
+ * Pack the SA attribute structure @src into wire format at @dst.
+ * Only IB_SA_ATTR_PATH_REC is supported; any other @attr_id returns
+ * -EINVAL without touching @dst.  Returns 0 on success.
+ */
+int ib_sa_pack_attr(void *dst, void *src, int attr_id)
+{
+	if (attr_id != IB_SA_ATTR_PATH_REC)
+		return -EINVAL;
+
+	ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), src, dst);
+	return 0;
+}
+
+/*
+ * Unpack the wire-format SA attribute at @src into the structure @dst.
+ * Only IB_SA_ATTR_PATH_REC is supported; any other @attr_id returns
+ * -EINVAL without touching @dst.  Returns 0 on success.
+ */
+int ib_sa_unpack_attr(void *dst, void *src, int attr_id)
+{
+	if (attr_id != IB_SA_ATTR_PATH_REC)
+		return -EINVAL;
+
+	ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), src, dst);
+	return 0;
+}
+
+/*
+ * kref release function for struct ib_sa_sm_ah: destroys the address
+ * handle and frees the containing structure.
+ */
+static void free_sm_ah(struct kref *kref)
+{
+	struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref);
+
+	ib_destroy_ah(sm_ah->ah);
+	kfree(sm_ah);
+}
+
+/*
+ * Work handler for port->update_task: build a fresh address handle for
+ * the subnet manager from the current port attributes and install it as
+ * port->sm_ah, dropping the reference on the handle it replaces.  On any
+ * failure a message is logged and port->sm_ah is left as it was.
+ */
+static void update_sm_ah(struct work_struct *work)
+{
+	struct ib_sa_port *port =
+		container_of(work, struct ib_sa_port, update_task);
+	struct ib_mad_agent *agent = port->agent;
+	struct ib_port_attr port_attr;
+	struct ib_ah_attr ah_attr;
+	struct ib_sa_sm_ah *new_ah;
+
+	if (ib_query_port(agent->device, port->port_num, &port_attr)) {
+		printk(KERN_WARNING "Couldn't query port\n");
+		return;
+	}
+
+	new_ah = kmalloc(sizeof *new_ah, GFP_KERNEL);
+	if (!new_ah) {
+		printk(KERN_WARNING "Couldn't allocate new SM AH\n");
+		return;
+	}
+
+	kref_init(&new_ah->ref);
+	new_ah->src_path_mask = (1 << port_attr.lmc) - 1;
+
+	/* pkey_index is preset to 0 in case the lookup below fails. */
+	new_ah->pkey_index = 0;
+	if (ib_find_pkey(agent->device, port->port_num,
+			 IB_DEFAULT_PKEY_FULL, &new_ah->pkey_index))
+		printk(KERN_ERR "Couldn't find index for default PKey\n");
+
+	/* Address the handle at the SM's LID and SL on this port. */
+	memset(&ah_attr, 0, sizeof ah_attr);
+	ah_attr.port_num = port->port_num;
+	ah_attr.sl       = port_attr.sm_sl;
+	ah_attr.dlid     = port_attr.sm_lid;
+
+	new_ah->ah = ib_create_ah(agent->qp->pd, &ah_attr);
+	if (IS_ERR(new_ah->ah)) {
+		printk(KERN_WARNING "Couldn't create new SM AH\n");
+		kfree(new_ah);
+		return;
+	}
+
+	/* Swap in the new handle under the lock. */
+	spin_lock_irq(&port->ah_lock);
+	if (port->sm_ah)
+		kref_put(&port->sm_ah->ref, free_sm_ah);
+	port->sm_ah = new_ah;
+	spin_unlock_irq(&port->ah_lock);
+}
+
+/*
+ * IB event handler.  For port error/active, LID change, PKey change, SM
+ * change and client re-register events on an InfiniBand link-layer port,
+ * drop the cached SM address handle and schedule update_task to build a
+ * new one.  All other events, and ports with another link layer, are
+ * ignored.
+ */
+static void ib_sa_event(struct ib_event_handler *handler, struct ib_event *event)
+{
+	struct ib_sa_device *sa_dev;
+	struct ib_sa_port *port;
+	unsigned long flags;
+
+	switch (event->event) {
+	case IB_EVENT_PORT_ERR:
+	case IB_EVENT_PORT_ACTIVE:
+	case IB_EVENT_LID_CHANGE:
+	case IB_EVENT_PKEY_CHANGE:
+	case IB_EVENT_SM_CHANGE:
+	case IB_EVENT_CLIENT_REREGISTER:
+		break;
+	default:
+		return;
+	}
+
+	sa_dev = container_of(handler, struct ib_sa_device, event_handler);
+	port = &sa_dev->port[event->element.port_num - sa_dev->start_port];
+
+	if (rdma_port_get_link_layer(handler->device, port->port_num) != IB_LINK_LAYER_INFINIBAND)
+		return;
+
+	/* Invalidate the cached handle; queries fail until it is rebuilt. */
+	spin_lock_irqsave(&port->ah_lock, flags);
+	if (port->sm_ah)
+		kref_put(&port->sm_ah->ref, free_sm_ah);
+	port->sm_ah = NULL;
+	spin_unlock_irqrestore(&port->ah_lock, flags);
+
+	schedule_work(&port->update_task);
+}
+
+/*
+ * Prepare @client for use: its users count starts at 1 and its completion
+ * is initialised.
+ */
+void ib_sa_register_client(struct ib_sa_client *client)
+{
+	atomic_set(&client->users, 1);
+	init_completion(&client->comp);
+}
+EXPORT_SYMBOL(ib_sa_register_client);
+
+/*
+ * Drop one reference on @client and wait on its completion, which
+ * ib_sa_client_put() signals once the users count reaches zero.
+ */
+void ib_sa_unregister_client(struct ib_sa_client *client)
+{
+	ib_sa_client_put(client);
+	wait_for_completion(&client->comp);
+}
+EXPORT_SYMBOL(ib_sa_unregister_client);
+
+/**
+ * ib_sa_cancel_query - try to cancel an SA query
+ * @id:ID of query to cancel
+ * @query:query pointer to cancel
+ *
+ * Looks @id up in the query table and does nothing unless it still maps
+ * to @query (a mismatch means the id is stale or the query has already
+ * completed).  On a match the query's MAD is cancelled, and the query
+ * will complete with a status of -EINTR.
+ */
+void ib_sa_cancel_query(int id, struct ib_sa_query *query)
+{
+	struct ib_mad_send_buf *mad_buf = NULL;
+	struct ib_mad_agent *agent = NULL;
+	unsigned long flags;
+	int found;
+
+	/* Only touch @query while the table lock proves it is still live. */
+	spin_lock_irqsave(&idr_lock, flags);
+	found = (idr_find(&query_idr, id) == query);
+	if (found) {
+		agent = query->port->agent;
+		mad_buf = query->mad_buf;
+	}
+	spin_unlock_irqrestore(&idr_lock, flags);
+
+	if (found)
+		ib_cancel_mad(agent, mad_buf);
+}
+EXPORT_SYMBOL(ib_sa_cancel_query);
+
+/*
+ * Return the source path mask cached with the SM address handle of
+ * @port_num on @device.  Falls back to 0x7f when the device has no SA
+ * client data or the port currently has no SM address handle.
+ */
+static u8 get_src_path_mask(struct ib_device *device, u8 port_num)
+{
+	struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+	struct ib_sa_port *port;
+	unsigned long flags;
+	u8 mask = 0x7f;
+
+	if (!sa_dev)
+		return mask;
+
+	port = &sa_dev->port[port_num - sa_dev->start_port];
+
+	spin_lock_irqsave(&port->ah_lock, flags);
+	if (port->sm_ah)
+		mask = port->sm_ah->src_path_mask;
+	spin_unlock_irqrestore(&port->ah_lock, flags);
+
+	return mask;
+}
+
+/*
+ * Fill @ah_attr from path record @rec for @port_num of @device.
+ *
+ * A GRH is added when the path's hop limit exceeds 1 or the port's link
+ * layer is Ethernet; its source GID index is looked up in the cached GID
+ * table.  Returns 0 on success or the error from ib_find_cached_gid(), in
+ * which case @ah_attr is left partially filled in.
+ */
+int ib_init_ah_from_path(struct ib_device *device, u8 port_num,
+			 struct ib_sa_path_rec *rec, struct ib_ah_attr *ah_attr)
+{
+	int force_grh;
+	u16 gid_index;
+	int ret;
+
+	memset(ah_attr, 0, sizeof *ah_attr);
+	ah_attr->dlid = be16_to_cpu(rec->dlid);
+	ah_attr->sl = rec->sl;
+	ah_attr->src_path_bits = be16_to_cpu(rec->slid) &
+				 get_src_path_mask(device, port_num);
+	ah_attr->port_num = port_num;
+	ah_attr->static_rate = rec->rate;
+
+	force_grh = rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_ETHERNET;
+
+	/* No GRH needed: we are done. */
+	if (rec->hop_limit <= 1 && !force_grh)
+		return 0;
+
+	ah_attr->ah_flags = IB_AH_GRH;
+	ah_attr->grh.dgid = rec->dgid;
+
+	/* port_num is only an output of this lookup from here on. */
+	ret = ib_find_cached_gid(device, &rec->sgid, &port_num, &gid_index);
+	if (ret)
+		return ret;
+
+	ah_attr->grh.sgid_index    = gid_index;
+	ah_attr->grh.flow_label    = be32_to_cpu(rec->flow_label);
+	ah_attr->grh.hop_limit     = rec->hop_limit;
+	ah_attr->grh.traffic_class = rec->traffic_class;
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_init_ah_from_path);
+
+/*
+ * Attach an SM address handle reference and a send MAD buffer to @query.
+ *
+ * Returns 0 on success, -EAGAIN if the port currently has no SM address
+ * handle, or -ENOMEM if the send buffer cannot be created (the reference
+ * taken on the handle is dropped again in that case).
+ */
+static int alloc_mad(struct ib_sa_query *query, gfp_t gfp_mask)
+{
+	struct ib_sa_port *port = query->port;
+	struct ib_sa_sm_ah *sm_ah;
+	unsigned long flags;
+
+	/* Grab a reference on the current handle while it cannot change. */
+	spin_lock_irqsave(&port->ah_lock, flags);
+	sm_ah = port->sm_ah;
+	if (sm_ah)
+		kref_get(&sm_ah->ref);
+	spin_unlock_irqrestore(&port->ah_lock, flags);
+
+	if (!sm_ah)
+		return -EAGAIN;
+
+	query->sm_ah = sm_ah;
+	query->mad_buf = ib_create_send_mad(port->agent, 1,
+					    sm_ah->pkey_index,
+					    0, IB_MGMT_SA_HDR, IB_MGMT_SA_DATA,
+					    gfp_mask);
+	if (IS_ERR(query->mad_buf)) {
+		kref_put(&sm_ah->ref, free_sm_ah);
+		return -ENOMEM;
+	}
+
+	query->mad_buf->ah = sm_ah->ah;
+
+	return 0;
+}
+
+/*
+ * Release what alloc_mad() attached to @query: free the send MAD buffer
+ * and drop the reference on the SM address handle.
+ */
+static void free_mad(struct ib_sa_query *query)
+{
+	ib_free_send_mad(query->mad_buf);
+	kref_put(&query->sm_ah->ref, free_sm_ah);
+}
+
+/*
+ * Zero @mad and fill in the common SA MAD header: base version, SubnAdm
+ * management class, SA class version and a transaction id made of the
+ * agent's hi_tid (upper 32 bits) and the next value of the file-wide tid
+ * counter (lower 32 bits).
+ */
+static void init_mad(struct ib_sa_mad *mad, struct ib_mad_agent *agent)
+{
+	unsigned long flags;
+	u32 seq;
+
+	memset(mad, 0, sizeof *mad);
+
+	/* Only the counter bump needs the lock. */
+	spin_lock_irqsave(&tid_lock, flags);
+	seq = tid++;
+	spin_unlock_irqrestore(&tid_lock, flags);
+
+	mad->mad_hdr.tid           =
+		cpu_to_be64(((u64) agent->hi_tid) << 32 | seq);
+	mad->mad_hdr.class_version = IB_SA_CLASS_VERSION;
+	mad->mad_hdr.mgmt_class    = IB_MGMT_CLASS_SUBN_ADM;
+	mad->mad_hdr.base_version  = IB_MGMT_BASE_VERSION;
+}
+
+/*
+ * Register @query in query_idr and post its MAD.
+ *
+ * Returns the non-negative query id on success.  Returns -ENOMEM if idr
+ * memory cannot be preallocated, or the error from idr_get_new() or
+ * ib_post_send_mad(); when the post fails the id is removed again.
+ */
+static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask)
+{
+	unsigned long flags;
+	int ret, id;
+
+	/* idr_get_new() asks for another preallocation with -EAGAIN. */
+	do {
+		if (!idr_pre_get(&query_idr, gfp_mask))
+			return -ENOMEM;
+
+		spin_lock_irqsave(&idr_lock, flags);
+		ret = idr_get_new(&query_idr, query, &id);
+		spin_unlock_irqrestore(&idr_lock, flags);
+	} while (ret == -EAGAIN);
+
+	if (ret)
+		return ret;
+
+	query->mad_buf->timeout_ms  = timeout_ms;
+	query->mad_buf->context[0] = query;
+	query->id = id;
+
+	ret = ib_post_send_mad(query->mad_buf, NULL);
+	if (ret) {
+		spin_lock_irqsave(&idr_lock, flags);
+		idr_remove(&query_idr, id);
+		spin_unlock_irqrestore(&idr_lock, flags);
+		return ret;
+	}
+
+	/*
+	 * It's not safe to dereference query any more, because the
+	 * send may already have completed and freed the query in
+	 * another context.
+	 */
+	return id;
+}
+
+/*
+ * Unpack the wire-format path record at @attribute into @rec, using
+ * path_rec_table as the field description.
+ */
+void ib_sa_unpack_path(void *attribute, struct ib_sa_path_rec *rec)
+{
+	ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), attribute, rec);
+}
+EXPORT_SYMBOL(ib_sa_unpack_path);
+
+/* Unpack the path record from @mad (if any) and hand it to the user callback. */
+static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query,
+				    int status, struct ib_sa_mad *mad)
+{
+	struct ib_sa_path_query *path_query =
+		container_of(sa_query, struct ib_sa_path_query, sa_query);
+	struct ib_sa_path_rec unpacked;
+	struct ib_sa_path_rec *resp = NULL;
+
+	if (mad) {
+		ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table),
+			  mad->data, &unpacked);
+		resp = &unpacked;
+	}
+	path_query->callback(status, resp, path_query->context);
+}
+
+static void ib_sa_path_rec_release(struct ib_sa_query *sa_query)
+{
+	kfree(container_of(sa_query, struct ib_sa_path_query, sa_query)); /* free the enclosing path query */
+}
+
+/* Send a path record GET to the SA; returns the id from send_mad() or a negative errno. */
+int ib_sa_path_rec_query(struct ib_sa_client *client,
+			 struct ib_device *device, u8 port_num,
+			 struct ib_sa_path_rec *rec,
+			 ib_sa_comp_mask comp_mask,
+			 int timeout_ms, gfp_t gfp_mask,
+			 void (*callback)(int status,
+					  struct ib_sa_path_rec *resp,
+					  void *context),
+			 void *context,
+			 struct ib_sa_query **sa_query)
+{
+	struct ib_sa_path_query *query;
+	struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+	struct ib_sa_port   *port;
+	struct ib_mad_agent *agent;
+	struct ib_sa_mad *mad;
+	int ret;
+
+	if (!sa_dev)
+		return -ENODEV;
+
+	port  = &sa_dev->port[port_num - sa_dev->start_port];
+	agent = port->agent;
+
+	query = kmalloc(sizeof *query, gfp_mask);
+	if (!query)
+		return -ENOMEM;
+
+	query->sa_query.port     = port;
+	ret = alloc_mad(&query->sa_query, gfp_mask);
+	if (ret)
+		goto err1;
+
+	ib_sa_client_get(client);
+	query->sa_query.client = client;
+	query->callback        = callback;
+	query->context         = context;
+
+	mad = query->sa_query.mad_buf->mad;
+	init_mad(mad, agent);
+
+	query->sa_query.callback = callback ? ib_sa_path_rec_callback : NULL;
+	query->sa_query.release  = ib_sa_path_rec_release;
+	mad->mad_hdr.method	 = IB_MGMT_METHOD_GET;
+	mad->mad_hdr.attr_id	 = cpu_to_be16(IB_SA_ATTR_PATH_REC);
+	mad->sa_hdr.comp_mask	 = comp_mask;
+
+	ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), rec, mad->data);
+	/* Set the handle first: once sent, the query may complete and be freed. */
+	*sa_query = &query->sa_query;
+	ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
+	if (ret < 0)
+		goto err2;
+
+	return ret;
+
+err2:
+	*sa_query = NULL;
+	ib_sa_client_put(query->sa_query.client);
+	free_mad(&query->sa_query);
+
+err1:
+	kfree(query);
+	return ret;
+}
+
+/* Unpack the service record from @mad (if any) and hand it to the user callback. */
+static void ib_sa_service_rec_callback(struct ib_sa_query *sa_query,
+				       int status, struct ib_sa_mad *mad)
+{
+	struct ib_sa_service_query *svc_query =
+		container_of(sa_query, struct ib_sa_service_query, sa_query);
+	struct ib_sa_service_rec unpacked;
+	struct ib_sa_service_rec *resp = NULL;
+
+	if (mad) {
+		ib_unpack(service_rec_table, ARRAY_SIZE(service_rec_table),
+			  mad->data, &unpacked);
+		resp = &unpacked;
+	}
+	svc_query->callback(status, resp, svc_query->context);
+}
+
+static void ib_sa_service_rec_release(struct ib_sa_query *sa_query)
+{
+	kfree(container_of(sa_query, struct ib_sa_service_query, sa_query)); /* free the enclosing service query */
+}
+
+/**
+ * ib_sa_service_rec_query - Start Service Record operation
+ * @client:SA client
+ * @device:device to send request on
+ * @port_num: port number to send request on
+ * @method:SA method - should be get, set, or delete
+ * @rec:Service Record to send in request
+ * @comp_mask:component mask to send in request
+ * @timeout_ms:time to wait for response
+ * @gfp_mask:GFP mask to use for internal allocations
+ * @callback:function called when request completes, times out or is
+ * canceled
+ * @context:opaque user context passed to callback
+ * @sa_query:request context, used to cancel request
+ *
+ * Send a Service Record set/get/delete to the SA to register,
+ * unregister or query a service record.
+ * The callback function will be called when the request completes (or
+ * fails); status is 0 for a successful response, -EINTR if the query
+ * is canceled, -ETIMEDOUT if the query timed out, or -EIO if an error
+ * occurred sending the query.  The resp parameter of the callback is
+ * only valid if status is 0.
+ *
+ * If the return value of ib_sa_service_rec_query() is negative, it is an
+ * error code.  Otherwise it is a request ID that can be used to cancel
+ * the query.
+ */
+int ib_sa_service_rec_query(struct ib_sa_client *client,
+			    struct ib_device *device, u8 port_num, u8 method,
+			    struct ib_sa_service_rec *rec,
+			    ib_sa_comp_mask comp_mask,
+			    int timeout_ms, gfp_t gfp_mask,
+			    void (*callback)(int status,
+					     struct ib_sa_service_rec *resp,
+					     void *context),
+			    void *context,
+			    struct ib_sa_query **sa_query)
+{
+	struct ib_sa_service_query *query;
+	struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+	struct ib_sa_port   *port;
+	struct ib_mad_agent *agent;
+	struct ib_sa_mad *mad;
+	int ret;
+
+	if (!sa_dev)
+		return -ENODEV;
+
+	port  = &sa_dev->port[port_num - sa_dev->start_port];
+	agent = port->agent;
+
+	if (method != IB_MGMT_METHOD_GET &&
+	    method != IB_MGMT_METHOD_SET &&
+	    method != IB_SA_METHOD_DELETE)
+		return -EINVAL;
+
+	query = kmalloc(sizeof *query, gfp_mask);
+	if (!query)
+		return -ENOMEM;
+
+	query->sa_query.port     = port;
+	ret = alloc_mad(&query->sa_query, gfp_mask);
+	if (ret)
+		goto err1;
+
+	ib_sa_client_get(client);
+	query->sa_query.client = client;
+	query->callback        = callback;
+	query->context         = context;
+
+	mad = query->sa_query.mad_buf->mad;
+	init_mad(mad, agent);
+
+	query->sa_query.callback = callback ? ib_sa_service_rec_callback : NULL;
+	query->sa_query.release  = ib_sa_service_rec_release;
+	mad->mad_hdr.method	 = method;
+	mad->mad_hdr.attr_id	 = cpu_to_be16(IB_SA_ATTR_SERVICE_REC);
+	mad->sa_hdr.comp_mask	 = comp_mask;
+
+	ib_pack(service_rec_table, ARRAY_SIZE(service_rec_table),
+		rec, mad->data);
+	/* Set the handle first: once sent, the query may complete and be freed. */
+	*sa_query = &query->sa_query;
+
+	ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
+	if (ret < 0)
+		goto err2;
+
+	return ret;
+
+err2:
+	*sa_query = NULL;
+	ib_sa_client_put(query->sa_query.client);
+	free_mad(&query->sa_query);
+
+err1:
+	kfree(query);
+	return ret;
+}
+EXPORT_SYMBOL(ib_sa_service_rec_query);
+
+/* Unpack the multicast member record from @mad (if any) and hand it to the user callback. */
+static void ib_sa_mcmember_rec_callback(struct ib_sa_query *sa_query,
+					int status, struct ib_sa_mad *mad)
+{
+	struct ib_sa_mcmember_query *mc_query =
+		container_of(sa_query, struct ib_sa_mcmember_query, sa_query);
+	struct ib_sa_mcmember_rec unpacked;
+	struct ib_sa_mcmember_rec *resp = NULL;
+
+	if (mad) {
+		ib_unpack(mcmember_rec_table, ARRAY_SIZE(mcmember_rec_table),
+			  mad->data, &unpacked);
+		resp = &unpacked;
+	}
+	mc_query->callback(status, resp, mc_query->context);
+}
+
+static void ib_sa_mcmember_rec_release(struct ib_sa_query *sa_query)
+{
+	kfree(container_of(sa_query, struct ib_sa_mcmember_query, sa_query)); /* free the enclosing mcmember query */
+}
+
+/* Send a multicast member record request using @method; returns the id from send_mad() or a negative errno. */
+int ib_sa_mcmember_rec_query(struct ib_sa_client *client,
+			     struct ib_device *device, u8 port_num,
+			     u8 method,
+			     struct ib_sa_mcmember_rec *rec,
+			     ib_sa_comp_mask comp_mask,
+			     int timeout_ms, gfp_t gfp_mask,
+			     void (*callback)(int status,
+					      struct ib_sa_mcmember_rec *resp,
+					      void *context),
+			     void *context,
+			     struct ib_sa_query **sa_query)
+{
+	struct ib_sa_mcmember_query *query;
+	struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+	struct ib_sa_port   *port;
+	struct ib_mad_agent *agent;
+	struct ib_sa_mad *mad;
+	int ret;
+
+	if (!sa_dev)
+		return -ENODEV;
+
+	port  = &sa_dev->port[port_num - sa_dev->start_port];
+	agent = port->agent;
+
+	query = kmalloc(sizeof *query, gfp_mask);
+	if (!query)
+		return -ENOMEM;
+
+	query->sa_query.port     = port;
+	ret = alloc_mad(&query->sa_query, gfp_mask);
+	if (ret)
+		goto err1;
+
+	ib_sa_client_get(client);
+	query->sa_query.client = client;
+	query->callback        = callback;
+	query->context         = context;
+
+	mad = query->sa_query.mad_buf->mad;
+	init_mad(mad, agent);
+
+	query->sa_query.callback = callback ? ib_sa_mcmember_rec_callback : NULL;
+	query->sa_query.release  = ib_sa_mcmember_rec_release;
+	mad->mad_hdr.method	 = method;
+	mad->mad_hdr.attr_id	 = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC);
+	mad->sa_hdr.comp_mask	 = comp_mask;
+
+	ib_pack(mcmember_rec_table, ARRAY_SIZE(mcmember_rec_table),
+		rec, mad->data);
+	/* Set the handle first: once sent, the query may complete and be freed. */
+	*sa_query = &query->sa_query;
+	ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
+	if (ret < 0)
+		goto err2;
+
+	return ret;
+
+err2:
+	*sa_query = NULL;
+	ib_sa_client_put(query->sa_query.client);
+	free_mad(&query->sa_query);
+
+err1:
+	kfree(query);
+	return ret;
+}
+
+/* Unpack the InformInfo from @mad (if any) and hand it to the user callback. */
+static void ib_sa_inform_callback(struct ib_sa_query *sa_query,
+				  int status, struct ib_sa_mad *mad)
+{
+	struct ib_sa_inform_query *inform_query =
+		container_of(sa_query, struct ib_sa_inform_query, sa_query);
+	struct ib_sa_inform unpacked;
+	struct ib_sa_inform *resp = NULL;
+
+	if (mad) {
+		ib_unpack(inform_table, ARRAY_SIZE(inform_table),
+			  mad->data, &unpacked);
+		resp = &unpacked;
+	}
+	inform_query->callback(status, resp, inform_query->context);
+}
+
+static void ib_sa_inform_release(struct ib_sa_query *sa_query)
+{
+	kfree(container_of(sa_query, struct ib_sa_inform_query, sa_query)); /* free the enclosing inform query */
+}
+
+/**
+ * ib_sa_informinfo_query - Start an InformInfo registration.
+ * @client:SA client
+ * @device:device to send query on
+ * @port_num: port number to send query on
+ * @rec:Inform record to send in query
+ * @timeout_ms:time to wait for response
+ * @gfp_mask:GFP mask to use for internal allocations
+ * @callback:function called when notice handler registration completes,
+ * times out or is canceled
+ * @context:opaque user context passed to callback
+ * @sa_query:query context, used to cancel query
+ *
+ * This function sends inform info to register with SA to receive
+ * in-service notice.
+ * The callback function will be called when the query completes (or
+ * fails); status is 0 for a successful response, -EINTR if the query
+ * is canceled, -ETIMEDOUT if the query timed out, or -EIO if an error
+ * occurred sending the query.  The resp parameter of the callback is
+ * only valid if status is 0.
+ *
+ * If the return value of ib_sa_informinfo_query() is negative, it is an
+ * error code.  Otherwise it is a query ID that can be used to cancel
+ * the query.
+ */
+int ib_sa_informinfo_query(struct ib_sa_client *client,
+			   struct ib_device *device, u8 port_num,
+			   struct ib_sa_inform *rec,
+			   int timeout_ms, gfp_t gfp_mask,
+			   void (*callback)(int status,
+					   struct ib_sa_inform *resp,
+					   void *context),
+			   void *context,
+			   struct ib_sa_query **sa_query)
+{
+	struct ib_sa_inform_query *query;
+	struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+	struct ib_sa_port   *port;
+	struct ib_mad_agent *agent;
+	struct ib_sa_mad *mad;
+	int ret;
+
+	if (!sa_dev)
+		return -ENODEV;
+
+	port  = &sa_dev->port[port_num - sa_dev->start_port];
+	agent = port->agent;
+
+	query = kmalloc(sizeof *query, gfp_mask);
+	if (!query)
+		return -ENOMEM;
+
+	query->sa_query.port     = port;
+	ret = alloc_mad(&query->sa_query, gfp_mask);
+	if (ret)
+		goto err1;
+
+	ib_sa_client_get(client);
+	query->sa_query.client = client;
+	query->callback = callback;
+	query->context  = context;
+
+	mad = query->sa_query.mad_buf->mad;
+	init_mad(mad, agent);
+
+	query->sa_query.callback = callback ? ib_sa_inform_callback : NULL;
+	query->sa_query.release  = ib_sa_inform_release;
+	query->sa_query.port     = port;	/* NOTE(review): already set before alloc_mad(); this store looks redundant */
+	mad->mad_hdr.method	 = IB_MGMT_METHOD_SET;
+	mad->mad_hdr.attr_id	 = cpu_to_be16(IB_SA_ATTR_INFORM_INFO);
+
+	ib_pack(inform_table, ARRAY_SIZE(inform_table), rec, mad->data);
+	/* Set the handle first: once sent, the query may complete and be freed. */
+	*sa_query = &query->sa_query;
+	ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
+	if (ret < 0)
+		goto err2;
+
+	return ret;
+
+err2:
+	*sa_query = NULL;
+	ib_sa_client_put(query->sa_query.client);
+	free_mad(&query->sa_query);
+err1:
+	kfree(query);
+	return ret;
+}
+
+/* Answer a received notice MAD with a ReportResp sent through the port's sm_ah. */
+static void ib_sa_notice_resp(struct ib_sa_port *port,
+			      struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_mad_send_buf *resp_buf;
+	struct ib_sa_mad *resp;
+	unsigned long irq_flags;
+
+	resp_buf = ib_create_send_mad(port->notice_agent, 1, 0, 0,
+				      IB_MGMT_SA_HDR, IB_MGMT_SA_DATA,
+				      GFP_KERNEL);
+	if (IS_ERR(resp_buf))
+		return;
+
+	/* The response is a copy of the received MAD with only the method changed. */
+	resp = resp_buf->mad;
+	memcpy(resp, mad_recv_wc->recv_buf.mad, sizeof(*resp));
+	resp->mad_hdr.method = IB_MGMT_METHOD_REPORT_RESP;
+
+	spin_lock_irqsave(&port->ah_lock, irq_flags);
+	if (!port->sm_ah) {
+		spin_unlock_irqrestore(&port->ah_lock, irq_flags);
+		ib_free_send_mad(resp_buf);
+		return;
+	}
+	/*
+	 * Hold sm_ah across the send; notice_resp_handler() or the failure path puts it.
+	 */
+	kref_get(&port->sm_ah->ref);
+	resp_buf->context[0] = &port->sm_ah->ref;
+	resp_buf->ah = port->sm_ah->ah;
+	spin_unlock_irqrestore(&port->ah_lock, irq_flags);
+
+	if (ib_post_send_mad(resp_buf, NULL)) {
+		kref_put(resp_buf->context[0], free_sm_ah);
+		ib_free_send_mad(resp_buf);
+	}
+}
+
+/* Send-completion handler: report send failures to the query, then tear it down. */
+static void send_handler(struct ib_mad_agent *agent,
+			 struct ib_mad_send_wc *mad_send_wc)
+{
+	struct ib_sa_query *query = mad_send_wc->send_buf->context[0];
+	unsigned long irq_flags;
+	int err = 0;
+
+	switch (mad_send_wc->status) {
+	case IB_WC_SUCCESS:
+		break;		/* No callback -- already got recv */
+	case IB_WC_RESP_TIMEOUT_ERR:
+		err = -ETIMEDOUT;
+		break;
+	case IB_WC_WR_FLUSH_ERR:
+		err = -EINTR;
+		break;
+	default:
+		err = -EIO;
+	}
+	if (err && query->callback)
+		query->callback(query, err, NULL);
+
+	spin_lock_irqsave(&idr_lock, irq_flags);
+	idr_remove(&query_idr, query->id);
+	spin_unlock_irqrestore(&idr_lock, irq_flags);
+	free_mad(query);
+	ib_sa_client_put(query->client);
+	query->release(query);
+}
+
+/* Receive handler: pass the SA response (or -EIO) to the query that sent the request. */
+static void recv_handler(struct ib_mad_agent *mad_agent,
+			 struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_mad_send_buf *send_buf =
+		(void *) (unsigned long) mad_recv_wc->wc->wr_id;
+	struct ib_sa_query *query = send_buf->context[0];
+
+	if (!query->callback)
+		goto out;
+
+	if (mad_recv_wc->wc->status != IB_WC_SUCCESS)
+		query->callback(query, -EIO, NULL);
+	else	/* any non-zero MAD status is reported as -EINVAL */
+		query->callback(query,
+				mad_recv_wc->recv_buf.mad->mad_hdr.status ?
+				-EINVAL : 0,
+				(struct ib_sa_mad *) mad_recv_wc->recv_buf.mad);
+out:
+	ib_free_recv_mad(mad_recv_wc);
+}
+
+static void notice_resp_handler(struct ib_mad_agent *agent,
+				struct ib_mad_send_wc *mad_send_wc)
+{
+	kref_put(mad_send_wc->send_buf->context[0], free_sm_ah); /* ref taken in ib_sa_notice_resp() */
+	ib_free_send_mad(mad_send_wc->send_buf);
+}
+
+/* Receive handler of the notice agent: unpack, dispatch, and respond if required. */
+static void notice_handler(struct ib_mad_agent *mad_agent,
+			   struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_sa_port *sa_port = mad_agent->context;
+	struct ib_sa_mad *sa_mad = (struct ib_sa_mad *) mad_recv_wc->recv_buf.mad;
+	struct ib_sa_notice unpacked;
+
+	ib_unpack(notice_table, ARRAY_SIZE(notice_table), sa_mad->data,
+		  &unpacked);
+	/* A ReportResp is sent only when notice_dispatch() returns zero. */
+	if (!notice_dispatch(sa_port->device, sa_port->port_num, &unpacked))
+		ib_sa_notice_resp(sa_port, mad_recv_wc);
+	ib_free_recv_mad(mad_recv_wc);
+}
+
+/* Set up per-port SA state and register the SA MAD agents for @device. */
+static void ib_sa_add_one(struct ib_device *device)
+{
+	struct ib_sa_device *sa_dev;
+	struct ib_mad_reg_req reg_req = {
+		.mgmt_class = IB_MGMT_CLASS_SUBN_ADM,
+		.mgmt_class_version = 2
+	};
+	int s, e, i;
+
+	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+		return;
+
+	if (device->node_type == RDMA_NODE_IB_SWITCH)
+		s = e = 0;
+	else {
+		s = 1;
+		e = device->phys_port_cnt;
+	}
+
+	sa_dev = kzalloc(sizeof *sa_dev +
+			 (e - s + 1) * sizeof (struct ib_sa_port),
+			 GFP_KERNEL);
+	if (!sa_dev)
+		return;
+
+	sa_dev->start_port = s;
+	sa_dev->end_port   = e;
+
+	for (i = 0; i <= e - s; ++i) {
+		spin_lock_init(&sa_dev->port[i].ah_lock);
+		if (rdma_port_get_link_layer(device, i + 1) != IB_LINK_LAYER_INFINIBAND)
+			continue;
+
+		sa_dev->port[i].sm_ah    = NULL;
+		sa_dev->port[i].port_num = i + s;
+
+		sa_dev->port[i].agent =
+			ib_register_mad_agent(device, i + s, IB_QPT_GSI,
+					      NULL, 0, send_handler,
+					      recv_handler, sa_dev);
+		if (IS_ERR(sa_dev->port[i].agent))
+			goto err;
+
+		sa_dev->port[i].device = device;
+		set_bit(IB_MGMT_METHOD_REPORT, reg_req.method_mask);
+		sa_dev->port[i].notice_agent =
+			ib_register_mad_agent(device, i + s, IB_QPT_GSI,
+					      &reg_req, 0, notice_resp_handler,
+					      notice_handler, &sa_dev->port[i]);
+		if (IS_ERR(sa_dev->port[i].notice_agent)) {
+			/* The unwind at err: starts below i, so drop this port's agent here. */
+			ib_unregister_mad_agent(sa_dev->port[i].agent);
+			goto err;
+		}
+		INIT_WORK(&sa_dev->port[i].update_task, update_sm_ah);
+	}
+
+	ib_set_client_data(device, &sa_client, sa_dev);
+
+	/*
+	 * We register our event handler after everything is set up,
+	 * and then update our cached info after the event handler is
+	 * registered to avoid any problems if a port changes state
+	 * during our initialization.
+	 */
+	INIT_IB_EVENT_HANDLER(&sa_dev->event_handler, device, ib_sa_event);
+	if (ib_register_event_handler(&sa_dev->event_handler)) {
+		/* sa_dev is freed below; don't leave client data pointing at it. */
+		ib_set_client_data(device, &sa_client, NULL);
+		goto err;
+	}
+
+	for (i = 0; i <= e - s; ++i)
+		if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND)
+			update_sm_ah(&sa_dev->port[i].update_task);
+
+	return;
+
+err:
+	/* Every InfiniBand port below i has both agents registered. */
+	while (--i >= 0)
+		if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND) {
+			ib_unregister_mad_agent(sa_dev->port[i].notice_agent);
+			ib_unregister_mad_agent(sa_dev->port[i].agent);
+		}
+	kfree(sa_dev);
+}
+
+/* Tear down what ib_sa_add_one() set up for @device, if anything. */
+static void ib_sa_remove_one(struct ib_device *device)
+{
+	struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+	int idx;
+
+	if (!sa_dev)
+		return;
+
+	ib_unregister_event_handler(&sa_dev->event_handler);
+	flush_scheduled_work();
+
+	for (idx = 0; idx <= sa_dev->end_port - sa_dev->start_port; ++idx) {
+		struct ib_sa_port *port = &sa_dev->port[idx];
+
+		if (rdma_port_get_link_layer(device, idx + 1) != IB_LINK_LAYER_INFINIBAND)
+			continue;
+		ib_unregister_mad_agent(port->notice_agent);
+		ib_unregister_mad_agent(port->agent);
+		if (port->sm_ah)
+			kref_put(&port->sm_ah->ref, free_sm_ah);
+	}
+	kfree(sa_dev);
+}
+
+/* Module init: register the SA client, then bring up multicast, notice and local SA handling. */
+static int __init ib_sa_init(void)
+{
+	int err;
+
+	spin_lock_init(&idr_lock);
+	spin_lock_init(&tid_lock);
+
+	get_random_bytes(&tid, sizeof tid);
+
+	err = ib_register_client(&sa_client);
+	if (err) {
+		printk(KERN_ERR "Couldn't register ib_sa client\n");
+		return err;
+	}
+
+	err = mcast_init();
+	if (err) {
+		printk(KERN_ERR "Couldn't initialize multicast handling\n");
+		goto unregister_client;
+	}
+
+	err = notice_init();
+	if (err) {
+		printk(KERN_ERR "Couldn't initialize notice handling\n");
+		goto cleanup_mcast;
+	}
+
+	err = sa_db_init();
+	if (err) {
+		printk(KERN_ERR "Couldn't initialize local SA\n");
+		goto cleanup_notice;
+	}
+
+	return 0;
+cleanup_notice:
+	notice_cleanup();
+cleanup_mcast:
+	mcast_cleanup();
+unregister_client:
+	ib_unregister_client(&sa_client);
+	return err;
+}
+
+static void __exit ib_sa_cleanup(void)
+{
+	sa_db_cleanup();			/* counterpart of sa_db_init() */
+	mcast_cleanup();			/* NOTE(review): ib_sa_init() unwinds notice before mcast -- confirm the order does not matter */
+	notice_cleanup();			/* counterpart of notice_init() */
+	ib_unregister_client(&sa_client);	/* counterpart of ib_register_client() */
+	idr_destroy(&query_idr);
+}
+
+module_init_order(ib_sa_init, SI_ORDER_SECOND);
+module_exit(ib_sa_cleanup);
diff --git a/sys/ofed/drivers/infiniband/core/smi.c b/sys/ofed/drivers/infiniband/core/smi.c
new file mode 100644
index 0000000..8723675
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/smi.c
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2004, 2005 Mellanox Technologies Ltd.  All rights reserved.
+ * Copyright (c) 2004, 2005 Infinicon Corporation.  All rights reserved.
+ * Copyright (c) 2004, 2005 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004, 2005 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004-2007 Voltaire Corporation.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <rdma/ib_smi.h>
+#include "smi.h"
+
+/*
+ * Fixup a directed route SMP for sending
+ * Return IB_SMI_DISCARD if the SMP should be discarded (incl. hop_cnt > 63)
+ */
+enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp,
+				       u8 node_type, int port_num)
+{
+	u8 hop_ptr = smp->hop_ptr;
+	u8 hop_cnt = smp->hop_cnt;
+
+	/* See section 14.2.2.2, Vol 1 IB spec */
+	/* C14-6 -- valid hop_cnt values are from 0 to 63 */
+	if (hop_cnt >= IB_SMP_MAX_PATH_HOPS)
+		return IB_SMI_DISCARD;
+
+	if (!ib_get_smp_direction(smp)) {
+		/* C14-9:1 */
+		if (hop_cnt && hop_ptr == 0) {
+			smp->hop_ptr++;
+			return (smp->initial_path[smp->hop_ptr] ==
+				port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+		}
+
+		/* C14-9:2 */
+		if (hop_ptr && hop_ptr < hop_cnt) {
+			if (node_type != RDMA_NODE_IB_SWITCH)
+				return IB_SMI_DISCARD;
+
+			/* smp->return_path set when received */
+			smp->hop_ptr++;
+			return (smp->initial_path[smp->hop_ptr] ==
+				port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+		}
+
+		/* C14-9:3 -- We're at the end of the DR segment of path */
+		if (hop_ptr == hop_cnt) {
+			/* smp->return_path set when received */
+			smp->hop_ptr++;
+			return (node_type == RDMA_NODE_IB_SWITCH ||
+				smp->dr_dlid == IB_LID_PERMISSIVE ?
+				IB_SMI_HANDLE : IB_SMI_DISCARD);
+		}
+
+		/* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */
+		/* C14-9:5 -- Fail unreasonable hop pointer */
+		return (hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+	} else {
+		/* C14-13:1 */
+		if (hop_cnt && hop_ptr == hop_cnt + 1) {
+			smp->hop_ptr--;
+			return (smp->return_path[smp->hop_ptr] ==
+				port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+		}
+
+		/* C14-13:2 */
+		if (2 <= hop_ptr && hop_ptr <= hop_cnt) {
+			if (node_type != RDMA_NODE_IB_SWITCH)
+				return IB_SMI_DISCARD;
+
+			smp->hop_ptr--;
+			return (smp->return_path[smp->hop_ptr] ==
+				port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+		}
+
+		/* C14-13:3 -- at the end of the DR segment of path */
+		if (hop_ptr == 1) {
+			smp->hop_ptr--;
+			/* C14-13:3 -- SMPs destined for SM shouldn't be here */
+			return (node_type == RDMA_NODE_IB_SWITCH ||
+				smp->dr_slid == IB_LID_PERMISSIVE ?
+				IB_SMI_HANDLE : IB_SMI_DISCARD);
+		}
+
+		/* C14-13:4 -- hop_ptr = 0 -> should have gone to SM */
+		if (hop_ptr == 0)
+			return IB_SMI_HANDLE;
+		/* C14-13:5 -- Check for unreasonable hop pointer */
+		return IB_SMI_DISCARD;
+	}
+}
+
+/*
+ * Adjust information for a received SMP
+ * Return IB_SMI_DISCARD if the SMP should be dropped (incl. hop_cnt > 63)
+ */
+enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, u8 node_type,
+				       int port_num, int phys_port_cnt)
+{
+	u8 hop_ptr = smp->hop_ptr;
+	u8 hop_cnt = smp->hop_cnt;
+
+	/* See section 14.2.2.2, Vol 1 IB spec */
+	/* C14-6 -- valid hop_cnt values are from 0 to 63 */
+	if (hop_cnt >= IB_SMP_MAX_PATH_HOPS)
+		return IB_SMI_DISCARD;
+
+	if (!ib_get_smp_direction(smp)) {
+		/* C14-9:1 -- sender should have incremented hop_ptr */
+		if (hop_cnt && hop_ptr == 0)
+			return IB_SMI_DISCARD;
+
+		/* C14-9:2 -- intermediate hop */
+		if (hop_ptr && hop_ptr < hop_cnt) {
+			if (node_type != RDMA_NODE_IB_SWITCH)
+				return IB_SMI_DISCARD;
+
+			smp->return_path[hop_ptr] = port_num;
+			/* smp->hop_ptr updated when sending */
+			return (smp->initial_path[hop_ptr+1] <= phys_port_cnt ?
+				IB_SMI_HANDLE : IB_SMI_DISCARD);
+		}
+
+		/* C14-9:3 -- We're at the end of the DR segment of path */
+		if (hop_ptr == hop_cnt) {
+			if (hop_cnt)
+				smp->return_path[hop_ptr] = port_num;
+			/* smp->hop_ptr updated when sending */
+
+			return (node_type == RDMA_NODE_IB_SWITCH ||
+				smp->dr_dlid == IB_LID_PERMISSIVE ?
+				IB_SMI_HANDLE : IB_SMI_DISCARD);
+		}
+
+		/* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */
+		/* C14-9:5 -- fail unreasonable hop pointer */
+		return (hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+	} else {
+		/* C14-13:1 */
+		if (hop_cnt && hop_ptr == hop_cnt + 1) {
+			smp->hop_ptr--;
+			return (smp->return_path[smp->hop_ptr] ==
+				port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+		}
+
+		/* C14-13:2 */
+		if (2 <= hop_ptr && hop_ptr <= hop_cnt) {
+			if (node_type != RDMA_NODE_IB_SWITCH)
+				return IB_SMI_DISCARD;
+
+			/* smp->hop_ptr updated when sending */
+			return (smp->return_path[hop_ptr-1] <= phys_port_cnt ?
+				IB_SMI_HANDLE : IB_SMI_DISCARD);
+		}
+
+		/* C14-13:3 -- We're at the end of the DR segment of path */
+		if (hop_ptr == 1) {
+			if (smp->dr_slid == IB_LID_PERMISSIVE) {
+				/* giving SMP to SM - update hop_ptr */
+				smp->hop_ptr--;
+				return IB_SMI_HANDLE;
+			}
+			/* smp->hop_ptr updated when sending */
+			return (node_type == RDMA_NODE_IB_SWITCH ?
+				IB_SMI_HANDLE : IB_SMI_DISCARD);
+		}
+
+		/* C14-13:4 -- hop_ptr = 0 -> give to SM */
+		/* C14-13:5 -- Check for unreasonable hop pointer */
+		return (hop_ptr == 0 ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+	}
+}
+
+/* Decide whether a DR SMP is forwarded, passed to the send queue, or completed locally. */
+enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp)
+{
+	u8 hop_ptr = smp->hop_ptr;
+	u8 hop_cnt = smp->hop_cnt;
+
+	if (ib_get_smp_direction(smp)) {
+		/* C14-13:2  -- intermediate hop */
+		if (2 <= hop_ptr && hop_ptr <= hop_cnt)
+			return IB_SMI_FORWARD;
+		/* C14-13:3 -- at the end of the DR segment of path */
+		if (hop_ptr == 1 && smp->dr_slid != IB_LID_PERMISSIVE)
+			return IB_SMI_SEND;
+		return IB_SMI_LOCAL;
+	}
+
+	/* C14-9:2 -- intermediate hop */
+	if (hop_ptr && hop_ptr < hop_cnt)
+		return IB_SMI_FORWARD;
+
+	/* C14-9:3 -- at the end of the DR segment of path */
+	if (hop_ptr == hop_cnt) {
+		if (smp->dr_dlid == IB_LID_PERMISSIVE)
+			return IB_SMI_SEND;
+		return IB_SMI_LOCAL;
+	}
+
+	/* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */
+	if (hop_ptr == hop_cnt + 1)
+		return IB_SMI_SEND;
+	return IB_SMI_LOCAL;
+}
+
+/*
+ * Forwarding port: initial_path[hop_ptr+1] when outgoing, return_path[hop_ptr-1] when returning.
+ */
+int smi_get_fwd_port(struct ib_smp *smp)
+{
+	if (ib_get_smp_direction(smp))
+		return smp->return_path[smp->hop_ptr-1];
+	return smp->initial_path[smp->hop_ptr+1];
+}
diff --git a/sys/ofed/drivers/infiniband/core/smi.h b/sys/ofed/drivers/infiniband/core/smi.h
new file mode 100644
index 0000000..aff96ba
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/smi.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd.  All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation.  All rights reserved.
+ * Copyright (c) 2004 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004-2007 Voltaire Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef __SMI_H_
+#define __SMI_H_
+
+#include <rdma/ib_smi.h>
+
+enum smi_action {	/* verdict of the smi_handle_dr_smp_*() and smi_check_local_*() checks */
+	IB_SMI_DISCARD,	/* SMP should be dropped */
+	IB_SMI_HANDLE	/* SMP passed the check and should be processed */
+};
+
+enum smi_forward_action {	/* result of smi_check_forward_dr_smp() */
+	IB_SMI_LOCAL,	/* SMP should be completed up the stack */
+	IB_SMI_SEND,	/* received DR SMP should be forwarded to the send queue */
+	IB_SMI_FORWARD	/* SMP should be forwarded (for switches only) */
+};
+
+enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, u8 node_type,
+				       int port_num, int phys_port_cnt);
+int smi_get_fwd_port(struct ib_smp *smp);
+extern enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp);
+extern enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp,
+					      u8 node_type, int port_num);
+
+/*
+ * Return IB_SMI_HANDLE if the outgoing SMP should be handled by the
+ * local SMA/SM via process_mad
+ */
+static inline enum smi_action smi_check_local_smp(struct ib_smp *smp,
+						  struct ib_device *device)
+{
+	/* C14-9:3 -- We're at the end of the DR segment of path */
+	/* C14-9:4 -- Hop Pointer = Hop Count + 1 -> give to SMA/SM */
+	if (!device->process_mad || ib_get_smp_direction(smp))
+		return IB_SMI_DISCARD;
+	return (smp->hop_ptr == smp->hop_cnt + 1 ?
+		IB_SMI_HANDLE : IB_SMI_DISCARD);
+}
+
+/*
+ * Return IB_SMI_HANDLE if the returning SMP should be handled by the
+ * local SMA/SM via process_mad
+ */
+static inline enum smi_action smi_check_local_returning_smp(struct ib_smp *smp,
+						   struct ib_device *device)
+{
+	/* C14-13:3 -- We're at the end of the DR segment of path */
+	/* C14-13:4 -- Hop Pointer == 0 -> give to SM */
+	if (!device->process_mad || !ib_get_smp_direction(smp))
+		return IB_SMI_DISCARD;
+	return smp->hop_ptr ? IB_SMI_DISCARD : IB_SMI_HANDLE;
+}
+
+#endif	/* __SMI_H_ */
diff --git a/sys/ofed/drivers/infiniband/core/sysfs.c b/sys/ofed/drivers/infiniband/core/sysfs.c
new file mode 100644
index 0000000..a406406
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/sysfs.c
@@ -0,0 +1,911 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies Ltd.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "core_priv.h"
+
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include <rdma/ib_mad.h>
+
+/*
+ * sysfs representation of one port of an IB device.  The embedded kobject
+ * is what gets registered (the owning ib_port is recovered from it with
+ * container_of()); the two attribute groups hold the per-port "gids" and
+ * "pkeys" tables that add_port() allocates.
+ */
+struct ib_port {
+	struct kobject         kobj;
+	struct ib_device      *ibdev;
+	struct attribute_group gid_group;
+	struct attribute_group pkey_group;
+	u8                     port_num;
+};
+
+/*
+ * A sysfs attribute of an ib_port with callbacks that take the port
+ * directly.  port_attr_show() recovers this structure from the generic
+ * struct attribute and dispatches to ->show.
+ */
+struct port_attribute {
+	struct attribute attr;
+	ssize_t (*show)(struct ib_port *, struct port_attribute *, char *buf);
+	ssize_t (*store)(struct ib_port *, struct port_attribute *,
+			 const char *buf, size_t count);
+};
+
+/*
+ * Define a struct port_attribute called port_attr_<name>.  PORT_ATTR takes
+ * the mode and both handlers explicitly; PORT_ATTR_RO leaves them to
+ * __ATTR_RO (the read-only attributes in this file pair PORT_ATTR_RO(x)
+ * with a handler named x_show).
+ */
+#define PORT_ATTR(_name, _mode, _show, _store) \
+struct port_attribute port_attr_##_name = __ATTR(_name, _mode, _show, _store)
+
+#define PORT_ATTR_RO(_name) \
+struct port_attribute port_attr_##_name = __ATTR_RO(_name)
+
+/*
+ * Port attribute that carries an integer payload.
+ *   name  - storage for the attribute name of gid/pkey table entries
+ *           (filled in by alloc_group_attrs())
+ *   index - table index for gid/pkey entries; for PMA counters the packed
+ *           offset/width/counter value built by PORT_PMA_ATTR
+ */
+struct port_table_attribute {
+	struct port_attribute	attr;
+	char			name[8];
+	int			index;
+};
+
+/*
+ * Generic sysfs show entry point for port kobjects: map the kobject and
+ * attribute back to their ib_port / port_attribute containers and call the
+ * attribute's own show handler.  Returns -EIO if the attribute has none.
+ */
+static ssize_t port_attr_show(struct kobject *kobj,
+			      struct attribute *attr, char *buf)
+{
+	struct ib_port *port = container_of(kobj, struct ib_port, kobj);
+	struct port_attribute *pattr;
+
+	pattr = container_of(attr, struct port_attribute, attr);
+	if (pattr->show == NULL)
+		return -EIO;
+
+	return pattr->show(port, pattr, buf);
+}
+
+/* sysfs operations for port kobjects; only a show handler is provided. */
+static const struct sysfs_ops port_sysfs_ops = {
+	.show = port_attr_show
+};
+
+/*
+ * Print the port state as "<number>: <name>", using "UNKNOWN" for values
+ * outside the name table.  Returns the ib_query_port() error on failure.
+ */
+static ssize_t state_show(struct ib_port *p, struct port_attribute *unused,
+			  char *buf)
+{
+	static const char *state_name[] = {
+		[IB_PORT_NOP]		= "NOP",
+		[IB_PORT_DOWN]		= "DOWN",
+		[IB_PORT_INIT]		= "INIT",
+		[IB_PORT_ARMED]		= "ARMED",
+		[IB_PORT_ACTIVE]	= "ACTIVE",
+		[IB_PORT_ACTIVE_DEFER]	= "ACTIVE_DEFER"
+	};
+	struct ib_port_attr attr;
+	const char *name = "UNKNOWN";
+	ssize_t ret;
+
+	ret = ib_query_port(p->ibdev, p->port_num, &attr);
+	if (ret)
+		return ret;
+
+	if (attr.state >= 0 && attr.state < ARRAY_SIZE(state_name))
+		name = state_name[attr.state];
+
+	return sprintf(buf, "%d: %s\n", attr.state, name);
+}
+
+/* Print the lid field reported by ib_query_port(), in hex. */
+static ssize_t lid_show(struct ib_port *p, struct port_attribute *unused,
+			char *buf)
+{
+	struct ib_port_attr port_attr;
+	ssize_t err;
+
+	err = ib_query_port(p->ibdev, p->port_num, &port_attr);
+	if (err != 0)
+		return err;
+
+	return sprintf(buf, "0x%x\n", port_attr.lid);
+}
+
+/* Print the lmc field reported by ib_query_port(), in decimal. */
+static ssize_t lid_mask_count_show(struct ib_port *p,
+				   struct port_attribute *unused,
+				   char *buf)
+{
+	struct ib_port_attr port_attr;
+	ssize_t err;
+
+	err = ib_query_port(p->ibdev, p->port_num, &port_attr);
+	if (err != 0)
+		return err;
+
+	return sprintf(buf, "%d\n", port_attr.lmc);
+}
+
+/* Print the sm_lid field reported by ib_query_port(), in hex. */
+static ssize_t sm_lid_show(struct ib_port *p, struct port_attribute *unused,
+			   char *buf)
+{
+	struct ib_port_attr port_attr;
+	ssize_t err;
+
+	err = ib_query_port(p->ibdev, p->port_num, &port_attr);
+	if (err != 0)
+		return err;
+
+	return sprintf(buf, "0x%x\n", port_attr.sm_lid);
+}
+
+/* Print the sm_sl field reported by ib_query_port(), in decimal. */
+static ssize_t sm_sl_show(struct ib_port *p, struct port_attribute *unused,
+			  char *buf)
+{
+	struct ib_port_attr port_attr;
+	ssize_t err;
+
+	err = ib_query_port(p->ibdev, p->port_num, &port_attr);
+	if (err != 0)
+		return err;
+
+	return sprintf(buf, "%d\n", port_attr.sm_sl);
+}
+
+/* Print port_cap_flags from ib_query_port() as a zero-padded hex word. */
+static ssize_t cap_mask_show(struct ib_port *p, struct port_attribute *unused,
+			     char *buf)
+{
+	struct ib_port_attr port_attr;
+	ssize_t err;
+
+	err = ib_query_port(p->ibdev, p->port_num, &port_attr);
+	if (err != 0)
+		return err;
+
+	return sprintf(buf, "0x%08x\n", port_attr.port_cap_flags);
+}
+
+/*
+ * Print the port rate as "<rate> Gb/sec (<width>X<speed>)".  The rate is
+ * computed in tenths of Gb/sec as 25 * width * active_speed; a negative
+ * result yields -EINVAL.
+ */
+static ssize_t rate_show(struct ib_port *p, struct port_attribute *unused,
+			 char *buf)
+{
+	struct ib_port_attr attr;
+	const char *speed;
+	int width;
+	int rate;
+	ssize_t ret;
+
+	ret = ib_query_port(p->ibdev, p->port_num, &attr);
+	if (ret)
+		return ret;
+
+	if (attr.active_speed == 2)
+		speed = " DDR";
+	else if (attr.active_speed == 4)
+		speed = " QDR";
+	else
+		speed = "";
+
+	width = ib_width_enum_to_int(attr.active_width);
+	rate = 25 * width * attr.active_speed;
+	if (rate < 0)
+		return -EINVAL;
+
+	return sprintf(buf, "%d%s Gb/sec (%dX%s)\n",
+		       rate / 10, rate % 10 ? ".5" : "",
+		       width, speed);
+}
+
+/*
+ * Print the physical port state as "<number>: <name>"; values outside
+ * 1..7 are printed as "<number>: <unknown>".
+ */
+static ssize_t phys_state_show(struct ib_port *p, struct port_attribute *unused,
+			       char *buf)
+{
+	struct ib_port_attr attr;
+	ssize_t ret;
+
+	ret = ib_query_port(p->ibdev, p->port_num, &attr);
+	if (ret)
+		return ret;
+
+	switch (attr.phys_state) {
+	case 1:
+		ret = sprintf(buf, "1: Sleep\n");
+		break;
+	case 2:
+		ret = sprintf(buf, "2: Polling\n");
+		break;
+	case 3:
+		ret = sprintf(buf, "3: Disabled\n");
+		break;
+	case 4:
+		ret = sprintf(buf, "4: PortConfigurationTraining\n");
+		break;
+	case 5:
+		ret = sprintf(buf, "5: LinkUp\n");
+		break;
+	case 6:
+		ret = sprintf(buf, "6: LinkErrorRecovery\n");
+		break;
+	case 7:
+		ret = sprintf(buf, "7: Phy Test\n");
+		break;
+	default:
+		ret = sprintf(buf, "%d: <unknown>\n", attr.phys_state);
+		break;
+	}
+
+	return ret;
+}
+
+/* Print the port's link layer as "IB", "Ethernet" or "Unknown". */
+static ssize_t link_layer_show(struct ib_port *p, struct port_attribute *unused,
+			       char *buf)
+{
+	const char *name;
+
+	switch (rdma_port_get_link_layer(p->ibdev, p->port_num)) {
+	case IB_LINK_LAYER_INFINIBAND:
+		name = "IB";
+		break;
+	case IB_LINK_LAYER_ETHERNET:
+		name = "Ethernet";
+		break;
+	default:
+		name = "Unknown";
+		break;
+	}
+
+	return sprintf(buf, "%s\n", name);
+}
+
+/* One port_attr_<name> object per <name>_show handler defined above. */
+static PORT_ATTR_RO(state);
+static PORT_ATTR_RO(lid);
+static PORT_ATTR_RO(lid_mask_count);
+static PORT_ATTR_RO(sm_lid);
+static PORT_ATTR_RO(sm_sl);
+static PORT_ATTR_RO(cap_mask);
+static PORT_ATTR_RO(rate);
+static PORT_ATTR_RO(phys_state);
+static PORT_ATTR_RO(link_layer);
+
+/* NULL-terminated default attribute list installed through port_type. */
+static struct attribute *port_default_attrs[] = {
+	&port_attr_state.attr,
+	&port_attr_lid.attr,
+	&port_attr_lid_mask_count.attr,
+	&port_attr_sm_lid.attr,
+	&port_attr_sm_sl.attr,
+	&port_attr_cap_mask.attr,
+	&port_attr_rate.attr,
+	&port_attr_phys_state.attr,
+	&port_attr_link_layer.attr,
+	NULL
+};
+
+/*
+ * Print the GID table entry selected by the attribute's index as eight
+ * colon-separated 16-bit hex groups.
+ */
+static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr,
+			     char *buf)
+{
+	struct port_table_attribute *entry =
+		container_of(attr, struct port_table_attribute, attr);
+	union ib_gid gid;
+	const u16 *w;
+	ssize_t err;
+
+	err = ib_query_gid(p->ibdev, p->port_num, entry->index, &gid);
+	if (err)
+		return err;
+
+	/* View the raw GID bytes as 16-bit words for printing */
+	w = (u16 *)gid.raw;
+	return sprintf(buf, "%.4x:%.4x:%.4x:%.4x:%.4x:%.4x:%.4x:%.4x\n",
+	    htons(w[0]), htons(w[1]), htons(w[2]), htons(w[3]),
+	    htons(w[4]), htons(w[5]), htons(w[6]), htons(w[7]));
+}
+
+/* Print the P_Key table entry selected by the attribute's index, in hex. */
+static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr,
+			      char *buf)
+{
+	struct port_table_attribute *entry =
+		container_of(attr, struct port_table_attribute, attr);
+	u16 value;
+	ssize_t err;
+
+	err = ib_query_pkey(p->ibdev, p->port_num, entry->index, &value);
+	if (err != 0)
+		return err;
+
+	return sprintf(buf, "0x%04x\n", value);
+}
+
+/*
+ * Define a read-only counter attribute port_pma_attr_<name> served by
+ * show_pma_counter().  The index packs _offset into bits 0-15, _width into
+ * bits 16-23 and _counter from bit 24 upwards; show_pma_counter() unpacks
+ * the offset and the width again.
+ */
+#define PORT_PMA_ATTR(_name, _counter, _width, _offset)			\
+struct port_table_attribute port_pma_attr_##_name = {			\
+	.attr  = __ATTR(_name, S_IRUGO, show_pma_counter, NULL),	\
+	.index = (_offset) | ((_width) << 16) | ((_counter) << 24)	\
+}
+
+/*
+ * Fetch the PortCounters attribute through the device's process_mad method
+ * and print the single field described by the attribute's index (bit offset
+ * in the low 16 bits, width in bits in the next 8).  Prints "N/A (no PMA)"
+ * when the device has no process_mad method; returns -ENOMEM or -EINVAL on
+ * allocation or MAD failure, and 0 for an unsupported width.
+ */
+static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr,
+				char *buf)
+{
+	struct port_table_attribute *tab_attr =
+		container_of(attr, struct port_table_attribute, attr);
+	const int bit_off = tab_attr->index & 0xffff;
+	const int nbits   = (tab_attr->index >> 16) & 0xff;
+	const int want    = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+	struct ib_mad *req;
+	struct ib_mad *rsp;
+	u8 *field;
+	ssize_t ret;
+
+	if (!p->ibdev->process_mad)
+		return sprintf(buf, "N/A (no PMA)\n");
+
+	req = kzalloc(sizeof *req, GFP_KERNEL);
+	rsp = kmalloc(sizeof *rsp, GFP_KERNEL);
+	if (!req || !rsp) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* Build a PerfMgmt Get(PortCounters) request for this port */
+	req->mad_hdr.base_version  = 1;
+	req->mad_hdr.mgmt_class    = IB_MGMT_CLASS_PERF_MGMT;
+	req->mad_hdr.class_version = 1;
+	req->mad_hdr.method        = IB_MGMT_METHOD_GET;
+	req->mad_hdr.attr_id       = cpu_to_be16(0x12); /* PortCounters */
+
+	req->data[41] = p->port_num;	/* PortSelect field */
+
+	/* Both the SUCCESS and the REPLY bit must be set in the result */
+	if ((p->ibdev->process_mad(p->ibdev, IB_MAD_IGNORE_MKEY,
+		 p->port_num, NULL, NULL, req, rsp) & want) != want) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* First byte of the requested field inside the reply data */
+	field = rsp->data + 40 + bit_off / 8;
+
+	switch (nbits) {
+	case 4:
+		/* Nibble: pick the high or low half of the byte */
+		ret = sprintf(buf, "%u\n",
+			      (*field >> (4 - (bit_off % 8))) & 0xf);
+		break;
+	case 8:
+		ret = sprintf(buf, "%u\n", *field);
+		break;
+	case 16:
+		ret = sprintf(buf, "%u\n", be16_to_cpup((__be16 *)field));
+		break;
+	case 32:
+		ret = sprintf(buf, "%u\n", be32_to_cpup((__be32 *)field));
+		break;
+	default:
+		ret = 0;
+	}
+
+out:
+	kfree(req);
+	kfree(rsp);
+
+	return ret;
+}
+
+/*
+ * Counter attributes.  Arguments are (name, counter, width in bits, bit
+ * offset), packed into the attribute index by PORT_PMA_ATTR.
+ */
+static PORT_PMA_ATTR(symbol_error		    ,  0, 16,  32);
+static PORT_PMA_ATTR(link_error_recovery	    ,  1,  8,  48);
+static PORT_PMA_ATTR(link_downed		    ,  2,  8,  56);
+static PORT_PMA_ATTR(port_rcv_errors		    ,  3, 16,  64);
+static PORT_PMA_ATTR(port_rcv_remote_physical_errors,  4, 16,  80);
+static PORT_PMA_ATTR(port_rcv_switch_relay_errors   ,  5, 16,  96);
+static PORT_PMA_ATTR(port_xmit_discards		    ,  6, 16, 112);
+static PORT_PMA_ATTR(port_xmit_constraint_errors    ,  7,  8, 128);
+static PORT_PMA_ATTR(port_rcv_constraint_errors	    ,  8,  8, 136);
+static PORT_PMA_ATTR(local_link_integrity_errors    ,  9,  4, 152);
+static PORT_PMA_ATTR(excessive_buffer_overrun_errors, 10,  4, 156);
+static PORT_PMA_ATTR(VL15_dropped		    , 11, 16, 176);
+static PORT_PMA_ATTR(port_xmit_data		    , 12, 32, 192);
+static PORT_PMA_ATTR(port_rcv_data		    , 13, 32, 224);
+static PORT_PMA_ATTR(port_xmit_packets		    , 14, 32, 256);
+static PORT_PMA_ATTR(port_rcv_packets		    , 15, 32, 288);
+/*
+ * There is no bit allocated for port_xmit_wait in the CounterSelect field
+ * (IB spec). However, since this bit is ignored when reading
+ * (show_pma_counter), the _counter field of port_xmit_wait can be set to zero.
+ */
+static PORT_PMA_ATTR(port_xmit_wait		    ,  0, 32, 320);
+
+/* NULL-terminated list of the counter attributes, used by pma_group. */
+static struct attribute *pma_attrs[] = {
+	&port_pma_attr_symbol_error.attr.attr,
+	&port_pma_attr_link_error_recovery.attr.attr,
+	&port_pma_attr_link_downed.attr.attr,
+	&port_pma_attr_port_rcv_errors.attr.attr,
+	&port_pma_attr_port_rcv_remote_physical_errors.attr.attr,
+	&port_pma_attr_port_rcv_switch_relay_errors.attr.attr,
+	&port_pma_attr_port_xmit_discards.attr.attr,
+	&port_pma_attr_port_xmit_constraint_errors.attr.attr,
+	&port_pma_attr_port_rcv_constraint_errors.attr.attr,
+	&port_pma_attr_local_link_integrity_errors.attr.attr,
+	&port_pma_attr_excessive_buffer_overrun_errors.attr.attr,
+	&port_pma_attr_VL15_dropped.attr.attr,
+	&port_pma_attr_port_xmit_data.attr.attr,
+	&port_pma_attr_port_rcv_data.attr.attr,
+	&port_pma_attr_port_xmit_packets.attr.attr,
+	&port_pma_attr_port_rcv_packets.attr.attr,
+	&port_pma_attr_port_xmit_wait.attr.attr,
+	NULL
+};
+
+/* The "counters" attribute group that add_port() creates under each port. */
+static struct attribute_group pma_group = {
+	.name  = "counters",
+	.attrs  = pma_attrs
+};
+
+/*
+ * kobject release callback for a port: free every entry of the gid and
+ * pkey attribute arrays, the arrays themselves, and finally the ib_port.
+ */
+static void ib_port_release(struct kobject *kobj)
+{
+	struct ib_port *port = container_of(kobj, struct ib_port, kobj);
+	struct attribute **slot;
+
+	/* Both arrays are NULL-terminated */
+	for (slot = port->gid_group.attrs; *slot; slot++)
+		kfree(*slot);
+	kfree(port->gid_group.attrs);
+
+	for (slot = port->pkey_group.attrs; *slot; slot++)
+		kfree(*slot);
+	kfree(port->pkey_group.attrs);
+
+	kfree(port);
+}
+
+/*
+ * kobject type of a port: ties together the release callback, the show
+ * dispatcher and the default attribute list defined above.
+ */
+static struct kobj_type port_type = {
+	.release       = ib_port_release,
+	.sysfs_ops     = &port_sysfs_ops,
+	.default_attrs = port_default_attrs
+};
+
+/* Class release callback: free the ib_device that embeds this device. */
+static void ib_device_release(struct device *device)
+{
+	kfree(container_of(device, struct ib_device, dev));
+}
+
+#ifdef __linux__
+/* BSD supports this through devfs(5) and devd(8). */
+/*
+ * Add "NAME=<device name>" to the uevent environment.  Returns -ENOMEM if
+ * the variable cannot be added, 0 otherwise.
+ */
+static int ib_device_uevent(struct device *device,
+			    struct kobj_uevent_env *env)
+{
+	struct ib_device *ibdev = container_of(device, struct ib_device, dev);
+
+	/*
+	 * It would be nice to pass the node GUID with the event...
+	 */
+	if (add_uevent_var(env, "NAME=%s", ibdev->name) != 0)
+		return -ENOMEM;
+
+	return 0;
+}
+#endif
+
+/*
+ * Build a NULL-terminated array of len table attributes named "0" up to
+ * "<len - 1>", each with mode S_IRUGO, served by show and carrying its own
+ * position as index.  Returns NULL if an allocation fails or a name does
+ * not fit; everything allocated up to that point is freed first.
+ */
+static struct attribute **
+alloc_group_attrs(ssize_t (*show)(struct ib_port *,
+				  struct port_attribute *, char *buf),
+		  int len)
+{
+	struct attribute **attrs;
+	struct port_table_attribute *entry;
+	int idx;
+
+	/* One extra zeroed slot serves as the terminator */
+	attrs = kcalloc(1 + len, sizeof(*attrs), GFP_KERNEL);
+	if (attrs == NULL)
+		return NULL;
+
+	for (idx = 0; idx < len; idx++) {
+		entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+		if (entry == NULL)
+			goto err;
+
+		if (snprintf(entry->name, sizeof(entry->name),
+			     "%d", idx) >= sizeof(entry->name)) {
+			kfree(entry);
+			goto err;
+		}
+
+		entry->index		= idx;
+		entry->attr.show	= show;
+		entry->attr.attr.mode	= S_IRUGO;
+		entry->attr.attr.name	= entry->name;
+
+		attrs[idx] = &entry->attr.attr;
+	}
+
+	return attrs;
+
+err:
+	/*
+	 * attr.attr is the first member of each entry, so the stored
+	 * pointers are the kzalloc'ed addresses themselves.
+	 */
+	for (idx--; idx >= 0; idx--)
+		kfree(attrs[idx]);
+	kfree(attrs);
+	return NULL;
+}
+
+/*
+ * Create the sysfs directory for one port of device: the port kobject
+ * (named after port_num, below device->ports_parent) and its "counters",
+ * "gids" and "pkeys" attribute groups.  The gid and pkey table lengths come
+ * from ib_query_port().
+ *
+ * Returns 0 after linking the port onto device->port_list.  On failure the
+ * groups created so far are removed, the port is freed and a negative errno
+ * is returned (-ENOMEM when an attribute table cannot be allocated).
+ */
+static int add_port(struct ib_device *device, int port_num)
+{
+	struct ib_port *p;
+	struct ib_port_attr attr;
+	int i;
+	int ret;
+
+	ret = ib_query_port(device, port_num, &attr);
+	if (ret)
+		return ret;
+
+	p = kzalloc(sizeof *p, GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+
+	p->ibdev      = device;
+	p->port_num   = port_num;
+
+	ret = kobject_init_and_add(&p->kobj, &port_type,
+				   device->ports_parent,
+				   "%d", port_num);
+	if (ret)
+		goto err_put;
+
+	ret = sysfs_create_group(&p->kobj, &pma_group);
+	if (ret)
+		goto err_put;
+
+	p->gid_group.name  = "gids";
+	p->gid_group.attrs = alloc_group_attrs(show_port_gid, attr.gid_tbl_len);
+	if (!p->gid_group.attrs) {
+		/* ret is still 0 from the call above; report the failure */
+		ret = -ENOMEM;
+		goto err_remove_pma;
+	}
+
+	ret = sysfs_create_group(&p->kobj, &p->gid_group);
+	if (ret)
+		goto err_free_gid;
+
+	p->pkey_group.name  = "pkeys";
+	p->pkey_group.attrs = alloc_group_attrs(show_port_pkey,
+						attr.pkey_tbl_len);
+	if (!p->pkey_group.attrs) {
+		/* Same as above: do not return a stale 0 */
+		ret = -ENOMEM;
+		goto err_remove_gid;
+	}
+
+	ret = sysfs_create_group(&p->kobj, &p->pkey_group);
+	if (ret)
+		goto err_free_pkey;
+
+	list_add_tail(&p->kobj.entry, &device->port_list);
+
+#ifdef __linux__
+	kobject_uevent(&p->kobj, KOBJ_ADD);
+#endif
+	return 0;
+
+err_free_pkey:
+	for (i = 0; i < attr.pkey_tbl_len; ++i)
+		kfree(p->pkey_group.attrs[i]);
+
+	kfree(p->pkey_group.attrs);
+
+err_remove_gid:
+	sysfs_remove_group(&p->kobj, &p->gid_group);
+
+err_free_gid:
+	for (i = 0; i < attr.gid_tbl_len; ++i)
+		kfree(p->gid_group.attrs[i]);
+
+	kfree(p->gid_group.attrs);
+
+err_remove_pma:
+	sysfs_remove_group(&p->kobj, &pma_group);
+
+err_put:
+	kobject_put(device->ports_parent);
+	kfree(p);
+	return ret;
+}
+
+/* Print the device's node type as "<number>: <label>". */
+static ssize_t show_node_type(struct device *device,
+			      struct device_attribute *attr, char *buf)
+{
+	struct ib_device *dev = container_of(device, struct ib_device, dev);
+	ssize_t len;
+
+	switch (dev->node_type) {
+	case RDMA_NODE_IB_CA:
+		len = sprintf(buf, "%d: CA\n", dev->node_type);
+		break;
+	case RDMA_NODE_RNIC:
+		len = sprintf(buf, "%d: RNIC\n", dev->node_type);
+		break;
+	case RDMA_NODE_IB_SWITCH:
+		len = sprintf(buf, "%d: switch\n", dev->node_type);
+		break;
+	case RDMA_NODE_IB_ROUTER:
+		len = sprintf(buf, "%d: router\n", dev->node_type);
+		break;
+	default:
+		len = sprintf(buf, "%d: <unknown>\n", dev->node_type);
+		break;
+	}
+
+	return len;
+}
+
+/*
+ * Query the device and print its system image GUID as four
+ * colon-separated 16-bit hex groups.
+ */
+static ssize_t show_sys_image_guid(struct device *device,
+				   struct device_attribute *dev_attr, char *buf)
+{
+	struct ib_device *dev = container_of(device, struct ib_device, dev);
+	struct ib_device_attr attr;
+	const __be16 *guid;
+	ssize_t ret;
+
+	ret = ib_query_device(dev, &attr);
+	if (ret)
+		return ret;
+
+	/* Treat the GUID as four big-endian 16-bit words */
+	guid = (__be16 *) &attr.sys_image_guid;
+	return sprintf(buf, "%04x:%04x:%04x:%04x\n",
+		       be16_to_cpu(guid[0]), be16_to_cpu(guid[1]),
+		       be16_to_cpu(guid[2]), be16_to_cpu(guid[3]));
+}
+
+/* Print the device's node GUID as four colon-separated 16-bit hex groups. */
+static ssize_t show_node_guid(struct device *device,
+			      struct device_attribute *attr, char *buf)
+{
+	struct ib_device *dev = container_of(device, struct ib_device, dev);
+	/* Treat the GUID as four big-endian 16-bit words */
+	const __be16 *guid = (__be16 *) &dev->node_guid;
+
+	return sprintf(buf, "%04x:%04x:%04x:%04x\n",
+		       be16_to_cpu(guid[0]), be16_to_cpu(guid[1]),
+		       be16_to_cpu(guid[2]), be16_to_cpu(guid[3]));
+}
+
+/* Print at most 64 characters of the device's node description. */
+static ssize_t show_node_desc(struct device *device,
+			      struct device_attribute *attr, char *buf)
+{
+	struct ib_device *ibdev;
+
+	ibdev = container_of(device, struct ib_device, dev);
+	return sprintf(buf, "%.64s\n", ibdev->node_desc);
+}
+
+/*
+ * Store handler for node_desc: copy up to 64 bytes of the written data into
+ * a zeroed ib_device_modify and apply it with ib_modify_device().  Returns
+ * -EIO if the device has no modify_device method, the ib_modify_device()
+ * error on failure, and count on success.
+ */
+static ssize_t set_node_desc(struct device *device,
+			     struct device_attribute *attr,
+			     const char *buf, size_t count)
+{
+	struct ib_device *dev = container_of(device, struct ib_device, dev);
+	struct ib_device_modify desc = {};
+	int len;
+	int err;
+
+	if (dev->modify_device == NULL)
+		return -EIO;
+
+	len = min_t(int, count, 64);
+	memcpy(desc.node_desc, buf, len);
+
+	err = ib_modify_device(dev, IB_DEVICE_MODIFY_NODE_DESC, &desc);
+	if (err)
+		return err;
+
+	return count;
+}
+
+/* Device-level attributes; node_desc is the only one with a store handler. */
+static DEVICE_ATTR(node_type, S_IRUGO, show_node_type, NULL);
+static DEVICE_ATTR(sys_image_guid, S_IRUGO, show_sys_image_guid, NULL);
+static DEVICE_ATTR(node_guid, S_IRUGO, show_node_guid, NULL);
+static DEVICE_ATTR(node_desc, S_IRUGO | S_IWUSR, show_node_desc, set_node_desc);
+
+/*
+ * Attribute files created for every device by ib_device_register_sysfs().
+ * The array has no terminator; it is walked with ARRAY_SIZE().
+ */
+static struct device_attribute *ib_class_attributes[] = {
+	&dev_attr_node_type,
+	&dev_attr_sys_image_guid,
+	&dev_attr_node_guid,
+	&dev_attr_node_desc
+};
+
+/*
+ * The "infiniband" device class, registered by ib_sysfs_setup().  The
+ * uevent hook is only compiled in when __linux__ is defined.
+ */
+static struct class ib_class = {
+	.name    = "infiniband",
+	.dev_release = ib_device_release,
+#ifdef __linux__
+	.dev_uevent = ib_device_uevent,
+#endif
+};
+
+/*
+ * Show a given attribute in the statistics group.  The statistics returned
+ * by the device's get_protocol_stats method are read as an array of u64 and
+ * the element at position offset is printed in decimal.
+ */
+static ssize_t show_protocol_stat(const struct device *device,
+			    struct device_attribute *attr, char *buf,
+			    unsigned offset)
+{
+	struct ib_device *dev = container_of(__DECONST(struct device *, device), struct ib_device, dev);
+	union rdma_protocol_stats stats;
+	const u64 *counters = (u64 *) &stats;
+	ssize_t err;
+
+	err = dev->get_protocol_stats(dev, &stats);
+	if (err != 0)
+		return err;
+
+	return sprintf(buf, "%llu\n", (unsigned long long) counters[offset]);
+}
+
+/*
+ * generate a read-only iwarp statistics attribute
+ *
+ * Expands to a show_<name>() handler plus its dev_attr_<name> object.  The
+ * handler passes show_protocol_stat() the position of <name> inside
+ * struct iw_protocol_stats, counted in u64 units.
+ */
+#define IW_STATS_ENTRY(name)						\
+static ssize_t show_##name(struct device *device,			\
+			   struct device_attribute *attr, char *buf)	\
+{									\
+	return show_protocol_stat(device, attr, buf,			\
+				  offsetof(struct iw_protocol_stats, name) / \
+				  sizeof (u64));			\
+}									\
+static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
+
+/* One handler/attribute pair per struct iw_protocol_stats member used here. */
+IW_STATS_ENTRY(ipInReceives);
+IW_STATS_ENTRY(ipInHdrErrors);
+IW_STATS_ENTRY(ipInTooBigErrors);
+IW_STATS_ENTRY(ipInNoRoutes);
+IW_STATS_ENTRY(ipInAddrErrors);
+IW_STATS_ENTRY(ipInUnknownProtos);
+IW_STATS_ENTRY(ipInTruncatedPkts);
+IW_STATS_ENTRY(ipInDiscards);
+IW_STATS_ENTRY(ipInDelivers);
+IW_STATS_ENTRY(ipOutForwDatagrams);
+IW_STATS_ENTRY(ipOutRequests);
+IW_STATS_ENTRY(ipOutDiscards);
+IW_STATS_ENTRY(ipOutNoRoutes);
+IW_STATS_ENTRY(ipReasmTimeout);
+IW_STATS_ENTRY(ipReasmReqds);
+IW_STATS_ENTRY(ipReasmOKs);
+IW_STATS_ENTRY(ipReasmFails);
+IW_STATS_ENTRY(ipFragOKs);
+IW_STATS_ENTRY(ipFragFails);
+IW_STATS_ENTRY(ipFragCreates);
+IW_STATS_ENTRY(ipInMcastPkts);
+IW_STATS_ENTRY(ipOutMcastPkts);
+IW_STATS_ENTRY(ipInBcastPkts);
+IW_STATS_ENTRY(ipOutBcastPkts);
+IW_STATS_ENTRY(tcpRtoAlgorithm);
+IW_STATS_ENTRY(tcpRtoMin);
+IW_STATS_ENTRY(tcpRtoMax);
+IW_STATS_ENTRY(tcpMaxConn);
+IW_STATS_ENTRY(tcpActiveOpens);
+IW_STATS_ENTRY(tcpPassiveOpens);
+IW_STATS_ENTRY(tcpAttemptFails);
+IW_STATS_ENTRY(tcpEstabResets);
+IW_STATS_ENTRY(tcpCurrEstab);
+IW_STATS_ENTRY(tcpInSegs);
+IW_STATS_ENTRY(tcpOutSegs);
+IW_STATS_ENTRY(tcpRetransSegs);
+IW_STATS_ENTRY(tcpInErrs);
+IW_STATS_ENTRY(tcpOutRsts);
+
+/* NULL-terminated list of the statistics attributes, used by iw_stats_group. */
+static struct attribute *iw_proto_stats_attrs[] = {
+	&dev_attr_ipInReceives.attr,
+	&dev_attr_ipInHdrErrors.attr,
+	&dev_attr_ipInTooBigErrors.attr,
+	&dev_attr_ipInNoRoutes.attr,
+	&dev_attr_ipInAddrErrors.attr,
+	&dev_attr_ipInUnknownProtos.attr,
+	&dev_attr_ipInTruncatedPkts.attr,
+	&dev_attr_ipInDiscards.attr,
+	&dev_attr_ipInDelivers.attr,
+	&dev_attr_ipOutForwDatagrams.attr,
+	&dev_attr_ipOutRequests.attr,
+	&dev_attr_ipOutDiscards.attr,
+	&dev_attr_ipOutNoRoutes.attr,
+	&dev_attr_ipReasmTimeout.attr,
+	&dev_attr_ipReasmReqds.attr,
+	&dev_attr_ipReasmOKs.attr,
+	&dev_attr_ipReasmFails.attr,
+	&dev_attr_ipFragOKs.attr,
+	&dev_attr_ipFragFails.attr,
+	&dev_attr_ipFragCreates.attr,
+	&dev_attr_ipInMcastPkts.attr,
+	&dev_attr_ipOutMcastPkts.attr,
+	&dev_attr_ipInBcastPkts.attr,
+	&dev_attr_ipOutBcastPkts.attr,
+	&dev_attr_tcpRtoAlgorithm.attr,
+	&dev_attr_tcpRtoMin.attr,
+	&dev_attr_tcpRtoMax.attr,
+	&dev_attr_tcpMaxConn.attr,
+	&dev_attr_tcpActiveOpens.attr,
+	&dev_attr_tcpPassiveOpens.attr,
+	&dev_attr_tcpAttemptFails.attr,
+	&dev_attr_tcpEstabResets.attr,
+	&dev_attr_tcpCurrEstab.attr,
+	&dev_attr_tcpInSegs.attr,
+	&dev_attr_tcpOutSegs.attr,
+	&dev_attr_tcpRetransSegs.attr,
+	&dev_attr_tcpInErrs.attr,
+	&dev_attr_tcpOutRsts.attr,
+	NULL
+};
+
+/*
+ * The "proto_stats" group; ib_device_register_sysfs() creates it only for
+ * RNIC devices that provide get_protocol_stats.
+ */
+static struct attribute_group iw_stats_group = {
+	.name	= "proto_stats",
+	.attrs	= iw_proto_stats_attrs,
+};
+
+/*
+ * Register an IB device with the "infiniband" class and populate its sysfs
+ * tree: the device attribute files, a "ports" directory with one entry per
+ * port (port 0 only for switches, ports 1..phys_port_cnt otherwise) and,
+ * for RNICs that provide get_protocol_stats, the "proto_stats" group.
+ *
+ * Returns 0 on success or the error of the step that failed; on failure
+ * the ports added so far are torn down and the device is unregistered.
+ */
+int ib_device_register_sysfs(struct ib_device *device)
+{
+	struct device *class_dev = &device->dev;
+	int ret;
+	int i;
+
+	class_dev->class      = &ib_class;
+	class_dev->driver_data = device;
+	class_dev->parent     = device->dma_device;
+	/* The name is data, not a format: never let it be parsed for '%' */
+	dev_set_name(class_dev, "%s", device->name);
+
+	INIT_LIST_HEAD(&device->port_list);
+
+	ret = device_register(class_dev);
+	if (ret)
+		goto err;
+
+	for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) {
+		ret = device_create_file(class_dev, ib_class_attributes[i]);
+		if (ret)
+			goto err_unregister;
+	}
+
+	device->ports_parent = kobject_create_and_add("ports",
+						      &class_dev->kobj);
+	if (!device->ports_parent) {
+		ret = -ENOMEM;
+		goto err_put;
+	}
+
+	if (device->node_type == RDMA_NODE_IB_SWITCH) {
+		ret = add_port(device, 0);
+		if (ret)
+			goto err_put;
+	} else {
+		for (i = 1; i <= device->phys_port_cnt; ++i) {
+			ret = add_port(device, i);
+			if (ret)
+				goto err_put;
+		}
+	}
+
+	if (device->node_type == RDMA_NODE_RNIC && device->get_protocol_stats) {
+		ret = sysfs_create_group(&class_dev->kobj, &iw_stats_group);
+		if (ret)
+			goto err_put;
+	}
+
+	return 0;
+
+err_put:
+	{
+		struct kobject *p, *t;
+		struct ib_port *port;
+
+		/* Undo every port that add_port() managed to link in */
+		list_for_each_entry_safe(p, t, &device->port_list, entry) {
+			list_del(&p->entry);
+			port = container_of(p, struct ib_port, kobj);
+			sysfs_remove_group(p, &pma_group);
+			sysfs_remove_group(p, &port->pkey_group);
+			sysfs_remove_group(p, &port->gid_group);
+			kobject_put(p);
+		}
+	}
+
+	kobject_put(&class_dev->kobj);
+
+err_unregister:
+	device_unregister(class_dev);
+
+err:
+	return ret;
+}
+
+/*
+ * Tear down the sysfs tree of an IB device: remove the three attribute
+ * groups of every port and drop the port, then drop the "ports" directory
+ * and unregister the device.
+ */
+void ib_device_unregister_sysfs(struct ib_device *device)
+{
+	struct kobject *kobj, *next;
+
+	/* Hold kobject until ib_dealloc_device() */
+	kobject_get(&device->dev.kobj);
+
+	list_for_each_entry_safe(kobj, next, &device->port_list, entry) {
+		struct ib_port *port = container_of(kobj, struct ib_port, kobj);
+
+		list_del(&kobj->entry);
+		sysfs_remove_group(kobj, &pma_group);
+		sysfs_remove_group(kobj, &port->pkey_group);
+		sysfs_remove_group(kobj, &port->gid_group);
+		kobject_put(kobj);
+	}
+
+	kobject_put(device->ports_parent);
+	device_unregister(&device->dev);
+}
+
+/* Register the "infiniband" class; returns the class_register() result. */
+int ib_sysfs_setup(void)
+{
+	int ret;
+
+	ret = class_register(&ib_class);
+	return ret;
+}
+
+/* Unregister the "infiniband" class registered by ib_sysfs_setup(). */
+void ib_sysfs_cleanup(void)
+{
+	struct class *cls = &ib_class;
+
+	class_unregister(cls);
+}
+
+/*
+ * Call create() once for every port on device->port_list, passing the
+ * device, the port number and the port's kobject.  Stops at the first
+ * non-zero result and returns it; returns 0 if every call succeeded or the
+ * list is empty.
+ */
+int ib_sysfs_create_port_files(struct ib_device *device,
+			       int (*create)(struct ib_device *dev, u8 port_num,
+					     struct kobject *kobj))
+{
+	struct kobject *kobj;
+
+	list_for_each_entry(kobj, &device->port_list, entry) {
+		struct ib_port *port = container_of(kobj, struct ib_port, kobj);
+		int err = create(device, port->port_num, &port->kobj);
+
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_sysfs_create_port_files);
diff --git a/sys/ofed/drivers/infiniband/core/ucm.c b/sys/ofed/drivers/infiniband/core/ucm.c
new file mode 100644
index 0000000..90e0b31
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/ucm.c
@@ -0,0 +1,1348 @@
+/*
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *	copyright notice, this list of conditions and the following
+ *	disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *	copyright notice, this list of conditions and the following
+ *	disclaimer in the documentation and/or other materials
+ *	provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/poll.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/cdev.h>
+#include <linux/idr.h>
+#include <linux/mutex.h>
+
+#include <asm/uaccess.h>
+
+#include <rdma/ib_cm.h>
+#include <rdma/ib_user_cm.h>
+#include <rdma/ib_marshall.h>
+
+MODULE_AUTHOR("Libor Michalek");
+MODULE_DESCRIPTION("InfiniBand userspace Connection Manager access");
+MODULE_LICENSE("Dual BSD/GPL");
+
+/*
+ * Per-IB-device state of the userspace CM interface.  Allocated in
+ * ib_ucm_add_one() and freed from the release callback of @dev.
+ */
+struct ib_ucm_device {
+	/* bit index in dev_map; also the offset added to IB_UCM_BASE_DEV */
+	int			devnum;
+	/* character device initialised with ucm_fops */
+	struct cdev		cdev;
+	/* device registered in cm_class; release = ib_ucm_release_dev */
+	struct device		dev;
+	/* IB device this entry was created for */
+	struct ib_device	*ib_dev;
+};
+
+/* State attached to one open file (filp->private_data), see ib_ucm_open(). */
+struct ib_ucm_file {
+	/* taken around manipulation of the ctxs and events lists */
+	struct mutex file_mutex;
+	/* back-pointer to the owning struct file */
+	struct file *filp;
+	/* ucm device the file was opened on */
+	struct ib_ucm_device *device;
+
+	/* contexts of this file, linked through ib_ucm_context.file_list */
+	struct list_head  ctxs;
+	/* undelivered events, linked through ib_ucm_event.file_list */
+	struct list_head  events;
+	/* woken by ib_ucm_event_handler() when an event is queued */
+	wait_queue_head_t poll_wait;
+};
+
+/* Userspace-visible handle wrapping one kernel cm_id. */
+struct ib_ucm_context {
+	/* handle allocated from ctx_id_table and reported to userspace */
+	int                 id;
+	/* completed by ib_ucm_ctx_put() when ref drops to zero */
+	struct completion   comp;
+	/* starts at 1; raised by ib_ucm_ctx_get() */
+	atomic_t            ref;
+	/* count of events handed to userspace by ib_ucm_event() */
+	int		    events_reported;
+
+	/* owning file; lookups from a different file are rejected */
+	struct ib_ucm_file *file;
+	struct ib_cm_id    *cm_id;
+	/* user-supplied value copied into every event's resp.uid */
+	__u64		   uid;
+
+	struct list_head    events;    /* list of pending events. */
+	struct list_head    file_list; /* member in file ctx list */
+};
+
+/* One CM event queued for delivery to userspace. */
+struct ib_ucm_event {
+	/* context the event was raised on */
+	struct ib_ucm_context *ctx;
+	struct list_head file_list; /* member in file event list */
+	struct list_head ctx_list;  /* member in ctx event list */
+
+	/* cm_id passed to the event handler */
+	struct ib_cm_id *cm_id;
+	/* fixed-size response copied to userspace */
+	struct ib_ucm_event_resp resp;
+	/* kmemdup() copy of the event's private data, or NULL */
+	void *data;
+	/* kmemdup() copy of the event's additional info, or NULL */
+	void *info;
+	/* byte lengths of @data and @info */
+	int data_len;
+	int info_len;
+};
+
+/*
+ * Character device numbering for the ucm nodes: a region of
+ * IB_UCM_MAX_DEVICES device numbers starting at IB_UCM_BASE_DEV is
+ * registered in ib_ucm_init(); each device uses base + devnum.
+ */
+enum {
+	IB_UCM_MAJOR = 231,
+	IB_UCM_BASE_MINOR = 224,
+	IB_UCM_MAX_DEVICES = 32
+};
+
+/* ib_cm and ib_user_cm modules share /sys/class/infiniband_cm */
+extern struct class cm_class;
+
+#define IB_UCM_BASE_DEV MKDEV(IB_UCM_MAJOR, IB_UCM_BASE_MINOR)
+
+static void ib_ucm_add_one(struct ib_device *device);
+static void ib_ucm_remove_one(struct ib_device *device);
+
+/* IB client registered in ib_ucm_init(); add/remove are its callbacks. */
+static struct ib_client ucm_client = {
+	.name   = "ucm",
+	.add    = ib_ucm_add_one,
+	.remove = ib_ucm_remove_one
+};
+
+/* ctx_id_mutex is held around every access to ctx_id_table in this file */
+static DEFINE_MUTEX(ctx_id_mutex);
+/* maps ib_ucm_context.id -> context, shared by all open files */
+static DEFINE_IDR(ctx_id_table);
+/* one bit per devnum currently in use */
+static DECLARE_BITMAP(dev_map, IB_UCM_MAX_DEVICES);
+
+/*
+ * Look up context @id and take a reference on it.
+ *
+ * Returns the context on success, ERR_PTR(-ENOENT) when the id is not in
+ * ctx_id_table, or ERR_PTR(-EINVAL) when the context belongs to a file
+ * other than @file.  A successful lookup must be paired with
+ * ib_ucm_ctx_put().
+ */
+static struct ib_ucm_context *ib_ucm_ctx_get(struct ib_ucm_file *file, int id)
+{
+	struct ib_ucm_context *found;
+
+	mutex_lock(&ctx_id_mutex);
+	found = idr_find(&ctx_id_table, id);
+	if (found && found->file == file)
+		atomic_inc(&found->ref);
+	else
+		found = ERR_PTR(found ? -EINVAL : -ENOENT);
+	mutex_unlock(&ctx_id_mutex);
+
+	return found;
+}
+
+/*
+ * Drop one reference on @ctx; the final put signals ctx->comp, which
+ * ib_ucm_destroy_id() waits on.
+ */
+static void ib_ucm_ctx_put(struct ib_ucm_context *ctx)
+{
+	if (!atomic_dec_and_test(&ctx->ref))
+		return;
+
+	complete(&ctx->comp);
+}
+
+/*
+ * Return 1 for the two event types treated in this file as carrying a
+ * newly created cm_id (REQ and SIDR REQ received), 0 otherwise.
+ */
+static inline int ib_ucm_new_cm_id(int event)
+{
+	switch (event) {
+	case IB_CM_REQ_RECEIVED:
+	case IB_CM_SIDR_REQ_RECEIVED:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Unlink @ctx from its file's context list and discard every event that is
+ * still queued for it.
+ *
+ * file_mutex is dropped around the per-event teardown so that
+ * ib_destroy_cm_id() is never called with the mutex held, then re-taken
+ * before the list is examined again.
+ *
+ * The private-data and info buffers duplicated by ib_ucm_event_process()
+ * are owned by the event; they are released here exactly as
+ * ib_ucm_event() does after a successful delivery, otherwise they would be
+ * leaked for every undelivered event.
+ */
+static void ib_ucm_cleanup_events(struct ib_ucm_context *ctx)
+{
+	struct ib_ucm_event *uevent;
+
+	mutex_lock(&ctx->file->file_mutex);
+	list_del(&ctx->file_list);
+	while (!list_empty(&ctx->events)) {
+
+		uevent = list_entry(ctx->events.next,
+				    struct ib_ucm_event, ctx_list);
+		list_del(&uevent->file_list);
+		list_del(&uevent->ctx_list);
+		mutex_unlock(&ctx->file->file_mutex);
+
+		/* clear incoming connections. */
+		if (ib_ucm_new_cm_id(uevent->resp.event))
+			ib_destroy_cm_id(uevent->cm_id);
+
+		/* kfree(NULL) is a no-op, so absent buffers need no check */
+		kfree(uevent->data);
+		kfree(uevent->info);
+		kfree(uevent);
+		mutex_lock(&ctx->file->file_mutex);
+	}
+	mutex_unlock(&ctx->file->file_mutex);
+}
+
+/*
+ * Allocate a zeroed context for @file, give it an id in ctx_id_table and
+ * append it to file->ctxs.
+ *
+ * Returns the new context (ref == 1) or NULL if memory or an id could not
+ * be obtained.  Both callers in this file invoke it with
+ * file->file_mutex held, which covers the list_add_tail() below.
+ */
+static struct ib_ucm_context *ib_ucm_ctx_alloc(struct ib_ucm_file *file)
+{
+	struct ib_ucm_context *ctx;
+	int result;
+
+	ctx = kzalloc(sizeof *ctx, GFP_KERNEL);
+	if (!ctx)
+		return NULL;
+
+	atomic_set(&ctx->ref, 1);
+	init_completion(&ctx->comp);
+	ctx->file = file;
+	INIT_LIST_HEAD(&ctx->events);
+
+	/* preload outside the mutex, retry while idr_get_new() asks for it */
+	do {
+		result = idr_pre_get(&ctx_id_table, GFP_KERNEL);
+		if (!result)
+			goto error;
+
+		mutex_lock(&ctx_id_mutex);
+		result = idr_get_new(&ctx_id_table, ctx, &ctx->id);
+		mutex_unlock(&ctx_id_mutex);
+	} while (result == -EAGAIN);
+
+	if (result)
+		goto error;
+
+	list_add_tail(&ctx->file_list, &file->ctxs);
+	return ctx;
+
+error:
+	kfree(ctx);
+	return NULL;
+}
+
+/*
+ * Fill the userspace REQ response @ureq from the kernel REQ event
+ * parameters @kreq.  The primary path is always converted; the alternate
+ * path only when the kernel event carries one (otherwise
+ * ureq->alternate_path is left as the caller provided it).
+ */
+static void ib_ucm_event_req_get(struct ib_ucm_req_event_resp *ureq,
+				 struct ib_cm_req_event_param *kreq)
+{
+	ureq->remote_ca_guid             = kreq->remote_ca_guid;
+	ureq->remote_qkey                = kreq->remote_qkey;
+	ureq->remote_qpn                 = kreq->remote_qpn;
+	ureq->qp_type                    = kreq->qp_type;
+	ureq->starting_psn               = kreq->starting_psn;
+	ureq->responder_resources        = kreq->responder_resources;
+	ureq->initiator_depth            = kreq->initiator_depth;
+	ureq->local_cm_response_timeout  = kreq->local_cm_response_timeout;
+	ureq->flow_control               = kreq->flow_control;
+	ureq->remote_cm_response_timeout = kreq->remote_cm_response_timeout;
+	ureq->retry_count                = kreq->retry_count;
+	ureq->rnr_retry_count            = kreq->rnr_retry_count;
+	ureq->srq                        = kreq->srq;
+	ureq->port			 = kreq->port;
+
+	ib_copy_path_rec_to_user(&ureq->primary_path, kreq->primary_path);
+	if (kreq->alternate_path)
+		ib_copy_path_rec_to_user(&ureq->alternate_path,
+					 kreq->alternate_path);
+}
+
+/*
+ * Fill the userspace REP response @urep with the same-named fields of the
+ * kernel REP event parameters @krep.
+ */
+static void ib_ucm_event_rep_get(struct ib_ucm_rep_event_resp *urep,
+				 struct ib_cm_rep_event_param *krep)
+{
+	urep->remote_ca_guid      = krep->remote_ca_guid;
+	urep->remote_qkey         = krep->remote_qkey;
+	urep->remote_qpn          = krep->remote_qpn;
+	urep->starting_psn        = krep->starting_psn;
+	urep->responder_resources = krep->responder_resources;
+	urep->initiator_depth     = krep->initiator_depth;
+	urep->target_ack_delay    = krep->target_ack_delay;
+	urep->failover_accepted   = krep->failover_accepted;
+	urep->flow_control        = krep->flow_control;
+	urep->rnr_retry_count     = krep->rnr_retry_count;
+	urep->srq                 = krep->srq;
+}
+
+/*
+ * Fill the userspace SIDR REP response @urep (status, qkey, qpn) from the
+ * kernel SIDR REP event parameters @krep.
+ */
+static void ib_ucm_event_sidr_rep_get(struct ib_ucm_sidr_rep_event_resp *urep,
+				      struct ib_cm_sidr_rep_event_param *krep)
+{
+	urep->status = krep->status;
+	urep->qkey   = krep->qkey;
+	urep->qpn    = krep->qpn;
+}
+
+/*
+ * Translate the kernel CM event @evt into the user-visible form held in
+ * @uvt.
+ *
+ * The switch fills the event-specific part of uvt->resp and selects how
+ * many bytes of private data (uvt->data_len) and of additional info
+ * (uvt->info_len / @info) accompany the event.  Afterwards both buffers
+ * are duplicated with kmemdup() and the matching IB_UCM_PRES_* bits are
+ * set in uvt->resp.present.
+ *
+ * Returns 0 on success or -ENOMEM if a duplication failed; in the latter
+ * case a data buffer that had already been duplicated is freed again.
+ */
+static int ib_ucm_event_process(struct ib_cm_event *evt,
+				struct ib_ucm_event *uvt)
+{
+	void *info = NULL;
+
+	switch (evt->event) {
+	case IB_CM_REQ_RECEIVED:
+		ib_ucm_event_req_get(&uvt->resp.u.req_resp,
+				     &evt->param.req_rcvd);
+		uvt->data_len      = IB_CM_REQ_PRIVATE_DATA_SIZE;
+		uvt->resp.present  = IB_UCM_PRES_PRIMARY;
+		uvt->resp.present |= (evt->param.req_rcvd.alternate_path ?
+				      IB_UCM_PRES_ALTERNATE : 0);
+		break;
+	case IB_CM_REP_RECEIVED:
+		ib_ucm_event_rep_get(&uvt->resp.u.rep_resp,
+				     &evt->param.rep_rcvd);
+		uvt->data_len = IB_CM_REP_PRIVATE_DATA_SIZE;
+		break;
+	case IB_CM_RTU_RECEIVED:
+		uvt->data_len = IB_CM_RTU_PRIVATE_DATA_SIZE;
+		uvt->resp.u.send_status = evt->param.send_status;
+		break;
+	case IB_CM_DREQ_RECEIVED:
+		uvt->data_len = IB_CM_DREQ_PRIVATE_DATA_SIZE;
+		uvt->resp.u.send_status = evt->param.send_status;
+		break;
+	case IB_CM_DREP_RECEIVED:
+		uvt->data_len = IB_CM_DREP_PRIVATE_DATA_SIZE;
+		uvt->resp.u.send_status = evt->param.send_status;
+		break;
+	case IB_CM_MRA_RECEIVED:
+		uvt->resp.u.mra_resp.timeout =
+					evt->param.mra_rcvd.service_timeout;
+		uvt->data_len = IB_CM_MRA_PRIVATE_DATA_SIZE;
+		break;
+	case IB_CM_REJ_RECEIVED:
+		uvt->resp.u.rej_resp.reason = evt->param.rej_rcvd.reason;
+		uvt->data_len = IB_CM_REJ_PRIVATE_DATA_SIZE;
+		uvt->info_len = evt->param.rej_rcvd.ari_length;
+		info	      = evt->param.rej_rcvd.ari;
+		break;
+	case IB_CM_LAP_RECEIVED:
+		ib_copy_path_rec_to_user(&uvt->resp.u.lap_resp.path,
+					 evt->param.lap_rcvd.alternate_path);
+		uvt->data_len = IB_CM_LAP_PRIVATE_DATA_SIZE;
+		uvt->resp.present = IB_UCM_PRES_ALTERNATE;
+		break;
+	case IB_CM_APR_RECEIVED:
+		uvt->resp.u.apr_resp.status = evt->param.apr_rcvd.ap_status;
+		uvt->data_len = IB_CM_APR_PRIVATE_DATA_SIZE;
+		uvt->info_len = evt->param.apr_rcvd.info_len;
+		info	      = evt->param.apr_rcvd.apr_info;
+		break;
+	case IB_CM_SIDR_REQ_RECEIVED:
+		uvt->resp.u.sidr_req_resp.pkey =
+					evt->param.sidr_req_rcvd.pkey;
+		uvt->resp.u.sidr_req_resp.port =
+					evt->param.sidr_req_rcvd.port;
+		uvt->data_len = IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE;
+		break;
+	case IB_CM_SIDR_REP_RECEIVED:
+		ib_ucm_event_sidr_rep_get(&uvt->resp.u.sidr_rep_resp,
+					  &evt->param.sidr_rep_rcvd);
+		uvt->data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE;
+		uvt->info_len = evt->param.sidr_rep_rcvd.info_len;
+		info	      = evt->param.sidr_rep_rcvd.info;
+		break;
+	default:
+		/* every other event only reports the send status */
+		uvt->resp.u.send_status = evt->param.send_status;
+		break;
+	}
+
+	/* data_len/info_len stay at the caller's value if not set above */
+	if (uvt->data_len) {
+		uvt->data = kmemdup(evt->private_data, uvt->data_len, GFP_KERNEL);
+		if (!uvt->data)
+			goto err1;
+
+		uvt->resp.present |= IB_UCM_PRES_DATA;
+	}
+
+	if (uvt->info_len) {
+		uvt->info = kmemdup(info, uvt->info_len, GFP_KERNEL);
+		if (!uvt->info)
+			goto err2;
+
+		uvt->resp.present |= IB_UCM_PRES_INFO;
+	}
+	return 0;
+
+err2:
+	kfree(uvt->data);
+err1:
+	return -ENOMEM;
+}
+
+/*
+ * CM event callback installed by ib_ucm_create_id().
+ *
+ * Wraps @event in a freshly allocated ib_ucm_event, queues it on both the
+ * owning file's event list and the context's event list, and wakes anyone
+ * sleeping in ib_ucm_event() or polling the file.
+ *
+ * Returns 0 when the event was queued.  If allocation or translation
+ * fails the event is dropped and the return value is non-zero only for
+ * event types that carry a new cm_id (see ib_ucm_new_cm_id()).
+ */
+static int ib_ucm_event_handler(struct ib_cm_id *cm_id,
+				struct ib_cm_event *event)
+{
+	struct ib_ucm_event *uevent;
+	struct ib_ucm_context *ctx;
+	int result = 0;
+
+	ctx = cm_id->context;
+
+	/* zeroed so data/info pointers and lengths start out empty */
+	uevent = kzalloc(sizeof *uevent, GFP_KERNEL);
+	if (!uevent)
+		goto err1;
+
+	uevent->ctx = ctx;
+	uevent->cm_id = cm_id;
+	uevent->resp.uid = ctx->uid;
+	uevent->resp.id = ctx->id;
+	uevent->resp.event = event->event;
+
+	result = ib_ucm_event_process(event, uevent);
+	if (result)
+		goto err2;
+
+	mutex_lock(&ctx->file->file_mutex);
+	list_add_tail(&uevent->file_list, &ctx->file->events);
+	list_add_tail(&uevent->ctx_list, &ctx->events);
+	wake_up_interruptible(&ctx->file->poll_wait);
+	if (ctx->file->filp)
+		selwakeup(&ctx->file->filp->f_selinfo);
+	mutex_unlock(&ctx->file->file_mutex);
+	return 0;
+
+err2:
+	kfree(uevent);
+err1:
+	/* Destroy new cm_id's */
+	return ib_ucm_new_cm_id(event->event);
+}
+
+/*
+ * IB_USER_CM_CMD_EVENT: deliver the oldest pending event of @file to
+ * userspace.
+ *
+ * Blocks until an event is available unless the file is O_NONBLOCK
+ * (-EAGAIN) or the wait is interrupted (-ERESTARTSYS).  For events that
+ * carry a new cm_id a new context is allocated and its id reported in the
+ * response.  The fixed response and, if present, the private data and
+ * info buffers are copied to the user addresses given in the command; a
+ * user buffer shorter than the event's data or info yields -ENOMEM.
+ *
+ * Returns 0 on success.  On any failure after the event was selected the
+ * event stays queued.
+ */
+static ssize_t ib_ucm_event(struct ib_ucm_file *file,
+			    const char __user *inbuf,
+			    int in_len, int out_len)
+{
+	struct ib_ucm_context *ctx;
+	struct ib_ucm_event_get cmd;
+	struct ib_ucm_event *uevent;
+	int result = 0;
+	DEFINE_WAIT(wait);
+
+	if (out_len < sizeof(struct ib_ucm_event_resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	mutex_lock(&file->file_mutex);
+	while (list_empty(&file->events)) {
+		/* never sleep with file_mutex held */
+		mutex_unlock(&file->file_mutex);
+
+		if (file->filp->f_flags & O_NONBLOCK)
+			return -EAGAIN;
+
+		if (wait_event_interruptible(file->poll_wait,
+					     !list_empty(&file->events)))
+			return -ERESTARTSYS;
+
+		/* re-check under the mutex; another reader may have won */
+		mutex_lock(&file->file_mutex);
+	}
+
+	uevent = list_entry(file->events.next, struct ib_ucm_event, file_list);
+
+	if (ib_ucm_new_cm_id(uevent->resp.event)) {
+		/*
+		 * NOTE(review): if a later step in this function fails, the
+		 * context allocated here stays on file->ctxs while the event
+		 * remains queued; a retry would allocate another context for
+		 * the same cm_id - confirm whether that is intended.
+		 */
+		ctx = ib_ucm_ctx_alloc(file);
+		if (!ctx) {
+			result = -ENOMEM;
+			goto done;
+		}
+
+		ctx->cm_id = uevent->cm_id;
+		ctx->cm_id->context = ctx;
+		uevent->resp.id = ctx->id;
+	}
+
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &uevent->resp, sizeof(uevent->resp))) {
+		result = -EFAULT;
+		goto done;
+	}
+
+	if (uevent->data) {
+		if (cmd.data_len < uevent->data_len) {
+			result = -ENOMEM;
+			goto done;
+		}
+		if (copy_to_user((void __user *)(unsigned long)cmd.data,
+				 uevent->data, uevent->data_len)) {
+			result = -EFAULT;
+			goto done;
+		}
+	}
+
+	if (uevent->info) {
+		if (cmd.info_len < uevent->info_len) {
+			result = -ENOMEM;
+			goto done;
+		}
+		if (copy_to_user((void __user *)(unsigned long)cmd.info,
+				 uevent->info, uevent->info_len)) {
+			result = -EFAULT;
+			goto done;
+		}
+	}
+
+	/* fully delivered: unlink from both lists and release the event */
+	list_del(&uevent->file_list);
+	list_del(&uevent->ctx_list);
+	uevent->ctx->events_reported++;
+
+	kfree(uevent->data);
+	kfree(uevent->info);
+	kfree(uevent);
+done:
+	mutex_unlock(&file->file_mutex);
+	return result;
+}
+
+/*
+ * IB_USER_CM_CMD_CREATE_ID: allocate a context plus kernel cm_id for
+ * @file and report the context id to userspace.
+ *
+ * Returns 0 on success, -ENOSPC if the caller's output buffer cannot hold
+ * the response, -EFAULT on a failed user copy, -ENOMEM if no context
+ * could be allocated, or the error from ib_create_cm_id().
+ *
+ * ib_ucm_ctx_alloc() links the new context into file->ctxs, so the error
+ * paths must unlink it before freeing it; otherwise ib_ucm_close() would
+ * later walk a list entry that points into freed memory.
+ */
+static ssize_t ib_ucm_create_id(struct ib_ucm_file *file,
+				const char __user *inbuf,
+				int in_len, int out_len)
+{
+	struct ib_ucm_create_id cmd;
+	struct ib_ucm_create_id_resp resp;
+	struct ib_ucm_context *ctx;
+	int result;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	mutex_lock(&file->file_mutex);
+	ctx = ib_ucm_ctx_alloc(file);
+	mutex_unlock(&file->file_mutex);
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->uid = cmd.uid;
+	ctx->cm_id = ib_create_cm_id(file->device->ib_dev,
+				     ib_ucm_event_handler, ctx);
+	if (IS_ERR(ctx->cm_id)) {
+		result = PTR_ERR(ctx->cm_id);
+		goto err1;
+	}
+
+	resp.id = ctx->id;
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp))) {
+		result = -EFAULT;
+		goto err2;
+	}
+	return 0;
+
+err2:
+	ib_destroy_cm_id(ctx->cm_id);
+err1:
+	mutex_lock(&ctx_id_mutex);
+	idr_remove(&ctx_id_table, ctx->id);
+	mutex_unlock(&ctx_id_mutex);
+	/*
+	 * Same teardown order as ib_ucm_destroy_id(): take the context off
+	 * file->ctxs (and drop anything queued on it) before freeing it.
+	 */
+	ib_ucm_cleanup_events(ctx);
+	kfree(ctx);
+	return result;
+}
+
+/*
+ * IB_USER_CM_CMD_DESTROY_ID: destroy the context named by cmd.id and
+ * report how many of its events had been delivered.
+ *
+ * Returns 0 on success, -ENOSPC / -EFAULT for buffer problems, -ENOENT
+ * for an unknown id and -EINVAL for an id owned by another file.  The
+ * context is freed even when the final copy to userspace fails.
+ */
+static ssize_t ib_ucm_destroy_id(struct ib_ucm_file *file,
+				 const char __user *inbuf,
+				 int in_len, int out_len)
+{
+	struct ib_ucm_destroy_id cmd;
+	struct ib_ucm_destroy_id_resp resp;
+	struct ib_ucm_context *ctx;
+	int result = 0;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	/* drop the id first so ib_ucm_ctx_get() can no longer find it */
+	mutex_lock(&ctx_id_mutex);
+	ctx = idr_find(&ctx_id_table, cmd.id);
+	if (!ctx)
+		ctx = ERR_PTR(-ENOENT);
+	else if (ctx->file != file)
+		ctx = ERR_PTR(-EINVAL);
+	else
+		idr_remove(&ctx_id_table, ctx->id);
+	mutex_unlock(&ctx_id_mutex);
+
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	/* release the initial reference, then wait for in-flight users */
+	ib_ucm_ctx_put(ctx);
+	wait_for_completion(&ctx->comp);
+
+	/* No new events will be generated after destroying the cm_id. */
+	ib_destroy_cm_id(ctx->cm_id);
+	/* Cleanup events not yet reported to the user. */
+	ib_ucm_cleanup_events(ctx);
+
+	resp.events_reported = ctx->events_reported;
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp)))
+		result = -EFAULT;
+
+	kfree(ctx);
+	return result;
+}
+
+/*
+ * IB_USER_CM_CMD_ATTR_ID: copy the service id/mask and local/remote ids
+ * of the cm_id behind cmd.id to the user buffer at cmd.response.
+ *
+ * Returns 0, -ENOSPC, -EFAULT, or the lookup error from ib_ucm_ctx_get().
+ */
+static ssize_t ib_ucm_attr_id(struct ib_ucm_file *file,
+			      const char __user *inbuf,
+			      int in_len, int out_len)
+{
+	struct ib_ucm_attr_id cmd;
+	struct ib_ucm_attr_id_resp resp;
+	struct ib_ucm_context *ctx;
+	struct ib_cm_id *cm_id;
+	int result;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	cm_id = ctx->cm_id;
+	resp.service_id   = cm_id->service_id;
+	resp.service_mask = cm_id->service_mask;
+	resp.local_id     = cm_id->local_id;
+	resp.remote_id    = cm_id->remote_id;
+
+	result = copy_to_user((void __user *)(unsigned long)cmd.response,
+			      &resp, sizeof(resp)) ? -EFAULT : 0;
+
+	ib_ucm_ctx_put(ctx);
+	return result;
+}
+
+/*
+ * IB_USER_CM_CMD_INIT_QP_ATTR: ask the CM for the QP attributes needed to
+ * move the QP of context cmd.id to cmd.qp_state and copy them, in their
+ * userspace layout, to cmd.response.
+ *
+ * Returns 0, -ENOSPC, -EFAULT, the lookup error from ib_ucm_ctx_get() or
+ * the error from ib_cm_init_qp_attr().
+ */
+static ssize_t ib_ucm_init_qp_attr(struct ib_ucm_file *file,
+				   const char __user *inbuf,
+				   int in_len, int out_len)
+{
+	struct ib_uverbs_qp_attr resp;
+	struct ib_ucm_init_qp_attr cmd;
+	struct ib_ucm_context *ctx;
+	struct ib_qp_attr qp_attr;
+	int result = 0;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	/*
+	 * resp lives on the stack and is copied to userspace in full, so
+	 * clear all of it (which also zeroes qp_attr_mask): any member or
+	 * padding the conversion helper does not write must not expose
+	 * stale kernel stack contents.
+	 */
+	memset(&resp, 0, sizeof resp);
+	memset(&qp_attr, 0, sizeof qp_attr);
+	qp_attr.qp_state = cmd.qp_state;
+	result = ib_cm_init_qp_attr(ctx->cm_id, &qp_attr, &resp.qp_attr_mask);
+	if (result)
+		goto out;
+
+	ib_copy_qp_attr_to_user(&resp, &qp_attr);
+
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp)))
+		result = -EFAULT;
+
+out:
+	ib_ucm_ctx_put(ctx);
+	return result;
+}
+
+/*
+ * Reject listens whose masked service id matches either the CMA or the
+ * SDP service-id pattern.  Returns -EINVAL for those, 0 otherwise.
+ */
+static int ucm_validate_listen(__be64 service_id, __be64 service_mask)
+{
+	__be64 masked = service_id & service_mask;
+
+	if ((masked & IB_CMA_SERVICE_ID_MASK) == IB_CMA_SERVICE_ID)
+		return -EINVAL;
+
+	if ((masked & IB_SDP_SERVICE_ID_MASK) == IB_SDP_SERVICE_ID)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * IB_USER_CM_CMD_LISTEN: start listening on the cm_id of context cmd.id
+ * for the given service id/mask, after ucm_validate_listen() has accepted
+ * the pair.
+ *
+ * Returns -EFAULT, the lookup error from ib_ucm_ctx_get(), the validation
+ * error, or the result of ib_cm_listen().
+ */
+static ssize_t ib_ucm_listen(struct ib_ucm_file *file,
+			     const char __user *inbuf,
+			     int in_len, int out_len)
+{
+	struct ib_ucm_listen cmd;
+	struct ib_ucm_context *ctx;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ret = ucm_validate_listen(cmd.service_id, cmd.service_mask);
+	if (!ret)
+		ret = ib_cm_listen(ctx->cm_id, cmd.service_id,
+				   cmd.service_mask, NULL);
+
+	ib_ucm_ctx_put(ctx);
+	return ret;
+}
+
+/*
+ * IB_USER_CM_CMD_NOTIFY: forward cmd.event, cast to enum ib_event_type,
+ * to ib_cm_notify() for the cm_id of context cmd.id.
+ *
+ * Returns -EFAULT, the lookup error from ib_ucm_ctx_get(), or the result
+ * of ib_cm_notify().
+ */
+static ssize_t ib_ucm_notify(struct ib_ucm_file *file,
+			     const char __user *inbuf,
+			     int in_len, int out_len)
+{
+	struct ib_ucm_context *ctx;
+	struct ib_ucm_notify cmd;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ret = ib_cm_notify(ctx->cm_id, (enum ib_event_type) cmd.event);
+
+	ib_ucm_ctx_put(ctx);
+	return ret;
+}
+
+/*
+ * Copy @len bytes from the user address @src into a new kernel buffer and
+ * store it in *@dest.  *@dest is NULL whenever no buffer is returned
+ * (len == 0 or failure), so callers may kfree() it unconditionally.
+ *
+ * Returns 0, -ENOMEM, or -EFAULT.
+ */
+static int ib_ucm_alloc_data(const void **dest, u64 src, u32 len)
+{
+	void *buf;
+
+	*dest = NULL;
+	if (len == 0)
+		return 0;
+
+	buf = kmalloc(len, GFP_KERNEL);
+	if (buf == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(buf, (void __user *)(unsigned long)src, len) != 0) {
+		kfree(buf);
+		return -EFAULT;
+	}
+
+	*dest = buf;
+	return 0;
+}
+
+/*
+ * Read a user path record from the user address @src and convert it into
+ * a newly allocated kernel path record stored in *@path.  A zero @src
+ * means "no path": *@path is set to NULL and 0 is returned.  *@path is
+ * also NULL after a failure.
+ *
+ * Returns 0, -ENOMEM, or -EFAULT.
+ */
+static int ib_ucm_path_get(struct ib_sa_path_rec **path, u64 src)
+{
+	struct ib_sa_path_rec *kpath;
+	struct ib_user_path_rec upath;
+
+	*path = NULL;
+	if (src == 0)
+		return 0;
+
+	kpath = kmalloc(sizeof(*kpath), GFP_KERNEL);
+	if (kpath == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(&upath, (void __user *)(unsigned long)src,
+			   sizeof(upath)) != 0) {
+		kfree(kpath);
+		return -EFAULT;
+	}
+
+	ib_copy_path_rec_from_user(kpath, &upath);
+
+	*path = kpath;
+	return 0;
+}
+
+/*
+ * IB_USER_CM_CMD_SEND_REQ: build an ib_cm_req_param from the user command
+ * (private data and up to two path records are copied in from userspace)
+ * and pass it to ib_send_cm_req() for the cm_id of context cmd.id.
+ *
+ * Returns -EFAULT / -ENOMEM from the copy-in helpers, the lookup error
+ * from ib_ucm_ctx_get(), or the result of ib_send_cm_req().  All
+ * temporary buffers are freed on every path.
+ */
+static ssize_t ib_ucm_send_req(struct ib_ucm_file *file,
+			       const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	struct ib_cm_req_param param;
+	struct ib_ucm_context *ctx;
+	struct ib_ucm_req cmd;
+	int result;
+
+	/* NULL so the kfree() calls at 'done' are safe on early exits */
+	param.private_data   = NULL;
+	param.primary_path   = NULL;
+	param.alternate_path = NULL;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	result = ib_ucm_alloc_data(&param.private_data, cmd.data, cmd.len);
+	if (result)
+		goto done;
+
+	result = ib_ucm_path_get(&param.primary_path, cmd.primary_path);
+	if (result)
+		goto done;
+
+	result = ib_ucm_path_get(&param.alternate_path, cmd.alternate_path);
+	if (result)
+		goto done;
+
+	param.private_data_len           = cmd.len;
+	param.service_id                 = cmd.sid;
+	param.qp_num                     = cmd.qpn;
+	param.qp_type                    = cmd.qp_type;
+	param.starting_psn               = cmd.psn;
+	param.peer_to_peer               = cmd.peer_to_peer;
+	param.responder_resources        = cmd.responder_resources;
+	param.initiator_depth            = cmd.initiator_depth;
+	param.remote_cm_response_timeout = cmd.remote_cm_response_timeout;
+	param.flow_control               = cmd.flow_control;
+	param.local_cm_response_timeout  = cmd.local_cm_response_timeout;
+	param.retry_count                = cmd.retry_count;
+	param.rnr_retry_count            = cmd.rnr_retry_count;
+	param.max_cm_retries             = cmd.max_cm_retries;
+	param.srq                        = cmd.srq;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (!IS_ERR(ctx)) {
+		result = ib_send_cm_req(ctx->cm_id, &param);
+		ib_ucm_ctx_put(ctx);
+	} else
+		result = PTR_ERR(ctx);
+
+done:
+	kfree(param.private_data);
+	kfree(param.primary_path);
+	kfree(param.alternate_path);
+	return result;
+}
+
+/*
+ * IB_USER_CM_CMD_SEND_REP: build an ib_cm_rep_param from the user command
+ * and pass it to ib_send_cm_rep() for the cm_id of context cmd.id.  The
+ * context's uid is replaced by cmd.uid before the reply is sent.
+ *
+ * Returns -EFAULT / -ENOMEM from the copy-in, the lookup error from
+ * ib_ucm_ctx_get(), or the result of ib_send_cm_rep().
+ */
+static ssize_t ib_ucm_send_rep(struct ib_ucm_file *file,
+			       const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	struct ib_cm_rep_param param;
+	struct ib_ucm_context *ctx;
+	struct ib_ucm_rep cmd;
+	int result;
+
+	param.private_data = NULL;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	result = ib_ucm_alloc_data(&param.private_data, cmd.data, cmd.len);
+	if (result)
+		return result;
+
+	param.qp_num              = cmd.qpn;
+	param.starting_psn        = cmd.psn;
+	param.private_data_len    = cmd.len;
+	param.responder_resources = cmd.responder_resources;
+	param.initiator_depth     = cmd.initiator_depth;
+	param.failover_accepted   = cmd.failover_accepted;
+	param.flow_control        = cmd.flow_control;
+	param.rnr_retry_count     = cmd.rnr_retry_count;
+	param.srq                 = cmd.srq;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (!IS_ERR(ctx)) {
+		/* later events on this context report the new uid */
+		ctx->uid = cmd.uid;
+		result = ib_send_cm_rep(ctx->cm_id, &param);
+		ib_ucm_ctx_put(ctx);
+	} else
+		result = PTR_ERR(ctx);
+
+	kfree(param.private_data);
+	return result;
+}
+
+/*
+ * Shared body of the commands that send nothing but private data: copy
+ * the data in from userspace and hand it to @func together with the
+ * cm_id of context cmd.id.
+ *
+ * Returns -EFAULT / -ENOMEM from the copy-in, the lookup error from
+ * ib_ucm_ctx_get(), or whatever @func returns.
+ */
+static ssize_t ib_ucm_send_private_data(struct ib_ucm_file *file,
+					const char __user *inbuf, int in_len,
+					int (*func)(struct ib_cm_id *cm_id,
+						    const void *private_data,
+						    u8 private_data_len))
+{
+	struct ib_ucm_private_data cmd;
+	struct ib_ucm_context *ctx;
+	const void *pdata = NULL;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ret = ib_ucm_alloc_data(&pdata, cmd.data, cmd.len);
+	if (ret)
+		return ret;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (IS_ERR(ctx)) {
+		ret = PTR_ERR(ctx);
+	} else {
+		ret = func(ctx->cm_id, pdata, cmd.len);
+		ib_ucm_ctx_put(ctx);
+	}
+
+	kfree(pdata);
+	return ret;
+}
+
+/* IB_USER_CM_CMD_SEND_RTU: private-data-only send via ib_send_cm_rtu(). */
+static ssize_t ib_ucm_send_rtu(struct ib_ucm_file *file,
+			       const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_rtu);
+}
+
+/* IB_USER_CM_CMD_SEND_DREQ: private-data-only send via ib_send_cm_dreq(). */
+static ssize_t ib_ucm_send_dreq(struct ib_ucm_file *file,
+				const char __user *inbuf,
+				int in_len, int out_len)
+{
+	return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_dreq);
+}
+
+/* IB_USER_CM_CMD_SEND_DREP: private-data-only send via ib_send_cm_drep(). */
+static ssize_t ib_ucm_send_drep(struct ib_ucm_file *file,
+				const char __user *inbuf,
+				int in_len, int out_len)
+{
+	return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_drep);
+}
+
+/*
+ * Shared body of the commands that send a status plus an info buffer and
+ * private data: both buffers are copied in from userspace and handed to
+ * @func together with cmd.status and the cm_id of context cmd.id.
+ *
+ * Returns -EFAULT / -ENOMEM from the copy-in, the lookup error from
+ * ib_ucm_ctx_get(), or whatever @func returns.  Both buffers are freed on
+ * every path.
+ */
+static ssize_t ib_ucm_send_info(struct ib_ucm_file *file,
+				const char __user *inbuf, int in_len,
+				int (*func)(struct ib_cm_id *cm_id,
+					    int status,
+					    const void *info,
+					    u8 info_len,
+					    const void *data,
+					    u8 data_len))
+{
+	struct ib_ucm_info cmd;
+	struct ib_ucm_context *ctx;
+	const void *pdata = NULL;
+	const void *pinfo = NULL;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ret = ib_ucm_alloc_data(&pdata, cmd.data, cmd.data_len);
+	if (ret)
+		goto out_free;
+
+	ret = ib_ucm_alloc_data(&pinfo, cmd.info, cmd.info_len);
+	if (ret)
+		goto out_free;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (IS_ERR(ctx)) {
+		ret = PTR_ERR(ctx);
+	} else {
+		ret = func(ctx->cm_id, cmd.status, pinfo, cmd.info_len,
+			   pdata, cmd.data_len);
+		ib_ucm_ctx_put(ctx);
+	}
+
+out_free:
+	kfree(pdata);
+	kfree(pinfo);
+	return ret;
+}
+
+/*
+ * IB_USER_CM_CMD_SEND_REJ: status + info + data send via ib_send_cm_rej().
+ * The (void *) cast adapts ib_send_cm_rej to the callback type expected by
+ * ib_ucm_send_info().
+ */
+static ssize_t ib_ucm_send_rej(struct ib_ucm_file *file,
+			       const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	return ib_ucm_send_info(file, inbuf, in_len, (void *)ib_send_cm_rej);
+}
+
+/*
+ * IB_USER_CM_CMD_SEND_APR: status + info + data send via ib_send_cm_apr().
+ * The (void *) cast adapts ib_send_cm_apr to the callback type expected by
+ * ib_ucm_send_info().
+ */
+static ssize_t ib_ucm_send_apr(struct ib_ucm_file *file,
+			       const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	return ib_ucm_send_info(file, inbuf, in_len, (void *)ib_send_cm_apr);
+}
+
+/*
+ * IB_USER_CM_CMD_SEND_MRA: copy the private data in from userspace and
+ * call ib_send_cm_mra() with cmd.timeout for the cm_id of context cmd.id.
+ *
+ * Returns -EFAULT / -ENOMEM from the copy-in, the lookup error from
+ * ib_ucm_ctx_get(), or the result of ib_send_cm_mra().
+ */
+static ssize_t ib_ucm_send_mra(struct ib_ucm_file *file,
+			       const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	struct ib_ucm_mra cmd;
+	struct ib_ucm_context *ctx;
+	const void *pdata = NULL;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ret = ib_ucm_alloc_data(&pdata, cmd.data, cmd.len);
+	if (ret)
+		return ret;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (IS_ERR(ctx)) {
+		ret = PTR_ERR(ctx);
+	} else {
+		ret = ib_send_cm_mra(ctx->cm_id, cmd.timeout, pdata, cmd.len);
+		ib_ucm_ctx_put(ctx);
+	}
+
+	kfree(pdata);
+	return ret;
+}
+
+/*
+ * IB_USER_CM_CMD_SEND_LAP: copy the private data and the path record in
+ * from userspace and call ib_send_cm_lap() for the cm_id of context
+ * cmd.id.
+ *
+ * Returns -EFAULT / -ENOMEM from the copy-in helpers, the lookup error
+ * from ib_ucm_ctx_get(), or the result of ib_send_cm_lap().  Both
+ * temporary buffers are freed on every path.
+ */
+static ssize_t ib_ucm_send_lap(struct ib_ucm_file *file,
+			       const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	struct ib_ucm_lap cmd;
+	struct ib_ucm_context *ctx;
+	struct ib_sa_path_rec *alt_path = NULL;
+	const void *pdata = NULL;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ret = ib_ucm_alloc_data(&pdata, cmd.data, cmd.len);
+	if (ret)
+		goto out_free;
+
+	ret = ib_ucm_path_get(&alt_path, cmd.path);
+	if (ret)
+		goto out_free;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (IS_ERR(ctx)) {
+		ret = PTR_ERR(ctx);
+	} else {
+		ret = ib_send_cm_lap(ctx->cm_id, alt_path, pdata, cmd.len);
+		ib_ucm_ctx_put(ctx);
+	}
+
+out_free:
+	kfree(pdata);
+	kfree(alt_path);
+	return ret;
+}
+
+/*
+ * IB_USER_CM_CMD_SEND_SIDR_REQ: build an ib_cm_sidr_req_param from the
+ * user command (private data and path record are copied in from
+ * userspace) and pass it to ib_send_cm_sidr_req() for the cm_id of
+ * context cmd.id.
+ *
+ * Returns -EFAULT / -ENOMEM from the copy-in helpers, the lookup error
+ * from ib_ucm_ctx_get(), or the result of ib_send_cm_sidr_req().
+ */
+static ssize_t ib_ucm_send_sidr_req(struct ib_ucm_file *file,
+				    const char __user *inbuf,
+				    int in_len, int out_len)
+{
+	struct ib_cm_sidr_req_param param;
+	struct ib_ucm_context *ctx;
+	struct ib_ucm_sidr_req cmd;
+	int result;
+
+	/* NULL so the kfree() calls at 'done' are safe on early exits */
+	param.private_data = NULL;
+	param.path = NULL;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	result = ib_ucm_alloc_data(&param.private_data, cmd.data, cmd.len);
+	if (result)
+		goto done;
+
+	result = ib_ucm_path_get(&param.path, cmd.path);
+	if (result)
+		goto done;
+
+	param.private_data_len = cmd.len;
+	param.service_id       = cmd.sid;
+	param.timeout_ms       = cmd.timeout;
+	param.max_cm_retries   = cmd.max_cm_retries;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (!IS_ERR(ctx)) {
+		result = ib_send_cm_sidr_req(ctx->cm_id, &param);
+		ib_ucm_ctx_put(ctx);
+	} else
+		result = PTR_ERR(ctx);
+
+done:
+	kfree(param.private_data);
+	kfree(param.path);
+	return result;
+}
+
+/*
+ * IB_USER_CM_CMD_SEND_SIDR_REP: build an ib_cm_sidr_rep_param from the
+ * user command (private data and info are copied in from userspace) and
+ * pass it to ib_send_cm_sidr_rep() for the cm_id of context cmd.id.
+ *
+ * Returns -EFAULT / -ENOMEM from the copy-in helpers, the lookup error
+ * from ib_ucm_ctx_get(), or the result of ib_send_cm_sidr_rep().
+ */
+static ssize_t ib_ucm_send_sidr_rep(struct ib_ucm_file *file,
+				    const char __user *inbuf,
+				    int in_len, int out_len)
+{
+	struct ib_cm_sidr_rep_param param;
+	struct ib_ucm_sidr_rep cmd;
+	struct ib_ucm_context *ctx;
+	int result;
+
+	/*
+	 * param.private_data needs no explicit NULL: the first
+	 * ib_ucm_alloc_data() call clears its destination before anything
+	 * can fail.
+	 */
+	param.info = NULL;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	result = ib_ucm_alloc_data(&param.private_data,
+				   cmd.data, cmd.data_len);
+	if (result)
+		goto done;
+
+	result = ib_ucm_alloc_data(&param.info, cmd.info, cmd.info_len);
+	if (result)
+		goto done;
+
+	param.qp_num		= cmd.qpn;
+	param.qkey		= cmd.qkey;
+	param.status		= cmd.status;
+	param.info_length	= cmd.info_len;
+	param.private_data_len	= cmd.data_len;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (!IS_ERR(ctx)) {
+		result = ib_send_cm_sidr_rep(ctx->cm_id, &param);
+		ib_ucm_ctx_put(ctx);
+	} else
+		result = PTR_ERR(ctx);
+
+done:
+	kfree(param.private_data);
+	kfree(param.info);
+	return result;
+}
+
+/*
+ * Command dispatch table used by ib_ucm_write(): indexed by the command
+ * number from the request header.  Each handler receives the payload that
+ * follows the header plus the header's in/out lengths and returns 0 on
+ * success or a negative errno.
+ */
+static ssize_t (*ucm_cmd_table[])(struct ib_ucm_file *file,
+				  const char __user *inbuf,
+				  int in_len, int out_len) = {
+	[IB_USER_CM_CMD_CREATE_ID]     = ib_ucm_create_id,
+	[IB_USER_CM_CMD_DESTROY_ID]    = ib_ucm_destroy_id,
+	[IB_USER_CM_CMD_ATTR_ID]       = ib_ucm_attr_id,
+	[IB_USER_CM_CMD_LISTEN]        = ib_ucm_listen,
+	[IB_USER_CM_CMD_NOTIFY]        = ib_ucm_notify,
+	[IB_USER_CM_CMD_SEND_REQ]      = ib_ucm_send_req,
+	[IB_USER_CM_CMD_SEND_REP]      = ib_ucm_send_rep,
+	[IB_USER_CM_CMD_SEND_RTU]      = ib_ucm_send_rtu,
+	[IB_USER_CM_CMD_SEND_DREQ]     = ib_ucm_send_dreq,
+	[IB_USER_CM_CMD_SEND_DREP]     = ib_ucm_send_drep,
+	[IB_USER_CM_CMD_SEND_REJ]      = ib_ucm_send_rej,
+	[IB_USER_CM_CMD_SEND_MRA]      = ib_ucm_send_mra,
+	[IB_USER_CM_CMD_SEND_LAP]      = ib_ucm_send_lap,
+	[IB_USER_CM_CMD_SEND_APR]      = ib_ucm_send_apr,
+	[IB_USER_CM_CMD_SEND_SIDR_REQ] = ib_ucm_send_sidr_req,
+	[IB_USER_CM_CMD_SEND_SIDR_REP] = ib_ucm_send_sidr_rep,
+	[IB_USER_CM_CMD_EVENT]	       = ib_ucm_event,
+	[IB_USER_CM_CMD_INIT_QP_ATTR]  = ib_ucm_init_qp_attr,
+};
+
+/*
+ * write() entry point: every write carries one command, consisting of an
+ * ib_ucm_cmd_hdr followed by hdr.in bytes of command payload.
+ *
+ * Returns @len when the handler succeeded, the handler's negative errno
+ * when it failed, -EFAULT if the header cannot be read, and -EINVAL for a
+ * short write, an out-of-range or unimplemented command number, or a
+ * payload length that exceeds what was written.
+ */
+static ssize_t ib_ucm_write(struct file *filp, const char __user *buf,
+			    size_t len, loff_t *pos)
+{
+	struct ib_ucm_file *file = filp->private_data;
+	struct ib_ucm_cmd_hdr hdr;
+	ssize_t result;
+
+	if (len < sizeof(hdr))
+		return -EINVAL;
+
+	if (copy_from_user(&hdr, buf, sizeof(hdr)))
+		return -EFAULT;
+
+	if (hdr.cmd < 0 || hdr.cmd >= ARRAY_SIZE(ucm_cmd_table))
+		return -EINVAL;
+
+	/*
+	 * The table is built with designated initialisers, so a command
+	 * number without a handler leaves a NULL slot; refuse it instead of
+	 * jumping through a NULL pointer on user-controlled input.
+	 */
+	if (!ucm_cmd_table[hdr.cmd])
+		return -EINVAL;
+
+	if (hdr.in + sizeof(hdr) > len)
+		return -EINVAL;
+
+	result = ucm_cmd_table[hdr.cmd](file, buf + sizeof(hdr),
+					hdr.in, hdr.out);
+	if (!result)
+		result = len;
+
+	return result;
+}
+
+/*
+ * poll() entry point: register on the file's wait queue and report the
+ * file readable (POLLIN | POLLRDNORM) while at least one event is queued.
+ */
+static unsigned int ib_ucm_poll(struct file *filp,
+				struct poll_table_struct *wait)
+{
+	struct ib_ucm_file *ucm_file = filp->private_data;
+
+	poll_wait(filp, &ucm_file->poll_wait, wait);
+
+	if (list_empty(&ucm_file->events))
+		return 0;
+
+	return POLLIN | POLLRDNORM;
+}
+
+/*
+ * ib_ucm_open() does not need the BKL:
+ *
+ *  - no global state is referred to;
+ *  - there is no ioctl method to race against;
+ *  - no further module initialization is required for open to work
+ *    after the device is registered.
+ */
+static int ib_ucm_open(struct inode *inode, struct file *filp)
+{
+	struct ib_ucm_file *ucm_file;
+
+	ucm_file = kzalloc(sizeof(*ucm_file), GFP_KERNEL);
+	if (ucm_file == NULL)
+		return -ENOMEM;
+
+	/* per-file synchronisation and empty context/event lists */
+	mutex_init(&ucm_file->file_mutex);
+	init_waitqueue_head(&ucm_file->poll_wait);
+	INIT_LIST_HEAD(&ucm_file->ctxs);
+	INIT_LIST_HEAD(&ucm_file->events);
+
+	/* tie the state to the struct file and to the ucm device */
+	ucm_file->device = container_of(inode->i_cdev->si_drv1, struct ib_ucm_device, cdev);
+	ucm_file->filp = filp;
+	filp->private_data = ucm_file;
+
+	return 0;
+}
+
+/*
+ * release() entry point: destroy every context still owned by the file
+ * (id table entry, cm_id, queued events, memory) and free the per-file
+ * state.  Always returns 0.
+ */
+static int ib_ucm_close(struct inode *inode, struct file *filp)
+{
+	struct ib_ucm_file *file = filp->private_data;
+	struct ib_ucm_context *ctx;
+
+	mutex_lock(&file->file_mutex);
+	while (!list_empty(&file->ctxs)) {
+		ctx = list_entry(file->ctxs.next,
+				 struct ib_ucm_context, file_list);
+		/* the teardown below takes file_mutex itself */
+		mutex_unlock(&file->file_mutex);
+
+		mutex_lock(&ctx_id_mutex);
+		idr_remove(&ctx_id_table, ctx->id);
+		mutex_unlock(&ctx_id_mutex);
+
+		ib_destroy_cm_id(ctx->cm_id);
+		/* also unlinks ctx from file->ctxs, so the loop advances */
+		ib_ucm_cleanup_events(ctx);
+		kfree(ctx);
+
+		mutex_lock(&file->file_mutex);
+	}
+	mutex_unlock(&file->file_mutex);
+	kfree(file);
+	return 0;
+}
+
+/*
+ * Release callback of ib_ucm_device.dev: removes the character device,
+ * returns the device number to dev_map and frees the structure.
+ */
+static void ib_ucm_release_dev(struct device *dev)
+{
+	struct ib_ucm_device *ucm_dev =
+		container_of(dev, struct ib_ucm_device, dev);
+
+	cdev_del(&ucm_dev->cdev);
+	clear_bit(ucm_dev->devnum, dev_map);
+	kfree(ucm_dev);
+}
+
+/*
+ * File operations of the ucm character devices.  Commands are issued with
+ * write(); there is no read or ioctl method.
+ */
+static const struct file_operations ucm_fops = {
+	.owner 	 = THIS_MODULE,
+	.open 	 = ib_ucm_open,
+	.release = ib_ucm_close,
+	.write 	 = ib_ucm_write,
+	.poll    = ib_ucm_poll,
+};
+
+/* "ibdev" attribute: name of the IB device behind this ucm device. */
+static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr,
+			  char *buf)
+{
+	struct ib_ucm_device *ucm_dev =
+		container_of(dev, struct ib_ucm_device, dev);
+	struct ib_device *ib_dev = ucm_dev->ib_dev;
+
+	return sprintf(buf, "%s\n", ib_dev->name);
+}
+static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
+
+/*
+ * IB client "add" callback: create the ucmN character device and class
+ * device for @device.  Devices without alloc_ucontext or with a non-IB
+ * transport are ignored.  Failures are silent; the device then simply has
+ * no ucm node.
+ *
+ * Error unwinding distinguishes two phases:
+ *  - before the class device is registered, the pieces set up so far
+ *    (cdev, dev_map bit, memory) are undone by hand;
+ *  - once device_register() has succeeded, the structure is owned by the
+ *    device and device_unregister() alone is used, exactly as in
+ *    ib_ucm_remove_one(): ib_ucm_release_dev() then removes the cdev,
+ *    clears the dev_map bit and frees the memory.  Undoing them by hand as
+ *    well would repeat all three.
+ */
+static void ib_ucm_add_one(struct ib_device *device)
+{
+	struct ib_ucm_device *ucm_dev;
+
+	if (!device->alloc_ucontext ||
+	    rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+		return;
+
+	ucm_dev = kzalloc(sizeof *ucm_dev, GFP_KERNEL);
+	if (!ucm_dev)
+		return;
+
+	ucm_dev->ib_dev = device;
+
+	ucm_dev->devnum = find_first_zero_bit(dev_map, IB_UCM_MAX_DEVICES);
+	if (ucm_dev->devnum >= IB_UCM_MAX_DEVICES)
+		goto err;
+
+	set_bit(ucm_dev->devnum, dev_map);
+
+	cdev_init(&ucm_dev->cdev, &ucm_fops);
+	ucm_dev->cdev.owner = THIS_MODULE;
+	kobject_set_name(&ucm_dev->cdev.kobj, "ucm%d", ucm_dev->devnum);
+	/* the dev_map bit is already taken here, so give it back on failure */
+	if (cdev_add(&ucm_dev->cdev, IB_UCM_BASE_DEV + ucm_dev->devnum, 1))
+		goto err_bit;
+
+	ucm_dev->dev.class = &cm_class;
+	ucm_dev->dev.parent = device->dma_device;
+	ucm_dev->dev.devt = ucm_dev->cdev.dev;
+	ucm_dev->dev.release = ib_ucm_release_dev;
+	dev_set_name(&ucm_dev->dev, "ucm%d", ucm_dev->devnum);
+	if (device_register(&ucm_dev->dev))
+		goto err_cdev;
+
+	if (device_create_file(&ucm_dev->dev, &dev_attr_ibdev))
+		goto err_dev;
+
+	ib_set_client_data(device, &ucm_client, ucm_dev);
+	return;
+
+err_dev:
+	/* the release callback performs the remaining teardown */
+	device_unregister(&ucm_dev->dev);
+	return;
+err_cdev:
+	cdev_del(&ucm_dev->cdev);
+err_bit:
+	clear_bit(ucm_dev->devnum, dev_map);
+err:
+	kfree(ucm_dev);
+	return;
+}
+
+/*
+ * IB client "remove" callback: unregister the ucm device created for
+ * @device, if any.  The rest of the teardown is left to the device's
+ * release callback.
+ */
+static void ib_ucm_remove_one(struct ib_device *device)
+{
+	struct ib_ucm_device *ucm_dev;
+
+	ucm_dev = ib_get_client_data(device, &ucm_client);
+	if (ucm_dev)
+		device_unregister(&ucm_dev->dev);
+}
+
+/* "abi_version" class attribute: prints IB_USER_CM_ABI_VERSION. */
+static ssize_t show_abi_version(struct class *class, char *buf)
+{
+	return sprintf(buf, "%d\n", IB_USER_CM_ABI_VERSION);
+}
+static CLASS_ATTR(abi_version, S_IRUGO, show_abi_version, NULL);
+
+/*
+ * Module initialisation: reserve the character device number range,
+ * publish the abi_version class attribute and register as an IB client.
+ * Each failure undoes the steps already completed, in reverse order, and
+ * returns the failing step's error code.
+ */
+static int __init ib_ucm_init(void)
+{
+	int ret;
+
+	ret = register_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES,
+				     "infiniband_cm");
+	if (ret) {
+		printk(KERN_ERR "ucm: couldn't register device number\n");
+		goto error1;
+	}
+
+	ret = class_create_file(&cm_class, &class_attr_abi_version);
+	if (ret) {
+		printk(KERN_ERR "ucm: couldn't create abi_version attribute\n");
+		goto error2;
+	}
+
+	ret = ib_register_client(&ucm_client);
+	if (ret) {
+		printk(KERN_ERR "ucm: couldn't register client\n");
+		goto error3;
+	}
+	return 0;
+
+error3:
+	class_remove_file(&cm_class, &class_attr_abi_version);
+error2:
+	unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES);
+error1:
+	return ret;
+}
+
+/*
+ * Module exit: undo ib_ucm_init() in reverse order and release the
+ * memory held by the context id table.
+ */
+static void __exit ib_ucm_cleanup(void)
+{
+	ib_unregister_client(&ucm_client);
+	class_remove_file(&cm_class, &class_attr_abi_version);
+	unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES);
+	idr_destroy(&ctx_id_table);
+}
+
+module_init_order(ib_ucm_init, SI_ORDER_THIRD);
+module_exit(ib_ucm_cleanup);
diff --git a/sys/ofed/drivers/infiniband/core/ucma.c b/sys/ofed/drivers/infiniband/core/ucma.c
new file mode 100644
index 0000000..23cbf7b
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/ucma.c
@@ -0,0 +1,1337 @@
+/*
+ * Copyright (c) 2005-2006 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *	copyright notice, this list of conditions and the following
+ *	disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *	copyright notice, this list of conditions and the following
+ *	disclaimer in the documentation and/or other materials
+ *	provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/file.h>
+#include <linux/mutex.h>
+#include <linux/poll.h>
+#include <linux/idr.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/miscdevice.h>
+
+#include <rdma/rdma_user_cm.h>
+#include <rdma/ib_marshall.h>
+#include <rdma/rdma_cm.h>
+#include <rdma/rdma_cm_ib.h>
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("RDMA Userspace Connection Manager Access");
+MODULE_LICENSE("Dual BSD/GPL");
+
+enum {
+	/* Upper bound, and fallback value, for the backlog in ucma_listen(). */
+	UCMA_MAX_BACKLOG	= 1024
+};
+
+/* Per-open-file state, created in ucma_open() and freed in ucma_close(). */
+struct ucma_file {
+	struct mutex		mut;		/* held around ctx_list/event_list updates */
+	struct file		*filp;		/* back-pointer set in ucma_open() */
+	struct list_head	ctx_list;	/* ucma_context.list entries */
+	struct list_head	event_list;	/* queued ucma_event.list entries */
+	wait_queue_head_t	poll_wait;	/* woken when an event is queued */
+};
+
+/* Userspace handle for one rdma_cm_id. */
+struct ucma_context {
+	int			id;		/* index in ctx_idr, returned to userspace */
+	struct completion	comp;		/* completed when ref drops to zero */
+	atomic_t		ref;		/* starts at 1, see ucma_alloc_ctx() */
+	int			events_reported; /* events copied out by ucma_get_event() */
+	int			backlog;	/* connect requests that may still be queued */
+
+	struct ucma_file	*file;		/* owning file; changed by ucma_migrate_id() */
+	struct rdma_cm_id	*cm_id;
+	u64			uid;		/* user cookie echoed back in event responses */
+
+	struct list_head	list;		/* entry in file->ctx_list */
+	struct list_head	mc_list;	/* ucma_multicast.list entries */
+};
+
+/* One multicast join made through a context. */
+struct ucma_multicast {
+	struct ucma_context	*ctx;		/* context the join belongs to */
+	int			id;		/* index in multicast_idr, returned to userspace */
+	int			events_reported; /* events copied out for this join */
+
+	u64			uid;		/* user cookie echoed back in event responses */
+	struct list_head	list;		/* entry in ctx->mc_list */
+	struct sockaddr_storage	addr;		/* multicast address passed to join/leave */
+};
+
+/* An event queued on a file until userspace fetches it. */
+struct ucma_event {
+	struct ucma_context	*ctx;		/* context of the cm_id that raised the event */
+	struct ucma_multicast	*mc;		/* set only for multicast join/error events */
+	struct list_head	list;		/* entry in file->event_list */
+	struct rdma_cm_id	*cm_id;		/* id the event was delivered on */
+	struct rdma_ucm_event_resp resp;	/* payload copied to userspace */
+};
+
+/* Global lock taken around every lookup/insert/remove on the two idrs below. */
+static DEFINE_MUTEX(mut);
+/* Maps ucma_context.id -> struct ucma_context. */
+static DEFINE_IDR(ctx_idr);
+/* Maps ucma_multicast.id -> struct ucma_multicast. */
+static DEFINE_IDR(multicast_idr);
+
+/*
+ * Look up a context by id and verify that it belongs to @file.
+ *
+ * Returns the context, ERR_PTR(-ENOENT) when the id is unknown, or
+ * ERR_PTR(-EINVAL) when the id belongs to a different file.  No reference
+ * is taken; callers in this file hold the global "mut" around the call.
+ */
+static inline struct ucma_context *_ucma_find_context(int id,
+						      struct ucma_file *file)
+{
+	struct ucma_context *found = idr_find(&ctx_idr, id);
+
+	if (found == NULL)
+		return ERR_PTR(-ENOENT);
+	if (found->file != file)
+		return ERR_PTR(-EINVAL);
+
+	return found;
+}
+
+/*
+ * Look up the context with the given id owned by @file and take a
+ * reference on it.  Returns the context or an ERR_PTR() value; a
+ * successful call must be balanced with ucma_put_ctx().
+ */
+static struct ucma_context *ucma_get_ctx(struct ucma_file *file, int id)
+{
+	struct ucma_context *ctx;
+
+	/* Lookup and reference bump happen under the same lock as idr_remove(). */
+	mutex_lock(&mut);
+	ctx = _ucma_find_context(id, file);
+	if (!IS_ERR(ctx))
+		atomic_inc(&ctx->ref);
+	mutex_unlock(&mut);
+	return ctx;
+}
+
+/*
+ * Drop one reference on @ctx.  When the count reaches zero, ctx->comp is
+ * completed; ucma_destroy_id() waits on it before freeing the context.
+ */
+static void ucma_put_ctx(struct ucma_context *ctx)
+{
+	if (atomic_dec_and_test(&ctx->ref))
+		complete(&ctx->comp);
+}
+
+/*
+ * Allocate a zeroed context for @file with an initial reference of 1,
+ * assign it an id in ctx_idr and append it to file->ctx_list.
+ *
+ * Returns the new context, or NULL if memory or an id could not be
+ * obtained.  Both callers in this file (ucma_get_event, ucma_create_id)
+ * hold file->mut across the call.
+ */
+static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file)
+{
+	struct ucma_context *ctx;
+	int ret;
+
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return NULL;
+
+	atomic_set(&ctx->ref, 1);
+	init_completion(&ctx->comp);
+	INIT_LIST_HEAD(&ctx->mc_list);
+	ctx->file = file;
+
+	/* Preload the idr outside the lock, retry if the preload was consumed. */
+	do {
+		ret = idr_pre_get(&ctx_idr, GFP_KERNEL);
+		if (!ret)
+			goto error;
+
+		mutex_lock(&mut);
+		ret = idr_get_new(&ctx_idr, ctx, &ctx->id);
+		mutex_unlock(&mut);
+	} while (ret == -EAGAIN);
+
+	if (ret)
+		goto error;
+
+	list_add_tail(&ctx->list, &file->ctx_list);
+	return ctx;
+
+error:
+	kfree(ctx);
+	return NULL;
+}
+
+/*
+ * Allocate a zeroed multicast record for @ctx, assign it an id in
+ * multicast_idr and append it to ctx->mc_list.
+ *
+ * Returns the new record, or NULL if memory or an id could not be
+ * obtained.  The only caller in this file (ucma_join_multicast) holds
+ * file->mut across the call.
+ */
+static struct ucma_multicast* ucma_alloc_multicast(struct ucma_context *ctx)
+{
+	struct ucma_multicast *mc;
+	int ret;
+
+	mc = kzalloc(sizeof(*mc), GFP_KERNEL);
+	if (!mc)
+		return NULL;
+
+	/* Preload the idr outside the lock, retry if the preload was consumed. */
+	do {
+		ret = idr_pre_get(&multicast_idr, GFP_KERNEL);
+		if (!ret)
+			goto error;
+
+		mutex_lock(&mut);
+		ret = idr_get_new(&multicast_idr, mc, &mc->id);
+		mutex_unlock(&mut);
+	} while (ret == -EAGAIN);
+
+	if (ret)
+		goto error;
+
+	mc->ctx = ctx;
+	list_add_tail(&mc->list, &ctx->mc_list);
+	return mc;
+
+error:
+	kfree(mc);
+	return NULL;
+}
+
+/*
+ * Copy the connection parameters of a kernel CM event into the userspace
+ * response layout, including the private data bytes when present.
+ *
+ * NOTE(review): the memcpy trusts src->private_data_len to fit in
+ * dst->private_data; no bound is checked here - confirm the producer
+ * guarantees it.
+ */
+static void ucma_copy_conn_event(struct rdma_ucm_conn_param *dst,
+				 struct rdma_conn_param *src)
+{
+	if (src->private_data_len)
+		memcpy(dst->private_data, src->private_data,
+		       src->private_data_len);
+	dst->private_data_len = src->private_data_len;
+	dst->responder_resources =src->responder_resources;
+	dst->initiator_depth = src->initiator_depth;
+	dst->flow_control = src->flow_control;
+	dst->retry_count = src->retry_count;
+	dst->rnr_retry_count = src->rnr_retry_count;
+	dst->srq = src->srq;
+	dst->qp_num = src->qp_num;
+}
+
+/*
+ * Copy the datagram (UD) parameters of a kernel CM event into the
+ * userspace response layout: private data, address handle attributes,
+ * QP number and qkey.
+ *
+ * NOTE(review): the memcpy trusts src->private_data_len to fit in
+ * dst->private_data; no bound is checked here - confirm the producer
+ * guarantees it.
+ */
+static void ucma_copy_ud_event(struct rdma_ucm_ud_param *dst,
+			       struct rdma_ud_param *src)
+{
+	if (src->private_data_len)
+		memcpy(dst->private_data, src->private_data,
+		       src->private_data_len);
+	dst->private_data_len = src->private_data_len;
+	ib_copy_ah_attr_to_user(&dst->ah_attr, &src->ah_attr);
+	dst->qp_num = src->qp_num;
+	dst->qkey = src->qkey;
+}
+
+/*
+ * Record which object a queued event refers to and fill in the uid/id pair
+ * reported to userspace.  Multicast join/error events carry the
+ * ucma_multicast pointer in the UD private_data field and report that
+ * record's uid/id; every other event reports the context's uid/id.
+ */
+static void ucma_set_event_context(struct ucma_context *ctx,
+				   struct rdma_cm_event *event,
+				   struct ucma_event *uevent)
+{
+	struct ucma_multicast *mcast;
+
+	uevent->ctx = ctx;
+
+	if (event->event == RDMA_CM_EVENT_MULTICAST_JOIN ||
+	    event->event == RDMA_CM_EVENT_MULTICAST_ERROR) {
+		mcast = (struct ucma_multicast *) event->param.ud.private_data;
+		uevent->mc = mcast;
+		uevent->resp.uid = mcast->uid;
+		uevent->resp.id = mcast->id;
+		return;
+	}
+
+	uevent->resp.uid = ctx->uid;
+	uevent->resp.id = ctx->id;
+}
+
+/*
+ * CM event callback (passed to rdma_create_id() in ucma_create_id()).
+ * Packages the event into a ucma_event and queues it on the owning file's
+ * event_list, then wakes up sleepers and select/poll waiters.
+ *
+ * Returns 0 in the normal case.  Returns -ENOMEM when a connect request
+ * arrives and the listener has no backlog left, and returns 1 when the
+ * event allocation fails for a connect request.
+ * NOTE(review): a nonzero return presumably asks the rdma_cm core to
+ * destroy the cm_id - confirm against the rdma_cm event handler contract.
+ */
+static int ucma_event_handler(struct rdma_cm_id *cm_id,
+			      struct rdma_cm_event *event)
+{
+	struct ucma_event *uevent;
+	struct ucma_context *ctx = cm_id->context;
+	int ret = 0;
+
+	uevent = kzalloc(sizeof(*uevent), GFP_KERNEL);
+	if (!uevent)
+		return event->event == RDMA_CM_EVENT_CONNECT_REQUEST;
+
+	uevent->cm_id = cm_id;
+	ucma_set_event_context(ctx, event, uevent);
+	uevent->resp.event = event->event;
+	uevent->resp.status = event->status;
+	/* The port space decides which member of the param union is valid. */
+	if (cm_id->ps == RDMA_PS_UDP || cm_id->ps == RDMA_PS_IPOIB)
+		ucma_copy_ud_event(&uevent->resp.param.ud, &event->param.ud);
+	else
+		ucma_copy_conn_event(&uevent->resp.param.conn,
+				     &event->param.conn);
+
+	mutex_lock(&ctx->file->mut);
+	if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST) {
+		if (!ctx->backlog) {
+			ret = -ENOMEM;
+			kfree(uevent);
+			goto out;
+		}
+		/* Slot is handed back in ucma_get_event() when the request is read. */
+		ctx->backlog--;
+	} else if (!ctx->uid) {
+		/*
+		 * We ignore events for new connections until userspace has set
+		 * their context.  This can only happen if an error occurs on a
+		 * new connection before the user accepts it.  This is okay,
+		 * since the accept will just fail later.
+		 */
+		kfree(uevent);
+		goto out;
+	}
+
+	list_add_tail(&uevent->list, &ctx->file->event_list);
+	wake_up_interruptible(&ctx->file->poll_wait);
+	if (ctx->file->filp)
+		selwakeup(&ctx->file->filp->f_selinfo);
+out:
+	mutex_unlock(&ctx->file->mut);
+	return ret;
+}
+
+/*
+ * RDMA_USER_CM_CMD_GET_EVENT handler: copy the oldest queued event to the
+ * user buffer named by cmd.response and remove it from the queue.
+ *
+ * Blocks until an event is available unless the file is O_NONBLOCK.
+ * Returns 0 on success, -ENOSPC if the output buffer is too small,
+ * -EFAULT on a bad user pointer, -EAGAIN for an empty queue in
+ * non-blocking mode, -ERESTARTSYS if the wait was interrupted, or -ENOMEM
+ * if a context for a connect request could not be allocated.
+ *
+ * NOTE(review): "wait" (DEFINE_WAIT) does not appear to be referenced in
+ * this function - confirm and drop if so.
+ * NOTE(review): if copy_to_user() fails after a connect-request context
+ * was allocated, that context stays in ctx_idr/ctx_list and the event
+ * stays queued, so a retry allocates another context - confirm whether an
+ * unwind is needed.
+ */
+static ssize_t ucma_get_event(struct ucma_file *file, const char __user *inbuf,
+			      int in_len, int out_len)
+{
+	struct ucma_context *ctx;
+	struct rdma_ucm_get_event cmd;
+	struct ucma_event *uevent;
+	int ret = 0;
+	DEFINE_WAIT(wait);
+
+	if (out_len < sizeof uevent->resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	mutex_lock(&file->mut);
+	while (list_empty(&file->event_list)) {
+		/* Never sleep with file->mut held; re-check under the lock. */
+		mutex_unlock(&file->mut);
+
+		if (file->filp->f_flags & O_NONBLOCK)
+			return -EAGAIN;
+
+		if (wait_event_interruptible(file->poll_wait,
+					     !list_empty(&file->event_list)))
+			return -ERESTARTSYS;
+
+		mutex_lock(&file->mut);
+	}
+
+	uevent = list_entry(file->event_list.next, struct ucma_event, list);
+
+	if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) {
+		/*
+		 * A connect request gets its own context: the new cm_id is
+		 * rebound to it and its id is what userspace sees.
+		 */
+		ctx = ucma_alloc_ctx(file);
+		if (!ctx) {
+			ret = -ENOMEM;
+			goto done;
+		}
+		/* Give back the slot taken in ucma_event_handler(). */
+		uevent->ctx->backlog++;
+		ctx->cm_id = uevent->cm_id;
+		ctx->cm_id->context = ctx;
+		uevent->resp.id = ctx->id;
+	}
+
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &uevent->resp, sizeof uevent->resp)) {
+		ret = -EFAULT;
+		goto done;
+	}
+
+	list_del(&uevent->list);
+	uevent->ctx->events_reported++;
+	if (uevent->mc)
+		uevent->mc->events_reported++;
+	kfree(uevent);
+done:
+	mutex_unlock(&file->mut);
+	return ret;
+}
+
+/*
+ * RDMA_USER_CM_CMD_CREATE_ID handler: allocate a context, create the
+ * underlying rdma_cm_id for the requested port space and return the new
+ * context id to userspace through cmd.response.
+ *
+ * Returns 0 on success, -ENOSPC if the output buffer is too small,
+ * -EFAULT on a bad user pointer, -ENOMEM if the context cannot be
+ * allocated, or the error reported by rdma_create_id().
+ */
+static ssize_t ucma_create_id(struct ucma_file *file,
+				const char __user *inbuf,
+				int in_len, int out_len)
+{
+	struct rdma_ucm_create_id cmd;
+	struct rdma_ucm_create_id_resp resp;
+	struct ucma_context *ctx;
+	int ret;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	mutex_lock(&file->mut);
+	ctx = ucma_alloc_ctx(file);
+	mutex_unlock(&file->mut);
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->uid = cmd.uid;
+	ctx->cm_id = rdma_create_id(ucma_event_handler, ctx, cmd.ps);
+	if (IS_ERR(ctx->cm_id)) {
+		ret = PTR_ERR(ctx->cm_id);
+		goto err1;
+	}
+
+	resp.id = ctx->id;
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp))) {
+		ret = -EFAULT;
+		goto err2;
+	}
+	return 0;
+
+err2:
+	rdma_destroy_id(ctx->cm_id);
+err1:
+	mutex_lock(&mut);
+	idr_remove(&ctx_idr, ctx->id);
+	mutex_unlock(&mut);
+	/*
+	 * ucma_alloc_ctx() linked the context on file->ctx_list; unlink it
+	 * before freeing so ucma_close() never walks a freed entry.
+	 */
+	mutex_lock(&file->mut);
+	list_del(&ctx->list);
+	mutex_unlock(&file->mut);
+	kfree(ctx);
+	return ret;
+}
+
+/*
+ * Free every multicast record still attached to @ctx, removing each one
+ * from ctx->mc_list and from multicast_idr under the global lock.
+ */
+static void ucma_cleanup_multicast(struct ucma_context *ctx)
+{
+	struct ucma_multicast *mc, *tmp;
+
+	mutex_lock(&mut);
+	list_for_each_entry_safe(mc, tmp, &ctx->mc_list, list) {
+		list_del(&mc->list);
+		idr_remove(&multicast_idr, mc->id);
+		kfree(mc);
+	}
+	mutex_unlock(&mut);
+}
+
+/*
+ * Discard all events queued on ctx->file that belong to @ctx.  A pending
+ * connect request also owns a new cm_id that userspace never saw, so that
+ * id is destroyed here.  The caller in this file (ucma_free_ctx) holds
+ * ctx->file->mut.
+ */
+static void ucma_cleanup_events(struct ucma_context *ctx)
+{
+	struct ucma_event *uevent, *tmp;
+
+	list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list) {
+		if (uevent->ctx != ctx)
+			continue;
+
+		list_del(&uevent->list);
+
+		/* clear incoming connections. */
+		if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST)
+			rdma_destroy_id(uevent->cm_id);
+
+		kfree(uevent);
+	}
+}
+
+/*
+ * Drop every event queued on the owning file that refers to the multicast
+ * record @mc.  Callers in this file hold the file's mutex.
+ */
+static void ucma_cleanup_mc_events(struct ucma_multicast *mc)
+{
+	struct list_head *queue = &mc->ctx->file->event_list;
+	struct ucma_event *ev, *next;
+
+	list_for_each_entry_safe(ev, next, queue, list) {
+		if (ev->mc == mc) {
+			list_del(&ev->list);
+			kfree(ev);
+		}
+	}
+}
+
+/*
+ * Destroy the cm_id behind @ctx, release its multicast records and queued
+ * events, unlink it from its file and free it.
+ *
+ * Returns the number of events that had been reported to userspace for
+ * this context.  The caller must already have removed the id from ctx_idr.
+ */
+static int ucma_free_ctx(struct ucma_context *ctx)
+{
+	int events_reported;
+
+	/* No new events will be generated after destroying the id. */
+	rdma_destroy_id(ctx->cm_id);
+
+	ucma_cleanup_multicast(ctx);
+
+	/* Cleanup events not yet reported to the user. */
+	mutex_lock(&ctx->file->mut);
+	ucma_cleanup_events(ctx);
+	list_del(&ctx->list);
+	mutex_unlock(&ctx->file->mut);
+
+	/* Read the counter before the context memory goes away. */
+	events_reported = ctx->events_reported;
+	kfree(ctx);
+	return events_reported;
+}
+
+/*
+ * RDMA_USER_CM_CMD_DESTROY_ID handler: remove the context from ctx_idr,
+ * wait for all outstanding references to drain, free it, and report the
+ * number of events that were delivered for it.
+ *
+ * Returns 0 on success, -ENOSPC if the output buffer is too small,
+ * -EFAULT on a bad user pointer, or the lookup error (-ENOENT/-EINVAL).
+ * Note that the context is already destroyed when the final copy_to_user()
+ * fails.
+ */
+static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	struct rdma_ucm_destroy_id cmd;
+	struct rdma_ucm_destroy_id_resp resp;
+	struct ucma_context *ctx;
+	int ret = 0;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	/* Unpublish the id first so ucma_get_ctx() can no longer find it. */
+	mutex_lock(&mut);
+	ctx = _ucma_find_context(cmd.id, file);
+	if (!IS_ERR(ctx))
+		idr_remove(&ctx_idr, ctx->id);
+	mutex_unlock(&mut);
+
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	/* Drop the initial reference, then wait for in-flight users. */
+	ucma_put_ctx(ctx);
+	wait_for_completion(&ctx->comp);
+	resp.events_reported = ucma_free_ctx(ctx);
+
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp)))
+		ret = -EFAULT;
+
+	return ret;
+}
+
+/*
+ * RDMA_USER_CM_CMD_BIND_ADDR handler: bind the cm_id named by cmd.id to the
+ * address supplied by userspace.
+ *
+ * Returns -EFAULT on a bad user pointer, the context lookup error, or
+ * whatever rdma_bind_addr() returns.
+ */
+static ssize_t ucma_bind_addr(struct ucma_file *file, const char __user *inbuf,
+			      int in_len, int out_len)
+{
+	struct rdma_ucm_bind_addr cmd;
+	struct ucma_context *ctx;
+	struct sockaddr *local;
+	int rc;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)) != 0)
+		return -EFAULT;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	local = (struct sockaddr *) &cmd.addr;
+	rc = rdma_bind_addr(ctx->cm_id, local);
+
+	ucma_put_ctx(ctx);
+	return rc;
+}
+
+/*
+ * RDMA_USER_CM_CMD_RESOLVE_ADDR handler: start address resolution from
+ * cmd.src_addr to cmd.dst_addr on the cm_id named by cmd.id, using the
+ * timeout supplied by userspace.
+ *
+ * Returns -EFAULT on a bad user pointer, the context lookup error, or
+ * whatever rdma_resolve_addr() returns.
+ */
+static ssize_t ucma_resolve_addr(struct ucma_file *file,
+				 const char __user *inbuf,
+				 int in_len, int out_len)
+{
+	struct rdma_ucm_resolve_addr cmd;
+	struct ucma_context *ctx;
+	struct sockaddr *src, *dst;
+	int rc;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)) != 0)
+		return -EFAULT;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	src = (struct sockaddr *) &cmd.src_addr;
+	dst = (struct sockaddr *) &cmd.dst_addr;
+	rc = rdma_resolve_addr(ctx->cm_id, src, dst, cmd.timeout_ms);
+
+	ucma_put_ctx(ctx);
+	return rc;
+}
+
+/*
+ * RDMA_USER_CM_CMD_RESOLVE_ROUTE handler: start route resolution on the
+ * cm_id named by cmd.id with the timeout supplied by userspace.
+ *
+ * Returns -EFAULT on a bad user pointer, the context lookup error, or
+ * whatever rdma_resolve_route() returns.
+ */
+static ssize_t ucma_resolve_route(struct ucma_file *file,
+				  const char __user *inbuf,
+				  int in_len, int out_len)
+{
+	struct rdma_ucm_resolve_route cmd;
+	struct ucma_context *ctx;
+	int rc;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)) != 0)
+		return -EFAULT;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	rc = rdma_resolve_route(ctx->cm_id, cmd.timeout_ms);
+
+	ucma_put_ctx(ctx);
+	return rc;
+}
+
+/*
+ * Fill the route part of a query-route response for an InfiniBand link
+ * layer.  With no path records, only the destination/source GIDs and the
+ * pkey taken from the device address are reported in ib_route[0];
+ * otherwise one or two path records are copied.  Any other path count
+ * leaves ib_route untouched.
+ */
+static void ucma_copy_ib_route(struct rdma_ucm_query_route_resp *resp,
+			       struct rdma_route *route)
+{
+	struct rdma_dev_addr *dev_addr;
+
+	resp->num_paths = route->num_paths;
+	switch (route->num_paths) {
+	case 0:
+		dev_addr = &route->addr.dev_addr;
+		rdma_addr_get_dgid(dev_addr,
+				   (union ib_gid *) &resp->ib_route[0].dgid);
+		rdma_addr_get_sgid(dev_addr,
+				   (union ib_gid *) &resp->ib_route[0].sgid);
+		resp->ib_route[0].pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr));
+		break;
+	case 2:
+		ib_copy_path_rec_to_user(&resp->ib_route[1],
+					 &route->path_rec[1]);
+		/* fall through */
+	case 1:
+		ib_copy_path_rec_to_user(&resp->ib_route[0],
+					 &route->path_rec[0]);
+		break;
+	default:
+		break;
+	}
+}
+
+/*
+ * Fill the route part of a query-route response for an Ethernet (IBoE)
+ * link layer.  With no path records, ib_route[0] gets a destination GID
+ * built from the destination MAC and the bound interface's VLAN id, the
+ * source GID from the device address, and a pkey of 0xffff; otherwise one
+ * or two path records are copied.  Any other path count leaves ib_route
+ * untouched.
+ */
+static void ucma_copy_iboe_route(struct rdma_ucm_query_route_resp *resp,
+				 struct rdma_route *route)
+{
+	struct rdma_dev_addr *dev_addr;
+	struct net_device *dev;
+	u16 vid = 0;
+
+	resp->num_paths = route->num_paths;
+	switch (route->num_paths) {
+	case 0:
+		dev_addr = &route->addr.dev_addr;
+		/* vid stays 0 when the bound interface cannot be found. */
+		dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if);
+		if (dev) {
+			vid = rdma_vlan_dev_vlan_id(dev);
+			dev_put(dev);
+		}
+
+		iboe_mac_vlan_to_ll((union ib_gid *) &resp->ib_route[0].dgid,
+				    dev_addr->dst_dev_addr, vid);
+		iboe_addr_get_sgid(dev_addr,
+				   (union ib_gid *) &resp->ib_route[0].sgid);
+		resp->ib_route[0].pkey = cpu_to_be16(0xffff);
+		break;
+	case 2:
+		ib_copy_path_rec_to_user(&resp->ib_route[1],
+					 &route->path_rec[1]);
+		/* fall through */
+	case 1:
+		ib_copy_path_rec_to_user(&resp->ib_route[0],
+					 &route->path_rec[0]);
+		break;
+	default:
+		break;
+	}
+}
+
+/*
+ * RDMA_USER_CM_CMD_QUERY_ROUTE handler: report the source/destination
+ * addresses of the cm_id and, once it is bound to a device, the node GUID,
+ * port number and route information.
+ *
+ * Returns 0 on success, -ENOSPC if the output buffer is too small,
+ * -EFAULT on a bad user pointer, or the context lookup error.
+ */
+static ssize_t ucma_query_route(struct ucma_file *file,
+				const char __user *inbuf,
+				int in_len, int out_len)
+{
+	struct rdma_ucm_query_route cmd;
+	struct rdma_ucm_query_route_resp resp;
+	struct ucma_context *ctx;
+	struct sockaddr *addr;
+	int ret = 0;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	/* Zero the whole response so unset fields are not stack garbage. */
+	memset(&resp, 0, sizeof resp);
+	/* Anything that is not AF_INET is copied with the sockaddr_in6 size. */
+	addr = (struct sockaddr *) &ctx->cm_id->route.addr.src_addr;
+	memcpy(&resp.src_addr, addr, addr->sa_family == AF_INET ?
+				     sizeof(struct sockaddr_in) :
+				     sizeof(struct sockaddr_in6));
+	addr = (struct sockaddr *) &ctx->cm_id->route.addr.dst_addr;
+	memcpy(&resp.dst_addr, addr, addr->sa_family == AF_INET ?
+				     sizeof(struct sockaddr_in) :
+				     sizeof(struct sockaddr_in6));
+	/* Not yet bound to a device: only the addresses are reported. */
+	if (!ctx->cm_id->device)
+		goto out;
+
+	resp.node_guid = (__force __u64) ctx->cm_id->device->node_guid;
+	resp.port_num = ctx->cm_id->port_num;
+	if (rdma_node_get_transport(ctx->cm_id->device->node_type) == RDMA_TRANSPORT_IB) {
+		switch (rdma_port_get_link_layer(ctx->cm_id->device, ctx->cm_id->port_num)) {
+		case IB_LINK_LAYER_INFINIBAND:
+			ucma_copy_ib_route(&resp, &ctx->cm_id->route);
+			break;
+		case IB_LINK_LAYER_ETHERNET:
+			ucma_copy_iboe_route(&resp, &ctx->cm_id->route);
+			break;
+		default:
+			break;
+		}
+	}
+
+out:
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp)))
+		ret = -EFAULT;
+
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
+/*
+ * Translate userspace connection parameters into the kernel layout.
+ * dst->private_data points into @src rather than copying the bytes, so
+ * @src must stay valid for as long as @dst is used.
+ */
+static void ucma_copy_conn_param(struct rdma_conn_param *dst,
+				 struct rdma_ucm_conn_param *src)
+{
+	dst->private_data = src->private_data;
+	dst->private_data_len = src->private_data_len;
+	dst->responder_resources =src->responder_resources;
+	dst->initiator_depth = src->initiator_depth;
+	dst->flow_control = src->flow_control;
+	dst->retry_count = src->retry_count;
+	dst->rnr_retry_count = src->rnr_retry_count;
+	dst->srq = src->srq;
+	dst->qp_num = src->qp_num;
+}
+
+/*
+ * RDMA_USER_CM_CMD_CONNECT handler: start an active connection on the
+ * cm_id named by cmd.id using the connection parameters from userspace.
+ *
+ * Returns -EFAULT on a bad user pointer, -EINVAL when the parameters are
+ * not marked valid, the context lookup error, or whatever rdma_connect()
+ * returns.
+ */
+static ssize_t ucma_connect(struct ucma_file *file, const char __user *inbuf,
+			    int in_len, int out_len)
+{
+	struct rdma_ucm_connect cmd;
+	struct rdma_conn_param param;
+	struct ucma_context *ctx;
+	int rc;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)) != 0)
+		return -EFAULT;
+
+	/* Connection parameters are mandatory for an active connect. */
+	if (cmd.conn_param.valid == 0)
+		return -EINVAL;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ucma_copy_conn_param(&param, &cmd.conn_param);
+	rc = rdma_connect(ctx->cm_id, &param);
+
+	ucma_put_ctx(ctx);
+	return rc;
+}
+
+/*
+ * RDMA_USER_CM_CMD_LISTEN handler: put the cm_id named by cmd.id into the
+ * listening state.  A requested backlog of 0, or one that is not below
+ * UCMA_MAX_BACKLOG, is replaced by UCMA_MAX_BACKLOG.
+ *
+ * Returns -EFAULT on a bad user pointer, the context lookup error, or
+ * whatever rdma_listen() returns.
+ */
+static ssize_t ucma_listen(struct ucma_file *file, const char __user *inbuf,
+			   int in_len, int out_len)
+{
+	struct rdma_ucm_listen cmd;
+	struct ucma_context *ctx;
+	int rc;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)) != 0)
+		return -EFAULT;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	if (cmd.backlog > 0 && cmd.backlog < UCMA_MAX_BACKLOG)
+		ctx->backlog = cmd.backlog;
+	else
+		ctx->backlog = UCMA_MAX_BACKLOG;
+
+	rc = rdma_listen(ctx->cm_id, ctx->backlog);
+
+	ucma_put_ctx(ctx);
+	return rc;
+}
+
+/*
+ * RDMA_USER_CM_CMD_ACCEPT handler: accept a connection on the cm_id named
+ * by cmd.id, with or without explicit connection parameters.
+ *
+ * Returns -EFAULT on a bad user pointer, the context lookup error, or
+ * whatever rdma_accept() returns.
+ *
+ * NOTE(review): ctx->uid is recorded only when conn_param.valid is set and
+ * is written before rdma_accept() and without file->mut, while
+ * ucma_event_handler() reads it under file->mut - confirm this is intended.
+ */
+static ssize_t ucma_accept(struct ucma_file *file, const char __user *inbuf,
+			   int in_len, int out_len)
+{
+	struct rdma_ucm_accept cmd;
+	struct rdma_conn_param conn_param;
+	struct ucma_context *ctx;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	if (cmd.conn_param.valid) {
+		ctx->uid = cmd.uid;
+		ucma_copy_conn_param(&conn_param, &cmd.conn_param);
+		ret = rdma_accept(ctx->cm_id, &conn_param);
+	} else
+		ret = rdma_accept(ctx->cm_id, NULL);
+
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
+/*
+ * RDMA_USER_CM_CMD_REJECT handler: reject a connection request on the cm_id
+ * named by cmd.id, passing along the private data from userspace.
+ *
+ * Returns -EFAULT on a bad user pointer, the context lookup error, or
+ * whatever rdma_reject() returns.
+ */
+static ssize_t ucma_reject(struct ucma_file *file, const char __user *inbuf,
+			   int in_len, int out_len)
+{
+	struct rdma_ucm_reject cmd;
+	struct ucma_context *ctx;
+	int rc;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)) != 0)
+		return -EFAULT;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	rc = rdma_reject(ctx->cm_id, cmd.private_data, cmd.private_data_len);
+
+	ucma_put_ctx(ctx);
+	return rc;
+}
+
+/*
+ * RDMA_USER_CM_CMD_DISCONNECT handler: disconnect the cm_id named by cmd.id.
+ *
+ * Returns -EFAULT on a bad user pointer, the context lookup error, or
+ * whatever rdma_disconnect() returns.
+ */
+static ssize_t ucma_disconnect(struct ucma_file *file, const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	struct rdma_ucm_disconnect cmd;
+	struct ucma_context *ctx;
+	int rc;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)) != 0)
+		return -EFAULT;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	rc = rdma_disconnect(ctx->cm_id);
+
+	ucma_put_ctx(ctx);
+	return rc;
+}
+
+/*
+ * RDMA_USER_CM_CMD_INIT_QP_ATTR handler: ask the CM for the QP attributes
+ * needed to move the QP of the cm_id named by cmd.id to cmd.qp_state, and
+ * copy them (plus the attribute mask) to userspace.
+ *
+ * Returns 0 on success, -ENOSPC if the output buffer is too small,
+ * -EFAULT on a bad user pointer, -EINVAL for a qp_state outside
+ * enum ib_qp_state, the context lookup error, or the error from
+ * rdma_init_qp_attr().
+ */
+static ssize_t ucma_init_qp_attr(struct ucma_file *file,
+				 const char __user *inbuf,
+				 int in_len, int out_len)
+{
+	struct rdma_ucm_init_qp_attr cmd;
+	struct ib_uverbs_qp_attr resp;
+	struct ucma_context *ctx;
+	struct ib_qp_attr qp_attr;
+	int ret;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	/* qp_state comes from userspace; IB_QPS_ERR is the last valid state. */
+	if (cmd.qp_state > IB_QPS_ERR)
+		return -EINVAL;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	resp.qp_attr_mask = 0;
+	memset(&qp_attr, 0, sizeof qp_attr);
+	qp_attr.qp_state = cmd.qp_state;
+	ret = rdma_init_qp_attr(ctx->cm_id, &qp_attr, &resp.qp_attr_mask);
+	if (ret)
+		goto out;
+
+	ib_copy_qp_attr_to_user(&resp, &qp_attr);
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp)))
+		ret = -EFAULT;
+
+out:
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
+/*
+ * Apply an RDMA_OPTION_ID level option.  Only RDMA_OPTION_ID_TOS is
+ * handled: it takes exactly one byte and sets the cm_id's service type.
+ *
+ * Returns 0 on success, -EINVAL for a wrong option length, or -ENOSYS for
+ * any other option name.
+ */
+static int ucma_set_option_id(struct ucma_context *ctx, int optname,
+			      void *optval, size_t optlen)
+{
+	if (optname != RDMA_OPTION_ID_TOS)
+		return -ENOSYS;
+
+	if (optlen != sizeof(u8))
+		return -EINVAL;
+
+	rdma_set_service_type(ctx->cm_id, *((u8 *) optval));
+	return 0;
+}
+
+/*
+ * Install a user-supplied IB path on the cm_id.  @path_data is an array of
+ * ib_path_rec_data spanning @optlen bytes; the first entry whose flags are
+ * exactly GMP | PRIMARY | BIDIRECTIONAL is unpacked and set as the single
+ * path, after which a ROUTE_RESOLVED event is queued for userspace.
+ *
+ * Returns -EINVAL if @optlen is not a multiple of the entry size or no
+ * entry matches, the error from rdma_set_ib_paths(), or the result of
+ * ucma_event_handler().
+ */
+static int ucma_set_ib_path(struct ucma_context *ctx,
+			    struct ib_path_rec_data *path_data, size_t optlen)
+{
+	struct ib_sa_path_rec sa_path;
+	struct rdma_cm_event event;
+	int ret;
+
+	if (optlen % sizeof(*path_data))
+		return -EINVAL;
+
+	/* On a match, path_data is left pointing at the chosen entry. */
+	for (; optlen; optlen -= sizeof(*path_data), path_data++) {
+		if (path_data->flags == (IB_PATH_GMP | IB_PATH_PRIMARY |
+					 IB_PATH_BIDIRECTIONAL))
+			break;
+	}
+
+	/* optlen reaching zero means the scan ran off the end. */
+	if (!optlen)
+		return -EINVAL;
+
+	ib_sa_unpack_path(path_data->path_rec, &sa_path);
+	ret = rdma_set_ib_paths(ctx->cm_id, &sa_path, 1);
+	if (ret)
+		return ret;
+
+	memset(&event, 0, sizeof event);
+	event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
+	return ucma_event_handler(ctx->cm_id, &event);
+}
+
+/*
+ * Apply an RDMA_OPTION_IB level option.  Only RDMA_OPTION_IB_PATH is
+ * handled; every other option name yields -ENOSYS.
+ */
+static int ucma_set_option_ib(struct ucma_context *ctx, int optname,
+			      void *optval, size_t optlen)
+{
+	if (optname == RDMA_OPTION_IB_PATH)
+		return ucma_set_ib_path(ctx, optval, optlen);
+
+	return -ENOSYS;
+}
+
+/*
+ * Dispatch a set-option request to the handler for its level
+ * (RDMA_OPTION_ID or RDMA_OPTION_IB).  Unknown levels yield -ENOSYS.
+ */
+static int ucma_set_option_level(struct ucma_context *ctx, int level,
+				 int optname, void *optval, size_t optlen)
+{
+	if (level == RDMA_OPTION_ID)
+		return ucma_set_option_id(ctx, optname, optval, optlen);
+
+	if (level == RDMA_OPTION_IB)
+		return ucma_set_option_ib(ctx, optname, optval, optlen);
+
+	return -ENOSYS;
+}
+
+/*
+ * RDMA_USER_CM_CMD_SET_OPTION handler: copy the option value from
+ * userspace into a temporary kernel buffer and apply it to the cm_id named
+ * by cmd.id.
+ *
+ * Returns -EFAULT on a bad user pointer, the context lookup error,
+ * -ENOMEM if the temporary buffer cannot be allocated, or the result of
+ * ucma_set_option_level().
+ *
+ * NOTE(review): cmd.optlen comes straight from userspace and is used as
+ * the allocation size without an upper bound - consider rejecting
+ * oversized values before kmalloc().
+ */
+static ssize_t ucma_set_option(struct ucma_file *file, const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	struct rdma_ucm_set_option cmd;
+	struct ucma_context *ctx;
+	void *optval;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	optval = kmalloc(cmd.optlen, GFP_KERNEL);
+	if (!optval) {
+		ret = -ENOMEM;
+		goto out1;
+	}
+
+	if (copy_from_user(optval, (void __user *) (unsigned long) cmd.optval,
+			   cmd.optlen)) {
+		ret = -EFAULT;
+		goto out2;
+	}
+
+	ret = ucma_set_option_level(ctx, cmd.level, cmd.optname, optval,
+				    cmd.optlen);
+out2:
+	kfree(optval);
+out1:
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
+/*
+ * RDMA_USER_CM_CMD_NOTIFY handler: forward an IB event reported by
+ * userspace to the cm_id named by cmd.id.
+ *
+ * Returns -EFAULT on a bad user pointer, the context lookup error, or
+ * whatever rdma_notify() returns.
+ */
+static ssize_t ucma_notify(struct ucma_file *file, const char __user *inbuf,
+			   int in_len, int out_len)
+{
+	struct rdma_ucm_notify cmd;
+	struct ucma_context *ctx;
+	enum ib_event_type ev;
+	int rc;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)) != 0)
+		return -EFAULT;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ev = (enum ib_event_type) cmd.event;
+	rc = rdma_notify(ctx->cm_id, ev);
+
+	ucma_put_ctx(ctx);
+	return rc;
+}
+
+/*
+ * RDMA_USER_CM_CMD_JOIN_MCAST handler: create a multicast record for the
+ * cm_id named by cmd.id, join the group at cmd.addr and return the
+ * record's id to userspace.
+ *
+ * Returns 0 on success, -ENOSPC if the output buffer is too small,
+ * -EFAULT on a bad user pointer, the context lookup error, -ENOMEM if the
+ * record cannot be allocated, or the error from rdma_join_multicast().
+ * file->mut is held from allocation until the record is either published
+ * or torn down again.
+ */
+static ssize_t ucma_join_multicast(struct ucma_file *file,
+				   const char __user *inbuf,
+				   int in_len, int out_len)
+{
+	struct rdma_ucm_join_mcast cmd;
+	struct rdma_ucm_create_id_resp resp;
+	struct ucma_context *ctx;
+	struct ucma_multicast *mc;
+	int ret;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	mutex_lock(&file->mut);
+	mc = ucma_alloc_multicast(ctx);
+	if (!mc) {
+		ret = -ENOMEM;
+		goto err1;
+	}
+
+	mc->uid = cmd.uid;
+	memcpy(&mc->addr, &cmd.addr, sizeof cmd.addr);
+	/* mc is passed as the join context; events hand it back to us. */
+	ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr, mc);
+	if (ret)
+		goto err2;
+
+	resp.id = mc->id;
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp))) {
+		ret = -EFAULT;
+		goto err3;
+	}
+
+	mutex_unlock(&file->mut);
+	ucma_put_ctx(ctx);
+	return 0;
+
+err3:
+	/* The join succeeded, so leave and drop any events it produced. */
+	rdma_leave_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr);
+	ucma_cleanup_mc_events(mc);
+err2:
+	mutex_lock(&mut);
+	idr_remove(&multicast_idr, mc->id);
+	mutex_unlock(&mut);
+	list_del(&mc->list);
+	kfree(mc);
+err1:
+	mutex_unlock(&file->mut);
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
+/*
+ * RDMA_USER_CM_CMD_LEAVE_MCAST handler: leave the multicast group named by
+ * cmd.id (an id from multicast_idr), discard its pending events, free the
+ * record and report how many of its events were delivered.
+ *
+ * Returns 0 on success, -ENOSPC if the output buffer is too small,
+ * -EFAULT on a bad user pointer, -ENOENT for an unknown id, or -EINVAL if
+ * the record belongs to another file.  The record is already freed when
+ * the final copy_to_user() fails.
+ */
+static ssize_t ucma_leave_multicast(struct ucma_file *file,
+				    const char __user *inbuf,
+				    int in_len, int out_len)
+{
+	struct rdma_ucm_destroy_id cmd;
+	struct rdma_ucm_destroy_id_resp resp;
+	struct ucma_multicast *mc;
+	int ret = 0;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	mutex_lock(&mut);
+	mc = idr_find(&multicast_idr, cmd.id);
+	if (!mc)
+		mc = ERR_PTR(-ENOENT);
+	else if (mc->ctx->file != file)
+		mc = ERR_PTR(-EINVAL);
+	else {
+		/* Unpublish the record and pin its context while we work. */
+		idr_remove(&multicast_idr, mc->id);
+		atomic_inc(&mc->ctx->ref);
+	}
+	mutex_unlock(&mut);
+
+	if (IS_ERR(mc)) {
+		ret = PTR_ERR(mc);
+		goto out;
+	}
+
+	rdma_leave_multicast(mc->ctx->cm_id, (struct sockaddr *) &mc->addr);
+	mutex_lock(&mc->ctx->file->mut);
+	ucma_cleanup_mc_events(mc);
+	list_del(&mc->list);
+	mutex_unlock(&mc->ctx->file->mut);
+
+	ucma_put_ctx(mc->ctx);
+	resp.events_reported = mc->events_reported;
+	kfree(mc);
+
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp)))
+		ret = -EFAULT;
+out:
+	return ret;
+}
+
+/*
+ * Lock the mutexes of two files in a fixed (address) order.  The two
+ * files must be distinct; the caller in this file (ucma_migrate_id) checks
+ * that before calling.
+ */
+static void ucma_lock_files(struct ucma_file *file1, struct ucma_file *file2)
+{
+	/* Acquire mutex's based on pointer comparison to prevent deadlock. */
+	if (file1 < file2) {
+		mutex_lock(&file1->mut);
+		mutex_lock(&file2->mut);
+	} else {
+		mutex_lock(&file2->mut);
+		mutex_lock(&file1->mut);
+	}
+}
+
+/*
+ * Release the two mutexes taken by ucma_lock_files(), in the reverse of
+ * the order in which they were acquired.
+ */
+static void ucma_unlock_files(struct ucma_file *file1, struct ucma_file *file2)
+{
+	if (file1 < file2) {
+		mutex_unlock(&file2->mut);
+		mutex_unlock(&file1->mut);
+	} else {
+		mutex_unlock(&file1->mut);
+		mutex_unlock(&file2->mut);
+	}
+}
+
+/*
+ * Move every event queued for @ctx from its current file's event_list to
+ * the tail of @file's event_list, keeping their relative order.  The
+ * caller in this file (ucma_migrate_id) holds both files' mutexes.
+ */
+static void ucma_move_events(struct ucma_context *ctx, struct ucma_file *file)
+{
+	struct ucma_event *uevent, *tmp;
+
+	list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list)
+		if (uevent->ctx == ctx)
+			list_move_tail(&uevent->list, &file->event_list);
+}
+
+/* Defined further down; needed here to validate the fd passed by userspace. */
+static const struct file_operations ucma_fops;
+
+/*
+ * RDMA_USER_CM_CMD_MIGRATE_ID handler: move the context cmd.id, currently
+ * owned by the ucma file behind cmd.fd, to @new_file (the file the command
+ * was written to), together with its queued events.  The number of events
+ * already reported for the context is returned through cmd.response.
+ *
+ * Returns 0 on success, -EFAULT on a bad user pointer, -ENOENT if cmd.fd
+ * is not an open file, -EINVAL if cmd.fd is not a ucma file, or the
+ * context lookup error.
+ */
+static ssize_t ucma_migrate_id(struct ucma_file *new_file,
+			       const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	struct rdma_ucm_migrate_id cmd;
+	struct rdma_ucm_migrate_resp resp;
+	struct ucma_context *ctx;
+	struct file *filp;
+	struct ucma_file *cur_file;
+	int ret = 0;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	/* Get current fd to protect against it being closed */
+	filp = fget(cmd.fd);
+	if (!filp)
+		return -ENOENT;
+
+	/*
+	 * cmd.fd can name any open file.  Only treat private_data as a
+	 * struct ucma_file when the file really belongs to this driver.
+	 */
+	if (filp->f_op != &ucma_fops) {
+		ret = -EINVAL;
+		goto file_put;
+	}
+
+	/* Validate current fd and prevent destruction of id. */
+	ctx = ucma_get_ctx(filp->private_data, cmd.id);
+	if (IS_ERR(ctx)) {
+		ret = PTR_ERR(ctx);
+		goto file_put;
+	}
+
+	cur_file = ctx->file;
+	if (cur_file == new_file) {
+		/* Nothing to move; just report the current count. */
+		resp.events_reported = ctx->events_reported;
+		goto response;
+	}
+
+	/*
+	 * Migrate events between fd's, maintaining order, and avoiding new
+	 * events being added before existing events.
+	 */
+	ucma_lock_files(cur_file, new_file);
+	mutex_lock(&mut);
+
+	list_move_tail(&ctx->list, &new_file->ctx_list);
+	ucma_move_events(ctx, new_file);
+	ctx->file = new_file;
+	resp.events_reported = ctx->events_reported;
+
+	mutex_unlock(&mut);
+	ucma_unlock_files(cur_file, new_file);
+
+response:
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp)))
+		ret = -EFAULT;
+
+	ucma_put_ctx(ctx);
+file_put:
+	fput(filp);
+	return ret;
+}
+
+/*
+ * Command dispatch table, indexed by the cmd field of the header written
+ * to the device.  A NULL slot marks a command that is not implemented;
+ * ucma_write() answers it with -ENOSYS.
+ */
+static ssize_t (*ucma_cmd_table[])(struct ucma_file *file,
+				   const char __user *inbuf,
+				   int in_len, int out_len) = {
+	[RDMA_USER_CM_CMD_CREATE_ID]	= ucma_create_id,
+	[RDMA_USER_CM_CMD_DESTROY_ID]	= ucma_destroy_id,
+	[RDMA_USER_CM_CMD_BIND_ADDR]	= ucma_bind_addr,
+	[RDMA_USER_CM_CMD_RESOLVE_ADDR]	= ucma_resolve_addr,
+	[RDMA_USER_CM_CMD_RESOLVE_ROUTE]= ucma_resolve_route,
+	[RDMA_USER_CM_CMD_QUERY_ROUTE]	= ucma_query_route,
+	[RDMA_USER_CM_CMD_CONNECT]	= ucma_connect,
+	[RDMA_USER_CM_CMD_LISTEN]	= ucma_listen,
+	[RDMA_USER_CM_CMD_ACCEPT]	= ucma_accept,
+	[RDMA_USER_CM_CMD_REJECT]	= ucma_reject,
+	[RDMA_USER_CM_CMD_DISCONNECT]	= ucma_disconnect,
+	[RDMA_USER_CM_CMD_INIT_QP_ATTR]	= ucma_init_qp_attr,
+	[RDMA_USER_CM_CMD_GET_EVENT]	= ucma_get_event,
+	[RDMA_USER_CM_CMD_GET_OPTION]	= NULL,
+	[RDMA_USER_CM_CMD_SET_OPTION]	= ucma_set_option,
+	[RDMA_USER_CM_CMD_NOTIFY]	= ucma_notify,
+	[RDMA_USER_CM_CMD_JOIN_MCAST]	= ucma_join_multicast,
+	[RDMA_USER_CM_CMD_LEAVE_MCAST]	= ucma_leave_multicast,
+	[RDMA_USER_CM_CMD_MIGRATE_ID]	= ucma_migrate_id
+};
+
+/*
+ * write() entry point.  Each write carries one command: a
+ * struct rdma_ucm_cmd_hdr followed by hdr.in bytes of command payload.
+ * The header is validated and the payload is handed to the handler from
+ * ucma_cmd_table.
+ *
+ * Returns @len when the handler succeeds, -EINVAL for a short write, an
+ * out-of-range command or a payload length that exceeds the write,
+ * -EFAULT on a bad user pointer, -ENOSYS for an unimplemented command, or
+ * the handler's error code.
+ */
+static ssize_t ucma_write(struct file *filp, const char __user *buf,
+			  size_t len, loff_t *pos)
+{
+	struct ucma_file *file = filp->private_data;
+	struct rdma_ucm_cmd_hdr hdr;
+	ssize_t ret;
+
+	if (len < sizeof(hdr))
+		return -EINVAL;
+
+	if (copy_from_user(&hdr, buf, sizeof(hdr)))
+		return -EFAULT;
+
+	/*
+	 * The comparison is done in an unsigned type, so it alone rejects
+	 * every index outside the table.
+	 */
+	if (hdr.cmd >= ARRAY_SIZE(ucma_cmd_table))
+		return -EINVAL;
+
+	if (hdr.in + sizeof(hdr) > len)
+		return -EINVAL;
+
+	if (!ucma_cmd_table[hdr.cmd])
+		return -ENOSYS;
+
+	ret = ucma_cmd_table[hdr.cmd](file, buf + sizeof(hdr), hdr.in, hdr.out);
+	if (!ret)
+		ret = len;
+
+	return ret;
+}
+
+/*
+ * poll() entry point: register on the file's wait queue and report the
+ * file as readable (POLLIN | POLLRDNORM) when at least one event is queued.
+ */
+static unsigned int ucma_poll(struct file *filp, struct poll_table_struct *wait)
+{
+	struct ucma_file *file = filp->private_data;
+	unsigned int mask = 0;
+
+	poll_wait(filp, &file->poll_wait, wait);
+
+	if (!list_empty(&file->event_list))
+		mask = POLLIN | POLLRDNORM;
+
+	return mask;
+}
+
+/*
+ * open() entry point: allocate and initialise the per-file state and
+ * attach it to filp->private_data.  Returns 0, or -ENOMEM if the
+ * allocation fails.
+ *
+ * ucma_open() does not need the BKL:
+ *
+ *  - no global state is referred to;
+ *  - there is no ioctl method to race against;
+ *  - no further module initialization is required for open to work
+ *    after the device is registered.
+ */
+static int ucma_open(struct inode *inode, struct file *filp)
+{
+	struct ucma_file *file;
+
+	/* Not zeroed: every member is initialised explicitly below. */
+	file = kmalloc(sizeof *file, GFP_KERNEL);
+	if (!file)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&file->event_list);
+	INIT_LIST_HEAD(&file->ctx_list);
+	init_waitqueue_head(&file->poll_wait);
+	mutex_init(&file->mut);
+
+	filp->private_data = file;
+	file->filp = filp;
+	return 0;
+}
+
+/*
+ * release() entry point: destroy every context still owned by the file,
+ * then free the per-file state.  Always returns 0.
+ *
+ * NOTE(review): unlike ucma_destroy_id(), this path does not drop the
+ * initial reference and wait on ctx->comp before freeing - presumably no
+ * command can be in flight once the file is being released; confirm.
+ */
+static int ucma_close(struct inode *inode, struct file *filp)
+{
+	struct ucma_file *file = filp->private_data;
+	struct ucma_context *ctx, *tmp;
+
+	mutex_lock(&file->mut);
+	list_for_each_entry_safe(ctx, tmp, &file->ctx_list, list) {
+		/* ucma_free_ctx() takes file->mut itself, so drop it here. */
+		mutex_unlock(&file->mut);
+
+		mutex_lock(&mut);
+		idr_remove(&ctx_idr, ctx->id);
+		mutex_unlock(&mut);
+
+		ucma_free_ctx(ctx);
+		mutex_lock(&file->mut);
+	}
+	mutex_unlock(&file->mut);
+	kfree(file);
+	return 0;
+}
+
+/* File operations of the rdma_cm device; commands arrive through write(). */
+static const struct file_operations ucma_fops = {
+	.owner 	 = THIS_MODULE,
+	.open 	 = ucma_open,
+	.release = ucma_close,
+	.write	 = ucma_write,
+	.poll    = ucma_poll,
+};
+
+/* Misc device named "rdma_cm" with a dynamically assigned minor number. */
+static struct miscdevice ucma_misc = {
+	.minor	= MISC_DYNAMIC_MINOR,
+	.name	= "rdma_cm",
+	.fops	= &ucma_fops,
+};
+
+/*
+ * Device attribute "abi_version" (read-only): prints
+ * RDMA_USER_CM_ABI_VERSION followed by a newline into buf and returns the
+ * number of bytes written.
+ */
+static ssize_t show_abi_version(struct device *dev,
+				struct device_attribute *attr,
+				char *buf)
+{
+	return sprintf(buf, "%d\n", RDMA_USER_CM_ABI_VERSION);
+}
+static DEVICE_ATTR(abi_version, S_IRUGO, show_abi_version, NULL);
+
+/*
+ * Module init: register the rdma_cm misc device and attach the
+ * abi_version attribute to it.  If the attribute cannot be created the
+ * device is deregistered again.  Returns 0 or the failing step's error.
+ */
+static int __init ucma_init(void)
+{
+	int ret;
+
+	ret = misc_register(&ucma_misc);
+	if (ret)
+		return ret;
+
+	ret = device_create_file(ucma_misc.this_device, &dev_attr_abi_version);
+	if (ret) {
+		printk(KERN_ERR "rdma_ucm: couldn't create abi_version attr\n");
+		goto err;
+	}
+	return 0;
+err:
+	misc_deregister(&ucma_misc);
+	return ret;
+}
+
+/*
+ * Module exit: remove the abi_version attribute, deregister the misc
+ * device and release the memory cached by both id tables.
+ */
+static void __exit ucma_cleanup(void)
+{
+	device_remove_file(ucma_misc.this_device, &dev_attr_abi_version);
+	misc_deregister(&ucma_misc);
+	idr_destroy(&ctx_idr);
+	/* multicast_idr is populated by ucma_alloc_multicast(); free it too. */
+	idr_destroy(&multicast_idr);
+}
+
+/* Module entry points. */
+module_init(ucma_init);
+module_exit(ucma_cleanup);
diff --git a/sys/ofed/drivers/infiniband/core/ud_header.c b/sys/ofed/drivers/infiniband/core/ud_header.c
new file mode 100644
index 0000000..e095a12
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/ud_header.c
@@ -0,0 +1,453 @@
+/*
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/if_ether.h>
+
+#include <rdma/ib_pack.h>
+
+/*
+ * STRUCT_FIELD(header, field) expands to the three designated
+ * initializers that tie a table entry to member @field of
+ * struct ib_unpacked_<header>: its byte offset, its size in bytes and a
+ * "<header>:<field>" name string.
+ */
+#define STRUCT_FIELD(header, field) \
+	.struct_offset_bytes = offsetof(struct ib_unpacked_ ## header, field),      \
+	.struct_size_bytes   = sizeof ((struct ib_unpacked_ ## header *) 0)->field, \
+	.field_name          = #header ":" #field
+
+/*
+ * Wire layout of the LRH as consumed by ib_pack()/ib_unpack(): each
+ * entry gives the word offset, bit offset and bit width of one member
+ * of struct ib_unpacked_lrh.  RESERVED entries cover bits that have no
+ * corresponding struct member.
+ */
+static const struct ib_field lrh_table[]  = {
+	{ STRUCT_FIELD(lrh, virtual_lane),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 4 },
+	{ STRUCT_FIELD(lrh, link_version),
+	  .offset_words = 0,
+	  .offset_bits  = 4,
+	  .size_bits    = 4 },
+	{ STRUCT_FIELD(lrh, service_level),
+	  .offset_words = 0,
+	  .offset_bits  = 8,
+	  .size_bits    = 4 },
+	{ RESERVED,
+	  .offset_words = 0,
+	  .offset_bits  = 12,
+	  .size_bits    = 2 },
+	{ STRUCT_FIELD(lrh, link_next_header),
+	  .offset_words = 0,
+	  .offset_bits  = 14,
+	  .size_bits    = 2 },
+	{ STRUCT_FIELD(lrh, destination_lid),
+	  .offset_words = 0,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 },
+	{ RESERVED,
+	  .offset_words = 1,
+	  .offset_bits  = 0,
+	  .size_bits    = 5 },
+	{ STRUCT_FIELD(lrh, packet_length),
+	  .offset_words = 1,
+	  .offset_bits  = 5,
+	  .size_bits    = 11 },
+	{ STRUCT_FIELD(lrh, source_lid),
+	  .offset_words = 1,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 }
+};
+
+/*
+ * Wire layout of the Ethernet header for ib_pack(): destination MAC
+ * (dmac_h 32 bits + dmac_l 16 bits), source MAC (smac_h 16 bits +
+ * smac_l 32 bits) and the 16-bit type field of struct ib_unpacked_eth.
+ */
+static const struct ib_field eth_table[]  = {
+	{ STRUCT_FIELD(eth, dmac_h),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 32 },
+	{ STRUCT_FIELD(eth, dmac_l),
+	  .offset_words = 1,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ STRUCT_FIELD(eth, smac_h),
+	  .offset_words = 1,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 },
+	{ STRUCT_FIELD(eth, smac_l),
+	  .offset_words = 2,
+	  .offset_bits  = 0,
+	  .size_bits    = 32 },
+	{ STRUCT_FIELD(eth, type),
+	  .offset_words = 3,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 }
+};
+
+/*
+ * Wire layout of the VLAN header for ib_pack(): a 16-bit tag followed
+ * by a 16-bit type, both members of struct ib_unpacked_vlan.
+ */
+static const struct ib_field vlan_table[]  = {
+	{ STRUCT_FIELD(vlan, tag),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ STRUCT_FIELD(vlan, type),
+	  .offset_words = 0,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 }
+};
+
+/*
+ * Wire layout of the GRH for ib_pack()/ib_unpack(): word offset, bit
+ * offset and bit width of each member of struct ib_unpacked_grh,
+ * ending with the two 128-bit GIDs.
+ */
+static const struct ib_field grh_table[]  = {
+	{ STRUCT_FIELD(grh, ip_version),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 4 },
+	{ STRUCT_FIELD(grh, traffic_class),
+	  .offset_words = 0,
+	  .offset_bits  = 4,
+	  .size_bits    = 8 },
+	{ STRUCT_FIELD(grh, flow_label),
+	  .offset_words = 0,
+	  .offset_bits  = 12,
+	  .size_bits    = 20 },
+	{ STRUCT_FIELD(grh, payload_length),
+	  .offset_words = 1,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ STRUCT_FIELD(grh, next_header),
+	  .offset_words = 1,
+	  .offset_bits  = 16,
+	  .size_bits    = 8 },
+	{ STRUCT_FIELD(grh, hop_limit),
+	  .offset_words = 1,
+	  .offset_bits  = 24,
+	  .size_bits    = 8 },
+	{ STRUCT_FIELD(grh, source_gid),
+	  .offset_words = 2,
+	  .offset_bits  = 0,
+	  .size_bits    = 128 },
+	{ STRUCT_FIELD(grh, destination_gid),
+	  .offset_words = 6,
+	  .offset_bits  = 0,
+	  .size_bits    = 128 }
+};
+
+/*
+ * Wire layout of the BTH for ib_pack()/ib_unpack(): word offset, bit
+ * offset and bit width of each member of struct ib_unpacked_bth.
+ * RESERVED entries cover bits that have no corresponding struct member.
+ */
+static const struct ib_field bth_table[]  = {
+	{ STRUCT_FIELD(bth, opcode),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 8 },
+	{ STRUCT_FIELD(bth, solicited_event),
+	  .offset_words = 0,
+	  .offset_bits  = 8,
+	  .size_bits    = 1 },
+	{ STRUCT_FIELD(bth, mig_req),
+	  .offset_words = 0,
+	  .offset_bits  = 9,
+	  .size_bits    = 1 },
+	{ STRUCT_FIELD(bth, pad_count),
+	  .offset_words = 0,
+	  .offset_bits  = 10,
+	  .size_bits    = 2 },
+	{ STRUCT_FIELD(bth, transport_header_version),
+	  .offset_words = 0,
+	  .offset_bits  = 12,
+	  .size_bits    = 4 },
+	{ STRUCT_FIELD(bth, pkey),
+	  .offset_words = 0,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 },
+	{ RESERVED,
+	  .offset_words = 1,
+	  .offset_bits  = 0,
+	  .size_bits    = 8 },
+	{ STRUCT_FIELD(bth, destination_qpn),
+	  .offset_words = 1,
+	  .offset_bits  = 8,
+	  .size_bits    = 24 },
+	{ STRUCT_FIELD(bth, ack_req),
+	  .offset_words = 2,
+	  .offset_bits  = 0,
+	  .size_bits    = 1 },
+	{ RESERVED,
+	  .offset_words = 2,
+	  .offset_bits  = 1,
+	  .size_bits    = 7 },
+	{ STRUCT_FIELD(bth, psn),
+	  .offset_words = 2,
+	  .offset_bits  = 8,
+	  .size_bits    = 24 }
+};
+
+/*
+ * Wire layout of the DETH for ib_pack()/ib_unpack(): a 32-bit qkey,
+ * 8 reserved bits and a 24-bit source QPN (struct ib_unpacked_deth).
+ */
+static const struct ib_field deth_table[] = {
+	{ STRUCT_FIELD(deth, qkey),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 32 },
+	{ RESERVED,
+	  .offset_words = 1,
+	  .offset_bits  = 0,
+	  .size_bits    = 8 },
+	{ STRUCT_FIELD(deth, source_qpn),
+	  .offset_words = 1,
+	  .offset_bits  = 8,
+	  .size_bits    = 24 }
+};
+
+/**
+ * ib_ud_header_init - Initialize UD header structure
+ * @payload_bytes:Length of packet payload
+ * @lrh_present: specify if LRH is present
+ * @eth_present: specify if Eth header is present
+ * @vlan_present: packet is tagged vlan
+ * @grh_present:GRH flag (if non-zero, GRH will be included)
+ * @immediate_present: specify if immediate data is present
+ * @header:Structure to initialize
+ *
+ * Zeroes @header and then fills in the fields that follow from the
+ * arguments alone: LRH link_next_header and packet_length (only when
+ * @lrh_present), the 802.1Q ethertype (when both @eth_present and
+ * @vlan_present), GRH ip_version, payload_length and next_header (when
+ * @grh_present), the BTH opcode and pad count, and the *_present flags.
+ */
+void ib_ud_header_init(int		    payload_bytes,
+		       int		    lrh_present,
+		       int		    eth_present,
+		       int		    vlan_present,
+		       int		    grh_present,
+		       int		    immediate_present,
+		       struct ib_ud_header *header)
+{
+	/*
+	 * Must start at zero: the accumulations below run even when
+	 * lrh_present is 0, in which case nothing else assigns it first.
+	 */
+	u16 packet_length = 0;
+
+	memset(header, 0, sizeof *header);
+
+	if (lrh_present) {
+		header->lrh.link_version     = 0;
+		header->lrh.link_next_header =
+			grh_present ? IB_LNH_IBA_GLOBAL : IB_LNH_IBA_LOCAL;
+		packet_length = IB_LRH_BYTES;
+	}
+
+	if (eth_present) {
+		if (vlan_present) {
+			header->eth.type = cpu_to_be16(ETH_P_8021Q);
+			packet_length += IB_VLAN_BYTES;
+		}
+		packet_length += IB_ETH_BYTES;
+	}
+
+	/* Byte count rounded up, then converted to 4-byte units. */
+	packet_length += IB_BTH_BYTES + IB_DETH_BYTES + payload_bytes +
+		4       + /* ICRC     */
+		3;        /* round up */
+	packet_length /= 4;
+	if (grh_present) {
+		packet_length += IB_GRH_BYTES / 4;
+		header->grh.ip_version = 6;
+		header->grh.payload_length =
+			cpu_to_be16((IB_BTH_BYTES  +
+				     IB_DETH_BYTES +
+				     payload_bytes +
+				     4             + /* ICRC     */
+				     3) & ~3);       /* round up */
+		header->grh.next_header     = 0x1b;
+	}
+
+	/* The computed length is only stored when an LRH is generated. */
+	if (lrh_present)
+		header->lrh.packet_length = cpu_to_be16(packet_length);
+
+	if (immediate_present)
+		header->bth.opcode           = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
+	else
+		header->bth.opcode           = IB_OPCODE_UD_SEND_ONLY;
+	/* Bytes needed to pad the payload to a multiple of four. */
+	header->bth.pad_count                = (4 - payload_bytes) & 3;
+	header->bth.transport_header_version = 0;
+
+	header->lrh_present = lrh_present;
+	header->eth_present = eth_present;
+	header->vlan_present = vlan_present;
+	header->grh_present = grh_present;
+	header->immediate_present = immediate_present;
+}
+EXPORT_SYMBOL(ib_ud_header_init);
+
+/**
+ * ib_lrh_header_pack - Pack LRH header struct into wire format
+ * @lrh:unpacked LRH header struct
+ * @buf:Buffer to pack into
+ *
+ * ib_lrh_header_pack() packs the LRH header structure @lrh into
+ * wire format in the buffer @buf, using the layout in lrh_table.
+ * Always returns 0.
+ */
+int ib_lrh_header_pack(struct ib_unpacked_lrh *lrh, void *buf)
+{
+	ib_pack(lrh_table, ARRAY_SIZE(lrh_table), lrh, buf);
+	return 0;
+}
+EXPORT_SYMBOL(ib_lrh_header_pack);
+
+/**
+ * ib_lrh_header_unpack - Unpack LRH structure from wire format
+ * @buf:Buffer holding the wire-format LRH to unpack
+ * @lrh:unpacked LRH header struct to fill in
+ *
+ * ib_lrh_header_unpack() unpacks the LRH header structure from
+ * wire format (in buf) into @lrh, using the layout in lrh_table.
+ * Always returns 0.
+ */
+int ib_lrh_header_unpack(void *buf, struct ib_unpacked_lrh *lrh)
+{
+	ib_unpack(lrh_table, ARRAY_SIZE(lrh_table), buf, lrh);
+	return 0;
+}
+EXPORT_SYMBOL(ib_lrh_header_unpack);
+
+/**
+ * ib_ud_header_pack - Pack UD header struct into wire format
+ * @header:UD header struct
+ * @buf:Buffer to pack into
+ *
+ * Serializes the headers selected by the *_present flags of @header
+ * into @buf in the order LRH, Ethernet, VLAN, GRH, followed by the
+ * always-present BTH and DETH and, if flagged, the immediate data.
+ * Returns the number of bytes written.
+ */
+int ib_ud_header_pack(struct ib_ud_header *header,
+		      void                *buf)
+{
+	/* Write cursor: always points just past what has been emitted. */
+	void *pos = buf;
+
+	if (header->lrh_present) {
+		ib_pack(lrh_table, ARRAY_SIZE(lrh_table), &header->lrh, pos);
+		pos += IB_LRH_BYTES;
+	}
+
+	if (header->eth_present) {
+		ib_pack(eth_table, ARRAY_SIZE(eth_table), &header->eth, pos);
+		pos += IB_ETH_BYTES;
+	}
+
+	if (header->vlan_present) {
+		ib_pack(vlan_table, ARRAY_SIZE(vlan_table), &header->vlan, pos);
+		pos += IB_VLAN_BYTES;
+	}
+
+	if (header->grh_present) {
+		ib_pack(grh_table, ARRAY_SIZE(grh_table), &header->grh, pos);
+		pos += IB_GRH_BYTES;
+	}
+
+	/* BTH and DETH are emitted unconditionally. */
+	ib_pack(bth_table, ARRAY_SIZE(bth_table), &header->bth, pos);
+	pos += IB_BTH_BYTES;
+
+	ib_pack(deth_table, ARRAY_SIZE(deth_table), &header->deth, pos);
+	pos += IB_DETH_BYTES;
+
+	if (header->immediate_present) {
+		/* Immediate data is copied as stored, without repacking. */
+		memcpy(pos, &header->immediate_data,
+		       sizeof header->immediate_data);
+		pos += sizeof header->immediate_data;
+	}
+
+	return (int)(pos - buf);
+}
+EXPORT_SYMBOL(ib_ud_header_pack);
+
+/**
+ * ib_ud_header_unpack - Unpack UD header struct from wire format
+ * @buf:Buffer holding the wire-format headers to unpack
+ * @header:UD header struct to fill in
+ *
+ * ib_ud_header_unpack() unpacks the UD header structure @header from wire
+ * format in the buffer @buf.  The buffer is parsed as LRH, optional GRH
+ * (selected by LRH.link_next_header), BTH, DETH and optional immediate
+ * data (selected by BTH.opcode).  Only grh_present and immediate_present
+ * are set in @header; the lrh/eth/vlan flags are not written here.
+ *
+ * Returns 0 on success, or -EINVAL (after logging a warning) when a
+ * version, next-header or opcode field holds an unexpected value.
+ */
+int ib_ud_header_unpack(void                *buf,
+			struct ib_ud_header *header)
+{
+	/* An LRH is always expected at the start of the buffer. */
+	ib_unpack(lrh_table, ARRAY_SIZE(lrh_table),
+		  buf, &header->lrh);
+	buf += IB_LRH_BYTES;
+
+	if (header->lrh.link_version != 0) {
+		printk(KERN_WARNING "Invalid LRH.link_version %d\n",
+		       header->lrh.link_version);
+		return -EINVAL;
+	}
+
+	/* link_next_header decides whether a GRH follows the LRH. */
+	switch (header->lrh.link_next_header) {
+	case IB_LNH_IBA_LOCAL:
+		header->grh_present = 0;
+		break;
+
+	case IB_LNH_IBA_GLOBAL:
+		header->grh_present = 1;
+		ib_unpack(grh_table, ARRAY_SIZE(grh_table),
+			  buf, &header->grh);
+		buf += IB_GRH_BYTES;
+
+		if (header->grh.ip_version != 6) {
+			printk(KERN_WARNING "Invalid GRH.ip_version %d\n",
+			       header->grh.ip_version);
+			return -EINVAL;
+		}
+		/* 0x1b is the value ib_ud_header_init() stores here. */
+		if (header->grh.next_header != 0x1b) {
+			printk(KERN_WARNING "Invalid GRH.next_header 0x%02x\n",
+			       header->grh.next_header);
+			return -EINVAL;
+		}
+		break;
+
+	default:
+		printk(KERN_WARNING "Invalid LRH.link_next_header %d\n",
+		       header->lrh.link_next_header);
+		return -EINVAL;
+	}
+
+	ib_unpack(bth_table, ARRAY_SIZE(bth_table),
+		  buf, &header->bth);
+	buf += IB_BTH_BYTES;
+
+	/* Only the two UD send-only opcodes are accepted. */
+	switch (header->bth.opcode) {
+	case IB_OPCODE_UD_SEND_ONLY:
+		header->immediate_present = 0;
+		break;
+	case IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE:
+		header->immediate_present = 1;
+		break;
+	default:
+		printk(KERN_WARNING "Invalid BTH.opcode 0x%02x\n",
+		       header->bth.opcode);
+		return -EINVAL;
+	}
+
+	if (header->bth.transport_header_version != 0) {
+		printk(KERN_WARNING "Invalid BTH.transport_header_version %d\n",
+		       header->bth.transport_header_version);
+		return -EINVAL;
+	}
+
+	ib_unpack(deth_table, ARRAY_SIZE(deth_table),
+		  buf, &header->deth);
+	buf += IB_DETH_BYTES;
+
+	/* Immediate data is copied as-is, mirroring ib_ud_header_pack(). */
+	if (header->immediate_present)
+		memcpy(&header->immediate_data, buf, sizeof header->immediate_data);
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_ud_header_unpack);
diff --git a/sys/ofed/drivers/infiniband/core/umem.c b/sys/ofed/drivers/infiniband/core/umem.c
new file mode 100644
index 0000000..0c6fed2
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/umem.c
@@ -0,0 +1,532 @@
+/*
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+#include <linux/sched.h>
+#ifdef __linux__
+#include <linux/hugetlb.h>
+#endif
+#include <linux/dma-attrs.h>
+
+#include <sys/priv.h>
+#include <sys/resource.h>
+#include <sys/resourcevar.h>
+
+#include <vm/vm.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_pageout.h>
+
+#include "uverbs.h"
+
+/*
+ * Module parameter (mode 0444).  It is consulted only in the __linux__
+ * branch of ib_umem_get(): when non-zero and the caller did not ask for
+ * dmasync, DMA_ATTR_WEAK_ORDERING is set on the mapping attributes.
+ */
+static int allow_weak_ordering;
+module_param(allow_weak_ordering, bool, 0444);
+MODULE_PARM_DESC(allow_weak_ordering,  "Allow weak ordering for data registered memory");
+
+/*
+ * Maximum number of page_list[] entries per struct ib_umem_chunk: the
+ * space left in one page after the fields preceding page_list, divided
+ * by the stride between two consecutive page_list elements.
+ */
+#define IB_UMEM_MAX_PAGE_CHUNK						\
+	((PAGE_SIZE - offsetof(struct ib_umem_chunk, page_list)) /	\
+	 ((void *) &((struct ib_umem_chunk *) 0)->page_list[1] -	\
+	  (void *) &((struct ib_umem_chunk *) 0)->page_list[0]))
+
+#ifdef __ia64__
+extern int dma_map_sg_hp_wa;
+
+/*
+ * ia64 variant of ib_dma_map_sg().  With dma_map_sg_hp_wa clear it just
+ * forwards to ib_dma_map_sg().  Otherwise every scatterlist entry is
+ * mapped with its own dma_map_sg() call; if one of them fails, all
+ * entries mapped so far are unmapped again and 0 is returned.  On
+ * success the sum of the per-entry results is returned.
+ */
+static int dma_map_sg_ia64(struct ib_device *ibdev,
+			   struct scatterlist *sg,
+			   int nents,
+			   enum dma_data_direction dir)
+{
+	struct device *dma_dev;
+	int mapped = 0;
+	int idx, undo;
+
+	if (dma_map_sg_hp_wa == 0)
+		return ib_dma_map_sg(ibdev, sg, nents, dir);
+
+	dma_dev = ibdev->dma_device;
+	for (idx = 0; idx < nents; idx++) {
+		int one = dma_map_sg(dma_dev, &sg[idx], 1, dir);
+
+		if (one > 0) {
+			mapped += one;
+			continue;
+		}
+
+		/* Roll back the entries that were already mapped. */
+		for (undo = 0; undo < idx; undo++)
+			dma_unmap_sg(dma_dev, &sg[undo], 1, dir);
+		return 0;
+	}
+
+	return mapped;
+}
+
+/*
+ * ia64 variant of ib_dma_unmap_sg().  With dma_map_sg_hp_wa clear it
+ * forwards to ib_dma_unmap_sg(); otherwise each scatterlist entry is
+ * unmapped with its own dma_unmap_sg() call, matching dma_map_sg_ia64().
+ */
+static void dma_unmap_sg_ia64(struct ib_device *ibdev,
+			      struct scatterlist *sg,
+			      int nents,
+			      enum dma_data_direction dir)
+{
+	struct device *dma_dev;
+	int idx;
+
+	if (dma_map_sg_hp_wa == 0) {
+		ib_dma_unmap_sg(ibdev, sg, nents, dir);
+		return;
+	}
+
+	dma_dev = ibdev->dma_device;
+	for (idx = 0; idx < nents; idx++)
+		dma_unmap_sg(dma_dev, &sg[idx], 1, dir);
+}
+
+/*
+ * From here on (ia64 only) route ib_dma_map_sg()/ib_dma_unmap_sg() to
+ * the wrappers above.  The wrappers themselves precede these defines,
+ * so their own calls still reach the real functions.
+ */
+#define ib_dma_map_sg(dev, sg, nents, dir) dma_map_sg_ia64(dev, sg, nents, dir)
+#define ib_dma_unmap_sg(dev, sg, nents, dir) dma_unmap_sg_ia64(dev, sg, nents, dir)
+
+#endif
+
+/*
+ * __ib_umem_release - DMA-unmap and free every chunk of @umem.
+ * @dev: device the chunks were mapped for
+ * @umem: region whose chunk_list is torn down
+ * @dirty: if non-zero and the region is writable, mark its pages dirty
+ *
+ * The umem structure itself is not freed here.
+ */
+static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
+{
+#ifdef __linux__
+	struct ib_umem_chunk *chunk, *tmp;
+	int i;
+
+	list_for_each_entry_safe(chunk, tmp, &umem->chunk_list, list) {
+		ib_dma_unmap_sg_attrs(dev, chunk->page_list,
+				      chunk->nents, DMA_BIDIRECTIONAL, &chunk->attrs);
+		for (i = 0; i < chunk->nents; ++i) {
+			struct page *page = sg_page(&chunk->page_list[i]);
+			if (umem->writable && dirty)
+				set_page_dirty_lock(page);
+			/* Drop the page reference held for this region. */
+			put_page(page);
+		}
+		kfree(chunk);
+	}
+#else
+	struct ib_umem_chunk *chunk, *tmp;
+	vm_object_t object;
+	int i;
+
+	/* VM object whose lock is currently held, or NULL for none. */
+	object = NULL;
+	list_for_each_entry_safe(chunk, tmp, &umem->chunk_list, list) {
+		ib_dma_unmap_sg_attrs(dev, chunk->page_list,
+				      chunk->nents, DMA_BIDIRECTIONAL, &chunk->attrs);
+		for (i = 0; i < chunk->nents; ++i) {
+			struct page *page = sg_page(&chunk->page_list[i]);
+			if (umem->writable && dirty) {
+				/*
+				 * vm_page_dirty() is called with the page's
+				 * object locked; the lock is kept across
+				 * consecutive pages of the same object and
+				 * swapped only when the object changes.
+				 */
+				if (object && object != page->object)
+					VM_OBJECT_UNLOCK(object);
+				if (object != page->object) {
+					object = page->object;
+					VM_OBJECT_LOCK(object);
+				}
+				vm_page_dirty(page);
+			}
+		}
+		kfree(chunk);
+	}
+	/* Release the lock left over from the last dirtied page. */
+	if (object)
+		VM_OBJECT_UNLOCK(object);
+
+#endif
+}
+
+/**
+ * ib_umem_get - Pin and DMA map userspace memory.
+ * @context: userspace context to pin memory for
+ * @addr: userspace virtual address to start at
+ * @size: length of region to pin
+ * @access: IB_ACCESS_xxx flags for memory being pinned
+ * @dmasync: flush in-flight DMA when the memory region is written
+ *
+ * Returns the new umem on success or an ERR_PTR() encoded negative
+ * errno on failure.  On failure nothing stays mapped or allocated.
+ */
+struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
+			    size_t size, int access, int dmasync)
+{
+#ifdef __linux__
+	struct ib_umem *umem;
+	struct page **page_list;
+	struct vm_area_struct **vma_list;
+	struct ib_umem_chunk *chunk;
+	unsigned long locked;
+	unsigned long lock_limit;
+	unsigned long cur_base;
+	unsigned long npages;
+	int ret;
+	int off;
+	int i;
+	DEFINE_DMA_ATTRS(attrs);
+
+	if (dmasync)
+		dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs);
+	else if (allow_weak_ordering)
+		dma_set_attr(DMA_ATTR_WEAK_ORDERING, &attrs);
+
+	if (!can_do_mlock())
+		return ERR_PTR(-EPERM);
+
+	umem = kmalloc(sizeof *umem, GFP_KERNEL);
+	if (!umem)
+		return ERR_PTR(-ENOMEM);
+
+	umem->context   = context;
+	umem->length    = size;
+	umem->offset    = addr & ~PAGE_MASK;
+	umem->page_size = PAGE_SIZE;
+	/*
+	 * We ask for writable memory if any access flags other than
+	 * "remote read" are set.  "Local write" and "remote write"
+	 * obviously require write access.  "Remote atomic" can do
+	 * things like fetch and add, which will modify memory, and
+	 * "MW bind" can change permissions by binding a window.
+	 */
+	umem->writable  = !!(access & ~IB_ACCESS_REMOTE_READ);
+
+	/* We assume the memory is from hugetlb until proved otherwise */
+	umem->hugetlb   = 1;
+
+	INIT_LIST_HEAD(&umem->chunk_list);
+
+	page_list = (struct page **) __get_free_page(GFP_KERNEL);
+	if (!page_list) {
+		kfree(umem);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/*
+	 * if we can't alloc the vma_list, it's not so bad;
+	 * just assume the memory is not hugetlb memory
+	 */
+	vma_list = (struct vm_area_struct **) __get_free_page(GFP_KERNEL);
+	if (!vma_list)
+		umem->hugetlb = 0;
+
+	npages = PAGE_ALIGN(size + umem->offset) >> PAGE_SHIFT;
+
+	down_write(&current->mm->mmap_sem);
+
+	locked     = npages + current->mm->locked_vm;
+	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
+
+	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	cur_base = addr & PAGE_MASK;
+
+	ret = 0;
+
+	while (npages) {
+		ret = get_user_pages(current, current->mm, cur_base,
+				     min_t(unsigned long, npages,
+					   PAGE_SIZE / sizeof (struct page *)),
+				     1, !umem->writable, page_list, vma_list);
+
+		if (ret < 0)
+			goto out;
+
+		cur_base += ret * PAGE_SIZE;
+		npages   -= ret;
+
+		off = 0;
+
+		while (ret) {
+			chunk = kmalloc(sizeof *chunk + sizeof (struct scatterlist) *
+					min_t(int, ret, IB_UMEM_MAX_PAGE_CHUNK),
+					GFP_KERNEL);
+			if (!chunk) {
+				ret = -ENOMEM;
+				goto out;
+			}
+
+			chunk->attrs = attrs;
+			chunk->nents = min_t(int, ret, IB_UMEM_MAX_PAGE_CHUNK);
+			sg_init_table(chunk->page_list, chunk->nents);
+			for (i = 0; i < chunk->nents; ++i) {
+				if (vma_list &&
+				    !is_vm_hugetlb_page(vma_list[i + off]))
+					umem->hugetlb = 0;
+				sg_set_page(&chunk->page_list[i], page_list[i + off], PAGE_SIZE, 0);
+			}
+
+			chunk->nmap = ib_dma_map_sg_attrs(context->device,
+							  &chunk->page_list[0],
+							  chunk->nents,
+							  DMA_BIDIRECTIONAL,
+							  &attrs);
+			if (chunk->nmap <= 0) {
+				for (i = 0; i < chunk->nents; ++i)
+					put_page(sg_page(&chunk->page_list[i]));
+				kfree(chunk);
+
+				ret = -ENOMEM;
+				goto out;
+			}
+
+			ret -= chunk->nents;
+			off += chunk->nents;
+			list_add_tail(&chunk->list, &umem->chunk_list);
+		}
+
+		ret = 0;
+	}
+
+out:
+	if (ret < 0) {
+		__ib_umem_release(context->device, umem, 0);
+		kfree(umem);
+	} else
+		current->mm->locked_vm = locked;
+
+	up_write(&current->mm->mmap_sem);
+	if (vma_list)
+		free_page((unsigned long) vma_list);
+	free_page((unsigned long) page_list);
+
+	return ret < 0 ? ERR_PTR(ret) : umem;
+#else
+	struct ib_umem *umem;
+	struct ib_umem_chunk *chunk;
+	struct proc *proc;
+	pmap_t pmap;
+	vm_offset_t end, last, start;
+	vm_size_t npages;
+	int error;
+	int ents;
+	int ret;
+	int i;
+	DEFINE_DMA_ATTRS(attrs);
+
+	error = priv_check(curthread, PRIV_VM_MLOCK);
+	if (error)
+		return ERR_PTR(-error);
+
+	last = addr + size;
+	start = addr & PAGE_MASK; /* Use the linux PAGE_MASK definition. */
+	end = roundup2(last, PAGE_SIZE); /* Use PAGE_MASK safe operation. */
+	/* Reject ranges whose end wraps around the address space. */
+	if (last < addr || end < addr)
+		return ERR_PTR(-EINVAL);
+	npages = atop(end - start);
+	if (npages > vm_page_max_wired)
+		return ERR_PTR(-ENOMEM);
+	umem = kzalloc(sizeof *umem, GFP_KERNEL);
+	if (!umem)
+		return ERR_PTR(-ENOMEM);
+	/*
+	 * We ask for writable memory if any access flags other than
+	 * "remote read" are set.  "Local write" and "remote write"
+	 * obviously require write access.  "Remote atomic" can do
+	 * things like fetch and add, which will modify memory, and
+	 * "MW bind" can change permissions by binding a window.
+	 *
+	 * This has to be decided before vm_map_wire() below, which
+	 * uses it to choose whether to request VM_MAP_WIRE_WRITE.
+	 */
+	umem->writable  = !!(access & ~IB_ACCESS_REMOTE_READ);
+	proc = curthread->td_proc;
+	PROC_LOCK(proc);
+	if (ptoa(npages +
+	    pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map))) >
+	    lim_cur(proc, RLIMIT_MEMLOCK)) {
+		PROC_UNLOCK(proc);
+		kfree(umem);
+		return ERR_PTR(-ENOMEM);
+	}
+	PROC_UNLOCK(proc);
+	if (npages + cnt.v_wire_count > vm_page_max_wired) {
+		kfree(umem);
+		return ERR_PTR(-EAGAIN);
+	}
+	error = vm_map_wire(&proc->p_vmspace->vm_map, start, end,
+	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES |
+	    (umem->writable ? VM_MAP_WIRE_WRITE : 0));
+	if (error != KERN_SUCCESS) {
+		kfree(umem);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	umem->context   = context;
+	umem->length    = size;
+	umem->offset    = addr & ~PAGE_MASK;
+	umem->page_size = PAGE_SIZE;
+	umem->start	= addr;
+	umem->hugetlb = 0;
+	INIT_LIST_HEAD(&umem->chunk_list);
+
+	pmap = vm_map_pmap(&proc->p_vmspace->vm_map);
+	ret = 0;
+	while (npages) {
+		ents = min_t(int, npages, IB_UMEM_MAX_PAGE_CHUNK);
+		chunk = kmalloc(sizeof(*chunk) +
+				(sizeof(struct scatterlist) * ents),
+				GFP_KERNEL);
+		if (!chunk) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		chunk->attrs = attrs;
+		chunk->nents = ents;
+		sg_init_table(&chunk->page_list[0], ents);
+		for (i = 0; i < chunk->nents; ++i) {
+			vm_paddr_t pa;
+
+			/* Look up the physical page behind each wired VA. */
+			pa = pmap_extract(pmap, start);
+			if (pa == 0) {
+				ret = -ENOMEM;
+				kfree(chunk);
+				goto out;
+			}
+			sg_set_page(&chunk->page_list[i], PHYS_TO_VM_PAGE(pa),
+			    PAGE_SIZE, 0);
+			npages--;
+			start += PAGE_SIZE;
+		}
+
+		chunk->nmap = ib_dma_map_sg_attrs(context->device,
+						  &chunk->page_list[0],
+						  chunk->nents,
+						  DMA_BIDIRECTIONAL,
+						  &attrs);
+		if (chunk->nmap != chunk->nents) {
+			kfree(chunk);
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		list_add_tail(&chunk->list, &umem->chunk_list);
+	}
+
+out:
+	if (ret < 0) {
+		__ib_umem_release(context->device, umem, 0);
+		/*
+		 * Undo the vm_map_wire() above, with the same flags that
+		 * ib_umem_release() uses.  "start" was advanced by the
+		 * loop, so the range start is recomputed from addr.
+		 */
+		vm_map_unwire(&proc->p_vmspace->vm_map, addr & PAGE_MASK, end,
+		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
+		kfree(umem);
+	}
+
+	return ret < 0 ? ERR_PTR(ret) : umem;
+#endif
+}
+EXPORT_SYMBOL(ib_umem_get);
+
+#ifdef __linux__
+/*
+ * Deferred half of ib_umem_release(): runs from the work item embedded
+ * in the umem, subtracts umem->diff from the saved mm's locked_vm under
+ * mmap_sem, drops the mm reference and frees the umem.
+ */
+static void ib_umem_account(struct work_struct *work)
+{
+	struct ib_umem *umem = container_of(work, struct ib_umem, work);
+	struct mm_struct *mm = umem->mm;
+
+	down_write(&mm->mmap_sem);
+	mm->locked_vm -= umem->diff;
+	up_write(&mm->mmap_sem);
+
+	mmput(mm);
+	kfree(umem);
+}
+#endif
+
+/**
+ * ib_umem_release - release memory pinned with ib_umem_get
+ * @umem: umem struct to release
+ *
+ * Unmaps and frees the region's chunks, undoes the pinning accounting
+ * where that is possible, and frees @umem.  @umem must not be used
+ * after this call on any path.
+ */
+void ib_umem_release(struct ib_umem *umem)
+{
+#ifdef __linux__
+	struct ib_ucontext *context = umem->context;
+	struct mm_struct *mm;
+	unsigned long diff;
+
+	__ib_umem_release(umem->context->device, umem, 1);
+
+	mm = get_task_mm(current);
+	if (!mm) {
+		kfree(umem);
+		return;
+	}
+
+	diff = PAGE_ALIGN(umem->length + umem->offset) >> PAGE_SHIFT;
+
+	/*
+	 * We may be called with the mm's mmap_sem already held.  This
+	 * can happen when a userspace munmap() is the call that drops
+	 * the last reference to our file and calls our release
+	 * method.  If there are memory regions to destroy, we'll end
+	 * up here and not be able to take the mmap_sem.  In that case
+	 * we defer the vm_locked accounting to the system workqueue.
+	 */
+	if (context->closing) {
+		if (!down_write_trylock(&mm->mmap_sem)) {
+			INIT_WORK(&umem->work, ib_umem_account);
+			umem->mm   = mm;
+			umem->diff = diff;
+
+			/* ib_umem_account() frees umem and drops mm. */
+			schedule_work(&umem->work);
+			return;
+		}
+	} else
+		down_write(&mm->mmap_sem);
+
+	current->mm->locked_vm -= diff;
+	up_write(&mm->mmap_sem);
+	mmput(mm);
+#else
+	vm_offset_t addr, end, last, start;
+	vm_size_t size;
+	int error;
+
+	__ib_umem_release(umem->context->device, umem, 1);
+	/* While the context is closing, no unwire is attempted here. */
+	if (umem->context->closing) {
+		kfree(umem);
+		return;
+	}
+	error = priv_check(curthread, PRIV_VM_MUNLOCK);
+	if (error) {
+		/*
+		 * The range cannot be unwired, but the umem itself must
+		 * still be freed or it would be leaked.
+		 */
+		kfree(umem);
+		return;
+	}
+	addr = umem->start;
+	size = umem->length;
+	last = addr + size;
+	start = addr & PAGE_MASK; /* Use the linux PAGE_MASK definition. */
+	end = roundup2(last, PAGE_SIZE); /* Use PAGE_MASK safe operation. */
+	vm_map_unwire(&curthread->td_proc->p_vmspace->vm_map, start, end,
+	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
+
+#endif
+	kfree(umem);
+}
+EXPORT_SYMBOL(ib_umem_release);
+
+/*
+ * ib_umem_page_count - count the pages covered by a umem's DMA mappings.
+ *
+ * Sums, over the first nmap entries of every chunk, the DMA length of
+ * the entry shifted right by log2(umem->page_size).
+ */
+int ib_umem_page_count(struct ib_umem *umem)
+{
+	struct ib_umem_chunk *chunk;
+	int shift = ilog2(umem->page_size);
+	int total = 0;
+	int idx;
+
+	list_for_each_entry(chunk, &umem->chunk_list, list) {
+		for (idx = 0; idx < chunk->nmap; idx++)
+			total += sg_dma_len(&chunk->page_list[idx]) >> shift;
+	}
+
+	return total;
+}
+EXPORT_SYMBOL(ib_umem_page_count);
diff --git a/sys/ofed/drivers/infiniband/core/user_mad.c b/sys/ofed/drivers/infiniband/core/user_mad.c
new file mode 100644
index 0000000..3dae9ce
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/user_mad.c
@@ -0,0 +1,1225 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2008 Cisco. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/fs.h>
+#include <linux/cdev.h>
+#include <linux/dma-mapping.h>
+#include <linux/poll.h>
+#include <linux/mutex.h>
+#include <linux/kref.h>
+#include <linux/compat.h>
+#include <linux/semaphore.h>
+
+#include <asm/uaccess.h>
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_user_mad.h>
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("InfiniBand userspace MAD packet access");
+MODULE_LICENSE("Dual BSD/GPL");
+
+enum {
+	IB_UMAD_MAX_PORTS  = 64,	/* size of umad_port[] and dev_map */
+	IB_UMAD_MAX_AGENTS = 32,	/* agent slots per open umad file */
+
+	IB_UMAD_MAJOR      = 231,	/* char device major used for base_dev */
+	IB_UMAD_MINOR_BASE = 0		/* first umad minor; issm minors sit at +MAX_PORTS */
+};
+
+/*
+ * Our lifetime rules for these structs are the following: each time a
+ * device special file is opened, we look up the corresponding struct
+ * ib_umad_port by minor in the umad_port[] table while holding the
+ * port_lock.  If this lookup succeeds, we take a reference on the
+ * ib_umad_port's struct ib_umad_device while still holding the
+ * port_lock; if the lookup fails, we fail the open().  We drop these
+ * references in the corresponding close().
+ *
+ * In addition to references coming from open character devices, there
+ * is one more reference to each ib_umad_device representing the
+ * module's reference taken when allocating the ib_umad_device in
+ * ib_umad_add_one().
+ *
+ * When destroying an ib_umad_device, we clear all of its
+ * ib_umad_ports from umad_port[] while holding port_lock before
+ * dropping the module's reference to the ib_umad_device.  This is
+ * always safe because any open() calls will either succeed and obtain
+ * a reference before we clear the umad_port[] entries, or fail after
+ * we clear the umad_port[] entries.
+ */
+
+struct ib_umad_port {
+	struct cdev           *cdev;	/* "umad%d" character device */
+	struct device	      *dev;	/* class device created for cdev */
+
+	struct cdev           *sm_cdev;	/* "issm%d" character device */
+	struct device	      *sm_dev;	/* class device created for sm_cdev */
+	struct semaphore       sm_sem;	/* held while the issm device is open */
+
+	struct mutex	       file_mutex;	/* taken around file_list updates and ib_dev teardown */
+	struct list_head       file_list;	/* open ib_umad_files, linked via port_list */
+
+	struct ib_device      *ib_dev;	/* set to NULL by ib_umad_kill_port() */
+	struct ib_umad_device *umad_dev;	/* device whose port[] array holds this port */
+	int                    dev_num;	/* index into umad_port[] and dev_map */
+	u8                     port_num;	/* port number on ib_dev */
+};
+
+struct ib_umad_device {
+	int                  start_port, end_port;	/* 0..0 for a switch, else 1..phys_port_cnt */
+	struct kref          ref;	/* one for the module plus one per open file */
+	struct ib_umad_port  port[0];	/* end_port - start_port + 1 entries, see ib_umad_add_one() */
+};
+
+struct ib_umad_file {
+	struct mutex		mutex;	/* taken around recv_list, agent[] and flag updates */
+	struct ib_umad_port    *port;	/* port this file was opened on */
+	struct file	       *filp;	/* back pointer, used for selwakeup() */
+	struct list_head	recv_list;	/* packets waiting for read() */
+	struct list_head	send_list;	/* posted sends not yet completed */
+	struct list_head	port_list;	/* link in port->file_list */
+	spinlock_t		send_lock;	/* taken around send_list updates */
+	wait_queue_head_t	recv_wait;	/* readers sleep here for recv_list */
+	struct ib_mad_agent    *agent[IB_UMAD_MAX_AGENTS];	/* indexed by agent id */
+	int			agents_dead;	/* set on close or port removal */
+	u8			use_pkey_index;	/* selects ib_user_mad_hdr over the old header */
+	u8			already_used;	/* set by the first agent registration */
+};
+
+struct ib_umad_packet {
+	struct ib_mad_send_buf *msg;	/* send buffer, set by ib_umad_write() */
+	struct ib_mad_recv_wc  *recv_wc;	/* set by recv_handler(); NULL for sends */
+	struct list_head   list;	/* link in a file's recv_list or send_list */
+	int		   length;	/* MAD bytes, excluding the user header */
+	struct ib_user_mad mad;	/* user-visible header followed by data */
+};
+
+static struct class *umad_class;
+
+static const dev_t base_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE);
+/* port_lock guards umad_port[] lookups and dev_map slot allocation. */
+static DEFINE_SPINLOCK(port_lock);
+static struct ib_umad_port *umad_port[IB_UMAD_MAX_PORTS];
+static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS);
+/* ib_client callbacks; umad_client refers to them before they are defined. */
+static void ib_umad_add_one(struct ib_device *device);
+static void ib_umad_remove_one(struct ib_device *device);
+
+static void ib_umad_release_dev(struct kref *ref)
+{
+	struct ib_umad_device *dev =
+		container_of(ref, struct ib_umad_device, ref);
+	/* kref release callback: @ref is embedded in the device, so free it. */
+	kfree(dev);
+}
+
+static int hdr_size(struct ib_umad_file *file)	/* user MAD header size in bytes */
+{
+	return file->use_pkey_index ? sizeof (struct ib_user_mad_hdr) :
+		sizeof (struct ib_user_mad_hdr_old);	/* P_Key index not enabled */
+}
+
+/* Agent @id, or NULL if unset/dead; caller holds file->mutex, bounds id. */
+static struct ib_mad_agent *__get_agent(struct ib_umad_file *file, int id)
+{
+	return file->agents_dead ? NULL : file->agent[id];
+}
+
+static int queue_packet(struct ib_umad_file *file,
+			struct ib_mad_agent *agent,
+			struct ib_umad_packet *packet)
+{
+	int ret = 1;	/* stays 1 unless the packet gets queued */
+
+	mutex_lock(&file->mutex);
+	/* Find the id @agent is registered under; it ends up in hdr.id. */
+	for (packet->mad.hdr.id = 0;
+	     packet->mad.hdr.id < IB_UMAD_MAX_AGENTS;
+	     packet->mad.hdr.id++)
+		if (agent == __get_agent(file, packet->mad.hdr.id)) {
+			list_add_tail(&packet->list, &file->recv_list);
+			selwakeup(&file->filp->f_selinfo);
+			wake_up_interruptible(&file->recv_wait);
+			ret = 0;
+			break;
+		}
+
+	mutex_unlock(&file->mutex);
+	/* 0: queued for read(); 1: no live agent matched, caller keeps @packet. */
+	return ret;
+}
+
+static void dequeue_send(struct ib_umad_file *file,
+			 struct ib_umad_packet *packet)
+{
+	spin_lock_irq(&file->send_lock);	/* same lock ib_umad_write() queues under */
+	list_del(&packet->list);		/* @packet was linked on file->send_list */
+	spin_unlock_irq(&file->send_lock);
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+			 struct ib_mad_send_wc *send_wc)
+{
+	struct ib_umad_file *file = agent->context;
+	struct ib_umad_packet *packet = send_wc->send_buf->context[0];
+	/* The send is done: unlink it and release its AH and send buffer. */
+	dequeue_send(file, packet);
+	ib_destroy_ah(packet->msg->ah);
+	ib_free_send_mad(packet->msg);
+	/* A response timeout is handed to read() as a header-only MAD. */
+	if (send_wc->status == IB_WC_RESP_TIMEOUT_ERR) {
+		packet->length = IB_MGMT_MAD_HDR;
+		packet->mad.hdr.status = ETIMEDOUT;
+		if (!queue_packet(file, agent, packet))
+			return;
+	}
+	kfree(packet);
+}
+
+static void recv_handler(struct ib_mad_agent *agent,
+			 struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_umad_file *file = agent->context;
+	struct ib_umad_packet *packet;
+	/* Failed completions are freed without being queued for read(). */
+	if (mad_recv_wc->wc->status != IB_WC_SUCCESS)
+		goto err1;
+
+	packet = kzalloc(sizeof *packet, GFP_KERNEL);
+	if (!packet)
+		goto err1;
+	/* Keep recv_wc attached; read() or close() frees it with the packet. */
+	packet->length = mad_recv_wc->mad_len;
+	packet->recv_wc = mad_recv_wc;
+	/* Build the user-visible header from the work completion. */
+	packet->mad.hdr.status	   = 0;
+	packet->mad.hdr.length	   = hdr_size(file) + mad_recv_wc->mad_len;
+	packet->mad.hdr.qpn	   = cpu_to_be32(mad_recv_wc->wc->src_qp);
+	packet->mad.hdr.lid	   = cpu_to_be16(mad_recv_wc->wc->slid);
+	packet->mad.hdr.sl	   = mad_recv_wc->wc->sl;
+	packet->mad.hdr.path_bits  = mad_recv_wc->wc->dlid_path_bits;
+	packet->mad.hdr.pkey_index = mad_recv_wc->wc->pkey_index;
+	packet->mad.hdr.grh_present = !!(mad_recv_wc->wc->wc_flags & IB_WC_GRH);
+	if (packet->mad.hdr.grh_present) {
+		struct ib_ah_attr ah_attr;
+
+		ib_init_ah_from_wc(agent->device, agent->port_num,
+				   mad_recv_wc->wc, mad_recv_wc->recv_buf.grh,
+				   &ah_attr);
+
+		packet->mad.hdr.gid_index = ah_attr.grh.sgid_index;
+		packet->mad.hdr.hop_limit = ah_attr.grh.hop_limit;
+		packet->mad.hdr.traffic_class = ah_attr.grh.traffic_class;
+		memcpy(packet->mad.hdr.gid, &ah_attr.grh.dgid, 16);
+		packet->mad.hdr.flow_label = cpu_to_be32(ah_attr.grh.flow_label);
+	}
+	/* Hand over to recv_list; on failure free both packet and recv_wc. */
+	if (queue_packet(file, agent, packet))
+		goto err2;
+	return;
+
+err2:
+	kfree(packet);
+err1:
+	ib_free_recv_mad(mad_recv_wc);
+}
+
+static ssize_t copy_recv_mad(struct ib_umad_file *file, char __user *buf,
+			     struct ib_umad_packet *packet, size_t count)
+{
+	struct ib_mad_recv_buf *recv_buf;
+	int left, seg_payload, offset, max_seg_payload;
+	/* Returns bytes copied, -EINVAL/-ENOSPC if @count is too small, -EFAULT. */
+	/* We need enough room to copy the first (or only) MAD segment. */
+	recv_buf = &packet->recv_wc->recv_buf;
+	if ((packet->length <= sizeof (*recv_buf->mad) &&
+	     count < hdr_size(file) + packet->length) ||
+	    (packet->length > sizeof (*recv_buf->mad) &&
+	     count < hdr_size(file) + sizeof (*recv_buf->mad)))
+		return -EINVAL;
+	/* Copy the user MAD header, then the first (or only) segment. */
+	if (copy_to_user(buf, &packet->mad, hdr_size(file)))
+		return -EFAULT;
+
+	buf += hdr_size(file);
+	seg_payload = min_t(int, packet->length, sizeof (*recv_buf->mad));
+	if (copy_to_user(buf, recv_buf->mad, seg_payload))
+		return -EFAULT;
+
+	if (seg_payload < packet->length) {
+		/*
+		 * Multipacket RMPP MAD message. Copy remainder of message.
+		 * Note that last segment may have a shorter payload.
+		 */
+		if (count < hdr_size(file) + packet->length) {
+			/*
+			 * The buffer is too small, return the first RMPP segment,
+			 * which includes the RMPP message length.
+			 */
+			return -ENOSPC;
+		}
+		offset = ib_get_mad_data_offset(recv_buf->mad->mad_hdr.mgmt_class);
+		max_seg_payload = sizeof (struct ib_mad) - offset;
+		/* Later segments: skip @offset header bytes, copy only the payload. */
+		for (left = packet->length - seg_payload, buf += seg_payload;
+		     left; left -= seg_payload, buf += seg_payload) {
+			recv_buf = container_of(recv_buf->list.next,
+						struct ib_mad_recv_buf, list);
+			seg_payload = min(left, max_seg_payload);
+			if (copy_to_user(buf, ((void *) recv_buf->mad) + offset,
+					 seg_payload))
+				return -EFAULT;
+		}
+	}
+	return hdr_size(file) + packet->length;
+}
+
+static ssize_t copy_send_mad(struct ib_umad_file *file, char __user *buf,
+			     struct ib_umad_packet *packet, size_t count)
+{
+	ssize_t size = hdr_size(file) + packet->length;
+	/* Returns @size on success, -EINVAL if @count is too small, or -EFAULT. */
+	if (count < size)
+		return -EINVAL;
+
+	if (copy_to_user(buf, &packet->mad, hdr_size(file)))
+		return -EFAULT;
+
+	buf += hdr_size(file);
+	/* The MAD bytes kept in the packet follow the user header. */
+	if (copy_to_user(buf, packet->mad.data, packet->length))
+		return -EFAULT;
+
+	return size;
+}
+
+static ssize_t ib_umad_read(struct file *filp, char __user *buf,
+			    size_t count, loff_t *pos)
+{
+	struct ib_umad_file *file = filp->private_data;
+	struct ib_umad_packet *packet;
+	ssize_t ret;
+	/* Every packet carries at least the user MAD header. */
+	if (count < hdr_size(file))
+		return -EINVAL;
+
+	mutex_lock(&file->mutex);
+	/* Wait for a packet; the mutex is dropped before sleeping. */
+	while (list_empty(&file->recv_list)) {
+		mutex_unlock(&file->mutex);
+
+		if (filp->f_flags & O_NONBLOCK)
+			return -EAGAIN;
+
+		if (wait_event_interruptible(file->recv_wait,
+					     !list_empty(&file->recv_list)))
+			return -ERESTARTSYS;
+
+		mutex_lock(&file->mutex);
+	}
+	/* Take the oldest packet off the list before copying it out. */
+	packet = list_entry(file->recv_list.next, struct ib_umad_packet, list);
+	list_del(&packet->list);
+
+	mutex_unlock(&file->mutex);
+	/* recv_wc is set for received MADs only, not for timed-out sends. */
+	if (packet->recv_wc)
+		ret = copy_recv_mad(file, buf, packet, count);
+	else
+		ret = copy_send_mad(file, buf, packet, count);
+
+	if (ret < 0) {
+		/* Requeue packet */
+		mutex_lock(&file->mutex);
+		list_add(&packet->list, &file->recv_list);
+		mutex_unlock(&file->mutex);
+	} else {
+		if (packet->recv_wc)
+			ib_free_recv_mad(packet->recv_wc);
+		kfree(packet);
+	}
+	return ret;
+}
+
+static int copy_rmpp_mad(struct ib_mad_send_buf *msg, const char __user *buf)
+{
+	int left, seg;	/* data bytes still to copy; segment number, from 1 */
+
+	/* Copy class specific header */
+	if ((msg->hdr_len > IB_MGMT_RMPP_HDR) &&
+	    copy_from_user(msg->mad + IB_MGMT_RMPP_HDR, buf + IB_MGMT_RMPP_HDR,
+			   msg->hdr_len - IB_MGMT_RMPP_HDR))
+		return -EFAULT;
+
+	/* All headers are in place.  Copy data segments. */
+	for (seg = 1, left = msg->data_len, buf += msg->hdr_len; left > 0;
+	     seg++, left -= msg->seg_size, buf += msg->seg_size) {
+		if (copy_from_user(ib_get_rmpp_segment(msg, seg), buf,
+				   min(left, msg->seg_size)))	/* last one may be short */
+			return -EFAULT;
+	}
+	return 0;	/* 0 on success, -EFAULT if a user copy failed */
+}
+
+/* True if both headers name the same peer: by LID without GRH, else by GID. */
+static int same_destination(struct ib_user_mad_hdr *hdr1,
+			    struct ib_user_mad_hdr *hdr2)
+{
+	/* One header with a GRH and one without never match. */
+	if (!hdr1->grh_present != !hdr2->grh_present)
+		return 0;
+	if (hdr1->grh_present)
+		return memcmp(hdr1->gid, hdr2->gid, 16) == 0;
+	return hdr1->lid == hdr2->lid;
+}
+
+static int is_duplicate(struct ib_umad_file *file,
+			struct ib_umad_packet *packet)
+{
+	struct ib_umad_packet *sent_packet;
+	struct ib_mad_hdr *sent_hdr, *hdr;
+	/* ib_umad_write() calls this with file->send_lock held. */
+	hdr = (struct ib_mad_hdr *) packet->mad.data;
+	list_for_each_entry(sent_packet, &file->send_list, list) {
+		sent_hdr = (struct ib_mad_hdr *) sent_packet->mad.data;
+
+		if ((hdr->tid != sent_hdr->tid) ||
+		    (hdr->mgmt_class != sent_hdr->mgmt_class))
+			continue;
+
+		/*
+		 * No need to be overly clever here.  If two new operations have
+		 * the same TID, reject the second as a duplicate.  This is more
+		 * restrictive than required by the spec.
+		 */
+		if (!ib_response_mad((struct ib_mad *) hdr)) {
+			if (!ib_response_mad((struct ib_mad *) sent_hdr))
+				return 1;
+			continue;
+		} else if (!ib_response_mad((struct ib_mad *) sent_hdr))
+			continue;
+		/* Both are responses: duplicates only if sent to the same peer. */
+		if (same_destination(&packet->mad.hdr, &sent_packet->mad.hdr))
+			return 1;
+	}
+
+	return 0;
+}
+
+static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
+			     size_t count, loff_t *pos)
+{
+	struct ib_umad_file *file = filp->private_data;
+	struct ib_umad_packet *packet;
+	struct ib_mad_agent *agent;
+	struct ib_ah_attr ah_attr;
+	struct ib_ah *ah;
+	struct ib_rmpp_mad *rmpp_mad;
+	__be64 *tid;
+	int ret, data_len, hdr_len, copy_offset, rmpp_active;
+	/* A write must hold the user header plus at least an RMPP header. */
+	if (count < hdr_size(file) + IB_MGMT_RMPP_HDR)
+		return -EINVAL;
+	/* Only the headers are staged in @packet; the rest goes to the send buf. */
+	packet = kzalloc(sizeof *packet + IB_MGMT_RMPP_HDR, GFP_KERNEL);
+	if (!packet)
+		return -ENOMEM;
+
+	if (copy_from_user(&packet->mad, buf, hdr_size(file))) {
+		ret = -EFAULT;
+		goto err;
+	}
+
+	if (packet->mad.hdr.id < 0 ||
+	    packet->mad.hdr.id >= IB_UMAD_MAX_AGENTS) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	buf += hdr_size(file);
+
+	if (copy_from_user(packet->mad.data, buf, IB_MGMT_RMPP_HDR)) {
+		ret = -EFAULT;
+		goto err;
+	}
+
+	mutex_lock(&file->mutex);
+
+	agent = __get_agent(file, packet->mad.hdr.id);
+	if (!agent) {
+		ret = -EINVAL;
+		goto err_up;
+	}
+	/* Build the address handle attributes from the user header. */
+	memset(&ah_attr, 0, sizeof ah_attr);
+	ah_attr.dlid          = be16_to_cpu(packet->mad.hdr.lid);
+	ah_attr.sl            = packet->mad.hdr.sl;
+	ah_attr.src_path_bits = packet->mad.hdr.path_bits;
+	ah_attr.port_num      = file->port->port_num;
+	if (packet->mad.hdr.grh_present) {
+		ah_attr.ah_flags = IB_AH_GRH;
+		memcpy(ah_attr.grh.dgid.raw, packet->mad.hdr.gid, 16);
+		ah_attr.grh.sgid_index	   = packet->mad.hdr.gid_index;
+		ah_attr.grh.flow_label 	   = be32_to_cpu(packet->mad.hdr.flow_label);
+		ah_attr.grh.hop_limit  	   = packet->mad.hdr.hop_limit;
+		ah_attr.grh.traffic_class  = packet->mad.hdr.traffic_class;
+	}
+
+	ah = ib_create_ah(agent->qp->pd, &ah_attr);
+	if (IS_ERR(ah)) {
+		ret = PTR_ERR(ah);
+		goto err_up;
+	}
+	/* hdr_len follows the management class; RMPP only if flagged active. */
+	rmpp_mad = (struct ib_rmpp_mad *) packet->mad.data;
+	hdr_len = ib_get_mad_data_offset(rmpp_mad->mad_hdr.mgmt_class);
+	if (!ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class)) {
+		copy_offset = IB_MGMT_MAD_HDR;
+		rmpp_active = 0;
+	} else {
+		copy_offset = IB_MGMT_RMPP_HDR;
+		rmpp_active = ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
+			      IB_MGMT_RMPP_FLAG_ACTIVE;
+	}
+	/* Everything after the class header counts as data. */
+	data_len = count - hdr_size(file) - hdr_len;
+	packet->msg = ib_create_send_mad(agent,
+					 be32_to_cpu(packet->mad.hdr.qpn),
+					 packet->mad.hdr.pkey_index, rmpp_active,
+					 hdr_len, data_len, GFP_KERNEL);
+	if (IS_ERR(packet->msg)) {
+		ret = PTR_ERR(packet->msg);
+		goto err_ah;
+	}
+
+	packet->msg->ah 	= ah;
+	packet->msg->timeout_ms = packet->mad.hdr.timeout_ms;
+	packet->msg->retries 	= packet->mad.hdr.retries;
+	packet->msg->context[0] = packet;
+
+	/* Copy MAD header.  Any RMPP header is already in place. */
+	memcpy(packet->msg->mad, packet->mad.data, IB_MGMT_MAD_HDR);
+
+	if (!rmpp_active) {
+		if (copy_from_user(packet->msg->mad + copy_offset,
+				   buf + copy_offset,
+				   hdr_len + data_len - copy_offset)) {
+			ret = -EFAULT;
+			goto err_msg;
+		}
+	} else {
+		ret = copy_rmpp_mad(packet->msg, buf);
+		if (ret)
+			goto err_msg;
+	}
+
+	/*
+	 * Set the high-order part of the transaction ID to make MADs from
+	 * different agents unique, and allow routing responses back to the
+	 * original requestor.
+	 */
+	if (!ib_response_mad(packet->msg->mad)) {
+		tid = &((struct ib_mad_hdr *) packet->msg->mad)->tid;
+		*tid = cpu_to_be64(((u64) agent->hi_tid) << 32 |
+				   (be64_to_cpup(tid) & 0xffffffff));
+		rmpp_mad->mad_hdr.tid = *tid;
+	}
+	/* Track the send so later duplicates are rejected, then post it. */
+	spin_lock_irq(&file->send_lock);
+	ret = is_duplicate(file, packet);
+	if (!ret)
+		list_add_tail(&packet->list, &file->send_list);
+	spin_unlock_irq(&file->send_lock);
+	if (ret) {
+		ret = -EINVAL;
+		goto err_msg;
+	}
+
+	ret = ib_post_send_mad(packet->msg, NULL);
+	if (ret)
+		goto err_send;
+
+	mutex_unlock(&file->mutex);
+	return count;
+
+err_send:
+	dequeue_send(file, packet);
+err_msg:
+	ib_free_send_mad(packet->msg);
+err_ah:
+	ib_destroy_ah(ah);
+err_up:
+	mutex_unlock(&file->mutex);
+err:
+	kfree(packet);
+	return ret;
+}
+
+static unsigned int ib_umad_poll(struct file *filp, struct poll_table_struct *wait)
+{
+	struct ib_umad_file *file = filp->private_data;
+
+	/* we will always be able to post a MAD send */
+	unsigned int mask = POLLOUT | POLLWRNORM;
+
+	poll_wait(filp, &file->recv_wait, wait);
+	/* Readable as soon as a packet sits on recv_list (checked unlocked). */
+	if (!list_empty(&file->recv_list))
+		mask |= POLLIN | POLLRDNORM;
+
+	return mask;
+}
+
+static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg,
+			     int compat_method_mask)
+{
+	struct ib_user_mad_reg_req ureq;
+	struct ib_mad_reg_req req;
+	struct ib_mad_agent *agent = NULL;
+	int agent_id;
+	int ret;
+	/* Lock order: port->file_mutex first, then file->mutex. */
+	mutex_lock(&file->port->file_mutex);
+	mutex_lock(&file->mutex);
+
+	if (!file->port->ib_dev) {
+		ret = -EPIPE;
+		goto out;
+	}
+
+	if (copy_from_user(&ureq, arg, sizeof ureq)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	if (ureq.qpn != 0 && ureq.qpn != 1) {
+		ret = -EINVAL;
+		goto out;
+	}
+	/* Pick the first unused agent slot. */
+	for (agent_id = 0; agent_id < IB_UMAD_MAX_AGENTS; ++agent_id)
+		if (!__get_agent(file, agent_id))
+			goto found;
+
+	ret = -ENOMEM;
+	goto out;
+
+found:
+	if (ureq.mgmt_class) {
+		req.mgmt_class         = ureq.mgmt_class;
+		req.mgmt_class_version = ureq.mgmt_class_version;
+		memcpy(req.oui, ureq.oui, sizeof req.oui);
+
+		if (compat_method_mask) {
+			u32 *umm = (u32 *) ureq.method_mask;
+			int i;
+
+			for (i = 0; i < BITS_TO_LONGS(IB_MGMT_MAX_METHODS); ++i)
+				req.method_mask[i] =
+					umm[i * 2] | ((u64) umm[i * 2 + 1] << 32);
+		} else
+			memcpy(req.method_mask, ureq.method_mask,
+			       sizeof req.method_mask);
+	}
+
+	agent = ib_register_mad_agent(file->port->ib_dev, file->port->port_num,
+				      ureq.qpn ? IB_QPT_GSI : IB_QPT_SMI,
+				      ureq.mgmt_class ? &req : NULL,
+				      ureq.rmpp_version,
+				      send_handler, recv_handler, file);
+	if (IS_ERR(agent)) {
+		ret = PTR_ERR(agent);
+		agent = NULL;
+		goto out;
+	}
+	/* Report the slot index back through the id field of the request. */
+	if (put_user(agent_id,
+		     (u32 __user *) (arg + offsetof(struct ib_user_mad_reg_req, id)))) {
+		ret = -EFAULT;
+		goto out;
+	}
+	/* After this, ib_umad_enable_pkey() refuses to change the header ABI. */
+	if (!file->already_used) {
+		file->already_used = 1;
+		if (!file->use_pkey_index) {
+			printk(KERN_WARNING "user_mad: process %s did not enable "
+			       "P_Key index support.\n", curproc->p_comm);
+			printk(KERN_WARNING "user_mad:   Documentation/infiniband/user_mad.txt "
+			       "has info on the new ABI.\n");
+		}
+	}
+
+	file->agent[agent_id] = agent;
+	ret = 0;
+
+out:
+	mutex_unlock(&file->mutex);
+	/* A registered agent is undone here if a later step failed. */
+	if (ret && agent)
+		ib_unregister_mad_agent(agent);
+
+	mutex_unlock(&file->port->file_mutex);
+
+	return ret;
+}
+
+static int ib_umad_unreg_agent(struct ib_umad_file *file, u32 __user *arg)
+{
+	struct ib_mad_agent *agent = NULL;
+	u32 id;
+	int ret = 0;
+	/* Unregister the agent whose id is read from @arg: 0, -EFAULT, -EINVAL. */
+	if (get_user(id, arg))
+		return -EFAULT;
+
+	mutex_lock(&file->port->file_mutex);
+	mutex_lock(&file->mutex);
+	/* id is unsigned, so the upper bound is the only range check needed. */
+	if (id >= IB_UMAD_MAX_AGENTS || !__get_agent(file, id)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	agent = file->agent[id];
+	file->agent[id] = NULL;
+
+out:
+	mutex_unlock(&file->mutex);
+	/* Unregister after dropping file->mutex; the slot is already empty. */
+	if (agent)
+		ib_unregister_mad_agent(agent);
+
+	mutex_unlock(&file->port->file_mutex);
+
+	return ret;
+}
+
+/* Switch @file to the P_Key index ABI; -EINVAL once an agent was registered. */
+static long ib_umad_enable_pkey(struct ib_umad_file *file)
+{
+	long status = -EINVAL;
+
+	mutex_lock(&file->mutex);
+	if (!file->already_used) {
+		file->use_pkey_index = 1;
+		status = 0;
+	}
+	mutex_unlock(&file->mutex);
+	return status;
+}
+
+/* Native ioctl entry point: route @cmd to its handler for this open file. */
+static long ib_umad_ioctl(struct file *filp, unsigned int cmd,
+			  unsigned long arg)
+{
+	struct ib_umad_file *file = filp->private_data;
+
+	if (cmd == IB_USER_MAD_REGISTER_AGENT)
+		return ib_umad_reg_agent(file, (void __user *) arg, 0);
+	if (cmd == IB_USER_MAD_UNREGISTER_AGENT)
+		return ib_umad_unreg_agent(file, (__u32 __user *) arg);
+	if (cmd == IB_USER_MAD_ENABLE_PKEY)
+		return ib_umad_enable_pkey(file);
+	return -ENOIOCTLCMD;
+}
+
+#ifdef CONFIG_COMPAT
+/* Compat ioctl entry: compat_ptr() arguments, compat method-mask layout. */
+static long ib_umad_compat_ioctl(struct file *filp, unsigned int cmd,
+				 unsigned long arg)
+{
+	struct ib_umad_file *file = filp->private_data;
+
+	if (cmd == IB_USER_MAD_REGISTER_AGENT)
+		return ib_umad_reg_agent(file, compat_ptr(arg), 1);
+	if (cmd == IB_USER_MAD_UNREGISTER_AGENT)
+		return ib_umad_unreg_agent(file, compat_ptr(arg));
+	if (cmd == IB_USER_MAD_ENABLE_PKEY)
+		return ib_umad_enable_pkey(file);
+	return -ENOIOCTLCMD;
+}
+#endif
+
+/*
+ * Open a umad device: attach a new ib_umad_file to the port, holding a
+ * reference on its ib_umad_device that is dropped again on any failure.
+ * ib_umad_open() does not need the BKL:
+ *  - umad_port[] accesses are protected by port_lock, the
+ *    ib_umad_port structures are properly reference counted, and
+ *    everything else is purely local to the file being created, so
+ *    races against other open calls are not a problem;
+ *  - the ioctl method does not affect any global state outside of the
+ *    file structure being operated on;
+ *  - the port is added to umad_port[] as the last part of module
+ *    initialization so the open method will either immediately return
+ *    -ENXIO, or all required initialization will be done.
+ */
+static int ib_umad_open(struct inode *inode, struct file *filp)
+{
+	struct ib_umad_port *port;
+	struct ib_umad_file *file;
+	int ret = 0;
+
+	spin_lock(&port_lock);
+	port = umad_port[iminor(inode) - IB_UMAD_MINOR_BASE];
+	if (port)
+		kref_get(&port->umad_dev->ref);
+	spin_unlock(&port_lock);
+
+	if (!port)
+		return -ENXIO;
+
+	mutex_lock(&port->file_mutex);
+	if (!port->ib_dev) {
+		ret = -ENXIO;
+		goto out;
+	}
+
+	file = kzalloc(sizeof *file, GFP_KERNEL);
+	if (!file) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	mutex_init(&file->mutex);
+	spin_lock_init(&file->send_lock);
+	INIT_LIST_HEAD(&file->recv_list);
+	INIT_LIST_HEAD(&file->send_list);
+	init_waitqueue_head(&file->recv_wait);
+
+	file->port = port;
+	file->filp = filp;
+	filp->private_data = file;
+	list_add_tail(&file->port_list, &port->file_list);
+
+out:
+	mutex_unlock(&port->file_mutex);
+	if (ret)	/* put only after unlocking: the mutex lives in umad_dev */
+		kref_put(&port->umad_dev->ref, ib_umad_release_dev);
+	return ret;
+}
+
+static int ib_umad_close(struct inode *inode, struct file *filp)
+{
+	struct ib_umad_file *file = filp->private_data;
+	struct ib_umad_device *dev = file->port->umad_dev;
+	struct ib_umad_packet *packet, *tmp;
+	int already_dead;
+	int i;
+	/* Same lock order as registration: port->file_mutex, then file->mutex. */
+	mutex_lock(&file->port->file_mutex);
+	mutex_lock(&file->mutex);
+
+	already_dead = file->agents_dead;
+	file->agents_dead = 1;
+	/* Discard packets that were queued but never read. */
+	list_for_each_entry_safe(packet, tmp, &file->recv_list, list) {
+		if (packet->recv_wc)
+			ib_free_recv_mad(packet->recv_wc);
+		kfree(packet);
+	}
+
+	list_del(&file->port_list);
+
+	mutex_unlock(&file->mutex);
+	/* Skip agents that ib_umad_kill_port() has already unregistered. */
+	if (!already_dead)
+		for (i = 0; i < IB_UMAD_MAX_AGENTS; ++i)
+			if (file->agent[i])
+				ib_unregister_mad_agent(file->agent[i]);
+
+	mutex_unlock(&file->port->file_mutex);
+	/* Release the file and the device reference taken in ib_umad_open(). */
+	kfree(file);
+	kref_put(&dev->ref, ib_umad_release_dev);
+
+	return 0;
+}
+
+static const struct file_operations umad_fops = {
+	.owner 	 	= THIS_MODULE,
+	.read 	 	= ib_umad_read,
+	.write 	 	= ib_umad_write,
+	.poll 	 	= ib_umad_poll,
+	.unlocked_ioctl = ib_umad_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl 	= ib_umad_compat_ioctl,
+#endif
+	.open 	 	= ib_umad_open,
+	.release 	= ib_umad_close
+};
+
+static int ib_umad_sm_open(struct inode *inode, struct file *filp)
+{
+	struct ib_umad_port *port;
+	struct ib_port_modify props = {
+		.set_port_cap_mask = IB_PORT_SM
+	};
+	int ret;
+	/* issm minors are offset by IB_UMAD_MAX_PORTS from the umad minors. */
+	spin_lock(&port_lock);
+	port = umad_port[iminor(inode) - IB_UMAD_MINOR_BASE - IB_UMAD_MAX_PORTS];
+	if (port)
+		kref_get(&port->umad_dev->ref);
+	spin_unlock(&port_lock);
+
+	if (!port)
+		return -ENXIO;
+	/* sm_sem stays held until ib_umad_sm_close(); O_NONBLOCK never waits. */
+	if (filp->f_flags & O_NONBLOCK) {
+		if (down_trylock(&port->sm_sem)) {
+			ret = -EAGAIN;
+			goto fail;
+		}
+	} else {
+		if (down_interruptible(&port->sm_sem)) {
+			ret = -ERESTARTSYS;
+			goto fail;
+		}
+	}
+	/* NOTE(review): ib_dev is read without file_mutex here - confirm. */
+	ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props);
+	if (ret) {
+		up(&port->sm_sem);
+		goto fail;
+	}
+
+	filp->private_data = port;
+
+	return 0;
+
+fail:
+	kref_put(&port->umad_dev->ref, ib_umad_release_dev);
+	return ret;
+}
+
+static int ib_umad_sm_close(struct inode *inode, struct file *filp)
+{
+	struct ib_umad_port *port = filp->private_data;
+	struct ib_port_modify props = {
+		.clr_port_cap_mask = IB_PORT_SM
+	};
+	int ret = 0;
+	/* Clear the IB_PORT_SM capability unless the device is already gone. */
+	mutex_lock(&port->file_mutex);
+	if (port->ib_dev)
+		ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props);
+	mutex_unlock(&port->file_mutex);
+	/* Undo ib_umad_sm_open(): release sm_sem, then the device reference. */
+	up(&port->sm_sem);
+
+	kref_put(&port->umad_dev->ref, ib_umad_release_dev);
+
+	return ret;
+}
+
+static const struct file_operations umad_sm_fops = {
+	.owner 	 = THIS_MODULE,
+	.open 	 = ib_umad_sm_open,
+	.release = ib_umad_sm_close
+};
+
+static struct ib_client umad_client = {
+	.name   = "umad",
+	.add    = ib_umad_add_one,
+	.remove = ib_umad_remove_one
+};
+
+static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr,
+			  char *buf)
+{
+	struct ib_umad_port *port = dev_get_drvdata(dev);
+	/* ib_umad_kill_port() clears drvdata before destroying the device. */
+	if (!port)
+		return -ENODEV;
+	/* "ibdev" attribute: name of the underlying IB device. */
+	return sprintf(buf, "%s\n", port->ib_dev->name);
+}
+static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
+
+static ssize_t show_port(struct device *dev, struct device_attribute *attr,
+			 char *buf)
+{
+	struct ib_umad_port *port = dev_get_drvdata(dev);
+	/* ib_umad_kill_port() clears drvdata before destroying the device. */
+	if (!port)
+		return -ENODEV;
+	/* "port" attribute: port number on the IB device. */
+	return sprintf(buf, "%d\n", port->port_num);
+}
+static DEVICE_ATTR(port, S_IRUGO, show_port, NULL);
+
+static ssize_t show_abi_version(struct class *class, char *buf)
+{
+	return sprintf(buf, "%d\n", IB_USER_MAD_ABI_VERSION);	/* class-wide constant */
+}
+static CLASS_ATTR(abi_version, S_IRUGO, show_abi_version, NULL);
+
+/* Create the umad%d and issm%d devices for one port; 0, or -1 on failure. */
+static int ib_umad_init_port(struct ib_device *device, int port_num,
+			     struct ib_umad_port *port)
+{
+	spin_lock(&port_lock);
+	port->dev_num = find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS);
+	if (port->dev_num >= IB_UMAD_MAX_PORTS) {
+		spin_unlock(&port_lock);
+		return -1;
+	}
+	set_bit(port->dev_num, dev_map);
+	spin_unlock(&port_lock);
+	/* dev_num is reserved now; every failure below must give it back. */
+	port->ib_dev   = device;
+	port->port_num = port_num;
+	init_MUTEX(&port->sm_sem);
+	mutex_init(&port->file_mutex);
+	INIT_LIST_HEAD(&port->file_list);
+
+	port->cdev = cdev_alloc();
+	if (!port->cdev)
+		goto err_bit;
+	port->cdev->owner = THIS_MODULE;
+	port->cdev->ops   = &umad_fops;
+	kobject_set_name(&port->cdev->kobj, "umad%d", port->dev_num);
+	if (cdev_add(port->cdev, base_dev + port->dev_num, 1))
+		goto err_cdev;
+
+	port->dev = device_create(umad_class, device->dma_device,
+				  port->cdev->dev, port,
+				  "umad%d", port->dev_num);
+	if (IS_ERR(port->dev))
+		goto err_cdev;
+	if (device_create_file(port->dev, &dev_attr_ibdev))
+		goto err_dev;
+	if (device_create_file(port->dev, &dev_attr_port))
+		goto err_dev;
+
+	port->sm_cdev = cdev_alloc();
+	if (!port->sm_cdev)
+		goto err_dev;
+	port->sm_cdev->owner = THIS_MODULE;
+	port->sm_cdev->ops   = &umad_sm_fops;
+	kobject_set_name(&port->sm_cdev->kobj, "issm%d", port->dev_num);
+	if (cdev_add(port->sm_cdev, base_dev + port->dev_num + IB_UMAD_MAX_PORTS, 1))
+		goto err_sm_cdev;
+
+	port->sm_dev = device_create(umad_class, device->dma_device,
+				     port->sm_cdev->dev, port,
+				     "issm%d", port->dev_num);
+	if (IS_ERR(port->sm_dev))
+		goto err_sm_cdev;
+	if (device_create_file(port->sm_dev, &dev_attr_ibdev))
+		goto err_sm_dev;
+	if (device_create_file(port->sm_dev, &dev_attr_port))
+		goto err_sm_dev;
+	/* Publish the port last so open() only finds fully set up ports. */
+	spin_lock(&port_lock);
+	umad_port[port->dev_num] = port;
+	spin_unlock(&port_lock);
+
+	return 0;
+
+err_sm_dev:
+	device_destroy(umad_class, port->sm_cdev->dev);
+
+err_sm_cdev:
+	cdev_del(port->sm_cdev);
+
+err_dev:
+	device_destroy(umad_class, port->cdev->dev);
+
+err_cdev:
+	cdev_del(port->cdev);
+err_bit:
+	clear_bit(port->dev_num, dev_map);
+
+	return -1;
+}
+
+/* Tear down a port: destroy its devices, then kill all open files' agents. */
+static void ib_umad_kill_port(struct ib_umad_port *port)
+{
+	struct ib_umad_file *file;
+	int id;
+
+	dev_set_drvdata(port->dev,    NULL);
+	dev_set_drvdata(port->sm_dev, NULL);
+
+	device_destroy(umad_class, port->cdev->dev);
+	device_destroy(umad_class, port->sm_cdev->dev);
+
+	cdev_del(port->cdev);
+	cdev_del(port->sm_cdev);
+
+	spin_lock(&port_lock);
+	umad_port[port->dev_num] = NULL;
+	spin_unlock(&port_lock);
+
+	mutex_lock(&port->file_mutex);
+
+	/* From here on opens and agent registrations fail on ib_dev == NULL. */
+	port->ib_dev = NULL;
+
+	list_for_each_entry(file, &port->file_list, port_list) {
+		mutex_lock(&file->mutex);
+		file->agents_dead = 1;
+		mutex_unlock(&file->mutex);
+
+		for (id = 0; id < IB_UMAD_MAX_AGENTS; ++id)
+			if (file->agent[id])
+				ib_unregister_mad_agent(file->agent[id]);
+	}
+
+	mutex_unlock(&port->file_mutex);
+
+	clear_bit(port->dev_num, dev_map);
+}
+
+static void ib_umad_add_one(struct ib_device *device)
+{
+	struct ib_umad_device *umad_dev;
+	int s, e, i;
+	/* Only devices with InfiniBand transport get umad ports. */
+	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+		return;
+	/* A switch uses the single port 0; other nodes use 1..phys_port_cnt. */
+	if (device->node_type == RDMA_NODE_IB_SWITCH)
+		s = e = 0;
+	else {
+		s = 1;
+		e = device->phys_port_cnt;
+	}
+
+	umad_dev = kzalloc(sizeof *umad_dev +
+			   (e - s + 1) * sizeof (struct ib_umad_port),
+			   GFP_KERNEL);
+	if (!umad_dev)
+		return;
+	/* This initial reference is the one ib_umad_remove_one() drops. */
+	kref_init(&umad_dev->ref);
+
+	umad_dev->start_port = s;
+	umad_dev->end_port   = e;
+	/* port[] is indexed by (port number - start_port). */
+	for (i = s; i <= e; ++i) {
+		umad_dev->port[i - s].umad_dev = umad_dev;
+
+		if (rdma_port_get_link_layer(device, i) == IB_LINK_LAYER_INFINIBAND)
+			if (ib_umad_init_port(device, i, &umad_dev->port[i - s]))
+				goto err;
+	}
+
+	ib_set_client_data(device, &umad_client, umad_dev);
+
+	return;
+
+err:
+	while (--i >= s)
+		if (rdma_port_get_link_layer(device, i) == IB_LINK_LAYER_INFINIBAND)
+			ib_umad_kill_port(&umad_dev->port[i - s]);
+
+	kref_put(&umad_dev->ref, ib_umad_release_dev);
+}
+
+/* ib_client remove hook: tear down every port set up by ib_umad_add_one(). */
+static void ib_umad_remove_one(struct ib_device *device)
+{
+	struct ib_umad_device *umad_dev = ib_get_client_data(device, &umad_client);
+	int i;
+
+	if (!umad_dev)
+		return;
+	/* Walk real port numbers (0 for a switch), as ib_umad_add_one() did. */
+	for (i = umad_dev->start_port; i <= umad_dev->end_port; ++i)
+		if (rdma_port_get_link_layer(device, i) == IB_LINK_LAYER_INFINIBAND)
+			ib_umad_kill_port(&umad_dev->port[i - umad_dev->start_port]);
+	kref_put(&umad_dev->ref, ib_umad_release_dev);	/* the module's reference */
+}
+
+static int __init ib_umad_init(void)
+{
+	int ret;
+	/* Reserve minors for both the umad and the issm device of every port. */
+	ret = register_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2,
+				     "infiniband_mad");
+	if (ret) {
+		printk(KERN_ERR "user_mad: couldn't register device number\n");
+		goto out;
+	}
+
+	umad_class = class_create(THIS_MODULE, "infiniband_mad");
+	if (IS_ERR(umad_class)) {
+		ret = PTR_ERR(umad_class);
+		printk(KERN_ERR "user_mad: couldn't create class infiniband_mad\n");
+		goto out_chrdev;
+	}
+
+	ret = class_create_file(umad_class, &class_attr_abi_version);
+	if (ret) {
+		printk(KERN_ERR "user_mad: couldn't create abi_version attribute\n");
+		goto out_class;
+	}
+	/* Register last: the add callback creates devices in umad_class. */
+	ret = ib_register_client(&umad_client);
+	if (ret) {
+		printk(KERN_ERR "user_mad: couldn't register ib_umad client\n");
+		goto out_class;
+	}
+
+	return 0;
+
+out_class:
+	class_destroy(umad_class);
+
+out_chrdev:
+	unregister_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2);
+
+out:
+	return ret;
+}
+
+static void __exit ib_umad_cleanup(void)
+{
+	ib_unregister_client(&umad_client);	/* undo ib_umad_init() in reverse */
+	class_destroy(umad_class);
+	unregister_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2);
+}
+
+module_init(ib_umad_init);
+module_exit(ib_umad_cleanup);
diff --git a/sys/ofed/drivers/infiniband/core/uverbs.h b/sys/ofed/drivers/infiniband/core/uverbs.h
new file mode 100644
index 0000000..fa64da5
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/uverbs.h
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef UVERBS_H
+#define UVERBS_H
+
+#include <linux/kref.h>
+#include <linux/idr.h>
+#include <linux/mutex.h>
+#include <linux/completion.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_user_verbs.h>
+
+/*
+ * Our lifetime rules for these structs are the following:
+ *
+ * struct ib_uverbs_device: One reference is held by the module and
+ * released in ib_uverbs_remove_one().  Another reference is taken by
+ * ib_uverbs_open() each time the character special file is opened,
+ * and released in ib_uverbs_release_file() when the file is released.
+ *
+ * struct ib_uverbs_file: One reference is held by the VFS and
+ * released when the file is closed.  Another reference is taken when
+ * an asynchronous event queue file is created and released when the
+ * event file is closed.
+ *
+ * struct ib_uverbs_event_file: One reference is held by the VFS and
+ * released when the file is closed.  For asynchronous event files,
+ * another reference is held by the corresponding main context file
+ * and released when that file is closed.  For completion event files,
+ * a reference is taken when a CQ is created that uses the file, and
+ * released when the CQ is destroyed.
+ */
+
+/*
+ * Per-IB-device uverbs state; reference counting follows the lifetime
+ * rules in the comment above.
+ *
+ * ib_dev is the verbs device the command handlers call into.
+ * num_comp_vectors is reported to userspace by the get_context command
+ * and bounds the comp_vector accepted by create_cq.
+ * NOTE(review): comp is presumably completed when the last reference
+ * goes away so that device removal can wait for it -- confirm against
+ * the users of this struct.
+ */
+struct ib_uverbs_device {
+	struct kref				ref;
+	struct completion			comp;
+	int					devnum;
+	struct cdev			       *cdev;
+	struct device			       *dev;
+	struct ib_device		       *ib_dev;
+	int					num_comp_vectors;
+};
+
+/*
+ * An event queue exposed to userspace as a file.  is_async tells
+ * asynchronous event files apart from completion event files; the
+ * reference rules for both kinds are in the lifetime comment above.
+ * NOTE(review): lock presumably guards event_list and is_closed, and
+ * poll_wait/async_queue are presumably used to wake readers -- confirm
+ * against the event file implementation.
+ */
+struct ib_uverbs_event_file {
+	struct kref				ref;
+	struct file			       *filp;
+	struct ib_uverbs_file		       *uverbs_file;
+	spinlock_t				lock;
+	wait_queue_head_t			poll_wait;
+	struct fasync_struct		       *async_queue;
+	struct list_head			event_list;
+	int					is_async;
+	int					is_closed;
+};
+
+/*
+ * State for one open uverbs file.  mutex is held by the get_context
+ * command while the user context is created and by the object
+ * create/destroy commands while they edit the per-context object
+ * lists.  ucontext, event_handler and async_file are all set up by
+ * the get_context command.
+ */
+struct ib_uverbs_file {
+	struct kref				ref;
+	struct mutex				mutex;
+	struct ib_uverbs_device		       *device;
+	struct ib_ucontext		       *ucontext;
+	struct ib_event_handler			event_handler;
+	struct ib_uverbs_event_file	       *async_file;
+};
+
+/*
+ * One queued event; desc holds either an asynchronous or a completion
+ * event descriptor.
+ * NOTE(review): list presumably links the event on its event file,
+ * obj_list on the originating object, and counter presumably points at
+ * that object's events-reported count -- confirm against the code
+ * that queues and reads events.
+ */
+struct ib_uverbs_event {
+	union {
+		struct ib_uverbs_async_event_desc	async;
+		struct ib_uverbs_comp_event_desc	comp;
+	}					desc;
+	struct list_head			list;
+	struct list_head			obj_list;
+	u32				       *counter;
+};
+
+/*
+ * A multicast group address (gid and lid) kept on a list.
+ * NOTE(review): presumably one entry per attached group, linked on
+ * ib_uqp_object.mcast_list -- confirm against the attach/detach code.
+ */
+struct ib_uverbs_mcast_entry {
+	struct list_head	list;
+	union ib_gid 		gid;
+	u16 			lid;
+};
+
+/*
+ * A user object that can have events queued against it.
+ * NOTE(review): event_list presumably holds its pending events and
+ * events_reported counts those already delivered -- confirm.
+ */
+struct ib_uevent_object {
+	struct ib_uobject	uobject;
+	struct list_head	event_list;
+	u32			events_reported;
+};
+
+/*
+ * User object for a QP: an event-capable object plus a list head.
+ * NOTE(review): mcast_list presumably holds ib_uverbs_mcast_entry
+ * items for the groups this QP is attached to -- confirm.
+ */
+struct ib_uqp_object {
+	struct ib_uevent_object	uevent;
+	struct list_head 	mcast_list;
+};
+
+/*
+ * User object for a CQ.  The create_cq command points uverbs_file at
+ * the creating file, initializes both lists empty and zeroes both
+ * event counters.
+ * NOTE(review): comp_list/async_list presumably hold this CQ's pending
+ * completion and asynchronous events -- confirm.
+ */
+struct ib_ucq_object {
+	struct ib_uobject	uobject;
+	struct ib_uverbs_file  *uverbs_file;
+	struct list_head	comp_list;
+	struct list_head	async_list;
+	u32			comp_events_reported;
+	u32			async_events_reported;
+};
+
+/*
+ * User object for an XRC domain.
+ * NOTE(review): xrc_reg_qp_list presumably tracks the XRC receive QPs
+ * registered through this domain -- confirm.
+ */
+struct ib_uxrcd_object {
+	struct ib_uobject	uobject;
+	struct list_head	xrc_reg_qp_list;
+};
+
+/*
+ * One idr per object type, mapping userspace handles to ib_uobjects.
+ * ib_uverbs_idr_lock must be held across every operation on any of
+ * these idrs.
+ */
+extern spinlock_t ib_uverbs_idr_lock;
+extern struct idr ib_uverbs_pd_idr;
+extern struct idr ib_uverbs_mr_idr;
+extern struct idr ib_uverbs_mw_idr;
+extern struct idr ib_uverbs_ah_idr;
+extern struct idr ib_uverbs_cq_idr;
+extern struct idr ib_uverbs_qp_idr;
+extern struct idr ib_uverbs_srq_idr;
+extern struct idr ib_uverbs_xrc_domain_idr;
+
+/* Remove uobj's handle from idp, taking ib_uverbs_idr_lock. */
+void idr_remove_uobj(struct idr *idp, struct ib_uobject *uobj);
+
+/*
+ * Event file helpers: allocate an asynchronous (is_async != 0) or
+ * completion event file, returning the file and storing its descriptor
+ * in *fd, and look up a completion event file by descriptor.
+ */
+struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
+					int is_async, int *fd);
+struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd);
+
+/* Release helpers for CQ and event-capable user objects. */
+void ib_uverbs_release_ucq(struct ib_uverbs_file *file,
+			   struct ib_uverbs_event_file *ev_file,
+			   struct ib_ucq_object *uobj);
+void ib_uverbs_release_uevent(struct ib_uverbs_file *file,
+			      struct ib_uevent_object *uobj);
+
+/*
+ * Completion and event callbacks installed on the verbs objects that
+ * the command handlers create.
+ */
+void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context);
+void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr);
+void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr);
+void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr);
+void ib_uverbs_event_handler(struct ib_event_handler *handler,
+			     struct ib_event *event);
+void ib_uverbs_xrc_rcv_qp_event_handler(struct ib_event *event,
+					void *context_ptr);
+/* XRC domain and XRC receive QP teardown helpers. */
+void ib_uverbs_dealloc_xrcd(struct ib_device *ib_dev,
+			    struct ib_xrcd *xrcd);
+int ib_uverbs_cleanup_xrc_rcv_qp(struct ib_uverbs_file *file,
+				 struct ib_xrcd *xrcd, u32 qp_num);
+
+/*
+ * Declare the handler ib_uverbs_<name>() for one userspace command.
+ * Every handler has the same signature: the issuing file, the user
+ * buffer holding the command, and the input and output lengths.
+ */
+#define IB_UVERBS_DECLARE_CMD(name)					\
+	ssize_t ib_uverbs_##name(struct ib_uverbs_file *file,		\
+				 const char __user *buf, int in_len,	\
+				 int out_len)
+
+IB_UVERBS_DECLARE_CMD(get_context);
+IB_UVERBS_DECLARE_CMD(query_device);
+IB_UVERBS_DECLARE_CMD(query_port);
+IB_UVERBS_DECLARE_CMD(alloc_pd);
+IB_UVERBS_DECLARE_CMD(dealloc_pd);
+IB_UVERBS_DECLARE_CMD(reg_mr);
+IB_UVERBS_DECLARE_CMD(dereg_mr);
+IB_UVERBS_DECLARE_CMD(create_comp_channel);
+IB_UVERBS_DECLARE_CMD(create_cq);
+IB_UVERBS_DECLARE_CMD(resize_cq);
+IB_UVERBS_DECLARE_CMD(poll_cq);
+IB_UVERBS_DECLARE_CMD(req_notify_cq);
+IB_UVERBS_DECLARE_CMD(destroy_cq);
+IB_UVERBS_DECLARE_CMD(create_qp);
+IB_UVERBS_DECLARE_CMD(query_qp);
+IB_UVERBS_DECLARE_CMD(modify_qp);
+IB_UVERBS_DECLARE_CMD(destroy_qp);
+IB_UVERBS_DECLARE_CMD(post_send);
+IB_UVERBS_DECLARE_CMD(post_recv);
+IB_UVERBS_DECLARE_CMD(post_srq_recv);
+IB_UVERBS_DECLARE_CMD(create_ah);
+IB_UVERBS_DECLARE_CMD(destroy_ah);
+IB_UVERBS_DECLARE_CMD(attach_mcast);
+IB_UVERBS_DECLARE_CMD(detach_mcast);
+IB_UVERBS_DECLARE_CMD(create_srq);
+IB_UVERBS_DECLARE_CMD(modify_srq);
+IB_UVERBS_DECLARE_CMD(query_srq);
+IB_UVERBS_DECLARE_CMD(destroy_srq);
+IB_UVERBS_DECLARE_CMD(create_xrc_srq);
+IB_UVERBS_DECLARE_CMD(open_xrc_domain);
+IB_UVERBS_DECLARE_CMD(close_xrc_domain);
+IB_UVERBS_DECLARE_CMD(create_xrc_rcv_qp);
+IB_UVERBS_DECLARE_CMD(modify_xrc_rcv_qp);
+IB_UVERBS_DECLARE_CMD(query_xrc_rcv_qp);
+IB_UVERBS_DECLARE_CMD(reg_xrc_rcv_qp);
+IB_UVERBS_DECLARE_CMD(unreg_xrc_rcv_qp);
+
+
+#endif /* UVERBS_H */
diff --git a/sys/ofed/drivers/infiniband/core/uverbs_cmd.c b/sys/ofed/drivers/infiniband/core/uverbs_cmd.c
new file mode 100644
index 0000000..3520182
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/uverbs_cmd.c
@@ -0,0 +1,3022 @@
+/*
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 PathScale, Inc.  All rights reserved.
+ * Copyright (c) 2006 Mellanox Technologies.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/file.h>
+#include <linux/fs.h>
+
+#include <asm/uaccess.h>
+#include <asm/fcntl.h>
+
+#include "uverbs.h"
+
+/*
+ * One lockdep class per object type; init_uobj() assigns the matching
+ * key to each new object's rwsem.
+ */
+static struct lock_class_key pd_lock_key;
+static struct lock_class_key mr_lock_key;
+static struct lock_class_key cq_lock_key;
+static struct lock_class_key qp_lock_key;
+static struct lock_class_key ah_lock_key;
+static struct lock_class_key srq_lock_key;
+
+/*
+ * Fill in a struct ib_udata with the user-space input/output buffers
+ * and their lengths.  The command handlers pass the bytes that follow
+ * the fixed command and response structures.
+ */
+#define INIT_UDATA(udata, ibuf, obuf, ilen, olen)			\
+	do {								\
+		(udata)->inbuf  = (void __user *) (ibuf);		\
+		(udata)->outbuf = (void __user *) (obuf);		\
+		(udata)->inlen  = (ilen);				\
+		(udata)->outlen = (olen);				\
+	} while (0)
+
+/*
+ * The ib_uobject locking scheme is as follows:
+ *
+ * - ib_uverbs_idr_lock protects the uverbs idrs themselves, so it
+ *   needs to be held during all idr operations.  When an object is
+ *   looked up, a reference must be taken on the object's kref before
+ *   dropping this lock.
+ *
+ * - Each object also has an rwsem.  This rwsem must be held for
+ *   reading while an operation that uses the object is performed.
+ *   For example, while registering an MR, the associated PD's
+ *   uobject.mutex must be held for reading.  The rwsem must be held
+ *   for writing while initializing or destroying an object.
+ *
+ * - In addition, each object has a "live" flag.  If this flag is not
+ *   set, then lookups of the object will fail even if it is found in
+ *   the idr.  This handles a reader that blocks and does not acquire
+ *   the rwsem until after the object is destroyed.  The destroy
+ *   operation will set the live flag to 0 and then drop the rwsem;
+ *   this will allow the reader to acquire the rwsem, see that the
+ *   live flag is 0, and then drop the rwsem and its reference to
+ *   the object.  The underlying storage will not be freed until the last
+ *   reference to the object is dropped.
+ */
+
+/*
+ * Prepare a freshly allocated user object: record its user handle and
+ * owning context, start its reference count, and set up its rwsem
+ * with the lockdep class @key.  The object starts out not live; the
+ * creating command sets live once setup has succeeded.
+ */
+static void init_uobj(struct ib_uobject *uobj, u64 user_handle,
+		      struct ib_ucontext *context, struct lock_class_key *key)
+{
+	uobj->live        = 0;
+	uobj->context     = context;
+	uobj->user_handle = user_handle;
+
+	kref_init(&uobj->ref);
+
+	init_rwsem(&uobj->mutex);
+	lockdep_set_class(&uobj->mutex, key);
+}
+
+/* kref release callback: free the ib_uobject that embeds @kref. */
+static void release_uobj(struct kref *kref)
+{
+	struct ib_uobject *uobj = container_of(kref, struct ib_uobject, ref);
+
+	kfree(uobj);
+}
+
+/* Drop one reference on @uobj; the last put frees it via release_uobj(). */
+static void put_uobj(struct ib_uobject *uobj)
+{
+	kref_put(&uobj->ref, release_uobj);
+}
+
+/* Release @uobj's rwsem held for reading, then drop one reference. */
+static void put_uobj_read(struct ib_uobject *uobj)
+{
+	up_read(&uobj->mutex);
+	put_uobj(uobj);
+}
+
+/* Release @uobj's rwsem held for writing, then drop one reference. */
+static void put_uobj_write(struct ib_uobject *uobj)
+{
+	up_write(&uobj->mutex);
+	put_uobj(uobj);
+}
+
+/*
+ * Insert @uobj into @idr and store the new handle in uobj->id.
+ * Memory is preallocated outside the spinlock; if the insertion still
+ * reports -EAGAIN the preallocate/insert sequence is repeated.
+ * Returns 0 on success, -ENOMEM if preallocation fails, or the error
+ * from idr_get_new().
+ */
+static int idr_add_uobj(struct idr *idr, struct ib_uobject *uobj)
+{
+	int rc;
+
+	do {
+		if (!idr_pre_get(idr, GFP_KERNEL))
+			return -ENOMEM;
+
+		spin_lock(&ib_uverbs_idr_lock);
+		rc = idr_get_new(idr, uobj, &uobj->id);
+		spin_unlock(&ib_uverbs_idr_lock);
+	} while (rc == -EAGAIN);
+
+	return rc;
+}
+
+/* Remove @uobj's handle (uobj->id) from @idr under ib_uverbs_idr_lock. */
+void idr_remove_uobj(struct idr *idr, struct ib_uobject *uobj)
+{
+	spin_lock(&ib_uverbs_idr_lock);
+	idr_remove(idr, uobj->id);
+	spin_unlock(&ib_uverbs_idr_lock);
+}
+
+/*
+ * Look up handle @id in @idr.  The object is returned, with an extra
+ * reference taken before the idr lock is dropped, only if it belongs
+ * to @context; otherwise NULL is returned.
+ */
+static struct ib_uobject *__idr_get_uobj(struct idr *idr, int id,
+					 struct ib_ucontext *context)
+{
+	struct ib_uobject *found;
+
+	spin_lock(&ib_uverbs_idr_lock);
+
+	found = idr_find(idr, id);
+	/* A handle owned by a different context is treated as not found. */
+	if (found && found->context != context)
+		found = NULL;
+	if (found)
+		kref_get(&found->ref);
+
+	spin_unlock(&ib_uverbs_idr_lock);
+
+	return found;
+}
+
+/*
+ * Look up handle @id for @context and return the object with its rwsem
+ * held for reading and a reference taken.  A non-zero @nested takes
+ * the rwsem with the SINGLE_DEPTH_NESTING lockdep subclass.  Returns
+ * NULL, with nothing held, if the handle is unknown or the object is
+ * no longer live.
+ */
+static struct ib_uobject *idr_read_uobj(struct idr *idr, int id,
+					struct ib_ucontext *context, int nested)
+{
+	struct ib_uobject *obj = __idr_get_uobj(idr, id, context);
+
+	if (!obj)
+		return NULL;
+
+	if (!nested)
+		down_read(&obj->mutex);
+	else
+		down_read_nested(&obj->mutex, SINGLE_DEPTH_NESTING);
+
+	if (obj->live)
+		return obj;
+
+	/* Destroyed while we waited for the lock: undo and report failure. */
+	put_uobj_read(obj);
+	return NULL;
+}
+
+/*
+ * Look up handle @id for @context and return the object with its rwsem
+ * held for writing and a reference taken.  Returns NULL, with nothing
+ * held, if the handle is unknown or the object is no longer live.
+ */
+static struct ib_uobject *idr_write_uobj(struct idr *idr, int id,
+					 struct ib_ucontext *context)
+{
+	struct ib_uobject *obj = __idr_get_uobj(idr, id, context);
+
+	if (!obj)
+		return NULL;
+
+	down_write(&obj->mutex);
+	if (obj->live)
+		return obj;
+
+	/* Destroyed while we waited for the lock: undo and report failure. */
+	put_uobj_write(obj);
+	return NULL;
+}
+
+/*
+ * Like idr_read_uobj(), but return the verbs object stored in the
+ * user object (uobj->object) rather than the user object itself.
+ * Returns NULL if the lookup fails.
+ */
+static void *idr_read_obj(struct idr *idr, int id, struct ib_ucontext *context,
+			  int nested)
+{
+	struct ib_uobject *holder = idr_read_uobj(idr, id, context, nested);
+
+	if (!holder)
+		return NULL;
+
+	return holder->object;
+}
+
+/*
+ * Look up a PD by handle for @context; on success its user object is
+ * read-locked and referenced (release with put_pd_read()).
+ */
+static struct ib_pd *idr_read_pd(int pd_handle, struct ib_ucontext *context)
+{
+	return idr_read_obj(&ib_uverbs_pd_idr, pd_handle, context, 0);
+}
+
+/* Release the read lock and reference obtained by idr_read_pd(). */
+static void put_pd_read(struct ib_pd *pd)
+{
+	put_uobj_read(pd->uobject);
+}
+
+/*
+ * Look up a CQ by handle for @context; on success its user object is
+ * read-locked and referenced (release with put_cq_read()).  @nested is
+ * passed through to select the nested lockdep subclass.
+ */
+static struct ib_cq *idr_read_cq(int cq_handle, struct ib_ucontext *context, int nested)
+{
+	return idr_read_obj(&ib_uverbs_cq_idr, cq_handle, context, nested);
+}
+
+/* Release the read lock and reference obtained by idr_read_cq(). */
+static void put_cq_read(struct ib_cq *cq)
+{
+	put_uobj_read(cq->uobject);
+}
+
+/*
+ * Look up an AH by handle for @context; on success its user object is
+ * read-locked and referenced (release with put_ah_read()).
+ */
+static struct ib_ah *idr_read_ah(int ah_handle, struct ib_ucontext *context)
+{
+	return idr_read_obj(&ib_uverbs_ah_idr, ah_handle, context, 0);
+}
+
+/* Release the read lock and reference obtained by idr_read_ah(). */
+static void put_ah_read(struct ib_ah *ah)
+{
+	put_uobj_read(ah->uobject);
+}
+
+/*
+ * Look up a QP by handle for @context; on success its user object is
+ * read-locked and referenced (release with put_qp_read()).
+ */
+static struct ib_qp *idr_read_qp(int qp_handle, struct ib_ucontext *context)
+{
+	return idr_read_obj(&ib_uverbs_qp_idr, qp_handle, context, 0);
+}
+
+/* Release the read lock and reference obtained by idr_read_qp(). */
+static void put_qp_read(struct ib_qp *qp)
+{
+	put_uobj_read(qp->uobject);
+}
+
+/*
+ * Look up an SRQ by handle for @context; on success its user object is
+ * read-locked and referenced (release with put_srq_read()).
+ */
+static struct ib_srq *idr_read_srq(int srq_handle, struct ib_ucontext *context)
+{
+	return idr_read_obj(&ib_uverbs_srq_idr, srq_handle, context, 0);
+}
+
+/* Release the read lock and reference obtained by idr_read_srq(). */
+static void put_srq_read(struct ib_srq *srq)
+{
+	put_uobj_read(srq->uobject);
+}
+
+/*
+ * Look up an XRC domain by handle for @context.  The read-locked,
+ * referenced user object is stored in *uobj (NULL on failure) so the
+ * caller can release it with put_xrcd_read(); the XRC domain it wraps
+ * is returned, or NULL if the lookup fails.
+ */
+static struct ib_xrcd *idr_read_xrcd(int xrcd_handle,
+				     struct ib_ucontext *context,
+				     struct ib_uobject **uobj)
+{
+	struct ib_uobject *holder;
+
+	holder = idr_read_uobj(&ib_uverbs_xrc_domain_idr, xrcd_handle,
+			       context, 0);
+	*uobj = holder;
+
+	if (!holder)
+		return NULL;
+
+	return holder->object;
+}
+
+/* Release the read lock and reference obtained by idr_read_xrcd(). */
+static void put_xrcd_read(struct ib_uobject *uobj)
+{
+	put_uobj_read(uobj);
+}
+
+/*
+ * GET_CONTEXT command: create the user context for @file.
+ *
+ * Asks the driver for a ucontext, creates the asynchronous event file,
+ * writes the response (async fd and number of completion vectors) to
+ * userspace, registers the file's event handler, and only then
+ * publishes the context in file->ucontext and installs the fd.
+ *
+ * Returns in_len on success.  Errors: -ENOSPC if the response buffer
+ * is too small, -EFAULT on a failed user copy, -EINVAL if the file
+ * already has a context, or the error reported by the driver, the
+ * event file allocation, or the event handler registration.
+ */
+ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
+			      const char __user *buf,
+			      int in_len, int out_len)
+{
+	struct ib_uverbs_get_context      cmd;
+	struct ib_uverbs_get_context_resp resp;
+	struct ib_udata                   udata;
+	struct ib_device                 *ibdev = file->device->ib_dev;
+	struct ib_ucontext		 *ucontext;
+	struct file			 *filp;
+	int ret;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	mutex_lock(&file->mutex);
+
+	/* Only one context may be created per open file. */
+	if (file->ucontext) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	INIT_UDATA(&udata, buf + sizeof cmd,
+		   (unsigned long) cmd.response + sizeof resp,
+		   in_len - sizeof cmd, out_len - sizeof resp);
+
+	ucontext = ibdev->alloc_ucontext(ibdev, &udata);
+	if (IS_ERR(ucontext)) {
+		/*
+		 * Report the driver's error.  file->ucontext is still
+		 * NULL here, so taking PTR_ERR() of it would return 0
+		 * and hide the failure from the caller.
+		 */
+		ret = PTR_ERR(ucontext);
+		goto err;
+	}
+
+	ucontext->device = ibdev;
+	INIT_LIST_HEAD(&ucontext->pd_list);
+	INIT_LIST_HEAD(&ucontext->mr_list);
+	INIT_LIST_HEAD(&ucontext->mw_list);
+	INIT_LIST_HEAD(&ucontext->cq_list);
+	INIT_LIST_HEAD(&ucontext->qp_list);
+	INIT_LIST_HEAD(&ucontext->srq_list);
+	INIT_LIST_HEAD(&ucontext->ah_list);
+	INIT_LIST_HEAD(&ucontext->xrc_domain_list);
+	ucontext->closing = 0;
+
+	resp.num_comp_vectors = file->device->num_comp_vectors;
+
+	filp = ib_uverbs_alloc_event_file(file, 1, &resp.async_fd);
+	if (IS_ERR(filp)) {
+		ret = PTR_ERR(filp);
+		goto err_free;
+	}
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp)) {
+		ret = -EFAULT;
+		goto err_file;
+	}
+
+	file->async_file = filp->private_data;
+
+	INIT_IB_EVENT_HANDLER(&file->event_handler, file->device->ib_dev,
+			      ib_uverbs_event_handler);
+	ret = ib_register_event_handler(&file->event_handler);
+	if (ret)
+		goto err_file;
+
+	/* The async event file and the main file reference each other. */
+	kref_get(&file->async_file->ref);
+	kref_get(&file->ref);
+	file->ucontext = ucontext;
+
+	/* Nothing can fail past this point, so expose the fd to userspace. */
+	fd_install(resp.async_fd, filp);
+
+	mutex_unlock(&file->mutex);
+
+	return in_len;
+
+err_file:
+	put_unused_fd(resp.async_fd);
+	fput(filp);
+
+err_free:
+	ibdev->dealloc_ucontext(ucontext);
+
+err:
+	mutex_unlock(&file->mutex);
+	return ret;
+}
+
+/*
+ * QUERY_DEVICE command: query the device attributes and copy them to
+ * the user's response buffer in the ib_uverbs_query_device_resp
+ * layout.  node_guid and phys_port_cnt are taken from the ib_device
+ * itself; everything else comes from ib_query_device().
+ *
+ * Returns in_len on success, -ENOSPC if the response buffer is too
+ * small, -EFAULT on a failed user copy, or the query's error.
+ */
+ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
+			       const char __user *buf,
+			       int in_len, int out_len)
+{
+	struct ib_device *ibdev = file->device->ib_dev;
+	struct ib_uverbs_query_device cmd;
+	struct ib_uverbs_query_device_resp resp;
+	struct ib_device_attr attr;
+	int ret;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	ret = ib_query_device(ibdev, &attr);
+	if (ret)
+		return ret;
+
+	/* Zero first so fields not assigned below are not left undefined. */
+	memset(&resp, 0, sizeof resp);
+
+	resp.fw_ver = attr.fw_ver;
+	resp.node_guid = ibdev->node_guid;
+	resp.sys_image_guid = attr.sys_image_guid;
+	resp.max_mr_size = attr.max_mr_size;
+	resp.page_size_cap = attr.page_size_cap;
+	resp.vendor_id = attr.vendor_id;
+	resp.vendor_part_id = attr.vendor_part_id;
+	resp.hw_ver = attr.hw_ver;
+	resp.max_qp = attr.max_qp;
+	resp.max_qp_wr = attr.max_qp_wr;
+	resp.device_cap_flags = attr.device_cap_flags;
+	resp.max_sge = attr.max_sge;
+	resp.max_sge_rd = attr.max_sge_rd;
+	resp.max_cq = attr.max_cq;
+	resp.max_cqe = attr.max_cqe;
+	resp.max_mr = attr.max_mr;
+	resp.max_pd = attr.max_pd;
+	resp.max_qp_rd_atom = attr.max_qp_rd_atom;
+	resp.max_ee_rd_atom = attr.max_ee_rd_atom;
+	resp.max_res_rd_atom = attr.max_res_rd_atom;
+	resp.max_qp_init_rd_atom = attr.max_qp_init_rd_atom;
+	resp.max_ee_init_rd_atom = attr.max_ee_init_rd_atom;
+	resp.atomic_cap = attr.atomic_cap;
+	resp.max_ee = attr.max_ee;
+	resp.max_rdd = attr.max_rdd;
+	resp.max_mw = attr.max_mw;
+	resp.max_raw_ipv6_qp = attr.max_raw_ipv6_qp;
+	resp.max_raw_ethy_qp = attr.max_raw_ethy_qp;
+	resp.max_mcast_grp = attr.max_mcast_grp;
+	resp.max_mcast_qp_attach = attr.max_mcast_qp_attach;
+	resp.max_total_mcast_qp_attach = attr.max_total_mcast_qp_attach;
+	resp.max_ah = attr.max_ah;
+	resp.max_fmr = attr.max_fmr;
+	resp.max_map_per_fmr = attr.max_map_per_fmr;
+	resp.max_srq = attr.max_srq;
+	resp.max_srq_wr = attr.max_srq_wr;
+	resp.max_srq_sge = attr.max_srq_sge;
+	resp.max_pkeys = attr.max_pkeys;
+	resp.local_ca_ack_delay = attr.local_ca_ack_delay;
+	resp.phys_port_cnt = ibdev->phys_port_cnt;
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp))
+		return -EFAULT;
+
+	return in_len;
+}
+
+/*
+ * QUERY_PORT command: query the attributes of port cmd.port_num and
+ * copy them to the user's response buffer in the
+ * ib_uverbs_query_port_resp layout.
+ *
+ * Returns in_len on success, -ENOSPC if the response buffer is too
+ * small, -EFAULT on a failed user copy, or the query's error.
+ */
+ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
+			     const char __user *buf,
+			     int in_len, int out_len)
+{
+	struct ib_uverbs_query_port cmd;
+	struct ib_uverbs_query_port_resp resp;
+	struct ib_port_attr attr;
+	int ret;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	ret = ib_query_port(file->device->ib_dev, cmd.port_num, &attr);
+	if (ret)
+		return ret;
+
+	/* Zero first so fields not assigned below are not left undefined. */
+	memset(&resp, 0, sizeof resp);
+
+	resp.state = attr.state;
+	resp.max_mtu = attr.max_mtu;
+	resp.active_mtu = attr.active_mtu;
+	resp.gid_tbl_len = attr.gid_tbl_len;
+	resp.port_cap_flags = attr.port_cap_flags;
+	resp.max_msg_sz = attr.max_msg_sz;
+	resp.bad_pkey_cntr = attr.bad_pkey_cntr;
+	resp.qkey_viol_cntr = attr.qkey_viol_cntr;
+	resp.pkey_tbl_len = attr.pkey_tbl_len;
+	resp.lid = attr.lid;
+	resp.sm_lid = attr.sm_lid;
+	resp.lmc = attr.lmc;
+	resp.max_vl_num = attr.max_vl_num;
+	resp.sm_sl = attr.sm_sl;
+	resp.subnet_timeout = attr.subnet_timeout;
+	resp.init_type_reply = attr.init_type_reply;
+	resp.active_width = attr.active_width;
+	resp.active_speed = attr.active_speed;
+	resp.phys_state = attr.phys_state;
+	resp.link_layer = attr.link_layer;
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp))
+		return -EFAULT;
+
+	return in_len;
+}
+
+/*
+ * ALLOC_PD command: allocate a protection domain through the driver,
+ * wrap it in a user object, publish it in the PD idr and return the
+ * new handle to userspace.
+ *
+ * Returns in_len on success, -ENOSPC if the response buffer is too
+ * small, -EFAULT on a failed user copy, -ENOMEM on allocation
+ * failure, or the error from the driver or the idr insertion.
+ */
+ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
+			   const char __user *buf,
+			   int in_len, int out_len)
+{
+	struct ib_uverbs_alloc_pd      cmd;
+	struct ib_uverbs_alloc_pd_resp resp;
+	struct ib_udata                udata;
+	struct ib_uobject             *uobj;
+	struct ib_pd                  *pd;
+	int                            ret;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	INIT_UDATA(&udata, buf + sizeof cmd,
+		   (unsigned long) cmd.response + sizeof resp,
+		   in_len - sizeof cmd, out_len - sizeof resp);
+
+	uobj = kmalloc(sizeof *uobj, GFP_KERNEL);
+	if (!uobj)
+		return -ENOMEM;
+
+	init_uobj(uobj, 0, file->ucontext, &pd_lock_key);
+	/* Hold the rwsem for writing while the object is initialized. */
+	down_write(&uobj->mutex);
+
+	pd = file->device->ib_dev->alloc_pd(file->device->ib_dev,
+					    file->ucontext, &udata);
+	if (IS_ERR(pd)) {
+		ret = PTR_ERR(pd);
+		goto err;
+	}
+
+	pd->device  = file->device->ib_dev;
+	pd->uobject = uobj;
+	atomic_set(&pd->usecnt, 0);
+
+	uobj->object = pd;
+	ret = idr_add_uobj(&ib_uverbs_pd_idr, uobj);
+	if (ret)
+		goto err_idr;
+
+	memset(&resp, 0, sizeof resp);
+	resp.pd_handle = uobj->id;
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp)) {
+		ret = -EFAULT;
+		goto err_copy;
+	}
+
+	mutex_lock(&file->mutex);
+	list_add_tail(&uobj->list, &file->ucontext->pd_list);
+	mutex_unlock(&file->mutex);
+
+	/* Lookups of this handle succeed only once live is set. */
+	uobj->live = 1;
+
+	up_write(&uobj->mutex);
+
+	return in_len;
+
+err_copy:
+	idr_remove_uobj(&ib_uverbs_pd_idr, uobj);
+
+err_idr:
+	ib_dealloc_pd(pd);
+
+err:
+	/* Drops the write lock and the reference from init_uobj(). */
+	put_uobj_write(uobj);
+	return ret;
+}
+
+/*
+ * DEALLOC_PD command: destroy the protection domain named by
+ * cmd.pd_handle and remove its user object from the idr and from the
+ * context's PD list.
+ *
+ * Returns in_len on success, -EFAULT on a failed user copy, -EINVAL
+ * if the handle does not name a live PD of this context, or the error
+ * from ib_dealloc_pd() (in which case the object stays live).
+ */
+ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
+			     const char __user *buf,
+			     int in_len, int out_len)
+{
+	struct ib_uverbs_dealloc_pd cmd;
+	struct ib_uobject          *uobj;
+	int                         ret;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	uobj = idr_write_uobj(&ib_uverbs_pd_idr, cmd.pd_handle, file->ucontext);
+	if (!uobj)
+		return -EINVAL;
+
+	ret = ib_dealloc_pd(uobj->object);
+	if (!ret)
+		uobj->live = 0;
+
+	/* Drops the write lock and the reference taken by the lookup. */
+	put_uobj_write(uobj);
+
+	if (ret)
+		return ret;
+
+	idr_remove_uobj(&ib_uverbs_pd_idr, uobj);
+
+	mutex_lock(&file->mutex);
+	list_del(&uobj->list);
+	mutex_unlock(&file->mutex);
+
+	/* Presumably drops the initial reference from init_uobj(). */
+	put_uobj(uobj);
+
+	return in_len;
+}
+
+/*
+ * REG_MR command: register a user memory region on the PD named by
+ * cmd.pd_handle, publish it in the MR idr and return its handle, lkey
+ * and rkey to userspace.
+ *
+ * Returns in_len on success, -ENOSPC if the response buffer is too
+ * small, -EFAULT on a failed user copy, -EINVAL for inconsistent
+ * arguments or an unknown PD handle, -ENOMEM on allocation failure,
+ * or the error from the driver or the idr insertion.
+ */
+ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
+			 const char __user *buf, int in_len,
+			 int out_len)
+{
+	struct ib_uverbs_reg_mr      cmd;
+	struct ib_uverbs_reg_mr_resp resp;
+	struct ib_udata              udata;
+	struct ib_uobject           *uobj;
+	struct ib_pd                *pd;
+	struct ib_mr                *mr;
+	int                          ret;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	INIT_UDATA(&udata, buf + sizeof cmd,
+		   (unsigned long) cmd.response + sizeof resp,
+		   in_len - sizeof cmd, out_len - sizeof resp);
+
+	/* start and hca_va must have the same offset within a page. */
+	if ((cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK))
+		return -EINVAL;
+
+	/*
+	 * Local write permission is required if remote write or
+	 * remote atomic permission is also requested.
+	 */
+	if (cmd.access_flags & (IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_REMOTE_WRITE) &&
+	    !(cmd.access_flags & IB_ACCESS_LOCAL_WRITE))
+		return -EINVAL;
+
+	uobj = kmalloc(sizeof *uobj, GFP_KERNEL);
+	if (!uobj)
+		return -ENOMEM;
+
+	init_uobj(uobj, 0, file->ucontext, &mr_lock_key);
+	/* Hold the rwsem for writing while the object is initialized. */
+	down_write(&uobj->mutex);
+
+	/* The PD stays read-locked for the duration of the registration. */
+	pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+	if (!pd) {
+		ret = -EINVAL;
+		goto err_free;
+	}
+
+	mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va,
+				     cmd.access_flags, &udata);
+	if (IS_ERR(mr)) {
+		ret = PTR_ERR(mr);
+		goto err_put;
+	}
+
+	mr->device  = pd->device;
+	mr->pd      = pd;
+	mr->uobject = uobj;
+	atomic_inc(&pd->usecnt);
+	atomic_set(&mr->usecnt, 0);
+
+	uobj->object = mr;
+	ret = idr_add_uobj(&ib_uverbs_mr_idr, uobj);
+	if (ret)
+		goto err_unreg;
+
+	memset(&resp, 0, sizeof resp);
+	resp.lkey      = mr->lkey;
+	resp.rkey      = mr->rkey;
+	resp.mr_handle = uobj->id;
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp)) {
+		ret = -EFAULT;
+		goto err_copy;
+	}
+
+	put_pd_read(pd);
+
+	mutex_lock(&file->mutex);
+	list_add_tail(&uobj->list, &file->ucontext->mr_list);
+	mutex_unlock(&file->mutex);
+
+	/* Lookups of this handle succeed only once live is set. */
+	uobj->live = 1;
+
+	up_write(&uobj->mutex);
+
+	return in_len;
+
+err_copy:
+	idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
+
+err_unreg:
+	ib_dereg_mr(mr);
+
+err_put:
+	put_pd_read(pd);
+
+err_free:
+	/* Drops the write lock and the reference from init_uobj(). */
+	put_uobj_write(uobj);
+	return ret;
+}
+
+/*
+ * DEREG_MR command: deregister the memory region named by
+ * cmd.mr_handle and remove its user object from the idr and from the
+ * context's MR list.
+ *
+ * Returns in_len on success, -EFAULT on a failed user copy, -EINVAL
+ * if the handle does not name a live MR of this context, or the error
+ * from ib_dereg_mr() (in which case the object stays live).
+ */
+ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
+			   const char __user *buf, int in_len,
+			   int out_len)
+{
+	struct ib_uverbs_dereg_mr cmd;
+	struct ib_mr             *mr;
+	struct ib_uobject	 *uobj;
+	int                       ret = -EINVAL;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	uobj = idr_write_uobj(&ib_uverbs_mr_idr, cmd.mr_handle, file->ucontext);
+	if (!uobj)
+		return -EINVAL;
+
+	mr = uobj->object;
+
+	ret = ib_dereg_mr(mr);
+	if (!ret)
+		uobj->live = 0;
+
+	/* Drops the write lock and the reference taken by the lookup. */
+	put_uobj_write(uobj);
+
+	if (ret)
+		return ret;
+
+	idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
+
+	mutex_lock(&file->mutex);
+	list_del(&uobj->list);
+	mutex_unlock(&file->mutex);
+
+	/* Presumably drops the initial reference from init_uobj(). */
+	put_uobj(uobj);
+
+	return in_len;
+}
+
+/*
+ * CREATE_COMP_CHANNEL command: create a completion event file and
+ * return its descriptor to userspace.  The descriptor is installed
+ * only after the response has been copied out, so a failed copy
+ * leaves no file behind.
+ *
+ * Returns in_len on success, -ENOSPC if the response buffer is too
+ * small, -EFAULT on a failed user copy, or the error from the event
+ * file allocation.
+ */
+ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
+				      const char __user *buf, int in_len,
+				      int out_len)
+{
+	struct ib_uverbs_create_comp_channel cmd;
+	struct ib_uverbs_create_comp_channel_resp resp;
+	struct file *event_filp;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	event_filp = ib_uverbs_alloc_event_file(file, 0, &resp.fd);
+	if (IS_ERR(event_filp))
+		return PTR_ERR(event_filp);
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp) == 0) {
+		fd_install(resp.fd, event_filp);
+		return in_len;
+	}
+
+	/* Userspace never saw the descriptor: release it and the file. */
+	put_unused_fd(resp.fd);
+	fput(event_filp);
+	return -EFAULT;
+}
+
+/*
+ * CREATE_CQ command: create a completion queue through the driver,
+ * optionally bound to the completion event file named by
+ * cmd.comp_channel, publish it in the CQ idr and return its handle
+ * and actual size to userspace.
+ *
+ * Returns in_len on success, -ENOSPC if the response buffer is too
+ * small, -EFAULT on a failed user copy, -EINVAL for an out-of-range
+ * completion vector or an unknown completion channel, -ENOMEM on
+ * allocation failure, or the error from the driver or the idr
+ * insertion.
+ */
+ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
+			    const char __user *buf, int in_len,
+			    int out_len)
+{
+	struct ib_uverbs_create_cq      cmd;
+	struct ib_uverbs_create_cq_resp resp;
+	struct ib_udata                 udata;
+	struct ib_ucq_object           *obj;
+	struct ib_uverbs_event_file    *ev_file = NULL;
+	struct ib_cq                   *cq;
+	int                             ret;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	INIT_UDATA(&udata, buf + sizeof cmd,
+		   (unsigned long) cmd.response + sizeof resp,
+		   in_len - sizeof cmd, out_len - sizeof resp);
+
+	if (cmd.comp_vector >= file->device->num_comp_vectors)
+		return -EINVAL;
+
+	obj = kmalloc(sizeof *obj, GFP_KERNEL);
+	if (!obj)
+		return -ENOMEM;
+
+	init_uobj(&obj->uobject, cmd.user_handle, file->ucontext, &cq_lock_key);
+	/* Hold the rwsem for writing while the object is initialized. */
+	down_write(&obj->uobject.mutex);
+
+	/* A negative comp_channel means no completion event file. */
+	if (cmd.comp_channel >= 0) {
+		ev_file = ib_uverbs_lookup_comp_file(cmd.comp_channel);
+		if (!ev_file) {
+			ret = -EINVAL;
+			goto err;
+		}
+	}
+
+	obj->uverbs_file	   = file;
+	obj->comp_events_reported  = 0;
+	obj->async_events_reported = 0;
+	INIT_LIST_HEAD(&obj->comp_list);
+	INIT_LIST_HEAD(&obj->async_list);
+
+	cq = file->device->ib_dev->create_cq(file->device->ib_dev, cmd.cqe,
+					     cmd.comp_vector,
+					     file->ucontext, &udata);
+	if (IS_ERR(cq)) {
+		ret = PTR_ERR(cq);
+		goto err_file;
+	}
+
+	cq->device        = file->device->ib_dev;
+	cq->uobject       = &obj->uobject;
+	cq->comp_handler  = ib_uverbs_comp_handler;
+	cq->event_handler = ib_uverbs_cq_event_handler;
+	cq->cq_context    = ev_file;
+	atomic_set(&cq->usecnt, 0);
+
+	obj->uobject.object = cq;
+	ret = idr_add_uobj(&ib_uverbs_cq_idr, &obj->uobject);
+	if (ret)
+		goto err_free;
+
+	memset(&resp, 0, sizeof resp);
+	resp.cq_handle = obj->uobject.id;
+	resp.cqe       = cq->cqe;
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp)) {
+		ret = -EFAULT;
+		goto err_copy;
+	}
+
+	mutex_lock(&file->mutex);
+	list_add_tail(&obj->uobject.list, &file->ucontext->cq_list);
+	mutex_unlock(&file->mutex);
+
+	/* Lookups of this handle succeed only once live is set. */
+	obj->uobject.live = 1;
+
+	up_write(&obj->uobject.mutex);
+
+	return in_len;
+
+err_copy:
+	idr_remove_uobj(&ib_uverbs_cq_idr, &obj->uobject);
+
+err_free:
+	ib_destroy_cq(cq);
+
+err_file:
+	if (ev_file)
+		ib_uverbs_release_ucq(file, ev_file, obj);
+
+err:
+	/* Drops the write lock and the reference from init_uobj(). */
+	put_uobj_write(&obj->uobject);
+	return ret;
+}
+
+/*
+ * RESIZE_CQ command: ask the driver to resize the CQ named by
+ * cmd.cq_handle to cmd.cqe entries and return the resulting size to
+ * userspace.  Only the cqe field of the response is written.
+ *
+ * Returns in_len on success, -EFAULT on a failed user copy, -EINVAL
+ * if the handle does not name a live CQ of this context, or the
+ * driver's error.
+ */
+ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file,
+			    const char __user *buf, int in_len,
+			    int out_len)
+{
+	struct ib_uverbs_resize_cq cmd;
+	struct ib_uverbs_resize_cq_resp resp;
+	struct ib_udata udata;
+	struct ib_cq *cq;
+	int err;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	INIT_UDATA(&udata, buf + sizeof cmd,
+		   (unsigned long) cmd.response + sizeof resp,
+		   in_len - sizeof cmd, out_len - sizeof resp);
+
+	cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0);
+	if (!cq)
+		return -EINVAL;
+
+	err = cq->device->resize_cq(cq, cmd.cqe, &udata);
+	if (!err) {
+		resp.cqe = cq->cqe;
+
+		if (copy_to_user((void __user *) (unsigned long) cmd.response,
+				 &resp, sizeof resp.cqe))
+			err = -EFAULT;
+	}
+
+	put_cq_read(cq);
+
+	if (err)
+		return err;
+
+	return in_len;
+}
+
+/*
+ * Copy one kernel work completion to userspace in the ib_uverbs_wc
+ * layout.  Returns 0 on success or -EFAULT if the copy fails.
+ */
+static int uverbs_copy_wc_to_user(void __user *dest, struct ib_wc *wc)
+{
+	struct ib_uverbs_wc tmp;
+
+	/* Zero everything so unset fields do not leak stack contents. */
+	memset(&tmp, 0, sizeof tmp);
+
+	tmp.wr_id	   = wc->wr_id;
+	tmp.status	   = wc->status;
+	tmp.opcode	   = wc->opcode;
+	tmp.vendor_err	   = wc->vendor_err;
+	tmp.byte_len	   = wc->byte_len;
+	tmp.ex.imm_data	   = (__u32 __force) wc->ex.imm_data;
+	tmp.qp_num	   = wc->qp->qp_num;
+	tmp.src_qp	   = wc->src_qp;
+	tmp.wc_flags	   = wc->wc_flags;
+	tmp.pkey_index	   = wc->pkey_index;
+	tmp.slid	   = wc->slid;
+	tmp.sl		   = wc->sl;
+	tmp.dlid_path_bits = wc->dlid_path_bits;
+	tmp.port_num	   = wc->port_num;
+
+	if (copy_to_user(dest, &tmp, sizeof tmp))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * POLL_CQ command: poll up to cmd.ne completions from the CQ named by
+ * cmd.cq_handle and write them to the user's response buffer, which
+ * holds a struct ib_uverbs_poll_cq_resp header followed by the
+ * completion entries.
+ *
+ * Completions are polled and copied out one at a time.  The
+ * user-supplied count is therefore never multiplied into an
+ * allocation size, and only entries that were actually polled are
+ * written to userspace.
+ *
+ * Returns in_len on success, -EFAULT on a failed user copy, -EINVAL
+ * if the handle does not name a live CQ of this context, or the
+ * negative error returned by ib_poll_cq().
+ */
+ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file,
+			  const char __user *buf, int in_len,
+			  int out_len)
+{
+	struct ib_uverbs_poll_cq       cmd;
+	struct ib_uverbs_poll_cq_resp  resp;
+	u8 __user                     *header_ptr;
+	u8 __user                     *data_ptr;
+	struct ib_cq                  *cq;
+	struct ib_wc                   wc;
+	int                            ret;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0);
+	if (!cq)
+		return -EINVAL;
+
+	/* The header goes first; completion entries follow it directly. */
+	header_ptr = (void __user *) (unsigned long) cmd.response;
+	data_ptr   = header_ptr + sizeof resp;
+
+	memset(&resp, 0, sizeof resp);
+	while (resp.count < cmd.ne) {
+		ret = ib_poll_cq(cq, 1, &wc);
+		if (ret < 0)
+			goto out_put;
+		if (!ret)
+			break;		/* CQ is empty */
+
+		ret = uverbs_copy_wc_to_user(data_ptr, &wc);
+		if (ret)
+			goto out_put;
+
+		data_ptr += sizeof(struct ib_uverbs_wc);
+		++resp.count;
+	}
+
+	if (copy_to_user(header_ptr, &resp, sizeof resp)) {
+		ret = -EFAULT;
+		goto out_put;
+	}
+
+	ret = in_len;
+
+out_put:
+	put_cq_read(cq);
+	return ret;
+}
+
+/*
+ * Handle the REQ_NOTIFY_CQ command: request a completion notification
+ * on the CQ named by cmd.cq_handle, for solicited completions only when
+ * cmd.solicited_only is non-zero and for the next completion otherwise.
+ *
+ * Returns in_len on success, -EFAULT if the command cannot be read, or
+ * -EINVAL if idr_read_cq() finds no CQ.  The return value of
+ * ib_req_notify_cq() is not examined.
+ */
+ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file,
+				const char __user *buf, int in_len,
+				int out_len)
+{
+	struct ib_uverbs_req_notify_cq cmd;
+	struct ib_cq *cq;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0);
+	if (!cq)
+		return -EINVAL;
+
+	if (cmd.solicited_only)
+		ib_req_notify_cq(cq, IB_CQ_SOLICITED);
+	else
+		ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+
+	put_cq_read(cq);
+
+	return in_len;
+}
+
+/*
+ * Handle the DESTROY_CQ command: destroy the CQ named by cmd.cq_handle,
+ * drop it from the CQ idr and from the per-file list, release its
+ * events, and report the event counters to user space.
+ *
+ * Returns in_len on success, -EFAULT if the command cannot be read or
+ * the response cannot be written, -EINVAL if idr_write_uobj() finds no
+ * object, or the error returned by ib_destroy_cq().
+ */
+ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file,
+			     const char __user *buf, int in_len,
+			     int out_len)
+{
+	struct ib_uverbs_destroy_cq cmd;
+	struct ib_uverbs_destroy_cq_resp resp;
+	struct ib_uverbs_event_file *events;
+	struct ib_ucq_object *ucq;
+	struct ib_uobject *uobj;
+	struct ib_cq *cq;
+	int err;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	uobj = idr_write_uobj(&ib_uverbs_cq_idr, cmd.cq_handle, file->ucontext);
+	if (!uobj)
+		return -EINVAL;
+
+	/* Gather everything reached through cq before it is destroyed. */
+	cq = uobj->object;
+	events = cq->cq_context;
+	ucq = container_of(cq->uobject, struct ib_ucq_object, uobject);
+
+	err = ib_destroy_cq(cq);
+	if (err) {
+		put_uobj_write(uobj);
+		return err;
+	}
+
+	uobj->live = 0;
+	put_uobj_write(uobj);
+
+	idr_remove_uobj(&ib_uverbs_cq_idr, uobj);
+
+	mutex_lock(&file->mutex);
+	list_del(&uobj->list);
+	mutex_unlock(&file->mutex);
+
+	ib_uverbs_release_ucq(file, events, ucq);
+
+	/* The counters are read before the put_uobj() call below. */
+	memset(&resp, 0, sizeof resp);
+	resp.comp_events_reported = ucq->comp_events_reported;
+	resp.async_events_reported = ucq->async_events_reported;
+
+	put_uobj(uobj);
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp))
+		return -EFAULT;
+
+	return in_len;
+}
+
+/*
+ * Handle the CREATE_QP command.
+ *
+ * Allocates a new user QP object, looks up the PD, the send and receive
+ * CQs and, depending on the command, an SRQ or an XRC domain, then asks
+ * the device's create_qp method to build the QP.  On success the QP is
+ * added to the QP idr, the handle and the capabilities are copied to the
+ * user's response buffer and the object is linked on the context's
+ * qp_list.
+ *
+ * Returns in_len on success, -ENOSPC if out_len is smaller than the
+ * response, -EFAULT on a failed user copy, -ENOMEM if the object cannot
+ * be allocated, -EINVAL if a required handle does not resolve, or the
+ * error reported by create_qp / idr_add_uobj().
+ */
 ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file,
 			    const char __user *buf, int in_len,
 			    int out_len)
 {
 	struct ib_uverbs_create_qp      cmd;
 	struct ib_uverbs_create_qp_resp resp;
 	struct ib_udata                 udata;
 	struct ib_uqp_object           *obj;
 	struct ib_pd                   *pd;
 	struct ib_cq                   *scq, *rcq;
 	struct ib_srq                  *srq;
 	struct ib_qp                   *qp;
 	struct ib_qp_init_attr          attr;
 	struct ib_xrcd		       *xrcd;
 	struct ib_uobject	       *xrcd_uobj;
 	int ret;
 
 	if (out_len < sizeof resp)
 		return -ENOSPC;
 
 	if (copy_from_user(&cmd, buf, sizeof cmd))
 		return -EFAULT;
 
 	INIT_UDATA(&udata, buf + sizeof cmd,
 		   (unsigned long) cmd.response + sizeof resp,
 		   in_len - sizeof cmd, out_len - sizeof resp);
 
 	obj = kmalloc(sizeof *obj, GFP_KERNEL);
 	if (!obj)
 		return -ENOMEM;
 
 	init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext, &qp_lock_key);
 	down_write(&obj->uevent.uobject.mutex);
 
+	/*
+	 * cmd.srq_handle is looked up as an SRQ for non-XRC QPs with is_srq
+	 * set, and as an XRC domain for XRC QPs.  xrcd_uobj is only written
+	 * by idr_read_xrcd() and only read when xrcd is non-NULL.
+	 */
 	srq = (cmd.is_srq && cmd.qp_type != IB_QPT_XRC) ?
 		idr_read_srq(cmd.srq_handle, file->ucontext) : NULL;
 	xrcd = cmd.qp_type == IB_QPT_XRC ?
 		idr_read_xrcd(cmd.srq_handle, file->ucontext, &xrcd_uobj) : NULL;
 	pd  = idr_read_pd(cmd.pd_handle, file->ucontext);
 	scq = idr_read_cq(cmd.send_cq_handle, file->ucontext, 0);
+	/*
+	 * A shared send/receive CQ is looked up only once.
+	 * NOTE(review): the last idr_read_cq() argument is 1 here and 0 for
+	 * the send CQ -- presumably a lock nesting flag; confirm against
+	 * idr_read_cq().
+	 */
 	rcq = cmd.recv_cq_handle == cmd.send_cq_handle ?
 		scq : idr_read_cq(cmd.recv_cq_handle, file->ucontext, 1);
 
 	if (!pd || !scq || !rcq || (cmd.is_srq && !srq) ||
 	    (cmd.qp_type == IB_QPT_XRC && !xrcd)) {
 		ret = -EINVAL;
 		goto err_put;
 	}
 
 	attr.create_flags  = 0;
 	attr.event_handler = ib_uverbs_qp_event_handler;
 	attr.qp_context    = file;
 	attr.send_cq       = scq;
 	attr.recv_cq       = rcq;
 	attr.srq           = srq;
 	attr.sq_sig_type   = cmd.sq_sig_all ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR;
 	attr.qp_type       = cmd.qp_type;
 	attr.xrc_domain    = xrcd;
+	/* Redundant: create_flags was already cleared at the top of this group. */
 	attr.create_flags  = 0;
 
 	attr.cap.max_send_wr     = cmd.max_send_wr;
 	attr.cap.max_recv_wr     = cmd.max_recv_wr;
 	attr.cap.max_send_sge    = cmd.max_send_sge;
 	attr.cap.max_recv_sge    = cmd.max_recv_sge;
 	attr.cap.max_inline_data = cmd.max_inline_data;
 
 	obj->uevent.events_reported     = 0;
 	INIT_LIST_HEAD(&obj->uevent.event_list);
 	INIT_LIST_HEAD(&obj->mcast_list);
 
 	qp = pd->device->create_qp(pd, &attr, &udata);
 	if (IS_ERR(qp)) {
 		ret = PTR_ERR(qp);
 		goto err_put;
 	}
 
+	/*
+	 * The device method is called directly, so the common QP fields and
+	 * the use counts of the objects the QP refers to are set up here.
+	 */
 	qp->device     	  = pd->device;
 	qp->pd         	  = pd;
 	qp->send_cq    	  = attr.send_cq;
 	qp->recv_cq    	  = attr.recv_cq;
 	qp->srq	       	  = attr.srq;
 	qp->uobject       = &obj->uevent.uobject;
 	qp->event_handler = attr.event_handler;
 	qp->qp_context    = attr.qp_context;
 	qp->qp_type	  = attr.qp_type;
 	qp->xrcd	  = attr.xrc_domain;
 	atomic_inc(&pd->usecnt);
 	atomic_inc(&attr.send_cq->usecnt);
 	atomic_inc(&attr.recv_cq->usecnt);
 	if (attr.srq)
 		atomic_inc(&attr.srq->usecnt);
 	else if (attr.xrc_domain)
 		atomic_inc(&attr.xrc_domain->usecnt);
 
 	obj->uevent.uobject.object = qp;
 	ret = idr_add_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject);
 	if (ret)
 		goto err_destroy;
 
+	/* The capabilities are read back from attr after create_qp ran. */
 	memset(&resp, 0, sizeof resp);
 	resp.qpn             = qp->qp_num;
 	resp.qp_handle       = obj->uevent.uobject.id;
 	resp.max_recv_sge    = attr.cap.max_recv_sge;
 	resp.max_send_sge    = attr.cap.max_send_sge;
 	resp.max_recv_wr     = attr.cap.max_recv_wr;
 	resp.max_send_wr     = attr.cap.max_send_wr;
 	resp.max_inline_data = attr.cap.max_inline_data;
 
 	if (copy_to_user((void __user *) (unsigned long) cmd.response,
 			 &resp, sizeof resp)) {
 		ret = -EFAULT;
 		goto err_copy;
 	}
 
+	/* Success: drop the lookups taken above (a shared CQ only once). */
 	put_pd_read(pd);
 	put_cq_read(scq);
 	if (rcq != scq)
 		put_cq_read(rcq);
 	if (srq)
 		put_srq_read(srq);
 	if (xrcd)
 		put_xrcd_read(xrcd_uobj);
 
 	mutex_lock(&file->mutex);
 	list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list);
 	mutex_unlock(&file->mutex);
 
 	obj->uevent.uobject.live = 1;
 
 	up_write(&obj->uevent.uobject.mutex);
 
 	return in_len;
 
+	/* Error unwind: each label undoes one more step than the next one. */
 err_copy:
 	idr_remove_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject);
 
 err_destroy:
 	ib_destroy_qp(qp);
 
 err_put:
+	/* Any of the lookups may have failed, so test each before the put. */
 	if (pd)
 		put_pd_read(pd);
 	if (scq)
 		put_cq_read(scq);
 	if (rcq && rcq != scq)
 		put_cq_read(rcq);
 	if (srq)
 		put_srq_read(srq);
 	if (xrcd)
 		put_xrcd_read(xrcd_uobj);
 
 	put_uobj_write(&obj->uevent.uobject);
 	return ret;
 }
+
+/*
+ * Handle the QUERY_QP command: query the QP named by cmd.qp_handle with
+ * ib_query_qp() and copy the attributes, both address paths and the
+ * capabilities into the user's response buffer.
+ *
+ * Returns in_len on success, -EFAULT on a failed user copy, -ENOMEM if
+ * either attribute buffer cannot be allocated, -EINVAL if idr_read_qp()
+ * finds no QP, or the error returned by ib_query_qp().
+ */
+ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file,
+			   const char __user *buf, int in_len,
+			   int out_len)
+{
+	struct ib_uverbs_query_qp cmd;
+	struct ib_uverbs_query_qp_resp resp;
+	struct ib_qp_init_attr *init_attr;
+	struct ib_qp_attr *attr;
+	const struct ib_ah_attr *ah;
+	struct ib_qp *qp;
+	int ret;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	/* Both buffers are requested before either result is checked. */
+	attr = kmalloc(sizeof *attr, GFP_KERNEL);
+	init_attr = kmalloc(sizeof *init_attr, GFP_KERNEL);
+	if (attr == NULL || init_attr == NULL) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+	if (qp == NULL) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = ib_query_qp(qp, attr, cmd.attr_mask, init_attr);
+
+	put_qp_read(qp);
+
+	if (ret != 0)
+		goto out;
+
+	memset(&resp, 0, sizeof resp);
+
+	/* Scalar QP attributes. */
+	resp.qp_state = attr->qp_state;
+	resp.cur_qp_state = attr->cur_qp_state;
+	resp.path_mtu = attr->path_mtu;
+	resp.path_mig_state = attr->path_mig_state;
+	resp.qkey = attr->qkey;
+	resp.rq_psn = attr->rq_psn;
+	resp.sq_psn = attr->sq_psn;
+	resp.dest_qp_num = attr->dest_qp_num;
+	resp.qp_access_flags = attr->qp_access_flags;
+	resp.pkey_index = attr->pkey_index;
+	resp.alt_pkey_index = attr->alt_pkey_index;
+	resp.sq_draining = attr->sq_draining;
+	resp.max_rd_atomic = attr->max_rd_atomic;
+	resp.max_dest_rd_atomic = attr->max_dest_rd_atomic;
+	resp.min_rnr_timer = attr->min_rnr_timer;
+	resp.port_num = attr->port_num;
+	resp.timeout = attr->timeout;
+	resp.retry_cnt = attr->retry_cnt;
+	resp.rnr_retry = attr->rnr_retry;
+	resp.alt_port_num = attr->alt_port_num;
+	resp.alt_timeout = attr->alt_timeout;
+
+	/* Primary path. */
+	ah = &attr->ah_attr;
+	memcpy(resp.dest.dgid, ah->grh.dgid.raw, 16);
+	resp.dest.flow_label = ah->grh.flow_label;
+	resp.dest.sgid_index = ah->grh.sgid_index;
+	resp.dest.hop_limit = ah->grh.hop_limit;
+	resp.dest.traffic_class = ah->grh.traffic_class;
+	resp.dest.dlid = ah->dlid;
+	resp.dest.sl = ah->sl;
+	resp.dest.src_path_bits = ah->src_path_bits;
+	resp.dest.static_rate = ah->static_rate;
+	resp.dest.is_global = !!(ah->ah_flags & IB_AH_GRH);
+	resp.dest.port_num = ah->port_num;
+
+	/* Alternate path. */
+	ah = &attr->alt_ah_attr;
+	memcpy(resp.alt_dest.dgid, ah->grh.dgid.raw, 16);
+	resp.alt_dest.flow_label = ah->grh.flow_label;
+	resp.alt_dest.sgid_index = ah->grh.sgid_index;
+	resp.alt_dest.hop_limit = ah->grh.hop_limit;
+	resp.alt_dest.traffic_class = ah->grh.traffic_class;
+	resp.alt_dest.dlid = ah->dlid;
+	resp.alt_dest.sl = ah->sl;
+	resp.alt_dest.src_path_bits = ah->src_path_bits;
+	resp.alt_dest.static_rate = ah->static_rate;
+	resp.alt_dest.is_global = !!(ah->ah_flags & IB_AH_GRH);
+	resp.alt_dest.port_num = ah->port_num;
+
+	/* Capabilities and signalling type come from init_attr. */
+	resp.max_send_wr = init_attr->cap.max_send_wr;
+	resp.max_recv_wr = init_attr->cap.max_recv_wr;
+	resp.max_send_sge = init_attr->cap.max_send_sge;
+	resp.max_recv_sge = init_attr->cap.max_recv_sge;
+	resp.max_inline_data = init_attr->cap.max_inline_data;
+	resp.sq_sig_all = init_attr->sq_sig_type == IB_SIGNAL_ALL_WR;
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp))
+		ret = -EFAULT;
+
+out:
+	kfree(attr);
+	kfree(init_attr);
+
+	if (ret)
+		return ret;
+	return in_len;
+}
+
+/*
+ * Handle the MODIFY_QP command: translate the user command into a
+ * struct ib_qp_attr and pass it, with cmd.attr_mask, to the device's
+ * modify_qp method for the QP named by cmd.qp_handle.
+ *
+ * Returns in_len on success, -EFAULT if the command cannot be read,
+ * -ENOMEM if the attribute buffer cannot be allocated, -EINVAL if
+ * idr_read_qp() finds no QP, or the error returned by modify_qp.
+ */
+ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file,
+			    const char __user *buf, int in_len,
+			    int out_len)
+{
+	struct ib_uverbs_modify_qp cmd;
+	struct ib_udata udata;
+	struct ib_qp_attr *attr;
+	struct ib_ah_attr *ah;
+	struct ib_qp *qp;
+	int ret;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	/* This command has no response, hence the NULL output buffer. */
+	INIT_UDATA(&udata, buf + sizeof cmd, NULL, in_len - sizeof cmd,
+		   out_len);
+
+	attr = kmalloc(sizeof *attr, GFP_KERNEL);
+	if (attr == NULL)
+		return -ENOMEM;
+
+	qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+	if (qp == NULL) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* Scalar QP attributes. */
+	attr->qp_state = cmd.qp_state;
+	attr->cur_qp_state = cmd.cur_qp_state;
+	attr->path_mtu = cmd.path_mtu;
+	attr->path_mig_state = cmd.path_mig_state;
+	attr->qkey = cmd.qkey;
+	attr->rq_psn = cmd.rq_psn;
+	attr->sq_psn = cmd.sq_psn;
+	attr->dest_qp_num = cmd.dest_qp_num;
+	attr->qp_access_flags = cmd.qp_access_flags;
+	attr->pkey_index = cmd.pkey_index;
+	attr->alt_pkey_index = cmd.alt_pkey_index;
+	attr->en_sqd_async_notify = cmd.en_sqd_async_notify;
+	attr->max_rd_atomic = cmd.max_rd_atomic;
+	attr->max_dest_rd_atomic = cmd.max_dest_rd_atomic;
+	attr->min_rnr_timer = cmd.min_rnr_timer;
+	attr->port_num = cmd.port_num;
+	attr->timeout = cmd.timeout;
+	attr->retry_cnt = cmd.retry_cnt;
+	attr->rnr_retry = cmd.rnr_retry;
+	attr->alt_port_num = cmd.alt_port_num;
+	attr->alt_timeout = cmd.alt_timeout;
+
+	/* Primary path. */
+	ah = &attr->ah_attr;
+	memcpy(ah->grh.dgid.raw, cmd.dest.dgid, 16);
+	ah->grh.flow_label = cmd.dest.flow_label;
+	ah->grh.sgid_index = cmd.dest.sgid_index;
+	ah->grh.hop_limit = cmd.dest.hop_limit;
+	ah->grh.traffic_class = cmd.dest.traffic_class;
+	ah->dlid = cmd.dest.dlid;
+	ah->sl = cmd.dest.sl;
+	ah->src_path_bits = cmd.dest.src_path_bits;
+	ah->static_rate = cmd.dest.static_rate;
+	ah->ah_flags = cmd.dest.is_global ? IB_AH_GRH : 0;
+	ah->port_num = cmd.dest.port_num;
+
+	/* Alternate path. */
+	ah = &attr->alt_ah_attr;
+	memcpy(ah->grh.dgid.raw, cmd.alt_dest.dgid, 16);
+	ah->grh.flow_label = cmd.alt_dest.flow_label;
+	ah->grh.sgid_index = cmd.alt_dest.sgid_index;
+	ah->grh.hop_limit = cmd.alt_dest.hop_limit;
+	ah->grh.traffic_class = cmd.alt_dest.traffic_class;
+	ah->dlid = cmd.alt_dest.dlid;
+	ah->sl = cmd.alt_dest.sl;
+	ah->src_path_bits = cmd.alt_dest.src_path_bits;
+	ah->static_rate = cmd.alt_dest.static_rate;
+	ah->ah_flags = cmd.alt_dest.is_global ? IB_AH_GRH : 0;
+	ah->port_num = cmd.alt_dest.port_num;
+
+	ret = qp->device->modify_qp(qp, attr, cmd.attr_mask, &udata);
+
+	put_qp_read(qp);
+
+	if (!ret)
+		ret = in_len;
+
+out:
+	kfree(attr);
+
+	return ret;
+}
+
+/*
+ * Handle the DESTROY_QP command: destroy the QP named by cmd.qp_handle
+ * unless it still has entries on its multicast list, drop it from the
+ * QP idr and the per-file list, release its events and report the
+ * event counter to user space.
+ *
+ * Returns in_len on success, -EFAULT on a failed user copy, -EINVAL if
+ * idr_write_uobj() finds no object, -EBUSY if the multicast list is not
+ * empty, or the error returned by ib_destroy_qp().
+ */
+ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
+			     const char __user *buf, int in_len,
+			     int out_len)
+{
+	struct ib_uverbs_destroy_qp cmd;
+	struct ib_uverbs_destroy_qp_resp resp;
+	struct ib_uqp_object *uqp;
+	struct ib_uobject *uobj;
+	struct ib_qp *qp;
+	int err;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	memset(&resp, 0, sizeof resp);
+
+	uobj = idr_write_uobj(&ib_uverbs_qp_idr, cmd.qp_handle, file->ucontext);
+	if (!uobj)
+		return -EINVAL;
+
+	qp = uobj->object;
+	uqp = container_of(uobj, struct ib_uqp_object, uevent.uobject);
+
+	/* Refuse while multicast attachments are still recorded. */
+	if (!list_empty(&uqp->mcast_list)) {
+		put_uobj_write(uobj);
+		return -EBUSY;
+	}
+
+	err = ib_destroy_qp(qp);
+	if (err) {
+		put_uobj_write(uobj);
+		return err;
+	}
+
+	uobj->live = 0;
+	put_uobj_write(uobj);
+
+	idr_remove_uobj(&ib_uverbs_qp_idr, uobj);
+
+	mutex_lock(&file->mutex);
+	list_del(&uobj->list);
+	mutex_unlock(&file->mutex);
+
+	ib_uverbs_release_uevent(file, &uqp->uevent);
+
+	/* The counter is read before the put_uobj() call below. */
+	resp.events_reported = uqp->uevent.events_reported;
+
+	put_uobj(uobj);
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp))
+		return -EFAULT;
+
+	return in_len;
+}
+
+/*
+ * Handle the POST_SEND command.
+ *
+ * The command is followed in the input buffer by cmd.wr_count work
+ * requests of cmd.wqe_size bytes each and then by cmd.sge_count
+ * scatter/gather entries.  Each work request is converted into a
+ * struct ib_send_wr (with its sg_list stored in the same allocation),
+ * the chain is handed to the device's post_send method, and the index
+ * of the failing request, if any, is written to the response buffer.
+ *
+ * Returns in_len on success, -EFAULT on a failed user copy, -EINVAL if
+ * the lengths are inconsistent or a handle does not resolve, -ENOMEM on
+ * allocation failure, or the error returned by post_send.
+ */
+ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
+			    const char __user *buf, int in_len,
+			    int out_len)
+{
+	struct ib_uverbs_post_send      cmd;
+	struct ib_uverbs_post_send_resp resp;
+	struct ib_uverbs_send_wr       *user_wr;
+	struct ib_send_wr              *wr = NULL, *last, *next, *bad_wr;
+	struct ib_qp                   *qp;
+	size_t                          avail;
+	int                             i, sg_ind;
+	int				is_ud;
+	ssize_t                         ret = -EINVAL;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	/*
+	 * Check that the work requests and the SGEs fit in the input
+	 * buffer.  The counts are user supplied 32-bit values, so compare
+	 * by division instead of multiplying them, which could wrap and
+	 * let an oversized request through.
+	 */
+	if (in_len < (int) sizeof cmd)
+		return -EINVAL;
+	avail = in_len - sizeof cmd;
+	if (cmd.wr_count && cmd.wqe_size > avail / cmd.wr_count)
+		return -EINVAL;
+	avail -= (size_t) cmd.wqe_size * cmd.wr_count;
+	if (cmd.sge_count > avail / sizeof (struct ib_uverbs_sge))
+		return -EINVAL;
+
+	if (cmd.wqe_size < sizeof (struct ib_uverbs_send_wr))
+		return -EINVAL;
+
+	user_wr = kmalloc(cmd.wqe_size, GFP_KERNEL);
+	if (!user_wr)
+		return -ENOMEM;
+
+	qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+	if (!qp)
+		goto out;
+
+	is_ud = qp->qp_type == IB_QPT_UD;
+	sg_ind = 0;
+	last = NULL;
+	for (i = 0; i < cmd.wr_count; ++i) {
+		if (copy_from_user(user_wr,
+				   buf + sizeof cmd + i * cmd.wqe_size,
+				   cmd.wqe_size)) {
+			ret = -EFAULT;
+			goto out_put;
+		}
+
+		/*
+		 * sg_ind never exceeds cmd.sge_count, so the subtraction
+		 * cannot wrap; adding num_sge to sg_ind could, and would
+		 * then slip past the bound.
+		 */
+		if (user_wr->num_sge > cmd.sge_count - sg_ind) {
+			ret = -EINVAL;
+			goto out_put;
+		}
+
+		next = kmalloc(ALIGN(sizeof *next, sizeof (struct ib_sge)) +
+			       user_wr->num_sge * sizeof (struct ib_sge),
+			       GFP_KERNEL);
+		if (!next) {
+			ret = -ENOMEM;
+			goto out_put;
+		}
+
+		if (!last)
+			wr = next;
+		else
+			last->next = next;
+		last = next;
+
+		next->next       = NULL;
+		next->wr_id      = user_wr->wr_id;
+		next->num_sge    = user_wr->num_sge;
+		next->opcode     = user_wr->opcode;
+		next->send_flags = user_wr->send_flags;
+
+		if (is_ud) {
+			next->wr.ud.ah = idr_read_ah(user_wr->wr.ud.ah,
+						     file->ucontext);
+			if (!next->wr.ud.ah) {
+				ret = -EINVAL;
+				goto out_put;
+			}
+			next->wr.ud.remote_qpn  = user_wr->wr.ud.remote_qpn;
+			next->wr.ud.remote_qkey = user_wr->wr.ud.remote_qkey;
+			/*
+			 * next comes from kmalloc(), so the immediate data
+			 * must be filled in here as well or the driver
+			 * would read uninitialized memory for it.
+			 */
+			if (next->opcode == IB_WR_SEND_WITH_IMM)
+				next->ex.imm_data =
+					(__be32 __force) user_wr->ex.imm_data;
+		} else {
+			switch (next->opcode) {
+			case IB_WR_RDMA_WRITE_WITH_IMM:
+				next->ex.imm_data =
+					(__be32 __force) user_wr->ex.imm_data;
+				/* fallthrough: also needs the RDMA fields */
+			case IB_WR_RDMA_WRITE:
+			case IB_WR_RDMA_READ:
+				next->wr.rdma.remote_addr =
+					user_wr->wr.rdma.remote_addr;
+				next->wr.rdma.rkey        =
+					user_wr->wr.rdma.rkey;
+				break;
+			case IB_WR_SEND_WITH_IMM:
+				next->ex.imm_data =
+					(__be32 __force) user_wr->ex.imm_data;
+				break;
+			case IB_WR_SEND_WITH_INV:
+				next->ex.invalidate_rkey =
+					user_wr->ex.invalidate_rkey;
+				break;
+			case IB_WR_ATOMIC_CMP_AND_SWP:
+			case IB_WR_ATOMIC_FETCH_AND_ADD:
+				next->wr.atomic.remote_addr =
+					user_wr->wr.atomic.remote_addr;
+				next->wr.atomic.compare_add =
+					user_wr->wr.atomic.compare_add;
+				next->wr.atomic.swap = user_wr->wr.atomic.swap;
+				next->wr.atomic.rkey = user_wr->wr.atomic.rkey;
+				break;
+			default:
+				break;
+			}
+		}
+
+		if (next->num_sge) {
+			/* The SGE array lives right behind the WR itself. */
+			next->sg_list = (void *) next +
+				ALIGN(sizeof *next, sizeof (struct ib_sge));
+			if (copy_from_user(next->sg_list,
+					   buf + sizeof cmd +
+					   cmd.wr_count * cmd.wqe_size +
+					   sg_ind * sizeof (struct ib_sge),
+					   next->num_sge * sizeof (struct ib_sge))) {
+				ret = -EFAULT;
+				goto out_put;
+			}
+			sg_ind += next->num_sge;
+		} else
+			next->sg_list = NULL;
+	}
+
+	resp.bad_wr = 0;
+	ret = qp->device->post_send(qp, wr, &bad_wr);
+	if (ret)
+		/* Report the 1-based position of the failing request. */
+		for (next = wr; next; next = next->next) {
+			++resp.bad_wr;
+			if (next == bad_wr)
+				break;
+		}
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp))
+		ret = -EFAULT;
+
+out_put:
+	put_qp_read(qp);
+
+	while (wr) {
+		if (is_ud && wr->wr.ud.ah)
+			put_ah_read(wr->wr.ud.ah);
+		next = wr->next;
+		kfree(wr);
+		wr = next;
+	}
+
+out:
+	kfree(user_wr);
+
+	return ret ? ret : in_len;
+}
+
+/*
+ * Build a chain of struct ib_recv_wr from a user buffer.
+ *
+ * buf holds wr_count work requests of wqe_size bytes each, followed by
+ * sge_count scatter/gather entries; in_len is the number of bytes
+ * available at buf.  Each work request is allocated together with its
+ * sg_list, and the caller must kfree() every element of the returned
+ * chain.
+ *
+ * Returns the head of the chain (NULL when wr_count is 0), or an
+ * ERR_PTR() carrying -EINVAL for inconsistent lengths, -EFAULT for a
+ * failed user copy or -ENOMEM for an allocation failure.
+ */
+static struct ib_recv_wr *ib_uverbs_unmarshall_recv(const char __user *buf,
+						    int in_len,
+						    u32 wr_count,
+						    u32 sge_count,
+						    u32 wqe_size)
+{
+	struct ib_uverbs_recv_wr *user_wr;
+	struct ib_recv_wr        *wr = NULL, *last, *next;
+	size_t                    avail;
+	int                       sg_ind;
+	int                       i;
+	int                       ret;
+
+	/*
+	 * Callers pass "in_len - sizeof cmd", which is negative for a short
+	 * command; compared as unsigned that would look like a huge buffer.
+	 */
+	if (in_len < 0)
+		return ERR_PTR(-EINVAL);
+
+	/*
+	 * The counts are user supplied 32-bit values, so compare by
+	 * division instead of multiplying them, which could wrap and let
+	 * an oversized request through.
+	 */
+	avail = in_len;
+	if (wr_count && wqe_size > avail / wr_count)
+		return ERR_PTR(-EINVAL);
+	avail -= (size_t) wqe_size * wr_count;
+	if (sge_count > avail / sizeof (struct ib_uverbs_sge))
+		return ERR_PTR(-EINVAL);
+
+	if (wqe_size < sizeof (struct ib_uverbs_recv_wr))
+		return ERR_PTR(-EINVAL);
+
+	user_wr = kmalloc(wqe_size, GFP_KERNEL);
+	if (!user_wr)
+		return ERR_PTR(-ENOMEM);
+
+	sg_ind = 0;
+	last = NULL;
+	for (i = 0; i < wr_count; ++i) {
+		if (copy_from_user(user_wr, buf + i * wqe_size,
+				   wqe_size)) {
+			ret = -EFAULT;
+			goto err;
+		}
+
+		/*
+		 * sg_ind never exceeds sge_count, so the subtraction cannot
+		 * wrap; adding num_sge to sg_ind could, and would then slip
+		 * past the bound.
+		 */
+		if (user_wr->num_sge > sge_count - sg_ind) {
+			ret = -EINVAL;
+			goto err;
+		}
+
+		next = kmalloc(ALIGN(sizeof *next, sizeof (struct ib_sge)) +
+			       user_wr->num_sge * sizeof (struct ib_sge),
+			       GFP_KERNEL);
+		if (!next) {
+			ret = -ENOMEM;
+			goto err;
+		}
+
+		if (!last)
+			wr = next;
+		else
+			last->next = next;
+		last = next;
+
+		next->next       = NULL;
+		next->wr_id      = user_wr->wr_id;
+		next->num_sge    = user_wr->num_sge;
+
+		if (next->num_sge) {
+			/* The SGE array lives right behind the WR itself. */
+			next->sg_list = (void *) next +
+				ALIGN(sizeof *next, sizeof (struct ib_sge));
+			if (copy_from_user(next->sg_list,
+					   buf + wr_count * wqe_size +
+					   sg_ind * sizeof (struct ib_sge),
+					   next->num_sge * sizeof (struct ib_sge))) {
+				ret = -EFAULT;
+				goto err;
+			}
+			sg_ind += next->num_sge;
+		} else
+			next->sg_list = NULL;
+	}
+
+	kfree(user_wr);
+	return wr;
+
+err:
+	kfree(user_wr);
+
+	while (wr) {
+		next = wr->next;
+		kfree(wr);
+		wr = next;
+	}
+
+	return ERR_PTR(ret);
+}
+
+/*
+ * Handle the POST_RECV command: unmarshal the receive work requests
+ * that follow the command, post them to the QP named by cmd.qp_handle
+ * through the device's post_recv method, and write the position of the
+ * failing request (0 when none failed) to the response buffer.
+ *
+ * Returns in_len on success, -EFAULT on a failed user copy, the error
+ * from ib_uverbs_unmarshall_recv(), -EINVAL if idr_read_qp() finds no
+ * QP, or the error returned by post_recv.
+ */
+ssize_t ib_uverbs_post_recv(struct ib_uverbs_file *file,
+			    const char __user *buf, int in_len,
+			    int out_len)
+{
+	struct ib_uverbs_post_recv cmd;
+	struct ib_uverbs_post_recv_resp resp;
+	struct ib_recv_wr *wr, *cur, *bad_wr;
+	struct ib_qp *qp;
+	ssize_t ret;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	wr = ib_uverbs_unmarshall_recv(buf + sizeof cmd,
+				       in_len - sizeof cmd, cmd.wr_count,
+				       cmd.sge_count, cmd.wqe_size);
+	if (IS_ERR(wr))
+		return PTR_ERR(wr);
+
+	qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+	if (!qp) {
+		ret = -EINVAL;
+		goto free_wrs;
+	}
+
+	resp.bad_wr = 0;
+	ret = qp->device->post_recv(qp, wr, &bad_wr);
+
+	put_qp_read(qp);
+
+	if (ret) {
+		/* Count requests up to and including the failing one. */
+		cur = wr;
+		while (cur) {
+			++resp.bad_wr;
+			if (cur == bad_wr)
+				break;
+			cur = cur->next;
+		}
+	}
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp))
+		ret = -EFAULT;
+
+free_wrs:
+	for (; wr; wr = cur) {
+		cur = wr->next;
+		kfree(wr);
+	}
+
+	if (ret)
+		return ret;
+	return in_len;
+}
+
+/*
+ * Handle the POST_SRQ_RECV command: unmarshal the receive work requests
+ * that follow the command, post them to the SRQ named by cmd.srq_handle
+ * through the device's post_srq_recv method, and write the position of
+ * the failing request (0 when none failed) to the response buffer.
+ *
+ * Returns in_len on success, -EFAULT on a failed user copy, the error
+ * from ib_uverbs_unmarshall_recv(), -EINVAL if idr_read_srq() finds no
+ * SRQ, or the error returned by post_srq_recv.
+ */
+ssize_t ib_uverbs_post_srq_recv(struct ib_uverbs_file *file,
+				const char __user *buf, int in_len,
+				int out_len)
+{
+	struct ib_uverbs_post_srq_recv cmd;
+	struct ib_uverbs_post_srq_recv_resp resp;
+	struct ib_recv_wr *wr, *cur, *bad_wr;
+	struct ib_srq *srq;
+	ssize_t ret;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	wr = ib_uverbs_unmarshall_recv(buf + sizeof cmd,
+				       in_len - sizeof cmd, cmd.wr_count,
+				       cmd.sge_count, cmd.wqe_size);
+	if (IS_ERR(wr))
+		return PTR_ERR(wr);
+
+	srq = idr_read_srq(cmd.srq_handle, file->ucontext);
+	if (!srq) {
+		ret = -EINVAL;
+		goto free_wrs;
+	}
+
+	resp.bad_wr = 0;
+	ret = srq->device->post_srq_recv(srq, wr, &bad_wr);
+
+	put_srq_read(srq);
+
+	if (ret) {
+		/* Count requests up to and including the failing one. */
+		cur = wr;
+		while (cur) {
+			++resp.bad_wr;
+			if (cur == bad_wr)
+				break;
+			cur = cur->next;
+		}
+	}
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp))
+		ret = -EFAULT;
+
+free_wrs:
+	for (; wr; wr = cur) {
+		cur = wr->next;
+		kfree(wr);
+	}
+
+	if (ret)
+		return ret;
+	return in_len;
+}
+
+/*
+ * Handle the CREATE_AH command: create an address handle on the PD
+ * named by cmd.pd_handle from the attributes in the command, add it to
+ * the AH idr, return its handle in the response buffer and link it on
+ * the context's ah_list.
+ *
+ * Returns in_len on success, -ENOSPC if out_len is smaller than the
+ * response, -EFAULT on a failed user copy, -ENOMEM if the object cannot
+ * be allocated, -EINVAL if idr_read_pd() finds no PD, or the error from
+ * ib_create_ah() / idr_add_uobj().
+ */
+ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
+			    const char __user *buf, int in_len,
+			    int out_len)
+{
+	struct ib_uverbs_create_ah cmd;
+	struct ib_uverbs_create_ah_resp resp;
+	struct ib_ah_attr attr;
+	struct ib_uobject *uobj;
+	struct ib_pd *pd;
+	struct ib_ah *ah;
+	int ret;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	uobj = kmalloc(sizeof *uobj, GFP_KERNEL);
+	if (uobj == NULL)
+		return -ENOMEM;
+
+	init_uobj(uobj, cmd.user_handle, file->ucontext, &ah_lock_key);
+	down_write(&uobj->mutex);
+
+	pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+	if (pd == NULL) {
+		ret = -EINVAL;
+		goto fail_uobj;
+	}
+
+	/* Translate the user attributes into the kernel form. */
+	attr.dlid = cmd.attr.dlid;
+	attr.sl = cmd.attr.sl;
+	attr.src_path_bits = cmd.attr.src_path_bits;
+	attr.static_rate = cmd.attr.static_rate;
+	attr.ah_flags = cmd.attr.is_global ? IB_AH_GRH : 0;
+	attr.port_num = cmd.attr.port_num;
+	attr.grh.flow_label = cmd.attr.grh.flow_label;
+	attr.grh.sgid_index = cmd.attr.grh.sgid_index;
+	attr.grh.hop_limit = cmd.attr.grh.hop_limit;
+	attr.grh.traffic_class = cmd.attr.grh.traffic_class;
+	memcpy(attr.grh.dgid.raw, cmd.attr.grh.dgid, 16);
+
+	ah = ib_create_ah(pd, &attr);
+	if (IS_ERR(ah)) {
+		ret = PTR_ERR(ah);
+		goto fail_pd;
+	}
+
+	ah->uobject = uobj;
+	uobj->object = ah;
+
+	ret = idr_add_uobj(&ib_uverbs_ah_idr, uobj);
+	if (ret)
+		goto fail_ah;
+
+	resp.ah_handle = uobj->id;
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp)) {
+		ret = -EFAULT;
+		goto fail_idr;
+	}
+
+	put_pd_read(pd);
+
+	mutex_lock(&file->mutex);
+	list_add_tail(&uobj->list, &file->ucontext->ah_list);
+	mutex_unlock(&file->mutex);
+
+	uobj->live = 1;
+
+	up_write(&uobj->mutex);
+
+	return in_len;
+
+	/* Error unwind, in reverse order of setup. */
+fail_idr:
+	idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
+
+fail_ah:
+	ib_destroy_ah(ah);
+
+fail_pd:
+	put_pd_read(pd);
+
+fail_uobj:
+	put_uobj_write(uobj);
+	return ret;
+}
+
+/*
+ * Handle the DESTROY_AH command: destroy the address handle named by
+ * cmd.ah_handle and drop it from the AH idr and the per-file list.
+ *
+ * Returns in_len on success, -EFAULT if the command cannot be read,
+ * -EINVAL if idr_write_uobj() finds no object, or the error returned by
+ * ib_destroy_ah().
+ */
+ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,
+			     const char __user *buf, int in_len, int out_len)
+{
+	struct ib_uverbs_destroy_ah cmd;
+	struct ib_uobject *uobj;
+	struct ib_ah *ah;
+	int err;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	uobj = idr_write_uobj(&ib_uverbs_ah_idr, cmd.ah_handle, file->ucontext);
+	if (!uobj)
+		return -EINVAL;
+
+	ah = uobj->object;
+
+	err = ib_destroy_ah(ah);
+	if (err) {
+		put_uobj_write(uobj);
+		return err;
+	}
+
+	uobj->live = 0;
+	put_uobj_write(uobj);
+
+	idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
+
+	mutex_lock(&file->mutex);
+	list_del(&uobj->list);
+	mutex_unlock(&file->mutex);
+
+	put_uobj(uobj);
+
+	return in_len;
+}
+
+/*
+ * Handle the ATTACH_MCAST command: attach the QP named by cmd.qp_handle
+ * to the multicast group given by cmd.gid / cmd.mlid and record the
+ * attachment on the QP object's mcast_list.  A group that is already
+ * on the list is treated as success without calling ib_attach_mcast().
+ *
+ * Returns in_len on success, -EFAULT if the command cannot be read,
+ * -EINVAL if idr_read_qp() finds no QP, -ENOMEM if the list entry
+ * cannot be allocated, or the error returned by ib_attach_mcast().
+ */
+ssize_t ib_uverbs_attach_mcast(struct ib_uverbs_file *file,
+			       const char __user *buf, int in_len,
+			       int out_len)
+{
+	struct ib_uverbs_attach_mcast cmd;
+	struct ib_uverbs_mcast_entry *entry;
+	struct ib_uqp_object *uqp;
+	struct ib_qp *qp;
+	int ret;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+	if (!qp)
+		return -EINVAL;
+
+	uqp = container_of(qp->uobject, struct ib_uqp_object, uevent.uobject);
+
+	/* Nothing to do when this group is already recorded. */
+	list_for_each_entry(entry, &uqp->mcast_list, list) {
+		if (cmd.mlid != entry->lid)
+			continue;
+		if (memcmp(cmd.gid, entry->gid.raw, sizeof entry->gid.raw) == 0) {
+			ret = 0;
+			goto out_put;
+		}
+	}
+
+	entry = kmalloc(sizeof *entry, GFP_KERNEL);
+	if (entry == NULL) {
+		ret = -ENOMEM;
+		goto out_put;
+	}
+
+	entry->lid = cmd.mlid;
+	memcpy(entry->gid.raw, cmd.gid, sizeof entry->gid.raw);
+
+	ret = ib_attach_mcast(qp, &entry->gid, cmd.mlid);
+	if (ret)
+		kfree(entry);
+	else
+		list_add_tail(&entry->list, &uqp->mcast_list);
+
+out_put:
+	put_qp_read(qp);
+
+	if (ret)
+		return ret;
+	return in_len;
+}
+
+/*
+ * Handle the DETACH_MCAST command: detach the QP named by cmd.qp_handle
+ * from the multicast group given by cmd.gid / cmd.mlid and, when the
+ * detach succeeds, remove the first matching entry from the QP object's
+ * mcast_list.
+ *
+ * Returns in_len on success, -EFAULT if the command cannot be read,
+ * -EINVAL if idr_read_qp() finds no QP, or the error returned by
+ * ib_detach_mcast().
+ */
+ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file,
+			       const char __user *buf, int in_len,
+			       int out_len)
+{
+	struct ib_uverbs_detach_mcast cmd;
+	struct ib_uverbs_mcast_entry *entry;
+	struct ib_uqp_object *uqp;
+	struct ib_qp *qp;
+	int ret;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+	if (!qp)
+		return -EINVAL;
+
+	ret = ib_detach_mcast(qp, (union ib_gid *) cmd.gid, cmd.mlid);
+	if (!ret) {
+		uqp = container_of(qp->uobject, struct ib_uqp_object,
+				   uevent.uobject);
+
+		/* Leaving the loop right after list_del() keeps it safe. */
+		list_for_each_entry(entry, &uqp->mcast_list, list) {
+			if (cmd.mlid != entry->lid)
+				continue;
+			if (memcmp(cmd.gid, entry->gid.raw,
+				   sizeof entry->gid.raw))
+				continue;
+			list_del(&entry->list);
+			kfree(entry);
+			break;
+		}
+	}
+
+	put_qp_read(qp);
+
+	if (ret)
+		return ret;
+	return in_len;
+}
+
+/*
+ * Handle the CREATE_SRQ command: create a shared receive queue on the
+ * PD named by cmd.pd_handle through the device's create_srq method,
+ * add it to the SRQ idr, return its handle and limits in the response
+ * buffer and link it on the context's srq_list.
+ *
+ * Returns in_len on success, -ENOSPC if out_len is smaller than the
+ * response, -EFAULT on a failed user copy, -ENOMEM if the object cannot
+ * be allocated, -EINVAL if idr_read_pd() finds no PD, or the error from
+ * create_srq / idr_add_uobj().
+ */
+ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,
+			     const char __user *buf, int in_len,
+			     int out_len)
+{
+	struct ib_uverbs_create_srq cmd;
+	struct ib_uverbs_create_srq_resp resp;
+	struct ib_srq_init_attr attr;
+	struct ib_uevent_object *obj;
+	struct ib_udata udata;
+	struct ib_srq *srq;
+	struct ib_pd *pd;
+	int ret;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	INIT_UDATA(&udata, buf + sizeof cmd,
+		   (unsigned long) cmd.response + sizeof resp,
+		   in_len - sizeof cmd, out_len - sizeof resp);
+
+	obj = kmalloc(sizeof *obj, GFP_KERNEL);
+	if (obj == NULL)
+		return -ENOMEM;
+
+	init_uobj(&obj->uobject, cmd.user_handle, file->ucontext, &srq_lock_key);
+	down_write(&obj->uobject.mutex);
+
+	pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+	if (pd == NULL) {
+		ret = -EINVAL;
+		goto fail_uobj;
+	}
+
+	attr.event_handler = ib_uverbs_srq_event_handler;
+	attr.srq_context = file;
+	attr.attr.max_wr = cmd.max_wr;
+	attr.attr.max_sge = cmd.max_sge;
+	attr.attr.srq_limit = cmd.srq_limit;
+
+	obj->events_reported = 0;
+	INIT_LIST_HEAD(&obj->event_list);
+
+	srq = pd->device->create_srq(pd, &attr, &udata);
+	if (IS_ERR(srq)) {
+		ret = PTR_ERR(srq);
+		goto fail_pd;
+	}
+
+	/* The device method is called directly; fill the common fields. */
+	srq->device = pd->device;
+	srq->pd = pd;
+	srq->uobject = &obj->uobject;
+	srq->event_handler = attr.event_handler;
+	srq->srq_context = attr.srq_context;
+	srq->xrc_cq = NULL;
+	srq->xrcd = NULL;
+	atomic_inc(&pd->usecnt);
+	atomic_set(&srq->usecnt, 0);
+
+	obj->uobject.object = srq;
+	ret = idr_add_uobj(&ib_uverbs_srq_idr, &obj->uobject);
+	if (ret)
+		goto fail_srq;
+
+	/* The limits are read back from attr after create_srq ran. */
+	memset(&resp, 0, sizeof resp);
+	resp.srq_handle = obj->uobject.id;
+	resp.max_wr = attr.attr.max_wr;
+	resp.max_sge = attr.attr.max_sge;
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp)) {
+		ret = -EFAULT;
+		goto fail_idr;
+	}
+
+	put_pd_read(pd);
+
+	mutex_lock(&file->mutex);
+	list_add_tail(&obj->uobject.list, &file->ucontext->srq_list);
+	mutex_unlock(&file->mutex);
+
+	obj->uobject.live = 1;
+
+	up_write(&obj->uobject.mutex);
+
+	return in_len;
+
+	/* Error unwind, in reverse order of setup. */
+fail_idr:
+	idr_remove_uobj(&ib_uverbs_srq_idr, &obj->uobject);
+
+fail_srq:
+	ib_destroy_srq(srq);
+
+fail_pd:
+	put_pd_read(pd);
+
+fail_uobj:
+	put_uobj_write(&obj->uobject);
+	return ret;
+}
+
+/*
+ * Handle the CREATE_XRC_SRQ command.
+ *
+ * Like ib_uverbs_create_srq(), but the SRQ is additionally bound to the
+ * CQ named by cmd.xrc_cq and the XRC domain named by cmd.xrcd_handle,
+ * and is created through the device's create_xrc_srq method.  The
+ * response uses the same layout as the plain CREATE_SRQ response.
+ *
+ * Returns in_len on success, -ENOSPC if out_len is smaller than the
+ * response, -EFAULT on a failed user copy, -ENOMEM if the object cannot
+ * be allocated, -EINVAL if the PD, CQ or XRC domain handle does not
+ * resolve, or the error from create_xrc_srq / idr_add_uobj().
+ */
 ssize_t ib_uverbs_create_xrc_srq(struct ib_uverbs_file *file,
 			     const char __user *buf, int in_len,
 			     int out_len)
 {
 	struct ib_uverbs_create_xrc_srq  cmd;
 	struct ib_uverbs_create_srq_resp resp;
 	struct ib_udata			 udata;
 	struct ib_uevent_object		*obj;
 	struct ib_pd			*pd;
 	struct ib_srq			*srq;
 	struct ib_cq			*xrc_cq;
 	struct ib_xrcd			*xrcd;
 	struct ib_srq_init_attr		 attr;
 	struct ib_uobject		*xrcd_uobj;
 	int ret;
 
 	if (out_len < sizeof resp)
 		return -ENOSPC;
 
 	if (copy_from_user(&cmd, buf, sizeof cmd))
 		return -EFAULT;
 
 	INIT_UDATA(&udata, buf + sizeof cmd,
 		   (unsigned long) cmd.response + sizeof resp,
 		   in_len - sizeof cmd, out_len - sizeof resp);
 
 	obj = kmalloc(sizeof *obj, GFP_KERNEL);
 	if (!obj)
 		return -ENOMEM;
 
 	init_uobj(&obj->uobject, cmd.user_handle, file->ucontext,
 		  &srq_lock_key);
 	down_write(&obj->uobject.mutex);
 
+	/*
+	 * Look up the PD, then the CQ, then the XRC domain; each failure
+	 * jumps to the label that releases only what was already taken.
+	 */
 	pd  = idr_read_pd(cmd.pd_handle, file->ucontext);
 	if (!pd) {
 		ret = -EINVAL;
 		goto err;
 	}
 
 	xrc_cq  = idr_read_cq(cmd.xrc_cq, file->ucontext, 0);
 	if (!xrc_cq) {
 		ret = -EINVAL;
 		goto err_put_pd;
 	}
 
 	xrcd  = idr_read_xrcd(cmd.xrcd_handle, file->ucontext, &xrcd_uobj);
 	if (!xrcd) {
 		ret = -EINVAL;
 		goto err_put_cq;
 	}
 
 
 	attr.event_handler  = ib_uverbs_srq_event_handler;
 	attr.srq_context    = file;
 	attr.attr.max_wr    = cmd.max_wr;
 	attr.attr.max_sge   = cmd.max_sge;
 	attr.attr.srq_limit = cmd.srq_limit;
 
 	obj->events_reported     = 0;
 	INIT_LIST_HEAD(&obj->event_list);
 
 	srq = pd->device->create_xrc_srq(pd, xrc_cq, xrcd, &attr, &udata);
 	if (IS_ERR(srq)) {
 		ret = PTR_ERR(srq);
 		goto err_put;
 	}
 
+	/*
+	 * The device method is called directly, so the common SRQ fields and
+	 * the use counts of the PD, CQ and XRC domain are set up here.
+	 */
 	srq->device	   = pd->device;
 	srq->pd		   = pd;
 	srq->uobject	   = &obj->uobject;
 	srq->event_handler = attr.event_handler;
 	srq->srq_context   = attr.srq_context;
 	srq->xrc_cq	   = xrc_cq;
 	srq->xrcd	   = xrcd;
 	atomic_inc(&pd->usecnt);
 	atomic_inc(&xrc_cq->usecnt);
 	atomic_inc(&xrcd->usecnt);
 
 	atomic_set(&srq->usecnt, 0);
 
 	obj->uobject.object = srq;
 	ret = idr_add_uobj(&ib_uverbs_srq_idr, &obj->uobject);
 	if (ret)
 		goto err_destroy;
 
+	/* The limits are read back from attr after create_xrc_srq ran. */
 	memset(&resp, 0, sizeof resp);
 	resp.srq_handle	= obj->uobject.id;
 	resp.max_wr	= attr.attr.max_wr;
 	resp.max_sge	= attr.attr.max_sge;
 
 	if (copy_to_user((void __user *) (unsigned long) cmd.response,
 			 &resp, sizeof resp)) {
 		ret = -EFAULT;
 		goto err_copy;
 	}
 
+	/* Success: release the lookups in the reverse order they were taken. */
 	put_xrcd_read(xrcd_uobj);
 	put_cq_read(xrc_cq);
 	put_pd_read(pd);
 
 	mutex_lock(&file->mutex);
 	list_add_tail(&obj->uobject.list, &file->ucontext->srq_list);
 	mutex_unlock(&file->mutex);
 
 	obj->uobject.live = 1;
 
 	up_write(&obj->uobject.mutex);
 
 	return in_len;
 
+	/* Error unwind: each label undoes one more step than the next one. */
 err_copy:
 	idr_remove_uobj(&ib_uverbs_srq_idr, &obj->uobject);
 
 err_destroy:
 	ib_destroy_srq(srq);
 
 err_put:
 	put_xrcd_read(xrcd_uobj);
 
 err_put_cq:
 	put_cq_read(xrc_cq);
 
 err_put_pd:
 	put_pd_read(pd);
 
 err:
 	put_uobj_write(&obj->uobject);
 	return ret;
 }
+
+/*
+ * ib_uverbs_modify_srq - uverbs command handler that changes SRQ attributes.
+ *
+ * Reads the command from @buf, looks up the SRQ by handle and forwards
+ * max_wr / srq_limit together with the caller's attribute mask to the
+ * device's modify_srq method.
+ *
+ * Returns in_len on success, -EFAULT if the command cannot be copied in,
+ * -EINVAL if the handle does not resolve, otherwise the driver's error.
+ */
+ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file,
+			     const char __user *buf, int in_len,
+			     int out_len)
+{
+	struct ib_uverbs_modify_srq req;
+	struct ib_srq_attr          srq_attr;
+	struct ib_udata             drv_data;
+	struct ib_srq              *target;
+	int                         rc;
+
+	if (copy_from_user(&req, buf, sizeof req))
+		return -EFAULT;
+
+	/* Anything after the fixed command is driver-private input. */
+	INIT_UDATA(&drv_data, buf + sizeof req, NULL, in_len - sizeof req,
+		   out_len);
+
+	target = idr_read_srq(req.srq_handle, file->ucontext);
+	if (!target)
+		return -EINVAL;
+
+	srq_attr.max_wr    = req.max_wr;
+	srq_attr.srq_limit = req.srq_limit;
+
+	rc = target->device->modify_srq(target, &srq_attr, req.attr_mask,
+					&drv_data);
+	put_srq_read(target);
+
+	if (rc)
+		return rc;
+
+	return in_len;
+}
+
+/*
+ * ib_uverbs_query_srq - uverbs command handler that reports SRQ attributes.
+ *
+ * Looks up the SRQ named in the command, queries it through ib_query_srq()
+ * and copies max_wr, max_sge and srq_limit to the user address in
+ * cmd.response.
+ *
+ * Returns in_len on success, -ENOSPC if out_len cannot hold the response,
+ * -EFAULT on a failed user copy, -EINVAL for an unknown handle, otherwise
+ * the error returned by ib_query_srq().
+ */
+ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file,
+			    const char __user *buf,
+			    int in_len, int out_len)
+{
+	struct ib_uverbs_query_srq_resp reply;
+	struct ib_uverbs_query_srq      req;
+	struct ib_srq_attr              cur;
+	struct ib_srq                  *target;
+	int                             rc;
+
+	if (out_len < sizeof reply)
+		return -ENOSPC;
+
+	if (copy_from_user(&req, buf, sizeof req))
+		return -EFAULT;
+
+	target = idr_read_srq(req.srq_handle, file->ucontext);
+	if (!target)
+		return -EINVAL;
+
+	/* The read reference is only needed for the duration of the query. */
+	rc = ib_query_srq(target, &cur);
+	put_srq_read(target);
+	if (rc)
+		return rc;
+
+	memset(&reply, 0, sizeof reply);
+	reply.max_wr    = cur.max_wr;
+	reply.max_sge   = cur.max_sge;
+	reply.srq_limit = cur.srq_limit;
+
+	if (copy_to_user((void __user *) (unsigned long) req.response,
+			 &reply, sizeof reply))
+		return -EFAULT;
+
+	return in_len;
+}
+
+/*
+ * ib_uverbs_destroy_srq - uverbs command handler that destroys an SRQ.
+ *
+ * Takes the SRQ uobject for writing, destroys the SRQ, removes the uobject
+ * from ib_uverbs_srq_idr and from the file's list, releases queued events
+ * and reports the number of events already delivered in the response.
+ *
+ * Returns in_len on success, -EFAULT on a failed user copy, -EINVAL for an
+ * unknown handle, otherwise the error returned by ib_destroy_srq().
+ */
+ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
+			      const char __user *buf, int in_len,
+			      int out_len)
+{
+	struct ib_uverbs_destroy_srq      cmd;
+	struct ib_uverbs_destroy_srq_resp resp;
+	struct ib_uobject		 *uobj;
+	struct ib_srq               	 *srq;
+	struct ib_uevent_object        	 *obj;
+	int                         	  ret = -EINVAL;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	uobj = idr_write_uobj(&ib_uverbs_srq_idr, cmd.srq_handle, file->ucontext);
+	if (!uobj)
+		return -EINVAL;
+	srq = uobj->object;
+	obj = container_of(uobj, struct ib_uevent_object, uobject);
+
+	/* Only mark the uobject dead if the SRQ was really destroyed. */
+	ret = ib_destroy_srq(srq);
+	if (!ret)
+		uobj->live = 0;
+
+	put_uobj_write(uobj);
+
+	if (ret)
+		return ret;
+
+	idr_remove_uobj(&ib_uverbs_srq_idr, uobj);
+
+	mutex_lock(&file->mutex);
+	list_del(&uobj->list);
+	mutex_unlock(&file->mutex);
+
+	ib_uverbs_release_uevent(file, obj);
+
+	/* Read events_reported before the final put_uobj() below. */
+	memset(&resp, 0, sizeof resp);
+	resp.events_reported = obj->events_reported;
+
+	put_uobj(uobj);
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp))
+		ret = -EFAULT;
+
+	return ret ? ret : in_len;
+}
+
+/* Return the inode behind an open file, reached through its dentry. */
+static struct inode *xrc_file2inode(struct file *f)
+{
+	struct dentry *entry = f->f_dentry;
+
+	return entry->d_inode;
+}
+
+/*
+ * One node of a device's ib_uverbs_xrcd_table red-black tree, which maps an
+ * inode to the XRC domain opened through it.  The tree is ordered by the
+ * inode pointer value (see xrcd_table_insert / xrcd_table_search).
+ */
+struct xrcd_table_entry {
+	struct rb_node node;	/* linkage in ib_uverbs_xrcd_table */
+	struct inode *inode;	/* lookup key */
+	struct ib_xrcd *xrcd;	/* XRC domain associated with the inode */
+};
+
+/*
+ * Add an inode -> xrcd association to the device's XRC domain tree.
+ *
+ * On success an extra reference is taken on the inode with igrab() and 0 is
+ * returned.  Returns -ENOMEM if the node cannot be allocated and -EEXIST if
+ * the inode already has an entry.
+ */
+static int xrcd_table_insert(struct ib_device *dev,
+			     struct inode *i_n,
+			     struct ib_xrcd *xrcd)
+{
+	struct rb_node **link = &dev->ib_uverbs_xrcd_table.rb_node;
+	struct rb_node *above = NULL;
+	struct xrcd_table_entry *fresh, *cur;
+
+	fresh = kmalloc(sizeof(*fresh), GFP_KERNEL);
+	if (!fresh)
+		return -ENOMEM;
+
+	fresh->inode = i_n;
+	fresh->xrcd = xrcd;
+
+	/* Descend to the empty slot where the new node belongs. */
+	while (*link) {
+		above = *link;
+		cur = rb_entry(above, struct xrcd_table_entry, node);
+
+		if (i_n == cur->inode) {
+			kfree(fresh);
+			return -EEXIST;
+		}
+
+		link = (i_n < cur->inode) ? &above->rb_left : &above->rb_right;
+	}
+
+	rb_link_node(&fresh->node, above, link);
+	rb_insert_color(&fresh->node, &dev->ib_uverbs_xrcd_table);
+	igrab(i_n);
+	return 0;
+}
+
+/*
+ * Look up the tree entry for an inode in the device's XRC domain tree.
+ * Returns the entry, or NULL if the inode has none.
+ */
+static struct xrcd_table_entry *xrcd_table_search(struct ib_device *dev,
+						   struct inode *i_n)
+{
+	struct rb_node *cur = dev->ib_uverbs_xrcd_table.rb_node;
+
+	while (cur) {
+		struct xrcd_table_entry *hit =
+			rb_entry(cur, struct xrcd_table_entry, node);
+
+		if (i_n == hit->inode)
+			return hit;
+
+		cur = (i_n < hit->inode) ? cur->rb_left : cur->rb_right;
+	}
+
+	return NULL;
+}
+
+/*
+ * Fetch the XRC domain registered for an inode.
+ *
+ * Stores the domain in *xrcd and returns 0 when an entry exists; returns
+ * -EINVAL and leaves *xrcd untouched otherwise.
+ */
+static int find_xrcd(struct ib_device *dev, struct inode *i_n,
+		     struct ib_xrcd **xrcd)
+{
+	struct xrcd_table_entry *hit = xrcd_table_search(dev, i_n);
+
+	if (hit) {
+		*xrcd = hit->xrcd;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+
+/*
+ * Drop the tree entry for an inode, if one exists: release the inode with
+ * iput(), unlink the node from the tree and free it.
+ */
+static void xrcd_table_delete(struct ib_device *dev,
+			      struct inode *i_n)
+{
+	struct xrcd_table_entry *victim = xrcd_table_search(dev, i_n);
+
+	if (!victim)
+		return;
+
+	iput(i_n);
+	rb_erase(&victim->node, &dev->ib_uverbs_xrcd_table);
+	kfree(victim);
+}
+
+/*
+ * ib_uverbs_open_xrc_domain - uverbs command handler that opens an XRC
+ * domain, creating it when necessary.
+ *
+ * When cmd.fd is not (u32)-1 the domain is keyed by the inode behind that
+ * file descriptor: an existing domain for the inode is reused, and a new one
+ * is only created if O_CREAT is set in cmd.oflags.  When cmd.fd is (u32)-1 a
+ * new domain with no inode is always allocated.  A uobject for the domain is
+ * added to ib_uverbs_xrc_domain_idr and its handle is copied to the user
+ * address in cmd.response.
+ *
+ * The whole operation runs under the device's xrcd_table_mutex.
+ *
+ * Returns in_len on success.  Errors: -ENOSPC when out_len cannot hold the
+ * response, -EFAULT on a failed user copy, -EBADF for a bad descriptor,
+ * -EAGAIN when no domain exists for the inode and O_CREAT was not given,
+ * -EINVAL when the domain exists and O_EXCL was given, -ENOMEM on allocation
+ * failure, otherwise the error from the driver, idr_add_uobj() or
+ * xrcd_table_insert().
+ */
+ssize_t ib_uverbs_open_xrc_domain(struct ib_uverbs_file *file,
+				  const char __user *buf, int in_len,
+				  int out_len)
+{
+	struct ib_uverbs_open_xrc_domain cmd;
+	struct ib_uverbs_open_xrc_domain_resp resp;
+	struct ib_udata	udata;
+	struct ib_uobject *uobj;
+	struct ib_uxrcd_object		*xrcd_uobj;
+	struct ib_xrcd			*xrcd = NULL;
+	struct file			*f = NULL;
+	struct inode			*inode = NULL;
+	int				 ret = 0;
+	int				 new_xrcd = 0;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	INIT_UDATA(&udata, buf + sizeof cmd,
+		   (unsigned long) cmd.response + sizeof resp,
+		   in_len - sizeof cmd, out_len - sizeof resp);
+
+	mutex_lock(&file->device->ib_dev->xrcd_table_mutex);
+	if (cmd.fd != (u32) (-1)) {
+		/* search for file descriptor */
+		f = fget(cmd.fd);
+		if (!f) {
+			ret = -EBADF;
+			goto err_table_mutex_unlock;
+		}
+
+		inode = xrc_file2inode(f);
+		if (!inode) {
+			ret = -EBADF;
+			goto err_table_mutex_unlock;
+		}
+
+		ret = find_xrcd(file->device->ib_dev, inode, &xrcd);
+		if (ret && !(cmd.oflags & O_CREAT)) {
+			/* no file descriptor. Need CREATE flag */
+			ret = -EAGAIN;
+			goto err_table_mutex_unlock;
+		}
+
+		if (xrcd && cmd.oflags & O_EXCL) {
+			ret = -EINVAL;
+			goto err_table_mutex_unlock;
+		}
+	}
+
+	xrcd_uobj = kmalloc(sizeof *xrcd_uobj, GFP_KERNEL);
+	if (!xrcd_uobj) {
+		ret = -ENOMEM;
+		goto err_table_mutex_unlock;
+	}
+
+	/* The new uobject stays write-locked until it is marked live. */
+	uobj = &xrcd_uobj->uobject;
+	init_uobj(uobj, 0, file->ucontext, &pd_lock_key);
+	down_write(&uobj->mutex);
+
+	if (!xrcd) {
+		/* No domain for this inode yet (or no inode): allocate one. */
+		xrcd = file->device->ib_dev->alloc_xrcd(file->device->ib_dev,
+							file->ucontext, &udata);
+		if (IS_ERR(xrcd)) {
+			ret = PTR_ERR(xrcd);
+			goto err;
+		}
+		xrcd->uobject = (cmd.fd == -1) ? uobj : NULL;
+		xrcd->inode = inode;
+		xrcd->device  = file->device->ib_dev;
+		atomic_set(&xrcd->usecnt, 0);
+		new_xrcd = 1;
+	}
+
+	uobj->object = xrcd;
+	ret = idr_add_uobj(&ib_uverbs_xrc_domain_idr, uobj);
+	if (ret)
+		goto err_idr;
+
+	memset(&resp, 0, sizeof resp);
+	resp.xrcd_handle = uobj->id;
+
+	if (inode) {
+		if (new_xrcd) {
+		/* create new inode/xrcd table entry */
+			ret = xrcd_table_insert(file->device->ib_dev, inode, xrcd);
+			if (ret)
+				goto err_insert_xrcd;
+		}
+		/* Each opener of an inode-backed domain holds one use count. */
+		atomic_inc(&xrcd->usecnt);
+	}
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp)) {
+		ret = -EFAULT;
+		goto err_copy;
+	}
+
+	/*
+	 * Release the file reference only once nothing can fail any more:
+	 * every error path ends in err_table_mutex_unlock, which does its own
+	 * fput(), so dropping it before copy_to_user() released it twice when
+	 * the copy failed.
+	 */
+	if (f)
+		fput(f);
+
+	INIT_LIST_HEAD(&xrcd_uobj->xrc_reg_qp_list);
+
+	mutex_lock(&file->mutex);
+	list_add_tail(&uobj->list, &file->ucontext->xrc_domain_list);
+	mutex_unlock(&file->mutex);
+
+	uobj->live = 1;
+
+	up_write(&uobj->mutex);
+
+	mutex_unlock(&file->device->ib_dev->xrcd_table_mutex);
+	return in_len;
+
+	/* Error unwinding: each label undoes one step and falls through. */
+err_copy:
+
+	if (inode) {
+		if (new_xrcd)
+			xrcd_table_delete(file->device->ib_dev, inode);
+		atomic_dec(&xrcd->usecnt);
+	}
+
+err_insert_xrcd:
+	idr_remove_uobj(&ib_uverbs_xrc_domain_idr, uobj);
+
+err_idr:
+	ib_dealloc_xrcd(xrcd);
+
+err:
+	put_uobj_write(uobj);
+
+err_table_mutex_unlock:
+
+	if (f)
+		fput(f);
+	mutex_unlock(&file->device->ib_dev->xrcd_table_mutex);
+	return ret;
+}
+
+/*
+ * ib_uverbs_close_xrc_domain - uverbs command handler that closes an XRC
+ * domain handle.
+ *
+ * Refuses with -EBUSY while any QP or SRQ on this context still refers to
+ * the domain, or while XRC receive QPs are registered on the handle.
+ * Otherwise the caller's use count (for inode-backed domains) is dropped and
+ * ib_dealloc_xrcd() is attempted; the uobject is then removed from the idr
+ * and from the context's list.
+ *
+ * For an inode-backed domain the handle is removed even when
+ * ib_dealloc_xrcd() fails, and in_len is returned.  For a domain without an
+ * inode a dealloc failure is returned and the handle is kept.
+ *
+ * Runs under the device's xrcd_table_mutex.  Returns in_len on success,
+ * -EFAULT if the command cannot be copied in, -EINVAL for an unknown handle.
+ */
+ssize_t ib_uverbs_close_xrc_domain(struct ib_uverbs_file *file,
+				   const char __user *buf, int in_len,
+				   int out_len)
+{
+	struct ib_uverbs_close_xrc_domain cmd;
+	struct ib_uobject *uobj, *t_uobj;
+	struct ib_uxrcd_object *xrcd_uobj;
+	struct ib_xrcd *xrcd = NULL;
+	struct inode *inode = NULL;
+	int ret = 0;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	mutex_lock(&file->device->ib_dev->xrcd_table_mutex);
+	uobj = idr_write_uobj(&ib_uverbs_xrc_domain_idr, cmd.xrcd_handle,
+			      file->ucontext);
+	if (!uobj) {
+		ret = -EINVAL;
+		goto err_unlock_mutex;
+	}
+
+	/* ret is still 0 here, so the QP scan below always runs. */
+	mutex_lock(&file->mutex);
+	if (!ret) {
+		list_for_each_entry(t_uobj, &file->ucontext->qp_list, list) {
+			struct ib_qp *qp = t_uobj->object;
+			if (qp->xrcd && qp->xrcd == uobj->object) {
+				ret = -EBUSY;
+				break;
+			}
+		}
+	}
+	if (!ret) {
+		list_for_each_entry(t_uobj, &file->ucontext->srq_list, list) {
+			struct ib_srq *srq = t_uobj->object;
+			if (srq->xrcd && srq->xrcd == uobj->object) {
+				ret = -EBUSY;
+				break;
+			}
+		}
+	}
+	mutex_unlock(&file->mutex);
+	if (ret) {
+		put_uobj_write(uobj);
+		goto err_unlock_mutex;
+	}
+
+	/* Receive QPs still registered on this handle also keep it busy. */
+	xrcd_uobj = container_of(uobj, struct ib_uxrcd_object, uobject);
+	if (!list_empty(&xrcd_uobj->xrc_reg_qp_list)) {
+		ret = -EBUSY;
+		put_uobj_write(uobj);
+		goto err_unlock_mutex;
+	}
+
+	xrcd = (struct ib_xrcd *) (uobj->object);
+	inode = xrcd->inode;
+
+	/* Give back the use count this handle took when it was opened. */
+	if (inode)
+		atomic_dec(&xrcd->usecnt);
+
+	ret = ib_dealloc_xrcd(uobj->object);
+	if (!ret)
+		uobj->live = 0;
+
+	put_uobj_write(uobj);
+
+	if (ret && !inode)
+		goto err_unlock_mutex;
+
+	/* The inode's table entry goes away only with the domain itself. */
+	if (!ret && inode)
+		xrcd_table_delete(file->device->ib_dev, inode);
+
+	idr_remove_uobj(&ib_uverbs_xrc_domain_idr, uobj);
+
+	mutex_lock(&file->mutex);
+	list_del(&uobj->list);
+	mutex_unlock(&file->mutex);
+
+	put_uobj(uobj);
+
+	mutex_unlock(&file->device->ib_dev->xrcd_table_mutex);
+	return in_len;
+
+err_unlock_mutex:
+	mutex_unlock(&file->device->ib_dev->xrcd_table_mutex);
+	return ret;
+}
+
+/*
+ * Release one reference to an XRC domain on behalf of a closing context.
+ *
+ * For an inode-backed domain the use count is dropped first; if
+ * ib_dealloc_xrcd() then succeeds, the inode's table entry is removed too.
+ */
+void ib_uverbs_dealloc_xrcd(struct ib_device *ib_dev,
+			    struct ib_xrcd *xrcd)
+{
+	/* Sample the inode before the domain may be freed. */
+	struct inode *backing = xrcd->inode;
+
+	if (backing)
+		atomic_dec(&xrcd->usecnt);
+
+	if (ib_dealloc_xrcd(xrcd))
+		return;
+
+	if (backing)
+		xrcd_table_delete(ib_dev, backing);
+}
+
+/*
+ * ib_uverbs_create_xrc_rcv_qp - uverbs command handler that creates an XRC
+ * receive QP in an XRC domain.
+ *
+ * Asks the device's create_xrc_rcv_qp method for a QP of type IB_QPT_XRC
+ * with fixed capabilities, copies the QP number to the user address in
+ * cmd.response and records the QP on the domain handle's xrc_reg_qp_list.
+ *
+ * Returns in_len on success, -ENOSPC if out_len cannot hold the response,
+ * -EFAULT on a failed user copy, -ENOMEM on allocation failure, -EINVAL for
+ * an unknown domain handle, otherwise the driver's error.
+ */
+ssize_t ib_uverbs_create_xrc_rcv_qp(struct ib_uverbs_file *file,
+				    const char __user *buf, int in_len,
+				    int out_len)
+{
+	struct ib_uverbs_create_xrc_rcv_qp	cmd;
+	struct ib_uverbs_create_xrc_rcv_qp_resp resp;
+	struct ib_uxrc_rcv_object      *obj;
+	struct ib_qp_init_attr		init_attr;
+	struct ib_xrcd		       *xrcd;
+	struct ib_uobject	       *uobj;
+	struct ib_uxrcd_object	       *xrcd_uobj;
+	u32				qp_num;
+	int				err;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	/* Bookkeeping entry that will sit on the domain's registered-QP list. */
+	obj = kzalloc(sizeof *obj, GFP_KERNEL);
+	if (!obj)
+		return -ENOMEM;
+
+	xrcd = idr_read_xrcd(cmd.xrc_domain_handle, file->ucontext, &uobj);
+	if (!xrcd) {
+		err = -EINVAL;
+		goto err_out;
+	}
+
+	init_attr.event_handler = ib_uverbs_xrc_rcv_qp_event_handler;
+	init_attr.qp_context	= file;
+	init_attr.srq		= NULL;
+	init_attr.sq_sig_type	=
+		cmd.sq_sig_all ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR;
+	init_attr.qp_type	= IB_QPT_XRC;
+	init_attr.xrc_domain	= xrcd;
+
+	init_attr.cap.max_send_wr	= 1;
+	init_attr.cap.max_recv_wr	= 0;
+	init_attr.cap.max_send_sge	= 1;
+	init_attr.cap.max_recv_sge	= 0;
+	init_attr.cap.max_inline_data	= 0;
+
+	err = xrcd->device->create_xrc_rcv_qp(&init_attr, &qp_num);
+	if (err)
+		goto err_put;
+
+	memset(&resp, 0, sizeof resp);
+	resp.qpn = qp_num;
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp)) {
+		err = -EFAULT;
+		goto err_destroy;
+	}
+
+	/*
+	 * NOTE(review): the read lock on uobj is dropped here, before
+	 * xrcd_uobj (derived from uobj) is modified below, whereas
+	 * ib_uverbs_reg_xrc_rcv_qp drops it after its list update.  Confirm
+	 * that this ordering is intended.
+	 */
+	atomic_inc(&xrcd->usecnt);
+	put_xrcd_read(uobj);
+	obj->qp_num = qp_num;
+	obj->domain_handle = cmd.xrc_domain_handle;
+	xrcd_uobj = container_of(uobj, struct ib_uxrcd_object, uobject);
+	mutex_lock(&file->device->ib_dev->xrcd_table_mutex);
+	list_add_tail(&obj->list, &xrcd_uobj->xrc_reg_qp_list);
+	mutex_unlock(&file->device->ib_dev->xrcd_table_mutex);
+
+	return in_len;
+
+	/* Error unwinding: each label undoes one step and falls through. */
+err_destroy:
+	xrcd->device->unreg_xrc_rcv_qp(xrcd, file, qp_num);
+err_put:
+	put_xrcd_read(uobj);
+err_out:
+	kfree(obj);
+	return err;
+}
+
+/*
+ * ib_uverbs_modify_xrc_rcv_qp - uverbs command handler that modifies an XRC
+ * receive QP.
+ *
+ * Translates the user command into a struct ib_qp_attr (including primary
+ * and alternate path) and hands it to the device's modify_xrc_rcv_qp method
+ * together with the caller's attribute mask.
+ *
+ * Returns in_len on success, -EFAULT if the command cannot be copied in,
+ * -ENOMEM on allocation failure, -EINVAL for an unknown domain handle,
+ * otherwise the driver's error.
+ */
+ssize_t ib_uverbs_modify_xrc_rcv_qp(struct ib_uverbs_file *file,
+				    const char __user *buf, int in_len,
+				    int out_len)
+{
+	struct ib_uverbs_modify_xrc_rcv_qp req;
+	struct ib_qp_attr *qp_attr;
+	struct ib_ah_attr *path;
+	struct ib_xrcd *domain;
+	struct ib_uobject *domain_uobj;
+	int rc;
+
+	if (copy_from_user(&req, buf, sizeof req))
+		return -EFAULT;
+
+	qp_attr = kzalloc(sizeof *qp_attr, GFP_KERNEL);
+	if (!qp_attr)
+		return -ENOMEM;
+
+	domain = idr_read_xrcd(req.xrc_domain_handle, file->ucontext,
+			       &domain_uobj);
+	if (!domain) {
+		rc = -EINVAL;
+		goto free_attr;
+	}
+
+	/* Scalar QP attributes. */
+	qp_attr->qp_state	     = req.qp_state;
+	qp_attr->cur_qp_state	     = req.cur_qp_state;
+	qp_attr->qp_access_flags     = req.qp_access_flags;
+	qp_attr->pkey_index	     = req.pkey_index;
+	qp_attr->port_num	     = req.port_num;
+	qp_attr->path_mtu	     = req.path_mtu;
+	qp_attr->path_mig_state	     = req.path_mig_state;
+	qp_attr->qkey		     = req.qkey;
+	qp_attr->rq_psn		     = req.rq_psn;
+	qp_attr->sq_psn		     = req.sq_psn;
+	qp_attr->dest_qp_num	     = req.dest_qp_num;
+	qp_attr->alt_pkey_index	     = req.alt_pkey_index;
+	qp_attr->en_sqd_async_notify = req.en_sqd_async_notify;
+	qp_attr->max_rd_atomic	     = req.max_rd_atomic;
+	qp_attr->max_dest_rd_atomic  = req.max_dest_rd_atomic;
+	qp_attr->min_rnr_timer	     = req.min_rnr_timer;
+	qp_attr->timeout	     = req.timeout;
+	qp_attr->retry_cnt	     = req.retry_cnt;
+	qp_attr->rnr_retry	     = req.rnr_retry;
+	qp_attr->alt_port_num	     = req.alt_port_num;
+	qp_attr->alt_timeout	     = req.alt_timeout;
+
+	/* Primary path. */
+	path = &qp_attr->ah_attr;
+	memcpy(path->grh.dgid.raw, req.dest.dgid, 16);
+	path->grh.flow_label	= req.dest.flow_label;
+	path->grh.sgid_index	= req.dest.sgid_index;
+	path->grh.hop_limit	= req.dest.hop_limit;
+	path->grh.traffic_class	= req.dest.traffic_class;
+	path->dlid		= req.dest.dlid;
+	path->sl		= req.dest.sl;
+	path->src_path_bits	= req.dest.src_path_bits;
+	path->static_rate	= req.dest.static_rate;
+	path->ah_flags		= req.dest.is_global ? IB_AH_GRH : 0;
+	path->port_num		= req.dest.port_num;
+
+	/* Alternate path. */
+	path = &qp_attr->alt_ah_attr;
+	memcpy(path->grh.dgid.raw, req.alt_dest.dgid, 16);
+	path->grh.flow_label	= req.alt_dest.flow_label;
+	path->grh.sgid_index	= req.alt_dest.sgid_index;
+	path->grh.hop_limit	= req.alt_dest.hop_limit;
+	path->grh.traffic_class	= req.alt_dest.traffic_class;
+	path->dlid		= req.alt_dest.dlid;
+	path->sl		= req.alt_dest.sl;
+	path->src_path_bits	= req.alt_dest.src_path_bits;
+	path->static_rate	= req.alt_dest.static_rate;
+	path->ah_flags		= req.alt_dest.is_global ? IB_AH_GRH : 0;
+	path->port_num		= req.alt_dest.port_num;
+
+	rc = domain->device->modify_xrc_rcv_qp(domain, req.qp_num, qp_attr,
+					       req.attr_mask);
+	put_xrcd_read(domain_uobj);
+	if (!rc)
+		rc = in_len;
+
+free_attr:
+	kfree(qp_attr);
+	return rc;
+}
+
+/*
+ * ib_uverbs_query_xrc_rcv_qp - uverbs command handler that reports the
+ * attributes of an XRC receive QP.
+ *
+ * Queries the QP through the device's query_xrc_rcv_qp method and copies
+ * the result, as a struct ib_uverbs_query_qp_resp, to the user address in
+ * cmd.response.
+ *
+ * Returns in_len on success, -EFAULT on a failed user copy, -ENOMEM on
+ * allocation failure, -EINVAL for an unknown domain handle, otherwise the
+ * driver's error.
+ */
+ssize_t ib_uverbs_query_xrc_rcv_qp(struct ib_uverbs_file *file,
+				   const char __user *buf, int in_len,
+				   int out_len)
+{
+	struct ib_uverbs_query_xrc_rcv_qp req;
+	struct ib_uverbs_query_qp_resp	 reply;
+	struct ib_qp_attr		*qp_attr;
+	struct ib_qp_init_attr		*qp_init;
+	const struct ib_ah_attr		*path;
+	struct ib_xrcd			*domain;
+	struct ib_uobject		*domain_uobj;
+	int				 rc;
+
+	if (copy_from_user(&req, buf, sizeof req))
+		return -EFAULT;
+
+	/* Allocate both buffers first; kfree(NULL) at "done" is harmless. */
+	qp_attr = kmalloc(sizeof *qp_attr, GFP_KERNEL);
+	qp_init = kmalloc(sizeof *qp_init, GFP_KERNEL);
+	if (!qp_attr || !qp_init) {
+		rc = -ENOMEM;
+		goto done;
+	}
+
+	domain = idr_read_xrcd(req.xrc_domain_handle, file->ucontext,
+			       &domain_uobj);
+	if (!domain) {
+		rc = -EINVAL;
+		goto done;
+	}
+
+	rc = domain->device->query_xrc_rcv_qp(domain, req.qp_num, qp_attr,
+					      req.attr_mask, qp_init);
+	put_xrcd_read(domain_uobj);
+	if (rc)
+		goto done;
+
+	memset(&reply, 0, sizeof reply);
+
+	/* Scalar QP attributes. */
+	reply.qp_state		 = qp_attr->qp_state;
+	reply.cur_qp_state	 = qp_attr->cur_qp_state;
+	reply.path_mtu		 = qp_attr->path_mtu;
+	reply.path_mig_state	 = qp_attr->path_mig_state;
+	reply.qkey		 = qp_attr->qkey;
+	reply.rq_psn		 = qp_attr->rq_psn;
+	reply.sq_psn		 = qp_attr->sq_psn;
+	reply.dest_qp_num	 = qp_attr->dest_qp_num;
+	reply.qp_access_flags	 = qp_attr->qp_access_flags;
+	reply.pkey_index	 = qp_attr->pkey_index;
+	reply.alt_pkey_index	 = qp_attr->alt_pkey_index;
+	reply.sq_draining	 = qp_attr->sq_draining;
+	reply.max_rd_atomic	 = qp_attr->max_rd_atomic;
+	reply.max_dest_rd_atomic = qp_attr->max_dest_rd_atomic;
+	reply.min_rnr_timer	 = qp_attr->min_rnr_timer;
+	reply.port_num		 = qp_attr->port_num;
+	reply.timeout		 = qp_attr->timeout;
+	reply.retry_cnt		 = qp_attr->retry_cnt;
+	reply.rnr_retry		 = qp_attr->rnr_retry;
+	reply.alt_port_num	 = qp_attr->alt_port_num;
+	reply.alt_timeout	 = qp_attr->alt_timeout;
+
+	/* Primary path. */
+	path = &qp_attr->ah_attr;
+	memcpy(reply.dest.dgid, path->grh.dgid.raw, 16);
+	reply.dest.flow_label	 = path->grh.flow_label;
+	reply.dest.sgid_index	 = path->grh.sgid_index;
+	reply.dest.hop_limit	 = path->grh.hop_limit;
+	reply.dest.traffic_class = path->grh.traffic_class;
+	reply.dest.dlid		 = path->dlid;
+	reply.dest.sl		 = path->sl;
+	reply.dest.src_path_bits = path->src_path_bits;
+	reply.dest.static_rate	 = path->static_rate;
+	reply.dest.is_global	 = !!(path->ah_flags & IB_AH_GRH);
+	reply.dest.port_num	 = path->port_num;
+
+	/* Alternate path. */
+	path = &qp_attr->alt_ah_attr;
+	memcpy(reply.alt_dest.dgid, path->grh.dgid.raw, 16);
+	reply.alt_dest.flow_label    = path->grh.flow_label;
+	reply.alt_dest.sgid_index    = path->grh.sgid_index;
+	reply.alt_dest.hop_limit     = path->grh.hop_limit;
+	reply.alt_dest.traffic_class = path->grh.traffic_class;
+	reply.alt_dest.dlid	     = path->dlid;
+	reply.alt_dest.sl	     = path->sl;
+	reply.alt_dest.src_path_bits = path->src_path_bits;
+	reply.alt_dest.static_rate   = path->static_rate;
+	reply.alt_dest.is_global     = !!(path->ah_flags & IB_AH_GRH);
+	reply.alt_dest.port_num	     = path->port_num;
+
+	/* Creation-time capabilities. */
+	reply.max_send_wr	 = qp_init->cap.max_send_wr;
+	reply.max_recv_wr	 = qp_init->cap.max_recv_wr;
+	reply.max_send_sge	 = qp_init->cap.max_send_sge;
+	reply.max_recv_sge	 = qp_init->cap.max_recv_sge;
+	reply.max_inline_data	 = qp_init->cap.max_inline_data;
+	reply.sq_sig_all	 = qp_init->sq_sig_type == IB_SIGNAL_ALL_WR;
+
+	if (copy_to_user((void __user *) (unsigned long) req.response,
+			 &reply, sizeof reply))
+		rc = -EFAULT;
+
+done:
+	kfree(qp_attr);
+	kfree(qp_init);
+
+	if (rc)
+		return rc;
+
+	return in_len;
+}
+
+/*
+ * ib_uverbs_reg_xrc_rcv_qp - uverbs command handler that registers this
+ * file with an existing XRC receive QP.
+ *
+ * Calls the device's reg_xrc_rcv_qp method and, unless the QP number is
+ * already on the domain handle's xrc_reg_qp_list, records it there and
+ * bumps the domain's use count.
+ *
+ * Returns in_len on success (including the already-registered case),
+ * -EFAULT if the command cannot be copied in, -ENOMEM on allocation
+ * failure, -EINVAL for an unknown domain handle, otherwise the driver's
+ * error.
+ */
+ssize_t ib_uverbs_reg_xrc_rcv_qp(struct ib_uverbs_file *file,
+				 const char __user *buf, int in_len,
+				 int out_len)
+{
+	struct ib_uverbs_reg_xrc_rcv_qp  cmd;
+	struct ib_uxrc_rcv_object	*qp_obj, *tmp;
+	struct ib_xrcd			*xrcd;
+	struct ib_uobject		*uobj;
+	struct ib_uxrcd_object		*xrcd_uobj;
+	int				 ret;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	/* Allocated up front; freed again if the QP turns out to be listed. */
+	qp_obj = kmalloc(sizeof *qp_obj, GFP_KERNEL);
+	if (!qp_obj)
+		return -ENOMEM;
+
+	xrcd = idr_read_xrcd(cmd.xrc_domain_handle, file->ucontext, &uobj);
+	if (!xrcd) {
+		ret = -EINVAL;
+		goto err_out;
+	}
+
+	ret = xrcd->device->reg_xrc_rcv_qp(xrcd, file, cmd.qp_num);
+	if (ret)
+		goto err_put;
+
+	xrcd_uobj = container_of(uobj, struct ib_uxrcd_object, uobject);
+	mutex_lock(&file->device->ib_dev->xrcd_table_mutex);
+	/* Already recorded on this handle: nothing more to track. */
+	list_for_each_entry(tmp, &xrcd_uobj->xrc_reg_qp_list, list)
+		if (cmd.qp_num == tmp->qp_num) {
+			kfree(qp_obj);
+			mutex_unlock(&file->device->ib_dev->xrcd_table_mutex);
+			put_xrcd_read(uobj);
+			return in_len;
+		}
+	qp_obj->qp_num = cmd.qp_num;
+	qp_obj->domain_handle = cmd.xrc_domain_handle;
+	list_add_tail(&qp_obj->list, &xrcd_uobj->xrc_reg_qp_list);
+	mutex_unlock(&file->device->ib_dev->xrcd_table_mutex);
+	atomic_inc(&xrcd->usecnt);
+	put_xrcd_read(uobj);
+	return in_len;
+
+err_put:
+	put_xrcd_read(uobj);
+err_out:
+
+	kfree(qp_obj);
+	return ret;
+}
+
+/*
+ * Unregister @file from XRC receive QP @qp_num and, if the driver accepts
+ * that, drop the use count the registration held on the domain.
+ *
+ * Returns 0 on success or the driver's unreg_xrc_rcv_qp error.
+ */
+int ib_uverbs_cleanup_xrc_rcv_qp(struct ib_uverbs_file *file,
+				 struct ib_xrcd *xrcd, u32 qp_num)
+{
+	int rc = xrcd->device->unreg_xrc_rcv_qp(xrcd, file, qp_num);
+
+	if (rc)
+		return rc;
+
+	atomic_dec(&xrcd->usecnt);
+	return 0;
+}
+
+/*
+ * ib_uverbs_unreg_xrc_rcv_qp - uverbs command handler that unregisters this
+ * file from an XRC receive QP.
+ *
+ * Calls the device's unreg_xrc_rcv_qp method, drops the domain use count
+ * and removes the matching entry, if any, from the domain handle's
+ * xrc_reg_qp_list.
+ *
+ * Returns in_len on success, -EFAULT if the command cannot be copied in,
+ * and -EINVAL both for an unknown domain handle and for any driver failure
+ * (the driver's own error code is not propagated).
+ */
+ssize_t ib_uverbs_unreg_xrc_rcv_qp(struct ib_uverbs_file *file,
+				   const char __user *buf, int in_len,
+				   int out_len)
+{
+	struct ib_uverbs_unreg_xrc_rcv_qp cmd;
+	struct ib_uxrc_rcv_object *qp_obj, *tmp;
+	struct ib_xrcd *xrcd;
+	struct ib_uobject *uobj;
+	struct ib_uxrcd_object *xrcd_uobj;
+	int ret;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	xrcd = idr_read_xrcd(cmd.xrc_domain_handle, file->ucontext, &uobj);
+	if (!xrcd)
+		return -EINVAL;
+
+	ret = xrcd->device->unreg_xrc_rcv_qp(xrcd, file, cmd.qp_num);
+	if (ret) {
+		put_xrcd_read(uobj);
+		return -EINVAL;
+	}
+	atomic_dec(&xrcd->usecnt);
+
+	xrcd_uobj = container_of(uobj, struct ib_uxrcd_object, uobject);
+	mutex_lock(&file->device->ib_dev->xrcd_table_mutex);
+	/* Drop the first bookkeeping entry that matches this QP number. */
+	list_for_each_entry_safe(qp_obj, tmp, &xrcd_uobj->xrc_reg_qp_list, list)
+		if (cmd.qp_num == qp_obj->qp_num) {
+			list_del(&qp_obj->list);
+			kfree(qp_obj);
+			break;
+		}
+	mutex_unlock(&file->device->ib_dev->xrcd_table_mutex);
+	put_xrcd_read(uobj);
+	return in_len;
+}
diff --git a/sys/ofed/drivers/infiniband/core/uverbs_main.c b/sys/ofed/drivers/infiniband/core/uverbs_main.c
new file mode 100644
index 0000000..380abd3
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/uverbs_main.c
@@ -0,0 +1,1012 @@
+/*
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/cdev.h>
+
+#include <asm/uaccess.h>
+
+#include "uverbs.h"
+
+/* Module metadata. */
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("InfiniBand userspace verbs access");
+MODULE_LICENSE("Dual BSD/GPL");
+
+#define INFINIBANDEVENTFS_MAGIC	0x49426576	/* "IBev" */
+
+/*
+ * Character-device numbering for the uverbs devices; IB_UVERBS_MAX_DEVICES
+ * also sizes dev_table and dev_map below.
+ */
+enum {
+	IB_UVERBS_MAJOR       = 231,
+	IB_UVERBS_BASE_MINOR  = 192,
+	IB_UVERBS_MAX_DEVICES = 32
+};
+
+/* Device number of the first uverbs device. */
+#define IB_UVERBS_BASE_DEV	MKDEV(IB_UVERBS_MAJOR, IB_UVERBS_BASE_MINOR)
+
+static struct class *uverbs_class;
+
+/*
+ * One IDR per userspace-visible object type, mapping handles to uobjects.
+ * NOTE(review): ib_uverbs_idr_lock presumably serializes access to all of
+ * them; its users are outside this part of the file - confirm.
+ */
+DEFINE_SPINLOCK(ib_uverbs_idr_lock);
+DEFINE_IDR(ib_uverbs_pd_idr);
+DEFINE_IDR(ib_uverbs_mr_idr);
+DEFINE_IDR(ib_uverbs_mw_idr);
+DEFINE_IDR(ib_uverbs_ah_idr);
+DEFINE_IDR(ib_uverbs_cq_idr);
+DEFINE_IDR(ib_uverbs_qp_idr);
+DEFINE_IDR(ib_uverbs_srq_idr);
+DEFINE_IDR(ib_uverbs_xrc_domain_idr);
+
+/*
+ * Per-device bookkeeping, IB_UVERBS_MAX_DEVICES slots each.
+ * NOTE(review): map_lock presumably guards dev_table/dev_map and has no
+ * static initializer here - confirm it is initialized at module init.
+ */
+static spinlock_t map_lock;
+static struct ib_uverbs_device *dev_table[IB_UVERBS_MAX_DEVICES];
+static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES);
+
+/*
+ * Dispatch table of uverbs command handlers, indexed by the
+ * IB_USER_VERBS_CMD_* command number.  Every handler receives the uverbs
+ * file, the user buffer holding the command, and the input/output lengths.
+ * Slots without an initializer are NULL.
+ */
+static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
+				     const char __user *buf, int in_len,
+				     int out_len) = {
+	[IB_USER_VERBS_CMD_GET_CONTEXT]   	= ib_uverbs_get_context,
+	[IB_USER_VERBS_CMD_QUERY_DEVICE]  	= ib_uverbs_query_device,
+	[IB_USER_VERBS_CMD_QUERY_PORT]    	= ib_uverbs_query_port,
+	[IB_USER_VERBS_CMD_ALLOC_PD]      	= ib_uverbs_alloc_pd,
+	[IB_USER_VERBS_CMD_DEALLOC_PD]    	= ib_uverbs_dealloc_pd,
+	[IB_USER_VERBS_CMD_REG_MR]        	= ib_uverbs_reg_mr,
+	[IB_USER_VERBS_CMD_DEREG_MR]      	= ib_uverbs_dereg_mr,
+	[IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL] = ib_uverbs_create_comp_channel,
+	[IB_USER_VERBS_CMD_CREATE_CQ]     	= ib_uverbs_create_cq,
+	[IB_USER_VERBS_CMD_RESIZE_CQ]     	= ib_uverbs_resize_cq,
+	[IB_USER_VERBS_CMD_POLL_CQ]     	= ib_uverbs_poll_cq,
+	[IB_USER_VERBS_CMD_REQ_NOTIFY_CQ]     	= ib_uverbs_req_notify_cq,
+	[IB_USER_VERBS_CMD_DESTROY_CQ]    	= ib_uverbs_destroy_cq,
+	[IB_USER_VERBS_CMD_CREATE_QP]     	= ib_uverbs_create_qp,
+	[IB_USER_VERBS_CMD_QUERY_QP]     	= ib_uverbs_query_qp,
+	[IB_USER_VERBS_CMD_MODIFY_QP]     	= ib_uverbs_modify_qp,
+	[IB_USER_VERBS_CMD_DESTROY_QP]    	= ib_uverbs_destroy_qp,
+	[IB_USER_VERBS_CMD_POST_SEND]    	= ib_uverbs_post_send,
+	[IB_USER_VERBS_CMD_POST_RECV]    	= ib_uverbs_post_recv,
+	[IB_USER_VERBS_CMD_POST_SRQ_RECV]    	= ib_uverbs_post_srq_recv,
+	[IB_USER_VERBS_CMD_CREATE_AH]    	= ib_uverbs_create_ah,
+	[IB_USER_VERBS_CMD_DESTROY_AH]    	= ib_uverbs_destroy_ah,
+	[IB_USER_VERBS_CMD_ATTACH_MCAST]  	= ib_uverbs_attach_mcast,
+	[IB_USER_VERBS_CMD_DETACH_MCAST]  	= ib_uverbs_detach_mcast,
+	[IB_USER_VERBS_CMD_CREATE_SRQ]    	= ib_uverbs_create_srq,
+	[IB_USER_VERBS_CMD_MODIFY_SRQ]    	= ib_uverbs_modify_srq,
+	[IB_USER_VERBS_CMD_QUERY_SRQ]     	= ib_uverbs_query_srq,
+	[IB_USER_VERBS_CMD_DESTROY_SRQ]   	= ib_uverbs_destroy_srq,
+	/* XRC (extended reliable connected) commands. */
+	[IB_USER_VERBS_CMD_CREATE_XRC_SRQ]	= ib_uverbs_create_xrc_srq,
+	[IB_USER_VERBS_CMD_OPEN_XRC_DOMAIN]	= ib_uverbs_open_xrc_domain,
+	[IB_USER_VERBS_CMD_CLOSE_XRC_DOMAIN]	= ib_uverbs_close_xrc_domain,
+	[IB_USER_VERBS_CMD_CREATE_XRC_RCV_QP]	= ib_uverbs_create_xrc_rcv_qp,
+	[IB_USER_VERBS_CMD_MODIFY_XRC_RCV_QP]	= ib_uverbs_modify_xrc_rcv_qp,
+	[IB_USER_VERBS_CMD_QUERY_XRC_RCV_QP]	= ib_uverbs_query_xrc_rcv_qp,
+	[IB_USER_VERBS_CMD_REG_XRC_RCV_QP]	= ib_uverbs_reg_xrc_rcv_qp,
+	[IB_USER_VERBS_CMD_UNREG_XRC_RCV_QP]	= ib_uverbs_unreg_xrc_rcv_qp,
+};
+
+#ifdef __linux__
+/*
+ * Mount point for the event files; only built on Linux.  BSD does not
+ * require a fake mount point for all files.
+ */
+static struct vfsmount *uverbs_event_mnt;
+#endif
+
+/* Forward declarations of the per-device add/remove callbacks. */
+static void ib_uverbs_add_one(struct ib_device *device);
+static void ib_uverbs_remove_one(struct ib_device *device);
+
+/*
+ * kref release callback for struct ib_uverbs_device: signals dev->comp so
+ * that whoever waits on the completion learns the last reference is gone.
+ */
+static void ib_uverbs_release_dev(struct kref *ref)
+{
+	complete(&container_of(ref, struct ib_uverbs_device, ref)->comp);
+}
+
+/* kref release callback for struct ib_uverbs_event_file: frees the file. */
+static void ib_uverbs_release_event_file(struct kref *ref)
+{
+	kfree(container_of(ref, struct ib_uverbs_event_file, ref));
+}
+
+/*
+ * Discard all events still queued for a CQ uobject.
+ *
+ * Completion events are dropped under @ev_file's lock (when the CQ has a
+ * completion event file, whose reference is then released); asynchronous
+ * events are dropped under the lock of @file's async event file.
+ */
+void ib_uverbs_release_ucq(struct ib_uverbs_file *file,
+			  struct ib_uverbs_event_file *ev_file,
+			  struct ib_ucq_object *uobj)
+{
+	struct ib_uverbs_event *ev, *next;
+
+	if (ev_file) {
+		spin_lock_irq(&ev_file->lock);
+		list_for_each_entry_safe(ev, next, &uobj->comp_list, obj_list) {
+			/* Unlink from the event file's queue, then free. */
+			list_del(&ev->list);
+			kfree(ev);
+		}
+		spin_unlock_irq(&ev_file->lock);
+
+		kref_put(&ev_file->ref, ib_uverbs_release_event_file);
+	}
+
+	spin_lock_irq(&file->async_file->lock);
+	list_for_each_entry_safe(ev, next, &uobj->async_list, obj_list) {
+		list_del(&ev->list);
+		kfree(ev);
+	}
+	spin_unlock_irq(&file->async_file->lock);
+}
+
+/*
+ * Discard all asynchronous events still queued for an event-carrying
+ * uobject (QP or SRQ), under the lock of @file's async event file.
+ */
+void ib_uverbs_release_uevent(struct ib_uverbs_file *file,
+			      struct ib_uevent_object *uobj)
+{
+	struct ib_uverbs_event *ev, *next;
+
+	spin_lock_irq(&file->async_file->lock);
+	list_for_each_entry_safe(ev, next, &uobj->event_list, obj_list) {
+		/* Unlink from the async file's queue, then free. */
+		list_del(&ev->list);
+		kfree(ev);
+	}
+	spin_unlock_irq(&file->async_file->lock);
+}
+
+/*
+ * Detach @qp from every multicast group recorded on its uobject and free
+ * the corresponding bookkeeping entries.
+ */
+static void ib_uverbs_detach_umcast(struct ib_qp *qp,
+				    struct ib_uqp_object *uobj)
+{
+	struct ib_uverbs_mcast_entry *grp, *next;
+
+	list_for_each_entry_safe(grp, next, &uobj->mcast_list, list) {
+		ib_detach_mcast(qp, &grp->gid, grp->lid);
+		list_del(&grp->list);
+		kfree(grp);
+	}
+}
+
+/*
+ * ib_uverbs_cleanup_ucontext - destroy every object still owned by a user
+ * context and then the context itself.
+ *
+ * Objects are torn down in the order AHs, QPs, SRQs, CQs, MRs, XRC domains
+ * and finally PDs.  Each uobject is removed from its IDR, the underlying
+ * verbs object is destroyed, and the uobject is freed.
+ *
+ * Returns 0 when @context is NULL, otherwise the result of the device's
+ * dealloc_ucontext method.
+ */
+static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
+				      struct ib_ucontext *context)
+{
+	struct ib_uobject *uobj, *tmp;
+
+	if (!context)
+		return 0;
+
+	context->closing = 1;
+
+	list_for_each_entry_safe(uobj, tmp, &context->ah_list, list) {
+		struct ib_ah *ah = uobj->object;
+
+		idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
+		ib_destroy_ah(ah);
+		kfree(uobj);
+	}
+
+	list_for_each_entry_safe(uobj, tmp, &context->qp_list, list) {
+		struct ib_qp *qp = uobj->object;
+		struct ib_uqp_object *uqp =
+			container_of(uobj, struct ib_uqp_object, uevent.uobject);
+
+		idr_remove_uobj(&ib_uverbs_qp_idr, uobj);
+		ib_uverbs_detach_umcast(qp, uqp);
+		ib_destroy_qp(qp);
+		ib_uverbs_release_uevent(file, &uqp->uevent);
+		kfree(uqp);
+	}
+
+
+	list_for_each_entry_safe(uobj, tmp, &context->srq_list, list) {
+		struct ib_srq *srq = uobj->object;
+		struct ib_uevent_object *uevent =
+			container_of(uobj, struct ib_uevent_object, uobject);
+
+		idr_remove_uobj(&ib_uverbs_srq_idr, uobj);
+		ib_destroy_srq(srq);
+		ib_uverbs_release_uevent(file, uevent);
+		kfree(uevent);
+	}
+
+	list_for_each_entry_safe(uobj, tmp, &context->cq_list, list) {
+		struct ib_cq *cq = uobj->object;
+		struct ib_uverbs_event_file *ev_file = cq->cq_context;
+		struct ib_ucq_object *ucq =
+			container_of(uobj, struct ib_ucq_object, uobject);
+
+		idr_remove_uobj(&ib_uverbs_cq_idr, uobj);
+		ib_destroy_cq(cq);
+		ib_uverbs_release_ucq(file, ev_file, ucq);
+		kfree(ucq);
+	}
+
+	/* XXX Free MWs */
+
+	list_for_each_entry_safe(uobj, tmp, &context->mr_list, list) {
+		struct ib_mr *mr = uobj->object;
+
+		idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
+		ib_dereg_mr(mr);
+		kfree(uobj);
+	}
+
+	/* The XRC domain table and registered-QP lists need the table mutex. */
+	mutex_lock(&file->device->ib_dev->xrcd_table_mutex);
+	list_for_each_entry_safe(uobj, tmp, &context->xrc_domain_list, list) {
+		struct ib_xrcd *xrcd = uobj->object;
+		struct ib_uxrc_rcv_object *xrc_qp_obj, *tmp1;
+		struct ib_uxrcd_object *xrcd_uobj =
+			container_of(uobj, struct ib_uxrcd_object, uobject);
+
+		/* Unregister every XRC receive QP still tied to this handle. */
+		list_for_each_entry_safe(xrc_qp_obj, tmp1,
+					 &xrcd_uobj->xrc_reg_qp_list, list) {
+			list_del(&xrc_qp_obj->list);
+			ib_uverbs_cleanup_xrc_rcv_qp(file, xrcd,
+						     xrc_qp_obj->qp_num);
+			kfree(xrc_qp_obj);
+		}
+
+		idr_remove_uobj(&ib_uverbs_xrc_domain_idr, uobj);
+		ib_uverbs_dealloc_xrcd(file->device->ib_dev, xrcd);
+		/*
+		 * Free the containing object, as the QP/SRQ/CQ loops do,
+		 * instead of relying on uobject being its first member.
+		 */
+		kfree(xrcd_uobj);
+	}
+	mutex_unlock(&file->device->ib_dev->xrcd_table_mutex);
+
+	list_for_each_entry_safe(uobj, tmp, &context->pd_list, list) {
+		struct ib_pd *pd = uobj->object;
+
+		idr_remove_uobj(&ib_uverbs_pd_idr, uobj);
+		ib_dealloc_pd(pd);
+		kfree(uobj);
+	}
+
+	return context->device->dealloc_ucontext(context);
+}
+
+/*
+ * kref release callback for struct ib_uverbs_file: puts the owning module
+ * and the uverbs device reference, then frees the file structure.
+ */
+static void ib_uverbs_release_file(struct kref *ref)
+{
+	struct ib_uverbs_file *uv_file;
+	struct ib_uverbs_device *uv_dev;
+
+	uv_file = container_of(ref, struct ib_uverbs_file, ref);
+	uv_dev = uv_file->device;
+
+	module_put(uv_dev->ib_dev->owner);
+	kref_put(&uv_dev->ref, ib_uverbs_release_dev);
+
+	kfree(uv_file);
+}
+
+/*
+ * read() handler for an event file.  Waits (unless O_NONBLOCK is set) until
+ * an event is queued, then copies one event descriptor to @buf.
+ *
+ * Returns the descriptor size on success, -EAGAIN when the queue is empty
+ * in non-blocking mode, -ERESTARTSYS when the wait is interrupted, -EINVAL
+ * when @count is smaller than one descriptor (the event stays queued), or
+ * -EFAULT when the copy to user space fails.
+ */
+static ssize_t ib_uverbs_event_read(struct file *filp, char __user *buf,
+				    size_t count, loff_t *pos)
+{
+	struct ib_uverbs_event_file *ev_file = filp->private_data;
+	struct ib_uverbs_event *ev;
+	int desc_len;
+	int rc;
+
+	spin_lock_irq(&ev_file->lock);
+
+	/* Sleep with the lock dropped until something is on the queue. */
+	for (;;) {
+		if (!list_empty(&ev_file->event_list))
+			break;
+
+		spin_unlock_irq(&ev_file->lock);
+
+		if (filp->f_flags & O_NONBLOCK)
+			return -EAGAIN;
+
+		if (wait_event_interruptible(ev_file->poll_wait,
+					     !list_empty(&ev_file->event_list)))
+			return -ERESTARTSYS;
+
+		spin_lock_irq(&ev_file->lock);
+	}
+
+	/* Async and completion files carry differently sized descriptors. */
+	desc_len = ev_file->is_async ?
+		sizeof (struct ib_uverbs_async_event_desc) :
+		sizeof (struct ib_uverbs_comp_event_desc);
+
+	if (desc_len > count) {
+		spin_unlock_irq(&ev_file->lock);
+		return -EINVAL;
+	}
+
+	/* Dequeue the oldest event and account for it on its owner object. */
+	ev = list_entry(ev_file->event_list.next, struct ib_uverbs_event, list);
+	list_del(&ev->list);
+	if (ev->counter) {
+		++(*ev->counter);
+		list_del(&ev->obj_list);
+	}
+
+	spin_unlock_irq(&ev_file->lock);
+
+	if (copy_to_user(buf, ev, desc_len))
+		rc = -EFAULT;
+	else
+		rc = desc_len;
+
+	/* The event is already off the queue, so it is freed either way. */
+	kfree(ev);
+
+	return rc;
+}
+
+/*
+ * poll() handler for an event file: registers on the file's wait queue and
+ * reports POLLIN | POLLRDNORM when at least one event is queued.
+ */
+static unsigned int ib_uverbs_event_poll(struct file *filp,
+					 struct poll_table_struct *wait)
+{
+	struct ib_uverbs_event_file *ev_file = filp->private_data;
+	unsigned int mask;
+
+	/* Record the struct file; the event handlers selwakeup() through it. */
+	ev_file->filp = filp;
+	poll_wait(filp, &ev_file->poll_wait, wait);
+
+	spin_lock_irq(&ev_file->lock);
+	mask = list_empty(&ev_file->event_list) ? 0 : (POLLIN | POLLRDNORM);
+	spin_unlock_irq(&ev_file->lock);
+
+	return mask;
+}
+
+/*
+ * fasync() handler: hands the request to fasync_helper() using the event
+ * file's async_queue and returns its result.
+ */
+static int ib_uverbs_event_fasync(int fd, struct file *filp, int on)
+{
+	struct ib_uverbs_event_file *ev_file = filp->private_data;
+	int rc;
+
+	rc = fasync_helper(fd, filp, on, &ev_file->async_queue);
+
+	return rc;
+}
+
+/*
+ * release() handler for an event file.  Marks the file closed, frees all
+ * events still queued on it and drops the event file reference.  For the
+ * async event file it also unregisters the IB event handler and drops a
+ * reference on the owning uverbs file.  Always returns 0.
+ */
+static int ib_uverbs_event_close(struct inode *inode, struct file *filp)
+{
+	struct ib_uverbs_event_file *ev_file = filp->private_data;
+	struct ib_uverbs_event *ev, *next;
+
+	/* is_closed is set under the lock the event handlers also take. */
+	spin_lock_irq(&ev_file->lock);
+	ev_file->is_closed = 1;
+	list_for_each_entry_safe(ev, next, &ev_file->event_list, list) {
+		/* Entries with a counter are also linked on an object list. */
+		if (ev->counter)
+			list_del(&ev->obj_list);
+		kfree(ev);
+	}
+	spin_unlock_irq(&ev_file->lock);
+
+	if (ev_file->is_async) {
+		struct ib_uverbs_file *uv_file = ev_file->uverbs_file;
+
+		ib_unregister_event_handler(&uv_file->event_handler);
+		kref_put(&uv_file->ref, ib_uverbs_release_file);
+	}
+
+	kref_put(&ev_file->ref, ib_uverbs_release_event_file);
+
+	return 0;
+}
+
+static const struct file_operations uverbs_event_fops = {	/* event file methods */
+	.owner	 = THIS_MODULE,
+	.read 	 = ib_uverbs_event_read,	/* hands out one queued event */
+	.poll    = ib_uverbs_event_poll,	/* readable when an event is queued */
+	.release = ib_uverbs_event_close,	/* frees events still queued */
+	.fasync  = ib_uverbs_event_fasync	/* SIGIO registration */
+};
+
+/*
+ * CQ completion callback.  Queues a completion event carrying the CQ's
+ * user handle on the event file passed as @cq_context, links it on the
+ * CQ object's comp_list and wakes up readers, pollers and SIGIO listeners.
+ * The event is dropped when there is no event file, the file has been
+ * closed, or the GFP_ATOMIC allocation fails.
+ */
+void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context)
+{
+	struct ib_uverbs_event_file *ev_file = cq_context;
+	struct ib_ucq_object *ucq;
+	struct ib_uverbs_event *ev;
+	unsigned long irq_flags;
+
+	if (ev_file == NULL)
+		return;
+
+	spin_lock_irqsave(&ev_file->lock, irq_flags);
+
+	if (ev_file->is_closed)
+		goto drop;
+
+	ev = kmalloc(sizeof *ev, GFP_ATOMIC);
+	if (ev == NULL)
+		goto drop;
+
+	ucq = container_of(cq->uobject, struct ib_ucq_object, uobject);
+
+	ev->desc.comp.cq_handle = cq->uobject->user_handle;
+	ev->counter = &ucq->comp_events_reported;
+
+	list_add_tail(&ev->list, &ev_file->event_list);
+	list_add_tail(&ev->obj_list, &ucq->comp_list);
+	spin_unlock_irqrestore(&ev_file->lock, irq_flags);
+
+	/* Notify sleepers in read(), select/poll waiters and fasync users. */
+	wake_up_interruptible(&ev_file->poll_wait);
+	if (ev_file->filp)
+		selwakeup(&ev_file->filp->f_selinfo);
+	kill_fasync(&ev_file->async_queue, SIGIO, POLL_IN);
+	return;
+
+drop:
+	spin_unlock_irqrestore(&ev_file->lock, irq_flags);
+}
+
+/*
+ * Queue an asynchronous event on @file's async event file and wake up
+ * readers, pollers and SIGIO listeners.  @element and @event fill the
+ * async descriptor and @counter is stored in the entry; when @obj_list is
+ * non-NULL the entry is linked on it as well.  The event is dropped when
+ * the async file has been closed or the GFP_ATOMIC allocation fails.
+ */
+static void ib_uverbs_async_handler(struct ib_uverbs_file *file,
+				    __u64 element, __u64 event,
+				    struct list_head *obj_list,
+				    u32 *counter)
+{
+	struct ib_uverbs_event_file *async_file = file->async_file;
+	struct ib_uverbs_event *ev;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&async_file->lock, irq_flags);
+
+	/* No allocation is attempted once the file is marked closed. */
+	ev = async_file->is_closed ? NULL : kmalloc(sizeof *ev, GFP_ATOMIC);
+	if (ev == NULL) {
+		spin_unlock_irqrestore(&async_file->lock, irq_flags);
+		return;
+	}
+
+	ev->desc.async.element = element;
+	ev->desc.async.event_type = event;
+	ev->counter = counter;
+
+	list_add_tail(&ev->list, &async_file->event_list);
+	if (obj_list != NULL)
+		list_add_tail(&ev->obj_list, obj_list);
+	spin_unlock_irqrestore(&async_file->lock, irq_flags);
+
+	wake_up_interruptible(&async_file->poll_wait);
+	if (async_file->filp)
+		selwakeup(&async_file->filp->f_selinfo);
+	kill_fasync(&async_file->async_queue, SIGIO, POLL_IN);
+}
+
+/*
+ * CQ asynchronous event callback: reports @event on the async event file of
+ * the uverbs file that owns the CQ, identified by the CQ's user handle.
+ * @context_ptr is not used.
+ */
+void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr)
+{
+	struct ib_ucq_object *ucq;
+
+	ucq = container_of(event->element.cq->uobject,
+			   struct ib_ucq_object, uobject);
+
+	ib_uverbs_async_handler(ucq->uverbs_file, ucq->uobject.user_handle,
+				event->event, &ucq->async_list,
+				&ucq->async_events_reported);
+}
+
+/*
+ * QP asynchronous event callback: reports @event, tagged with the QP's user
+ * handle, through the uverbs file passed as @context_ptr.
+ */
+void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr)
+{
+	struct ib_uevent_object *uevent =
+		container_of(event->element.qp->uobject,
+			     struct ib_uevent_object, uobject);
+
+	ib_uverbs_async_handler(context_ptr, uevent->uobject.user_handle,
+				event->event, &uevent->event_list,
+				&uevent->events_reported);
+}
+
+/*
+ * SRQ asynchronous event callback: reports @event, tagged with the SRQ's
+ * user handle, through the uverbs file passed as @context_ptr.
+ */
+void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr)
+{
+	struct ib_uevent_object *uevent =
+		container_of(event->element.srq->uobject,
+			     struct ib_uevent_object, uobject);
+
+	ib_uverbs_async_handler(context_ptr, uevent->uobject.user_handle,
+				event->event, &uevent->event_list,
+				&uevent->events_reported);
+}
+
+/*
+ * Device-level event callback registered through the uverbs file's
+ * event_handler member: reports @event with the port number as element.
+ * No object list or counter is associated with these events.
+ */
+void ib_uverbs_event_handler(struct ib_event_handler *handler,
+			     struct ib_event *event)
+{
+	struct ib_uverbs_file *uv_file;
+
+	uv_file = container_of(handler, struct ib_uverbs_file, event_handler);
+
+	ib_uverbs_async_handler(uv_file, event->element.port_num,
+				event->event, NULL, NULL);
+}
+
+/*
+ * XRC receive QP event callback: reports @event with the XRC QP number as
+ * element through the uverbs file passed as @context_ptr.
+ */
+void ib_uverbs_xrc_rcv_qp_event_handler(struct ib_event *event,
+					void *context_ptr)
+{
+	struct ib_uverbs_file *uv_file = context_ptr;
+
+	ib_uverbs_async_handler(uv_file, event->element.xrc_qp_num,
+				event->event, NULL, NULL);
+}
+
+/*
+ * Allocate an event file for @uverbs_file together with an unused file
+ * descriptor, which is stored in *@fd.  @is_async selects the async event
+ * file rather than a completion event file.
+ *
+ * Returns the new struct file, or ERR_PTR(-ENOMEM), ERR_PTR(-ENFILE) or
+ * the ERR_PTR-encoded get_unused_fd() error on failure.
+ */
+struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
+					int is_async, int *fd)
+{
+	struct ib_uverbs_event_file *evf;
+	struct file *new_filp;
+	int err;
+
+	evf = kmalloc(sizeof *evf, GFP_KERNEL);
+	if (evf == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	/* kmalloc() memory is not zeroed, so set the fields explicitly. */
+	evf->uverbs_file = uverbs_file;
+	evf->async_queue = NULL;
+	evf->filp = NULL;
+	evf->is_async = is_async;
+	evf->is_closed = 0;
+	kref_init(&evf->ref);
+	spin_lock_init(&evf->lock);
+	INIT_LIST_HEAD(&evf->event_list);
+	init_waitqueue_head(&evf->poll_wait);
+
+	*fd = get_unused_fd();
+	if (*fd < 0) {
+		err = *fd;
+		goto free_evf;
+	}
+
+	/*
+	 * fops_get() can't fail here, because we're coming from a
+	 * system call on a uverbs file, which will already have a
+	 * module reference.
+	 */
+	new_filp = alloc_file(uverbs_event_mnt,
+			      dget(uverbs_event_mnt->mnt_root),
+			      FMODE_READ, fops_get(&uverbs_event_fops));
+	if (new_filp == NULL) {
+		err = -ENFILE;
+		goto put_fd;
+	}
+
+	new_filp->private_data = evf;
+
+	return new_filp;
+
+put_fd:
+	put_unused_fd(*fd);
+
+free_evf:
+	kfree(evf);
+	return ERR_PTR(err);
+}
+
+/*
+ * Resolve @fd to a completion event file.  On success a reference is
+ * taken on the returned event file structure.  NULL is returned when @fd
+ * is not open, is not a uverbs event file, or is an async event file.
+ */
+struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd)
+{
+	struct ib_uverbs_event_file *found = NULL;
+	struct file *filp = fget(fd);
+
+	if (filp == NULL)
+		return NULL;
+
+	/* Only files created with uverbs_event_fops carry an event file. */
+	if (filp->f_op == &uverbs_event_fops) {
+		struct ib_uverbs_event_file *candidate = filp->private_data;
+
+		if (!candidate->is_async) {
+			kref_get(&candidate->ref);
+			found = candidate;
+		}
+	}
+
+	fput(filp);
+	return found;
+}
+
+/*
+ * write() handler: command entry point of the uverbs device.  The buffer
+ * starts with a struct ib_uverbs_cmd_hdr whose in_words field (in 4-byte
+ * units) must match @count exactly.  The command is dispatched through
+ * uverbs_cmd_table[] with the payload following the header.
+ *
+ * Returns the handler's result, -EFAULT if the header cannot be copied in,
+ * or -EINVAL for a short write, a length mismatch, a command that is
+ * unknown or not enabled in the device's uverbs_cmd_mask, or any command
+ * other than GET_CONTEXT issued before a context exists.
+ */
+static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
+			     size_t count, loff_t *pos)
+{
+	struct ib_uverbs_file *uv_file = filp->private_data;
+	struct ib_uverbs_cmd_hdr hdr;
+
+	if (count < sizeof hdr)
+		return -EINVAL;
+	if (copy_from_user(&hdr, buf, sizeof hdr))
+		return -EFAULT;
+	if (hdr.in_words * 4 != count)
+		return -EINVAL;
+
+	/* The range check precedes both the table lookup and the shift. */
+	if (hdr.command < 0 || hdr.command >= ARRAY_SIZE(uverbs_cmd_table))
+		return -EINVAL;
+	if (!uverbs_cmd_table[hdr.command])
+		return -EINVAL;
+	if (!(uv_file->device->ib_dev->uverbs_cmd_mask &
+	      (1ull << hdr.command)))
+		return -EINVAL;
+
+	if (!uv_file->ucontext &&
+	    hdr.command != IB_USER_VERBS_CMD_GET_CONTEXT)
+		return -EINVAL;
+
+	return uverbs_cmd_table[hdr.command](uv_file, buf + sizeof hdr,
+					     hdr.in_words * 4,
+					     hdr.out_words * 4);
+}
+
+/*
+ * mmap() handler: forwards the request to the IB device's mmap method.
+ * Returns -ENODEV when the file has no user context yet.
+ */
+static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct ib_uverbs_file *uv_file = filp->private_data;
+
+	if (uv_file->ucontext == NULL)
+		return -ENODEV;
+
+	return uv_file->device->ib_dev->mmap(uv_file->ucontext, vma);
+}
+
+/*
+ * ib_uverbs_open() does not need the BKL:
+ *
+ *  - dev_table[] accesses are protected by map_lock, the
+ *    ib_uverbs_device structures are properly reference counted, and
+ *    everything else is purely local to the file being created, so
+ *    races against other open calls are not a problem;
+ *  - there is no ioctl method to race against;
+ *  - the device is added to dev_table[] as the last part of module
+ *    initialization, so the open method will either immediately
+ *    return -ENXIO, or all required initialization will be done.
+ */
+static int ib_uverbs_open(struct inode *inode, struct file *filp)
+{
+	struct ib_uverbs_device *uv_dev;
+	struct ib_uverbs_file *uv_file;
+	int err;
+
+	/* Look the device up and take a reference while map_lock is held. */
+	spin_lock(&map_lock);
+	uv_dev = dev_table[iminor(inode) - IB_UVERBS_BASE_MINOR];
+	if (uv_dev != NULL)
+		kref_get(&uv_dev->ref);
+	spin_unlock(&map_lock);
+
+	if (uv_dev == NULL)
+		return -ENXIO;
+
+	if (!try_module_get(uv_dev->ib_dev->owner)) {
+		err = -ENODEV;
+		goto put_dev;
+	}
+
+	uv_file = kmalloc(sizeof *uv_file, GFP_KERNEL);
+	if (uv_file == NULL) {
+		err = -ENOMEM;
+		goto put_module;
+	}
+
+	/* A fresh file has neither a user context nor an async event file. */
+	kref_init(&uv_file->ref);
+	mutex_init(&uv_file->mutex);
+	uv_file->device = uv_dev;
+	uv_file->ucontext = NULL;
+	uv_file->async_file = NULL;
+
+	filp->private_data = uv_file;
+
+	return 0;
+
+put_module:
+	module_put(uv_dev->ib_dev->owner);
+
+put_dev:
+	kref_put(&uv_dev->ref, ib_uverbs_release_dev);
+	return err;
+}
+
+/*
+ * release() handler of the uverbs device: cleans up the user context (if
+ * any), drops the reference on the async event file (if any) and then the
+ * reference on the uverbs file itself.  Always returns 0.
+ */
+static int ib_uverbs_close(struct inode *inode, struct file *filp)
+{
+	struct ib_uverbs_file *uv_file = filp->private_data;
+	struct ib_uverbs_event_file *async_file;
+
+	ib_uverbs_cleanup_ucontext(uv_file, uv_file->ucontext);
+
+	async_file = uv_file->async_file;
+	if (async_file != NULL)
+		kref_put(&async_file->ref, ib_uverbs_release_event_file);
+
+	kref_put(&uv_file->ref, ib_uverbs_release_file);
+
+	return 0;
+}
+
+static const struct file_operations uverbs_fops = {	/* for devices without mmap */
+	.owner 	 = THIS_MODULE,
+	.write 	 = ib_uverbs_write,	/* command entry point */
+	.open 	 = ib_uverbs_open,
+	.release = ib_uverbs_close
+};
+
+static const struct file_operations uverbs_mmap_fops = {	/* for devices with mmap */
+	.owner 	 = THIS_MODULE,
+	.write 	 = ib_uverbs_write,	/* command entry point */
+	.mmap    = ib_uverbs_mmap,	/* forwarded to the IB device's mmap */
+	.open 	 = ib_uverbs_open,
+	.release = ib_uverbs_close
+};
+
+static struct ib_client uverbs_client = {	/* registered in ib_uverbs_init() */
+	.name   = "uverbs",
+	.add    = ib_uverbs_add_one,	/* creates the uverbs device for an IB device */
+	.remove = ib_uverbs_remove_one	/* destroys it again */
+};
+
+/*
+ * sysfs "ibdev" attribute: prints the name of the underlying IB device.
+ * Returns -ENODEV once the driver data has been cleared.
+ */
+static ssize_t show_ibdev(struct device *device, struct device_attribute *attr,
+			  char *buf)
+{
+	struct ib_uverbs_device *uv_dev = dev_get_drvdata(device);
+
+	/* ib_uverbs_remove_one() resets drvdata to NULL. */
+	if (uv_dev == NULL)
+		return -ENODEV;
+
+	return sprintf(buf, "%s\n", uv_dev->ib_dev->name);
+}
+static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
+
+/*
+ * Per-device sysfs "abi_version" attribute: prints the uverbs ABI version
+ * of the underlying IB device.  Returns -ENODEV once the driver data has
+ * been cleared.
+ */
+static ssize_t show_dev_abi_version(struct device *device,
+				    struct device_attribute *attr, char *buf)
+{
+	struct ib_uverbs_device *uv_dev = dev_get_drvdata(device);
+
+	if (uv_dev == NULL)
+		return -ENODEV;
+
+	return sprintf(buf, "%d\n", uv_dev->ib_dev->uverbs_abi_ver);
+}
+static DEVICE_ATTR(abi_version, S_IRUGO, show_dev_abi_version, NULL);
+
+/* Class-level sysfs "abi_version" attribute: prints IB_USER_VERBS_ABI_VERSION. */
+static ssize_t show_abi_version(struct class *class, char *buf)
+{
+	int len;
+
+	len = sprintf(buf, "%d\n", IB_USER_VERBS_ABI_VERSION);
+
+	return len;
+}
+static CLASS_ATTR(abi_version, S_IRUGO, show_abi_version, NULL);
+
+#include <linux/pci.h>
+
+/*
+ * sysfs "device" attribute: prints the device ID of the PCI device behind
+ * the IB device as a four-digit hex number.  The IB device's dma_device is
+ * treated as a struct pci_dev here.  Returns -ENODEV once the driver data
+ * has been cleared.
+ */
+static ssize_t
+show_dev_device(struct device *device, struct device_attribute *attr, char *buf)
+{
+	struct ib_uverbs_device *uv_dev = dev_get_drvdata(device);
+	struct pci_dev *pdev;
+
+	if (uv_dev == NULL)
+		return -ENODEV;
+
+	pdev = (struct pci_dev *)uv_dev->ib_dev->dma_device;
+
+	return sprintf(buf, "0x%04x\n", pdev->device);
+}
+static DEVICE_ATTR(device, S_IRUGO, show_dev_device, NULL);
+
+/*
+ * sysfs "vendor" attribute: prints the vendor ID of the PCI device behind
+ * the IB device as a four-digit hex number.  The IB device's dma_device is
+ * treated as a struct pci_dev here.  Returns -ENODEV once the driver data
+ * has been cleared.
+ */
+static ssize_t
+show_dev_vendor(struct device *device, struct device_attribute *attr, char *buf)
+{
+	struct ib_uverbs_device *uv_dev = dev_get_drvdata(device);
+	struct pci_dev *pdev;
+
+	if (uv_dev == NULL)
+		return -ENODEV;
+
+	pdev = (struct pci_dev *)uv_dev->ib_dev->dma_device;
+
+	return sprintf(buf, "0x%04x\n", pdev->vendor);
+}
+static DEVICE_ATTR(vendor, S_IRUGO, show_dev_vendor, NULL);
+
+struct attribute *device_attrs[] =	/* NOTE(review): not static; confirm no other file needs this symbol */
+{
+	&dev_attr_device.attr,	/* PCI device ID, see show_dev_device() */
+	&dev_attr_vendor.attr,	/* PCI vendor ID, see show_dev_vendor() */
+	NULL			/* terminator */
+};
+
+/* Attribute group "device", created on each uverbs device in ib_uverbs_add_one(). */
+static struct attribute_group device_group = {
+	.name	= "device",
+	.attrs	= device_attrs
+};
+
+/*
+ * ib_client "add" callback: create the uverbs character device, class
+ * device and sysfs attributes for @device and publish it in dev_table[].
+ * Devices without an alloc_ucontext method are ignored.  On any failure
+ * everything set up so far is undone and the device is left without a
+ * uverbs node.
+ */
+static void ib_uverbs_add_one(struct ib_device *device)
+{
+	struct ib_uverbs_device *uverbs_dev;
+
+	if (!device->alloc_ucontext)
+		return;
+
+	uverbs_dev = kzalloc(sizeof *uverbs_dev, GFP_KERNEL);
+	if (!uverbs_dev)
+		return;
+
+	kref_init(&uverbs_dev->ref);
+	init_completion(&uverbs_dev->comp);
+
+	/* Reserve the first free device number under map_lock. */
+	spin_lock(&map_lock);
+	uverbs_dev->devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES);
+	if (uverbs_dev->devnum >= IB_UVERBS_MAX_DEVICES) {
+		spin_unlock(&map_lock);
+		/* No bit was reserved, so skip the clear_bit() below. */
+		goto err;
+	}
+	set_bit(uverbs_dev->devnum, dev_map);
+	spin_unlock(&map_lock);
+
+	uverbs_dev->ib_dev           = device;
+	uverbs_dev->num_comp_vectors = device->num_comp_vectors;
+
+	uverbs_dev->cdev = cdev_alloc();
+	if (!uverbs_dev->cdev)
+		goto err_devnum;	/* release the reserved device number */
+	uverbs_dev->cdev->owner = THIS_MODULE;
+	/* Only offer mmap() when the IB device implements it. */
+	uverbs_dev->cdev->ops = device->mmap ? &uverbs_mmap_fops : &uverbs_fops;
+	kobject_set_name(&uverbs_dev->cdev->kobj, "uverbs%d", uverbs_dev->devnum);
+	if (cdev_add(uverbs_dev->cdev, IB_UVERBS_BASE_DEV + uverbs_dev->devnum, 1))
+		goto err_cdev;
+
+	uverbs_dev->dev = device_create(uverbs_class, device->dma_device,
+					uverbs_dev->cdev->dev, uverbs_dev,
+					"uverbs%d", uverbs_dev->devnum);
+	if (IS_ERR(uverbs_dev->dev))
+		goto err_cdev;
+
+	if (device_create_file(uverbs_dev->dev, &dev_attr_ibdev))
+		goto err_class;
+	if (device_create_file(uverbs_dev->dev, &dev_attr_abi_version))
+		goto err_class;
+	if (sysfs_create_group(&uverbs_dev->dev->kobj, &device_group))
+		goto err_class;
+
+	/* Publish last: ib_uverbs_open() finds the device through dev_table[]. */
+	spin_lock(&map_lock);
+	dev_table[uverbs_dev->devnum] = uverbs_dev;
+	spin_unlock(&map_lock);
+
+	ib_set_client_data(device, &uverbs_client, uverbs_dev);
+
+	return;
+
+err_class:
+	device_destroy(uverbs_class, uverbs_dev->cdev->dev);
+
+err_cdev:
+	cdev_del(uverbs_dev->cdev);
+
+err_devnum:
+	clear_bit(uverbs_dev->devnum, dev_map);
+
+err:
+	kref_put(&uverbs_dev->ref, ib_uverbs_release_dev);
+	wait_for_completion(&uverbs_dev->comp);
+	kfree(uverbs_dev);
+	return;
+}
+
+/*
+ * ib_client "remove" callback: tear down the uverbs device created for
+ * @device by ib_uverbs_add_one().  Does nothing when no uverbs device was
+ * registered for it.  Waits on the device's completion before freeing it.
+ */
+static void ib_uverbs_remove_one(struct ib_device *device)
+{
+	struct ib_uverbs_device *uv_dev;
+
+	uv_dev = ib_get_client_data(device, &uverbs_client);
+	if (uv_dev == NULL)
+		return;
+
+	/* Clearing drvdata makes the sysfs show routines return -ENODEV. */
+	sysfs_remove_group(&uv_dev->dev->kobj, &device_group);
+	dev_set_drvdata(uv_dev->dev, NULL);
+	device_destroy(uverbs_class, uv_dev->cdev->dev);
+	cdev_del(uv_dev->cdev);
+
+	/* Unpublish the device so ib_uverbs_open() no longer finds it. */
+	spin_lock(&map_lock);
+	dev_table[uv_dev->devnum] = NULL;
+	spin_unlock(&map_lock);
+
+	clear_bit(uv_dev->devnum, dev_map);
+
+	/* Drop the initial reference and wait on the device's completion. */
+	kref_put(&uv_dev->ref, ib_uverbs_release_dev);
+	wait_for_completion(&uv_dev->comp);
+	kfree(uv_dev);
+}
+#ifdef __linux__
+/*
+ * get_sb method of the event pseudo filesystem: sets up the superblock
+ * through get_sb_pseudo() and returns its result.
+ */
+static int uverbs_event_get_sb(struct file_system_type *fs_type, int flags,
+			       const char *dev_name, void *data,
+			       struct vfsmount *mnt)
+{
+	int rc;
+
+	rc = get_sb_pseudo(fs_type, "infinibandevent:", NULL,
+			   INFINIBANDEVENTFS_MAGIC, mnt);
+
+	return rc;
+}
+
+static struct file_system_type uverbs_event_fs = {	/* Linux-only, see the surrounding #ifdef */
+	/* No owner field so module can be unloaded */
+	.name    = "infinibandeventfs",
+	.get_sb  = uverbs_event_get_sb,	/* builds the pseudo superblock */
+	.kill_sb = kill_litter_super
+};
+#endif
+
+static int __init ib_uverbs_init(void)
+{
+	int ret;
+	/* Module init: returns 0, or the error of the first step that failed. */
+	spin_lock_init(&map_lock);
+	/* Reserve the character device number range for the uverbs devices. */
+	ret = register_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES,
+				     "infiniband_verbs");
+	if (ret) {
+		printk(KERN_ERR "user_verbs: couldn't register device number\n");
+		goto out;
+	}
+	/* Device class under which ib_uverbs_add_one() creates its devices. */
+	uverbs_class = class_create(THIS_MODULE, "infiniband_verbs");
+	if (IS_ERR(uverbs_class)) {
+		ret = PTR_ERR(uverbs_class);
+		printk(KERN_ERR "user_verbs: couldn't create class infiniband_verbs\n");
+		goto out_chrdev;
+	}
+	/* Class-wide abi_version attribute. */
+	ret = class_create_file(uverbs_class, &class_attr_abi_version);
+	if (ret) {
+		printk(KERN_ERR "user_verbs: couldn't create abi_version attribute\n");
+		goto out_class;
+	}
+	/* The event pseudo filesystem is registered and mounted on Linux only. */
+#ifdef __linux__
+	ret = register_filesystem(&uverbs_event_fs);
+	if (ret) {
+		printk(KERN_ERR "user_verbs: couldn't register infinibandeventfs\n");
+		goto out_class;
+	}
+
+	uverbs_event_mnt = kern_mount(&uverbs_event_fs);
+	if (IS_ERR(uverbs_event_mnt)) {
+		ret = PTR_ERR(uverbs_event_mnt);
+		printk(KERN_ERR "user_verbs: couldn't mount infinibandeventfs\n");
+		goto out_fs;
+	}
+#endif
+	/* Registered last, after everything ib_uverbs_add_one() uses is set up. */
+	ret = ib_register_client(&uverbs_client);
+	if (ret) {
+		printk(KERN_ERR "user_verbs: couldn't register client\n");
+		goto out_mnt;
+	}
+
+	return 0;
+	/* Error unwinding, in reverse order of the setup above. */
+out_mnt:
+#ifdef __linux__
+	mntput(uverbs_event_mnt);
+
+out_fs:
+	unregister_filesystem(&uverbs_event_fs);
+#endif
+
+out_class:
+	class_destroy(uverbs_class);
+
+out_chrdev:
+	unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES);
+
+out:
+	return ret;
+}
+
+/*
+ * Module exit: unregister the IB client, undo the setup performed by
+ * ib_uverbs_init() and release the memory held by the handle idrs.
+ */
+static void __exit ib_uverbs_cleanup(void)
+{
+	ib_unregister_client(&uverbs_client);
+#ifdef __linux__
+	mntput(uverbs_event_mnt);
+	unregister_filesystem(&uverbs_event_fs);
+#endif
+	class_destroy(uverbs_class);
+	unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES);
+	idr_destroy(&ib_uverbs_pd_idr);
+	idr_destroy(&ib_uverbs_mr_idr);
+	idr_destroy(&ib_uverbs_mw_idr);
+	idr_destroy(&ib_uverbs_ah_idr);
+	idr_destroy(&ib_uverbs_cq_idr);
+	idr_destroy(&ib_uverbs_qp_idr);
+	idr_destroy(&ib_uverbs_srq_idr);
+	/* XRC domain handles use their own idr; release it like the others. */
+	idr_destroy(&ib_uverbs_xrc_domain_idr);
+}
+
+module_init(ib_uverbs_init);	/* module load entry point */
+module_exit(ib_uverbs_cleanup);	/* module unload entry point */
diff --git a/sys/ofed/drivers/infiniband/core/uverbs_marshall.c b/sys/ofed/drivers/infiniband/core/uverbs_marshall.c
new file mode 100644
index 0000000..5440da0
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/uverbs_marshall.c
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2005 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_marshall.h>
+
+/*
+ * Convert the kernel address handle attributes @src into the user ABI
+ * layout @dst.  is_global is set to 1 when IB_AH_GRH is set in
+ * src->ah_flags and to 0 otherwise.
+ *
+ * The reserved members of the ABI structures are cleared as well: @dst is
+ * destined for user space and callers may pass a structure that was never
+ * zeroed, so leaving them untouched could expose stale kernel memory.
+ */
+void ib_copy_ah_attr_to_user(struct ib_uverbs_ah_attr *dst,
+			     struct ib_ah_attr *src)
+{
+	memcpy(dst->grh.dgid, src->grh.dgid.raw, sizeof src->grh.dgid);
+	dst->grh.flow_label        = src->grh.flow_label;
+	dst->grh.sgid_index        = src->grh.sgid_index;
+	dst->grh.hop_limit         = src->grh.hop_limit;
+	dst->grh.traffic_class     = src->grh.traffic_class;
+	memset(&dst->grh.reserved, 0, sizeof dst->grh.reserved);
+	dst->dlid 	    	   = src->dlid;
+	dst->sl   	    	   = src->sl;
+	dst->src_path_bits 	   = src->src_path_bits;
+	dst->static_rate   	   = src->static_rate;
+	dst->is_global             = src->ah_flags & IB_AH_GRH ? 1 : 0;
+	dst->port_num 	    	   = src->port_num;
+	memset(&dst->reserved, 0, sizeof dst->reserved);
+}
+EXPORT_SYMBOL(ib_copy_ah_attr_to_user);
+
+/*
+ * Convert the kernel QP attributes @src into the user ABI layout @dst,
+ * including the primary and alternate address handle attributes.
+ *
+ * qp_state is copied along with the other state members so that it is not
+ * left holding whatever @dst contained before, and the trailing reserved
+ * bytes are cleared because @dst is destined for user space.
+ * dst->qp_attr_mask is not written here and remains the caller's business.
+ */
+void ib_copy_qp_attr_to_user(struct ib_uverbs_qp_attr *dst,
+			     struct ib_qp_attr *src)
+{
+	dst->qp_state		= src->qp_state;
+	dst->cur_qp_state	= src->cur_qp_state;
+	dst->path_mtu		= src->path_mtu;
+	dst->path_mig_state	= src->path_mig_state;
+	dst->qkey		= src->qkey;
+	dst->rq_psn		= src->rq_psn;
+	dst->sq_psn		= src->sq_psn;
+	dst->dest_qp_num	= src->dest_qp_num;
+	dst->qp_access_flags	= src->qp_access_flags;
+
+	dst->max_send_wr	= src->cap.max_send_wr;
+	dst->max_recv_wr	= src->cap.max_recv_wr;
+	dst->max_send_sge	= src->cap.max_send_sge;
+	dst->max_recv_sge	= src->cap.max_recv_sge;
+	dst->max_inline_data	= src->cap.max_inline_data;
+
+	ib_copy_ah_attr_to_user(&dst->ah_attr, &src->ah_attr);
+	ib_copy_ah_attr_to_user(&dst->alt_ah_attr, &src->alt_ah_attr);
+
+	dst->pkey_index		= src->pkey_index;
+	dst->alt_pkey_index	= src->alt_pkey_index;
+	dst->en_sqd_async_notify = src->en_sqd_async_notify;
+	dst->sq_draining	= src->sq_draining;
+	dst->max_rd_atomic	= src->max_rd_atomic;
+	dst->max_dest_rd_atomic	= src->max_dest_rd_atomic;
+	dst->min_rnr_timer	= src->min_rnr_timer;
+	dst->port_num		= src->port_num;
+	dst->timeout		= src->timeout;
+	dst->retry_cnt		= src->retry_cnt;
+	dst->rnr_retry		= src->rnr_retry;
+	dst->alt_port_num	= src->alt_port_num;
+	dst->alt_timeout	= src->alt_timeout;
+	memset(&dst->reserved, 0, sizeof dst->reserved);
+}
+EXPORT_SYMBOL(ib_copy_qp_attr_to_user);
+
+/*
+ * Convert the kernel SA path record @src into the user ABI layout @dst.
+ * Every member is copied one to one; the GIDs are copied as raw bytes.
+ */
+void ib_copy_path_rec_to_user(struct ib_user_path_rec *dst,
+			      struct ib_sa_path_rec *src)
+{
+	/* Endpoint addressing. */
+	memcpy(dst->sgid, src->sgid.raw, sizeof src->sgid);
+	memcpy(dst->dgid, src->dgid.raw, sizeof src->dgid);
+	dst->slid = src->slid;
+	dst->dlid = src->dlid;
+
+	/* Routing and partitioning. */
+	dst->raw_traffic = src->raw_traffic;
+	dst->flow_label = src->flow_label;
+	dst->hop_limit = src->hop_limit;
+	dst->traffic_class = src->traffic_class;
+	dst->reversible = src->reversible;
+	dst->numb_path = src->numb_path;
+	dst->pkey = src->pkey;
+	dst->sl = src->sl;
+
+	/* Selector/value pairs. */
+	dst->mtu_selector = src->mtu_selector;
+	dst->mtu = src->mtu;
+	dst->rate_selector = src->rate_selector;
+	dst->rate = src->rate;
+	dst->packet_life_time_selector = src->packet_life_time_selector;
+	dst->packet_life_time = src->packet_life_time;
+
+	dst->preference = src->preference;
+}
+EXPORT_SYMBOL(ib_copy_path_rec_to_user);
+
+/*
+ * Convert the user ABI path record @src into the kernel SA path record
+ * @dst.  Every member present in the user layout is copied one to one;
+ * the GIDs are copied as raw bytes.
+ */
+void ib_copy_path_rec_from_user(struct ib_sa_path_rec *dst,
+				struct ib_user_path_rec *src)
+{
+	/* Endpoint addressing. */
+	memcpy(dst->sgid.raw, src->sgid, sizeof dst->sgid);
+	memcpy(dst->dgid.raw, src->dgid, sizeof dst->dgid);
+	dst->slid = src->slid;
+	dst->dlid = src->dlid;
+
+	/* Routing and partitioning. */
+	dst->raw_traffic = src->raw_traffic;
+	dst->flow_label = src->flow_label;
+	dst->hop_limit = src->hop_limit;
+	dst->traffic_class = src->traffic_class;
+	dst->reversible = src->reversible;
+	dst->numb_path = src->numb_path;
+	dst->pkey = src->pkey;
+	dst->sl = src->sl;
+
+	/* Selector/value pairs. */
+	dst->mtu_selector = src->mtu_selector;
+	dst->mtu = src->mtu;
+	dst->rate_selector = src->rate_selector;
+	dst->rate = src->rate;
+	dst->packet_life_time_selector = src->packet_life_time_selector;
+	dst->packet_life_time = src->packet_life_time;
+
+	dst->preference = src->preference;
+}
+EXPORT_SYMBOL(ib_copy_path_rec_from_user);
diff --git a/sys/ofed/drivers/infiniband/core/verbs.c b/sys/ofed/drivers/infiniband/core/verbs.c
new file mode 100644
index 0000000..90bdeaa
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/verbs.c
@@ -0,0 +1,1073 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd.  All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation.  All rights reserved.
+ * Copyright (c) 2004 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/string.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_cache.h>
+
+/*
+ * Convert an IB rate enumeration value into a multiple of the 2.5 Gbit/s
+ * base rate (IB_RATE_2_5_GBPS maps to 1, IB_RATE_120_GBPS to 48).
+ * Returns -1 for any rate not listed here.
+ */
+int ib_rate_to_mult(enum ib_rate rate)
+{
+	int mult;
+
+	switch (rate) {
+	case IB_RATE_2_5_GBPS:	mult =  1; break;
+	case IB_RATE_5_GBPS:	mult =  2; break;
+	case IB_RATE_10_GBPS:	mult =  4; break;
+	case IB_RATE_20_GBPS:	mult =  8; break;
+	case IB_RATE_30_GBPS:	mult = 12; break;
+	case IB_RATE_40_GBPS:	mult = 16; break;
+	case IB_RATE_60_GBPS:	mult = 24; break;
+	case IB_RATE_80_GBPS:	mult = 32; break;
+	case IB_RATE_120_GBPS:	mult = 48; break;
+	default:		mult = -1; break;
+	}
+
+	return mult;
+}
+EXPORT_SYMBOL(ib_rate_to_mult);
+
+/*
+ * Convert a multiple of the 2.5 Gbit/s base rate into the matching IB rate
+ * enumeration value.  Returns IB_RATE_PORT_CURRENT for any multiple that
+ * has no enumeration value.
+ */
+enum ib_rate mult_to_ib_rate(int mult)
+{
+	enum ib_rate rate;
+
+	switch (mult) {
+	case 1:		rate = IB_RATE_2_5_GBPS; break;
+	case 2:		rate = IB_RATE_5_GBPS; break;
+	case 4:		rate = IB_RATE_10_GBPS; break;
+	case 8:		rate = IB_RATE_20_GBPS; break;
+	case 12:	rate = IB_RATE_30_GBPS; break;
+	case 16:	rate = IB_RATE_40_GBPS; break;
+	case 24:	rate = IB_RATE_60_GBPS; break;
+	case 32:	rate = IB_RATE_80_GBPS; break;
+	case 48:	rate = IB_RATE_120_GBPS; break;
+	default:	rate = IB_RATE_PORT_CURRENT; break;
+	}
+
+	return rate;
+}
+EXPORT_SYMBOL(mult_to_ib_rate);
+
+/*
+ * Map an RDMA node type to its transport: the IB CA, switch and router
+ * node types use RDMA_TRANSPORT_IB, RNICs use RDMA_TRANSPORT_IWARP.
+ * Any other node type triggers BUG().
+ */
+enum rdma_transport_type
+rdma_node_get_transport(enum rdma_node_type node_type)
+{
+	if (node_type == RDMA_NODE_IB_CA ||
+	    node_type == RDMA_NODE_IB_SWITCH ||
+	    node_type == RDMA_NODE_IB_ROUTER)
+		return RDMA_TRANSPORT_IB;
+
+	if (node_type == RDMA_NODE_RNIC)
+		return RDMA_TRANSPORT_IWARP;
+
+	BUG();
+	return 0;
+}
+EXPORT_SYMBOL(rdma_node_get_transport);
+
+/*
+ * Return the link layer of port @port_num of @device.  The device's
+ * get_link_layer method is used when it exists; otherwise the link layer
+ * is derived from the transport of the device's node type.
+ */
+enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_num)
+{
+	enum rdma_transport_type transport;
+
+	if (device->get_link_layer != NULL)
+		return device->get_link_layer(device, port_num);
+
+	transport = rdma_node_get_transport(device->node_type);
+
+	if (transport == RDMA_TRANSPORT_IB)
+		return IB_LINK_LAYER_INFINIBAND;
+	if (transport == RDMA_TRANSPORT_IWARP)
+		return IB_LINK_LAYER_ETHERNET;
+
+	return IB_LINK_LAYER_UNSPECIFIED;
+}
+EXPORT_SYMBOL(rdma_port_get_link_layer);
+
+/* Protection domains */
+
+/*
+ * Allocate a kernel protection domain on @device through the driver's
+ * alloc_pd method.  On success the PD is bound to @device with no user
+ * object and a use count of zero.  Returns the driver's ERR_PTR value on
+ * failure.
+ */
+struct ib_pd *ib_alloc_pd(struct ib_device *device)
+{
+	struct ib_pd *new_pd = device->alloc_pd(device, NULL, NULL);
+
+	if (IS_ERR(new_pd))
+		return new_pd;
+
+	new_pd->device = device;
+	new_pd->uobject = NULL;
+	atomic_set(&new_pd->usecnt, 0);
+
+	return new_pd;
+}
+EXPORT_SYMBOL(ib_alloc_pd);
+
+/*
+ * Free protection domain @pd through the driver's dealloc_pd method.
+ * Returns -EBUSY while the PD's use count is non-zero, otherwise the
+ * driver's result.
+ */
+int ib_dealloc_pd(struct ib_pd *pd)
+{
+	int in_use = atomic_read(&pd->usecnt);
+
+	if (in_use != 0)
+		return -EBUSY;
+
+	return pd->device->dealloc_pd(pd);
+}
+EXPORT_SYMBOL(ib_dealloc_pd);
+
+/* Address handles */
+
+/*
+ * Create an address handle on @pd from @ah_attr through the driver's
+ * create_ah method.  On success the AH is bound to @pd and its device and
+ * the PD's use count is incremented.  Returns the driver's ERR_PTR value
+ * on failure.
+ */
+struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
+{
+	struct ib_ah *new_ah = pd->device->create_ah(pd, ah_attr);
+
+	if (IS_ERR(new_ah))
+		return new_ah;
+
+	new_ah->device = pd->device;
+	new_ah->pd = pd;
+	new_ah->uobject = NULL;
+	atomic_inc(&pd->usecnt);
+
+	return new_ah;
+}
+EXPORT_SYMBOL(ib_create_ah);
+
+/*
+ * Fill @ah_attr with the attributes needed to reply to the sender of the
+ * message described by work completion @wc, received on port @port_num.
+ * When @wc carries IB_WC_GRH, the global route is built from @grh: the
+ * received SGID becomes the destination GID and the received DGID is
+ * looked up in the GID cache to find the source GID index.
+ *
+ * Returns 0 on success or the ib_find_cached_gid() error.
+ */
+int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc,
+		       struct ib_grh *grh, struct ib_ah_attr *ah_attr)
+{
+	u32 tclass_flow;
+	u16 sgid_idx;
+	int rc;
+
+	memset(ah_attr, 0, sizeof *ah_attr);
+	ah_attr->dlid = wc->slid;
+	ah_attr->sl = wc->sl;
+	ah_attr->src_path_bits = wc->dlid_path_bits;
+	ah_attr->port_num = port_num;
+
+	/* Without a GRH the local attributes above are all there is. */
+	if (!(wc->wc_flags & IB_WC_GRH))
+		return 0;
+
+	ah_attr->ah_flags = IB_AH_GRH;
+	ah_attr->grh.dgid = grh->sgid;
+
+	rc = ib_find_cached_gid(device, &grh->dgid, &port_num, &sgid_idx);
+	if (rc)
+		return rc;
+
+	ah_attr->grh.sgid_index = (u8) sgid_idx;
+
+	/* Low 20 bits are the flow label, the next 8 the traffic class. */
+	tclass_flow = be32_to_cpu(grh->version_tclass_flow);
+	ah_attr->grh.flow_label = tclass_flow & 0xFFFFF;
+	ah_attr->grh.traffic_class = (tclass_flow >> 20) & 0xFF;
+	ah_attr->grh.hop_limit = 0xFF;
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_init_ah_from_wc);
+
+/*
+ * Create an address handle on @pd for replying to the sender described by
+ * @wc and @grh.  Returns the new AH, or an ERR_PTR value when either the
+ * attribute initialisation or the AH creation fails.
+ */
+struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, struct ib_wc *wc,
+				   struct ib_grh *grh, u8 port_num)
+{
+	struct ib_ah_attr attr;
+	int rc = ib_init_ah_from_wc(pd->device, port_num, wc, grh, &attr);
+
+	if (rc != 0)
+		return ERR_PTR(rc);
+
+	return ib_create_ah(pd, &attr);
+}
+EXPORT_SYMBOL(ib_create_ah_from_wc);
+
+/*
+ * Modify address handle @ah through the driver's modify_ah method.
+ * Returns -ENOSYS when the driver does not provide one.
+ */
+int ib_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr)
+{
+	if (!ah->device->modify_ah)
+		return -ENOSYS;
+
+	return ah->device->modify_ah(ah, ah_attr);
+}
+EXPORT_SYMBOL(ib_modify_ah);
+
+/*
+ * Query address handle @ah through the driver's query_ah method.
+ * Returns -ENOSYS when the driver does not provide one.
+ */
+int ib_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr)
+{
+	if (!ah->device->query_ah)
+		return -ENOSYS;
+
+	return ah->device->query_ah(ah, ah_attr);
+}
+EXPORT_SYMBOL(ib_query_ah);
+
+/*
+ * Destroy address handle @ah through the driver's destroy_ah method and,
+ * on success, decrement the use count of its protection domain.  Returns
+ * the driver's result.
+ */
+int ib_destroy_ah(struct ib_ah *ah)
+{
+	/* Read the PD first; @ah is not touched again after destroy_ah(). */
+	struct ib_pd *owner_pd = ah->pd;
+	int rc = ah->device->destroy_ah(ah);
+
+	if (rc == 0)
+		atomic_dec(&owner_pd->usecnt);
+
+	return rc;
+}
+EXPORT_SYMBOL(ib_destroy_ah);
+
+/* Shared receive queues */
+
+/*
+ * Create a shared receive queue on @pd through the driver's create_srq
+ * method.  On success the SRQ is bound to @pd, takes its event handler
+ * and context from @srq_init_attr, has no XRC CQ or domain, and the PD's
+ * use count is incremented.
+ *
+ * Returns ERR_PTR(-ENOSYS) when the driver has no create_srq method, or
+ * the driver's ERR_PTR value on failure.
+ */
+struct ib_srq *ib_create_srq(struct ib_pd *pd,
+			     struct ib_srq_init_attr *srq_init_attr)
+{
+	struct ib_srq *new_srq;
+
+	if (pd->device->create_srq == NULL)
+		return ERR_PTR(-ENOSYS);
+
+	new_srq = pd->device->create_srq(pd, srq_init_attr, NULL);
+	if (IS_ERR(new_srq))
+		return new_srq;
+
+	new_srq->device = pd->device;
+	new_srq->pd = pd;
+	new_srq->uobject = NULL;
+	new_srq->event_handler = srq_init_attr->event_handler;
+	new_srq->srq_context = srq_init_attr->srq_context;
+	new_srq->xrc_cq = NULL;
+	new_srq->xrcd = NULL;
+	atomic_inc(&pd->usecnt);
+	atomic_set(&new_srq->usecnt, 0);
+
+	return new_srq;
+}
+EXPORT_SYMBOL(ib_create_srq);
+
+/* Create an XRC SRQ on @pd that is tied to @xrc_cq and @xrcd. */
+struct ib_srq *ib_create_xrc_srq(struct ib_pd *pd,
+				 struct ib_cq *xrc_cq,
+				 struct ib_xrcd *xrcd,
+				 struct ib_srq_init_attr *srq_init_attr)
+{
+	struct ib_srq *new_srq;
+
+	if (!pd->device->create_xrc_srq)
+		return ERR_PTR(-ENOSYS);
+
+	new_srq = pd->device->create_xrc_srq(pd, xrc_cq, xrcd, srq_init_attr, NULL);
+	if (IS_ERR(new_srq))
+		return new_srq;
+	new_srq->device = pd->device;
+	new_srq->pd = pd;
+	new_srq->uobject = NULL;
+	new_srq->event_handler = srq_init_attr->event_handler;
+	new_srq->srq_context = srq_init_attr->srq_context;
+	new_srq->xrc_cq = xrc_cq;
+	new_srq->xrcd = xrcd;
+	/* Count the SRQ against its PD, XRC domain and CQ; ib_destroy_srq() undoes it. */
+	atomic_inc(&pd->usecnt);
+	atomic_inc(&xrcd->usecnt);
+	atomic_inc(&xrc_cq->usecnt);
+	atomic_set(&new_srq->usecnt, 0);
+	return new_srq;
+}
+EXPORT_SYMBOL(ib_create_xrc_srq);
+
+/* Pass @srq_attr/@srq_attr_mask to the driver's modify_srq; -ENOSYS if absent. */
+int ib_modify_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr,
+		  enum ib_srq_attr_mask srq_attr_mask)
+{
+	if (!srq->device->modify_srq)
+		return -ENOSYS;
+	return srq->device->modify_srq(srq, srq_attr, srq_attr_mask, NULL);
+}
+EXPORT_SYMBOL(ib_modify_srq);
+
+/* Query @srq through the driver's query_srq hook; -ENOSYS if it is absent. */
+int ib_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr)
+{
+	return !srq->device->query_srq ? -ENOSYS :
+		srq->device->query_srq(srq, srq_attr);
+}
+EXPORT_SYMBOL(ib_query_srq);
+
+/* Destroy @srq unless it is still in use (-EBUSY); drop its counts on success. */
+int ib_destroy_srq(struct ib_srq *srq)
+{
+	struct ib_pd *owner_pd;
+	struct ib_cq *cq;
+	struct ib_xrcd *domain;
+	int err;
+
+	if (atomic_read(&srq->usecnt) != 0)
+		return -EBUSY;
+
+	/* Snapshot the links first; @srq is not read after the driver call. */
+	owner_pd = srq->pd;
+	cq = srq->xrc_cq;
+	domain = srq->xrcd;
+	err = srq->device->destroy_srq(srq);
+	if (err)
+		return err;
+	atomic_dec(&owner_pd->usecnt);
+	if (cq)
+		atomic_dec(&cq->usecnt);
+	if (domain)
+		atomic_dec(&domain->usecnt);
+	return 0;
+}
+EXPORT_SYMBOL(ib_destroy_srq);
+
+/* Queue pairs */
+
+/* Create a QP on @pd (no userspace object) and count it against what it uses. */
+struct ib_qp *ib_create_qp(struct ib_pd *pd,
+			   struct ib_qp_init_attr *qp_init_attr)
+{
+	struct ib_qp *new_qp = pd->device->create_qp(pd, qp_init_attr, NULL);
+
+	if (IS_ERR(new_qp))
+		return new_qp;
+
+	new_qp->device = pd->device;
+	new_qp->pd = pd;
+	new_qp->send_cq = qp_init_attr->send_cq;
+	new_qp->recv_cq = qp_init_attr->recv_cq;
+	new_qp->srq = qp_init_attr->srq;
+	new_qp->uobject = NULL;
+	new_qp->event_handler = qp_init_attr->event_handler;
+	new_qp->qp_context = qp_init_attr->qp_context;
+	new_qp->qp_type = qp_init_attr->qp_type;
+	/* Only XRC QPs record (and below, count) an XRC domain. */
+	new_qp->xrcd = new_qp->qp_type == IB_QPT_XRC ?
+		qp_init_attr->xrc_domain : NULL;
+	atomic_inc(&pd->usecnt);
+	atomic_inc(&qp_init_attr->send_cq->usecnt);
+	atomic_inc(&qp_init_attr->recv_cq->usecnt);
+	if (qp_init_attr->srq)
+		atomic_inc(&qp_init_attr->srq->usecnt);
+	if (new_qp->qp_type == IB_QPT_XRC)
+		atomic_inc(&new_qp->xrcd->usecnt);
+	return new_qp;
+}
+EXPORT_SYMBOL(ib_create_qp);
+
+static const struct {	/* legality table for QP state transitions */
+	int			valid;	/* non-zero if this transition is allowed */
+	enum ib_qp_attr_mask	req_param[IB_QPT_RAW_ETH + 1];	/* per QP type: bits that must be in the mask */
+	enum ib_qp_attr_mask	opt_param[IB_QPT_RAW_ETH + 1];	/* per QP type: bits that may additionally be set */
+} qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = {	/* indexed [current state][next state] */
+	[IB_QPS_RESET] = {
+		[IB_QPS_RESET] = { .valid = 1 },
+		[IB_QPS_INIT]  = {
+			.valid = 1,
+			.req_param = {
+				[IB_QPT_UD]  = (IB_QP_PKEY_INDEX		|
+						IB_QP_PORT			|
+						IB_QP_QKEY),
+				[IB_QPT_RAW_ETH] = IB_QP_PORT,
+				[IB_QPT_UC]  = (IB_QP_PKEY_INDEX		|
+						IB_QP_PORT			|
+						IB_QP_ACCESS_FLAGS),
+				[IB_QPT_RC]  = (IB_QP_PKEY_INDEX		|
+						IB_QP_PORT			|
+						IB_QP_ACCESS_FLAGS),
+				[IB_QPT_XRC] = (IB_QP_PKEY_INDEX		|
+						IB_QP_PORT			|
+						IB_QP_ACCESS_FLAGS),
+				[IB_QPT_SMI] = (IB_QP_PKEY_INDEX		|
+						IB_QP_QKEY),
+				[IB_QPT_GSI] = (IB_QP_PKEY_INDEX		|
+						IB_QP_QKEY),
+			}
+		},
+	},
+	[IB_QPS_INIT]  = {
+		[IB_QPS_RESET] = { .valid = 1 },
+		[IB_QPS_ERR] =   { .valid = 1 },
+		[IB_QPS_INIT]  = {
+			.valid = 1,
+			.opt_param = {
+				[IB_QPT_UD]  = (IB_QP_PKEY_INDEX		|
+						IB_QP_PORT			|
+						IB_QP_QKEY),
+				[IB_QPT_UC]  = (IB_QP_PKEY_INDEX		|
+						IB_QP_PORT			|
+						IB_QP_ACCESS_FLAGS),
+				[IB_QPT_RC]  = (IB_QP_PKEY_INDEX		|
+						IB_QP_PORT			|
+						IB_QP_ACCESS_FLAGS),
+				[IB_QPT_XRC] = (IB_QP_PKEY_INDEX		|
+						IB_QP_PORT			|
+						IB_QP_ACCESS_FLAGS),
+				[IB_QPT_SMI] = (IB_QP_PKEY_INDEX		|
+						IB_QP_QKEY),
+				[IB_QPT_GSI] = (IB_QP_PKEY_INDEX		|
+						IB_QP_QKEY),
+			}
+		},
+		[IB_QPS_RTR]   = {
+			.valid = 1,
+			.req_param = {
+				[IB_QPT_UC]  = (IB_QP_AV			|
+						IB_QP_PATH_MTU			|
+						IB_QP_DEST_QPN			|
+						IB_QP_RQ_PSN),
+				[IB_QPT_RC]  = (IB_QP_AV			|
+						IB_QP_PATH_MTU			|
+						IB_QP_DEST_QPN			|
+						IB_QP_RQ_PSN			|
+						IB_QP_MAX_DEST_RD_ATOMIC	|
+						IB_QP_MIN_RNR_TIMER),
+				[IB_QPT_XRC] = (IB_QP_AV			|
+						IB_QP_PATH_MTU			|
+						IB_QP_DEST_QPN			|
+						IB_QP_RQ_PSN			|
+						IB_QP_MAX_DEST_RD_ATOMIC	|
+						IB_QP_MIN_RNR_TIMER),
+			},
+			.opt_param = {
+				 [IB_QPT_UD]  = (IB_QP_PKEY_INDEX		|
+						 IB_QP_QKEY),
+				 [IB_QPT_UC]  = (IB_QP_ALT_PATH			|
+						 IB_QP_ACCESS_FLAGS		|
+						 IB_QP_PKEY_INDEX),
+				 [IB_QPT_RC]  = (IB_QP_ALT_PATH			|
+						 IB_QP_ACCESS_FLAGS		|
+						 IB_QP_PKEY_INDEX),
+				 [IB_QPT_XRC] = (IB_QP_ALT_PATH			|
+						IB_QP_ACCESS_FLAGS		|
+						IB_QP_PKEY_INDEX),
+				 [IB_QPT_SMI] = (IB_QP_PKEY_INDEX		|
+						 IB_QP_QKEY),
+				 [IB_QPT_GSI] = (IB_QP_PKEY_INDEX		|
+						 IB_QP_QKEY),
+			 }
+		}
+	},
+	[IB_QPS_RTR]   = {
+		[IB_QPS_RESET] = { .valid = 1 },
+		[IB_QPS_ERR] =   { .valid = 1 },
+		[IB_QPS_RTS]   = {
+			.valid = 1,
+			.req_param = {
+				[IB_QPT_UD]  = IB_QP_SQ_PSN,
+				[IB_QPT_UC]  = IB_QP_SQ_PSN,
+				[IB_QPT_RC]  = (IB_QP_TIMEOUT			|
+						IB_QP_RETRY_CNT			|
+						IB_QP_RNR_RETRY			|
+						IB_QP_SQ_PSN			|
+						IB_QP_MAX_QP_RD_ATOMIC),
+				[IB_QPT_XRC] = (IB_QP_TIMEOUT			|
+						IB_QP_RETRY_CNT			|
+						IB_QP_RNR_RETRY			|
+						IB_QP_SQ_PSN			|
+						IB_QP_MAX_QP_RD_ATOMIC),
+				[IB_QPT_SMI] = IB_QP_SQ_PSN,
+				[IB_QPT_GSI] = IB_QP_SQ_PSN,
+			},
+			.opt_param = {
+				 [IB_QPT_UD]  = (IB_QP_CUR_STATE		|
+						 IB_QP_QKEY),
+				 [IB_QPT_UC]  = (IB_QP_CUR_STATE		|
+						 IB_QP_ALT_PATH			|
+						 IB_QP_ACCESS_FLAGS		|
+						 IB_QP_PATH_MIG_STATE),
+				 [IB_QPT_RC]  = (IB_QP_CUR_STATE		|
+						 IB_QP_ALT_PATH			|
+						 IB_QP_ACCESS_FLAGS		|
+						 IB_QP_MIN_RNR_TIMER		|
+						 IB_QP_PATH_MIG_STATE),
+				 [IB_QPT_XRC] = (IB_QP_CUR_STATE		|
+						IB_QP_ALT_PATH			|
+						IB_QP_ACCESS_FLAGS		|
+						IB_QP_MIN_RNR_TIMER		|
+						IB_QP_PATH_MIG_STATE),
+				 [IB_QPT_SMI] = (IB_QP_CUR_STATE		|
+						 IB_QP_QKEY),
+				 [IB_QPT_GSI] = (IB_QP_CUR_STATE		|
+						 IB_QP_QKEY),
+			 }
+		}
+	},
+	[IB_QPS_RTS]   = {
+		[IB_QPS_RESET] = { .valid = 1 },
+		[IB_QPS_ERR] =   { .valid = 1 },
+		[IB_QPS_RTS]   = {
+			.valid = 1,
+			.opt_param = {
+				[IB_QPT_UD]  = (IB_QP_CUR_STATE			|
+						IB_QP_QKEY),
+				[IB_QPT_UC]  = (IB_QP_CUR_STATE			|
+						IB_QP_ACCESS_FLAGS		|
+						IB_QP_ALT_PATH			|
+						IB_QP_PATH_MIG_STATE),
+				[IB_QPT_RC]  = (IB_QP_CUR_STATE			|
+						IB_QP_ACCESS_FLAGS		|
+						IB_QP_ALT_PATH			|
+						IB_QP_PATH_MIG_STATE		|
+						IB_QP_MIN_RNR_TIMER),
+				[IB_QPT_XRC] = (IB_QP_CUR_STATE			|
+						IB_QP_ACCESS_FLAGS		|
+						IB_QP_ALT_PATH			|
+						IB_QP_PATH_MIG_STATE		|
+						IB_QP_MIN_RNR_TIMER),
+				[IB_QPT_SMI] = (IB_QP_CUR_STATE			|
+						IB_QP_QKEY),
+				[IB_QPT_GSI] = (IB_QP_CUR_STATE			|
+						IB_QP_QKEY),
+			}
+		},
+		[IB_QPS_SQD]   = {
+			.valid = 1,
+			.opt_param = {
+				[IB_QPT_UD]  = IB_QP_EN_SQD_ASYNC_NOTIFY,
+				[IB_QPT_UC]  = IB_QP_EN_SQD_ASYNC_NOTIFY,
+				[IB_QPT_RC]  = IB_QP_EN_SQD_ASYNC_NOTIFY,
+				[IB_QPT_XRC] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+				[IB_QPT_SMI] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+				[IB_QPT_GSI] = IB_QP_EN_SQD_ASYNC_NOTIFY
+			}
+		},
+	},
+	[IB_QPS_SQD]   = {
+		[IB_QPS_RESET] = { .valid = 1 },
+		[IB_QPS_ERR] =   { .valid = 1 },
+		[IB_QPS_RTS]   = {
+			.valid = 1,
+			.opt_param = {
+				[IB_QPT_UD]  = (IB_QP_CUR_STATE			|
+						IB_QP_QKEY),
+				[IB_QPT_UC]  = (IB_QP_CUR_STATE			|
+						IB_QP_ALT_PATH			|
+						IB_QP_ACCESS_FLAGS		|
+						IB_QP_PATH_MIG_STATE),
+				[IB_QPT_RC]  = (IB_QP_CUR_STATE			|
+						IB_QP_ALT_PATH			|
+						IB_QP_ACCESS_FLAGS		|
+						IB_QP_MIN_RNR_TIMER		|
+						IB_QP_PATH_MIG_STATE),
+				[IB_QPT_XRC] = (IB_QP_CUR_STATE			|
+						IB_QP_ALT_PATH			|
+						IB_QP_ACCESS_FLAGS		|
+						IB_QP_MIN_RNR_TIMER		|
+						IB_QP_PATH_MIG_STATE),
+				[IB_QPT_SMI] = (IB_QP_CUR_STATE			|
+						IB_QP_QKEY),
+				[IB_QPT_GSI] = (IB_QP_CUR_STATE			|
+						IB_QP_QKEY),
+			}
+		},
+		[IB_QPS_SQD]   = {
+			.valid = 1,
+			.opt_param = {
+				[IB_QPT_UD]  = (IB_QP_PKEY_INDEX		|
+						IB_QP_QKEY),
+				[IB_QPT_UC]  = (IB_QP_AV			|
+						IB_QP_ALT_PATH			|
+						IB_QP_ACCESS_FLAGS		|
+						IB_QP_PKEY_INDEX		|
+						IB_QP_PATH_MIG_STATE),
+				[IB_QPT_RC]  = (IB_QP_PORT			|
+						IB_QP_AV			|
+						IB_QP_TIMEOUT			|
+						IB_QP_RETRY_CNT			|
+						IB_QP_RNR_RETRY			|
+						IB_QP_MAX_QP_RD_ATOMIC		|
+						IB_QP_MAX_DEST_RD_ATOMIC	|
+						IB_QP_ALT_PATH			|
+						IB_QP_ACCESS_FLAGS		|
+						IB_QP_PKEY_INDEX		|
+						IB_QP_MIN_RNR_TIMER		|
+						IB_QP_PATH_MIG_STATE),
+				[IB_QPT_XRC] = (IB_QP_PORT			|
+						IB_QP_AV			|
+						IB_QP_TIMEOUT			|
+						IB_QP_RETRY_CNT			|
+						IB_QP_RNR_RETRY			|
+						IB_QP_MAX_QP_RD_ATOMIC		|
+						IB_QP_MAX_DEST_RD_ATOMIC	|
+						IB_QP_ALT_PATH			|
+						IB_QP_ACCESS_FLAGS		|
+						IB_QP_PKEY_INDEX		|
+						IB_QP_MIN_RNR_TIMER		|
+						IB_QP_PATH_MIG_STATE),
+				[IB_QPT_SMI] = (IB_QP_PKEY_INDEX		|
+						IB_QP_QKEY),
+				[IB_QPT_GSI] = (IB_QP_PKEY_INDEX		|
+						IB_QP_QKEY),
+			}
+		}
+	},
+	[IB_QPS_SQE]   = {
+		[IB_QPS_RESET] = { .valid = 1 },
+		[IB_QPS_ERR] =   { .valid = 1 },
+		[IB_QPS_RTS]   = {
+			.valid = 1,
+			.opt_param = {
+				[IB_QPT_UD]  = (IB_QP_CUR_STATE			|
+						IB_QP_QKEY),
+				[IB_QPT_UC]  = (IB_QP_CUR_STATE			|
+						IB_QP_ACCESS_FLAGS),
+				[IB_QPT_SMI] = (IB_QP_CUR_STATE			|
+						IB_QP_QKEY),
+				[IB_QPT_GSI] = (IB_QP_CUR_STATE			|
+						IB_QP_QKEY),
+			}
+		}
+	},
+	[IB_QPS_ERR] = {
+		[IB_QPS_RESET] = { .valid = 1 },
+		[IB_QPS_ERR] =   { .valid = 1 }
+	}
+};
+
+/* Return 1 if a @type QP may go @cur_state -> @next_state with @mask, else 0. */
+int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state,
+		       enum ib_qp_type type, enum ib_qp_attr_mask mask)
+{
+	enum ib_qp_attr_mask req_param, opt_param;
+
+	if (cur_state  < 0 || cur_state  > IB_QPS_ERR ||
+	    next_state < 0 || next_state > IB_QPS_ERR)
+		return 0;
+
+	/* req_param[]/opt_param[] only have IB_QPT_RAW_ETH + 1 slots. */
+	if (type < 0 || type > IB_QPT_RAW_ETH)
+		return 0;
+
+	if (mask & IB_QP_CUR_STATE  &&
+	    cur_state != IB_QPS_RTR && cur_state != IB_QPS_RTS &&
+	    cur_state != IB_QPS_SQD && cur_state != IB_QPS_SQE)
+		return 0;
+
+	if (!qp_state_table[cur_state][next_state].valid)
+		return 0;
+
+	req_param = qp_state_table[cur_state][next_state].req_param[type];
+	opt_param = qp_state_table[cur_state][next_state].opt_param[type];
+	/* All required bits present, nothing outside required|optional|STATE. */
+	return (mask & req_param) == req_param &&
+	       !(mask & ~(req_param | opt_param | IB_QP_STATE));
+}
+EXPORT_SYMBOL(ib_modify_qp_is_ok);
+
+/* Forward the request to the driver's modify_qp, which is called unchecked. */
+int ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr,
+		 int qp_attr_mask)
+{
+	return qp->device->modify_qp(qp, qp_attr, qp_attr_mask, NULL);
+}
+EXPORT_SYMBOL(ib_modify_qp);
+
+/* Query @qp via the driver's query_qp hook; -ENOSYS if the driver has none. */
+int ib_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr,
+		int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr)
+{
+	if (!qp->device->query_qp)
+		return -ENOSYS;
+
+	return qp->device->query_qp(qp, qp_attr, qp_attr_mask, qp_init_attr);
+}
+EXPORT_SYMBOL(ib_query_qp);
+
+/* Destroy @qp; on success give back the use counts ib_create_qp() took. */
+int ib_destroy_qp(struct ib_qp *qp)
+{
+	/* Copy out what is needed later; @qp is not read after the driver call. */
+	struct ib_pd *owner_pd = qp->pd;
+	struct ib_cq *send_cq = qp->send_cq;
+	struct ib_cq *recv_cq = qp->recv_cq;
+	struct ib_srq *shared_rq = qp->srq;
+	struct ib_xrcd *domain = qp->xrcd;
+	enum ib_qp_type kind = qp->qp_type;
+	int err;
+
+	err = qp->device->destroy_qp(qp);
+	if (err)
+		return err;
+
+	/* Release the PD and both CQs, then the optional SRQ. */
+	atomic_dec(&owner_pd->usecnt);
+	atomic_dec(&send_cq->usecnt);
+	atomic_dec(&recv_cq->usecnt);
+	if (shared_rq)
+		atomic_dec(&shared_rq->usecnt);
+	/* Only XRC QPs hold a count on an XRC domain. */
+	if (kind == IB_QPT_XRC)
+		atomic_dec(&domain->usecnt);
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_destroy_qp);
+
+/* Completion queues */
+
+/* Create a CQ on @device and attach the caller's handlers and context to it. */
+struct ib_cq *ib_create_cq(struct ib_device *device,
+			   ib_comp_handler comp_handler,
+			   void (*event_handler)(struct ib_event *, void *),
+			   void *cq_context, int cqe, int comp_vector)
+{
+	struct ib_cq *new_cq;
+
+	new_cq = device->create_cq(device, cqe, comp_vector, NULL, NULL);
+	if (IS_ERR(new_cq))
+		return new_cq;
+
+	new_cq->device = device;
+	new_cq->uobject = NULL;
+	new_cq->comp_handler = comp_handler;
+	new_cq->event_handler = event_handler;
+	new_cq->cq_context = cq_context;
+	atomic_set(&new_cq->usecnt, 0);
+	return new_cq;
+}
+EXPORT_SYMBOL(ib_create_cq);
+
+int ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period)
+{
+	return !cq->device->modify_cq ? -ENOSYS :	/* hook is optional */
+		cq->device->modify_cq(cq, cq_count, cq_period);
+}
+EXPORT_SYMBOL(ib_modify_cq);
+
+/* Destroy @cq unless something still holds a use count on it (-EBUSY). */
+int ib_destroy_cq(struct ib_cq *cq)
+{
+	if (atomic_read(&cq->usecnt) != 0)
+		return -EBUSY;
+	return cq->device->destroy_cq(cq);
+}
+EXPORT_SYMBOL(ib_destroy_cq);
+
+int ib_resize_cq(struct ib_cq *cq, int cqe)
+{
+	return !cq->device->resize_cq ? -ENOSYS :	/* hook is optional */
+		cq->device->resize_cq(cq, cqe, NULL);
+}
+EXPORT_SYMBOL(ib_resize_cq);
+
+/* Memory regions */
+
+/* Get a DMA MR on @pd from the driver; on success count it against the PD. */
+struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags)
+{
+	struct ib_mr *new_mr = pd->device->get_dma_mr(pd, mr_access_flags);
+
+	if (IS_ERR(new_mr))
+		return new_mr;
+
+	new_mr->device = pd->device;
+	new_mr->pd = pd;
+	new_mr->uobject = NULL;
+	atomic_inc(&pd->usecnt);
+	atomic_set(&new_mr->usecnt, 0);
+
+	return new_mr;
+}
+EXPORT_SYMBOL(ib_get_dma_mr);
+
+/* Register a physical-buffer MR on @pd; ERR_PTR(-ENOSYS) without driver hook. */
+struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd,
+			     struct ib_phys_buf *phys_buf_array,
+			     int num_phys_buf,
+			     int mr_access_flags,
+			     u64 *iova_start)
+{
+	struct ib_mr *new_mr;
+
+	if (!pd->device->reg_phys_mr)
+		return ERR_PTR(-ENOSYS);
+
+	new_mr = pd->device->reg_phys_mr(pd, phys_buf_array, num_phys_buf,
+					 mr_access_flags, iova_start);
+	if (IS_ERR(new_mr))
+		return new_mr;
+
+	new_mr->device = pd->device;
+	new_mr->pd = pd;
+	new_mr->uobject = NULL;
+	atomic_inc(&pd->usecnt);
+	atomic_set(&new_mr->usecnt, 0);
+	return new_mr;
+}
+EXPORT_SYMBOL(ib_reg_phys_mr);
+
+/* Re-register an idle MR; -ENOSYS without driver hook, -EBUSY while in use. */
+int ib_rereg_phys_mr(struct ib_mr *mr,
+		     int mr_rereg_mask,
+		     struct ib_pd *pd,
+		     struct ib_phys_buf *phys_buf_array,
+		     int num_phys_buf,
+		     int mr_access_flags,
+		     u64 *iova_start)
+{
+	struct ib_pd *prev_pd;
+	int err;
+
+	if (!mr->device->rereg_phys_mr)
+		return -ENOSYS;
+	if (atomic_read(&mr->usecnt) != 0)
+		return -EBUSY;
+
+	prev_pd = mr->pd;
+	err = mr->device->rereg_phys_mr(mr, mr_rereg_mask, pd,
+					phys_buf_array, num_phys_buf,
+					mr_access_flags, iova_start);
+	if (err)
+		return err;
+	/* A PD change moves the MR's use count from the old PD to the new one. */
+	if (mr_rereg_mask & IB_MR_REREG_PD) {
+		atomic_dec(&prev_pd->usecnt);
+		atomic_inc(&pd->usecnt);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(ib_rereg_phys_mr);
+
+int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr)
+{
+	return !mr->device->query_mr ? -ENOSYS :	/* hook is optional */
+		mr->device->query_mr(mr, mr_attr);
+}
+EXPORT_SYMBOL(ib_query_mr);
+
+/* Deregister an idle MR (-EBUSY while in use) and drop its count on the PD. */
+int ib_dereg_mr(struct ib_mr *mr)
+{
+	struct ib_pd *owner;
+	int err;
+
+	if (atomic_read(&mr->usecnt) != 0)
+		return -EBUSY;
+	owner = mr->pd;	/* saved: @mr is not read after the driver call */
+	err = mr->device->dereg_mr(mr);
+	if (err)
+		return err;
+	atomic_dec(&owner->usecnt);
+	return 0;
+}
+EXPORT_SYMBOL(ib_dereg_mr);
+
+/* Allocate a fast-register MR on @pd; ERR_PTR(-ENOSYS) without driver hook. */
+struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len)
+{
+	struct ib_mr *new_mr;
+
+	if (!pd->device->alloc_fast_reg_mr)
+		return ERR_PTR(-ENOSYS);
+
+	new_mr = pd->device->alloc_fast_reg_mr(pd, max_page_list_len);
+	if (IS_ERR(new_mr))
+		return new_mr;
+
+	new_mr->device = pd->device;
+	new_mr->pd = pd;
+	new_mr->uobject = NULL;
+	atomic_inc(&pd->usecnt);
+	atomic_set(&new_mr->usecnt, 0);
+	return new_mr;
+}
+EXPORT_SYMBOL(ib_alloc_fast_reg_mr);
+
+/* Allocate a page list from the driver and stamp it with @device and its length. */
+struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list(struct ib_device *device,
+							  int max_page_list_len)
+{
+	struct ib_fast_reg_page_list *pl;
+
+	if (!device->alloc_fast_reg_page_list)
+		return ERR_PTR(-ENOSYS);
+
+	pl = device->alloc_fast_reg_page_list(device, max_page_list_len);
+	if (IS_ERR(pl))
+		return pl;
+
+	pl->device = device;
+	pl->max_page_list_len = max_page_list_len;
+	return pl;
+}
+EXPORT_SYMBOL(ib_alloc_fast_reg_page_list);
+
+void ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list)
+{
+	page_list->device->free_fast_reg_page_list(page_list);	/* hook is called without a NULL check */
+}
+EXPORT_SYMBOL(ib_free_fast_reg_page_list);
+
+/* Memory windows */
+
+/* Allocate a memory window on @pd; ERR_PTR(-ENOSYS) if the driver cannot. */
+struct ib_mw *ib_alloc_mw(struct ib_pd *pd)
+{
+	struct ib_mw *new_mw;
+
+	if (!pd->device->alloc_mw)
+		return ERR_PTR(-ENOSYS);
+
+	new_mw = pd->device->alloc_mw(pd);
+	if (IS_ERR(new_mw))
+		return new_mw;
+	new_mw->device = pd->device;
+	new_mw->pd = pd;
+	new_mw->uobject = NULL;
+	atomic_inc(&pd->usecnt);
+	return new_mw;
+}
+EXPORT_SYMBOL(ib_alloc_mw);
+
+/* Free @mw; when the driver succeeds, decrement the use count of its PD. */
+int ib_dealloc_mw(struct ib_mw *mw)
+{
+	/* Capture the PD before the driver call, which presumably frees @mw. */
+	struct ib_pd *owner = mw->pd;
+	int err = mw->device->dealloc_mw(mw);
+
+	if (err)
+		return err;
+	atomic_dec(&owner->usecnt);
+	return 0;
+}
+EXPORT_SYMBOL(ib_dealloc_mw);
+
+/* "Fast" memory regions */
+
+/* Allocate an FMR on @pd; ERR_PTR(-ENOSYS) if the driver has no alloc_fmr. */
+struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd,
+			    int mr_access_flags,
+			    struct ib_fmr_attr *fmr_attr)
+{
+	struct ib_fmr *new_fmr;
+
+	if (!pd->device->alloc_fmr)
+		return ERR_PTR(-ENOSYS);
+
+	new_fmr = pd->device->alloc_fmr(pd, mr_access_flags, fmr_attr);
+	if (IS_ERR(new_fmr))
+		return new_fmr;
+	new_fmr->device = pd->device;
+	new_fmr->pd = pd;
+	atomic_inc(&pd->usecnt);
+	return new_fmr;
+}
+EXPORT_SYMBOL(ib_alloc_fmr);
+
+/* Unmap the FMRs on @fmr_list via the device of its first entry; 0 if empty. */
+int ib_unmap_fmr(struct list_head *fmr_list)
+{
+	struct ib_fmr *first;
+
+	if (list_empty(fmr_list))
+		return 0;	/* nothing to unmap */
+	first = list_entry(fmr_list->next, struct ib_fmr, list);
+	return first->device->unmap_fmr(fmr_list);
+}
+EXPORT_SYMBOL(ib_unmap_fmr);
+
+/* Free @fmr; when the driver succeeds, decrement the use count of its PD. */
+int ib_dealloc_fmr(struct ib_fmr *fmr)
+{
+	/* Capture the PD before the driver call, which presumably frees @fmr. */
+	struct ib_pd *owner = fmr->pd;
+	int err = fmr->device->dealloc_fmr(fmr);
+
+	if (err)
+		return err;
+	atomic_dec(&owner->usecnt);
+	return 0;
+}
+EXPORT_SYMBOL(ib_dealloc_fmr);
+
+/* Multicast groups */
+
+/* Attach @qp to the group @gid/@lid after per-transport sanity checks. */
+int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
+{
+	int raw_eth = (qp->qp_type == IB_QPT_RAW_ETH);
+
+	if (!qp->device->attach_mcast)
+		return -ENOSYS;
+	switch (rdma_node_get_transport(qp->device->node_type)) {
+	case RDMA_TRANSPORT_IB:
+		/* Raw Ethernet mgids need their 63 msb's clear; others need UD + 0xff. */
+		if (raw_eth ? !!(gid->global.subnet_prefix & cpu_to_be64(~1ULL)) :
+		    (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD))
+			return -EINVAL;
+		break;
+	case RDMA_TRANSPORT_IWARP:
+		if (!raw_eth)
+			return -EINVAL;
+		break;
+	}
+	return qp->device->attach_mcast(qp, gid, lid);
+}
+EXPORT_SYMBOL(ib_attach_mcast);
+
+/* Detach @qp from the group @gid/@lid after per-transport sanity checks. */
+int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
+{
+	int raw_eth = (qp->qp_type == IB_QPT_RAW_ETH);
+
+	if (!qp->device->detach_mcast)
+		return -ENOSYS;
+	switch (rdma_node_get_transport(qp->device->node_type)) {
+	case RDMA_TRANSPORT_IB:
+		/* Raw Ethernet mgids need their 63 msb's clear; others need UD + 0xff. */
+		if (raw_eth ? !!(gid->global.subnet_prefix & cpu_to_be64(~1ULL)) :
+		    (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD))
+			return -EINVAL;
+		break;
+	case RDMA_TRANSPORT_IWARP:
+		if (!raw_eth)
+			return -EINVAL;
+		break;
+	}
+	return qp->device->detach_mcast(qp, gid, lid);
+}
+EXPORT_SYMBOL(ib_detach_mcast);
+
+/* Free @xrcd unless something still holds a use count on it (-EBUSY). */
+int ib_dealloc_xrcd(struct ib_xrcd *xrcd)
+{
+	if (atomic_read(&xrcd->usecnt) != 0)
+		return -EBUSY;
+	return xrcd->device->dealloc_xrcd(xrcd);
+}
+EXPORT_SYMBOL(ib_dealloc_xrcd);
+
+/* Allocate an XRC domain on @device with no inode or userspace object. */
+struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device)
+{
+	struct ib_xrcd *domain;
+
+	if (!device->alloc_xrcd)
+		return ERR_PTR(-ENOSYS);
+	domain = device->alloc_xrcd(device, NULL, NULL);
+	if (IS_ERR(domain))
+		return domain;
+	domain->device = device;
+	domain->inode = NULL;
+	domain->uobject = NULL;
+	atomic_set(&domain->usecnt, 0);
+	return domain;
+}
+EXPORT_SYMBOL(ib_alloc_xrcd);
+
diff --git a/sys/ofed/drivers/infiniband/debug/Makefile b/sys/ofed/drivers/infiniband/debug/Makefile
new file mode 100644
index 0000000..e9d9f4b
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/debug/Makefile
@@ -0,0 +1,3 @@
+EXTRA_CFLAGS :=  $(subst $(KERNEL_MEMTRACK_CFLAGS),,$(EXTRA_CFLAGS))
+
+obj-m += memtrack.o
diff --git a/sys/ofed/drivers/infiniband/debug/memtrack.c b/sys/ofed/drivers/infiniband/debug/memtrack.c
new file mode 100644
index 0000000..199b33b
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/debug/memtrack.c
@@ -0,0 +1,600 @@
+/*
+  This software is available to you under a choice of one of two
+  licenses.  You may choose to be licensed under the terms of the GNU
+  General Public License (GPL) Version 2, available at
+  <http://www.fsf.org/copyleft/gpl.html>, or the OpenIB.org BSD
+  license, available in the LICENSE.TXT file accompanying this
+  software.  These details are also available at
+  <http://openib.org/license.html>.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+  BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+  ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+  CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+  SOFTWARE.
+
+  Copyright (c) 2004 Mellanox Technologies Ltd.  All rights reserved.
+*/
+
+#define C_MEMTRACK_C	/* NOTE(review): presumably tells memtrack.h it is built into the tracker itself -- confirm */
+
+#ifdef kmalloc	/* this and the blocks below remove any earlier macro definitions of the allocator names */
+        #undef kmalloc
+#endif
+#ifdef kfree
+        #undef kfree
+#endif
+#ifdef vmalloc
+        #undef vmalloc
+#endif
+#ifdef vfree
+        #undef vfree
+#endif
+#ifdef kmem_cache_alloc
+        #undef kmem_cache_alloc
+#endif
+#ifdef kmem_cache_free
+        #undef kmem_cache_free
+#endif
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/vmalloc.h>
+#include <linux/version.h>
+#include <asm/uaccess.h>
+#include <linux/proc_fs.h>
+#include <memtrack.h>
+
+#include <linux/moduleparam.h>
+
+
+MODULE_AUTHOR("Mellanox Technologies LTD.");
+MODULE_DESCRIPTION("Memory allocations tracking");
+MODULE_LICENSE("GPL");
+
+#define MEMTRACK_HASH_SZ ((1<<15)-19)   /* prime: http://www.utm.edu/research/primes/lists/2small/0bit.html */
+#define MAX_FILENAME_LEN 31	/* stored filename characters, not counting the terminating NUL */
+
+#define memtrack_spin_lock(spl, flags)     spin_lock_irqsave(spl, flags)	/* takes spl, saving IRQ state in flags */
+#define memtrack_spin_unlock(spl, flags)   spin_unlock_irqrestore(spl, flags)	/* releases spl, restoring IRQ state */
+
+/* If a bit is set then the corresponding allocation type is tracked.
+   bit0 corresponds to MEMTRACK_KMALLOC, bit1 corresponds to MEMTRACK_VMALLOC etc. */
+static unsigned long track_mask = -1;   /* effectively everything */
+module_param(track_mask, ulong, 0444);
+MODULE_PARM_DESC(track_mask, "bitmask defining what is tracked");
+
+/* If a bit is set then the corresponding allocation type is strictly tracked.
+   That is, before inserting a new allocation its whole range is checked for
+   overlap with the allocations already in the database (see check_overlap). */
+static unsigned long strict_track_mask = 0;     /* no strict tracking */
+module_param(strict_track_mask, ulong, 0444);
+MODULE_PARM_DESC(strict_track_mask, "bitmask which allocation requires strict tracking");
+
+typedef struct memtrack_meminfo_st {	/* one record per tracked allocation */
+        unsigned long addr;	/* tracked address; hashed as addr % MEMTRACK_HASH_SZ */
+        unsigned long size;	/* size passed to memtrack_alloc() */
+        unsigned long line_num;	/* source line passed to memtrack_alloc() */
+        struct memtrack_meminfo_st *next;	/* next record in the same hash bucket */
+        struct list_head list;  /* used to link all items from a certain type together */
+        char filename[MAX_FILENAME_LEN + 1];    /* putting the char array last is better for struct. packing */
+} memtrack_meminfo_t;
+
+static struct kmem_cache *meminfo_cache;	/* cache the memtrack_meminfo_t records are allocated from */
+
+typedef struct {	/* tracking state for one memory type */
+        memtrack_meminfo_t *mem_hash[MEMTRACK_HASH_SZ];	/* bucket heads, chained through ->next */
+        spinlock_t hash_lock;	/* held while mem_hash, count and tracked_objs_head are changed */
+        unsigned long count; /* size of memory tracked (*malloc) or number of objects tracked */
+        struct list_head tracked_objs_head;     /* head of list of all objects */
+        int strict_track;       /* if 1 then for each object inserted check if it overlaps any of the objects already in the list */
+} tracked_obj_desc_t;
+
+static tracked_obj_desc_t *tracked_objs_arr[MEMTRACK_NUM_OF_MEMTYPES];	/* per memory type; NULL means the type is not tracked */
+
+static const char *rsc_names[MEMTRACK_NUM_OF_MEMTYPES] = {	/* allocation call names, indexed by memory type */
+        "kmalloc",
+        "vmalloc",
+        "kmem_cache_alloc"
+};
+
+
+static const char *rsc_free_names[MEMTRACK_NUM_OF_MEMTYPES] = {	/* matching free call names, same indexing */
+        "kfree",
+        "vfree",
+        "kmem_cache_free"
+};
+
+
+/*
+ *  memtype_alloc_str - name of the allocation call that goes with @memtype,
+ *  or a placeholder string when @memtype is not one of the known types.
+ */
+static inline const char *memtype_alloc_str(memtrack_memtype_t memtype)
+{
+        if (memtype == MEMTRACK_KMALLOC || memtype == MEMTRACK_VMALLOC ||
+            memtype == MEMTRACK_KMEM_OBJ)
+                return rsc_names[memtype];
+        return "(Unknown allocation type)";
+}
+
+/*
+ *  memtype_free_str - name of the free call that goes with @memtype,
+ *  or a placeholder string when @memtype is not one of the known types.
+ */
+static inline const char *memtype_free_str(memtrack_memtype_t memtype)
+{
+        if (memtype == MEMTRACK_KMALLOC || memtype == MEMTRACK_VMALLOC ||
+            memtype == MEMTRACK_KMEM_OBJ)
+                return rsc_free_names[memtype];
+        return "(Unknown allocation type)";
+}
+
+/*
+ *  overlap_a_b - report whether two address ranges intersect.
+ *  Both ranges are inclusive: [a_start, a_end] and [b_start, b_end].
+ *  Returns 1 when they share at least one address, 0 otherwise.
+ */
+static int overlap_a_b(unsigned long a_start, unsigned long a_end,
+                       unsigned long b_start, unsigned long b_end)
+{
+        /* neither range may begin after the other one has ended */
+        return (b_start <= a_end) && (a_start <= b_end);
+}
+
+/*
+ *  check_overlap - log every tracked object of @obj_desc_p whose address
+ *  range intersects the range of @mem_info_p. Only reports; changes nothing.
+ */
+static void check_overlap(memtrack_memtype_t memtype,
+                          memtrack_meminfo_t * mem_info_p,
+                          tracked_obj_desc_t * obj_desc_p)
+{
+        struct list_head *pos, *next;
+        memtrack_meminfo_t *cur;
+        /* The candidate's inclusive range is the same for every comparison. */
+        const unsigned long new_start = mem_info_p->addr;
+        const unsigned long new_end = new_start + mem_info_p->size - 1;
+        unsigned long item_start, item_end;
+
+        list_for_each_safe(pos, next, &obj_desc_p->tracked_objs_head) {
+                cur = list_entry(pos, memtrack_meminfo_t, list);
+                item_start = cur->addr;
+                item_end = item_start + cur->size - 1;
+
+                if (!overlap_a_b(new_start, new_end, item_start, item_end))
+                        continue;
+                printk
+                    ("%s overlaps! new_start=0x%lx, new_end=0x%lx, item_start=0x%lx, item_end=0x%lx\n",
+                     memtype_alloc_str(memtype), new_start, new_end,
+                     item_start, item_end);
+        }
+}
+
+/*
+ * Invoke on memory allocation: record @addr/@size of type @memtype along with
+ * the allocating source location. Nothing is recorded when the type is not
+ * tracked, the bookkeeping entry cannot be allocated, or @addr is already known.
+ */
+void memtrack_alloc(memtrack_memtype_t memtype, unsigned long addr,
+                    unsigned long size, const char *filename,
+                    const unsigned long line_num, int alloc_flags)
+{
+        tracked_obj_desc_t *desc;
+        memtrack_meminfo_t *entry, *scan;
+        unsigned long bucket = addr % MEMTRACK_HASH_SZ;
+        unsigned long flags;
+        const char *name_src = filename;
+        size_t name_len;
+
+        if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) {
+                printk("%s: Invalid memory type (%d)\n", __func__, memtype);
+                return;
+        }
+
+        desc = tracked_objs_arr[memtype];
+        if (!desc)
+                return;         /* object is not tracked */
+
+        entry = (memtrack_meminfo_t *)
+            kmem_cache_alloc(meminfo_cache, alloc_flags);
+        if (entry == NULL) {
+                printk
+                    ("%s: Failed allocating kmem_cache item for new mem_info. "
+                     "Lost tracking on allocation at %s:%lu...\n", __func__,
+                     filename, line_num);
+                return;
+        }
+
+        /* save allocation properties */
+        entry->addr = addr;
+        entry->size = size;
+        entry->line_num = line_num;
+
+        /*
+         * Paths longer than MAX_FILENAME_LEN keep their tail rather than their
+         * head, so the printout still shows the actual file name.
+         */
+        name_len = strlen(filename);
+        if (name_len > MAX_FILENAME_LEN)
+                name_src += name_len - MAX_FILENAME_LEN;
+        strncpy(entry->filename, name_src, MAX_FILENAME_LEN);
+        entry->filename[MAX_FILENAME_LEN] = 0; /* NULL terminate anyway */
+
+        memtrack_spin_lock(&desc->hash_lock, flags);
+
+        /* make sure given memory location is not already allocated */
+        for (scan = desc->mem_hash[bucket]; scan != NULL; scan = scan->next) {
+                if (scan->addr != addr)
+                        continue;
+
+                /* Found given address in the database: complain and bail out */
+                printk
+                    ("mtl rsc inconsistency: %s: %s::%lu: %s @ addr=0x%lX which is already known from %s:%lu\n",
+                     __func__, filename, line_num,
+                     memtype_alloc_str(memtype), addr,
+                     scan->filename, scan->line_num);
+                memtrack_spin_unlock(&desc->hash_lock, flags);
+                kmem_cache_free(meminfo_cache, entry);
+                return;
+        }
+
+        /* not found - link as first in the hash bucket */
+        entry->next = desc->mem_hash[bucket];
+        desc->mem_hash[bucket] = entry;
+        if (desc->strict_track)
+                check_overlap(memtype, entry, desc);
+        desc->count += size;
+        list_add(&entry->list, &desc->tracked_objs_head);
+
+        memtrack_spin_unlock(&desc->hash_lock, flags);
+}
+
+/*
+ * Invoke on memory free: drop the record kept for @addr of type @memtype.
+ * Logs an inconsistency (using @filename/@line_num) when @addr is unknown.
+ * Types that are not tracked are silently ignored.
+ */
+void memtrack_free(memtrack_memtype_t memtype, unsigned long addr,
+                   const char *filename, const unsigned long line_num)
+{
+        const unsigned long bucket = addr % MEMTRACK_HASH_SZ;
+        tracked_obj_desc_t *desc;
+        memtrack_meminfo_t **link, *victim;
+        unsigned long flags;
+
+        if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) {
+                printk("%s: Invalid memory type (%d)\n", __func__, memtype);
+                return;
+        }
+
+        desc = tracked_objs_arr[memtype];
+        if (!desc)
+                return;         /* object is not tracked */
+
+        memtrack_spin_lock(&desc->hash_lock, flags);
+
+        /* find mem_info of given memory location, remembering the link to it */
+        link = &desc->mem_hash[bucket];
+        for (victim = *link; victim != NULL; victim = *link) {
+                if (victim->addr == addr)
+                        break;
+                link = &victim->next;
+        }
+
+        if (victim == NULL) {
+                /* not found */
+                printk
+                    ("mtl rsc inconsistency: %s: %s::%lu: %s for unknown address=0x%lX\n",
+                     __func__, filename, line_num, memtype_free_str(memtype), addr);
+                memtrack_spin_unlock(&desc->hash_lock, flags);
+                return;
+        }
+
+        /*
+         * Found given address in the database - remove from the bucket/list.
+         * *link is either the bucket head or the previous entry's next pointer.
+         */
+        *link = victim->next;
+        list_del(&victim->list);
+        desc->count -= victim->size;
+        memtrack_spin_unlock(&desc->hash_lock, flags);
+
+        kmem_cache_free(meminfo_cache, victim);
+}
+
+/*
+ * Report current allocations status (for all memory types)
+ */
+static void memtrack_report(void)
+{
+        memtrack_memtype_t type;
+        unsigned long bucket;
+        memtrack_meminfo_t *info;
+        tracked_obj_desc_t *desc;
+        unsigned long flags;
+        int serial = 1;
+
+        printk("%s: Currently known allocations:\n", __func__);
+        for (type = 0; type < MEMTRACK_NUM_OF_MEMTYPES; type++) {
+                desc = tracked_objs_arr[type];
+                if (!desc)
+                        continue;       /* this type is not tracked */
+
+                printk("%d) %s:\n", serial, memtype_alloc_str(type));
+                /* TBD: this may be optimized by holding a linked list of all hash items */
+                for (bucket = 0; bucket < MEMTRACK_HASH_SZ; bucket++) {
+                        /* the lock is taken and dropped once per bucket */
+                        memtrack_spin_lock(&desc->hash_lock, flags);
+                        for (info = desc->mem_hash[bucket]; info != NULL;
+                             info = info->next) {
+                                printk("%s::%lu: %s(%lu)==%lX\n",
+                                       info->filename,
+                                       info->line_num,
+                                       memtype_alloc_str(type),
+                                       info->size,
+                                       info->addr);
+                        }
+                        memtrack_spin_unlock(&desc->hash_lock, flags);
+                }
+                serial++;
+        }
+}
+
+
+
+static struct proc_dir_entry *memtrack_tree;
+
+/* Map a resource name to its memtype; MEMTRACK_NUM_OF_MEMTYPES if unknown */
+static memtrack_memtype_t get_rsc_by_name(const char *name)
+{
+        memtrack_memtype_t type = 0;
+
+        /* stop at the first matching name, or run off the end of the table */
+        while (type < MEMTRACK_NUM_OF_MEMTYPES &&
+               strcmp(name, rsc_names[type]) != 0)
+                ++type;
+
+        return type;
+}
+
+
+/*
+ * procfs read handler: copies "<count>\n" for the memtype named by the file.
+ * Returns bytes copied, 0 at end of data, or a negative errno.
+ */
+static ssize_t memtrack_read(struct file *filp, char __user *buf,
+                             size_t size, loff_t *offset)
+{
+        /*
+         * kbuf/file_len are function-static, so the snapshot taken at offset 0
+         * is shared by every reader.  32 bytes hold any 64-bit unsigned long in
+         * decimal plus '\n' and NUL (the former 20-byte buffer did not).
+         */
+        static char kbuf[32];
+        static int file_len;
+        unsigned long cur, flags;
+        loff_t pos = *offset;
+        size_t left, to_ret;
+        memtrack_memtype_t memtype;
+
+        if (pos < 0)
+                return -EINVAL;
+
+        memtype = get_rsc_by_name(filp->f_dentry->d_name.name);
+        if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) {
+                printk("invalid file name\n");
+                return -EINVAL;
+        }
+
+        if (pos == 0) {
+                memtrack_spin_lock(&tracked_objs_arr[memtype]->hash_lock, flags);
+                cur = tracked_objs_arr[memtype]->count;
+                memtrack_spin_unlock(&tracked_objs_arr[memtype]->hash_lock, flags);
+                file_len = snprintf(kbuf, sizeof(kbuf), "%lu\n", cur);
+        }
+
+        /* Offsets at or past the snapshot are EOF; never index beyond kbuf. */
+        if (pos >= file_len)
+                return 0;
+
+        left = (size_t)(file_len - pos);
+        to_ret = (left < size) ? left : size;
+        if (copy_to_user(buf, kbuf + pos, to_ret))
+                return -EFAULT;
+
+        *offset = pos + to_ret;
+        return to_ret;
+}
+
+static struct file_operations memtrack_proc_fops = {   /* only .read is provided */
+        .read = memtrack_read,
+};
+/* name handed to proc_mkdir()/remove_proc_entry() for the memtrack directory */
+static const char *memtrack_proc_entry_name = "mt_memtrack";
+
+/*
+ * Build the proc directory named by memtrack_proc_entry_name and, inside it,
+ * one read-only entry per memtype whose bit is set in track_mask.
+ * Returns 0 on success.  If an entry cannot be created, the entries made so
+ * far and the directory are removed again and -1 is returned.
+ */
+static int create_procfs_tree(void)
+{
+        struct proc_dir_entry *root, *entry;
+        unsigned long bit_mask;
+        int i, j;
+
+        root = proc_mkdir(memtrack_proc_entry_name, NULL);
+        if (root == NULL)
+                return -1;
+        memtrack_tree = root;
+
+        for (i = 0, bit_mask = 1; i < MEMTRACK_NUM_OF_MEMTYPES; ++i, bit_mask <<= 1) {
+                if (!(bit_mask & track_mask))
+                        continue;
+
+                entry = create_proc_entry(rsc_names[i], S_IRUGO, memtrack_tree);
+                if (entry == NULL)
+                        goto undo_create_root;
+                entry->proc_fops = &memtrack_proc_fops;
+        }
+
+        return 0;
+
+undo_create_root:
+        /* entries below index i whose bit is set in track_mask were created */
+        for (j = 0, bit_mask = 1; j < i; ++j, bit_mask <<= 1) {
+                if (bit_mask & track_mask)
+                        remove_proc_entry(rsc_names[j], memtrack_tree);
+        }
+        remove_proc_entry(memtrack_proc_entry_name, NULL);
+        return -1;
+}
+
+
+/* Remove every per-memtype proc entry, then the directory itself */
+static void destroy_procfs_tree(void)
+{
+        unsigned long bit_mask = 1;
+        int i;
+
+        for (i = 0; i < MEMTRACK_NUM_OF_MEMTYPES; ++i, bit_mask <<= 1)
+                if (track_mask & bit_mask)
+                        remove_proc_entry(rsc_names[i], memtrack_tree);
+
+        remove_proc_entry(memtrack_proc_entry_name, NULL);
+}
+
+
+/* module entry points */
+
+int init_module(void)
+{
+        memtrack_memtype_t i;
+        int j;
+        unsigned long bit_mask;
+        /* Sets up the record cache, one descriptor per memtype selected in
+         * track_mask, and the procfs tree.  Returns 0, or -1 after undoing. */
+        /* create a cache for the memtrack_meminfo_t structures */
+        meminfo_cache = kmem_cache_create("memtrack_meminfo_t",
+                                          sizeof(memtrack_meminfo_t), 0,
+                                          SLAB_HWCACHE_ALIGN, NULL);
+        if (!meminfo_cache) {
+                printk("memtrack::%s: failed to allocate meminfo cache\n", __func__);
+                return -1;
+        }
+
+        /* initialize array of descriptors */
+        memset(tracked_objs_arr, 0, sizeof(tracked_objs_arr));
+
+        /* create a tracking object descriptor for all required objects */
+        for (i = 0, bit_mask = 1; i < MEMTRACK_NUM_OF_MEMTYPES;
+             ++i, bit_mask <<= 1) {
+                if (bit_mask & track_mask) {
+                        tracked_objs_arr[i] =
+                            vmalloc(sizeof(tracked_obj_desc_t));
+                        if (!tracked_objs_arr[i]) {
+                                printk("memtrack: failed to allocate tracking object\n");
+                                goto undo_cache_create;
+                        }
+
+                        memset(tracked_objs_arr[i], 0, sizeof(tracked_obj_desc_t));
+                        spin_lock_init(&tracked_objs_arr[i]->hash_lock);
+                        INIT_LIST_HEAD(&tracked_objs_arr[i]->tracked_objs_head);
+                        if (bit_mask & strict_track_mask) {
+                                tracked_objs_arr[i]->strict_track = 1;
+                        } else {
+                                tracked_objs_arr[i]->strict_track = 0;
+                        }
+                }
+        }
+
+        /* the loop has finished here, so i == MEMTRACK_NUM_OF_MEMTYPES below */
+        if ( create_procfs_tree() ) {
+                  printk("%s: create_procfs_tree() failed\n", __FILE__);
+                  goto undo_cache_create;
+        }
+
+
+        printk("memtrack::%s done.\n", __func__);
+
+        return 0;
+        /* error path: free the descriptors at indices below i, then the cache */
+undo_cache_create:
+        for (j=0; j<i; ++j) {
+                if (tracked_objs_arr[j]) {
+                        vfree(tracked_objs_arr[j]);
+                }
+        }
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) /* result is only checked on these kernels */
+        if (kmem_cache_destroy(meminfo_cache) != 0) {
+                printk("Failed on kmem_cache_destroy !\n");
+        }
+#else
+        kmem_cache_destroy(meminfo_cache);
+#endif
+        return -1;
+}
+
+
+void cleanup_module(void)
+{
+        memtrack_memtype_t memtype;
+        unsigned long cur_bucket;
+        memtrack_meminfo_t *cur_mem_info_p, *next_mem_info_p;
+        tracked_obj_desc_t *obj_desc_p;
+        unsigned long flags;
+        /* Prints what is still recorded, removes the procfs tree, then frees
+         * every remaining record, every descriptor and finally the cache. */
+        memtrack_report();
+
+        /* proc entries are removed before the descriptors are freed */
+        destroy_procfs_tree();
+
+        /* clean up any hash table left-overs */
+        for (memtype = 0; memtype < MEMTRACK_NUM_OF_MEMTYPES; memtype++) {
+                /* Scan all buckets to find existing allocations */
+                /* TBD: this may be optimized by holding a linked list of all hash items */
+                if (tracked_objs_arr[memtype]) {
+                        obj_desc_p = tracked_objs_arr[memtype];
+                        for (cur_bucket = 0; cur_bucket < MEMTRACK_HASH_SZ;
+                             cur_bucket++) {
+                                memtrack_spin_lock(&obj_desc_p->hash_lock, flags);      /* protect per bucket/list */
+                                cur_mem_info_p =
+                                    obj_desc_p->mem_hash[cur_bucket];
+                                while (cur_mem_info_p != NULL) {        /* scan bucket */
+                                        next_mem_info_p = cur_mem_info_p->next; /* save "next" pointer before the "free" */
+                                        kmem_cache_free(meminfo_cache,
+                                                        cur_mem_info_p);
+                                        cur_mem_info_p = next_mem_info_p;
+                                }       /* while cur_mem_info_p */
+                                memtrack_spin_unlock(&obj_desc_p->hash_lock, flags);
+                        }       /* for cur_bucket */
+                        vfree(obj_desc_p);      /* tracked_objs_arr[memtype] is left pointing at it */
+                }
+        }                       /* for memtype */
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) /* result is only checked on these kernels */
+        if (kmem_cache_destroy(meminfo_cache) != 0) {
+                printk
+                    ("memtrack::cleanup_module: Failed on kmem_cache_destroy !\n");
+        }
+#else
+        kmem_cache_destroy(meminfo_cache);
+#endif
+        printk("memtrack::cleanup_module done.\n");
+}
+
+EXPORT_SYMBOL(memtrack_alloc);
+EXPORT_SYMBOL(memtrack_free);
+
+//module_init(memtrack_init)
+//module_exit(memtrack_exit)
+
diff --git a/sys/ofed/drivers/infiniband/debug/memtrack.h b/sys/ofed/drivers/infiniband/debug/memtrack.h
new file mode 100644
index 0000000..e443a31
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/debug/memtrack.h
@@ -0,0 +1,45 @@
+/*
+  This software is available to you under a choice of one of two
+  licenses.  You may choose to be licensed under the terms of the GNU
+  General Public License (GPL) Version 2, available at
+  <http://www.fsf.org/copyleft/gpl.html>, or the OpenIB.org BSD
+  license, available in the LICENSE.TXT file accompanying this
+  software.  These details are also available at
+  <http://openib.org/license.html>.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+  BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+  ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+  CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+  SOFTWARE.
+
+  Copyright (c) 2004 Mellanox Technologies Ltd.  All rights reserved.
+*/
+
+#ifndef H_MEMTRACK_H
+#define H_MEMTRACK_H
+
+typedef enum {                          /* kind of allocation a record belongs to */
+        MEMTRACK_KMALLOC,               /* kmalloc/kzalloc/kcalloc/kfree wrappers in mtrack.h */
+        MEMTRACK_VMALLOC,               /* vmalloc/vfree wrappers in mtrack.h */
+        MEMTRACK_KMEM_OBJ,              /* kmem_cache_alloc/kmem_cache_free wrappers in mtrack.h */
+        MEMTRACK_NUM_OF_MEMTYPES        /* number of types above; values >= this are rejected */
+} memtrack_memtype_t;
+
+/* Invoke on memory allocation */
+void memtrack_alloc(memtrack_memtype_t memtype, unsigned long addr,
+                    unsigned long size, const char *filename,
+                    const unsigned long line_num, int alloc_flags);
+
+/* Invoke on memory free */
+void memtrack_free(memtrack_memtype_t memtype, unsigned long addr,
+                   const char *filename, const unsigned long line_num);
+
+/* Report current allocations status (for all memory types) */
+/* we do not export this function since it is used by cleanup_module only */
+/* void memtrack_report(void); */
+
+#endif
diff --git a/sys/ofed/drivers/infiniband/debug/mtrack.h b/sys/ofed/drivers/infiniband/debug/mtrack.h
new file mode 100644
index 0000000..337d9c3
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/debug/mtrack.h
@@ -0,0 +1,138 @@
+#ifndef __mtrack_h_
+#define __mtrack_h_
+
+#include <memtrack.h>
+
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/version.h>
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14)
+#define RDMA_KZALLOC_H
+#define kzalloc(size, flags)  ({ \
+        void *__memtrack_kz_addr;                 \
+        /* this branch builds kzalloc from kmalloc() followed by memset() */ \
+        __memtrack_kz_addr = kmalloc(size, flags); \
+        if ( __memtrack_kz_addr ) {                               \
+                memset( __memtrack_kz_addr, 0, size) ; \
+        }                                                                     \
+        __memtrack_kz_addr;                                                                              \
+})
+/* NOTE: both variants expand their 'size' argument more than once */
+#else
+#define kzalloc(size, flags) ({ \
+        void *__memtrack_addr;                 \
+        /* call kzalloc(), then record a non-NULL result when size is non-zero */ \
+        __memtrack_addr = kzalloc(size, flags); \
+        if ( __memtrack_addr && (size)) {                               \
+                memtrack_alloc(MEMTRACK_KMALLOC, (unsigned long)(__memtrack_addr), size, __FILE__, __LINE__, flags); \
+        }                                                                     \
+        __memtrack_addr;                                                                              \
+})
+
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) /* here kcalloc is expressed through kzalloc() */
+#define kcalloc(n, size, flags) kzalloc((n)*(size), flags)
+#else
+#define kcalloc(n, size, flags) ({ \
+        void *__memtrack_addr;                 \
+        /* recorded size is (n)*(size); only 'size' is tested for zero, 'n' is not */ \
+        __memtrack_addr = kcalloc(n, size, flags); \
+        if ( __memtrack_addr && (size)) {                               \
+                memtrack_alloc(MEMTRACK_KMALLOC, (unsigned long)(__memtrack_addr), (n)*(size), __FILE__, __LINE__, flags); \
+        }                                                                     \
+        __memtrack_addr;                                                                              \
+})
+#endif
+
+
+
+#ifdef ZERO_OR_NULL_PTR /* when available, results matching ZERO_OR_NULL_PTR() are not recorded */
+#define kmalloc(sz, flgs) ({ \
+        void *__memtrack_addr;                 \
+        /* call kmalloc(), record the result as MEMTRACK_KMALLOC, yield the pointer */ \
+        __memtrack_addr = kmalloc(sz, flgs); \
+        if ( !ZERO_OR_NULL_PTR(__memtrack_addr)) {                               \
+                memtrack_alloc(MEMTRACK_KMALLOC, (unsigned long)(__memtrack_addr), sz, __FILE__, __LINE__, flgs); \
+        }                                                                     \
+        __memtrack_addr;                                                                              \
+})
+#else /* otherwise only a NULL result is left unrecorded */
+#define kmalloc(sz, flgs) ({ \
+        void *__memtrack_addr;                 \
+        /* call kmalloc(), record the result as MEMTRACK_KMALLOC, yield the pointer */ \
+        __memtrack_addr = kmalloc(sz, flgs); \
+        if ( __memtrack_addr ) {                               \
+                memtrack_alloc(MEMTRACK_KMALLOC, (unsigned long)(__memtrack_addr), sz, __FILE__, __LINE__, flgs); \
+        }                                                                     \
+        __memtrack_addr;                                                                              \
+})
+
+#endif
+
+#ifdef ZERO_OR_NULL_PTR /* tracked kfree(): drop the record, then really free */
+#define kfree(addr) ({ \
+        void *__memtrack_addr = (void *)(addr); /* cast the whole argument */ \
+        if (!ZERO_OR_NULL_PTR(__memtrack_addr)) { \
+                memtrack_free(MEMTRACK_KMALLOC, (unsigned long)(__memtrack_addr), __FILE__, __LINE__); \
+        } \
+        kfree(__memtrack_addr); \
+})
+#else /* no ZERO_OR_NULL_PTR: only a NULL pointer skips memtrack_free() */
+#define kfree(addr) ({ \
+        void *__memtrack_addr = (void *)(addr); /* cast the whole argument */ \
+        if (__memtrack_addr) { \
+                memtrack_free(MEMTRACK_KMALLOC, (unsigned long)(__memtrack_addr), __FILE__, __LINE__); \
+        } \
+        kfree(__memtrack_addr); \
+})
+#endif
+
+
+
+
+
+
+#define vmalloc(size) ({ \
+        void *__memtrack_addr;                 \
+        /* record a non-NULL result as MEMTRACK_VMALLOC; GFP_ATOMIC is passed as the alloc_flags argument */ \
+        __memtrack_addr = vmalloc(size); \
+        if ( __memtrack_addr ) {                               \
+                memtrack_alloc(MEMTRACK_VMALLOC, (unsigned long)(__memtrack_addr), size, __FILE__, __LINE__, GFP_ATOMIC); \
+        }                                                                     \
+        __memtrack_addr;                                                                              \
+})
+
+
+#define vfree(addr) ({ /* tracked vfree(): drop the record, then really free */ \
+        void *__memtrack_addr = (void *)(addr); /* cast the whole argument */ \
+        if (__memtrack_addr) { \
+                memtrack_free(MEMTRACK_VMALLOC, (unsigned long)(__memtrack_addr), __FILE__, __LINE__); \
+        } \
+        vfree(__memtrack_addr); \
+})
+
+
+#define kmem_cache_alloc(cache, flags) ({ \
+        void *__memtrack_addr;         \
+        /* record a non-NULL result as MEMTRACK_KMEM_OBJ; the size recorded is always 1 */ \
+        __memtrack_addr = kmem_cache_alloc(cache, flags); \
+        if ( __memtrack_addr ) {                               \
+                memtrack_alloc(MEMTRACK_KMEM_OBJ, (unsigned long)(__memtrack_addr), 1, __FILE__, __LINE__, flags); \
+        }                                                                     \
+        __memtrack_addr;                                                                              \
+})
+
+
+#define kmem_cache_free(cache, addr) ({ /* tracked: drop the record, then free */ \
+        void *__memtrack_addr = (void *)(addr); /* cast the whole argument */ \
+        if (__memtrack_addr) { \
+                memtrack_free(MEMTRACK_KMEM_OBJ, (unsigned long)(__memtrack_addr), __FILE__, __LINE__); \
+        } \
+        kmem_cache_free(cache, __memtrack_addr); \
+})
+
+
+#endif /* __mtrack_h_ */
+
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/Kconfig b/sys/ofed/drivers/infiniband/hw/mlx4/Kconfig
new file mode 100644
index 0000000..4175a4b
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/Kconfig
@@ -0,0 +1,8 @@
+config MLX4_INFINIBAND
+	tristate "Mellanox ConnectX HCA support"
+	select MLX4_CORE
+	---help---
+	  This driver provides low-level InfiniBand support for
+	  Mellanox ConnectX PCI Express host channel adapters (HCAs).
+	  This is required to use InfiniBand protocols such as
+	  IP-over-IB or SRP with these devices.
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/Makefile b/sys/ofed/drivers/infiniband/hw/mlx4/Makefile
new file mode 100644
index 0000000..ce885a8
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/Makefile
@@ -0,0 +1,4 @@
+obj-$(CONFIG_MLX4_INFINIBAND)	+= mlx4_ib.o
+
+mlx4_ib-y :=	ah.o cq.o doorbell.o mad.o main.o mr.o qp.o srq.o
+mlx4_ib-y +=	wc.o
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/ah.c b/sys/ofed/drivers/infiniband/hw/mlx4/ah.c
new file mode 100644
index 0000000..26251b47
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/ah.c
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "mlx4_ib.h"
+#include <rdma/ib_addr.h>
+#include <linux/inet.h>
+#include <linux/string.h>
+#include <rdma/ib_cache.h>
+
+/* Get the MAC for a link-local or multicast DGID; -EINVAL for anything else */
+int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah_attr,
+			u8 *mac, int *is_mcast, u8 port)
+{
+	struct mlx4_ib_iboe *iboe = &dev->iboe;
+	struct in6_addr in6;
+	int no_netdev;
+
+	*is_mcast = 0;
+	spin_lock(&iboe->lock);
+	no_netdev = !iboe->netdevs[port - 1];
+	spin_unlock(&iboe->lock);
+	if (no_netdev)
+		return -EINVAL;
+	memcpy(&in6, ah_attr->grh.dgid.raw, sizeof in6);
+	if (rdma_link_local_addr(&in6)) {
+		rdma_get_ll_mac(&in6, mac);
+		return 0;
+	}
+	if (!rdma_is_multicast_addr(&in6))
+		return -EINVAL;
+	rdma_get_mcast_mac(&in6, mac);
+	*is_mcast = 1;
+	return 0;
+}
+
+/* Fill in the InfiniBand address vector of 'ah' from ah_attr; cannot fail. */
+static struct ib_ah *create_ib_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr,
+				  struct mlx4_ib_ah *ah)
+{
+	struct mlx4_dev *dev = to_mdev(pd->device)->dev;
+
+	ah->av.ib.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24));
+	ah->av.ib.g_slid  = ah_attr->src_path_bits;
+	/* SL first: the GRH branch ORs traffic class and flow label into this word */
+	ah->av.ib.sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 28);
+	if (ah_attr->ah_flags & IB_AH_GRH) {
+		ah->av.ib.g_slid   |= 0x80;
+		ah->av.ib.gid_index = ah_attr->grh.sgid_index;
+		ah->av.ib.hop_limit = ah_attr->grh.hop_limit;
+		ah->av.ib.sl_tclass_flowlabel |=
+			cpu_to_be32((ah_attr->grh.traffic_class << 20) |
+				    ah_attr->grh.flow_label);
+		memcpy(ah->av.ib.dgid, ah_attr->grh.dgid.raw, 16);
+	}
+	ah->av.ib.dlid    = cpu_to_be16(ah_attr->dlid);
+	if (ah_attr->static_rate) {
+		ah->av.ib.stat_rate = ah_attr->static_rate + MLX4_STAT_RATE_OFFSET;
+		while (ah->av.ib.stat_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET &&
+		       !(1 << ah->av.ib.stat_rate & dev->caps.stat_rate_support))
+			--ah->av.ib.stat_rate;
+	}
+	return &ah->ibah;
+}
+
+static struct ib_ah *create_iboe_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr,
+				   struct mlx4_ib_ah *ah)
+{
+	struct mlx4_ib_dev *ibdev = to_mdev(pd->device);
+	struct mlx4_dev *dev = ibdev->dev;
+	u8 mac[6];
+	int err;
+	int is_mcast;
+	u16 vlan_tag;
+	union ib_gid sgid;
+	/* Fills the Ethernet address vector of 'ah'; returns &ah->ibah or an ERR_PTR */
+	err = mlx4_ib_resolve_grh(ibdev, ah_attr, mac, &is_mcast, ah_attr->port_num);
+	if (err)
+		return ERR_PTR(err);
+
+	memcpy(ah->av.eth.mac, mac, 6);
+	err = ib_get_cached_gid(pd->device, ah_attr->port_num, ah_attr->grh.sgid_index, &sgid);
+	if (err)
+		return ERR_PTR(err);
+	vlan_tag = rdma_get_vlan_id(&sgid);	/* taken from the cached source GID */
+	if (vlan_tag < 0x1000)
+		vlan_tag |= (ah_attr->sl & 7) << 13;	/* low three SL bits go into bits 13..15 */
+	ah->av.eth.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24));
+	ah->av.eth.gid_index = ah_attr->grh.sgid_index;
+	ah->av.eth.vlan = cpu_to_be16(vlan_tag);
+	if (ah_attr->static_rate) {
+		ah->av.eth.stat_rate = ah_attr->static_rate + MLX4_STAT_RATE_OFFSET;
+		while (ah->av.eth.stat_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET &&
+		       !(1 << ah->av.eth.stat_rate & dev->caps.stat_rate_support))
+			--ah->av.eth.stat_rate;
+	}
+
+	/* HW requires multicast LID so we just choose one.
+	 * NOTE(review): stored through av.ib while the rest of this function
+	 * uses av.eth -- presumably both overlay the same bytes; confirm. */
+	if (is_mcast)
+		ah->av.ib.dlid = cpu_to_be16(0xc000);
+
+	memcpy(ah->av.eth.dgid, ah_attr->grh.dgid.raw, 16);
+	ah->av.eth.sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 28);
+
+	return &ah->ibah;
+}
+
+/*
+ * Allocate an address handle and fill it in according to the link layer of
+ * the port named in ah_attr.  On an Ethernet port the attributes must carry
+ * a GRH.  Returns the new ib_ah or an ERR_PTR.
+ */
+struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
+{
+	struct mlx4_ib_ah *ah;
+	struct ib_ah *ibah;
+
+	ah = kzalloc(sizeof *ah, GFP_ATOMIC);
+	if (!ah)
+		return ERR_PTR(-ENOMEM);
+
+	if (rdma_port_get_link_layer(pd->device, ah_attr->port_num) != IB_LINK_LAYER_ETHERNET)
+		return create_ib_ah(pd, ah_attr, ah); /* never fails */
+
+	/* TBD: need to handle the case when we get called
+	in an atomic context and there we might sleep. We
+	don't expect this currently since we're working with
+	link local addresses which we can translate without
+	going to sleep */
+	if (ah_attr->ah_flags & IB_AH_GRH)
+		ibah = create_iboe_ah(pd, ah_attr, ah);
+	else
+		ibah = ERR_PTR(-EINVAL);
+
+	/* the allocation is only kept when a handle is actually returned */
+	if (IS_ERR(ibah))
+		kfree(ah);
+	return ibah;
+}
+
+/* Rebuild *ah_attr from the address vector stored in ibah; always returns 0 */
+int mlx4_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
+{
+	struct mlx4_ib_ah *ah = to_mah(ibah);
+	u32 sl_tc_fl = be32_to_cpu(ah->av.ib.sl_tclass_flowlabel);
+
+	memset(ah_attr, 0, sizeof *ah_attr);
+	ah_attr->sl = sl_tc_fl >> 28;
+	ah_attr->port_num = be32_to_cpu(ah->av.ib.port_pd) >> 24;
+	/* dlid stays 0 (from the memset) unless the link layer is InfiniBand */
+	if (rdma_port_get_link_layer(ibah->device, ah_attr->port_num) ==
+	    IB_LINK_LAYER_INFINIBAND)
+		ah_attr->dlid = be16_to_cpu(ah->av.ib.dlid);
+	if (ah->av.ib.stat_rate)
+		ah_attr->static_rate = ah->av.ib.stat_rate - MLX4_STAT_RATE_OFFSET;
+	ah_attr->src_path_bits = ah->av.ib.g_slid & 0x7F;
+
+	if (!mlx4_ib_ah_grh_present(ah))
+		return 0;
+
+	ah_attr->ah_flags = IB_AH_GRH;
+	ah_attr->grh.traffic_class = sl_tc_fl >> 20;
+	ah_attr->grh.flow_label = sl_tc_fl & 0xfffff;
+	ah_attr->grh.hop_limit  = ah->av.ib.hop_limit;
+	ah_attr->grh.sgid_index = ah->av.ib.gid_index;
+	memcpy(ah_attr->grh.dgid.raw, ah->av.ib.dgid, 16);
+	return 0;
+}
+
+int mlx4_ib_destroy_ah(struct ib_ah *ah)
+{
+	kfree(to_mah(ah));	/* frees the mlx4_ib_ah obtained from this ib_ah via to_mah() */
+	return 0;		/* always reports success */
+}
+
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/cq.c b/sys/ofed/drivers/infiniband/hw/mlx4/cq.c
new file mode 100644
index 0000000..31cd00d
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/cq.c
@@ -0,0 +1,861 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx4/cq.h>
+#include <linux/mlx4/qp.h>
+#include <linux/mlx4/srq.h>
+
+#include "mlx4_ib.h"
+#include "user.h"
+
+/* Which firmware version adds support for Resize CQ */
+#define MLX4_FW_VER_RESIZE_CQ  mlx4_fw_ver(2, 5, 0)
+
+/*
+ * Completion callback installed on the low-level CQ: forward the event to
+ * the consumer's completion handler with the consumer's CQ context.
+ */
+static void mlx4_ib_cq_comp(struct mlx4_cq *cq)
+{
+	struct mlx4_ib_cq *mibcq = to_mibcq(cq);
+
+	mibcq->ibcq.comp_handler(&mibcq->ibcq, mibcq->ibcq.cq_context);
+}
+
+/*
+ * Asynchronous event callback installed on the low-level CQ.
+ *
+ * Only MLX4_EVENT_TYPE_CQ_ERROR is expected; anything else is logged and
+ * dropped.  A CQ error is reported to the consumer as IB_EVENT_CQ_ERR if an
+ * event handler was registered.
+ */
+static void mlx4_ib_cq_event(struct mlx4_cq *cq, enum mlx4_event type)
+{
+	struct ib_cq *ibcq;
+	struct ib_event event;
+
+	if (type != MLX4_EVENT_TYPE_CQ_ERROR) {
+		printk(KERN_WARNING "mlx4_ib: Unexpected event type %d "
+		       "on CQ %06x\n", type, cq->cqn);
+		return;
+	}
+
+	ibcq = &to_mibcq(cq)->ibcq;
+	if (!ibcq->event_handler)
+		return;
+
+	event.device     = ibcq->device;
+	event.event      = IB_EVENT_CQ_ERR;
+	event.element.cq = ibcq;
+	ibcq->event_handler(&event, ibcq->cq_context);
+}
+
+/*
+ * Return the address of CQE number @n inside @buf.  @n is used as given;
+ * the caller is responsible for reducing it to a valid index.
+ */
+static void *get_cqe_from_buf(struct mlx4_ib_cq_buf *buf, int n)
+{
+	size_t byte_off = n * sizeof (struct mlx4_cqe);
+
+	return mlx4_buf_offset(&buf->buf, byte_off);
+}
+
+/*
+ * Return the address of CQE number @n in the CQ's current buffer.
+ * @n is not masked here.
+ */
+static void *get_cqe(struct mlx4_ib_cq *cq, int n)
+{
+	return get_cqe_from_buf(&cq->buf, n);
+}
+
+/*
+ * Return the CQE for (unmasked) index @n if software may consume it, or
+ * NULL otherwise.
+ *
+ * @n is masked with cq->ibcq.cqe to locate the entry.  The entry is handed
+ * back only when its owner bit agrees with bit (cq->ibcq.cqe + 1) of @n, so
+ * callers must pass the full running index, not an already-masked one.
+ */
+static void *get_sw_cqe(struct mlx4_ib_cq *cq, int n)
+{
+	struct mlx4_cqe *cqe = get_cqe(cq, n & cq->ibcq.cqe);
+	int owner_set = !!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK);
+	int wrap_set = !!(n & (cq->ibcq.cqe + 1));
+
+	if (owner_set != wrap_set)
+		return NULL;
+
+	return cqe;
+}
+
+/*
+ * Return the CQE at the current consumer index if get_sw_cqe() reports it
+ * as software-owned, otherwise NULL.
+ */
+static struct mlx4_cqe *next_cqe_sw(struct mlx4_ib_cq *cq)
+{
+	return get_sw_cqe(cq, cq->mcq.cons_index);
+}
+
+/*
+ * Pass @cq_count and @cq_period for @cq down to mlx4_cq_modify() and
+ * return its result.
+ */
+int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period)
+{
+	struct mlx4_ib_dev *mdev = to_mdev(cq->device);
+	struct mlx4_ib_cq *mibcq = to_mcq(cq);
+
+	return mlx4_cq_modify(mdev->dev, &mibcq->mcq, cq_count, cq_period);
+}
+
+/*
+ * Allocate a kernel CQ buffer for @nent CQEs and set up its MTT.
+ *
+ * Three steps, undone in reverse order on failure: allocate the buffer,
+ * initialise the MTT, write the buffer's pages into the MTT.
+ *
+ * Returns 0 on success or the error code of the step that failed.
+ */
+static int mlx4_ib_alloc_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf *buf, int nent)
+{
+	size_t buf_size = nent * sizeof(struct mlx4_cqe);
+	int err;
+
+	err = mlx4_buf_alloc(dev->dev, buf_size, PAGE_SIZE * 2, &buf->buf);
+	if (err)
+		return err;
+
+	err = mlx4_mtt_init(dev->dev, buf->buf.npages, buf->buf.page_shift,
+			    &buf->mtt);
+	if (err)
+		goto free_buf;
+
+	err = mlx4_buf_write_mtt(dev->dev, &buf->mtt, &buf->buf);
+	if (err)
+		goto free_mtt;
+
+	return 0;
+
+free_mtt:
+	mlx4_mtt_cleanup(dev->dev, &buf->mtt);
+
+free_buf:
+	mlx4_buf_free(dev->dev, buf_size, &buf->buf);
+
+	return err;
+}
+
+/*
+ * Free a kernel CQ buffer.  @cqe is the highest CQE index (entry count
+ * minus one), so the freed size is (@cqe + 1) entries.  The MTT is not
+ * touched here.
+ */
+static void mlx4_ib_free_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf *buf, int cqe)
+{
+	size_t buf_size = (cqe + 1) * sizeof(struct mlx4_cqe);
+
+	mlx4_buf_free(dev->dev, buf_size, &buf->buf);
+}
+
+/*
+ * Map a userspace CQ buffer and set up an MTT for it.
+ *
+ * @buf_addr: user virtual address of the buffer
+ * @cqe:      number of CQEs the buffer holds (used to size the mapping)
+ * @umem:     receives the ib_umem_get() result, including an error pointer
+ *            when that call fails
+ *
+ * Returns 0 on success.  On failure the umem (if obtained) is released and
+ * the MTT (if initialised) is cleaned up; *umem is not reset.
+ */
+static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, struct ib_ucontext *context,
+			       struct mlx4_ib_cq_buf *buf, struct ib_umem **umem,
+			       u64 buf_addr, int cqe)
+{
+	struct ib_umem *mapped;
+	int err;
+
+	mapped = ib_umem_get(context, buf_addr, cqe * sizeof (struct mlx4_cqe),
+			     IB_ACCESS_LOCAL_WRITE, 1);
+	*umem = mapped;
+	if (IS_ERR(mapped))
+		return PTR_ERR(mapped);
+
+	err = mlx4_mtt_init(dev->dev, ib_umem_page_count(mapped),
+			    ilog2(mapped->page_size), &buf->mtt);
+	if (err)
+		goto release_umem;
+
+	err = mlx4_ib_umem_write_mtt(dev, &buf->mtt, mapped);
+	if (err)
+		goto cleanup_mtt;
+
+	return 0;
+
+cleanup_mtt:
+	mlx4_mtt_cleanup(dev->dev, &buf->mtt);
+
+release_umem:
+	ib_umem_release(mapped);
+
+	return err;
+}
+
+/*
+ * Create a completion queue.
+ *
+ * @ibdev:   device to create the CQ on
+ * @entries: minimum number of CQEs requested; must be in
+ *           [1, caps.max_cqes].  The real size is entries + 1 rounded up to
+ *           a power of two, and ibcq.cqe is set to that size minus one.
+ * @vector:  completion vector; IB_CQ_VECTOR_LEAST_ATTACHED is translated to
+ *           MLX4_LEAST_ATTACHED_VECTOR
+ * @context: user context for a userspace CQ, NULL for a kernel CQ
+ * @udata:   user command/response buffer (only used when @context is set)
+ *
+ * For a userspace CQ the buffer and doorbell come from user memory described
+ * by the command in @udata, and the CQ number is copied back to the user.
+ * For a kernel CQ a doorbell record and buffer are allocated here.
+ *
+ * Returns the new ib_cq, or an ERR_PTR() on failure.
+ */
+struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector,
+				struct ib_ucontext *context,
+				struct ib_udata *udata)
+{
+	struct mlx4_ib_dev *dev = to_mdev(ibdev);
+	struct mlx4_ib_cq *cq;
+	struct mlx4_uar *uar;
+	int err;
+
+	if (entries < 1 || entries > dev->dev->caps.max_cqes) {
+		mlx4_ib_dbg("invalid num of entries: %d", entries);
+		return ERR_PTR(-EINVAL);
+	}
+
+	cq = kzalloc(sizeof *cq, GFP_KERNEL);
+	if (!cq)
+		return ERR_PTR(-ENOMEM);
+
+	entries      = roundup_pow_of_two(entries + 1);
+	cq->ibcq.cqe = entries - 1;
+	mutex_init(&cq->resize_mutex);
+	spin_lock_init(&cq->lock);
+	cq->resize_buf = NULL;
+	cq->resize_umem = NULL;
+
+	if (context) {
+		struct mlx4_ib_create_cq ucmd;
+
+		if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
+			err = -EFAULT;
+			goto err_cq;
+		}
+
+		err = mlx4_ib_get_cq_umem(dev, context, &cq->buf, &cq->umem,
+					  ucmd.buf_addr, entries);
+		if (err)
+			goto err_cq;
+
+		err = mlx4_ib_db_map_user(to_mucontext(context), ucmd.db_addr,
+					  &cq->db);
+		if (err)
+			goto err_mtt;
+
+		uar = &to_mucontext(context)->uar;
+	} else {
+		err = mlx4_db_alloc(dev->dev, &cq->db, 1);
+		if (err)
+			goto err_cq;
+
+		cq->mcq.set_ci_db  = cq->db.db;
+		cq->mcq.arm_db     = cq->db.db + 1;
+		*cq->mcq.set_ci_db = 0;
+		*cq->mcq.arm_db    = 0;
+
+		err = mlx4_ib_alloc_cq_buf(dev, &cq->buf, entries);
+		if (err)
+			goto err_db;
+
+		uar = &dev->priv_uar;
+	}
+
+	err = mlx4_cq_alloc(dev->dev, entries, &cq->buf.mtt, uar,
+			    cq->db.dma, &cq->mcq,
+			    vector == IB_CQ_VECTOR_LEAST_ATTACHED ?
+			    MLX4_LEAST_ATTACHED_VECTOR : vector, 0);
+	if (err)
+		goto err_dbmap;
+
+	cq->mcq.comp  = mlx4_ib_cq_comp;
+	cq->mcq.event = mlx4_ib_cq_event;
+
+	if (context)
+		if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof (__u32))) {
+			err = -EFAULT;
+			/*
+			 * The CQ has already been allocated at this point,
+			 * so it must be freed as well, not just the
+			 * doorbell/MTT/buffer below.
+			 */
+			goto err_cq_free;
+		}
+
+	return &cq->ibcq;
+
+err_cq_free:
+	mlx4_cq_free(dev->dev, &cq->mcq);
+
+err_dbmap:
+	if (context)
+		mlx4_ib_db_unmap_user(to_mucontext(context), &cq->db);
+
+err_mtt:
+	mlx4_mtt_cleanup(dev->dev, &cq->buf.mtt);
+
+	if (context)
+		ib_umem_release(cq->umem);
+	else
+		mlx4_ib_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe);
+
+err_db:
+	if (!context)
+		mlx4_db_free(dev->dev, &cq->db);
+
+err_cq:
+	kfree(cq);
+
+	return ERR_PTR(err);
+}
+
+/*
+ * Allocate the staging buffer for resizing a kernel CQ to @entries CQEs.
+ *
+ * On success cq->resize_buf holds the new buffer and its highest CQE index
+ * (@entries - 1).  Returns -EBUSY if a resize buffer already exists,
+ * -ENOMEM if the descriptor cannot be allocated, or the error from
+ * mlx4_ib_alloc_cq_buf(); cq->resize_buf is NULL again after a failure of
+ * the latter.
+ */
+static int mlx4_alloc_resize_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq *cq,
+				  int entries)
+{
+	int err;
+
+	if (cq->resize_buf)
+		return -EBUSY;
+
+	cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_ATOMIC);
+	if (!cq->resize_buf)
+		return -ENOMEM;
+
+	err = mlx4_ib_alloc_cq_buf(dev, &cq->resize_buf->buf, entries);
+	if (!err) {
+		cq->resize_buf->cqe = entries - 1;
+		return 0;
+	}
+
+	/* Buffer allocation failed: drop the descriptor again. */
+	kfree(cq->resize_buf);
+	cq->resize_buf = NULL;
+
+	return err;
+}
+
+/*
+ * Set up the staging buffer for resizing a userspace CQ to @entries CQEs.
+ *
+ * The new buffer address comes from the resize command in @udata and is
+ * mapped with mlx4_ib_get_cq_umem() in the context of the CQ's current umem.
+ *
+ * Returns -EBUSY if cq->resize_umem is already set, -EFAULT if the command
+ * cannot be copied in, -ENOMEM if the descriptor cannot be allocated, or
+ * the mapping error; cq->resize_buf is NULL again after a mapping failure.
+ */
+static int mlx4_alloc_resize_umem(struct mlx4_ib_dev *dev, struct mlx4_ib_cq *cq,
+				   int entries, struct ib_udata *udata)
+{
+	struct mlx4_ib_resize_cq ucmd;
+	int err;
+
+	if (cq->resize_umem)
+		return -EBUSY;
+
+	if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd))
+		return -EFAULT;
+
+	cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_ATOMIC);
+	if (!cq->resize_buf)
+		return -ENOMEM;
+
+	err = mlx4_ib_get_cq_umem(dev, cq->umem->context, &cq->resize_buf->buf,
+				  &cq->resize_umem, ucmd.buf_addr, entries);
+	if (!err) {
+		cq->resize_buf->cqe = entries - 1;
+		return 0;
+	}
+
+	/* Mapping failed: drop the descriptor again. */
+	kfree(cq->resize_buf);
+	cq->resize_buf = NULL;
+
+	return err;
+}
+
+/*
+ * Count the software-owned CQEs starting at the consumer index, i.e. the
+ * completions that have been produced but not yet polled.
+ *
+ * get_sw_cqe() masks the index itself and compares the entry's owner bit
+ * with bit (cq->ibcq.cqe + 1) of the index it is given.  It must therefore
+ * receive the full running index: pre-masking it clears that bit, which
+ * makes the ownership test wrong on every other pass through the ring and
+ * can yield a bogus count or a loop that never terminates.
+ */
+static int mlx4_ib_get_outstanding_cqes(struct mlx4_ib_cq *cq)
+{
+	u32 i;
+
+	i = cq->mcq.cons_index;
+	while (get_sw_cqe(cq, i))
+		++i;
+
+	return i - cq->mcq.cons_index;
+}
+
+/*
+ * Copy CQEs from the current buffer into cq->resize_buf, starting at the
+ * consumer index and stopping at the first CQE whose opcode is
+ * MLX4_CQE_OPCODE_RESIZE.
+ *
+ * The CQE at running index i is stored at index i + 1 of the new buffer,
+ * and its owner bit is rewritten from bit (resize_buf->cqe + 1) of i + 1.
+ * The consumer index is then advanced by one.
+ *
+ * NOTE(review): the loop has no bound other than finding a RESIZE CQE, and
+ * cq->resize_buf must be non-NULL; both are assumed to be guaranteed by the
+ * caller.
+ */
+static void mlx4_ib_cq_resize_copy_cqes(struct mlx4_ib_cq *cq)
+{
+	struct mlx4_cqe *cqe, *new_cqe;
+	int i;
+
+	i = cq->mcq.cons_index;
+	cqe = get_cqe(cq, i & cq->ibcq.cqe);
+	while ((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) != MLX4_CQE_OPCODE_RESIZE) {
+		/* Destination slot is shifted by one relative to the source index. */
+		new_cqe = get_cqe_from_buf(&cq->resize_buf->buf,
+					   (i + 1) & cq->resize_buf->cqe);
+		memcpy(new_cqe, get_cqe(cq, i & cq->ibcq.cqe), sizeof(struct mlx4_cqe));
+		/* Recompute the owner bit for the entry's position in the new ring. */
+		new_cqe->owner_sr_opcode = (cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK) |
+			(((i + 1) & (cq->resize_buf->cqe + 1)) ? MLX4_CQE_OWNER_MASK : 0);
+		cqe = get_cqe(cq, ++i & cq->ibcq.cqe);
+	}
+	++cq->mcq.cons_index;
+}
+
+/*
+ * Resize a completion queue.
+ *
+ * @ibcq:    CQ to resize
+ * @entries: minimum number of CQEs requested; must be in
+ *           [1, caps.max_cqes].  The real size is entries + 1 rounded up to
+ *           a power of two.
+ * @udata:   user command buffer (used only for userspace CQs)
+ *
+ * The whole operation runs under cq->resize_mutex.  A new buffer is staged
+ * in cq->resize_buf, the CQ is switched to it with mlx4_cq_resize(), and the
+ * old MTT and buffer are released.  For a kernel CQ the pending CQEs are
+ * copied over under cq->lock, unless the buffers were already swapped
+ * elsewhere (cq->resize_buf is then NULL and nothing is left to do).
+ *
+ * Returns 0 on success (including when the rounded size equals the current
+ * size), -ENOSYS if the firmware is older than MLX4_FW_VER_RESIZE_CQ,
+ * -EINVAL for an out-of-range request or a kernel CQ request smaller than
+ * the number of outstanding CQEs, or the error from allocation or
+ * mlx4_cq_resize().
+ */
+int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata)
+{
+	struct mlx4_ib_dev *dev = to_mdev(ibcq->device);
+	struct mlx4_ib_cq *cq = to_mcq(ibcq);
+	struct mlx4_mtt mtt;
+	int outst_cqe;
+	int err;
+
+	if (dev->dev->caps.fw_ver < MLX4_FW_VER_RESIZE_CQ)
+		return -ENOSYS;
+
+	mutex_lock(&cq->resize_mutex);
+
+	if (entries < 1 || entries > dev->dev->caps.max_cqes) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	entries = roundup_pow_of_two(entries + 1);
+	if (entries == ibcq->cqe + 1) {
+		/* Already the requested size: nothing to do. */
+		err = 0;
+		goto out;
+	}
+
+	if (ibcq->uobject) {
+		err = mlx4_alloc_resize_umem(dev, cq, entries, udata);
+		if (err)
+			goto out;
+	} else {
+		/*
+		 * Can't be smaller than the number of outstanding CQEs.
+		 * Report this as an error instead of returning success
+		 * without having resized anything.
+		 */
+		outst_cqe = mlx4_ib_get_outstanding_cqes(cq);
+		if (entries < outst_cqe + 1) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		err = mlx4_alloc_resize_buf(dev, cq, entries);
+		if (err)
+			goto out;
+	}
+
+	/* Remember the old MTT so it can be released after the switch. */
+	mtt = cq->buf.mtt;
+
+	err = mlx4_cq_resize(dev->dev, &cq->mcq, entries, &cq->resize_buf->buf.mtt);
+	if (err)
+		goto err_buf;
+
+	mlx4_mtt_cleanup(dev->dev, &mtt);
+	if (ibcq->uobject) {
+		cq->buf      = cq->resize_buf->buf;
+		cq->ibcq.cqe = cq->resize_buf->cqe;
+		ib_umem_release(cq->umem);
+		cq->umem     = cq->resize_umem;
+
+		kfree(cq->resize_buf);
+		cq->resize_buf = NULL;
+		cq->resize_umem = NULL;
+	} else {
+		struct mlx4_ib_cq_buf tmp_buf;
+		int tmp_cqe = 0;
+
+		spin_lock_irq(&cq->lock);
+		if (cq->resize_buf) {
+			mlx4_ib_cq_resize_copy_cqes(cq);
+			tmp_buf = cq->buf;
+			tmp_cqe = cq->ibcq.cqe;
+			cq->buf      = cq->resize_buf->buf;
+			cq->ibcq.cqe = cq->resize_buf->cqe;
+
+			kfree(cq->resize_buf);
+			cq->resize_buf = NULL;
+		}
+		spin_unlock_irq(&cq->lock);
+
+		/* Free the old buffer outside the spinlock; tmp_cqe != 0 means it was saved. */
+		if (tmp_cqe)
+			mlx4_ib_free_cq_buf(dev, &tmp_buf, tmp_cqe);
+	}
+
+	goto out;
+
+err_buf:
+	mlx4_mtt_cleanup(dev->dev, &cq->resize_buf->buf.mtt);
+	if (!ibcq->uobject)
+		mlx4_ib_free_cq_buf(dev, &cq->resize_buf->buf,
+				    cq->resize_buf->cqe);
+
+	kfree(cq->resize_buf);
+	cq->resize_buf = NULL;
+
+	if (cq->resize_umem) {
+		ib_umem_release(cq->resize_umem);
+		cq->resize_umem = NULL;
+	}
+
+out:
+	mutex_unlock(&cq->resize_mutex);
+	return err;
+}
+
+/*
+ * Destroy a completion queue: free the CQ and its MTT, release the
+ * doorbell and buffer (kernel-allocated or user-mapped, depending on
+ * whether the CQ has a uobject), then free the structure.
+ * Always returns 0.
+ */
+int mlx4_ib_destroy_cq(struct ib_cq *cq)
+{
+	struct mlx4_ib_cq *mibcq = to_mcq(cq);
+	struct mlx4_ib_dev *mdev = to_mdev(cq->device);
+
+	mlx4_cq_free(mdev->dev, &mibcq->mcq);
+	mlx4_mtt_cleanup(mdev->dev, &mibcq->buf.mtt);
+
+	if (!cq->uobject) {
+		/* Kernel CQ: buffer and doorbell were allocated by the driver. */
+		mlx4_ib_free_cq_buf(mdev, &mibcq->buf, cq->cqe);
+		mlx4_db_free(mdev->dev, &mibcq->db);
+	} else {
+		/* Userspace CQ: undo the doorbell and buffer mappings. */
+		mlx4_ib_db_unmap_user(to_mucontext(cq->uobject->context), &mibcq->db);
+		ib_umem_release(mibcq->umem);
+	}
+
+	kfree(mibcq);
+
+	return 0;
+}
+
+/*
+ * Log the first eight 32-bit big-endian words of a CQE, converted to CPU
+ * byte order, as a single KERN_DEBUG line.
+ */
+static void dump_cqe(void *cqe)
+{
+	__be32 *w = cqe;
+
+	printk(KERN_DEBUG "CQE contents %08x %08x %08x %08x %08x %08x %08x %08x\n",
+	       be32_to_cpu(w[0]), be32_to_cpu(w[1]),
+	       be32_to_cpu(w[2]), be32_to_cpu(w[3]),
+	       be32_to_cpu(w[4]), be32_to_cpu(w[5]),
+	       be32_to_cpu(w[6]), be32_to_cpu(w[7]));
+}
+
+/*
+ * Translate an error CQE into work completion status.
+ *
+ * Sets wc->status from the CQE syndrome (IB_WC_GENERAL_ERR for any syndrome
+ * not listed) and wc->vendor_err from the vendor syndrome.  A local QP
+ * operation error is additionally logged together with a dump of the CQE.
+ */
+static void mlx4_ib_handle_error_cqe(struct mlx4_err_cqe *cqe,
+				     struct ib_wc *wc)
+{
+	enum ib_wc_status status;
+
+	if (cqe->syndrome == MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR) {
+		printk(KERN_DEBUG "local QP operation err "
+		       "(QPN %06x, WQE index %x, vendor syndrome %02x, "
+		       "opcode = %02x)\n",
+		       be32_to_cpu(cqe->my_qpn), be16_to_cpu(cqe->wqe_index),
+		       cqe->vendor_err_syndrome,
+		       cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK);
+		dump_cqe(cqe);
+	}
+
+	switch (cqe->syndrome) {
+	case MLX4_CQE_SYNDROME_LOCAL_LENGTH_ERR:
+		status = IB_WC_LOC_LEN_ERR;
+		break;
+	case MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR:
+		status = IB_WC_LOC_QP_OP_ERR;
+		break;
+	case MLX4_CQE_SYNDROME_LOCAL_PROT_ERR:
+		status = IB_WC_LOC_PROT_ERR;
+		break;
+	case MLX4_CQE_SYNDROME_WR_FLUSH_ERR:
+		status = IB_WC_WR_FLUSH_ERR;
+		break;
+	case MLX4_CQE_SYNDROME_MW_BIND_ERR:
+		status = IB_WC_MW_BIND_ERR;
+		break;
+	case MLX4_CQE_SYNDROME_BAD_RESP_ERR:
+		status = IB_WC_BAD_RESP_ERR;
+		break;
+	case MLX4_CQE_SYNDROME_LOCAL_ACCESS_ERR:
+		status = IB_WC_LOC_ACCESS_ERR;
+		break;
+	case MLX4_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR:
+		status = IB_WC_REM_INV_REQ_ERR;
+		break;
+	case MLX4_CQE_SYNDROME_REMOTE_ACCESS_ERR:
+		status = IB_WC_REM_ACCESS_ERR;
+		break;
+	case MLX4_CQE_SYNDROME_REMOTE_OP_ERR:
+		status = IB_WC_REM_OP_ERR;
+		break;
+	case MLX4_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR:
+		status = IB_WC_RETRY_EXC_ERR;
+		break;
+	case MLX4_CQE_SYNDROME_RNR_RETRY_EXC_ERR:
+		status = IB_WC_RNR_RETRY_EXC_ERR;
+		break;
+	case MLX4_CQE_SYNDROME_REMOTE_ABORTED_ERR:
+		status = IB_WC_REM_ABORT_ERR;
+		break;
+	default:
+		status = IB_WC_GENERAL_ERR;
+		break;
+	}
+
+	wc->status     = status;
+	wc->vendor_err = cqe->vendor_err_syndrome;
+}
+
+/*
+ * Decide whether the checksum reported in a CQE can be trusted.
+ *
+ * Returns 1 only if all of the following hold, 0 otherwise:
+ *  - of the IPV4, IPV4F, IPV4OPT, IPV6 and IPOK status bits, exactly IPV4
+ *    and IPOK are set;
+ *  - at least one of the UDP or TCP status bits is set;
+ *  - @checksum equals 0xffff.
+ *
+ * @status and @checksum are big-endian, as found in the CQE.
+ */
+static int mlx4_ib_ipoib_csum_ok(__be16 status, __be16 checksum)
+{
+	if ((status & cpu_to_be16(MLX4_CQE_STATUS_IPV4      |
+				  MLX4_CQE_STATUS_IPV4F     |
+				  MLX4_CQE_STATUS_IPV4OPT   |
+				  MLX4_CQE_STATUS_IPV6      |
+				  MLX4_CQE_STATUS_IPOK)) !=
+	    cpu_to_be16(MLX4_CQE_STATUS_IPV4 | MLX4_CQE_STATUS_IPOK))
+		return 0;
+
+	if (!(status & cpu_to_be16(MLX4_CQE_STATUS_UDP |
+				   MLX4_CQE_STATUS_TCP)))
+		return 0;
+
+	return checksum == cpu_to_be16(0xffff);
+}
+
+/*
+ * Consume one CQE from @cq and translate it into the work completion @wc.
+ *
+ * @cq:     CQ to poll
+ * @cur_qp: in/out cache of the QP the previous CQE belonged to; it is
+ *          looked up again only when the CQE's QPN differs
+ * @wc:     work completion to fill in
+ *
+ * Returns 0 when @wc was filled in, -EAGAIN when there is no
+ * software-owned CQE at the consumer index, and -EINVAL for a NOP send
+ * completion or a CQE that names an unknown QP or XRC SRQ.  The consumer
+ * index has already been advanced when -EINVAL is returned.
+ *
+ * A CQE with the RESIZE opcode is not reported: the CQ switches to the
+ * staged resize buffer (if there is one) and polling restarts.
+ *
+ * NOTE(review): no locking is done here; the visible caller,
+ * mlx4_ib_poll_cq(), holds cq->lock around the call.
+ */
+static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq,
+			    struct mlx4_ib_qp **cur_qp,
+			    struct ib_wc *wc)
+{
+	struct mlx4_cqe *cqe;
+	struct mlx4_qp *mqp;
+	struct mlx4_ib_wq *wq;
+	struct mlx4_ib_srq *srq;
+	struct mlx4_srq *msrq;
+	int is_send;
+	int is_error;
+	u32 g_mlpath_rqpn;
+	int is_xrc_recv = 0;
+	u16 wqe_ctr;
+
+repoll:
+	cqe = next_cqe_sw(cq);
+	if (!cqe)
+		return -EAGAIN;
+
+	++cq->mcq.cons_index;
+
+	/*
+	 * Make sure we read CQ entry contents after we've checked the
+	 * ownership bit.
+	 */
+	rmb();
+
+	is_send  = cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK;
+	is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
+		MLX4_CQE_OPCODE_ERROR;
+
+	if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_OPCODE_NOP &&
+		     is_send)) {
+		printk(KERN_WARNING "Completion for NOP opcode detected!\n");
+		return -EINVAL;
+	}
+
+	/* Resize CQ in progress */
+	if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_CQE_OPCODE_RESIZE)) {
+		if (cq->resize_buf) {
+			struct mlx4_ib_dev *dev = to_mdev(cq->ibcq.device);
+
+			/* Drop the old buffer and continue polling from the new one. */
+			mlx4_ib_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe);
+			cq->buf      = cq->resize_buf->buf;
+			cq->ibcq.cqe = cq->resize_buf->cqe;
+
+			kfree(cq->resize_buf);
+			cq->resize_buf = NULL;
+		}
+
+		goto repoll;
+	}
+
+	/*
+	 * A receive completion with bit 23 of vlan_my_qpn set is handled as
+	 * an XRC receive: the SRQ is found from the low 24 bits of
+	 * g_mlpath_rqpn and no QP is associated with the completion.
+	 */
+	if ((be32_to_cpu(cqe->vlan_my_qpn) & (1 << 23)) && !is_send) {
+		 /*
+		  * We do not have to take the XRC SRQ table lock here,
+		  * because CQs will be locked while XRC SRQs are removed
+		  * from the table.
+		  */
+		 msrq = __mlx4_srq_lookup(to_mdev(cq->ibcq.device)->dev,
+					 be32_to_cpu(cqe->g_mlpath_rqpn) &
+					 0xffffff);
+		 if (unlikely(!msrq)) {
+			 printk(KERN_WARNING "CQ %06x with entry for unknown "
+				"XRC SRQ %06x\n", cq->mcq.cqn,
+				be32_to_cpu(cqe->g_mlpath_rqpn) & 0xffffff);
+			 return -EINVAL;
+		 }
+		 is_xrc_recv = 1;
+		 srq = to_mibsrq(msrq);
+	} else if (!*cur_qp ||
+	    (be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) != (*cur_qp)->mqp.qpn) {
+		/*
+		 * We do not have to take the QP table lock here,
+		 * because CQs will be locked while QPs are removed
+		 * from the table.
+		 */
+		mqp = __mlx4_qp_lookup(to_mdev(cq->ibcq.device)->dev,
+				       be32_to_cpu(cqe->vlan_my_qpn));
+		if (unlikely(!mqp)) {
+			printk(KERN_WARNING "CQ %06x with entry for unknown QPN %06x\n",
+			       cq->mcq.cqn, be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK);
+			return -EINVAL;
+		}
+
+		*cur_qp = to_mibqp(mqp);
+	}
+
+	wc->qp = is_xrc_recv ? NULL: &(*cur_qp)->ibqp;
+
+	/* Recover the caller's wr_id and retire the corresponding WQE. */
+	if (is_send) {
+		wq = &(*cur_qp)->sq;
+		if (!(*cur_qp)->sq_signal_bits) {
+			/*
+			 * Resynchronise the tail with the WQE index from the
+			 * CQE (presumably because not every send WQE
+			 * produces a CQE in this mode -- TODO confirm).
+			 */
+			wqe_ctr = be16_to_cpu(cqe->wqe_index);
+			wq->tail += (u16) (wqe_ctr - (u16) wq->tail);
+		}
+		wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+		++wq->tail;
+	} else if (is_xrc_recv) {
+		wqe_ctr = be16_to_cpu(cqe->wqe_index);
+		wc->wr_id = srq->wrid[wqe_ctr];
+		mlx4_ib_free_srq_wqe(srq, wqe_ctr);
+	} else if ((*cur_qp)->ibqp.srq) {
+		srq = to_msrq((*cur_qp)->ibqp.srq);
+		wqe_ctr = be16_to_cpu(cqe->wqe_index);
+		wc->wr_id = srq->wrid[wqe_ctr];
+		mlx4_ib_free_srq_wqe(srq, wqe_ctr);
+	} else {
+		wq	  = &(*cur_qp)->rq;
+		wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+		++wq->tail;
+	}
+
+	/* For an error CQE only qp, wr_id, status and vendor_err are set. */
+	if (unlikely(is_error)) {
+		mlx4_ib_handle_error_cqe((struct mlx4_err_cqe *) cqe, wc);
+		return 0;
+	}
+
+	wc->status = IB_WC_SUCCESS;
+
+	if (is_send) {
+		wc->wc_flags = 0;
+		/* Opcodes not listed below leave wc->opcode untouched. */
+		switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
+		case MLX4_OPCODE_RDMA_WRITE_IMM:
+			wc->wc_flags |= IB_WC_WITH_IMM;
+			/* fall through */
+		case MLX4_OPCODE_RDMA_WRITE:
+			wc->opcode    = IB_WC_RDMA_WRITE;
+			break;
+		case MLX4_OPCODE_SEND_IMM:
+			wc->wc_flags |= IB_WC_WITH_IMM;
+			/* fall through */
+		case MLX4_OPCODE_SEND:
+		case MLX4_OPCODE_SEND_INVAL:
+			wc->opcode    = IB_WC_SEND;
+			break;
+		case MLX4_OPCODE_RDMA_READ:
+			wc->opcode    = IB_WC_RDMA_READ;
+			wc->byte_len  = be32_to_cpu(cqe->byte_cnt);
+			break;
+		case MLX4_OPCODE_ATOMIC_CS:
+			wc->opcode    = IB_WC_COMP_SWAP;
+			wc->byte_len  = 8;
+			break;
+		case MLX4_OPCODE_ATOMIC_FA:
+			wc->opcode    = IB_WC_FETCH_ADD;
+			wc->byte_len  = 8;
+			break;
+		case MLX4_OPCODE_MASKED_ATOMIC_CS:
+			wc->opcode    = IB_WC_MASKED_COMP_SWAP;
+			wc->byte_len  = 8;
+			break;
+		case MLX4_OPCODE_MASKED_ATOMIC_FA:
+			wc->opcode    = IB_WC_MASKED_FETCH_ADD;
+			wc->byte_len  = 8;
+			break;
+		case MLX4_OPCODE_BIND_MW:
+			wc->opcode    = IB_WC_BIND_MW;
+			break;
+		case MLX4_OPCODE_LSO:
+			wc->opcode    = IB_WC_LSO;
+			break;
+		case MLX4_OPCODE_FMR:
+			wc->opcode    = IB_WC_FAST_REG_MR;
+			break;
+		case MLX4_OPCODE_LOCAL_INVAL:
+			wc->opcode    = IB_WC_LOCAL_INV;
+			break;
+		}
+	} else {
+		wc->byte_len = be32_to_cpu(cqe->byte_cnt);
+
+		/* Opcodes not listed below leave opcode and wc_flags untouched. */
+		switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
+		case MLX4_RECV_OPCODE_RDMA_WRITE_IMM:
+			wc->opcode	= IB_WC_RECV_RDMA_WITH_IMM;
+			wc->wc_flags	= IB_WC_WITH_IMM;
+			wc->ex.imm_data = cqe->immed_rss_invalid;
+			break;
+		case MLX4_RECV_OPCODE_SEND_INVAL:
+			wc->opcode	= IB_WC_RECV;
+			wc->wc_flags	= IB_WC_WITH_INVALIDATE;
+			wc->ex.invalidate_rkey = be32_to_cpu(cqe->immed_rss_invalid);
+			break;
+		case MLX4_RECV_OPCODE_SEND:
+			wc->opcode   = IB_WC_RECV;
+			wc->wc_flags = 0;
+			break;
+		case MLX4_RECV_OPCODE_SEND_IMM:
+			wc->opcode	= IB_WC_RECV;
+			wc->wc_flags	= IB_WC_WITH_IMM;
+			wc->ex.imm_data = cqe->immed_rss_invalid;
+			break;
+		}
+
+		/*
+		 * g_mlpath_rqpn packs the GRH flag (bit 31), the DLID path
+		 * bits (bits 24-30) and the source QPN (low 24 bits).
+		 */
+		wc->slid	   = be16_to_cpu(cqe->rlid);
+		wc->sl		   = be16_to_cpu(cqe->sl_vid) >> 12;
+		g_mlpath_rqpn	   = be32_to_cpu(cqe->g_mlpath_rqpn);
+		wc->src_qp	   = g_mlpath_rqpn & 0xffffff;
+		wc->dlid_path_bits = (g_mlpath_rqpn >> 24) & 0x7f;
+		wc->wc_flags	  |= g_mlpath_rqpn & 0x80000000 ? IB_WC_GRH : 0;
+		wc->pkey_index     = be32_to_cpu(cqe->immed_rss_invalid) & 0x7f;
+		wc->csum_ok	   = mlx4_ib_ipoib_csum_ok(cqe->status, cqe->checksum);
+	}
+
+	return 0;
+}
+
+/*
+ * Poll up to @num_entries completions from @ibcq into the array @wc.
+ *
+ * Runs under cq->lock with interrupts saved/restored.  The consumer index
+ * is published to the hardware only if at least one completion was
+ * polled.
+ *
+ * Returns the number of completions written to @wc, or a negative error
+ * if mlx4_ib_poll_one() failed with anything other than -EAGAIN.
+ */
+int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
+{
+	struct mlx4_ib_cq *cq = to_mcq(ibcq);
+	struct mlx4_ib_qp *cur_qp = NULL;
+	unsigned long flags;
+	int npolled = 0;
+	int err = 0;
+
+	spin_lock_irqsave(&cq->lock, flags);
+
+	while (npolled < num_entries) {
+		err = mlx4_ib_poll_one(cq, &cur_qp, wc + npolled);
+		if (err)
+			break;
+		++npolled;
+	}
+
+	if (npolled)
+		mlx4_cq_set_ci(&cq->mcq);
+
+	spin_unlock_irqrestore(&cq->lock, flags);
+
+	/* Running out of CQEs (-EAGAIN) is not an error for the caller. */
+	if (err && err != -EAGAIN)
+		return err;
+
+	return npolled;
+}
+
+/*
+ * Request a completion notification on @ibcq.
+ *
+ * If the solicited-mask part of @flags equals IB_CQ_SOLICITED the CQ is
+ * armed with MLX4_CQ_DB_REQ_NOT_SOL, otherwise with MLX4_CQ_DB_REQ_NOT.
+ * The doorbell is rung through the device's private UAR.  Always returns 0.
+ */
+int mlx4_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
+{
+	struct mlx4_ib_dev *mdev = to_mdev(ibcq->device);
+	struct mlx4_ib_cq *mibcq = to_mcq(ibcq);
+
+	mlx4_cq_arm(&mibcq->mcq,
+		    (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ?
+		    MLX4_CQ_DB_REQ_NOT_SOL : MLX4_CQ_DB_REQ_NOT,
+		    mdev->priv_uar.map,
+		    MLX4_GET_DOORBELL_LOCK(&mdev->uar_lock));
+
+	return 0;
+}
+
+/*
+ * Remove from @cq every pending CQE that belongs to QP number @qpn or,
+ * when @srq is an XRC SRQ (srq->ibsrq.xrc_cq is set), whose g_mlpath_rqpn
+ * names that SRQ.
+ *
+ * @srq may be NULL.  When it is not, the SRQ WQE of every removed receive
+ * completion is handed back with mlx4_ib_free_srq_wqe().
+ *
+ * Surviving CQEs are moved towards the producer end to close the gaps, and
+ * the consumer index is advanced by the number of entries removed.
+ *
+ * No locking is done here; mlx4_ib_cq_clean() is the variant that takes
+ * cq->lock around this function.
+ */
+void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq)
+{
+	u32 prod_index;
+	int nfreed = 0;
+	struct mlx4_cqe *cqe, *dest;
+	u8 owner_bit;
+	int is_xrc_srq = 0;
+
+	if (srq && srq->ibsrq.xrc_cq)
+		is_xrc_srq = 1;
+
+	/*
+	 * First we need to find the current producer index, so we
+	 * know where to start cleaning from.  It doesn't matter if HW
+	 * adds new entries after this loop -- the QP we're worried
+	 * about is already in RESET, so the new entries won't come
+	 * from our QP and therefore don't need to be checked.
+	 */
+	for (prod_index = cq->mcq.cons_index; get_sw_cqe(cq, prod_index); ++prod_index)
+		if (prod_index == cq->mcq.cons_index + cq->ibcq.cqe)
+			break;
+
+	/*
+	 * Now sweep backwards through the CQ, removing CQ entries
+	 * that match our QP by copying older entries on top of them.
+	 */
+	while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) {
+		cqe = get_cqe(cq, prod_index & cq->ibcq.cqe);
+		if (((be32_to_cpu(cqe->vlan_my_qpn) & 0xffffff) == qpn) ||
+		    (is_xrc_srq &&
+		     (be32_to_cpu(cqe->g_mlpath_rqpn) & 0xffffff) ==
+		      srq->msrq.srqn)) {
+			if (srq && !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK))
+				mlx4_ib_free_srq_wqe(srq, be16_to_cpu(cqe->wqe_index));
+			++nfreed;
+		} else if (nfreed) {
+			/*
+			 * Move this entry up by nfreed slots, keeping the
+			 * owner bit already present in the destination slot.
+			 */
+			dest = get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe);
+			owner_bit = dest->owner_sr_opcode & MLX4_CQE_OWNER_MASK;
+			memcpy(dest, cqe, sizeof *cqe);
+			dest->owner_sr_opcode = owner_bit |
+				(dest->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK);
+		}
+	}
+
+	if (nfreed) {
+		cq->mcq.cons_index += nfreed;
+		/*
+		 * Make sure update of buffer contents is done before
+		 * updating consumer index.
+		 */
+		wmb();
+		mlx4_cq_set_ci(&cq->mcq);
+	}
+}
+
+/*
+ * Locked variant of __mlx4_ib_cq_clean(): runs it with cq->lock held and
+ * interrupts disabled via spin_lock_irq().
+ */
+void mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq)
+{
+	spin_lock_irq(&cq->lock);
+	__mlx4_ib_cq_clean(cq, qpn, srq);
+	spin_unlock_irq(&cq->lock);
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/doorbell.c b/sys/ofed/drivers/infiniband/hw/mlx4/doorbell.c
new file mode 100644
index 0000000..8aee423
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/doorbell.c
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/slab.h>
+
+#include "mlx4_ib.h"
+
+/*
+ * One pinned user page holding doorbell records.  A page is shared by all
+ * doorbells of a user context that fall inside it and is released when the
+ * last of them is unmapped.
+ */
+struct mlx4_ib_user_db_page {
+	struct list_head	list;		/* entry in the context's db_page_list */
+	struct ib_umem	       *umem;		/* mapping of the page from ib_umem_get() */
+	unsigned long		user_virt;	/* page-aligned user virtual address */
+	int			refcnt;		/* number of doorbells mapped in this page */
+};
+
+/*
+ * Map the user doorbell record at user address @virt for use by hardware.
+ *
+ * The page containing @virt is looked up in the context's list of doorbell
+ * pages and pinned with ib_umem_get() if it is not there yet.  On success
+ * db->dma is the DMA address of the record, db->u.user_page points to the
+ * page descriptor, and the page's reference count is raised by one.
+ *
+ * Runs under context->db_page_mutex.  Returns 0, -ENOMEM, or the error
+ * from ib_umem_get().
+ */
+int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt,
+			struct mlx4_db *db)
+{
+	unsigned long page_virt = virt & PAGE_MASK;
+	struct mlx4_ib_user_db_page *page;
+	struct ib_umem_chunk *chunk;
+	int err = 0;
+
+	mutex_lock(&context->db_page_mutex);
+
+	/* Reuse an existing mapping of the same user page if there is one. */
+	list_for_each_entry(page, &context->db_page_list, list)
+		if (page->user_virt == page_virt)
+			goto found;
+
+	page = kmalloc(sizeof *page, GFP_KERNEL);
+	if (!page) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	page->user_virt = page_virt;
+	page->refcnt    = 0;
+	page->umem      = ib_umem_get(&context->ibucontext, page_virt,
+				      PAGE_SIZE, 0, 0);
+	if (IS_ERR(page->umem)) {
+		err = PTR_ERR(page->umem);
+		kfree(page);
+		goto out;
+	}
+
+	list_add(&page->list, &context->db_page_list);
+
+found:
+	/* DMA address of the page's first chunk plus the offset within the page. */
+	chunk = list_entry(page->umem->chunk_list.next, struct ib_umem_chunk, list);
+	db->dma		= sg_dma_address(chunk->page_list) + (virt & ~PAGE_MASK);
+	db->u.user_page = page;
+	++page->refcnt;
+
+out:
+	mutex_unlock(&context->db_page_mutex);
+
+	return err;
+}
+
+/*
+ * Drop one reference on the user doorbell page behind @db.  When the count
+ * reaches zero the page is unlinked from the context's list, its umem is
+ * released and the descriptor is freed.  Runs under context->db_page_mutex.
+ */
+void mlx4_ib_db_unmap_user(struct mlx4_ib_ucontext *context, struct mlx4_db *db)
+{
+	struct mlx4_ib_user_db_page *page;
+
+	mutex_lock(&context->db_page_mutex);
+
+	page = db->u.user_page;
+	--page->refcnt;
+	if (page->refcnt == 0) {
+		list_del(&page->list);
+		ib_umem_release(page->umem);
+		kfree(page);
+	}
+
+	mutex_unlock(&context->db_page_mutex);
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/mad.c b/sys/ofed/drivers/infiniband/hw/mlx4/mad.c
new file mode 100644
index 0000000..2bb87ab
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/mad.c
@@ -0,0 +1,452 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_smi.h>
+
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4_ib.h"
+
+enum {	/* extra management class values that ib_process_mad() accepts */
+	MLX4_IB_VENDOR_CLASS1 = 0x9,
+	MLX4_IB_VENDOR_CLASS2 = 0xa
+};
+
+int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int ignore_mkey, int ignore_bkey,
+		 int port, struct ib_wc *in_wc, struct ib_grh *in_grh,
+		 void *in_mad, void *response_mad)
+{
+	struct mlx4_cmd_mailbox *inmailbox, *outmailbox;
+	void *inbox;
+	int err;
+	u32 in_modifier = port;
+	u8 op_modifier = 0;
+	/* Run MAD_IFC on @in_mad; on success (return 0) @response_mad is filled. */
+	inmailbox = mlx4_alloc_cmd_mailbox(dev->dev);
+	if (IS_ERR(inmailbox))
+		return PTR_ERR(inmailbox);
+	inbox = inmailbox->buf;
+
+	outmailbox = mlx4_alloc_cmd_mailbox(dev->dev);
+	if (IS_ERR(outmailbox)) {
+		mlx4_free_cmd_mailbox(dev->dev, inmailbox);
+		return PTR_ERR(outmailbox);
+	}
+
+	memcpy(inbox, in_mad, 256);
+
+	/*
+	 * Key check traps can't be generated unless we have in_wc to
+	 * tell us where to send the trap.
+	 */
+	if (ignore_mkey || !in_wc)
+		op_modifier |= 0x1;
+	if (ignore_bkey || !in_wc)
+		op_modifier |= 0x2;
+
+	if (in_wc) {
+		struct {
+			__be32		my_qpn;
+			u32		reserved1;
+			__be32		rqpn;
+			u8		sl;
+			u8		g_path;
+			u16		reserved2[2];
+			__be16		pkey;
+			u32		reserved3[11];
+			u8		grh[40];
+		} *ext_info;
+		/* Extended info occupies the second 256 bytes of the inbox. */
+		memset(inbox + 256, 0, 256);
+		ext_info = inbox + 256;
+
+		ext_info->my_qpn = cpu_to_be32(in_wc->qp->qp_num);
+		ext_info->rqpn   = cpu_to_be32(in_wc->src_qp);
+		ext_info->sl     = in_wc->sl << 4;
+		ext_info->g_path = in_wc->dlid_path_bits |
+			(in_wc->wc_flags & IB_WC_GRH ? 0x80 : 0);
+		ext_info->pkey   = cpu_to_be16(in_wc->pkey_index);
+
+		if (in_grh)
+			memcpy(ext_info->grh, in_grh, 40);
+
+		op_modifier |= 0x4;
+		/* Widen slid first: shifting a promoted u16 by 16 can overflow int. */
+		in_modifier |= (u32) in_wc->slid << 16;
+	}
+
+	err = mlx4_cmd_box(dev->dev, inmailbox->dma, outmailbox->dma,
+			   in_modifier, op_modifier,
+			   MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C);
+	/* The response is only copied out when the command succeeded. */
+	if (!err)
+		memcpy(response_mad, outmailbox->buf, 256);
+
+	mlx4_free_cmd_mailbox(dev->dev, inmailbox);
+	mlx4_free_cmd_mailbox(dev->dev, outmailbox);
+
+	return err;
+}
+
+static void update_sm_ah(struct mlx4_ib_dev *dev, u8 port_num, u16 lid, u8 sl)
+{
+	struct ib_mad_agent *smi_agent = dev->send_agent[port_num - 1][0];
+	struct ib_ah_attr attr;
+	struct ib_ah *ah;
+
+	/* Without a registered agent there is no PD to create the AH on. */
+	if (!smi_agent)
+		return;
+	memset(&attr, 0, sizeof attr);
+	attr.port_num = port_num;
+	attr.dlid     = lid;
+	attr.sl       = sl;
+
+	ah = ib_create_ah(smi_agent->qp->pd, &attr);
+	if (IS_ERR(ah))
+		return;
+	/* Swap in the new handle, destroying any previous one, under sm_lock. */
+	spin_lock(&dev->sm_lock);
+	if (dev->sm_ah[port_num - 1])
+		ib_destroy_ah(dev->sm_ah[port_num - 1]);
+	dev->sm_ah[port_num - 1] = ah;
+	spin_unlock(&dev->sm_lock);
+}
+
+/*
+ * Snoop SM MADs for port info and P_Key table sets, so we can
+ * synthesize LID change and P_Key change events.
+ */
+static void smp_snoop(struct ib_device *ibdev, u8 port_num, struct ib_mad *mad,
+				u16 prev_lid)
+{
+	struct ib_port_info *pinfo;
+	struct ib_event event;
+
+	/* Only Set methods on the two subnet-management classes are snooped. */
+	if (mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED &&
+	    mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+		return;
+	if (mad->mad_hdr.method != IB_MGMT_METHOD_SET)
+		return;
+
+	event.device	       = ibdev;
+	event.element.port_num = port_num;
+
+	if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PKEY_TABLE) {
+		event.event = IB_EVENT_PKEY_CHANGE;
+		ib_dispatch_event(&event);
+		return;
+	}
+	if (mad->mad_hdr.attr_id != IB_SMP_ATTR_PORT_INFO)
+		return;
+
+	pinfo = (struct ib_port_info *) ((struct ib_smp *) mad)->data;
+
+	/* Refresh the address handle that forward_trap() sends through. */
+	update_sm_ah(to_mdev(ibdev), port_num, be16_to_cpu(pinfo->sm_lid),
+		     pinfo->neighbormtu_mastersmsl & 0xf);
+
+	if (pinfo->clientrereg_resv_subnetto & 0x80) {
+		event.event = IB_EVENT_CLIENT_REREGISTER;
+		ib_dispatch_event(&event);
+	}
+	if (prev_lid != be16_to_cpu(pinfo->lid)) {
+		event.event = IB_EVENT_LID_CHANGE;
+		ib_dispatch_event(&event);
+	}
+}
+
+static void node_desc_override(struct ib_device *dev, struct ib_mad *mad)
+{
+	/* Only NodeDescription GetResp SMPs get the device's node_desc. */
+	if (mad->mad_hdr.method != IB_MGMT_METHOD_GET_RESP ||
+	    mad->mad_hdr.attr_id != IB_SMP_ATTR_NODE_DESC ||
+	    (mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED &&
+	     mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE))
+		return;
+	spin_lock(&to_mdev(dev)->sm_lock);
+	memcpy(((struct ib_smp *) mad)->data, dev->node_desc, 64);
+	spin_unlock(&to_mdev(dev)->sm_lock);
+}
+
+static void forward_trap(struct mlx4_ib_dev *dev, u8 port_num, struct ib_mad *mad)
+{
+	/* Resend @mad to the SM: qpn 0 for LID-routed SMPs, qpn 1 otherwise. */
+	int qpn = mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED;
+	struct ib_mad_send_buf *send_buf;
+	struct ib_mad_agent *agent = dev->send_agent[port_num - 1][qpn];
+	int ret;
+
+	if (!agent)
+		return;
+	send_buf = ib_create_send_mad(agent, qpn, 0, 0, IB_MGMT_MAD_HDR,
+				      IB_MGMT_MAD_DATA, GFP_ATOMIC);
+	if (IS_ERR(send_buf))	/* creation failed: nothing to post or free */
+		return;
+	/*
+	 * MLX QPs don't use the address handle after the send is posted (wrong
+	 * per the IB spec strictly, but OK for our devices), so share sm_ah.
+	 */
+	spin_lock(&dev->sm_lock);
+	memcpy(send_buf->mad, mad, sizeof *mad);
+	if ((send_buf->ah = dev->sm_ah[port_num - 1]))
+		ret = ib_post_send_mad(send_buf, NULL);
+	else
+		ret = -EINVAL;
+	spin_unlock(&dev->sm_lock);
+	if (ret)
+		ib_free_send_mad(send_buf);
+}
+
+static int is_vendor_id(__be16 attr_id)
+{	/* Nonzero when every bit of IB_SMP_ATTR_VENDOR_MASK is set in attr_id. */
+	return (attr_id & IB_SMP_ATTR_VENDOR_MASK) == IB_SMP_ATTR_VENDOR_MASK;
+}
+
+static int supported_vendor_id(__be16 attr_id)
+{	/* attr_id is not examined: every vendor attribute is reported supported. */
+	return 1;
+}
+
+static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+                          struct ib_wc *in_wc, struct ib_grh *in_grh,
+                          struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+	u16 slid, prev_lid = 0;
+	int err;
+	struct ib_port_attr pattr;
+	/* Filter in_mad, run it through mlx4_MAD_IFC() and fix up out_mad. */
+	slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE);
+	/* Traps with a zero source LID are forwarded with forward_trap(). */
+	if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) {
+		forward_trap(to_mdev(ibdev), port_num, in_mad);
+		return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+	}
+
+	if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+	    in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+		if (in_mad->mad_hdr.method   != IB_MGMT_METHOD_GET &&
+		    in_mad->mad_hdr.method   != IB_MGMT_METHOD_SET &&
+		    in_mad->mad_hdr.method   != IB_MGMT_METHOD_TRAP_REPRESS)
+			return IB_MAD_RESULT_SUCCESS;
+
+		/*
+		 * Don't process SMInfo queries or vendor-specific
+		 * MADs -- the SMA can't handle them.
+		 */
+		if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO ||
+		    (is_vendor_id(in_mad->mad_hdr.attr_id) &&
+		    !supported_vendor_id(in_mad->mad_hdr.attr_id)))
+			return IB_MAD_RESULT_SUCCESS;
+	} else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT ||
+		   in_mad->mad_hdr.mgmt_class == MLX4_IB_VENDOR_CLASS1   ||
+		   in_mad->mad_hdr.mgmt_class == MLX4_IB_VENDOR_CLASS2   ||
+		   in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_CONG_MGMT) {
+		if (in_mad->mad_hdr.method  != IB_MGMT_METHOD_GET &&
+		    in_mad->mad_hdr.method  != IB_MGMT_METHOD_SET)
+			return IB_MAD_RESULT_SUCCESS;
+	} else
+		return IB_MAD_RESULT_SUCCESS;
+	/* Before a PortInfo Set, record the LID so smp_snoop() can spot a change. */
+	if ((in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+	     in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) &&
+	    in_mad->mad_hdr.method == IB_MGMT_METHOD_SET &&
+	    in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO &&
+	    !ib_query_port(ibdev, port_num, &pattr))
+		prev_lid = pattr.lid;
+
+	err = mlx4_MAD_IFC(to_mdev(ibdev),
+			   mad_flags & IB_MAD_IGNORE_MKEY,
+			   mad_flags & IB_MAD_IGNORE_BKEY,
+			   port_num, in_wc, in_grh, in_mad, out_mad);
+	if (err)
+		return IB_MAD_RESULT_FAILURE;
+	/* Snoop and patch the reply only when the response status is zero. */
+	if (!out_mad->mad_hdr.status) {
+		smp_snoop(ibdev, port_num, in_mad, prev_lid);
+		node_desc_override(ibdev, out_mad);
+	}
+
+	/* set return bit in status of directed route responses */
+	if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+		out_mad->mad_hdr.status |= cpu_to_be16(1 << 15);
+
+	if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS)
+		/* no response for trap repress */
+		return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+
+	return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+}
+
+static __be32 be64_to_be32(__be64 b64)
+{	/* Keep the low 32 bits of a big-endian 64-bit value, still big-endian. */
+	return cpu_to_be32(be64_to_cpu(b64) & 0xffffffff);
+}
+
+static void edit_counters(struct mlx4_counters *cnt, void *data)
+{	/* Store the low 32 bits of four counters at fixed byte offsets in @data. */
+	*(__be32 *)(data + 40 + 24) = be64_to_be32(cnt->tx_bytes);
+	*(__be32 *)(data + 40 + 28) = be64_to_be32(cnt->rx_bytes);
+	*(__be32 *)(data + 40 + 32) = be64_to_be32(cnt->tx_frames);
+	*(__be32 *)(data + 40 + 36) = be64_to_be32(cnt->rx_frames);
+}
+
+static void edit_ext_counters(struct mlx4_counters_ext *cnt, void *data)
+{	/* Same offsets as edit_counters(), fed from the unicast counters, plus rx errors. */
+	*(__be32 *)(data + 40 + 24) = be64_to_be32(cnt->tx_uni_bytes);
+	*(__be32 *)(data + 40 + 28) = be64_to_be32(cnt->rx_uni_bytes);
+	*(__be32 *)(data + 40 + 32) = be64_to_be32(cnt->tx_uni_frames);
+	*(__be32 *)(data + 40 + 36) = be64_to_be32(cnt->rx_uni_frames);
+	*(__be32 *)(data + 40 + 8) = be64_to_be32(cnt->rx_err_frames);
+}
+
+static int rdmaoe_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+                              struct ib_wc *in_wc, struct ib_grh *in_grh,
+                              struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+	struct mlx4_ib_dev *mdev = to_mdev(ibdev);
+	u32 counter_idx = mdev->counters[port_num - 1] & 0xffff;
+	struct mlx4_cmd_mailbox *mbox;
+	int result = IB_MAD_RESULT_FAILURE;
+	int mode;
+
+	/*
+	 * Only performance-management MADs are handled here; the reply is
+	 * built from the counter index stored for this port.
+	 */
+	if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_PERF_MGMT)
+		return -EINVAL;
+
+	mbox = mlx4_alloc_cmd_mailbox(mdev->dev);
+	if (IS_ERR(mbox))
+		return IB_MAD_RESULT_FAILURE;
+
+	if (mlx4_cmd_box(mdev->dev, 0, mbox->dma, counter_idx, 0,
+			 MLX4_CMD_QUERY_IF_STAT, MLX4_CMD_TIME_CLASS_C))
+		goto out;
+
+	memset(out_mad->data, 0, sizeof out_mad->data);
+	mode = be32_to_cpu(((struct mlx4_counters *)mbox->buf)->counter_mode) & 0xf;
+
+	/* counter_mode tells which counter layout the command returned. */
+	if (mode == 0)
+		edit_counters(mbox->buf, out_mad->data);
+	else if (mode == 1)
+		edit_ext_counters(mbox->buf, out_mad->data);
+	else
+		goto out;
+	result = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+
+out:
+	mlx4_free_cmd_mailbox(mdev->dev, mbox);
+
+	return result;
+}
+
+int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+			struct ib_wc *in_wc, struct ib_grh *in_grh,
+			struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+	enum rdma_link_layer ll = rdma_port_get_link_layer(ibdev, port_num);
+
+	/* Dispatch on the port's link layer; anything else is rejected. */
+	if (ll == IB_LINK_LAYER_INFINIBAND)
+		return ib_process_mad(ibdev, mad_flags, port_num, in_wc,
+				      in_grh, in_mad, out_mad);
+	if (ll == IB_LINK_LAYER_ETHERNET)
+		return rdmaoe_process_mad(ibdev, mad_flags, port_num, in_wc,
+					  in_grh, in_mad, out_mad);
+	return -EINVAL;
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+			 struct ib_mad_send_wc *mad_send_wc)
+{	/* Registered as the agents' send handler: just frees the send buffer. */
+	ib_free_send_mad(mad_send_wc->send_buf);
+}
+
+int mlx4_ib_mad_init(struct mlx4_ib_dev *dev)
+{
+	struct ib_mad_agent *agent;
+	enum rdma_link_layer ll;
+	int p, q, ret;
+
+	/* Register an SMI (q == 0) and a GSI (q == 1) agent on every IB port. */
+	for (p = 0; p < dev->num_ports; ++p) {
+		ll = rdma_port_get_link_layer(&dev->ib_dev, p + 1);
+		for (q = 0; q <= 1; ++q) {
+			if (ll != IB_LINK_LAYER_INFINIBAND) {
+				dev->send_agent[p][q] = NULL;
+				continue;
+			}
+			agent = ib_register_mad_agent(&dev->ib_dev, p + 1,
+						      q ? IB_QPT_GSI : IB_QPT_SMI,
+						      NULL, 0, send_handler,
+						      NULL, NULL);
+			if (IS_ERR(agent)) {
+				ret = PTR_ERR(agent);
+				goto err;
+			}
+			dev->send_agent[p][q] = agent;
+		}
+	}
+	return 0;
+
+err:
+	/* Unwind: unregister whatever agents are currently recorded. */
+	for (p = 0; p < dev->num_ports; ++p)
+		for (q = 0; q <= 1; ++q)
+			if (dev->send_agent[p][q])
+				ib_unregister_mad_agent(dev->send_agent[p][q]);
+	return ret;
+}
+
+void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev)
+{
+	int port, idx;
+
+	for (port = 0; port < dev->num_ports; ++port) {
+		/* Clear each slot before unregistering the agent it held. */
+		for (idx = 0; idx <= 1; ++idx) {
+			struct ib_mad_agent *agent = dev->send_agent[port][idx];
+
+			if (!agent)
+				continue;
+			dev->send_agent[port][idx] = NULL;
+			ib_unregister_mad_agent(agent);
+		}
+		if (dev->sm_ah[port])
+			ib_destroy_ah(dev->sm_ah[port]);
+	}
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/main.c b/sys/ofed/drivers/infiniband/hw/mlx4/main.c
new file mode 100644
index 0000000..bc99414
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/main.c
@@ -0,0 +1,1580 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/if_vlan.h>
+
+#include <rdma/ib_smi.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_addr.h>
+
+#include <linux/mlx4/driver.h>
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4_ib.h"
+#include "user.h"
+#include "wc.h"
+
+#define DRV_NAME	MLX4_IB_DRV_NAME
+#define DRV_VERSION	"1.0-ofed1.5.2"
+#define DRV_RELDATE	"August 4, 2010"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+
+#ifdef CONFIG_MLX4_DEBUG
+
+int mlx4_ib_debug_level = 0;
+module_param_named(debug_level, mlx4_ib_debug_level, int, 0644);
+MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
+
+#endif /* CONFIG_MLX4_DEBUG */
+
+static const char mlx4_ib_version[] =
+	DRV_NAME ": Mellanox ConnectX InfiniBand driver v"
+	DRV_VERSION " (" DRV_RELDATE ")\n";
+
+static void *get_ibdev(struct mlx4_dev *dev, void *ctx, u8 port)
+{
+	/* @ctx is the mlx4_ib_dev; @dev and @port are not consulted. */
+	return &((struct mlx4_ib_dev *) ctx)->ib_dev;
+}
+
+struct update_gid_work {	/* presumably a deferred GID-table update for @port on @dev -- confirm at its users */
+	struct work_struct work;
+	union ib_gid gids[128];
+	int port;
+	struct mlx4_ib_dev *dev;
+};
+
+static struct workqueue_struct *wq;
+
+static void init_query_mad(struct ib_smp *mad)
+{	/* Set up @mad as a version-1, LID-routed subnet-management Get. */
+	mad->base_version  = 1;
+	mad->mgmt_class    = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+	mad->class_version = 1;
+	mad->method	   = IB_MGMT_METHOD_GET;
+}
+
+static union ib_gid zgid;
+
+static int mlx4_ib_query_device(struct ib_device *ibdev,
+				struct ib_device_attr *props)
+{
+	struct mlx4_ib_dev *dev = to_mdev(ibdev);
+	struct ib_smp *in_mad  = NULL;
+	struct ib_smp *out_mad = NULL;
+	int err = -ENOMEM;
+	/* Fill @props from a NodeInfo MAD and the cached dev->dev->caps. */
+	in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
+	out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	init_query_mad(in_mad);
+	in_mad->attr_id = IB_SMP_ATTR_NODE_INFO;
+
+	err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, 1, NULL, NULL, in_mad, out_mad);
+	if (err)
+		goto out;
+
+	memset(props, 0, sizeof *props);
+
+	props->fw_ver = dev->dev->caps.fw_ver;
+	props->device_cap_flags    = IB_DEVICE_CHANGE_PHY_PORT |
+		IB_DEVICE_PORT_ACTIVE_EVENT		|
+		IB_DEVICE_SYS_IMAGE_GUID		|
+		IB_DEVICE_RC_RNR_NAK_GEN		|
+		IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
+	if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR)
+		props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
+	if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR)
+		props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
+	if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_APM)
+		props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
+	if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UD_AV_PORT)
+		props->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE;
+	if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_IPOIB_CSUM)
+		props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM;
+	if (dev->dev->caps.max_gso_sz && dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BLH)
+		props->device_cap_flags |= IB_DEVICE_UD_TSO;
+	if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_RESERVED_LKEY)
+		props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY;
+	if ((dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_LOCAL_INV) &&
+	    (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_REMOTE_INV) &&
+	    (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_FAST_REG_WR))
+		props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
+	if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC)
+		props->device_cap_flags |= IB_DEVICE_XRC;
+	if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_RAW_ETY)
+		props->max_raw_ethy_qp = dev->ib_dev.phys_port_cnt;
+	/* These four fields are decoded from the NodeInfo reply in out_mad. */
+	props->vendor_id	   = be32_to_cpup((__be32 *) (out_mad->data + 36)) &
+		0xffffff;
+	props->vendor_part_id	   = be16_to_cpup((__be16 *) (out_mad->data + 30));
+	props->hw_ver		   = be32_to_cpup((__be32 *) (out_mad->data + 32));
+	memcpy(&props->sys_image_guid, out_mad->data +	4, 8);
+	/* Resource limits come from the device capabilities, less reserved entries. */
+	props->max_mr_size	   = ~0ull;
+	props->page_size_cap	   = dev->dev->caps.page_size_cap;
+	props->max_qp		   = dev->dev->caps.num_qps - dev->dev->caps.reserved_qps;
+	props->max_qp_wr	   = dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE;
+	props->max_sge		   = min(dev->dev->caps.max_sq_sg,
+					 dev->dev->caps.max_rq_sg);
+	props->max_cq		   = dev->dev->caps.num_cqs - dev->dev->caps.reserved_cqs;
+	props->max_cqe		   = dev->dev->caps.max_cqes;
+	props->max_mr		   = dev->dev->caps.num_mpts - dev->dev->caps.reserved_mrws;
+	props->max_pd		   = dev->dev->caps.num_pds - dev->dev->caps.reserved_pds;
+	props->max_qp_rd_atom	   = dev->dev->caps.max_qp_dest_rdma;
+	props->max_qp_init_rd_atom = dev->dev->caps.max_qp_init_rdma;
+	props->max_res_rd_atom	   = props->max_qp_rd_atom * props->max_qp;
+	props->max_srq		   = dev->dev->caps.num_srqs - dev->dev->caps.reserved_srqs;
+	props->max_srq_wr	   = dev->dev->caps.max_srq_wqes - 1;
+	props->max_srq_sge	   = dev->dev->caps.max_srq_sge;
+	props->max_fast_reg_page_list_len = MAX_FAST_REG_PAGES;
+	props->local_ca_ack_delay  = dev->dev->caps.local_ca_ack_delay;
+	props->atomic_cap	   = dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_ATOMIC ?
+		IB_ATOMIC_HCA : IB_ATOMIC_NONE;
+	props->masked_atomic_cap   = IB_ATOMIC_HCA;
+	props->max_pkeys	   = dev->dev->caps.pkey_table_len[1];
+	props->max_mcast_grp	   = dev->dev->caps.num_mgms + dev->dev->caps.num_amgms;
+	props->max_mcast_qp_attach = dev->dev->caps.num_qp_per_mgm;
+	props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
+					   props->max_mcast_grp;
+	props->max_map_per_fmr = (1 << (32 - ilog2(dev->dev->caps.num_mpts))) - 1;
+
+out:
+	kfree(in_mad);
+	kfree(out_mad);
+
+	return err;
+}
+
+static enum rdma_link_layer
+mlx4_ib_port_link_layer(struct ib_device *device, u8 port_num)
+{
+	/* Every port type other than MLX4_PORT_TYPE_IB is reported as Ethernet. */
+	if (to_mdev(device)->dev->caps.port_mask[port_num] == MLX4_PORT_TYPE_IB)
+		return IB_LINK_LAYER_INFINIBAND;
+	return IB_LINK_LAYER_ETHERNET;
+}
+
+static void ib_link_query_port(struct ib_device *ibdev, u8 port,
+			       struct ib_port_attr *props,
+			       struct ib_smp *out_mad)
+{	/* Decode the reply in @out_mad into @props; table sizes and max_msg_sz come from caps. */
+	props->lid		= be16_to_cpup((__be16 *) (out_mad->data + 16));
+	props->lmc		= out_mad->data[34] & 0x7;
+	props->sm_lid		= be16_to_cpup((__be16 *) (out_mad->data + 18));
+	props->sm_sl		= out_mad->data[36] & 0xf;
+	props->state		= out_mad->data[32] & 0xf;
+	props->phys_state	= out_mad->data[33] >> 4;
+	props->port_cap_flags	= be32_to_cpup((__be32 *) (out_mad->data + 20));
+	props->gid_tbl_len	= to_mdev(ibdev)->dev->caps.gid_table_len[port];
+	props->max_msg_sz	= to_mdev(ibdev)->dev->caps.max_msg_sz;
+	props->pkey_tbl_len	= to_mdev(ibdev)->dev->caps.pkey_table_len[port];
+	props->bad_pkey_cntr	= be16_to_cpup((__be16 *) (out_mad->data + 46));
+	props->qkey_viol_cntr	= be16_to_cpup((__be16 *) (out_mad->data + 48));
+	props->active_width	= out_mad->data[31] & 0xf;
+	props->active_speed	= out_mad->data[35] >> 4;
+	props->max_mtu		= out_mad->data[41] & 0xf;
+	props->active_mtu	= out_mad->data[36] >> 4;
+	props->subnet_timeout	= out_mad->data[51] & 0x1f;
+	props->max_vl_num	= out_mad->data[37] >> 4;
+	props->init_type_reply	= out_mad->data[41] >> 4;
+	props->link_layer	= IB_LINK_LAYER_INFINIBAND;
+}
+
+#ifdef notyet	/* the two helpers below are only built when notyet is defined */
+static int eth_to_ib_width(int w)
+{
+	switch (w) {
+	case 4:
+		return IB_WIDTH_4X;
+	case 8:
+	case 16:
+		return IB_WIDTH_8X;
+	case 32:
+		return IB_WIDTH_12X;
+	default:
+		return IB_WIDTH_1X;
+	}
+}
+
+static int eth_to_ib_speed(int s)
+{
+	switch (s) {
+	case 256:
+		return 1;
+	case 512:
+		return 2;
+	case 1024:
+		return 4;
+	default:
+		return 1;
+	}
+}
+#endif /* notyet */
+
+static u8 state_to_phys_state(enum ib_port_state state)
+{	/* IB_PORT_ACTIVE maps to physical state 5; every other state maps to 3. */
+	return state == IB_PORT_ACTIVE ? 5 : 3;
+}
+
+static int eth_link_query_port(struct ib_device *ibdev, u8 port,
+			       struct ib_port_attr *props,
+			       struct ib_smp *out_mad)
+{
+	struct mlx4_ib_iboe *iboe = &to_mdev(ibdev)->iboe;
+	struct net_device *ndev;
+	enum ib_mtu tmp;
+	/* Defaults describe a down port; refined below if a netdev is attached. */
+	props->active_width	= IB_WIDTH_4X;
+	props->active_speed	= 1;
+	props->port_cap_flags	= IB_PORT_CM_SUP;
+	props->gid_tbl_len	= to_mdev(ibdev)->dev->caps.gid_table_len[port];
+	props->max_msg_sz	= to_mdev(ibdev)->dev->caps.max_msg_sz;
+	props->pkey_tbl_len	= 1;
+	props->bad_pkey_cntr	= be16_to_cpup((__be16 *) (out_mad->data + 46));
+	props->qkey_viol_cntr	= be16_to_cpup((__be16 *) (out_mad->data + 48));
+	props->max_mtu		= IB_MTU_2048;
+	props->subnet_timeout	= 0;
+	props->max_vl_num	= out_mad->data[37] >> 4;
+	props->init_type_reply	= 0;
+	props->link_layer	= IB_LINK_LAYER_ETHERNET;
+	props->state		= IB_PORT_DOWN;
+	props->phys_state	= state_to_phys_state(props->state);
+	props->active_mtu	= IB_MTU_256;
+	spin_lock(&iboe->lock);	/* netdevs[] is read under iboe->lock */
+	ndev = iboe->netdevs[port - 1];
+	if (!ndev)
+		goto out;
+
+#ifdef __linux__
+	tmp = iboe_get_mtu(ndev->mtu);
+#else
+	tmp = iboe_get_mtu(ndev->if_mtu);
+#endif
+	props->active_mtu = tmp ? min(props->max_mtu, tmp) : IB_MTU_256;
+	props->state		= netif_carrier_ok(ndev) &&  netif_oper_up(ndev) ?
+					IB_PORT_ACTIVE : IB_PORT_DOWN;
+	props->phys_state	= state_to_phys_state(props->state);
+
+out:
+	spin_unlock(&iboe->lock);
+	return 0;
+}
+
+static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port,
+			      struct ib_port_attr *props)
+{
+	struct ib_smp *in_mad  = NULL;
+	struct ib_smp *out_mad = NULL;
+	int err = -ENOMEM;
+	/* Query the attributes of @port into @props; returns 0 or an error. */
+	in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
+	out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	memset(props, 0, sizeof *props);
+
+	/* Fetch this port's PortInfo through MAD_IFC. */
+	init_query_mad(in_mad);
+	in_mad->attr_id  = IB_SMP_ATTR_PORT_INFO;
+	in_mad->attr_mod = cpu_to_be32(port);
+	err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad);
+	if (err)
+		goto out;
+
+	/* The helpers have different return types, so branch explicitly. */
+	if (mlx4_ib_port_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND)
+		ib_link_query_port(ibdev, port, props, out_mad);
+	else
+		err = eth_link_query_port(ibdev, port, props, out_mad);
+out:
+	kfree(in_mad);
+	kfree(out_mad);
+	return err;
+}
+
+static int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
+			       union ib_gid *gid)
+{
+	struct ib_smp *in_mad  = NULL;
+	struct ib_smp *out_mad = NULL;
+	int err = -ENOMEM;
+	/* GID = 8 bytes from PortInfo (data + 8) followed by the GUID at @index. */
+	in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
+	out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	init_query_mad(in_mad);
+	in_mad->attr_id  = IB_SMP_ATTR_PORT_INFO;
+	in_mad->attr_mod = cpu_to_be32(port);
+
+	err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad);
+	if (err)
+		goto out;
+
+	memcpy(gid->raw, out_mad->data + 8, 8);
+	/* GuidInfo is fetched in blocks of 8 GUIDs; ask for the block with @index. */
+	init_query_mad(in_mad);
+	in_mad->attr_id  = IB_SMP_ATTR_GUID_INFO;
+	in_mad->attr_mod = cpu_to_be32(index / 8);
+
+	err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad);
+	if (err)
+		goto out;
+
+	memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8);
+
+out:
+	kfree(in_mad);
+	kfree(out_mad);
+	return err;
+}
+
+static int iboe_query_gid(struct ib_device *ibdev, u8 port, int index,
+			    union ib_gid *gid)
+{
+	struct mlx4_ib_iboe *iboe = &to_mdev(ibdev)->iboe;
+
+	/* No MAD involved: copy the entry from the driver's own GID table. */
+	*gid = iboe->gid_table[port - 1][index];
+	return 0;
+}
+
+static int mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
+			     union ib_gid *gid)
+{
+	/* Non-InfiniBand ports read the driver's table; IB ports issue MADs. */
+	if (rdma_port_get_link_layer(ibdev, port) != IB_LINK_LAYER_INFINIBAND)
+		return iboe_query_gid(ibdev, port, index, gid);
+	return __mlx4_ib_query_gid(ibdev, port, index, gid);
+}
+
+static int mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+			      u16 *pkey)
+{
+	struct ib_smp *in_mad  = NULL;
+	struct ib_smp *out_mad = NULL;
+	int err = -ENOMEM;
+	/* Read P_Key @index of @port into @pkey; returns 0 or an error. */
+	in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
+	out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	init_query_mad(in_mad);
+	in_mad->attr_id  = IB_SMP_ATTR_PKEY_TABLE;
+	in_mad->attr_mod = cpu_to_be32(index / 32);
+
+	err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad);
+	if (err)
+		goto out;
+	/* The table is fetched in blocks of 32 entries; pick the one for @index. */
+	*pkey = be16_to_cpu(((__be16 *) out_mad->data)[index % 32]);
+
+out:
+	kfree(in_mad);
+	kfree(out_mad);
+	return err;
+}
+
+static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask,
+				 struct ib_device_modify *props)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	int err;
+	/* Only the node description can be changed; any other mask bit is refused. */
+	if (mask & ~IB_DEVICE_MODIFY_NODE_DESC)
+		return -EOPNOTSUPP;
+
+	if (!(mask & IB_DEVICE_MODIFY_NODE_DESC))
+		return 0;
+	/* The cached node_desc is updated under sm_lock. */
+	spin_lock(&to_mdev(ibdev)->sm_lock);
+	memcpy(ibdev->node_desc, props->node_desc, 64);
+	spin_unlock(&to_mdev(ibdev)->sm_lock);
+
+	/* if possible, pass node desc to FW, so it can generate
+	 * a 144 trap. If cmd fails, just ignore.
+	 */
+	mailbox = mlx4_alloc_cmd_mailbox(to_mdev(ibdev)->dev);
+	if (IS_ERR(mailbox))
+		return 0;
+
+	memset(mailbox->buf, 0, 256);
+	memcpy(mailbox->buf, props->node_desc, 64);
+	err = mlx4_cmd(to_mdev(ibdev)->dev, mailbox->dma, 1, 0,
+		       MLX4_CMD_SET_NODE, MLX4_CMD_TIME_CLASS_A);
+	if (err)
+		mlx4_ib_dbg("SET_NODE command failed (%d)", err);
+
+	mlx4_free_cmd_mailbox(to_mdev(ibdev)->dev, mailbox);
+
+	return 0;
+}
+
+static int mlx4_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols,
+			 u32 cap_mask)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	int err;
+	u8 is_eth = dev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH;
+	/* Issue SET_PORT carrying @cap_mask and the reset_qkey_viols flag. */
+	mailbox = mlx4_alloc_cmd_mailbox(dev->dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	memset(mailbox->buf, 0, 256);
+	/* The mailbox layout depends on MLX4_FLAG_OLD_PORT_CMDS. */
+	if (dev->dev->flags & MLX4_FLAG_OLD_PORT_CMDS) {
+		*(u8 *) mailbox->buf	     = !!reset_qkey_viols << 6;
+		((__be32 *) mailbox->buf)[2] = cpu_to_be32(cap_mask);
+	} else {
+		((u8 *) mailbox->buf)[3]     = !!reset_qkey_viols;
+		((__be32 *) mailbox->buf)[1] = cpu_to_be32(cap_mask);
+	}
+
+	err = mlx4_cmd(dev->dev, mailbox->dma, port, is_eth, MLX4_CMD_SET_PORT,
+		       MLX4_CMD_TIME_CLASS_B);
+
+	mlx4_free_cmd_mailbox(dev->dev, mailbox);
+	return err;
+}
+
+static int mlx4_ib_modify_port(struct ib_device *ibdev, u8 port, int mask,
+			       struct ib_port_modify *props)
+{
+	struct ib_port_attr attr;
+	u32 cap_mask;
+	int err;
+	/* cap_mask_mutex serializes the read-modify-write of the capability mask. */
+	mutex_lock(&to_mdev(ibdev)->cap_mask_mutex);
+
+	err = mlx4_ib_query_port(ibdev, port, &attr);
+	if (err)
+		goto out;
+	/* New mask = (current flags | bits to set) & ~bits to clear. */
+	cap_mask = (attr.port_cap_flags | props->set_port_cap_mask) &
+		~props->clr_port_cap_mask;
+
+	err = mlx4_SET_PORT(to_mdev(ibdev), port,
+			    !!(mask & IB_PORT_RESET_QKEY_CNTR),
+			    cap_mask);
+
+out:
+	mutex_unlock(&to_mdev(ibdev)->cap_mask_mutex);
+	return err;
+}
+
+static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev,
+						  struct ib_udata *udata)
+{
+	struct mlx4_ib_dev *dev = to_mdev(ibdev);
+	struct mlx4_ib_alloc_ucontext_resp resp;
+	struct mlx4_ib_ucontext *uctx;
+	int wc_on;
+	int err;
+
+	if (!dev->ib_active)
+		return ERR_PTR(-EAGAIN);
+
+	/* The bf_* fields are reported as zero unless mlx4_wc_enabled(). */
+	wc_on = mlx4_wc_enabled();
+	resp.qp_tab_size      = dev->dev->caps.num_qps;
+	resp.bf_reg_size      = wc_on ? dev->dev->caps.bf_reg_size : 0;
+	resp.bf_regs_per_page = wc_on ? dev->dev->caps.bf_regs_per_page : 0;
+
+	uctx = kzalloc(sizeof *uctx, GFP_KERNEL);
+	if (!uctx)
+		return ERR_PTR(-ENOMEM);
+
+	err = mlx4_uar_alloc(dev->dev, &uctx->uar);
+	if (err)
+		goto err_free;
+
+	INIT_LIST_HEAD(&uctx->db_page_list);
+	mutex_init(&uctx->db_page_mutex);
+
+	/* Hand the response to @udata; undo everything if the copy fails. */
+	if (ib_copy_to_udata(udata, &resp, sizeof resp)) {
+		err = -EFAULT;
+		goto err_uar;
+	}
+
+	return &uctx->ibucontext;
+
+err_uar:
+	mlx4_uar_free(dev->dev, &uctx->uar);
+err_free:
+	kfree(uctx);
+	return ERR_PTR(err);
+}
+
+static int mlx4_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
+{
+	struct mlx4_ib_ucontext *uctx = to_mucontext(ibcontext);
+	struct mlx4_ib_dev *mdev = to_mdev(ibcontext->device);
+	/* Release the UAR first, then the context that embeds it. */
+	mlx4_uar_free(mdev->dev, &uctx->uar);
+	kfree(uctx);
+	return 0;
+}
+
+static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+	struct mlx4_ib_dev *dev = to_mdev(context->device);
+	unsigned long pfn;
+
+	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+		return -EINVAL;
+	/* Offset 0 maps the UAR page; offset 1 needs a nonzero bf_reg_size. */
+	switch (vma->vm_pgoff) {
+	case 0:
+		pfn = to_mucontext(context)->uar.pfn;
+		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+		break;
+	case 1:
+		if (dev->dev->caps.bf_reg_size == 0)
+			return -EINVAL;
+		pfn = to_mucontext(context)->uar.pfn + dev->dev->caps.num_uars;
+		vma->vm_page_prot = pgprot_wc(vma->vm_page_prot);
+		break;
+	default:
+		return -EINVAL;
+	}
+	if (io_remap_pfn_range(vma, vma->vm_start, pfn, PAGE_SIZE,
+			       vma->vm_page_prot))
+		return -EAGAIN;
+	return 0;
+}
+
+/*
+ * Allocate a protection domain.
+ *
+ * When a user context is supplied the new PD number is copied to udata.
+ * Returns the embedded ib_pd, or an ERR_PTR() carrying -ENOMEM, the
+ * mlx4_pd_alloc() error, or -EFAULT if the copy to user space fails.
+ */
+static struct ib_pd *mlx4_ib_alloc_pd(struct ib_device *ibdev,
+				      struct ib_ucontext *context,
+				      struct ib_udata *udata)
+{
+	struct mlx4_dev *mdev = to_mdev(ibdev)->dev;
+	struct mlx4_ib_pd *pd;
+	int err;
+
+	pd = kzalloc(sizeof *pd, GFP_KERNEL);
+	if (pd == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	err = mlx4_pd_alloc(mdev, &pd->pdn);
+	if (err)
+		goto free_pd;
+
+	/* Kernel callers pass no context and get no copy-out. */
+	if (context && ib_copy_to_udata(udata, &pd->pdn, sizeof (__u32))) {
+		err = -EFAULT;
+		goto free_pdn;
+	}
+
+	return &pd->ibpd;
+
+free_pdn:
+	mlx4_pd_free(mdev, pd->pdn);
+free_pd:
+	kfree(pd);
+	return ERR_PTR(err);
+}
+
+/*
+ * Free a protection domain: release its PD number, then the memory.
+ * Always returns 0.
+ */
+static int mlx4_ib_dealloc_pd(struct ib_pd *pd)
+{
+	struct mlx4_ib_dev *mdev = to_mdev(pd->device);
+
+	mlx4_pd_free(mdev->dev, to_mpd(pd)->pdn);
+	kfree(pd);
+	return 0;
+}
+
+/*
+ * Record gid on the QP's gid_list.
+ *
+ * The entry notes whether mlx4_ib_add_mc() reported that it added the
+ * address, and on which port, so detach can undo it later.
+ * Returns 0, or -ENOMEM when the entry cannot be allocated.
+ */
+static int add_gid_entry(struct ib_qp *ibqp, union ib_gid *gid)
+{
+	struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
+	struct mlx4_ib_qp *mqp = to_mqp(ibqp);
+	struct gid_entry *entry;
+
+	entry = kzalloc(sizeof *entry, GFP_KERNEL);
+	if (entry == NULL)
+		return -ENOMEM;
+
+	entry->gid = *gid;
+	/* port/added stay zero (from kzalloc) unless the add succeeded. */
+	if (mlx4_ib_add_mc(mdev, mqp, gid)) {
+		entry->port = mqp->port;
+		entry->added = 1;
+	}
+
+	mutex_lock(&mqp->mutex);
+	list_add_tail(&entry->list, &mqp->gid_list);
+	mutex_unlock(&mqp->mutex);
+
+	return 0;
+}
+
+/*
+ * Add the multicast MAC derived from gid to the net device bound to the
+ * QP's port.
+ *
+ * Returns 1 when an address was added, and 0 when the QP has no port or
+ * the port currently has no net device.
+ */
+int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp,
+		   union ib_gid *gid)
+{
+	u8 mac[6];
+	struct net_device *ndev;
+	int ret = 0;
+
+	if (!mqp->port)
+		return 0;
+
+	/* Take a reference under the lock so the device cannot go away. */
+	spin_lock(&mdev->iboe.lock);
+	ndev = mdev->iboe.netdevs[mqp->port - 1];
+	if (ndev)
+		dev_hold(ndev);
+	spin_unlock(&mdev->iboe.lock);
+
+	if (ndev) {
+		rdma_get_mcast_mac((struct in6_addr *)gid, mac);
+		rtnl_lock();
+		/*
+		 * Use the device we hold a reference on; the netdevs[] slot
+		 * may have been cleared or replaced since the lock was dropped.
+		 */
+		dev_mc_add(ndev, mac, 6, 0);
+		ret = 1;
+		rtnl_unlock();
+		dev_put(ndev);
+	}
+
+	return ret;
+}
+
+/*
+ * Attach a QP to a multicast group and remember the GID on the QP.
+ *
+ * RAW_ETH QPs use the Ethernet multicast protocol, all others the IB one.
+ * If the GID cannot be recorded the hardware attach is rolled back.
+ * Returns 0 or the error of the failing step. lid is not used.
+ */
+static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
+	struct mlx4_ib_qp *mqp = to_mqp(ibqp);
+	int block_loopback;
+	int err;
+
+	block_loopback = !!(mqp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK);
+
+	err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw,
+				    block_loopback,
+				    (ibqp->qp_type == IB_QPT_RAW_ETH) ?
+				    MLX4_MCAST_PROT_EN : MLX4_MCAST_PROT_IB);
+	if (err)
+		return err;
+
+	err = add_gid_entry(ibqp, gid);
+	if (!err)
+		return 0;
+
+	/* Bookkeeping failed: undo the attach done above. */
+	mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw,
+			      (ibqp->qp_type == IB_QPT_RAW_ETH) ?
+			      MLX4_MCAST_PROT_EN : MLX4_MCAST_PROT_IB);
+	return err;
+}
+
+/*
+ * Look up the entry on qp->gid_list whose 16-byte GID equals raw.
+ * Returns the first match, or NULL when there is none.
+ */
+static struct gid_entry *find_gid_entry(struct mlx4_ib_qp *qp, u8 *raw)
+{
+	struct gid_entry *cur;
+	struct gid_entry *next;
+
+	list_for_each_entry_safe(cur, next, &qp->gid_list, list) {
+		if (memcmp(raw, cur->gid.raw, 16) == 0)
+			return cur;
+	}
+
+	return NULL;
+}
+
+/*
+ * Detach a QP from a multicast group and drop the matching gid_list entry.
+ *
+ * If the entry recorded that a multicast MAC was added to a net device,
+ * that address is removed again, provided the port still has a device.
+ * Returns the mlx4_multicast_detach() error, otherwise 0 (a missing
+ * gid_list entry is only logged). lid is not used.
+ */
+static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	int err;
+	struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
+	struct mlx4_ib_qp *mqp = to_mqp(ibqp);
+	u8 mac[6];
+	struct net_device *ndev;
+	struct gid_entry *ge;
+
+	err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw,
+				(ibqp->qp_type == IB_QPT_RAW_ETH) ?
+				MLX4_MCAST_PROT_EN : MLX4_MCAST_PROT_IB);
+	if (err)
+		return err;
+
+	mutex_lock(&mqp->mutex);
+	ge = find_gid_entry(mqp, gid->raw);
+	if (ge) {
+		/* Take a reference under the lock so the device cannot go away. */
+		spin_lock(&mdev->iboe.lock);
+		ndev = ge->added ? mdev->iboe.netdevs[ge->port - 1] : NULL;
+		if (ndev)
+			dev_hold(ndev);
+		spin_unlock(&mdev->iboe.lock);
+		if (ndev) {
+			rdma_get_mcast_mac((struct in6_addr *)gid, mac);
+			rtnl_lock();
+			/*
+			 * Use the device we hold a reference on; the netdevs[]
+			 * slot may have changed since the lock was dropped.
+			 */
+			dev_mc_delete(ndev, mac, 6, 0);
+			rtnl_unlock();
+			dev_put(ndev);
+		}
+		list_del(&ge->list);
+		kfree(ge);
+	} else
+		printk(KERN_WARNING "could not find mgid entry\n");
+
+	mutex_unlock(&mqp->mutex);
+
+	return 0;
+}
+
+/*
+ * Completion handler that intentionally does nothing. It is installed on
+ * the CQ created by mlx4_ib_alloc_xrcd().
+ */
+static void mlx4_dummy_comp_handler(struct ib_cq *cq, void *cq_context)
+{
+}
+
+/*
+ * Allocate an XRC domain together with the PD and the one-entry CQ that
+ * are stored in it.
+ *
+ * When a user context is supplied the XRC domain number is copied to
+ * udata. Returns the embedded ib_xrcd, or an ERR_PTR(): -ENOSYS when the
+ * device lacks the XRC capability flag, -ENOMEM, the error of the failing
+ * allocation step, or -EFAULT when the copy to user space fails.
+ */
+static struct ib_xrcd *mlx4_ib_alloc_xrcd(struct ib_device *ibdev,
+					  struct ib_ucontext *context,
+					  struct ib_udata *udata)
+{
+	struct mlx4_ib_dev *mdev = to_mdev(ibdev);
+	struct mlx4_ib_xrcd *xrcd;
+	struct ib_cq *cq;
+	struct ib_pd *pd;
+	int err;
+
+	if (!(mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC))
+		return ERR_PTR(-ENOSYS);
+
+	xrcd = kmalloc(sizeof *xrcd, GFP_KERNEL);
+	if (xrcd == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	err = mlx4_xrcd_alloc(mdev->dev, &xrcd->xrcdn);
+	if (err)
+		goto free_mem;
+
+	/* The PD and CQ are created directly, so set their device here. */
+	pd = mlx4_ib_alloc_pd(ibdev, NULL, NULL);
+	if (IS_ERR(pd)) {
+		err = PTR_ERR(pd);
+		goto free_xrcdn;
+	}
+	pd->device = ibdev;
+
+	cq = mlx4_ib_create_cq(ibdev, 1, 0, NULL, NULL);
+	if (IS_ERR(cq)) {
+		err = PTR_ERR(cq);
+		goto free_pd;
+	}
+	cq->device       = ibdev;
+	cq->comp_handler = mlx4_dummy_comp_handler;
+
+	if (context && ib_copy_to_udata(udata, &xrcd->xrcdn, sizeof(__u32))) {
+		err = -EFAULT;
+		goto free_cq;
+	}
+
+	xrcd->pd = pd;
+	xrcd->cq = cq;
+	return &xrcd->ibxrcd;
+
+free_cq:
+	mlx4_ib_destroy_cq(cq);
+free_pd:
+	mlx4_ib_dealloc_pd(pd);
+free_xrcdn:
+	mlx4_xrcd_free(mdev->dev, xrcd->xrcdn);
+free_mem:
+	kfree(xrcd);
+	return ERR_PTR(err);
+}
+
+/*
+ * Tear down an XRC domain: destroy its CQ and PD, release the XRC domain
+ * number and free the memory. Always returns 0.
+ */
+static int mlx4_ib_dealloc_xrcd(struct ib_xrcd *xrcd)
+{
+	struct mlx4_ib_dev *mdev = to_mdev(xrcd->device);
+	struct mlx4_ib_xrcd *mxrcd = to_mxrcd(xrcd);
+
+	mlx4_ib_destroy_cq(mxrcd->cq);
+	mlx4_ib_dealloc_pd(mxrcd->pd);
+	mlx4_xrcd_free(mdev->dev, mxrcd->xrcdn);
+	kfree(xrcd);
+
+	return 0;
+}
+
+
+/*
+ * Fill in the node description, node GUID and hardware revision of dev
+ * by issuing NODE_DESC and NODE_INFO queries through mlx4_MAD_IFC().
+ *
+ * Returns 0 on success, -ENOMEM when a MAD buffer cannot be allocated, or
+ * the error returned by mlx4_MAD_IFC().
+ */
+static int init_node_data(struct mlx4_ib_dev *dev)
+{
+	struct ib_smp *query;
+	struct ib_smp *reply;
+	int err;
+
+	query = kzalloc(sizeof *query, GFP_KERNEL);
+	reply = kmalloc(sizeof *reply, GFP_KERNEL);
+	if (query == NULL || reply == NULL) {
+		err = -ENOMEM;
+		goto done;
+	}
+
+	init_query_mad(query);
+
+	query->attr_id = IB_SMP_ATTR_NODE_DESC;
+	err = mlx4_MAD_IFC(dev, 1, 1, 1, NULL, NULL, query, reply);
+	if (err)
+		goto done;
+
+	memcpy(dev->ib_dev.node_desc, reply->data, 64);
+
+	query->attr_id = IB_SMP_ATTR_NODE_INFO;
+	err = mlx4_MAD_IFC(dev, 1, 1, 1, NULL, NULL, query, reply);
+	if (err)
+		goto done;
+
+	/* Revision is the big-endian word at offset 32, the GUID is at 12. */
+	dev->dev->rev_id = be32_to_cpup((__be32 *) (reply->data + 32));
+	memcpy(&dev->ib_dev.node_guid, reply->data + 12, 8);
+
+done:
+	/* Either buffer may be NULL on the allocation-failure path. */
+	kfree(query);
+	kfree(reply);
+	return err;
+}
+
+/* sysfs "hca_type": prints "MT<PCI device id>". */
+static ssize_t show_hca(struct device *device, struct device_attribute *attr,
+			char *buf)
+{
+	struct mlx4_ib_dev *ibdev;
+
+	ibdev = container_of(device, struct mlx4_ib_dev, ib_dev.dev);
+
+	return sprintf(buf, "MT%d\n", ibdev->dev->pdev->device);
+}
+
+/*
+ * sysfs "fw_ver": prints the firmware version as three dotted fields taken
+ * from bits 32 and up, bits 16-31 and bits 0-15 of caps.fw_ver.
+ */
+static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
+			   char *buf)
+{
+	struct mlx4_ib_dev *ibdev;
+
+	ibdev = container_of(device, struct mlx4_ib_dev, ib_dev.dev);
+
+	return sprintf(buf, "%d.%d.%d\n",
+		       (int) (ibdev->dev->caps.fw_ver >> 32),
+		       (int) (ibdev->dev->caps.fw_ver >> 16) & 0xffff,
+		       (int) ibdev->dev->caps.fw_ver & 0xffff);
+}
+
+/* sysfs "hw_rev": prints the device revision id in hexadecimal. */
+static ssize_t show_rev(struct device *device, struct device_attribute *attr,
+			char *buf)
+{
+	struct mlx4_ib_dev *ibdev;
+
+	ibdev = container_of(device, struct mlx4_ib_dev, ib_dev.dev);
+
+	return sprintf(buf, "%x\n", ibdev->dev->rev_id);
+}
+
+/*
+ * sysfs "board_id": prints at most MLX4_BOARD_ID_LEN characters of the
+ * board id string.
+ */
+static ssize_t show_board(struct device *device, struct device_attribute *attr,
+			  char *buf)
+{
+	struct mlx4_ib_dev *ibdev;
+
+	ibdev = container_of(device, struct mlx4_ib_dev, ib_dev.dev);
+
+	return sprintf(buf, "%.*s\n", MLX4_BOARD_ID_LEN, ibdev->dev->board_id);
+}
+
+/* Read-only device attributes backed by the show_*() helpers above. */
+static DEVICE_ATTR(hw_rev,   S_IRUGO, show_rev,    NULL);
+static DEVICE_ATTR(fw_ver,   S_IRUGO, show_fw_ver, NULL);
+static DEVICE_ATTR(hca_type, S_IRUGO, show_hca,    NULL);
+static DEVICE_ATTR(board_id, S_IRUGO, show_board,  NULL);
+
+/*
+ * Attribute files created one by one in mlx4_ib_add(); the array is walked
+ * with ARRAY_SIZE(), so it has no NULL terminator.
+ */
+static struct device_attribute *mlx4_class_attributes[] = {
+	&dev_attr_hw_rev,
+	&dev_attr_fw_ver,
+	&dev_attr_hca_type,
+	&dev_attr_board_id
+};
+
+/*
+ * Define a show function named show_rprt_<_name> and a read-only
+ * device_attribute dev_attr_<_name> that points to it. The show function
+ * forwards to show_diag_rprt() with the given counter _offset and _op_mod.
+ */
+#define DEVICE_DIAG_RPRT_ATTR(_name, _offset, _op_mod)		\
+static ssize_t show_rprt_##_name(struct device *dev,		\
+				 struct device_attribute *attr,	\
+				 char *buf){			\
+	return show_diag_rprt(dev, buf, _offset, _op_mod);	\
+}								\
+static DEVICE_ATTR(_name, S_IRUGO, show_rprt_##_name, NULL);
+
+/* op_modifier that clear_diag_counters() passes to mlx4_query_diag_counters(). */
+#define MLX4_DIAG_RPRT_CLEAR_DIAGS 3
+
+/*
+ * Query one diagnostic counter and print it to buf.
+ *
+ * offset selects the counter and op_modifier is passed through to
+ * mlx4_query_diag_counters(). Returns the number of characters written,
+ * or the (negative) query error converted to the return type, which the
+ * ssize_t wrappers generated by DEVICE_DIAG_RPRT_ATTR() hand back to sysfs.
+ */
+static size_t show_diag_rprt(struct device *device, char *buf,
+                              u32 offset, u8 op_modifier)
+{
+	int err;
+	u32 counter_offset = offset;
+	u32 diag_counter = 0;
+	struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev,
+					       ib_dev.dev);
+
+	err = mlx4_query_diag_counters(dev->dev, 1, op_modifier,
+				       &counter_offset, &diag_counter);
+	if (err)
+		return err;
+
+	/* The counter is unsigned; "%d" would print large values as negative. */
+	return sprintf(buf, "%u\n", diag_counter);
+}
+
+/*
+ * sysfs store handler for "clear_diag": any write issues the
+ * MLX4_DIAG_RPRT_CLEAR_DIAGS operation; the written data is ignored.
+ * Returns length on success or the query error.
+ */
+static ssize_t clear_diag_counters(struct device *device,
+				   struct device_attribute *attr,
+				   const char *buf, size_t length)
+{
+	struct mlx4_ib_dev *ibdev;
+	size_t status;
+
+	ibdev = container_of(device, struct mlx4_ib_dev, ib_dev.dev);
+
+	status = mlx4_query_diag_counters(ibdev->dev, 0,
+					  MLX4_DIAG_RPRT_CLEAR_DIAGS,
+					  NULL, NULL);
+
+	return status ? status : length;
+}
+
+/*
+ * One read-only attribute per diagnostic counter: name, counter offset and
+ * op_modifier (2 for all of them).
+ */
+DEVICE_DIAG_RPRT_ATTR(rq_num_lle	, 0x00, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_lle	, 0x04, 2);
+DEVICE_DIAG_RPRT_ATTR(rq_num_lqpoe	, 0x08, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_lqpoe 	, 0x0C, 2);
+DEVICE_DIAG_RPRT_ATTR(rq_num_leeoe	, 0x10, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_leeoe	, 0x14, 2);
+DEVICE_DIAG_RPRT_ATTR(rq_num_lpe	, 0x18, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_lpe	, 0x1C, 2);
+DEVICE_DIAG_RPRT_ATTR(rq_num_wrfe	, 0x20, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_wrfe	, 0x24, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_mwbe	, 0x2C, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_bre	, 0x34, 2);
+DEVICE_DIAG_RPRT_ATTR(rq_num_lae	, 0x38, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_rire	, 0x44, 2);
+DEVICE_DIAG_RPRT_ATTR(rq_num_rire	, 0x48, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_rae	, 0x4C, 2);
+DEVICE_DIAG_RPRT_ATTR(rq_num_rae	, 0x50, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_roe	, 0x54, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_tree	, 0x5C, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_rree	, 0x64, 2);
+DEVICE_DIAG_RPRT_ATTR(rq_num_rnr	, 0x68, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_rnr	, 0x6C, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_rabrte	, 0x7C, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_ieecne	, 0x84, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_ieecse	, 0x8C, 2);
+DEVICE_DIAG_RPRT_ATTR(rq_num_oos	, 0x100, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_oos	, 0x104, 2);
+DEVICE_DIAG_RPRT_ATTR(rq_num_mce	, 0x108, 2);
+DEVICE_DIAG_RPRT_ATTR(rq_num_rsync	, 0x110, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_rsync	, 0x114, 2);
+DEVICE_DIAG_RPRT_ATTR(rq_num_udsdprd	, 0x118, 2);
+DEVICE_DIAG_RPRT_ATTR(rq_num_ucsdprd	, 0x120, 2);
+DEVICE_DIAG_RPRT_ATTR(num_cqovf		, 0x1A0, 2);
+DEVICE_DIAG_RPRT_ATTR(num_eqovf		, 0x1A4, 2);
+DEVICE_DIAG_RPRT_ATTR(num_baddb		, 0x1A8, 2);
+
+/*
+ * Write-only attribute that clears the counters.
+ * NOTE(review): S_IWUGO makes this writable by every user - confirm that
+ * unprivileged clearing of the counters is intended.
+ */
+static DEVICE_ATTR(clear_diag, S_IWUGO, NULL, clear_diag_counters);
+
+/*
+ * NULL-terminated list of every diagnostic counter attribute plus the
+ * clear_diag attribute; referenced by diag_counters_group.
+ */
+static struct attribute *diag_rprt_attrs[] = {
+	&dev_attr_rq_num_lle.attr,
+	&dev_attr_sq_num_lle.attr,
+	&dev_attr_rq_num_lqpoe.attr,
+	&dev_attr_sq_num_lqpoe.attr,
+	&dev_attr_rq_num_leeoe.attr,
+	&dev_attr_sq_num_leeoe.attr,
+	&dev_attr_rq_num_lpe.attr,
+	&dev_attr_sq_num_lpe.attr,
+	&dev_attr_rq_num_wrfe.attr,
+	&dev_attr_sq_num_wrfe.attr,
+	&dev_attr_sq_num_mwbe.attr,
+	&dev_attr_sq_num_bre.attr,
+	&dev_attr_rq_num_lae.attr,
+	&dev_attr_sq_num_rire.attr,
+	&dev_attr_rq_num_rire.attr,
+	&dev_attr_sq_num_rae.attr,
+	&dev_attr_rq_num_rae.attr,
+	&dev_attr_sq_num_roe.attr,
+	&dev_attr_sq_num_tree.attr,
+	&dev_attr_sq_num_rree.attr,
+	&dev_attr_rq_num_rnr.attr,
+	&dev_attr_sq_num_rnr.attr,
+	&dev_attr_sq_num_rabrte.attr,
+	&dev_attr_sq_num_ieecne.attr,
+	&dev_attr_sq_num_ieecse.attr,
+	&dev_attr_rq_num_oos.attr,
+	&dev_attr_sq_num_oos.attr,
+	&dev_attr_rq_num_mce.attr,
+	&dev_attr_rq_num_rsync.attr,
+	&dev_attr_sq_num_rsync.attr,
+	&dev_attr_rq_num_udsdprd.attr,
+	&dev_attr_rq_num_ucsdprd.attr,
+	&dev_attr_num_cqovf.attr,
+	&dev_attr_num_eqovf.attr,
+	&dev_attr_num_baddb.attr,
+	&dev_attr_clear_diag.attr,
+	NULL
+};
+
+/*
+ * sysfs group "diag_counters"; created in mlx4_ib_add() and removed in
+ * mlx4_ib_remove(). Note that it has external linkage (not static).
+ */
+struct attribute_group diag_counters_group = {
+	.name  = "diag_counters",
+	.attrs  = diag_rprt_attrs
+};
+
+/*
+ * Build an 8-byte interface identifier in eui from the 6-byte link-level
+ * address of dev.
+ *
+ * The first three address bytes go to eui[0..2] and the last three to
+ * eui[5..7]. The middle two bytes hold vlan_id when it is below 0x1000,
+ * otherwise 0xff 0xfe. Finally bit 1 of eui[0] is flipped.
+ */
+static void mlx4_addrconf_ifid_eui48(u8 *eui, u16 vlan_id, struct net_device *dev)
+{
+#ifdef __linux__
+	const u8 *lladdr = dev->dev_addr;
+#else
+	const u8 *lladdr = (const u8 *)IF_LLADDR(dev);
+#endif
+
+	memcpy(eui, lladdr, 3);
+	memcpy(eui + 5, lladdr + 3, 3);
+
+	if (vlan_id >= 0x1000) {
+		eui[3] = 0xff;
+		eui[4] = 0xfe;
+	} else {
+		eui[3] = vlan_id >> 8;
+		eui[4] = vlan_id & 0xff;
+	}
+
+	eui[0] ^= 2;
+}
+
+/*
+ * Work handler that pushes a new GID table for one port to the device.
+ *
+ * The GIDs carried in the update_gid_work are copied into a command
+ * mailbox and sent with a SET_PORT command. On success the driver's
+ * gid_table for that port is updated and an IB_EVENT_GID_CHANGE event is
+ * dispatched. The work item is freed on every path.
+ */
+static void update_gids_task(struct work_struct *work)
+{
+	struct update_gid_work *gw = container_of(work, struct update_gid_work, work);
+	struct mlx4_cmd_mailbox *mailbox;
+	union ib_gid *gids;
+	int err;
+	struct mlx4_dev	*dev = gw->dev->dev;
+	struct ib_event event;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox)) {
+		printk(KERN_WARNING "update gid table failed %ld\n", PTR_ERR(mailbox));
+		/* The work item is owned by this handler; do not leak it. */
+		goto free;
+	}
+
+	gids = mailbox->buf;
+	memcpy(gids, gw->gids, sizeof gw->gids);
+
+	err = mlx4_cmd(dev, mailbox->dma, MLX4_SET_PORT_GID_TABLE << 8 | gw->port,
+		       1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B);
+	if (err)
+		printk(KERN_WARNING "set port command failed\n");
+	else {
+		memcpy(gw->dev->iboe.gid_table[gw->port - 1], gw->gids, sizeof gw->gids);
+		event.device = &gw->dev->ib_dev;
+		event.element.port_num = gw->port;
+		event.event    = IB_EVENT_GID_CHANGE;
+		ib_dispatch_event(&event);
+	}
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+free:
+	kfree(gw);
+}
+
+/*
+ * update_ipv6_gids() scans MLX4_MAX_EFF_VLANS + 1 GID table slots per port.
+ */
+enum {
+	MLX4_MAX_EFF_VLANS = 128 - MLX4_VLAN_REGULAR,
+};
+
+/*
+ * Recompute the GID table of one port from the net devices in the system.
+ *
+ * Every interface that is the port's net device, or whose
+ * rdma_vlan_dev_real_dev() is that device, contributes one link-local
+ * (fe80::/64) GID built by mlx4_addrconf_ifid_eui48(). Table slots that no
+ * interface accounts for are zeroed. If anything changed, a copy of the
+ * table is queued on wq for update_gids_task() to write to the device.
+ *
+ * Allocations use GFP_ATOMIC. The clear argument is not referenced in the
+ * body. Returns 0, or -ENOMEM when an allocation fails.
+ */
+static int update_ipv6_gids(struct mlx4_ib_dev *dev, int port, int clear)
+{
+	struct net_device *ndev = dev->iboe.netdevs[port - 1];
+	struct update_gid_work *work;
+	struct net_device *tmp;
+	int i;
+	u8 *hits;
+	int ret;
+	union ib_gid gid;
+	int tofree;
+	int found;
+	int need_update = 0;
+	u16 vid;
+
+	work = kzalloc(sizeof *work, GFP_ATOMIC);
+	if (!work)
+		return -ENOMEM;
+
+	/* hits[i] is set for every table slot that is still in use. */
+	hits = kzalloc(MLX4_MAX_EFF_VLANS + 1, GFP_ATOMIC);
+	if (!hits) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* The loop's closing brace is inside the matching #ifdef below. */
+#ifdef __linux__
+	read_lock(&dev_base_lock);
+	for_each_netdev(&init_net, tmp) {
+#else
+	IFNET_RLOCK();
+	TAILQ_FOREACH(tmp, &V_ifnet, if_link) {
+#endif
+		if (ndev && (tmp == ndev || rdma_vlan_dev_real_dev(tmp) == ndev)) {
+			/* The address always comes from ndev; tmp supplies the VLAN id. */
+			gid.global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
+			vid = rdma_vlan_dev_vlan_id(tmp);
+			mlx4_addrconf_ifid_eui48(&gid.raw[8], vid, ndev);
+			found = 0;
+			tofree = -1;
+			/* Look for the GID; remember the first all-zero slot seen. */
+			for (i = 0; i < MLX4_MAX_EFF_VLANS + 1; ++i) {
+				if (tofree < 0 &&
+				    !memcmp(&dev->iboe.gid_table[port - 1][i], &zgid, sizeof zgid))
+					tofree = i;
+				if (!memcmp(&dev->iboe.gid_table[port - 1][i], &gid, sizeof gid)) {
+					hits[i] = 1;
+					found = 1;
+					break;
+				}
+			}
+
+			if (!found) {
+				/* The port's own device goes to slot 0, others to a free slot. */
+				if (tmp == ndev  && (memcmp(&dev->iboe.gid_table[port - 1][0], &gid, sizeof gid) || !memcmp(&dev->iboe.gid_table[port - 1][0], &zgid, sizeof gid))) {
+					dev->iboe.gid_table[port - 1][0] = gid;
+					++need_update;
+					hits[0] = 1;
+				} else if (tofree >= 0) {
+					dev->iboe.gid_table[port - 1][tofree] = gid;
+					hits[tofree] = 1;
+					++need_update;
+				}
+			}
+		}
+#ifdef __linux__
+	}
+	read_unlock(&dev_base_lock);
+#else
+	}
+	IFNET_RUNLOCK();
+#endif
+
+	/* Zero every slot that was not hit; count it if it held a GID. */
+	for (i = 0; i < MLX4_MAX_EFF_VLANS + 1; ++i)
+		if (!hits[i]) {
+			if (memcmp(&dev->iboe.gid_table[port - 1][i], &zgid, sizeof zgid))
+				++need_update;
+			dev->iboe.gid_table[port - 1][i] = zgid;
+		}
+
+
+	if (need_update) {
+		/* Ownership of work passes to update_gids_task(), which frees it. */
+		memcpy(work->gids, dev->iboe.gid_table[port - 1], sizeof work->gids);
+		INIT_WORK(&work->work, update_gids_task);
+		work->port = port;
+		work->dev = dev;
+		queue_work(wq, &work->work);
+	} else
+		kfree(work);
+
+	kfree(hits);
+	return 0;
+
+out:
+	/* Only reached when the hits allocation failed. */
+	kfree(work);
+	return ret;
+}
+
+/*
+ * React to a net device event on the given port.
+ *
+ * NETDEV_UP (and, on Linux, NETDEV_CHANGEADDR) refreshes the port's GIDs.
+ * NETDEV_DOWN refreshes them with clear set and then forgets the port's
+ * net device. All other events are ignored.
+ */
+static void handle_en_event(struct mlx4_ib_dev *dev, int port, unsigned long event)
+{
+	int refresh = (event == NETDEV_UP);
+
+#ifdef __linux__
+	if (event == NETDEV_CHANGEADDR)
+		refresh = 1;
+#endif
+
+	if (refresh) {
+		update_ipv6_gids(dev, port, 0);
+		return;
+	}
+
+	if (event == NETDEV_DOWN) {
+		update_ipv6_gids(dev, port, 1);
+		dev->iboe.netdevs[port - 1] = NULL;
+	}
+}
+
+/*
+ * Called when a port gained a net device: refresh its GIDs with clear = 0.
+ * The return value of update_ipv6_gids() is ignored.
+ */
+static void netdev_added(struct mlx4_ib_dev *dev, int port)
+{
+	update_ipv6_gids(dev, port, 0);
+}
+
+/*
+ * Called when a port lost its net device: refresh its GIDs with clear = 1.
+ * The return value of update_ipv6_gids() is ignored.
+ */
+static void netdev_removed(struct mlx4_ib_dev *dev, int port)
+{
+	update_ipv6_gids(dev, port, 1);
+}
+
+/*
+ * Net device notifier callback.
+ *
+ * Under iboe->lock, first re-reads the Ethernet protocol device of every
+ * IB-transport port and reports additions/removals, then forwards the
+ * event to the first of ports 1 and 2 whose net device is the event's
+ * device or the real device behind it. Always returns NOTIFY_DONE.
+ */
+static int mlx4_ib_netdev_event(struct notifier_block *this, unsigned long event,
+				void *ptr)
+{
+	struct net_device *dev = ptr;
+	struct mlx4_ib_dev *ibdev;
+	struct mlx4_ib_iboe *iboe;
+	struct net_device *prev;
+	int port;
+	int idx;
+
+#ifdef __linux__
+	if (!net_eq(dev_net(dev), &init_net))
+		return NOTIFY_DONE;
+#endif
+
+	ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb);
+	iboe = &ibdev->iboe;
+
+	spin_lock(&iboe->lock);
+
+	/* Resynchronise the per-port net device pointers. */
+	mlx4_foreach_ib_transport_port(port, ibdev->dev) {
+		prev = iboe->netdevs[port - 1];
+		iboe->netdevs[port - 1] = mlx4_get_prot_dev(ibdev->dev, MLX4_PROT_EN, port);
+		if (prev != iboe->netdevs[port - 1]) {
+			if (iboe->netdevs[port - 1])
+				netdev_added(ibdev, port);
+			else
+				netdev_removed(ibdev, port);
+		}
+	}
+
+	/* Only the first matching port (of two) handles the event. */
+	for (idx = 0; idx < 2; ++idx) {
+		if (dev == iboe->netdevs[idx] ||
+		    (iboe->netdevs[idx] &&
+		     rdma_vlan_dev_real_dev(dev) == iboe->netdevs[idx])) {
+			handle_en_event(ibdev, idx + 1, event);
+			break;
+		}
+	}
+
+	spin_unlock(&iboe->lock);
+
+	return NOTIFY_DONE;
+}
+
+/*
+ * mlx4 interface "add" callback: create and register the IB device that
+ * sits on top of the given mlx4 device.
+ *
+ * Allocates the ib_device, a private PD and UAR, fills in the verbs table,
+ * reads the node data, allocates per-port counters, registers the device,
+ * initialises MAD handling, optionally registers the net device notifier
+ * and creates the sysfs attributes.
+ *
+ * Returns the new mlx4_ib_dev, or NULL when the device has no IB-transport
+ * ports or any step fails; on failure everything set up so far is undone.
+ */
+static void *mlx4_ib_add(struct mlx4_dev *dev)
+{
+	static int mlx4_ib_version_printed;
+	struct mlx4_ib_dev *ibdev;
+	int num_ports = 0;
+	int i;
+	int err;
+	struct mlx4_ib_iboe *iboe;
+	int k;
+
+	if (!mlx4_ib_version_printed) {
+		printk(KERN_INFO "%s", mlx4_ib_version);
+		++mlx4_ib_version_printed;
+	}
+
+	mlx4_foreach_ib_transport_port(i, dev)
+		num_ports++;
+
+	/* No point in registering a device with no ports... */
+	if (num_ports == 0)
+		return NULL;
+
+	ibdev = (struct mlx4_ib_dev *) ib_alloc_device(sizeof *ibdev);
+	if (!ibdev) {
+		dev_err(&dev->pdev->dev, "Device struct alloc failed\n");
+		return NULL;
+	}
+
+	iboe = &ibdev->iboe;
+
+	if (mlx4_pd_alloc(dev, &ibdev->priv_pdn))
+		goto err_dealloc;
+
+	if (mlx4_uar_alloc(dev, &ibdev->priv_uar))
+		goto err_pd;
+
+	ibdev->priv_uar.map = ioremap(ibdev->priv_uar.pfn << PAGE_SHIFT, PAGE_SIZE);
+	if (!ibdev->priv_uar.map)
+		goto err_uar;
+	MLX4_INIT_DOORBELL_LOCK(&ibdev->uar_lock);
+
+	ibdev->dev = dev;
+
+	strlcpy(ibdev->ib_dev.name, "mlx4_%d", IB_DEVICE_NAME_MAX);
+	ibdev->ib_dev.owner		= THIS_MODULE;
+	ibdev->ib_dev.node_type		= RDMA_NODE_IB_CA;
+	ibdev->ib_dev.local_dma_lkey	= dev->caps.reserved_lkey;
+	ibdev->num_ports		= num_ports;
+	ibdev->ib_dev.phys_port_cnt     = ibdev->num_ports;
+	ibdev->ib_dev.num_comp_vectors	= dev->caps.num_comp_vectors;
+	ibdev->ib_dev.dma_device	= &dev->pdev->dev;
+
+	ibdev->ib_dev.uverbs_abi_ver	= MLX4_IB_UVERBS_ABI_VERSION;
+	ibdev->ib_dev.uverbs_cmd_mask	=
+		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT)		|
+		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)	|
+		(1ull << IB_USER_VERBS_CMD_QUERY_PORT)		|
+		(1ull << IB_USER_VERBS_CMD_ALLOC_PD)		|
+		(1ull << IB_USER_VERBS_CMD_DEALLOC_PD)		|
+		(1ull << IB_USER_VERBS_CMD_REG_MR)		|
+		(1ull << IB_USER_VERBS_CMD_DEREG_MR)		|
+		(1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL)	|
+		(1ull << IB_USER_VERBS_CMD_CREATE_CQ)		|
+		(1ull << IB_USER_VERBS_CMD_RESIZE_CQ)		|
+		(1ull << IB_USER_VERBS_CMD_DESTROY_CQ)		|
+		(1ull << IB_USER_VERBS_CMD_CREATE_QP)		|
+		(1ull << IB_USER_VERBS_CMD_MODIFY_QP)		|
+		(1ull << IB_USER_VERBS_CMD_QUERY_QP)		|
+		(1ull << IB_USER_VERBS_CMD_DESTROY_QP)		|
+		(1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)	|
+		(1ull << IB_USER_VERBS_CMD_DETACH_MCAST)	|
+		(1ull << IB_USER_VERBS_CMD_CREATE_SRQ)		|
+		(1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)		|
+		(1ull << IB_USER_VERBS_CMD_QUERY_SRQ)		|
+		(1ull << IB_USER_VERBS_CMD_DESTROY_SRQ);
+
+	ibdev->ib_dev.query_device	= mlx4_ib_query_device;
+	ibdev->ib_dev.query_port	= mlx4_ib_query_port;
+	ibdev->ib_dev.get_link_layer	= mlx4_ib_port_link_layer;
+	ibdev->ib_dev.query_gid		= mlx4_ib_query_gid;
+	ibdev->ib_dev.query_pkey	= mlx4_ib_query_pkey;
+	ibdev->ib_dev.modify_device	= mlx4_ib_modify_device;
+	ibdev->ib_dev.modify_port	= mlx4_ib_modify_port;
+	ibdev->ib_dev.alloc_ucontext	= mlx4_ib_alloc_ucontext;
+	ibdev->ib_dev.dealloc_ucontext	= mlx4_ib_dealloc_ucontext;
+	ibdev->ib_dev.mmap		= mlx4_ib_mmap;
+	ibdev->ib_dev.alloc_pd		= mlx4_ib_alloc_pd;
+	ibdev->ib_dev.dealloc_pd	= mlx4_ib_dealloc_pd;
+	ibdev->ib_dev.create_ah		= mlx4_ib_create_ah;
+	ibdev->ib_dev.query_ah		= mlx4_ib_query_ah;
+	ibdev->ib_dev.destroy_ah	= mlx4_ib_destroy_ah;
+	ibdev->ib_dev.create_srq	= mlx4_ib_create_srq;
+	ibdev->ib_dev.modify_srq	= mlx4_ib_modify_srq;
+	ibdev->ib_dev.query_srq		= mlx4_ib_query_srq;
+	ibdev->ib_dev.destroy_srq	= mlx4_ib_destroy_srq;
+	ibdev->ib_dev.post_srq_recv	= mlx4_ib_post_srq_recv;
+	ibdev->ib_dev.create_qp		= mlx4_ib_create_qp;
+	ibdev->ib_dev.modify_qp		= mlx4_ib_modify_qp;
+	ibdev->ib_dev.query_qp		= mlx4_ib_query_qp;
+	ibdev->ib_dev.destroy_qp	= mlx4_ib_destroy_qp;
+	ibdev->ib_dev.post_send		= mlx4_ib_post_send;
+	ibdev->ib_dev.post_recv		= mlx4_ib_post_recv;
+	ibdev->ib_dev.create_cq		= mlx4_ib_create_cq;
+	ibdev->ib_dev.modify_cq		= mlx4_ib_modify_cq;
+	ibdev->ib_dev.resize_cq		= mlx4_ib_resize_cq;
+	ibdev->ib_dev.destroy_cq	= mlx4_ib_destroy_cq;
+	ibdev->ib_dev.poll_cq		= mlx4_ib_poll_cq;
+	ibdev->ib_dev.req_notify_cq	= mlx4_ib_arm_cq;
+	ibdev->ib_dev.get_dma_mr	= mlx4_ib_get_dma_mr;
+	ibdev->ib_dev.reg_user_mr	= mlx4_ib_reg_user_mr;
+	ibdev->ib_dev.dereg_mr		= mlx4_ib_dereg_mr;
+	ibdev->ib_dev.alloc_fast_reg_mr = mlx4_ib_alloc_fast_reg_mr;
+	ibdev->ib_dev.alloc_fast_reg_page_list = mlx4_ib_alloc_fast_reg_page_list;
+	ibdev->ib_dev.free_fast_reg_page_list  = mlx4_ib_free_fast_reg_page_list;
+	ibdev->ib_dev.attach_mcast	= mlx4_ib_mcg_attach;
+	ibdev->ib_dev.detach_mcast	= mlx4_ib_mcg_detach;
+	ibdev->ib_dev.process_mad	= mlx4_ib_process_mad;
+
+	ibdev->ib_dev.alloc_fmr		= mlx4_ib_fmr_alloc;
+	ibdev->ib_dev.map_phys_fmr	= mlx4_ib_map_phys_fmr;
+	ibdev->ib_dev.unmap_fmr		= mlx4_ib_unmap_fmr;
+	ibdev->ib_dev.dealloc_fmr	= mlx4_ib_fmr_dealloc;
+	/* XRC verbs are only exposed when the device reports the capability. */
+	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) {
+		ibdev->ib_dev.create_xrc_srq = mlx4_ib_create_xrc_srq;
+		ibdev->ib_dev.alloc_xrcd = mlx4_ib_alloc_xrcd;
+		ibdev->ib_dev.dealloc_xrcd = mlx4_ib_dealloc_xrcd;
+		ibdev->ib_dev.create_xrc_rcv_qp = mlx4_ib_create_xrc_rcv_qp;
+		ibdev->ib_dev.modify_xrc_rcv_qp = mlx4_ib_modify_xrc_rcv_qp;
+		ibdev->ib_dev.query_xrc_rcv_qp = mlx4_ib_query_xrc_rcv_qp;
+		ibdev->ib_dev.reg_xrc_rcv_qp = mlx4_ib_reg_xrc_rcv_qp;
+		ibdev->ib_dev.unreg_xrc_rcv_qp = mlx4_ib_unreg_xrc_rcv_qp;
+		ibdev->ib_dev.uverbs_cmd_mask |=
+			(1ull << IB_USER_VERBS_CMD_CREATE_XRC_SRQ)	|
+			(1ull << IB_USER_VERBS_CMD_OPEN_XRC_DOMAIN)	|
+			(1ull << IB_USER_VERBS_CMD_CLOSE_XRC_DOMAIN)	|
+			(1ull << IB_USER_VERBS_CMD_CREATE_XRC_RCV_QP)	|
+			(1ull << IB_USER_VERBS_CMD_MODIFY_XRC_RCV_QP)	|
+			(1ull << IB_USER_VERBS_CMD_QUERY_XRC_RCV_QP)	|
+			(1ull << IB_USER_VERBS_CMD_REG_XRC_RCV_QP)	|
+			(1ull << IB_USER_VERBS_CMD_UNREG_XRC_RCV_QP);
+	}
+
+
+	spin_lock_init(&iboe->lock);
+	if (init_node_data(ibdev))
+		goto err_map;
+
+	/* A port whose counter allocation fails is marked with -1. */
+	for (k = 0; k < ibdev->num_ports; ++k) {
+		err = mlx4_counter_alloc(ibdev->dev, &ibdev->counters[k]);
+		if (err)
+			ibdev->counters[k] = -1;
+		else
+			mlx4_set_iboe_counter(dev, ibdev->counters[k], k + 1);
+	}
+
+	spin_lock_init(&ibdev->sm_lock);
+	mutex_init(&ibdev->cap_mask_mutex);
+	mutex_init(&ibdev->xrc_reg_mutex);
+
+	if (ib_register_device(&ibdev->ib_dev))
+		goto err_counter;
+
+	if (mlx4_ib_mad_init(ibdev))
+		goto err_reg;
+	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE && !iboe->nb.notifier_call) {
+		iboe->nb.notifier_call = mlx4_ib_netdev_event;
+		err = register_netdevice_notifier(&iboe->nb);
+		if (err) {
+			/* Not registered: make sure nobody tries to unregister it. */
+			iboe->nb.notifier_call = NULL;
+			goto err_mad;
+		}
+	}
+	for (i = 0; i < ARRAY_SIZE(mlx4_class_attributes); ++i) {
+		if (device_create_file(&ibdev->ib_dev.dev,
+				       mlx4_class_attributes[i]))
+			goto err_notif;
+	}
+
+	if(sysfs_create_group(&ibdev->ib_dev.dev.kobj, &diag_counters_group))
+		goto err_notif;
+
+	ibdev->ib_active = 1;
+
+	return ibdev;
+
+err_notif:
+	/* The notifier only exists when the IBOE branch above registered it. */
+	if (ibdev->iboe.nb.notifier_call) {
+		if (unregister_netdevice_notifier(&ibdev->iboe.nb))
+			printk(KERN_WARNING "failure unregistering notifier\n");
+		ibdev->iboe.nb.notifier_call = NULL;
+	}
+	flush_workqueue(wq);
+
+err_mad:
+	/* Undo mlx4_ib_mad_init(), as mlx4_ib_remove() does. */
+	mlx4_ib_mad_cleanup(ibdev);
+
+err_reg:
+	ib_unregister_device(&ibdev->ib_dev);
+
+err_counter:
+	for (; k; --k)
+		mlx4_counter_free(ibdev->dev, ibdev->counters[k - 1]);
+
+err_map:
+	iounmap(ibdev->priv_uar.map);
+
+err_uar:
+	mlx4_uar_free(dev, &ibdev->priv_uar);
+
+err_pd:
+	mlx4_pd_free(dev, ibdev->priv_pdn);
+
+err_dealloc:
+	ib_dealloc_device(&ibdev->ib_dev);
+
+	return NULL;
+}
+
+/*
+ * mlx4 interface "remove" callback: undo everything mlx4_ib_add() set up
+ * for the device identified by ibdev_ptr and free it.
+ */
+static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr)
+{
+	struct mlx4_ib_dev *ibdev = ibdev_ptr;
+	struct mlx4_ib_iboe *iboe = &ibdev->iboe;
+	int port_idx;
+	int p;
+
+	sysfs_remove_group(&ibdev->ib_dev.dev.kobj, &diag_counters_group);
+
+	mlx4_ib_mad_cleanup(ibdev);
+	ib_unregister_device(&ibdev->ib_dev);
+
+	for (port_idx = 0; port_idx < ibdev->num_ports; ++port_idx)
+		mlx4_counter_free(ibdev->dev, ibdev->counters[port_idx]);
+
+	/* The notifier is only present when mlx4_ib_add() registered it. */
+	if (iboe->nb.notifier_call) {
+		unregister_netdevice_notifier(&iboe->nb);
+		flush_workqueue(wq);
+		iboe->nb.notifier_call = NULL;
+	}
+
+	iounmap(ibdev->priv_uar.map);
+
+	mlx4_foreach_port(p, dev, MLX4_PORT_TYPE_IB)
+		mlx4_CLOSE_PORT(dev, p);
+
+	mlx4_uar_free(dev, &ibdev->priv_uar);
+	mlx4_pd_free(dev, ibdev->priv_pdn);
+	ib_dealloc_device(&ibdev->ib_dev);
+}
+
+/*
+ * mlx4 interface "event" callback: translate a low-level device event into
+ * an IB event and dispatch it.
+ *
+ * Port up/down map to IB_EVENT_PORT_ACTIVE/IB_EVENT_PORT_ERR; a
+ * catastrophic error also marks the device inactive and maps to
+ * IB_EVENT_DEVICE_FATAL. Other events, and ports beyond num_ports, are
+ * ignored.
+ */
+static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr,
+			  enum mlx4_dev_event event, int port)
+{
+	struct mlx4_ib_dev *ibdev = to_mdev((struct ib_device *) ibdev_ptr);
+	struct ib_event ibev;
+
+	if (port > ibdev->num_ports)
+		return;
+
+	if (event == MLX4_DEV_EVENT_PORT_UP) {
+		ibev.event = IB_EVENT_PORT_ACTIVE;
+	} else if (event == MLX4_DEV_EVENT_PORT_DOWN) {
+		ibev.event = IB_EVENT_PORT_ERR;
+	} else if (event == MLX4_DEV_EVENT_CATASTROPHIC_ERROR) {
+		/* New user contexts are refused once ib_active is cleared. */
+		ibdev->ib_active = 0;
+		ibev.event = IB_EVENT_DEVICE_FATAL;
+	} else {
+		return;
+	}
+
+	ibev.device	      = ibdev_ptr;
+	ibev.element.port_num = port;
+
+	ib_dispatch_event(&ibev);
+}
+
+/* Callbacks through which the mlx4 core drives this IB driver. */
+static struct mlx4_interface mlx4_ib_interface = {
+	.add		= mlx4_ib_add,
+	.remove		= mlx4_ib_remove,
+	.event		= mlx4_ib_event,
+	.get_prot_dev	= get_ibdev,
+	.protocol	= MLX4_PROT_IB,
+};
+
+/*
+ * Module init: create the driver's single-threaded workqueue and register
+ * with the mlx4 core. Returns 0, -ENOMEM when the workqueue cannot be
+ * created, or the registration error (the workqueue is destroyed again).
+ */
+static int __init mlx4_ib_init(void)
+{
+	int err;
+
+	wq = create_singlethread_workqueue("mlx4_ib");
+	if (wq == NULL)
+		return -ENOMEM;
+
+	err = mlx4_register_interface(&mlx4_ib_interface);
+	if (err)
+		destroy_workqueue(wq);
+
+	return err;
+}
+
+/*
+ * Module exit: unregister from the mlx4 core first, then destroy the
+ * workqueue created by mlx4_ib_init().
+ */
+static void __exit mlx4_ib_cleanup(void)
+{
+	mlx4_unregister_interface(&mlx4_ib_interface);
+	destroy_workqueue(wq);
+}
+
+/* Hook the init and exit routines into module load/unload. */
+module_init_order(mlx4_ib_init, SI_ORDER_MIDDLE);
+module_exit(mlx4_ib_cleanup);
+
+/*
+ * FreeBSD module declaration.
+ * NOTE(review): MODULE_VERSION is undefined before <sys/module.h> is
+ * included, presumably to avoid a clash with an earlier definition - confirm.
+ */
+#undef MODULE_VERSION
+#include <sys/module.h>
+/* Module event handler: accepts every event and does nothing. */
+static int
+mlx4ib_evhand(module_t mod, int event, void *arg)
+{
+        return (0);
+}
+static moduledata_t mlx4ib_mod = {
+        .name = "mlx4ib",
+        .evhand = mlx4ib_evhand,
+};
+DECLARE_MODULE(mlx4ib, mlx4ib_mod, SI_SUB_SMP, SI_ORDER_ANY);
+/* This module requires the mlx4 module, version 1. */
+MODULE_DEPEND(mlx4ib, mlx4, 1, 1, 1);
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/mlx4_ib.h b/sys/ofed/drivers/infiniband/hw/mlx4/mlx4_ib.h
new file mode 100644
index 0000000..b8f6996
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -0,0 +1,409 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_IB_H
+#define MLX4_IB_H
+
+#include <linux/compiler.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
+
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/doorbell.h>
+
+
+/* Driver name; also the tag printed in front of mlx4_ib_dbg() messages. */
+#define MLX4_IB_DRV_NAME	"mlx4_ib"
+
+/*
+ * mlx4_ib_dbg(format, ...): printf-style debug trace.
+ *
+ * With CONFIG_MLX4_DEBUG it prints "<mlx4_ib> <function>: <message>\n" at
+ * KERN_DEBUG whenever mlx4_ib_debug_level is non-zero.  Without it the
+ * macro expands to an empty statement and its arguments are not evaluated.
+ */
+#ifdef CONFIG_MLX4_DEBUG
+extern int mlx4_ib_debug_level;
+
+#define mlx4_ib_dbg(format, arg...) 		\
+	do {					\
+		if (mlx4_ib_debug_level) 	\
+			printk(KERN_DEBUG "<" MLX4_IB_DRV_NAME "> %s: " format "\n",\
+			__func__, ## arg);	\
+	} while (0)
+
+#else /* CONFIG_MLX4_DEBUG */
+
+#define mlx4_ib_dbg(format, arg...) do {} while (0)
+
+#endif /* CONFIG_MLX4_DEBUG */
+
+/* Smallest send-queue WQE shift, i.e. a 64-byte (1 << 6) WQE. */
+enum {
+	MLX4_IB_SQ_MIN_WQE_SHIFT = 6
+};
+
+/*
+ * MLX4_IB_SQ_HEADROOM(shift): number of WQEs of size (1 << shift) that span
+ * 2048 bytes, plus one.  MLX4_IB_SQ_MAX_SPARE is that value for the smallest
+ * WQE size, i.e. (2048 >> 6) + 1 = 33.
+ */
+#define MLX4_IB_SQ_HEADROOM(shift) ((2048 >> (shift)) + 1)
+#define MLX4_IB_SQ_MAX_SPARE (MLX4_IB_SQ_HEADROOM(MLX4_IB_SQ_MIN_WQE_SHIFT))
+
+/*
+ * Driver-private user context.  Embeds struct ib_ucontext; to_mucontext()
+ * recovers this structure from a pointer to the embedded member.
+ */
+struct mlx4_ib_ucontext {
+	struct ib_ucontext	ibucontext;
+	struct mlx4_uar		uar;
+	/* NOTE(review): presumably the doorbell pages mapped for this context,
+	 * guarded by db_page_mutex - confirm against doorbell.c. */
+	struct list_head	db_page_list;
+	struct mutex		db_page_mutex;
+};
+
+/*
+ * Driver-private protection domain.  Embeds struct ib_pd (see to_mpd());
+ * pdn is the PD number passed to mlx4_mr_alloc() and mlx4_fmr_alloc().
+ */
+struct mlx4_ib_pd {
+	struct ib_pd		ibpd;
+	u32			pdn;
+};
+
+/*
+ * Driver-private XRC domain.  Embeds struct ib_xrcd (see to_mxrcd()) and
+ * carries the domain number together with a PD and CQ pointer.
+ */
+struct mlx4_ib_xrcd {
+	struct ib_xrcd	ibxrcd;
+	u32		xrcdn;
+	struct ib_pd	*pd;
+	struct ib_cq	*cq;
+};
+
+/* A CQ buffer paired with the MTT describing it. */
+struct mlx4_ib_cq_buf {
+	struct mlx4_buf		buf;
+	struct mlx4_mtt		mtt;
+};
+
+/*
+ * Replacement CQ buffer plus a CQE count, referenced from
+ * mlx4_ib_cq.resize_buf.
+ */
+struct mlx4_ib_cq_resize {
+	struct mlx4_ib_cq_buf	buf;
+	int			cqe;
+};
+
+/*
+ * Driver-private completion queue.  Embeds both the verbs-level ib_cq and
+ * the core-level mlx4_cq; to_mcq() converts from the former and to_mibcq()
+ * from the latter.
+ */
+struct mlx4_ib_cq {
+	struct ib_cq		ibcq;
+	struct mlx4_cq		mcq;
+	struct mlx4_ib_cq_buf	buf;
+	struct mlx4_ib_cq_resize *resize_buf;
+	struct mlx4_db		db;
+	spinlock_t		lock;
+	struct mutex		resize_mutex;
+	struct ib_umem	       *umem;
+	struct ib_umem	       *resize_umem;
+};
+
+/*
+ * Driver-private memory region.  Embeds struct ib_mr (see to_mmr()) next to
+ * the core mlx4_mr.  umem is the pinned user memory for user MRs and NULL
+ * for DMA and fast-register MRs; mlx4_ib_dereg_mr() releases it only when
+ * it is non-NULL.
+ */
+struct mlx4_ib_mr {
+	struct ib_mr		ibmr;
+	struct mlx4_mr		mmr;
+	struct ib_umem	       *umem;
+};
+
+/*
+ * Driver-private fast-register page list.  Embeds struct
+ * ib_fast_reg_page_list (see to_mfrpl()).  mapped_page_list is the
+ * DMA-coherent copy allocated by mlx4_ib_alloc_fast_reg_page_list() and
+ * map is its bus address.
+ */
+struct mlx4_ib_fast_reg_page_list {
+	struct ib_fast_reg_page_list	ibfrpl;
+	__be64			       *mapped_page_list;
+	dma_addr_t			map;
+};
+
+/*
+ * Driver-private fast memory region.  Embeds struct ib_fmr (see to_mfmr())
+ * next to the core mlx4_fmr.
+ */
+struct mlx4_ib_fmr {
+	struct ib_fmr           ibfmr;
+	struct mlx4_fmr         mfmr;
+};
+
+/*
+ * Bookkeeping for one work queue (send or receive) of a QP.
+ *
+ * qp.c locates WQE n at byte offset (offset + (n << wqe_shift)) in the QP
+ * buffer and masks indices with (wqe_cnt - 1), so wqe_cnt is used as a
+ * power-of-two ring size.
+ */
+struct mlx4_ib_wq {
+	u64		       *wrid;
+	spinlock_t		lock;
+	int			wqe_cnt;
+	int			max_post;
+	int			max_gs;
+	int			offset;
+	int			wqe_shift;
+	unsigned		head;
+	unsigned		tail;
+};
+
+/*
+ * Bit flags stored in mlx4_ib_qp.flags.  In qp.c, MLX4_IB_QP_LSO adds 128
+ * bytes to the UD send WQE overhead and MLX4_IB_XRC_RCV selects the XRC
+ * receive path in the QP event handler.
+ */
+enum mlx4_ib_qp_flags {
+	MLX4_IB_QP_LSO				= 1 << 0,
+	MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK	= 1 << 1,
+	MLX4_IB_XRC_RCV				= 1 << 2,
+};
+
+/*
+ * One GID with a port number and an "added" flag, linked on a list.
+ * NOTE(review): presumably an element of mlx4_ib_qp.gid_list - confirm.
+ */
+struct gid_entry {
+	struct list_head	list;
+	union ib_gid		gid;
+	int			added;
+	u8			port;
+};
+
+/*
+ * Driver-private queue pair.  Embeds the verbs-level ib_qp and the
+ * core-level mlx4_qp; to_mqp() converts from the former and to_mibqp() from
+ * the latter.
+ */
+struct mlx4_ib_qp {
+	struct ib_qp		ibqp;
+	struct mlx4_qp		mqp;
+	/* WQE buffer; get_wqe() in qp.c resolves byte offsets within it. */
+	struct mlx4_buf		buf;
+
+	struct mlx4_db		db;
+	struct mlx4_ib_wq	rq;
+
+	u32			doorbell_qpn;
+	__be32			sq_signal_bits;
+	unsigned		sq_next_wqe;
+	/* When > 1, a single WR may occupy several basic-block WQEs. */
+	int			sq_max_wqes_per_wr;
+	int			sq_spare_wqes;
+	struct mlx4_ib_wq	sq;
+
+	struct ib_umem	       *umem;
+	struct mlx4_mtt		mtt;
+	int			buf_size;
+	struct mutex		mutex;
+	/* Bitmask of enum mlx4_ib_qp_flags. */
+	u32			flags;
+	/* Registered XRC receive contexts; walked under xrc_reg_list_lock. */
+	struct list_head	xrc_reg_list;
+	spinlock_t		xrc_reg_list_lock;
+	u16			xrcdn;
+	/* port is replaced by alt_port when a PATH_MIG event arrives. */
+	u8			port;
+	u8			alt_port;
+	u8			atomic_rd_en;
+	u8			resp_depth;
+	u8			sq_no_prefetch;
+	u8			state;
+	int			mlx_type;
+	struct list_head	gid_list;
+	int			max_inline_data;
+	struct mlx4_bf		bf;
+};
+
+/*
+ * Driver-private shared receive queue.  Embeds the verbs-level ib_srq and
+ * the core-level mlx4_srq; to_msrq() converts from the former and
+ * to_mibsrq() from the latter.
+ */
+struct mlx4_ib_srq {
+	struct ib_srq		ibsrq;
+	struct mlx4_srq		msrq;
+	struct mlx4_buf		buf;
+	struct mlx4_db		db;
+	u64		       *wrid;
+	spinlock_t		lock;
+	int			head;
+	int			tail;
+	u16			wqe_ctr;
+	struct ib_umem	       *umem;
+	struct mlx4_mtt		mtt;
+	struct mutex		mutex;
+};
+
+/*
+ * Driver-private address handle.  Embeds struct ib_ah (see to_mah()) next
+ * to the hardware address vector.
+ */
+struct mlx4_ib_ah {
+	struct ib_ah		ibah;
+	union mlx4_ext_av       av;
+};
+
+/*
+ * Per-device state with one net_device pointer and a 128-entry GID table
+ * for each of MLX4_MAX_PORTS ports, plus a notifier block.
+ * NOTE(review): presumably the IB-over-Ethernet bookkeeping guarded by
+ * lock - confirm against main.c.
+ */
+struct mlx4_ib_iboe {
+	spinlock_t		lock;
+	struct net_device      *netdevs[MLX4_MAX_PORTS];
+	struct notifier_block 	nb;
+	union ib_gid		gid_table[MLX4_MAX_PORTS][128];
+};
+
+/*
+ * Driver-private IB device.  Embeds struct ib_device (see to_mdev()) and
+ * points at the underlying mlx4 core device.
+ */
+struct mlx4_ib_dev {
+	struct ib_device	ib_dev;
+	/* Core device handed to every mlx4_* core call. */
+	struct mlx4_dev	       *dev;
+	/* Events for ports above this count are ignored by mlx4_ib_event(). */
+	int			num_ports;
+	struct mlx4_uar		priv_uar;
+	/* Driver-owned PD number; released with mlx4_pd_free() on removal. */
+	u32			priv_pdn;
+	MLX4_DECLARE_DOORBELL_LOCK(uar_lock);
+
+	struct ib_mad_agent    *send_agent[MLX4_MAX_PORTS][2];
+	struct ib_ah	       *sm_ah[MLX4_MAX_PORTS];
+	spinlock_t		sm_lock;
+
+	struct mutex		cap_mask_mutex;
+	struct mutex		xrc_reg_mutex;
+	/* Cleared by mlx4_ib_event() when a catastrophic error is reported. */
+	int			ib_active;
+	struct mlx4_ib_iboe	iboe;
+	int			counters[MLX4_MAX_PORTS];
+};
+
+/* Recover the enclosing mlx4_ib_dev from its embedded ib_device. */
+static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev)
+{
+	struct mlx4_ib_dev *mdev;
+
+	mdev = container_of(ibdev, struct mlx4_ib_dev, ib_dev);
+	return mdev;
+}
+
+/* Recover the enclosing mlx4_ib_ucontext from its embedded ib_ucontext. */
+static inline struct mlx4_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext)
+{
+	struct mlx4_ib_ucontext *uctx;
+
+	uctx = container_of(ibucontext, struct mlx4_ib_ucontext, ibucontext);
+	return uctx;
+}
+
+/* Recover the enclosing mlx4_ib_pd from its embedded ib_pd. */
+static inline struct mlx4_ib_pd *to_mpd(struct ib_pd *ibpd)
+{
+	struct mlx4_ib_pd *mpd;
+
+	mpd = container_of(ibpd, struct mlx4_ib_pd, ibpd);
+	return mpd;
+}
+
+/* Recover the enclosing mlx4_ib_xrcd from its embedded ib_xrcd. */
+static inline struct mlx4_ib_xrcd *to_mxrcd(struct ib_xrcd *ibxrcd)
+{
+	struct mlx4_ib_xrcd *mxrcd;
+
+	mxrcd = container_of(ibxrcd, struct mlx4_ib_xrcd, ibxrcd);
+	return mxrcd;
+}
+
+/* Recover the enclosing mlx4_ib_cq from its embedded verbs-level ib_cq. */
+static inline struct mlx4_ib_cq *to_mcq(struct ib_cq *ibcq)
+{
+	struct mlx4_ib_cq *cq;
+
+	cq = container_of(ibcq, struct mlx4_ib_cq, ibcq);
+	return cq;
+}
+
+/* Recover the enclosing mlx4_ib_cq from its embedded core-level mlx4_cq. */
+static inline struct mlx4_ib_cq *to_mibcq(struct mlx4_cq *mcq)
+{
+	struct mlx4_ib_cq *cq;
+
+	cq = container_of(mcq, struct mlx4_ib_cq, mcq);
+	return cq;
+}
+
+/* Recover the enclosing mlx4_ib_mr from its embedded ib_mr. */
+static inline struct mlx4_ib_mr *to_mmr(struct ib_mr *ibmr)
+{
+	struct mlx4_ib_mr *mmr;
+
+	mmr = container_of(ibmr, struct mlx4_ib_mr, ibmr);
+	return mmr;
+}
+
+/* Recover the enclosing driver page list from its embedded verbs object. */
+static inline struct mlx4_ib_fast_reg_page_list *to_mfrpl(struct ib_fast_reg_page_list *ibfrpl)
+{
+	struct mlx4_ib_fast_reg_page_list *frpl;
+
+	frpl = container_of(ibfrpl, struct mlx4_ib_fast_reg_page_list, ibfrpl);
+	return frpl;
+}
+
+/* Recover the enclosing mlx4_ib_fmr from its embedded ib_fmr. */
+static inline struct mlx4_ib_fmr *to_mfmr(struct ib_fmr *ibfmr)
+{
+	struct mlx4_ib_fmr *fmr;
+
+	fmr = container_of(ibfmr, struct mlx4_ib_fmr, ibfmr);
+	return fmr;
+}
+/* Recover the enclosing mlx4_ib_qp from its embedded verbs-level ib_qp. */
+static inline struct mlx4_ib_qp *to_mqp(struct ib_qp *ibqp)
+{
+	struct mlx4_ib_qp *qp;
+
+	qp = container_of(ibqp, struct mlx4_ib_qp, ibqp);
+	return qp;
+}
+
+/* Recover the enclosing mlx4_ib_qp from its embedded core-level mlx4_qp. */
+static inline struct mlx4_ib_qp *to_mibqp(struct mlx4_qp *mqp)
+{
+	struct mlx4_ib_qp *qp;
+
+	qp = container_of(mqp, struct mlx4_ib_qp, mqp);
+	return qp;
+}
+
+/* Recover the enclosing mlx4_ib_srq from its embedded verbs-level ib_srq. */
+static inline struct mlx4_ib_srq *to_msrq(struct ib_srq *ibsrq)
+{
+	struct mlx4_ib_srq *srq;
+
+	srq = container_of(ibsrq, struct mlx4_ib_srq, ibsrq);
+	return srq;
+}
+
+/* Recover the enclosing mlx4_ib_srq from its embedded core-level mlx4_srq. */
+static inline struct mlx4_ib_srq *to_mibsrq(struct mlx4_srq *msrq)
+{
+	struct mlx4_ib_srq *srq;
+
+	srq = container_of(msrq, struct mlx4_ib_srq, msrq);
+	return srq;
+}
+
+/* Recover the enclosing mlx4_ib_ah from its embedded ib_ah. */
+static inline struct mlx4_ib_ah *to_mah(struct ib_ah *ibah)
+{
+	struct mlx4_ib_ah *mah;
+
+	mah = container_of(ibah, struct mlx4_ib_ah, ibah);
+	return mah;
+}
+
+/* User doorbell record mapping. */
+int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt,
+			struct mlx4_db *db);
+void mlx4_ib_db_unmap_user(struct mlx4_ib_ucontext *context, struct mlx4_db *db);
+
+/* Memory regions and fast-register page lists (implemented in mr.c). */
+struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc);
+int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt,
+			   struct ib_umem *umem);
+struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+				  u64 virt_addr, int access_flags,
+				  struct ib_udata *udata);
+int mlx4_ib_dereg_mr(struct ib_mr *mr);
+struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd,
+					int max_page_list_len);
+struct ib_fast_reg_page_list *mlx4_ib_alloc_fast_reg_page_list(struct ib_device *ibdev,
+							       int page_list_len);
+void mlx4_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list);
+
+/* Completion queues. */
+int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period);
+int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata);
+struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector,
+				struct ib_ucontext *context,
+				struct ib_udata *udata);
+int mlx4_ib_destroy_cq(struct ib_cq *cq);
+int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
+int mlx4_ib_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
+void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq);
+void mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq);
+
+/* Address handles. */
+struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr);
+int mlx4_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr);
+int mlx4_ib_destroy_ah(struct ib_ah *ah);
+
+/* Shared receive queues (plain and XRC). */
+struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
+				  struct ib_srq_init_attr *init_attr,
+				  struct ib_udata *udata);
+struct ib_srq *mlx4_ib_create_xrc_srq(struct ib_pd *pd,
+				      struct ib_cq *xrc_cq,
+				      struct ib_xrcd *xrcd,
+				      struct ib_srq_init_attr *init_attr,
+				      struct ib_udata *udata);
+int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+		       enum ib_srq_attr_mask attr_mask, struct ib_udata *udata);
+int mlx4_ib_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr);
+int mlx4_ib_destroy_srq(struct ib_srq *srq);
+void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index);
+int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+			  struct ib_recv_wr **bad_wr);
+
+/* Queue pairs. */
+struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
+				struct ib_qp_init_attr *init_attr,
+				struct ib_udata *udata);
+int mlx4_ib_destroy_qp(struct ib_qp *qp);
+int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		      int attr_mask, struct ib_udata *udata);
+int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask,
+		     struct ib_qp_init_attr *qp_init_attr);
+int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+		      struct ib_send_wr **bad_wr);
+int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+		      struct ib_recv_wr **bad_wr);
+
+/* Management datagrams (MADs). */
+int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int ignore_mkey, int ignore_bkey,
+		 int port, struct ib_wc *in_wc, struct ib_grh *in_grh,
+		 void *in_mad, void *response_mad);
+int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags,	u8 port_num,
+			struct ib_wc *in_wc, struct ib_grh *in_grh,
+			struct ib_mad *in_mad, struct ib_mad *out_mad);
+int mlx4_ib_mad_init(struct mlx4_ib_dev *dev);
+void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev);
+
+/* Fast memory regions (implemented in mr.c) and XRC receive QPs. */
+struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int mr_access_flags,
+				  struct ib_fmr_attr *fmr_attr);
+int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, int npages,
+			 u64 iova);
+int mlx4_ib_unmap_fmr(struct list_head *fmr_list);
+int mlx4_ib_fmr_dealloc(struct ib_fmr *fmr);
+int mlx4_ib_create_xrc_rcv_qp(struct ib_qp_init_attr *init_attr,
+			      u32 *qp_num);
+int mlx4_ib_modify_xrc_rcv_qp(struct ib_xrcd *xrcd, u32 qp_num,
+			      struct ib_qp_attr *attr, int attr_mask);
+int mlx4_ib_query_xrc_rcv_qp(struct ib_xrcd *xrcd, u32 qp_num,
+			     struct ib_qp_attr *attr, int attr_mask,
+			     struct ib_qp_init_attr *init_attr);
+int mlx4_ib_reg_xrc_rcv_qp(struct ib_xrcd *xrcd, void *context, u32 qp_num);
+int mlx4_ib_unreg_xrc_rcv_qp(struct ib_xrcd *xrcd, void *context, u32 qp_num);
+
+
+/* Address resolution for an AH that carries a GRH. */
+int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah_attr,
+			u8 *mac, int *is_mcast, u8 port);
+
+/*
+ * Report whether an address handle carries a GRH.
+ *
+ * Returns 1 when the AH's port (bits 24-25 of av.ib.port_pd) has an
+ * Ethernet link layer, or when bit 0x80 of av.ib.g_slid is set; otherwise
+ * returns 0.
+ */
+static inline int mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah)
+{
+	u8 port_num = (be32_to_cpu(ah->av.ib.port_pd) >> 24) & 3;
+
+	return rdma_port_get_link_layer(ah->ibah.device, port_num) ==
+		       IB_LINK_LAYER_ETHERNET ||
+	       (ah->av.ib.g_slid & 0x80) != 0;
+}
+
+/*
+ * NOTE(review): presumably attaches the multicast GID to the given QP -
+ * the definition is not in this header; confirm against its implementation.
+ */
+int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp,
+		   union ib_gid *gid);
+
+#endif /* MLX4_IB_H */
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/mr.c b/sys/ofed/drivers/infiniband/hw/mlx4/mr.c
new file mode 100644
index 0000000..c49b460
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/mr.c
@@ -0,0 +1,424 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "mlx4_ib.h"
+
+/*
+ * Translate IB_ACCESS_* verbs flags into MLX4_PERM_* hardware permissions.
+ * Local read is always granted; every other permission is granted only if
+ * the matching access flag is set in @acc.
+ */
+static u32 convert_access(int acc)
+{
+	u32 perm = MLX4_PERM_LOCAL_READ;
+
+	if (acc & IB_ACCESS_REMOTE_ATOMIC)
+		perm |= MLX4_PERM_ATOMIC;
+	if (acc & IB_ACCESS_REMOTE_WRITE)
+		perm |= MLX4_PERM_REMOTE_WRITE;
+	if (acc & IB_ACCESS_REMOTE_READ)
+		perm |= MLX4_PERM_REMOTE_READ;
+	if (acc & IB_ACCESS_LOCAL_WRITE)
+		perm |= MLX4_PERM_LOCAL_WRITE;
+
+	return perm;
+}
+
+/*
+ * Create a DMA memory region on @pd with access rights @acc.  The region is
+ * allocated with start address 0, length ~0ull and no pages, then enabled.
+ *
+ * Returns the new ib_mr, or an ERR_PTR() carrying -ENOMEM or the error from
+ * mlx4_mr_alloc() / mlx4_mr_enable().
+ */
+struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc)
+{
+	struct mlx4_dev *mdev = to_mdev(pd->device)->dev;
+	struct mlx4_ib_mr *dma_mr;
+	int rc;
+
+	dma_mr = kmalloc(sizeof *dma_mr, GFP_KERNEL);
+	if (!dma_mr)
+		return ERR_PTR(-ENOMEM);
+
+	rc = mlx4_mr_alloc(mdev, to_mpd(pd)->pdn, 0, ~0ull,
+			   convert_access(acc), 0, 0, &dma_mr->mmr);
+	if (rc)
+		goto fail;
+
+	rc = mlx4_mr_enable(mdev, &dma_mr->mmr);
+	if (rc) {
+		mlx4_mr_free(mdev, &dma_mr->mmr);
+		goto fail;
+	}
+
+	/* No user memory is pinned for a DMA MR. */
+	dma_mr->umem = NULL;
+	dma_mr->ibmr.lkey = dma_mr->mmr.key;
+	dma_mr->ibmr.rkey = dma_mr->mmr.key;
+
+	return &dma_mr->ibmr;
+
+fail:
+	kfree(dma_mr);
+	return ERR_PTR(rc);
+}
+
+/*
+ * Write the DMA addresses of every page in @umem into @mtt.
+ *
+ * Addresses are staged in one free page and flushed to mlx4_write_mtt()
+ * each time that page fills up, with a final flush for any remainder.
+ *
+ * Returns 0 on success, -ENOMEM if the staging page cannot be allocated, or
+ * the first error returned by mlx4_write_mtt().
+ */
+int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt,
+			   struct ib_umem *umem)
+{
+	struct ib_umem_chunk *chunk;
+	u64 *batch;
+	int filled = 0;		/* entries currently staged in batch[] */
+	int written = 0;	/* MTT entries already flushed */
+	int ent, pg, npg;
+	int err = 0;
+
+	batch = (u64 *) __get_free_page(GFP_KERNEL);
+	if (!batch)
+		return -ENOMEM;
+
+	list_for_each_entry(chunk, &umem->chunk_list, list) {
+		for (ent = 0; ent < chunk->nmap; ++ent) {
+			npg = sg_dma_len(&chunk->page_list[ent]) >> mtt->page_shift;
+			for (pg = 0; pg < npg; ++pg) {
+				batch[filled++] = sg_dma_address(&chunk->page_list[ent]) +
+					umem->page_size * pg;
+
+				/* Keep staging until the page is full. */
+				if (filled != PAGE_SIZE / sizeof (u64))
+					continue;
+
+				err = mlx4_write_mtt(dev->dev, mtt, written,
+						     filled, batch);
+				if (err)
+					goto out;
+				written += filled;
+				filled = 0;
+			}
+		}
+	}
+
+	if (filled)
+		err = mlx4_write_mtt(dev->dev, mtt, written, filled, batch);
+
+out:
+	free_page((unsigned long) batch);
+	return err;
+}
+
+/*
+ * Try to register a hugetlb-backed user MR with one MTT entry per huge page
+ * rather than one per base page.
+ *
+ * Walks the DMA-mapped chunks of mr->umem and coalesces contiguous segments
+ * into runs of HPAGE_SIZE.  A discontinuity inside a run, or a run that
+ * grows past HPAGE_SIZE, aborts with -EINVAL.
+ *
+ * @pd:           protection domain the MR belongs to
+ * @mr:           MR being built; mr->umem must already be set
+ * @start:        user virtual start address (only its offset within a huge
+ *                page is used)
+ * @virt_addr:    I/O virtual address passed to mlx4_mr_alloc()
+ * @access_flags: IB_ACCESS_* flags
+ *
+ * Returns 0 with mr->mmr allocated and its MTT written.  On any failure a
+ * negative errno is returned and mr->mmr is NOT left allocated, so the
+ * caller may fall back to base-page registration and allocate it again.
+ * Returns -ENOSYS when huge page support is not compiled in.
+ */
+static int handle_hugetlb_user_mr(struct ib_pd *pd, struct mlx4_ib_mr *mr,
+				  u64 start, u64 virt_addr, int access_flags)
+{
+#if defined(CONFIG_HUGETLB_PAGE) && !defined(__powerpc__) && !defined(__ia64__)
+	struct mlx4_ib_dev *dev = to_mdev(pd->device);
+	struct ib_umem_chunk *chunk;
+	unsigned dsize;
+	dma_addr_t daddr;
+	unsigned cur_size = 0;
+	dma_addr_t uninitialized_var(cur_addr);
+	int n;
+	struct ib_umem	*umem = mr->umem;
+	u64 *arr;
+	int err = 0;
+	int i;
+	int j = 0;
+	int off = start & (HPAGE_SIZE - 1);
+
+	/* Number of huge pages spanned by [start, start + length). */
+	n = DIV_ROUND_UP(off + umem->length, HPAGE_SIZE);
+	arr = kmalloc(n * sizeof *arr, GFP_KERNEL);
+	if (!arr)
+		return -ENOMEM;
+
+	list_for_each_entry(chunk, &umem->chunk_list, list)
+		for (i = 0; i < chunk->nmap; ++i) {
+			daddr = sg_dma_address(&chunk->page_list[i]);
+			dsize = sg_dma_len(&chunk->page_list[i]);
+			if (!cur_size) {
+				/* Start of a new huge-page run. */
+				cur_addr = daddr;
+				cur_size = dsize;
+			} else if (cur_addr + cur_size != daddr) {
+				/* Segment does not continue the current run. */
+				err = -EINVAL;
+				goto out;
+			} else
+				cur_size += dsize;
+
+			if (cur_size > HPAGE_SIZE) {
+				err = -EINVAL;
+				goto out;
+			} else if (cur_size == HPAGE_SIZE) {
+				cur_size = 0;
+				arr[j++] = cur_addr;
+			}
+		}
+
+	/* Trailing partial run. */
+	if (cur_size) {
+		arr[j++] = cur_addr;
+	}
+
+	err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, umem->length,
+			    convert_access(access_flags), n, HPAGE_SHIFT, &mr->mmr);
+	if (err)
+		goto out;
+
+	err = mlx4_write_mtt(dev->dev, &mr->mmr.mtt, 0, n, arr);
+	if (err) {
+		/*
+		 * Release the MR we just allocated.  The caller reacts to a
+		 * failure by allocating mr->mmr again for the base-page path,
+		 * which would otherwise leak this allocation.
+		 */
+		mlx4_mr_free(dev->dev, &mr->mmr);
+	}
+
+out:
+	kfree(arr);
+	return err;
+#else
+	return -ENOSYS;
+#endif
+}
+
+/*
+ * Register a user memory region.
+ *
+ * Pins [start, start + length) with ib_umem_get().  If the umem is
+ * hugetlb-backed, registration with huge-page MTT entries is tried first;
+ * if that is not applicable or fails, the region is registered with one MTT
+ * entry per umem page.  The MR is then enabled.
+ *
+ * Returns the new ib_mr, or an ERR_PTR() carrying the first error hit.
+ */
+struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+				  u64 virt_addr, int access_flags,
+				  struct ib_udata *udata)
+{
+	struct mlx4_ib_dev *mdev = to_mdev(pd->device);
+	struct mlx4_ib_mr *umr;
+	int huge_done = 0;
+	int rc;
+
+	umr = kmalloc(sizeof *umr, GFP_KERNEL);
+	if (!umr)
+		return ERR_PTR(-ENOMEM);
+
+	umr->umem = ib_umem_get(pd->uobject->context, start, length,
+				access_flags, 0);
+	if (IS_ERR(umr->umem)) {
+		rc = PTR_ERR(umr->umem);
+		goto free_mr;
+	}
+
+	if (umr->umem->hugetlb)
+		huge_done = !handle_hugetlb_user_mr(pd, umr, start, virt_addr,
+						    access_flags);
+
+	if (!huge_done) {
+		int npages = ib_umem_page_count(umr->umem);
+		int page_shift = ilog2(umr->umem->page_size);
+
+		rc = mlx4_mr_alloc(mdev->dev, to_mpd(pd)->pdn, virt_addr, length,
+				   convert_access(access_flags), npages,
+				   page_shift, &umr->mmr);
+		if (rc)
+			goto release_umem;
+
+		rc = mlx4_ib_umem_write_mtt(mdev, &umr->mmr.mtt, umr->umem);
+		if (rc)
+			goto free_hw_mr;
+	}
+
+	rc = mlx4_mr_enable(mdev->dev, &umr->mmr);
+	if (rc)
+		goto free_hw_mr;
+
+	umr->ibmr.lkey = umr->mmr.key;
+	umr->ibmr.rkey = umr->mmr.key;
+
+	return &umr->ibmr;
+
+free_hw_mr:
+	mlx4_mr_free(mdev->dev, &umr->mmr);
+
+release_umem:
+	ib_umem_release(umr->umem);
+
+free_mr:
+	kfree(umr);
+
+	return ERR_PTR(rc);
+}
+
+/*
+ * Deregister a memory region: free the hardware MR, release the pinned user
+ * memory if there is any, and free the driver object.  Always returns 0.
+ */
+int mlx4_ib_dereg_mr(struct ib_mr *ibmr)
+{
+	struct mlx4_ib_dev *mdev = to_mdev(ibmr->device);
+	struct mlx4_ib_mr *mmr = to_mmr(ibmr);
+
+	mlx4_mr_free(mdev->dev, &mmr->mmr);
+
+	/* DMA and fast-register MRs have no umem attached. */
+	if (mmr->umem != NULL)
+		ib_umem_release(mmr->umem);
+
+	kfree(mmr);
+	return 0;
+}
+
+/*
+ * Allocate and enable an MR for fast registration, sized for up to
+ * @max_page_list_len pages.  It is created with zero address, length and
+ * access rights.
+ *
+ * Returns the new ib_mr, or an ERR_PTR() carrying -ENOMEM or the error from
+ * mlx4_mr_alloc() / mlx4_mr_enable().
+ */
+struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd,
+					int max_page_list_len)
+{
+	struct mlx4_ib_dev *mdev = to_mdev(pd->device);
+	struct mlx4_ib_mr *frmr;
+	int rc;
+
+	frmr = kmalloc(sizeof *frmr, GFP_KERNEL);
+	if (!frmr)
+		return ERR_PTR(-ENOMEM);
+
+	rc = mlx4_mr_alloc(mdev->dev, to_mpd(pd)->pdn, 0, 0, 0,
+			   max_page_list_len, 0, &frmr->mmr);
+	if (rc)
+		goto fail;
+
+	rc = mlx4_mr_enable(mdev->dev, &frmr->mmr);
+	if (rc) {
+		mlx4_mr_free(mdev->dev, &frmr->mmr);
+		goto fail;
+	}
+
+	frmr->umem = NULL;
+	frmr->ibmr.lkey = frmr->mmr.key;
+	frmr->ibmr.rkey = frmr->mmr.key;
+
+	return &frmr->ibmr;
+
+fail:
+	kfree(frmr);
+	return ERR_PTR(rc);
+}
+
+/*
+ * Allocate a fast-register page list of @page_list_len entries: a kmalloc'd
+ * array for the caller plus a DMA-coherent array of the same size.
+ *
+ * Returns the embedded ib_fast_reg_page_list, ERR_PTR(-EINVAL) when
+ * @page_list_len exceeds MAX_FAST_REG_PAGES, or ERR_PTR(-ENOMEM) when any
+ * allocation fails.
+ */
+struct ib_fast_reg_page_list *mlx4_ib_alloc_fast_reg_page_list(struct ib_device *ibdev,
+							       int page_list_len)
+{
+	struct mlx4_ib_dev *mdev = to_mdev(ibdev);
+	struct mlx4_ib_fast_reg_page_list *frpl;
+	int bytes = page_list_len * sizeof (u64);
+
+	if (page_list_len > MAX_FAST_REG_PAGES)
+		return ERR_PTR(-EINVAL);
+
+	frpl = kmalloc(sizeof *frpl, GFP_KERNEL);
+	if (!frpl)
+		return ERR_PTR(-ENOMEM);
+
+	frpl->ibfrpl.page_list = kmalloc(bytes, GFP_KERNEL);
+	if (!frpl->ibfrpl.page_list)
+		goto free_frpl;
+
+	frpl->mapped_page_list = dma_alloc_coherent(&mdev->dev->pdev->dev,
+						    bytes, &frpl->map,
+						    GFP_KERNEL);
+	if (!frpl->mapped_page_list)
+		goto free_list;
+
+	/* Warn if the bus address is not 64-byte aligned. */
+	WARN_ON(frpl->map & 0x3f);
+
+	return &frpl->ibfrpl;
+
+free_list:
+	kfree(frpl->ibfrpl.page_list);
+free_frpl:
+	kfree(frpl);
+	return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * Free a page list created by mlx4_ib_alloc_fast_reg_page_list(): the
+ * DMA-coherent array (sized from max_page_list_len), the kmalloc'd array
+ * and the wrapper object.
+ */
+void mlx4_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list)
+{
+	struct mlx4_ib_fast_reg_page_list *frpl = to_mfrpl(page_list);
+	struct mlx4_ib_dev *mdev = to_mdev(page_list->device);
+	int bytes = page_list->max_page_list_len * sizeof (u64);
+
+	dma_free_coherent(&mdev->dev->pdev->dev, bytes,
+			  frpl->mapped_page_list, frpl->map);
+	kfree(frpl->ibfrpl.page_list);
+	kfree(frpl);
+}
+
+/*
+ * Allocate and enable a fast memory region on @pd with access rights @acc
+ * and the limits given in @fmr_attr (max_pages, max_maps, page_shift).
+ *
+ * Returns the new ib_fmr, or an ERR_PTR() carrying -ENOMEM or the error from
+ * mlx4_fmr_alloc() / mlx4_fmr_enable().
+ */
+struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int acc,
+				 struct ib_fmr_attr *fmr_attr)
+{
+	struct mlx4_ib_dev *mdev = to_mdev(pd->device);
+	struct mlx4_ib_fmr *new_fmr;
+	int rc;
+
+	new_fmr = kmalloc(sizeof *new_fmr, GFP_KERNEL);
+	if (!new_fmr)
+		return ERR_PTR(-ENOMEM);
+
+	rc = mlx4_fmr_alloc(mdev->dev, to_mpd(pd)->pdn, convert_access(acc),
+			    fmr_attr->max_pages, fmr_attr->max_maps,
+			    fmr_attr->page_shift, &new_fmr->mfmr);
+	if (rc)
+		goto fail;
+
+	rc = mlx4_fmr_enable(mdev->dev, &new_fmr->mfmr);
+	if (rc) {
+		mlx4_mr_free(mdev->dev, &new_fmr->mfmr.mr);
+		goto fail;
+	}
+
+	new_fmr->ibfmr.lkey = new_fmr->mfmr.mr.key;
+	new_fmr->ibfmr.rkey = new_fmr->mfmr.mr.key;
+
+	return &new_fmr->ibfmr;
+
+fail:
+	kfree(new_fmr);
+
+	return ERR_PTR(rc);
+}
+
+/*
+ * Map @npages physical pages from @page_list into the FMR at I/O virtual
+ * address @iova.  Thin wrapper around mlx4_map_phys_fmr(), which is also
+ * handed pointers to the FMR's lkey and rkey.  Returns its result.
+ */
+int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+		      int npages, u64 iova)
+{
+	struct mlx4_ib_fmr *fmr = to_mfmr(ibfmr);
+	struct mlx4_dev *mdev = to_mdev(fmr->ibfmr.device)->dev;
+	int rc;
+
+	rc = mlx4_map_phys_fmr(mdev, &fmr->mfmr, page_list, npages, iova,
+			       &fmr->ibfmr.lkey, &fmr->ibfmr.rkey);
+	return rc;
+}
+
+/*
+ * Unmap every FMR on @fmr_list and then issue a single SYNC_TPT.
+ *
+ * All FMRs must belong to the same mlx4 device, otherwise -EINVAL is
+ * returned before anything is unmapped.  An empty list returns 0.  A
+ * SYNC_TPT failure is only logged; the function still returns 0.
+ */
+int mlx4_ib_unmap_fmr(struct list_head *fmr_list)
+{
+	struct mlx4_dev *mdev = NULL;
+	struct ib_fmr *ibfmr;
+	int rc;
+
+	/* Pass 1: make sure the whole list is on one device. */
+	list_for_each_entry(ibfmr, fmr_list, list) {
+		struct mlx4_dev *cur = to_mdev(ibfmr->device)->dev;
+
+		if (mdev && cur != mdev)
+			return -EINVAL;
+		mdev = cur;
+	}
+
+	if (mdev == NULL)
+		return 0;
+
+	/* Pass 2: unmap each FMR. */
+	list_for_each_entry(ibfmr, fmr_list, list) {
+		struct mlx4_ib_fmr *fmr = to_mfmr(ibfmr);
+
+		mlx4_fmr_unmap(mdev, &fmr->mfmr, &fmr->ibfmr.lkey,
+			       &fmr->ibfmr.rkey);
+	}
+
+	/*
+	 * Make sure all MPT status updates are visible before issuing
+	 * SYNC_TPT firmware command.
+	 */
+	wmb();
+
+	rc = mlx4_SYNC_TPT(mdev);
+	if (rc)
+		printk(KERN_WARNING "mlx4_ib: SYNC_TPT error %d when "
+		       "unmapping FMRs\n", rc);
+
+	return 0;
+}
+
+/*
+ * Free an FMR.  The driver object is released only if mlx4_fmr_free()
+ * succeeds; on failure its error is returned and the object is kept.
+ */
+int mlx4_ib_fmr_dealloc(struct ib_fmr *ibfmr)
+{
+	struct mlx4_ib_dev *mdev = to_mdev(ibfmr->device);
+	struct mlx4_ib_fmr *fmr = to_mfmr(ibfmr);
+	int rc;
+
+	rc = mlx4_fmr_free(mdev->dev, &fmr->mfmr);
+	if (rc)
+		return rc;
+
+	kfree(fmr);
+	return 0;
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/qp.c b/sys/ofed/drivers/infiniband/hw/mlx4/qp.c
new file mode 100644
index 0000000..8958c1e
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/qp.c
@@ -0,0 +1,2770 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/log2.h>
+#include <linux/netdevice.h>
+
+#include <rdma/ib_cache.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_addr.h>
+
+#include <linux/mlx4/qp.h>
+#include <linux/io.h>
+
+#include "mlx4_ib.h"
+#include "user.h"
+
+/* NOTE(review): presumably how often an ACK is requested - usage is outside this chunk. */
+enum {
+	MLX4_IB_ACK_REQ_FREQ	= 8,
+};
+
+/* Default scheduling-queue values and link-type codes. */
+enum {
+	MLX4_IB_DEFAULT_SCHED_QUEUE	= 0x83,
+	MLX4_IB_DEFAULT_QP0_SCHED_QUEUE	= 0x3f,
+	MLX4_IB_LINK_TYPE_IB		= 0,
+	MLX4_IB_LINK_TYPE_ETH		= 1,
+};
+
+/* Header size limits, in bytes, used by send_wqe_overhead(). */
+enum {
+	/*
+	 * Largest possible UD header: send with GRH and immediate data.
+	 * 4 bytes added to accommodate for eth header instead of lrh
+	 */
+	MLX4_IB_UD_HEADER_SIZE		= 76,
+	MLX4_IB_MAX_RAW_ETY_HDR_SIZE	= 12
+};
+
+/* Ethertype value used for IBoE frames. */
+enum {
+	MLX4_IBOE_ETHERTYPE = 0x8915
+};
+
+/*
+ * One entry on mlx4_ib_qp.xrc_reg_list.  mlx4_ib_qp_event() passes
+ * context to the QP's event handler for every entry on the list.
+ */
+struct mlx4_ib_xrc_reg_entry {
+	struct list_head list;
+	void *context;
+};
+
+/*
+ * Special QP: an mlx4_ib_qp (which must stay the first use of container_of
+ * in to_msqp()) extended with the state needed to build UD headers in
+ * software.  header_buf is sized for the largest UD header.
+ */
+struct mlx4_ib_sqp {
+	struct mlx4_ib_qp	qp;
+	int			pkey_index;
+	u32			qkey;
+	u32			send_psn;
+	struct ib_ud_header	ud_header;
+	u8			header_buf[MLX4_IB_UD_HEADER_SIZE];
+};
+
+/* NOTE(review): presumably the minimum SQ stride as a log2 byte count (64 bytes) - confirm at its users. */
+enum {
+	MLX4_IB_MIN_SQ_STRIDE = 6
+};
+
+/*
+ * Lookup table from verbs work-request opcode (IB_WR_*) to the matching
+ * hardware opcode (MLX4_OPCODE_*), stored big-endian.  IB_WR_* values not
+ * listed here are implicitly zero.
+ */
+static const __be32 mlx4_ib_opcode[] = {
+	[IB_WR_SEND]				= cpu_to_be32(MLX4_OPCODE_SEND),
+	[IB_WR_LSO]				= cpu_to_be32(MLX4_OPCODE_LSO),
+	[IB_WR_SEND_WITH_IMM]			= cpu_to_be32(MLX4_OPCODE_SEND_IMM),
+	[IB_WR_RDMA_WRITE]			= cpu_to_be32(MLX4_OPCODE_RDMA_WRITE),
+	[IB_WR_RDMA_WRITE_WITH_IMM]		= cpu_to_be32(MLX4_OPCODE_RDMA_WRITE_IMM),
+	[IB_WR_RDMA_READ]			= cpu_to_be32(MLX4_OPCODE_RDMA_READ),
+	[IB_WR_ATOMIC_CMP_AND_SWP]		= cpu_to_be32(MLX4_OPCODE_ATOMIC_CS),
+	[IB_WR_ATOMIC_FETCH_AND_ADD]		= cpu_to_be32(MLX4_OPCODE_ATOMIC_FA),
+	[IB_WR_SEND_WITH_INV]			= cpu_to_be32(MLX4_OPCODE_SEND_INVAL),
+	[IB_WR_LOCAL_INV]			= cpu_to_be32(MLX4_OPCODE_LOCAL_INVAL),
+	[IB_WR_FAST_REG_MR]			= cpu_to_be32(MLX4_OPCODE_FMR),
+	[IB_WR_MASKED_ATOMIC_CMP_AND_SWP]	= cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_CS),
+	[IB_WR_MASKED_ATOMIC_FETCH_AND_ADD]	= cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_FA),
+};
+
+/*
+ * wc_wmb(): fallback definition, used only when the platform headers do not
+ * already provide one.  It picks a per-architecture instruction sequence
+ * and falls back to wmb() elsewhere.
+ * NOTE(review): presumably a store barrier for write-combining mappings -
+ * confirm at the call sites.
+ */
+#ifndef wc_wmb
+	#if defined(__i386__)
+		#define wc_wmb() __asm volatile("lock; addl $0,0(%%esp) " ::: "memory")
+	#elif defined(__x86_64__)
+		#define wc_wmb() __asm volatile("sfence" ::: "memory")
+	#elif defined(__ia64__)
+		#define wc_wmb() __asm volatile("fwb" ::: "memory")
+	#else
+		#define wc_wmb() wmb()
+	#endif
+#endif
+
+
+/* Recover the enclosing special-QP object from its embedded mlx4_ib_qp. */
+static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp)
+{
+	struct mlx4_ib_sqp *sqp;
+
+	sqp = container_of(mqp, struct mlx4_ib_sqp, qp);
+	return sqp;
+}
+
+/*
+ * Non-zero when the QP number lies in the special-QP range
+ * [sqp_start, sqp_start + 3].
+ */
+static int is_sqp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
+{
+	if (qp->mqp.qpn < dev->dev->caps.sqp_start)
+		return 0;
+
+	return qp->mqp.qpn <= dev->dev->caps.sqp_start + 3;
+}
+
+/*
+ * Non-zero when the QP number is one of the first two special QPs,
+ * i.e. lies in [sqp_start, sqp_start + 1].
+ */
+static int is_qp0(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
+{
+	if (qp->mqp.qpn < dev->dev->caps.sqp_start)
+		return 0;
+
+	return qp->mqp.qpn <= dev->dev->caps.sqp_start + 1;
+}
+
+/* Address of byte @offset inside the QP's WQE buffer. */
+static void *get_wqe(struct mlx4_ib_qp *qp, int offset)
+{
+	void *wqe = mlx4_buf_offset(&qp->buf, offset);
+
+	return wqe;
+}
+
+/* Address of receive WQE number @n. */
+static void *get_recv_wqe(struct mlx4_ib_qp *qp, int n)
+{
+	int byte_off = qp->rq.offset + (n << qp->rq.wqe_shift);
+
+	return get_wqe(qp, byte_off);
+}
+
+/* Address of send WQE number @n. */
+static void *get_send_wqe(struct mlx4_ib_qp *qp, int n)
+{
+	int byte_off = qp->sq.offset + (n << qp->sq.wqe_shift);
+
+	return get_wqe(qp, byte_off);
+}
+
+/*
+ * Stamp a SQ WQE so that it is invalid if prefetched by marking the
+ * first four bytes of every 64 byte chunk with
+ *     0x7FFFFFFF | (invalid_ownership_value << 31).
+ *
+ * When the max work request size is less than or equal to the WQE
+ * basic block size, as an optimization, we can stamp all WQEs with
+ * 0xffffffff, and skip the very first chunk of each WQE.
+ *
+ * @qp:   QP whose send queue is stamped
+ * @n:    index of the first WQE to stamp (wrapped with wqe_cnt - 1)
+ * @size: number of bytes to stamp; only used when a WR may span several
+ *        WQEs, otherwise the size is read from the WQE's control segment
+ */
+static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size)
+{
+	__be32 *wqe;
+	int i;
+	int s;
+	int ind;
+	void *buf;
+	__be32 stamp;
+	struct mlx4_wqe_ctrl_seg *ctrl;
+
+	if (qp->sq_max_wqes_per_wr > 1) {
+		/* Round up to whole WQEs and walk them in 64-byte steps. */
+		s = roundup(size, 1U << qp->sq.wqe_shift);
+		for (i = 0; i < s; i += 64) {
+			ind = (i >> qp->sq.wqe_shift) + n;
+			/* Top bit of the stamp follows the wqe_cnt bit of the index. */
+			stamp = ind & qp->sq.wqe_cnt ? cpu_to_be32(0x7fffffff) :
+						       cpu_to_be32(0xffffffff);
+			buf = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
+			wqe = buf + (i & ((1 << qp->sq.wqe_shift) - 1));
+			*wqe = stamp;
+		}
+	} else {
+		ctrl = buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
+		/* Low 6 bits of fence_size hold the WQE size in 16-byte units. */
+		s = (ctrl->fence_size & 0x3f) << 4;
+		/* Start at 64: the first chunk of the WQE is left untouched. */
+		for (i = 64; i < s; i += 64) {
+			wqe = buf + i;
+			*wqe = cpu_to_be32(0xffffffff);
+		}
+	}
+}
+
+/*
+ * Build a NOP work request of @size bytes in send WQE slot @n, hand it to
+ * the hardware by writing its owner/opcode word, and then stamp the WQE
+ * that lies sq_spare_wqes slots ahead.
+ *
+ * For UD QPs a zeroed datagram segment carrying the QP's port and PD number
+ * follows the control segment.  Any space left in the WQE is covered by a
+ * single inline data segment.
+ */
+static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size)
+{
+	struct mlx4_wqe_ctrl_seg *ctrl;
+	struct mlx4_wqe_inline_seg *inl;
+	void *wqe;
+	int s;
+
+	ctrl = wqe = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
+	/* s tracks how many bytes of the WQE are occupied so far. */
+	s = sizeof(struct mlx4_wqe_ctrl_seg);
+
+	if (qp->ibqp.qp_type == IB_QPT_UD) {
+		struct mlx4_wqe_datagram_seg *dgram = wqe + sizeof *ctrl;
+		struct mlx4_av *av = (struct mlx4_av *)dgram->av;
+		memset(dgram, 0, sizeof *dgram);
+		av->port_pd = cpu_to_be32((qp->port << 24) | to_mpd(qp->ibqp.pd)->pdn);
+		s += sizeof(struct mlx4_wqe_datagram_seg);
+	}
+
+	/* Pad the remainder of the WQE with an inline data segment. */
+	if (size > s) {
+		inl = wqe + s;
+		/* Bit 31 is set alongside the byte count of the padding. */
+		inl->byte_count = cpu_to_be32(1 << 31 | (size - s - sizeof *inl));
+	}
+	ctrl->srcrb_flags = 0;
+	/* WQE size expressed in 16-byte units. */
+	ctrl->fence_size = size / 16;
+	/*
+	 * Make sure descriptor is fully written before setting ownership bit
+	 * (because HW can start executing as soon as we do).
+	 */
+	wmb();
+
+	/* Bit 31 (ownership) follows the wqe_cnt bit of the index. */
+	ctrl->owner_opcode = cpu_to_be32(MLX4_OPCODE_NOP | MLX4_WQE_CTRL_NEC) |
+		(n & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);
+
+	stamp_send_wqe(qp, n + qp->sq_spare_wqes, size);
+}
+
+/*
+ * Keep a work request from wrapping around the end of the send queue.
+ *
+ * If fewer than sq_max_wqes_per_wr slots remain before the ring wraps, the
+ * remaining slots are filled with one NOP WQE and the index that follows
+ * them is returned; otherwise @ind is returned unchanged.
+ */
+static inline unsigned pad_wraparound(struct mlx4_ib_qp *qp, int ind)
+{
+	unsigned left = qp->sq.wqe_cnt - (ind & (qp->sq.wqe_cnt - 1));
+
+	if (!unlikely(left < qp->sq_max_wqes_per_wr))
+		return ind;
+
+	post_nop_wqe(qp, ind, left << qp->sq.wqe_shift);
+	return ind + left;
+}
+
+/*
+ * Asynchronous QP event callback from the mlx4 core.
+ *
+ * On a path-migration event the QP's port is switched to its alternate
+ * port.  If the consumer registered an event handler, the core event type
+ * is translated to an IB event and delivered: once per registered context
+ * for XRC receive QPs (with IB_XRC_QP_EVENT_FLAG set), otherwise once with
+ * the QP's own context.  Unknown event types are logged and dropped.
+ */
+static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type)
+{
+	struct mlx4_ib_qp *mqp = to_mibqp(qp);
+	struct ib_qp *ibqp = &mqp->ibqp;
+	struct mlx4_ib_xrc_reg_entry *reg;
+	struct ib_event event;
+	unsigned long flags;
+
+	if (type == MLX4_EVENT_TYPE_PATH_MIG)
+		mqp->port = mqp->alt_port;
+
+	if (!ibqp->event_handler)
+		return;
+
+	event.device = ibqp->device;
+
+	switch (type) {
+	case MLX4_EVENT_TYPE_PATH_MIG:
+		event.event = IB_EVENT_PATH_MIG;
+		break;
+	case MLX4_EVENT_TYPE_COMM_EST:
+		event.event = IB_EVENT_COMM_EST;
+		break;
+	case MLX4_EVENT_TYPE_SQ_DRAINED:
+		event.event = IB_EVENT_SQ_DRAINED;
+		break;
+	case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE:
+		event.event = IB_EVENT_QP_LAST_WQE_REACHED;
+		break;
+	case MLX4_EVENT_TYPE_WQ_CATAS_ERROR:
+		event.event = IB_EVENT_QP_FATAL;
+		break;
+	case MLX4_EVENT_TYPE_PATH_MIG_FAILED:
+		event.event = IB_EVENT_PATH_MIG_ERR;
+		break;
+	case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+		event.event = IB_EVENT_QP_REQ_ERR;
+		break;
+	case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR:
+		event.event = IB_EVENT_QP_ACCESS_ERR;
+		break;
+	default:
+		printk(KERN_WARNING "mlx4_ib: Unexpected event type %d "
+		       "on QP %06x\n", type, qp->qpn);
+		return;
+	}
+
+	if (!unlikely(ibqp->qp_type == IB_QPT_XRC &&
+		      mqp->flags & MLX4_IB_XRC_RCV)) {
+		/* Ordinary QP: one delivery with the QP's own context. */
+		event.element.qp = ibqp;
+		ibqp->event_handler(&event, ibqp->qp_context);
+		return;
+	}
+
+	/* XRC receive QP: fan the event out to every registered context. */
+	event.event |= IB_XRC_QP_EVENT_FLAG;
+	event.element.xrc_qp_num = ibqp->qp_num;
+	spin_lock_irqsave(&mqp->xrc_reg_list_lock, flags);
+	list_for_each_entry(reg, &mqp->xrc_reg_list, list)
+		ibqp->event_handler(&event, reg->context);
+	spin_unlock_irqrestore(&mqp->xrc_reg_list_lock, flags);
+}
+
+/*
+ * Return the number of bytes of a send WQE that are taken up by fixed
+ * segments (i.e. not available for data/inline segments) for a QP of the
+ * given type.  flags is only consulted for UD QPs, where MLX4_IB_QP_LSO
+ * adds 128 bytes.
+ */
+static int send_wqe_overhead(enum ib_qp_type type, u32 flags)
+{
+	/* Every send WQE begins with a control segment. */
+	int overhead = sizeof (struct mlx4_wqe_ctrl_seg);
+
+	switch (type) {
+	case IB_QPT_UD:
+		/* UD WQEs always carry a datagram segment. */
+		overhead += sizeof (struct mlx4_wqe_datagram_seg);
+		if (flags & MLX4_IB_QP_LSO)
+			overhead += 128;
+		break;
+
+	case IB_QPT_UC:
+		/* UC WQEs may carry a remote address segment. */
+		overhead += sizeof (struct mlx4_wqe_raddr_seg);
+		break;
+
+	case IB_QPT_XRC:
+	case IB_QPT_RC:
+		/* RC/XRC WQEs may carry atomic and remote address segments. */
+		overhead += sizeof (struct mlx4_wqe_atomic_seg) +
+			sizeof (struct mlx4_wqe_raddr_seg);
+		break;
+
+	case IB_QPT_SMI:
+	case IB_QPT_GSI:
+		/*
+		 * MLX WQEs need two extra inline data segments: one for
+		 * the UD header and one with room for the ICRC.
+		 */
+		overhead +=
+			ALIGN(MLX4_IB_UD_HEADER_SIZE +
+			      DIV_ROUND_UP(MLX4_IB_UD_HEADER_SIZE,
+					   MLX4_INLINE_ALIGN) *
+			      sizeof (struct mlx4_wqe_inline_seg),
+			      sizeof (struct mlx4_wqe_data_seg)) +
+			ALIGN(4 +
+			      sizeof (struct mlx4_wqe_inline_seg),
+			      sizeof (struct mlx4_wqe_data_seg));
+		break;
+
+	case IB_QPT_RAW_ETY:
+		/* One inline segment holding the raw Ethertype header. */
+		overhead +=
+			ALIGN(MLX4_IB_MAX_RAW_ETY_HDR_SIZE +
+			      sizeof(struct mlx4_wqe_inline_seg),
+			      sizeof(struct mlx4_wqe_data_seg));
+		break;
+
+	default:
+		/* Control segment only. */
+		break;
+	}
+
+	return overhead;
+}
+
+/*
+ * Validate the receive queue capabilities requested in cap and derive the
+ * RQ geometry stored in qp (wqe_cnt, max_gs, wqe_shift, max_post).  On
+ * success cap->max_recv_wr and cap->max_recv_sge are overwritten with the
+ * values actually granted.
+ *
+ * is_user is non-zero for QPs created on behalf of userspace;
+ * has_srq_or_is_xrc is non-zero when the QP must not have an RQ of its own.
+ *
+ * Returns 0 on success or -EINVAL when the request cannot be honoured.
+ */
+static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
+		       int is_user, int has_srq_or_is_xrc, struct mlx4_ib_qp *qp)
+{
+	/* Refuse anything beyond the device limits up front. */
+	if (cap->max_recv_wr > dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE ||
+	    cap->max_recv_sge >
+		min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg)) {
+		mlx4_ib_dbg("Requested RQ size (sge or wr) too large");
+		return -EINVAL;
+	}
+
+	if (!has_srq_or_is_xrc) {
+		/* The HW needs at least one RQ entry with one gather entry. */
+		if (is_user && !(cap->max_recv_wr && cap->max_recv_sge)) {
+			mlx4_ib_dbg("user QP RQ has 0 wr's or 0 sge's "
+				    "(wr: 0x%x, sge: 0x%x)", cap->max_recv_wr,
+				    cap->max_recv_sge);
+			return -EINVAL;
+		}
+
+		qp->rq.wqe_cnt = roundup_pow_of_two(max(1U, cap->max_recv_wr));
+		qp->rq.max_gs = roundup_pow_of_two(max(1U, cap->max_recv_sge));
+		qp->rq.wqe_shift = ilog2(qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg));
+	} else {
+		/* A QP that receives through an SRQ must not ask for an RQ. */
+		if (cap->max_recv_wr) {
+			mlx4_ib_dbg("non-zero RQ size for QP using SRQ");
+			return -EINVAL;
+		}
+
+		qp->rq.max_gs = 0;
+		qp->rq.wqe_cnt = 0;
+	}
+
+	if (is_user) {
+		/* Userspace keeps seeing the raw values, to preserve the ABI. */
+		qp->rq.max_post = qp->rq.wqe_cnt;
+		cap->max_recv_wr = qp->rq.max_post;
+		cap->max_recv_sge = qp->rq.max_gs;
+		return 0;
+	}
+
+	/* Kernel consumers get the values clamped to the device limits. */
+	qp->rq.max_post =
+		min(dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE, qp->rq.wqe_cnt);
+	cap->max_recv_wr = qp->rq.max_post;
+	cap->max_recv_sge = min(qp->rq.max_gs,
+				min(dev->dev->caps.max_sq_sg,
+				    dev->dev->caps.max_rq_sg));
+
+	return 0;
+}
+
+/*
+ * Size the send queue of a kernel QP.
+ *
+ * Validates the requested send capabilities in cap against the device
+ * limits, picks the basic block size (qp->sq.wqe_shift) and the number of
+ * basic blocks (qp->sq.wqe_cnt), and then fills in the derived fields:
+ * sq_max_wqes_per_wr, sq_spare_wqes, sq.max_gs, buf_size, the RQ/SQ offsets
+ * inside the buffer, sq.max_post and max_inline_data.  cap->max_send_wr and
+ * cap->max_send_sge are overwritten with the values actually granted.
+ *
+ * The function reads qp->rq.wqe_cnt and qp->rq.wqe_shift, so the receive
+ * queue must already have been sized, and it reads qp->flags and
+ * qp->sq_signal_bits, which the caller sets beforehand.
+ *
+ * Returns 0 on success or -EINVAL if the request cannot be satisfied.
+ */
+static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
+			      enum ib_qp_type type, struct mlx4_ib_qp *qp)
+{
+	int s;
+
+	/* Sanity check SQ size before proceeding */
+	if (cap->max_send_wr	 > (dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE) ||
+	    cap->max_send_sge	 >
+		min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg) ||
+	    cap->max_inline_data + send_wqe_overhead(type, qp->flags) +
+	    sizeof (struct mlx4_wqe_inline_seg) > dev->dev->caps.max_sq_desc_sz) {
+		mlx4_ib_dbg("Requested SQ resources exceed device maxima");
+		return -EINVAL;
+	}
+
+	/*
+	 * For MLX transport we need 2 extra S/G entries:
+	 * one for the header and one for the checksum at the end
+	 */
+	if ((type == IB_QPT_SMI || type == IB_QPT_GSI) &&
+	    cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg) {
+		mlx4_ib_dbg("No space for SQP hdr/csum sge's");
+		return -EINVAL;
+	}
+
+	/* Raw Ethertype QPs need one extra S/G entry for the header. */
+	if (type == IB_QPT_RAW_ETY &&
+	    cap->max_send_sge + 1 > dev->dev->caps.max_sq_sg) {
+		mlx4_ib_dbg("No space for RAW ETY hdr");
+		return -EINVAL;
+	}
+
+	/*
+	 * s is the size in bytes of the largest WQE the caller may post:
+	 * the fixed overhead plus whichever is larger, the gather list or
+	 * the inline data with its inline segment header.
+	 */
+	s = max(cap->max_send_sge * sizeof (struct mlx4_wqe_data_seg),
+		cap->max_inline_data + sizeof (struct mlx4_wqe_inline_seg)) +
+		send_wqe_overhead(type, qp->flags);
+
+	if (s > dev->dev->caps.max_sq_desc_sz)
+		return -EINVAL;
+
+	/*
+	 * Hermon supports shrinking WQEs, such that a single work
+	 * request can include multiple units of 1 << wqe_shift.  This
+	 * way, work requests can differ in size, and do not have to
+	 * be a power of 2 in size, saving memory and speeding up send
+	 * WR posting.  Unfortunately, if we do this then the
+	 * wqe_index field in CQEs can't be used to look up the WR ID
+	 * anymore, so we do this only if selective signaling is off.
+	 *
+	 * Further, on 32-bit platforms, we can't use vmap() to make
+	 * the QP buffer virtually contiguous.  Thus we have to use
+	 * constant-sized WRs to make sure a WR is always fully within
+	 * a single page-sized chunk.
+	 *
+	 * Finally, we use NOP work requests to pad the end of the
+	 * work queue, to avoid wrap-around in the middle of WR.  We
+	 * set NEC bit to avoid getting completions with error for
+	 * these NOP WRs, but since NEC is only supported starting
+	 * with firmware 2.2.232, we use constant-sized WRs for older
+	 * firmware.
+	 *
+	 * And, since MLX QPs only support SEND, we use constant-sized
+	 * WRs in this case.
+	 *
+	 * We look for the smallest value of wqe_shift such that the
+	 * resulting number of wqes does not exceed device
+	 * capabilities.
+	 *
+	 * We set WQE size to at least 64 bytes, this way stamping
+	 * invalidates each WQE.
+	 */
+	if (dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC &&
+	    qp->sq_signal_bits && BITS_PER_LONG == 64 &&
+	    type != IB_QPT_SMI && type != IB_QPT_GSI && type != IB_QPT_RAW_ETY)
+		qp->sq.wqe_shift = ilog2(64);
+	else
+		qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s));
+
+	/*
+	 * Grow the basic block size until the queue fits within max_wqes.
+	 * Once a WR fits in a single basic block, growing further cannot
+	 * reduce the block count, so give up at that point.
+	 */
+	for (;;) {
+		qp->sq_max_wqes_per_wr = DIV_ROUND_UP(s, 1U << qp->sq.wqe_shift);
+
+		/*
+		 * We need to leave 2 KB + 1 WR of headroom in the SQ to
+		 * allow HW to prefetch.
+		 */
+		qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + qp->sq_max_wqes_per_wr;
+		qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr *
+						    qp->sq_max_wqes_per_wr +
+						    qp->sq_spare_wqes);
+
+		if (qp->sq.wqe_cnt <= dev->dev->caps.max_wqes)
+			break;
+
+		if (qp->sq_max_wqes_per_wr <= 1)
+			return -EINVAL;
+
+		++qp->sq.wqe_shift;
+	}
+
+	/* Gather entries that fit in one WR once the overhead is subtracted. */
+	qp->sq.max_gs = (min(dev->dev->caps.max_sq_desc_sz,
+			     (qp->sq_max_wqes_per_wr << qp->sq.wqe_shift)) -
+			 send_wqe_overhead(type, qp->flags)) /
+		sizeof (struct mlx4_wqe_data_seg);
+
+	/*
+	 * RQ and SQ share one buffer; the queue with the larger stride is
+	 * placed first, at offset 0.
+	 */
+	qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
+		(qp->sq.wqe_cnt << qp->sq.wqe_shift);
+	if (qp->rq.wqe_shift > qp->sq.wqe_shift) {
+		qp->rq.offset = 0;
+		qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
+	} else {
+		qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift;
+		qp->sq.offset = 0;
+	}
+
+	/* Report back what was actually granted. */
+	cap->max_send_wr  = qp->sq.max_post =
+		(qp->sq.wqe_cnt - qp->sq_spare_wqes) / qp->sq_max_wqes_per_wr;
+	cap->max_send_sge = min(qp->sq.max_gs,
+				min(dev->dev->caps.max_sq_sg,
+				    dev->dev->caps.max_rq_sg));
+	qp->max_inline_data = cap->max_inline_data;
+
+	return 0;
+}
+
+/*
+ * Take the send queue geometry requested by userspace in ucmd (log2 of the
+ * number of basic blocks and log2 of the basic block stride), validate it
+ * against the device limits and record it in qp, together with the total
+ * buffer size for RQ plus SQ.
+ *
+ * The receive queue fields of qp (rq.wqe_cnt, rq.wqe_shift) are read here,
+ * so the RQ must already have been sized.
+ *
+ * Returns 0 on success or -EINVAL if the request is out of range.
+ */
+static int set_user_sq_size(struct mlx4_ib_dev *dev,
+			    struct mlx4_ib_qp *qp,
+			    struct mlx4_ib_create_qp *ucmd)
+{
+	/*
+	 * Sanity check SQ size before proceeding.
+	 *
+	 * log_sq_bb_count comes straight from userspace, so bound it before
+	 * it is used as a shift count: "1 << n" on an int is undefined for
+	 * n >= 31, and a wrapped or negative result would slip past the
+	 * max_wqes comparison.
+	 */
+	if (ucmd->log_sq_bb_count >= 31					 ||
+	    (1 << ucmd->log_sq_bb_count) > dev->dev->caps.max_wqes	 ||
+	    ucmd->log_sq_stride >
+		ilog2(roundup_pow_of_two(dev->dev->caps.max_sq_desc_sz)) ||
+	    ucmd->log_sq_stride < MLX4_IB_MIN_SQ_STRIDE) {
+		mlx4_ib_dbg("Requested max wqes or wqe stride exceeds max");
+		return -EINVAL;
+	}
+
+	qp->sq.wqe_cnt   = 1 << ucmd->log_sq_bb_count;
+	qp->sq.wqe_shift = ucmd->log_sq_stride;
+
+	qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
+		(qp->sq.wqe_cnt << qp->sq.wqe_shift);
+
+	return 0;
+}
+
+/*
+ * Common part of QP creation, shared by regular and special QPs.
+ *
+ * Initializes the locks and lists in qp, sizes the receive queue, and then
+ * follows one of two paths depending on whether the PD belongs to userspace
+ * (pd->uobject set):
+ *
+ *  - user QP:   copy the creation command from udata, size the SQ from it,
+ *               pin the user buffer, build and write the MTT, and map the
+ *               user doorbell record (unless the QP uses an SRQ or is XRC);
+ *  - kernel QP: size the SQ from init_attr->cap, allocate a doorbell record
+ *               (unless SRQ/XRC), optionally a blue flame register, the QP
+ *               buffer, the MTT, and the send/receive wrid arrays.
+ *
+ * Finally a QP number is reserved (unless the caller passes a fixed one in
+ * sqpn), the low-level QP is allocated, and the doorbell QPN and the event
+ * callback are set up.
+ *
+ * Returns 0 on success or a negative errno; on failure everything that was
+ * acquired so far is released through the err_* labels at the bottom.
+ */
+static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
+			    struct ib_qp_init_attr *init_attr,
+			    struct ib_udata *udata, int sqpn, struct mlx4_ib_qp *qp)
+{
+	int qpn;
+	int err;
+
+	mutex_init(&qp->mutex);
+	spin_lock_init(&qp->sq.lock);
+	spin_lock_init(&qp->rq.lock);
+	spin_lock_init(&qp->xrc_reg_list_lock);
+	INIT_LIST_HEAD(&qp->gid_list);
+
+	qp->state	 = IB_QPS_RESET;
+	/* With IB_SIGNAL_ALL_WR every send WQE requests a CQ update. */
+	if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
+		qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
+
+	err = set_rq_size(dev, &init_attr->cap, !!pd->uobject,
+			  !!init_attr->srq || !!init_attr->xrc_domain , qp);
+	if (err)
+		goto err;
+
+	if (pd->uobject) {
+		struct mlx4_ib_create_qp ucmd;
+
+		if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
+			err = -EFAULT;
+			goto err;
+		}
+
+		qp->sq_no_prefetch = ucmd.sq_no_prefetch;
+
+		err = set_user_sq_size(dev, qp, &ucmd);
+		if (err)
+			goto err;
+
+		/* Pin the user-provided queue buffer. */
+		qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr,
+				       qp->buf_size, 0, 0);
+		if (IS_ERR(qp->umem)) {
+			err = PTR_ERR(qp->umem);
+			mlx4_ib_dbg("ib_umem_get error (%d)", err);
+			goto err;
+		}
+
+		err = mlx4_mtt_init(dev->dev, ib_umem_page_count(qp->umem),
+				    ilog2(qp->umem->page_size), &qp->mtt);
+		if (err) {
+			mlx4_ib_dbg("mlx4_mtt_init error (%d)", err);
+			goto err_buf;
+		}
+
+		err = mlx4_ib_umem_write_mtt(dev, &qp->mtt, qp->umem);
+		if (err) {
+			mlx4_ib_dbg("mlx4_ib_umem_write_mtt error (%d)", err);
+			goto err_mtt;
+		}
+
+		/* QPs without their own RQ have no doorbell record to map. */
+		if (!init_attr->srq && init_attr->qp_type != IB_QPT_XRC) {
+			err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context),
+						  ucmd.db_addr, &qp->db);
+			if (err) {
+				mlx4_ib_dbg("mlx4_ib_db_map_user error (%d)", err);
+				goto err_mtt;
+			}
+		}
+	} else {
+		qp->sq_no_prefetch = 0;
+
+		/* These flags must be set before the SQ is sized (LSO adds overhead). */
+		if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)
+			qp->flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK;
+
+		if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)
+			qp->flags |= MLX4_IB_QP_LSO;
+
+		err = set_kernel_sq_size(dev, &init_attr->cap, init_attr->qp_type, qp);
+		if (err)
+			goto err;
+
+		if (!init_attr->srq && init_attr->qp_type != IB_QPT_XRC) {
+			err = mlx4_db_alloc(dev->dev, &qp->db, 0);
+			if (err)
+				goto err;
+
+			*qp->db.db = 0;
+		}
+
+		/*
+		 * A blue flame register is only requested when inline data
+		 * is in use; failure to get one is not fatal, the QP falls
+		 * back to the device's private UAR.
+		 *
+		 * NOTE(review): after such a failure qp->max_inline_data is
+		 * still non-zero, so both the error path below and
+		 * destroy_qp_common() will call mlx4_bf_free() on a bf that
+		 * was never allocated -- confirm mlx4_bf_free() tolerates
+		 * that.
+		 */
+		if (qp->max_inline_data) {
+			err = mlx4_bf_alloc(dev->dev, &qp->bf);
+			if (err) {
+				mlx4_ib_dbg("failed to allocate blue flame register (%d)", err);
+				qp->bf.uar = &dev->priv_uar;
+			}
+		} else
+			qp->bf.uar = &dev->priv_uar;
+
+		if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2, &qp->buf)) {
+			err = -ENOMEM;
+			goto err_db;
+		}
+
+		err = mlx4_mtt_init(dev->dev, qp->buf.npages, qp->buf.page_shift,
+				    &qp->mtt);
+		if (err) {
+			mlx4_ib_dbg("kernel qp mlx4_mtt_init error (%d)", err);
+			goto err_buf;
+		}
+
+		err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf);
+		if (err) {
+			mlx4_ib_dbg("mlx4_buf_write_mtt error (%d)", err);
+			goto err_mtt;
+		}
+
+		/* One 64-bit work request ID slot per queue entry. */
+		qp->sq.wrid  = kmalloc(qp->sq.wqe_cnt * sizeof (u64), GFP_KERNEL);
+		qp->rq.wrid  = kmalloc(qp->rq.wqe_cnt * sizeof (u64), GFP_KERNEL);
+
+		if (!qp->sq.wrid || !qp->rq.wrid) {
+			err = -ENOMEM;
+			goto err_wrid;
+		}
+	}
+
+	/* Special QPs come with a fixed QP number; everything else reserves one. */
+	if (sqpn) {
+		qpn = sqpn;
+	} else {
+		err = mlx4_qp_reserve_range(dev->dev, 1, 1, &qpn);
+		if (err)
+			goto err_wrid;
+	}
+
+	err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp);
+	if (err)
+		goto err_qpn;
+
+	/* XRC QPs are identified by bit 23 of the QP number. */
+	if (init_attr->qp_type == IB_QPT_XRC)
+		qp->mqp.qpn |= (1 << 23);
+
+	/*
+	 * Hardware wants QPN written in big-endian order (after
+	 * shifting) for send doorbell.  Precompute this value to save
+	 * a little bit when posting sends.
+	 */
+	qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);
+
+	qp->mqp.event = mlx4_ib_qp_event;
+
+	return 0;
+
+	/*
+	 * Error unwinding: each label releases what was acquired just
+	 * before the corresponding failure point and falls through to the
+	 * labels for the earlier acquisitions.
+	 */
+err_qpn:
+	if (!sqpn)
+		mlx4_qp_release_range(dev->dev, qpn, 1);
+
+err_wrid:
+	if (pd->uobject) {
+		if (!init_attr->srq && init_attr->qp_type != IB_QPT_XRC)
+			mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context),
+					      &qp->db);
+	} else {
+		kfree(qp->sq.wrid);
+		kfree(qp->rq.wrid);
+	}
+
+err_mtt:
+	mlx4_mtt_cleanup(dev->dev, &qp->mtt);
+
+err_buf:
+	if (pd->uobject)
+		ib_umem_release(qp->umem);
+	else
+		mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);
+
+err_db:
+	if (!pd->uobject && !init_attr->srq && init_attr->qp_type != IB_QPT_XRC)
+		mlx4_db_free(dev->dev, &qp->db);
+
+	if (qp->max_inline_data)
+		mlx4_bf_free(dev->dev, &qp->bf);
+
+err:
+	return err;
+}
+
+/*
+ * Map an IB verbs QP state onto the corresponding mlx4 hardware QP state.
+ * Yields -1 (converted to the enum type) for a state with no mapping.
+ */
+static enum mlx4_qp_state to_mlx4_state(enum ib_qp_state state)
+{
+	enum mlx4_qp_state hw_state;
+
+	switch (state) {
+	case IB_QPS_RESET:
+		hw_state = MLX4_QP_STATE_RST;
+		break;
+	case IB_QPS_INIT:
+		hw_state = MLX4_QP_STATE_INIT;
+		break;
+	case IB_QPS_RTR:
+		hw_state = MLX4_QP_STATE_RTR;
+		break;
+	case IB_QPS_RTS:
+		hw_state = MLX4_QP_STATE_RTS;
+		break;
+	case IB_QPS_SQD:
+		hw_state = MLX4_QP_STATE_SQD;
+		break;
+	case IB_QPS_SQE:
+		hw_state = MLX4_QP_STATE_SQER;
+		break;
+	case IB_QPS_ERR:
+		hw_state = MLX4_QP_STATE_ERR;
+		break;
+	default:
+		hw_state = -1;
+		break;
+	}
+
+	return hw_state;
+}
+
+/*
+ * Take the locks of a QP's send and receive CQs with interrupts disabled.
+ * When the two CQs differ, the one with the lower CQ number is locked
+ * first so that all callers agree on the locking order.
+ */
+static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq)
+{
+	struct mlx4_ib_cq *outer, *inner;
+
+	if (send_cq == recv_cq) {
+		spin_lock_irq(&send_cq->lock);
+		return;
+	}
+
+	if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
+		outer = send_cq;
+		inner = recv_cq;
+	} else {
+		outer = recv_cq;
+		inner = send_cq;
+	}
+
+	spin_lock_irq(&outer->lock);
+	spin_lock_nested(&inner->lock, SINGLE_DEPTH_NESTING);
+}
+
+/*
+ * Release the CQ locks taken by mlx4_ib_lock_cqs(), in the reverse order:
+ * the CQ with the higher CQ number first, then the one with the lower
+ * number together with re-enabling interrupts.
+ */
+static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq)
+{
+	struct mlx4_ib_cq *outer, *inner;
+
+	if (send_cq == recv_cq) {
+		spin_unlock_irq(&send_cq->lock);
+		return;
+	}
+
+	if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
+		outer = send_cq;
+		inner = recv_cq;
+	} else {
+		outer = recv_cq;
+		inner = send_cq;
+	}
+
+	spin_unlock(&inner->lock);
+	spin_unlock_irq(&outer->lock);
+}
+
+/* Unlink and free every entry on the QP's gid_list. */
+static void del_gid_entries(struct mlx4_ib_qp *qp)
+{
+	struct gid_entry *entry;
+	struct gid_entry *next;
+
+	/* The _safe iterator is required because entries are freed in-loop. */
+	list_for_each_entry_safe(entry, next, &qp->gid_list, list) {
+		list_del(&entry->list);
+		kfree(entry);
+	}
+}
+
+/*
+ * Common part of QP destruction, the counterpart of create_qp_common().
+ *
+ * Moves the QP to RESET in hardware if it is not there already, removes
+ * it from the device's QP table while holding both CQ locks (purging any
+ * of its completions from the CQs for kernel QPs), and then releases the
+ * low-level QP, its QP number (unless it is a special QP), the MTT, and
+ * the user- or kernel-side buffers and doorbell.  Finally any multicast
+ * GID entries still attached to the QP are freed.
+ *
+ * is_user is non-zero when the QP was created on behalf of userspace.
+ */
+static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
+			      int is_user)
+{
+	struct mlx4_ib_cq *send_cq, *recv_cq;
+
+	/* A failure to reset is only logged; teardown continues regardless. */
+	if (qp->state != IB_QPS_RESET)
+		if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state),
+				   MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp))
+			printk(KERN_WARNING "mlx4_ib: modify QP %06x to RESET failed.\n",
+			       qp->mqp.qpn);
+
+	send_cq = to_mcq(qp->ibqp.send_cq);
+	recv_cq = to_mcq(qp->ibqp.recv_cq);
+
+	mlx4_ib_lock_cqs(send_cq, recv_cq);
+
+	/* Kernel QPs: drop completions that still refer to this QP. */
+	if (!is_user) {
+		__mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn,
+				 qp->ibqp.srq ? to_msrq(qp->ibqp.srq): NULL);
+		if (send_cq != recv_cq)
+			__mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL);
+	}
+
+	mlx4_qp_remove(dev->dev, &qp->mqp);
+
+	mlx4_ib_unlock_cqs(send_cq, recv_cq);
+
+	mlx4_qp_free(dev->dev, &qp->mqp);
+
+	/* Special QP numbers are fixed and were never reserved. */
+	if (!is_sqp(dev, qp))
+		mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1);
+
+	mlx4_mtt_cleanup(dev->dev, &qp->mtt);
+
+	if (is_user) {
+		if (!qp->ibqp.srq && qp->ibqp.qp_type != IB_QPT_XRC)
+			mlx4_ib_db_unmap_user(to_mucontext(qp->ibqp.uobject->context),
+					      &qp->db);
+		ib_umem_release(qp->umem);
+	} else {
+		kfree(qp->sq.wrid);
+		kfree(qp->rq.wrid);
+		mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);
+		if (qp->max_inline_data)
+			mlx4_bf_free(dev->dev, &qp->bf);
+		if (!qp->ibqp.srq && qp->ibqp.qp_type != IB_QPT_XRC)
+			mlx4_db_free(dev->dev, &qp->db);
+	}
+
+	del_gid_entries(qp);
+}
+
+/*
+ * Create a QP on protection domain pd as described by init_attr.
+ *
+ * Regular QPs (RC, UC, UD, XRC, RAW_ETH) are allocated as a plain
+ * struct mlx4_ib_qp and get a dynamically reserved QP number.  Special
+ * QPs (SMI, GSI, RAW_ETY) are kernel-only, are embedded in a
+ * struct mlx4_ib_sqp, and use a fixed QP number derived from the device's
+ * sqp_start, the QP type and the port number.
+ *
+ * Returns the new ib_qp, or an ERR_PTR():
+ *   -EINVAL  unsupported create flags or QP type, or a special QP was
+ *            requested from userspace;
+ *   -ENOSYS  the device lacks the capability for XRC / RAW_ETY;
+ *   -ENOMEM  allocation failure;
+ *   or whatever create_qp_common() reports.
+ */
+struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
+				struct ib_qp_init_attr *init_attr,
+				struct ib_udata *udata)
+{
+	struct mlx4_ib_dev *dev = to_mdev(pd->device);
+	struct mlx4_ib_sqp *sqp;
+	struct mlx4_ib_qp *qp;
+	int err;
+
+	/*
+	 * Only LSO and multicast loopback blocking are supported as create
+	 * flags, and only on UD QPs owned by the kernel.
+	 */
+	if ((init_attr->create_flags & ~(IB_QP_CREATE_IPOIB_UD_LSO |
+					 IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)) ||
+	    (init_attr->create_flags &&
+	     (pd->uobject || init_attr->qp_type != IB_QPT_UD)))
+		return ERR_PTR(-EINVAL);
+
+	switch (init_attr->qp_type) {
+	case IB_QPT_XRC:
+		if (!(dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC))
+			return ERR_PTR(-ENOSYS);
+		/* fall through */
+	case IB_QPT_RC:
+	case IB_QPT_UC:
+	case IB_QPT_UD:
+	case IB_QPT_RAW_ETH:
+	{
+		qp = kzalloc(sizeof *qp, GFP_KERNEL);
+		if (!qp)
+			return ERR_PTR(-ENOMEM);
+
+		/* sqpn == 0: let create_qp_common() reserve a QP number. */
+		err = create_qp_common(dev, pd, init_attr, udata, 0, qp);
+		if (err) {
+			kfree(qp);
+			return ERR_PTR(err);
+		}
+
+		qp->xrcdn = 0;
+		if (init_attr->qp_type == IB_QPT_XRC)
+			qp->xrcdn = to_mxrcd(init_attr->xrc_domain)->xrcdn;
+
+		qp->ibqp.qp_num = qp->mqp.qpn;
+
+		break;
+	}
+	case IB_QPT_RAW_ETY:
+		if (!(dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_RAW_ETY))
+			return ERR_PTR(-ENOSYS);
+		/* fall through */
+	case IB_QPT_SMI:
+	case IB_QPT_GSI:
+	{
+		int type_base;
+		int sqpn;
+
+		/* Userspace is not allowed to create special QPs: */
+		if (pd->uobject) {
+			mlx4_ib_dbg("Userspace is not allowed to create special QPs");
+			return ERR_PTR(-EINVAL);
+		}
+
+		sqp = kzalloc(sizeof *sqp, GFP_KERNEL);
+		if (!sqp)
+			return ERR_PTR(-ENOMEM);
+
+		qp = &sqp->qp;
+
+		/*
+		 * Fixed QP number: sqp_start, plus a per-type base
+		 * (SMI 0, GSI 2, RAW_ETY 4), plus the zero-based port.
+		 */
+		if (init_attr->qp_type == IB_QPT_RAW_ETY)
+			type_base = 4;
+		else if (init_attr->qp_type == IB_QPT_SMI)
+			type_base = 0;
+		else
+			type_base = 2;
+
+		sqpn = dev->dev->caps.sqp_start + type_base +
+			init_attr->port_num - 1;
+
+		err = create_qp_common(dev, pd, init_attr, udata, sqpn, qp);
+		if (err) {
+			kfree(sqp);
+			return ERR_PTR(err);
+		}
+
+		qp->port	= init_attr->port_num;
+		qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 : 1;
+
+		break;
+	}
+	default:
+		mlx4_ib_dbg("Invalid QP type requested for create_qp (%d)",
+			    init_attr->qp_type);
+		return ERR_PTR(-EINVAL);
+	}
+
+	return &qp->ibqp;
+}
+
+/*
+ * Destroy a QP created by mlx4_ib_create_qp().
+ *
+ * For QP0 the port is closed first.  After the common teardown, the
+ * memory is freed through the containing structure that was originally
+ * allocated: the mlx4_ib_sqp wrapper for special QPs, the mlx4_ib_qp
+ * itself otherwise.  Always returns 0.
+ */
+int mlx4_ib_destroy_qp(struct ib_qp *qp)
+{
+	struct mlx4_ib_dev *dev = to_mdev(qp->device);
+	struct mlx4_ib_qp *mqp = to_mqp(qp);
+
+	if (is_qp0(dev, mqp))
+		mlx4_CLOSE_PORT(dev->dev, mqp->port);
+
+	destroy_qp_common(dev, mqp, !!qp->pd->uobject);
+
+	if (!is_sqp(dev, mqp)) {
+		kfree(mqp);
+		return 0;
+	}
+
+	kfree(to_msqp(mqp));
+
+	return 0;
+}
+
+/*
+ * Map an IB QP type onto the mlx4 service type used in the QP context.
+ * SMI, GSI and both raw Ethernet QP types all use the MLX service type.
+ * Returns -1 for a QP type with no mapping.
+ */
+static int to_mlx4_st(enum ib_qp_type type)
+{
+	int st;
+
+	switch (type) {
+	case IB_QPT_RC:
+		st = MLX4_QP_ST_RC;
+		break;
+	case IB_QPT_UC:
+		st = MLX4_QP_ST_UC;
+		break;
+	case IB_QPT_UD:
+		st = MLX4_QP_ST_UD;
+		break;
+	case IB_QPT_XRC:
+		st = MLX4_QP_ST_XRC;
+		break;
+	case IB_QPT_RAW_ETY:
+	case IB_QPT_SMI:
+	case IB_QPT_GSI:
+	case IB_QPT_RAW_ETH:
+		st = MLX4_QP_ST_MLX;
+		break;
+	default:
+		st = -1;
+		break;
+	}
+
+	return st;
+}
+
+/*
+ * Compute the remote-access enable bits (RRE/RAE/RWE) of the QP context,
+ * in big-endian form.
+ *
+ * The responder depth and the access flags are taken from attr when the
+ * matching bit is set in attr_mask, and from the values cached in qp
+ * otherwise.  When the responder depth is zero, everything except remote
+ * write is masked off.
+ */
+static __be32 to_mlx4_access_flags(struct mlx4_ib_qp *qp, const struct ib_qp_attr *attr,
+				   int attr_mask)
+{
+	u32 hw_access_flags = 0;
+	u32 access_flags;
+	u8 dest_rd_atomic;
+
+	dest_rd_atomic = (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) ?
+		attr->max_dest_rd_atomic : qp->resp_depth;
+
+	access_flags = (attr_mask & IB_QP_ACCESS_FLAGS) ?
+		attr->qp_access_flags : qp->atomic_rd_en;
+
+	/* With a responder depth of zero only remote writes can be allowed. */
+	if (dest_rd_atomic == 0)
+		access_flags &= IB_ACCESS_REMOTE_WRITE;
+
+	if (access_flags & IB_ACCESS_REMOTE_WRITE)
+		hw_access_flags |= MLX4_QP_BIT_RWE;
+	if (access_flags & IB_ACCESS_REMOTE_ATOMIC)
+		hw_access_flags |= MLX4_QP_BIT_RAE;
+	if (access_flags & IB_ACCESS_REMOTE_READ)
+		hw_access_flags |= MLX4_QP_BIT_RRE;
+
+	return cpu_to_be32(hw_access_flags);
+}
+
+/*
+ * Cache in the special-QP wrapper those attributes of a modify request
+ * that attr_mask marks as present: send PSN, Q_Key and P_Key index.
+ * Attributes not selected by attr_mask are left untouched.
+ */
+static void store_sqp_attrs(struct mlx4_ib_sqp *sqp, const struct ib_qp_attr *attr,
+			    int attr_mask)
+{
+	if (attr_mask & IB_QP_SQ_PSN)
+		sqp->send_psn = attr->sq_psn;
+
+	if (attr_mask & IB_QP_QKEY)
+		sqp->qkey = attr->qkey;
+
+	if (attr_mask & IB_QP_PKEY_INDEX)
+		sqp->pkey_index = attr->pkey_index;
+}
+
+/*
+ * Rewrite the port selection in path->sched_queue: bit 6 is cleared and
+ * the zero-based port number, shifted to bit 6, is OR-ed in.  All other
+ * bits of sched_queue are preserved.
+ */
+static void mlx4_set_sched(struct mlx4_qp_path *path, u8 port)
+{
+	int port_bits = (port - 1) << 6;
+	int kept_bits = path->sched_queue & 0xbf;
+
+	path->sched_queue = kept_bits | port_bits;
+}
+
+/*
+ * Fill a hardware address path (primary or alternate) of a QP context
+ * from an IB address handle attribute.
+ *
+ * Sets the source path bits, destination LID, static rate, the GRH fields
+ * when the AH carries a GRH, and the schedule queue (port and service
+ * level).  On ports whose link layer is Ethernet a GRH is mandatory; the
+ * destination MAC is resolved from it and, when the source GID maps to a
+ * VLAN, the cached VLAN index is installed as well.
+ *
+ * Returns 0 on success and a non-zero value on failure (-1, -ENOENT, or
+ * the error returned by mlx4_ib_resolve_grh()).  path may have been
+ * partially written when an error is returned.
+ */
+static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah,
+			 struct mlx4_qp_path *path, u8 port)
+{
+	int err;
+	int is_eth = rdma_port_get_link_layer(&dev->ib_dev, port) ==
+		IB_LINK_LAYER_ETHERNET;
+	u8 mac[6];
+	int is_mcast;
+	u16 vlan_tag;
+	int vidx;
+
+	path->grh_mylmc     = ah->src_path_bits & 0x7f;
+	path->rlid	    = cpu_to_be16(ah->dlid);
+	if (ah->static_rate) {
+		/*
+		 * Step down from the requested rate until one is found that
+		 * the device reports as supported, stopping at 2.5 Gbps.
+		 */
+		path->static_rate = ah->static_rate + MLX4_STAT_RATE_OFFSET;
+		while (path->static_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET &&
+		       !(1 << path->static_rate & dev->dev->caps.stat_rate_support))
+			--path->static_rate;
+	} else
+		path->static_rate = 0;
+
+	if (ah->ah_flags & IB_AH_GRH) {
+		if (ah->grh.sgid_index >= dev->dev->caps.gid_table_len[port]) {
+			printk(KERN_ERR "sgid_index (%u) too large. max is %d\n",
+			       ah->grh.sgid_index, dev->dev->caps.gid_table_len[port] - 1);
+			return -1;
+		}
+
+		/* Bit 7 of grh_mylmc flags the presence of a GRH. */
+		path->grh_mylmc |= 1 << 7;
+		path->mgid_index = ah->grh.sgid_index;
+		path->hop_limit  = ah->grh.hop_limit;
+		path->tclass_flowlabel =
+			cpu_to_be32((ah->grh.traffic_class << 20) |
+				    (ah->grh.flow_label));
+		memcpy(path->rgid, ah->grh.dgid.raw, 16);
+	}
+
+	if (is_eth) {
+		path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE |
+			((port - 1) << 6) | ((ah->sl & 0x7) << 3) | ((ah->sl & 8) >> 1);
+
+		/* Without a GRH there is nothing to resolve the MAC from. */
+		if (!(ah->ah_flags & IB_AH_GRH))
+			return -1;
+
+		err = mlx4_ib_resolve_grh(dev, ah, mac, &is_mcast, port);
+		if (err)
+			return err;
+
+		memcpy(path->dmac, mac, 6);
+		/*
+		 * ackto is overwritten here, not OR-ed; callers that also
+		 * set a timeout combine it with this value afterwards.
+		 */
+		path->ackto = MLX4_IB_LINK_TYPE_ETH;
+		/* use index 0 into MAC table for IBoE */
+		path->grh_mylmc &= 0x80;
+
+		/* VLAN IDs below 0x1000 are valid; anything else means "no VLAN". */
+		vlan_tag = rdma_get_vlan_id(&dev->iboe.gid_table[port - 1][ah->grh.sgid_index]);
+		if (vlan_tag < 0x1000) {
+			if (mlx4_find_cached_vlan(dev->dev, port, vlan_tag, &vidx))
+				return -ENOENT;
+
+			path->vlan_index = vidx;
+			path->fl = 1 << 6;
+		}
+	} else
+		path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE |
+			((port - 1) << 6) | ((ah->sl & 0xf) << 2);
+
+	return 0;
+}
+
+/*
+ * Walk the QP's multicast GID list and call mlx4_ib_add_mc() for every
+ * entry not yet marked as added.  Entries for which that call returns
+ * non-zero are marked as added and tagged with the QP's current port.
+ */
+static void update_mcg_macs(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
+{
+	struct gid_entry *entry, *next;
+
+	list_for_each_entry_safe(entry, next, &qp->gid_list, list) {
+		if (entry->added)
+			continue;
+
+		if (!mlx4_ib_add_mc(dev, qp, &entry->gid))
+			continue;
+
+		entry->added = 1;
+		entry->port = qp->port;
+	}
+}
+
+/*
+ * Perform the QP state transition cur_state -> new_state.
+ *
+ * Builds a hardware QP context from the QP's static configuration and
+ * from the attributes selected by attr_mask, collects the matching
+ * "optional parameter" bits in optpar, and hands both to
+ * mlx4_qp_modify().  On success the software copy of the QP state in qp
+ * is updated and state-dependent side effects are carried out (port
+ * init/close for QP0, CQ cleanup and queue reset for kernel QPs that
+ * enter RESET).
+ *
+ * Returns 0 on success, -ENOMEM if the context cannot be allocated,
+ * -EINVAL if an attribute fails validation here, or the error reported by
+ * mlx4_qp_modify().
+ */
+static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
+			       const struct ib_qp_attr *attr, int attr_mask,
+			       enum ib_qp_state cur_state, enum ib_qp_state new_state)
+{
+	struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
+	struct mlx4_ib_qp *qp = to_mqp(ibqp);
+	struct mlx4_qp_context *context;
+	enum mlx4_qp_optpar optpar = 0;
+	int sqd_event;
+	/* Default result for every validation failure that jumps to out. */
+	int err = -EINVAL;
+
+	context = kzalloc(sizeof *context, GFP_KERNEL);
+	if (!context)
+		return -ENOMEM;
+
+	context->flags = cpu_to_be32((to_mlx4_state(new_state) << 28) |
+				     (to_mlx4_st(ibqp->qp_type) << 16));
+
+	/* Path migration state defaults to MIGRATED when not specified. */
+	if (!(attr_mask & IB_QP_PATH_MIG_STATE))
+		context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11);
+	else {
+		optpar |= MLX4_QP_OPTPAR_PM_STATE;
+		switch (attr->path_mig_state) {
+		case IB_MIG_MIGRATED:
+			context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11);
+			break;
+		case IB_MIG_REARM:
+			context->flags |= cpu_to_be32(MLX4_QP_PM_REARM << 11);
+			break;
+		case IB_MIG_ARMED:
+			context->flags |= cpu_to_be32(MLX4_QP_PM_ARMED << 11);
+			break;
+		}
+	}
+	/*
+	 * mtu_msgmax: MTU code in the bits above bit 4, a log2 message
+	 * size limit in the low five bits.  Only connected QP types take
+	 * the MTU from the caller, and only when IB_QP_PATH_MTU is given.
+	 */
+	if (ibqp->qp_type == IB_QPT_RAW_ETH)
+		context->mtu_msgmax = 0xff;
+	else if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI ||
+	    ibqp->qp_type == IB_QPT_RAW_ETY)
+		context->mtu_msgmax = (IB_MTU_4096 << 5) | 11;
+	else if (ibqp->qp_type == IB_QPT_UD) {
+		if (qp->flags & MLX4_IB_QP_LSO)
+			context->mtu_msgmax = (IB_MTU_4096 << 5) |
+					      ilog2(dev->dev->caps.max_gso_sz);
+		else
+			context->mtu_msgmax = (IB_MTU_4096 << 5) | 12;
+	} else if (attr_mask & IB_QP_PATH_MTU) {
+		if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_4096) {
+			printk(KERN_ERR "path MTU (%u) is invalid\n",
+			       attr->path_mtu);
+			goto out;
+		}
+		context->mtu_msgmax = (attr->path_mtu << 5) |
+			ilog2(dev->dev->caps.max_msg_sz);
+	}
+
+	/*
+	 * Queue geometry: log2 of the entry count in the bits from bit 3
+	 * up, and the log2 stride minus 4 in the low bits.
+	 */
+	if (qp->rq.wqe_cnt)
+		context->rq_size_stride = ilog2(qp->rq.wqe_cnt) << 3;
+	context->rq_size_stride |= qp->rq.wqe_shift - 4;
+
+	if (qp->sq.wqe_cnt)
+		context->sq_size_stride = ilog2(qp->sq.wqe_cnt) << 3;
+	context->sq_size_stride |= qp->sq.wqe_shift - 4;
+
+	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
+		context->sq_size_stride |= !!qp->sq_no_prefetch << 7;
+		if (ibqp->qp_type == IB_QPT_XRC)
+			context->xrcd = cpu_to_be32((u32) qp->xrcdn);
+	}
+
+	/* User QPs ring doorbells through the context's UAR, kernel QPs through qp->bf.uar. */
+	if (qp->ibqp.uobject)
+		context->usr_page = cpu_to_be32(to_mucontext(ibqp->uobject->context)->uar.index);
+	else
+		context->usr_page = cpu_to_be32(qp->bf.uar->index);
+
+	if (attr_mask & IB_QP_DEST_QPN)
+		context->remote_qpn = cpu_to_be32(attr->dest_qp_num);
+
+	/*
+	 * A port change on its own (no new address vector) is only applied
+	 * for an SQD -> SQD transition; otherwise the port takes effect
+	 * through the address vector handling below.
+	 */
+	if (attr_mask & IB_QP_PORT) {
+		if (cur_state == IB_QPS_SQD && new_state == IB_QPS_SQD &&
+		    !(attr_mask & IB_QP_AV)) {
+			mlx4_set_sched(&context->pri_path, attr->port_num);
+			optpar |= MLX4_QP_OPTPAR_SCHED_QUEUE;
+		}
+	}
+
+	/* Attach the per-port counter, if one is assigned (-1 means none). */
+	if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR &&
+	    dev->counters[qp->port - 1] != -1) {
+		context->pri_path.counter_index = dev->counters[qp->port - 1];
+		optpar |= MLX4_QP_OPTPAR_COUNTER_INDEX;
+	}
+
+	if (attr_mask & IB_QP_PKEY_INDEX) {
+		context->pri_path.pkey_index = attr->pkey_index;
+		optpar |= MLX4_QP_OPTPAR_PKEY_INDEX;
+	}
+
+	if (attr_mask & IB_QP_AV) {
+		if (mlx4_set_path(dev, &attr->ah_attr, &context->pri_path,
+				  attr_mask & IB_QP_PORT ? attr->port_num : qp->port)) {
+			mlx4_ib_dbg("qpn 0x%x: could not set pri path params",
+				    ibqp->qp_num);
+			goto out;
+		}
+
+		optpar |= (MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH |
+			   MLX4_QP_OPTPAR_SCHED_QUEUE);
+	}
+
+	/* OR-ed in so that the link type bits set by mlx4_set_path() survive. */
+	if (attr_mask & IB_QP_TIMEOUT) {
+		context->pri_path.ackto |= (attr->timeout << 3);
+		optpar |= MLX4_QP_OPTPAR_ACK_TIMEOUT;
+	}
+
+	if (attr_mask & IB_QP_ALT_PATH) {
+		if (attr->alt_port_num == 0 ||
+		    attr->alt_port_num > dev->num_ports) {
+			mlx4_ib_dbg("qpn 0x%x: invalid alternate port num (%d)",
+				    ibqp->qp_num, attr->alt_port_num);
+			goto out;
+		}
+
+		if (attr->alt_pkey_index >=
+		    dev->dev->caps.pkey_table_len[attr->alt_port_num]) {
+			mlx4_ib_dbg("qpn 0x%x: invalid alt pkey index (0x%x)",
+				    ibqp->qp_num, attr->alt_pkey_index);
+			goto out;
+		}
+
+		if (mlx4_set_path(dev, &attr->alt_ah_attr, &context->alt_path,
+				  attr->alt_port_num)) {
+			mlx4_ib_dbg("qpn 0x%x: could not set alt path params",
+				    ibqp->qp_num);
+			goto out;
+		}
+
+		context->alt_path.pkey_index = attr->alt_pkey_index;
+		context->alt_path.ackto = attr->alt_timeout << 3;
+		optpar |= MLX4_QP_OPTPAR_ALT_ADDR_PATH;
+	}
+
+	context->pd	    = cpu_to_be32(to_mpd(ibqp->pd)->pdn);
+	context->params1    = cpu_to_be32(MLX4_IB_ACK_REQ_FREQ << 28);
+
+	/* Set "fast registration enabled" for all kernel QPs */
+	if (!qp->ibqp.uobject)
+		context->params1 |= cpu_to_be32(1 << 11);
+
+	if (attr_mask & IB_QP_RNR_RETRY) {
+		context->params1 |= cpu_to_be32(attr->rnr_retry << 13);
+		optpar |= MLX4_QP_OPTPAR_RNR_RETRY;
+	}
+
+	if (attr_mask & IB_QP_RETRY_CNT) {
+		context->params1 |= cpu_to_be32(attr->retry_cnt << 16);
+		optpar |= MLX4_QP_OPTPAR_RETRY_COUNT;
+	}
+
+	/* The depth is encoded as the log2 of the value rounded up to a power of two. */
+	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) {
+		if (attr->max_rd_atomic)
+			context->params1 |=
+				cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21);
+		optpar |= MLX4_QP_OPTPAR_SRA_MAX;
+	}
+
+	if (attr_mask & IB_QP_SQ_PSN)
+		context->next_send_psn = cpu_to_be32(attr->sq_psn);
+
+	context->cqn_send = cpu_to_be32(to_mcq(ibqp->send_cq)->mcq.cqn);
+
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
+		if (attr->max_dest_rd_atomic)
+			context->params2 |=
+				cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21);
+		optpar |= MLX4_QP_OPTPAR_RRA_MAX;
+	}
+
+	/*
+	 * The access bits depend on the responder depth as well, so they
+	 * are recomputed when either of the two attributes changes.
+	 */
+	if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) {
+		context->params2 |= to_mlx4_access_flags(qp, attr, attr_mask);
+		optpar |= MLX4_QP_OPTPAR_RWE | MLX4_QP_OPTPAR_RRE | MLX4_QP_OPTPAR_RAE;
+	}
+
+	if (ibqp->srq)
+		context->params2 |= cpu_to_be32(MLX4_QP_BIT_RIC);
+
+	if (attr_mask & IB_QP_MIN_RNR_TIMER) {
+		context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24);
+		optpar |= MLX4_QP_OPTPAR_RNR_TIMEOUT;
+	}
+	if (attr_mask & IB_QP_RQ_PSN)
+		context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn);
+
+	context->cqn_recv = cpu_to_be32(to_mcq(ibqp->recv_cq)->mcq.cqn);
+
+	if (attr_mask & IB_QP_QKEY) {
+		context->qkey = cpu_to_be32(attr->qkey);
+		optpar |= MLX4_QP_OPTPAR_Q_KEY;
+	}
+
+	if (ibqp->srq)
+		context->srqn = cpu_to_be32(1 << 24 | to_msrq(ibqp->srq)->msrq.srqn);
+
+	/* The doorbell record only exists for QPs that own a receive queue. */
+	if (!ibqp->srq && ibqp->qp_type != IB_QPT_XRC &&
+	    cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
+		context->db_rec_addr = cpu_to_be64(qp->db.dma);
+
+	/*
+	 * Datagram-style QPs get no address vector from the caller, so
+	 * their schedule queue is set up here from the QP's port.
+	 */
+	if (cur_state == IB_QPS_INIT &&
+	    new_state == IB_QPS_RTR  &&
+	    (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI ||
+	     ibqp->qp_type == IB_QPT_UD || ibqp->qp_type == IB_QPT_RAW_ETY ||
+		ibqp->qp_type == IB_QPT_RAW_ETH)) {
+		context->pri_path.sched_queue = (qp->port - 1) << 6;
+		if (is_qp0(dev, qp))
+			context->pri_path.sched_queue |= MLX4_IB_DEFAULT_QP0_SCHED_QUEUE;
+		else
+			context->pri_path.sched_queue |= MLX4_IB_DEFAULT_SCHED_QUEUE;
+	}
+
+	/* Ask for an SQ-drained event only on RTS -> SQD with notification enabled. */
+	if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD	&&
+	    attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify)
+		sqd_event = 1;
+	else
+		sqd_event = 0;
+
+	if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
+		context->rlkey |= (1 << 4);
+
+	/*
+	 * Before passing a kernel QP to the HW, make sure that the
+	 * ownership bits of the send queue are set and the SQ
+	 * headroom is stamped so that the hardware doesn't start
+	 * processing stale work requests.
+	 */
+	if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
+		struct mlx4_wqe_ctrl_seg *ctrl;
+		int i;
+
+		for (i = 0; i < qp->sq.wqe_cnt; ++i) {
+			ctrl = get_send_wqe(qp, i);
+			ctrl->owner_opcode = cpu_to_be32(1 << 31);
+			/* With fixed-size WRs, preset the size (in 16-byte units). */
+			if (qp->sq_max_wqes_per_wr == 1)
+				ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4);
+
+			stamp_send_wqe(qp, i, 1 << qp->sq.wqe_shift);
+		}
+	}
+
+	err = mlx4_qp_modify(dev->dev, &qp->mtt, to_mlx4_state(cur_state),
+			     to_mlx4_state(new_state), context, optpar,
+			     sqd_event, &qp->mqp);
+	if (err)
+		goto out;
+
+	/* The hardware accepted the transition: update the software copy. */
+	qp->state = new_state;
+
+	if (attr_mask & IB_QP_ACCESS_FLAGS)
+		qp->atomic_rd_en = attr->qp_access_flags;
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+		qp->resp_depth = attr->max_dest_rd_atomic;
+	if (attr_mask & IB_QP_PORT) {
+		qp->port = attr->port_num;
+		update_mcg_macs(dev, qp);
+	}
+	if (attr_mask & IB_QP_ALT_PATH)
+		qp->alt_port = attr->alt_port_num;
+
+	if (is_sqp(dev, qp))
+		store_sqp_attrs(to_msqp(qp), attr, attr_mask);
+
+	/*
+	 * If we moved QP0 to RTR, bring the IB link up; if we moved
+	 * QP0 to RESET or ERROR, bring the link back down.
+	 */
+	if (is_qp0(dev, qp)) {
+		if (cur_state != IB_QPS_RTR && new_state == IB_QPS_RTR)
+			if (mlx4_INIT_PORT(dev->dev, qp->port))
+				printk(KERN_WARNING "INIT_PORT failed for port %d\n",
+				       qp->port);
+
+		if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR &&
+		    (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR))
+			mlx4_CLOSE_PORT(dev->dev, qp->port);
+	}
+
+	/*
+	 * If we moved a kernel QP to RESET, clean up all old CQ
+	 * entries and reinitialize the QP.
+	 */
+	if (new_state == IB_QPS_RESET && !ibqp->uobject) {
+		mlx4_ib_cq_clean(to_mcq(ibqp->recv_cq), qp->mqp.qpn,
+				 ibqp->srq ? to_msrq(ibqp->srq): NULL);
+		if (ibqp->send_cq != ibqp->recv_cq)
+			mlx4_ib_cq_clean(to_mcq(ibqp->send_cq), qp->mqp.qpn, NULL);
+
+		qp->rq.head = 0;
+		qp->rq.tail = 0;
+		qp->sq.head = 0;
+		qp->sq.tail = 0;
+		qp->sq_next_wqe = 0;
+		if (!ibqp->srq && ibqp->qp_type != IB_QPT_XRC)
+			*qp->db.db  = 0;
+	}
+
+	/* Common exit: err is 0 on success, otherwise the failure code. */
+out:
+	kfree(context);
+	return err;
+}
+
+/*
+ * mlx4_ib_modify_qp - validate a modify-QP request and apply it.
+ *
+ * Takes qp->mutex, derives the current and requested states from attr_mask
+ * (falling back to the cached qp->state), and rejects requests whose
+ * attribute mask, port number, P_Key index or RDMA atomic depths are not
+ * acceptable.  Valid requests are handed to __mlx4_ib_modify_qp(); a
+ * RESET -> RESET request succeeds without touching the hardware.
+ *
+ * Returns 0 on success, -EINVAL when validation fails, or whatever
+ * __mlx4_ib_modify_qp() returns.  udata is not used here.
+ */
+int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		      int attr_mask, struct ib_udata *udata)
+{
+	struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
+	struct mlx4_ib_qp *qp = to_mqp(ibqp);
+	enum ib_qp_state cur_state, new_state;
+	int err = -EINVAL;
+
+	mutex_lock(&qp->mutex);
+
+	cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state;
+	new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
+
+	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask)) {
+		mlx4_ib_dbg("qpn 0x%x: invalid attribute mask specified "
+			    "for transition %d to %d. qp_type %d, attr_mask 0x%x",
+			    ibqp->qp_num, cur_state, new_state,
+			    ibqp->qp_type, attr_mask);
+		goto out;
+	}
+
+	if ((attr_mask & IB_QP_PORT) && (ibqp->qp_type != IB_QPT_RAW_ETH) &&
+	    (attr->port_num == 0 || attr->port_num > dev->num_ports)) {
+		mlx4_ib_dbg("qpn 0x%x: invalid port number (%d) specified "
+			    "for transition %d to %d. qp_type %d",
+			    ibqp->qp_num, attr->port_num, cur_state,
+			    new_state, ibqp->qp_type);
+		goto out;
+	}
+
+	/*
+	 * Raw Ethernet QPs are exempt from the check above, so range-check
+	 * the port against the device's port count before it is used to look
+	 * up the link layer (and, below, to index the P_Key table lengths).
+	 */
+	if ((attr_mask & IB_QP_PORT) && (ibqp->qp_type == IB_QPT_RAW_ETH) &&
+	    (attr->port_num == 0 ||
+	     attr->port_num > dev->dev->caps.num_ports ||
+	     rdma_port_get_link_layer(&dev->ib_dev, attr->port_num)
+				!= IB_LINK_LAYER_ETHERNET)) {
+		mlx4_ib_dbg("qpn 0x%x: invalid port (%d) specified (not RDMAoE) "
+			    "for transition %d to %d. qp_type %d",
+			    ibqp->qp_num, attr->port_num, cur_state,
+			    new_state, ibqp->qp_type);
+		goto out;
+	}
+
+	if (attr_mask & IB_QP_PKEY_INDEX) {
+		/* Use the port being set by this request, else the QP's current one. */
+		int p = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
+		if (attr->pkey_index >= dev->dev->caps.pkey_table_len[p]) {
+			mlx4_ib_dbg("qpn 0x%x: invalid pkey index (%d) specified "
+				    "for transition %d to %d. qp_type %d",
+				    ibqp->qp_num, attr->pkey_index, cur_state,
+				    new_state, ibqp->qp_type);
+			goto out;
+		}
+	}
+
+	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
+	    attr->max_rd_atomic > dev->dev->caps.max_qp_init_rdma) {
+		mlx4_ib_dbg("qpn 0x%x: max_rd_atomic (%d) too large. "
+			    "Transition %d to %d. qp_type %d",
+			    ibqp->qp_num, attr->max_rd_atomic, cur_state,
+			    new_state, ibqp->qp_type);
+		goto out;
+	}
+
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
+	    attr->max_dest_rd_atomic > dev->dev->caps.max_qp_dest_rdma) {
+		mlx4_ib_dbg("qpn 0x%x: max_dest_rd_atomic (%d) too large. "
+			    "Transition %d to %d. qp_type %d",
+			    ibqp->qp_num, attr->max_dest_rd_atomic, cur_state,
+			    new_state, ibqp->qp_type);
+		goto out;
+	}
+
+	/* RESET -> RESET is a no-op: report success without a firmware command. */
+	if (cur_state == new_state && cur_state == IB_QPS_RESET) {
+		err = 0;
+		goto out;
+	}
+
+	err = __mlx4_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state);
+
+out:
+	mutex_unlock(&qp->mutex);
+	return err;
+}
+
+/*
+ * Fill in the MLX segment, the inline-segment header and the packed
+ * LRH + 32-bit raw header for a raw-ethertype send.  'wqe' points at the
+ * start of the WQE.  Only IB_WR_SEND is accepted (-EINVAL otherwise).
+ * The caller's LRH packet_length field is overwritten with the computed
+ * length in dwords.  On success *mlx_seg_len is the 16-byte-aligned size
+ * of the inline segment header plus the packed headers.
+ */
+static int build_raw_ety_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
+			    void *wqe, unsigned *mlx_seg_len)
+{
+	/* LRH + RAW_HEADER (32 bits) */
+	const int hdr_bytes = IB_LRH_BYTES + 4;
+	struct mlx4_wqe_mlx_seg *mlx_seg = wqe;
+	struct mlx4_wqe_inline_seg *inl_seg = wqe + sizeof *mlx_seg;
+	u32 *hdr_words = wqe + sizeof *mlx_seg + sizeof *inl_seg;
+	int data_bytes = 0;
+	int dwords;
+	int sge;
+
+	/* Only IB_WR_SEND is supported */
+	if (wr->opcode != IB_WR_SEND)
+		return -EINVAL;
+
+	sge = 0;
+	while (sge < wr->num_sge) {
+		data_bytes += wr->sg_list[sge].length;
+		++sge;
+	}
+
+	/* headers + payload, rounded up to whole dwords */
+	dwords = (hdr_bytes + data_bytes + 3) / 4;
+
+	/* Keep only the CQ-update bit, then add ICRC and the service level. */
+	mlx_seg->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
+	mlx_seg->flags |= cpu_to_be32(MLX4_WQE_MLX_ICRC |
+				      (wr->wr.raw_ety.lrh->service_level << 8));
+	mlx_seg->rlid = wr->wr.raw_ety.lrh->destination_lid;
+
+	wr->wr.raw_ety.lrh->packet_length = cpu_to_be16(dwords);
+	ib_lrh_header_pack(wr->wr.raw_ety.lrh, hdr_words);
+
+	/* LRH size is a dword multiple; the raw header follows it directly. */
+	hdr_words[IB_LRH_BYTES / 4] = cpu_to_be32(wr->wr.raw_ety.eth_type);
+
+	inl_seg->byte_count = cpu_to_be32(1 << 31 | hdr_bytes);
+
+	*mlx_seg_len =
+		ALIGN(sizeof(struct mlx4_wqe_inline_seg) + hdr_bytes, 16);
+
+	return 0;
+}
+
+/*
+ * Build the MLX segment and the packed UD header for a send on a special
+ * QP.  'wqe' points at the start of the WQE (the MLX segment overlays it)
+ * and the packed header is copied in as one or two inline segments that
+ * follow it.
+ *
+ * Returns 0 on success, the error from the first ib_get_cached_gid() call,
+ * or -EINVAL for opcodes other than IB_WR_SEND / IB_WR_SEND_WITH_IMM.
+ * On success *mlx_seg_len holds the 16-byte-aligned size of the inline
+ * segment header(s) plus the packed UD header.  sqp->send_psn is
+ * post-incremented as a side effect.
+ */
+static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
+			    void *wqe, unsigned *mlx_seg_len)
+{
+	struct ib_device *ib_dev = &to_mdev(sqp->qp.ibqp.device)->ib_dev;
+	struct mlx4_wqe_mlx_seg *mlx = wqe;
+	struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx;
+	struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah);
+	u16 pkey;
+	int send_size;
+	int header_size;
+	int spc;
+	int i;
+	union ib_gid sgid;
+	int is_eth;
+	int is_grh;
+	int is_vlan = 0;
+	int err;
+	u16 vlan;
+
+	vlan = 0;
+	/* Total payload length across all scatter/gather entries. */
+	send_size = 0;
+	for (i = 0; i < wr->num_sge; ++i)
+		send_size += wr->sg_list[i].length;
+
+	is_eth = rdma_port_get_link_layer(sqp->qp.ibqp.device, sqp->qp.port) == IB_LINK_LAYER_ETHERNET;
+	is_grh = mlx4_ib_ah_grh_present(ah);
+	/* The port number is taken from the top byte of the AV's port_pd word. */
+	err = ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24,
+				ah->av.ib.gid_index, &sgid);
+	if (err)
+		return err;
+	if (is_eth) {
+		/* A VLAN id below 0x1000 is treated as "VLAN present". */
+		is_vlan = rdma_get_vlan_id(&sgid) < 0x1000;
+		vlan = rdma_get_vlan_id(&sgid);
+	}
+
+	ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh, 0, &sqp->ud_header);
+	if (!is_eth) {
+		sqp->ud_header.lrh.service_level =
+			be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28;
+		sqp->ud_header.lrh.destination_lid = ah->av.ib.dlid;
+		sqp->ud_header.lrh.source_lid = cpu_to_be16(ah->av.ib.g_slid & 0x7f);
+	}
+
+	if (is_grh) {
+		sqp->ud_header.grh.traffic_class =
+			(be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff;
+		sqp->ud_header.grh.flow_label    =
+			ah->av.ib.sl_tclass_flowlabel & cpu_to_be32(0xfffff);
+		sqp->ud_header.grh.hop_limit     = ah->av.ib.hop_limit;
+		/* NOTE(review): return value ignored here, unlike the lookup above. */
+		ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24,
+				  ah->av.ib.gid_index, &sqp->ud_header.grh.source_gid);
+		memcpy(sqp->ud_header.grh.destination_gid.raw,
+		       ah->av.ib.dgid, 16);
+	}
+
+	/* Preserve only the CQ-update bit of the flags word before adding MLX bits. */
+	mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
+
+	if (!is_eth) {
+		/* QP number 0 sends on VL15; a permissive DLID sets the SLR flag. */
+		mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MLX4_WQE_MLX_VL15 : 0) |
+					  (sqp->ud_header.lrh.destination_lid ==
+					   IB_LID_PERMISSIVE ? MLX4_WQE_MLX_SLR : 0) |
+					  (sqp->ud_header.lrh.service_level << 8));
+		mlx->rlid = sqp->ud_header.lrh.destination_lid;
+	}
+
+	switch (wr->opcode) {
+	case IB_WR_SEND:
+		sqp->ud_header.bth.opcode        = IB_OPCODE_UD_SEND_ONLY;
+		sqp->ud_header.immediate_present = 0;
+		break;
+	case IB_WR_SEND_WITH_IMM:
+		sqp->ud_header.bth.opcode        = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
+		sqp->ud_header.immediate_present = 1;
+		sqp->ud_header.immediate_data    = wr->ex.imm_data;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (is_eth) {
+		u8 *smac;
+
+		memcpy(sqp->ud_header.eth.dmac_h, ah->av.eth.mac, 6);
+#ifdef __linux__
+		smac = to_mdev(sqp->qp.ibqp.device)->iboe.netdevs[sqp->qp.port - 1]->dev_addr; /* fixme: cache this value */
+#else
+		smac = IF_LLADDR(to_mdev(sqp->qp.ibqp.device)->iboe.netdevs[sqp->qp.port - 1]); /* fixme: cache this value */
+#endif
+		memcpy(sqp->ud_header.eth.smac_h, smac, 6);
+		/* Source MAC equals destination MAC: request forced loopback. */
+		if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6))
+			mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK);
+		if (!is_vlan)
+			sqp->ud_header.eth.type = cpu_to_be16(MLX4_IBOE_ETHERTYPE);
+		else {
+			u16 pcp;
+
+			sqp->ud_header.vlan.type = cpu_to_be16(MLX4_IBOE_ETHERTYPE);
+			/* Priority bits are derived from the AV and placed in tag bits 13 and up. */
+			pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 27 & 3) << 13;
+			sqp->ud_header.vlan.tag = cpu_to_be16(vlan | pcp);
+		}
+	} else {
+		sqp->ud_header.lrh.virtual_lane    = !sqp->qp.ibqp.qp_num ? 15 : 0;
+		if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE)
+			sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE;
+	}
+	sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED);
+	/*
+	 * QP number 0 uses the P_Key index stored in the sqp; other QPs use
+	 * the one supplied in the work request.
+	 * NOTE(review): the return value of ib_get_cached_pkey() is ignored,
+	 * so 'pkey' would be used uninitialized if the lookup fails - confirm
+	 * whether that can happen.
+	 */
+	if (!sqp->qp.ibqp.qp_num)
+		ib_get_cached_pkey(ib_dev, sqp->qp.port, sqp->pkey_index, &pkey);
+	else
+		ib_get_cached_pkey(ib_dev, sqp->qp.port, wr->wr.ud.pkey_index, &pkey);
+	sqp->ud_header.bth.pkey = cpu_to_be16(pkey);
+	sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+	/* PSN is 24 bits wide; the counter itself is advanced for the next send. */
+	sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
+	/* If the high bit of the requested Q_Key is set, use the QP's own Q_Key. */
+	sqp->ud_header.deth.qkey = cpu_to_be32(wr->wr.ud.remote_qkey & 0x80000000 ?
+					       sqp->qkey : wr->wr.ud.remote_qkey);
+	sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num);
+
+	header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf);
+
+	/* Compiled-out debug dump of the packed header. */
+	if (0) {
+		printk(KERN_ERR "built UD header of size %d:\n", header_size);
+		for (i = 0; i < header_size / 4; ++i) {
+			if (i % 8 == 0)
+				printk("  [%02x] ", i * 4);
+			printk(" %08x",
+			       be32_to_cpu(((__be32 *) sqp->header_buf)[i]));
+			if ((i + 1) % 8 == 0)
+				printk("\n");
+		}
+		printk("\n");
+	}
+
+	/*
+	 * Inline data segments may not cross a 64 byte boundary.  If
+	 * our UD header is bigger than the space available up to the
+	 * next 64 byte boundary in the WQE, use two inline data
+	 * segments to hold the UD header.
+	 */
+	spc = MLX4_INLINE_ALIGN -
+	      ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
+	if (header_size <= spc) {
+		inl->byte_count = cpu_to_be32(1 << 31 | header_size);
+		memcpy(inl + 1, sqp->header_buf, header_size);
+		i = 1;
+	} else {
+		inl->byte_count = cpu_to_be32(1 << 31 | spc);
+		memcpy(inl + 1, sqp->header_buf, spc);
+
+		/* Second inline segment starts right after the first one's data. */
+		inl = (void *) (inl + 1) + spc;
+		memcpy(inl + 1, sqp->header_buf + spc, header_size - spc);
+		/*
+		 * Need a barrier here to make sure all the data is
+		 * visible before the byte_count field is set.
+		 * Otherwise the HCA prefetcher could grab the 64-byte
+		 * chunk with this inline segment and get a valid (!=
+		 * 0xffffffff) byte count but stale data, and end up
+		 * generating a packet with bad headers.
+		 *
+		 * The first inline segment's byte_count field doesn't
+		 * need a barrier, because it comes after a
+		 * control/MLX segment and therefore is at an offset
+		 * of 16 mod 64.
+		 */
+		wmb();
+		inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc));
+		i = 2;
+	}
+
+	/* 'i' is the number of inline segment headers that were written. */
+	*mlx_seg_len =
+	ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16);
+	return 0;
+}
+
+/*
+ * Return non-zero if posting 'nreq' more requests would reach the work
+ * queue's max_post limit.  The first check is lock-free; only when it
+ * looks full is the occupancy re-read under the CQ lock before deciding.
+ */
+static int mlx4_wq_overflow(struct mlx4_ib_wq *wq, int nreq, struct ib_cq *ib_cq)
+{
+	struct mlx4_ib_cq *mcq;
+	unsigned in_use = wq->head - wq->tail;
+
+	if (likely(in_use + nreq < wq->max_post))
+		return 0;
+
+	mcq = to_mcq(ib_cq);
+
+	spin_lock(&mcq->lock);
+	in_use = wq->head - wq->tail;
+	spin_unlock(&mcq->lock);
+
+	return !(in_use + nreq < wq->max_post);
+}
+
+/*
+ * Translate IB_ACCESS_* flags into the big-endian FMR permission bits of
+ * a fast-register segment.  Local read permission is always included.
+ */
+static __be32 convert_access(int acc)
+{
+	__be32 perm = cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_READ);
+
+	if (acc & IB_ACCESS_REMOTE_ATOMIC)
+		perm |= cpu_to_be32(MLX4_WQE_FMR_PERM_ATOMIC);
+	if (acc & IB_ACCESS_REMOTE_WRITE)
+		perm |= cpu_to_be32(MLX4_WQE_FMR_PERM_REMOTE_WRITE);
+	if (acc & IB_ACCESS_REMOTE_READ)
+		perm |= cpu_to_be32(MLX4_WQE_FMR_PERM_REMOTE_READ);
+	if (acc & IB_ACCESS_LOCAL_WRITE)
+		perm |= cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_WRITE);
+
+	return perm;
+}
+
+/*
+ * Fill a fast-register (FMR) WQE segment from a fast_reg work request.
+ * Each page address is first copied into the driver's mapped page list in
+ * big-endian form with the MTT "present" flag set; the segment then
+ * refers to that list through mfrpl->map.
+ */
+static void set_fmr_seg(struct mlx4_wqe_fmr_seg *fseg, struct ib_send_wr *wr)
+{
+	struct mlx4_ib_fast_reg_page_list *mfrpl = to_mfrpl(wr->wr.fast_reg.page_list);
+	int pg = 0;
+
+	while (pg < wr->wr.fast_reg.page_list_len) {
+		mfrpl->mapped_page_list[pg] =
+			cpu_to_be64(wr->wr.fast_reg.page_list->page_list[pg] |
+				    MLX4_MTT_FLAG_PRESENT);
+		++pg;
+	}
+
+	fseg->reserved[0]	= 0;
+	fseg->reserved[1]	= 0;
+	fseg->offset		= 0; /* XXX -- is this just for ZBVA? */
+
+	fseg->mem_key		= cpu_to_be32(wr->wr.fast_reg.rkey);
+	fseg->flags		= convert_access(wr->wr.fast_reg.access_flags);
+	fseg->page_size		= cpu_to_be32(wr->wr.fast_reg.page_shift);
+
+	fseg->buf_list		= cpu_to_be64(mfrpl->map);
+	fseg->start_addr	= cpu_to_be64(wr->wr.fast_reg.iova_start);
+	fseg->reg_len		= cpu_to_be64(wr->wr.fast_reg.length);
+}
+
+/*
+ * Fill a local-invalidate WQE segment: only the memory key is meaningful,
+ * every other field is cleared.
+ */
+static void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg, u32 rkey)
+{
+	iseg->mem_key	= cpu_to_be32(rkey);
+
+	iseg->pa	= 0;
+	iseg->guest_id	= 0;
+	iseg->flags	= 0;
+}
+
+/*
+ * Fill a remote-address WQE segment with the big-endian remote virtual
+ * address and rkey; the reserved word is cleared.
+ */
+static __always_inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg,
+					  u64 remote_addr, u32 rkey)
+{
+	rseg->reserved = 0;
+	rseg->rkey     = cpu_to_be32(rkey);
+	rseg->raddr    = cpu_to_be64(remote_addr);
+}
+
+/*
+ * Fill an atomic WQE segment according to the work request opcode:
+ *  - compare-and-swap:        swap_add = swap,        compare = compare_add
+ *  - masked fetch-and-add:    swap_add = compare_add, compare = compare_add_mask
+ *  - anything else (fetch-and-add): swap_add = compare_add, compare = 0
+ */
+static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, struct ib_send_wr *wr)
+{
+	switch (wr->opcode) {
+	case IB_WR_ATOMIC_CMP_AND_SWP:
+		aseg->swap_add = cpu_to_be64(wr->wr.atomic.swap);
+		aseg->compare  = cpu_to_be64(wr->wr.atomic.compare_add);
+		break;
+
+	case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD:
+		aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add);
+		aseg->compare  = cpu_to_be64(wr->wr.atomic.compare_add_mask);
+		break;
+
+	default:
+		aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add);
+		aseg->compare  = 0;
+		break;
+	}
+}
+
+/*
+ * Fill a masked-atomic WQE segment: swap and compare operands together
+ * with their masks, all converted to big-endian.
+ */
+static void set_masked_atomic_seg(struct mlx4_wqe_masked_atomic_seg *aseg,
+				  struct ib_send_wr *wr)
+{
+	/* operands */
+	aseg->compare		= cpu_to_be64(wr->wr.atomic.compare_add);
+	aseg->swap_add		= cpu_to_be64(wr->wr.atomic.swap);
+
+	/* masks */
+	aseg->compare_mask	= cpu_to_be64(wr->wr.atomic.compare_add_mask);
+	aseg->swap_add_mask	= cpu_to_be64(wr->wr.atomic.swap_mask);
+}
+
+/*
+ * Fill a UD datagram WQE segment from the work request's address handle:
+ * the address vector, destination QPN, Q_Key, VLAN and MAC.  The VLAN
+ * written to the segment is also returned through *vlan.
+ */
+static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
+			     struct ib_send_wr *wr, __be16 *vlan)
+{
+	struct mlx4_ib_ah *mah = to_mah(wr->wr.ud.ah);
+
+	memcpy(dseg->av, &mah->av, sizeof (struct mlx4_av));
+	dseg->dqpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+	dseg->qkey = cpu_to_be32(wr->wr.ud.remote_qkey);
+	dseg->vlan = mah->av.eth.vlan;
+	memcpy(dseg->mac, mah->av.eth.mac, 6);
+
+	*vlan = dseg->vlan;
+}
+
+/*
+ * Write the extra 4-byte inline segment appended to MLX sends for the
+ * ICRC: a zeroed data word followed (after a barrier) by the inline
+ * byte count.
+ */
+static void set_mlx_icrc_seg(void *dseg)
+{
+	struct mlx4_wqe_inline_seg *icrc_seg = dseg;
+
+	/* The data word sits right after the 32-bit byte_count field. */
+	((u32 *) dseg)[1] = 0;
+
+	/*
+	 * The data must be visible before byte_count is written.  If this
+	 * segment starts a new cacheline, the HCA prefetcher could
+	 * otherwise pick up the 64-byte chunk with a valid (!= 0xffffffff)
+	 * byte count but stale data, and send the wrong data.
+	 */
+	wmb();
+
+	icrc_seg->byte_count = cpu_to_be32((1 << 31) | 4);
+}
+
+/*
+ * Write a send-side data (gather) segment from one SGE.  Address and
+ * lkey go first; byte_count is written last, behind a write barrier.
+ */
+static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg)
+{
+	dseg->addr       = cpu_to_be64(sg->addr);
+	dseg->lkey       = cpu_to_be32(sg->lkey);
+
+	/*
+	 * The fields above must be visible before byte_count is set.  If
+	 * the segment begins a new cacheline, the HCA prefetcher could
+	 * otherwise fetch the 64-byte chunk, see a valid (!= 0xffffffff)
+	 * byte count together with stale data, and send the wrong data.
+	 */
+	wmb();
+
+	dseg->byte_count = cpu_to_be32(sg->length);
+}
+
+/*
+ * Barrier-free variant of set_data_seg(): copies one SGE into a data
+ * segment with no ordering guarantee between the fields.
+ */
+static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg)
+{
+	dseg->byte_count = cpu_to_be32(sg->length);
+	dseg->addr       = cpu_to_be64(sg->addr);
+	dseg->lkey       = cpu_to_be32(sg->lkey);
+}
+
+/*
+ * Copy the LSO packet header from the work request into the WQE's LSO
+ * segment.
+ *
+ * Outputs: *blh is 1 when the 16-byte-aligned segment exceeds 64 bytes,
+ * else 0 (set even when the call fails); *lso_hdr_sz is the big-endian
+ * word ((mss - hlen) << 16 | hlen); *lso_seg_len is the aligned segment
+ * length.  Returns -EINVAL when the QP was not created with LSO and the
+ * request has more SGEs than fit beside the header, otherwise 0.
+ */
+static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, struct ib_send_wr *wr,
+			 struct mlx4_ib_qp *qp, unsigned *lso_seg_len,
+			 __be32 *lso_hdr_sz, int *blh)
+{
+	unsigned seg_bytes = ALIGN(sizeof *wqe + wr->wr.ud.hlen, 16);
+
+	if (unlikely(seg_bytes > 64))
+		*blh = 1;
+	else
+		*blh = 0;
+
+	if (unlikely(!(qp->flags & MLX4_IB_QP_LSO) &&
+		     wr->num_sge > qp->sq.max_gs - (seg_bytes >> 4)))
+		return -EINVAL;
+
+	memcpy(wqe->header, wr->wr.ud.header, wr->wr.ud.hlen);
+
+	*lso_seg_len = seg_bytes;
+	*lso_hdr_sz  = cpu_to_be32((wr->wr.ud.mss - wr->wr.ud.hlen) << 16 |
+				   wr->wr.ud.hlen);
+	return 0;
+}
+
+/*
+ * Return the value for the control segment's immediate field: the
+ * immediate data as supplied for *_WITH_IMM opcodes, the big-endian rkey
+ * for SEND_WITH_INV, and 0 for every other opcode.
+ */
+static __be32 send_ieth(struct ib_send_wr *wr)
+{
+	if (wr->opcode == IB_WR_SEND_WITH_IMM ||
+	    wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM)
+		return wr->ex.imm_data;
+
+	if (wr->opcode == IB_WR_SEND_WITH_INV)
+		return cpu_to_be32(wr->ex.invalidate_rkey);
+
+	return 0;
+}
+
+/*
+ * Copy the payload described by wr->sg_list directly into the WQE as a
+ * chain of inline segments.
+ *
+ * 'wqe' points at the first inline segment header.  The SGE addresses
+ * are dereferenced as kernel virtual addresses.  An inline segment is
+ * closed and a new one opened each time the copy reaches a
+ * MLX4_INLINE_ALIGN boundary; each segment's byte_count is written only
+ * after its data, behind a write barrier.
+ *
+ * Returns 0 and stores the number of 16-byte units consumed (data plus
+ * segment headers) in *sz, or -1 if the total payload exceeds
+ * qp->max_inline_data (in which case *sz is left untouched).
+ */
+static int lay_inline_data(struct mlx4_ib_qp *qp, struct ib_send_wr *wr,
+			   void *wqe, int *sz)
+{
+	struct mlx4_wqe_inline_seg *seg;
+	void *addr;
+	int len, seg_len;
+	int num_seg;
+	int off, to_copy;
+	int i;
+	int inl = 0;
+
+	seg = wqe;
+	wqe += sizeof *seg;
+	off = ((unsigned long)wqe) & (unsigned long)(MLX4_INLINE_ALIGN - 1);
+	num_seg = 0;
+	seg_len = 0;
+
+	for (i = 0; i < wr->num_sge; ++i) {
+		addr = (void *) (unsigned long)(wr->sg_list[i].addr);
+		len  = wr->sg_list[i].length;
+
+		/*
+		 * Check the limit before accumulating.  The SGE length is
+		 * unsigned, so a huge value shows up here as a negative
+		 * 'len'; adding it first would let it slip under the limit
+		 * and reach memcpy() as an enormous size.
+		 */
+		if (len < 0 || len > qp->max_inline_data - inl)
+			return -1;
+		inl += len;
+
+		/* Fill up to each alignment boundary, closing a segment there. */
+		while (len >= MLX4_INLINE_ALIGN - off) {
+			to_copy = MLX4_INLINE_ALIGN - off;
+			memcpy(wqe, addr, to_copy);
+			len -= to_copy;
+			wqe += to_copy;
+			addr += to_copy;
+			seg_len += to_copy;
+			wmb(); /* see comment below */
+			seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
+			seg_len = 0;
+			seg = wqe;
+			wqe += sizeof *seg;
+			off = sizeof *seg;
+			++num_seg;
+		}
+
+		/* Remainder of this SGE stays in the currently open segment. */
+		memcpy(wqe, addr, len);
+		wqe += len;
+		seg_len += len;
+		off += len;
+	}
+
+	if (seg_len) {
+		++num_seg;
+		/*
+		 * Need a barrier here to make sure
+		 * all the data is visible before the
+		 * byte_count field is set.  Otherwise
+		 * the HCA prefetcher could grab the
+		 * 64-byte chunk with this inline
+		 * segment and get a valid (!=
+		 * 0xffffffff) byte count but stale
+		 * data, and end up sending the wrong
+		 * data.
+		 */
+		wmb();
+		seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
+	}
+
+	*sz = (inl + num_seg * sizeof *seg + 15) / 16;
+
+	return 0;
+}
+
+/*
+ * Copy a descriptor to the BlueFlame page in 64-bit units.
+ *
+ * memcpy() is deliberately not used: its implementations may rely on
+ * move-string-buffer assembler instructions, which do not guarantee the
+ * order in which the bytes are copied.  Any bytes beyond a multiple of
+ * eight in 'bytecnt' are not copied.
+ */
+static void mlx4_bf_copy(unsigned long *dst, unsigned long *src, unsigned bytecnt)
+{
+	unsigned qwords = bytecnt / 8;
+
+	__iowrite64_copy(dst, src, qwords);
+}
+
+/*
+ * mlx4_ib_post_send - post a chain of send work requests to a QP.
+ *
+ * Under the send-queue lock, each work request is turned into a WQE:
+ * control segment, the transport-specific segments for the QP type and
+ * opcode, then either inline data or gather segments.  The ownership bit
+ * is written last, after a barrier.  Once the chain is processed (or an
+ * error stops it) the hardware is notified, either by copying a single
+ * small inline WQE to the BlueFlame page or by ringing the send doorbell.
+ *
+ * On failure *bad_wr points at the first request that was not posted and
+ * a negative error is returned; requests before it have been posted.
+ * Returns 0 when the whole chain was posted.
+ */
+int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+		      struct ib_send_wr **bad_wr)
+{
+	struct mlx4_ib_qp *qp = to_mqp(ibqp);
+	void *wqe;
+	struct mlx4_wqe_ctrl_seg *ctrl;
+	struct mlx4_wqe_data_seg *dseg;
+	unsigned long flags;
+	int nreq;
+	int err = 0;
+	unsigned ind;
+	int uninitialized_var(stamp);
+	int uninitialized_var(size);
+	unsigned uninitialized_var(seglen);
+	__be32 dummy;
+	__be32 *lso_wqe;
+	__be32 uninitialized_var(lso_hdr_sz);
+	int i;
+	int blh = 0;
+	__be16 vlan = 0;
+	int inl = 0;
+
+	ctrl = NULL;
+	spin_lock_irqsave(&qp->sq.lock, flags);
+
+	ind = qp->sq_next_wqe;
+
+	for (nreq = 0; wr; ++nreq, wr = wr->next) {
+		/* Non-LSO requests write the LSO word into a scratch variable. */
+		lso_wqe = &dummy;
+		/*
+		 * Reset per request: only build_lso_seg() sets this, so
+		 * without the reset a request following an LSO request
+		 * would inherit its flag in owner_opcode.
+		 */
+		blh = 0;
+
+		if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
+			mlx4_ib_dbg("QP 0x%x: WQE overflow", ibqp->qp_num);
+			err = -ENOMEM;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		if (unlikely(wr->num_sge > qp->sq.max_gs)) {
+			mlx4_ib_dbg("QP 0x%x: too many sg entries (%d)",
+				    ibqp->qp_num, wr->num_sge);
+			err = -EINVAL;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
+		/* Clear the 32-bit word that starts at vlan_tag in one store. */
+		*((u32 *) (&ctrl->vlan_tag)) = 0;
+		qp->sq.wrid[(qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
+
+		ctrl->srcrb_flags =
+			(wr->send_flags & IB_SEND_SIGNALED ?
+			 cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
+			(wr->send_flags & IB_SEND_SOLICITED ?
+			 cpu_to_be32(MLX4_WQE_CTRL_SOLICITED) : 0) |
+			((wr->send_flags & IB_SEND_IP_CSUM) ?
+			 cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM |
+				     MLX4_WQE_CTRL_TCP_UDP_CSUM) : 0) |
+			qp->sq_signal_bits;
+
+		ctrl->imm = send_ieth(wr);
+
+		/* 'size' counts the WQE length in 16-byte units. */
+		wqe += sizeof *ctrl;
+		size = sizeof *ctrl / 16;
+
+		switch (ibqp->qp_type) {
+		case IB_QPT_XRC:
+			ctrl->srcrb_flags |=
+				cpu_to_be32(wr->xrc_remote_srq_num << 8);
+			/* fall thru */
+		case IB_QPT_RC:
+		case IB_QPT_UC:
+			switch (wr->opcode) {
+			case IB_WR_ATOMIC_CMP_AND_SWP:
+			case IB_WR_ATOMIC_FETCH_AND_ADD:
+			case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD:
+				set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
+					      wr->wr.atomic.rkey);
+				wqe  += sizeof (struct mlx4_wqe_raddr_seg);
+
+				set_atomic_seg(wqe, wr);
+				wqe  += sizeof (struct mlx4_wqe_atomic_seg);
+
+				size += (sizeof (struct mlx4_wqe_raddr_seg) +
+					 sizeof (struct mlx4_wqe_atomic_seg)) / 16;
+
+				break;
+
+			case IB_WR_MASKED_ATOMIC_CMP_AND_SWP:
+				set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
+					      wr->wr.atomic.rkey);
+				wqe  += sizeof (struct mlx4_wqe_raddr_seg);
+
+				set_masked_atomic_seg(wqe, wr);
+				wqe  += sizeof (struct mlx4_wqe_masked_atomic_seg);
+
+				size += (sizeof (struct mlx4_wqe_raddr_seg) +
+					 sizeof (struct mlx4_wqe_masked_atomic_seg)) / 16;
+
+				break;
+
+			case IB_WR_RDMA_READ:
+			case IB_WR_RDMA_WRITE:
+			case IB_WR_RDMA_WRITE_WITH_IMM:
+				set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
+					      wr->wr.rdma.rkey);
+				wqe  += sizeof (struct mlx4_wqe_raddr_seg);
+				size += sizeof (struct mlx4_wqe_raddr_seg) / 16;
+				break;
+
+			case IB_WR_LOCAL_INV:
+				ctrl->srcrb_flags |=
+					cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER);
+				set_local_inv_seg(wqe, wr->ex.invalidate_rkey);
+				wqe  += sizeof (struct mlx4_wqe_local_inval_seg);
+				size += sizeof (struct mlx4_wqe_local_inval_seg) / 16;
+				break;
+
+			case IB_WR_FAST_REG_MR:
+				ctrl->srcrb_flags |=
+					cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER);
+				set_fmr_seg(wqe, wr);
+				wqe  += sizeof (struct mlx4_wqe_fmr_seg);
+				size += sizeof (struct mlx4_wqe_fmr_seg) / 16;
+				break;
+
+			default:
+				/* No extra segments required for sends */
+				break;
+			}
+			break;
+
+		case IB_QPT_UD:
+			set_datagram_seg(wqe, wr, &vlan);
+			wqe  += sizeof (struct mlx4_wqe_datagram_seg);
+			size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
+
+			if (wr->opcode == IB_WR_LSO) {
+				err = build_lso_seg(wqe, wr, qp, &seglen, &lso_hdr_sz, &blh);
+				if (unlikely(err)) {
+					*bad_wr = wr;
+					goto out;
+				}
+				lso_wqe = (__be32 *) wqe;
+				wqe  += seglen;
+				size += seglen / 16;
+			}
+			break;
+
+		case IB_QPT_SMI:
+		case IB_QPT_GSI:
+			err = build_mlx_header(to_msqp(qp), wr, ctrl, &seglen);
+			if (unlikely(err)) {
+				*bad_wr = wr;
+				goto out;
+			}
+			wqe  += seglen;
+			size += seglen / 16;
+			break;
+
+		case IB_QPT_RAW_ETY:
+			err = build_raw_ety_header(to_msqp(qp), wr, ctrl,
+						   &seglen);
+			if (unlikely(err)) {
+				*bad_wr = wr;
+				goto out;
+			}
+			wqe  += seglen;
+			size += seglen / 16;
+			break;
+
+		default:
+			break;
+		}
+
+		/*
+		 * Write data segments in reverse order, so as to
+		 * overwrite cacheline stamp last within each
+		 * cacheline.  This avoids issues with WQE
+		 * prefetching.
+		 */
+
+		dseg = wqe;
+		dseg += wr->num_sge - 1;
+
+		/* Add one more inline data segment for ICRC for MLX sends */
+		if (unlikely(qp->ibqp.qp_type == IB_QPT_SMI ||
+			     qp->ibqp.qp_type == IB_QPT_GSI)) {
+			set_mlx_icrc_seg(dseg + 1);
+			size += sizeof (struct mlx4_wqe_data_seg) / 16;
+		}
+
+		if (wr->send_flags & IB_SEND_INLINE && wr->num_sge) {
+			int sz;
+			err = lay_inline_data(qp, wr, wqe, &sz);
+			/*
+			 * Do not post a WQE whose payload could not be laid
+			 * out: report the request and stop, so the error
+			 * cannot be masked by a later request.
+			 */
+			if (unlikely(err)) {
+				*bad_wr = wr;
+				goto out;
+			}
+			inl = 1;
+			size += sz;
+		} else {
+			size += wr->num_sge * (sizeof (struct mlx4_wqe_data_seg) / 16);
+			for (i = wr->num_sge - 1; i >= 0; --i, --dseg)
+				set_data_seg(dseg, wr->sg_list + i);
+		}
+
+		/*
+		 * Possibly overwrite stamping in cacheline with LSO
+		 * segment only after making sure all data segments
+		 * are written.
+		 */
+		wmb();
+		*lso_wqe = lso_hdr_sz;
+
+		ctrl->fence_size = (wr->send_flags & IB_SEND_FENCE ?
+				    MLX4_WQE_CTRL_FENCE : 0) | size;
+
+		if (vlan) {
+			ctrl->ins_vlan = 1 << 6;
+			ctrl->vlan_tag = vlan;
+		}
+
+		/*
+		 * Make sure descriptor is fully written before
+		 * setting ownership bit (because HW can start
+		 * executing as soon as we do).
+		 */
+		wmb();
+
+		if (wr->opcode < 0 || wr->opcode >= ARRAY_SIZE(mlx4_ib_opcode)) {
+			/* Tell the caller which request was rejected. */
+			*bad_wr = wr;
+			err = -EINVAL;
+			goto out;
+		}
+
+		/* The owner bit alternates on each pass through the queue. */
+		ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] |
+			(ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0) |
+			(blh ? cpu_to_be32(1 << 6) : 0);
+
+		stamp = ind + qp->sq_spare_wqes;
+		ind += DIV_ROUND_UP(size * 16, 1U << qp->sq.wqe_shift);
+
+		/*
+		 * We can improve latency by not stamping the last
+		 * send queue WQE until after ringing the doorbell, so
+		 * only stamp here if there are still more WQEs to post.
+		 *
+		 * Same optimization applies to padding with NOP wqe
+		 * in case of WQE shrinking (used to prevent wrap-around
+		 * in the middle of WR).
+		 */
+		if (wr->next) {
+			stamp_send_wqe(qp, stamp, size * 16);
+			ind = pad_wraparound(qp, ind);
+		}
+	}
+
+out:
+	/*
+	 * BlueFlame: a single, small inline WQE is copied straight to the
+	 * device page.  Skip it on error, because 'ctrl' and 'size' may then
+	 * describe the request that failed rather than the one posted; the
+	 * regular doorbell below covers that case.
+	 */
+	if (nreq == 1 && inl && !err && size > 1 &&
+	    size < qp->bf.buf_size / 16) {
+		ctrl->owner_opcode |= htonl((qp->sq_next_wqe & 0xffff) << 8);
+		*(u32 *) (&ctrl->vlan_tag) |= qp->doorbell_qpn;
+		/*
+		 * Make sure that descriptor is written to memory
+		 * before writing to BlueFlame page.
+		 */
+		wmb();
+
+		++qp->sq.head;
+
+		mlx4_bf_copy(qp->bf.reg + qp->bf.offset, (unsigned long *) ctrl,
+			     ALIGN(size * 16, 64));
+		wc_wmb();
+
+		/* Alternate between the two BlueFlame buffers. */
+		qp->bf.offset ^= qp->bf.buf_size;
+
+	} else if (nreq) {
+		qp->sq.head += nreq;
+
+		/*
+		 * Make sure that descriptors are written before
+		 * doorbell record.
+		 */
+		wmb();
+
+		writel(qp->doorbell_qpn, qp->bf.uar->map + MLX4_SEND_DOORBELL);
+
+		/*
+		 * Make sure doorbells don't leak out of SQ spinlock
+		 * and reach the HCA out of order.
+		 */
+		mmiowb();
+
+	}
+
+	/* Deferred stamping/padding for the last WQE (see the loop above). */
+	if (likely(nreq)) {
+		stamp_send_wqe(qp, stamp, size * 16);
+		ind = pad_wraparound(qp, ind);
+		qp->sq_next_wqe = ind;
+	}
+
+	spin_unlock_irqrestore(&qp->sq.lock, flags);
+
+	return err;
+}
+
+/*
+ * mlx4_ib_post_recv - post a chain of receive work requests to a QP.
+ *
+ * Under the receive-queue lock, each request's SGEs are written into the
+ * next receive WQE; if the WQE has room left, the entry after the last
+ * SGE is marked with an invalid lkey and zero length.  After the chain
+ * (or on the first failing request) the doorbell record is updated with
+ * the new head for whatever was posted.
+ *
+ * Returns 0 on success.  On failure returns -ENOMEM (queue full) or
+ * -EINVAL (too many SGEs) and sets *bad_wr to the offending request.
+ */
+int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+		      struct ib_recv_wr **bad_wr)
+{
+	struct mlx4_ib_qp *qp = to_mqp(ibqp);
+	struct mlx4_wqe_data_seg *scat;
+	unsigned long flags;
+	int err = 0;
+	int nreq = 0;
+	int ind;
+	int i;
+
+	spin_lock_irqsave(&qp->rq.lock, flags);
+
+	ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
+
+	while (wr) {
+		if (mlx4_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) {
+			mlx4_ib_dbg("QP 0x%x: WQE overflow", ibqp->qp_num);
+			err = -ENOMEM;
+			*bad_wr = wr;
+			break;
+		}
+
+		if (unlikely(wr->num_sge > qp->rq.max_gs)) {
+			mlx4_ib_dbg("QP 0x%x: too many sg entries (%d)",
+				    ibqp->qp_num, wr->num_sge);
+			err = -EINVAL;
+			*bad_wr = wr;
+			break;
+		}
+
+		scat = get_recv_wqe(qp, ind);
+
+		for (i = 0; i < wr->num_sge; ++i)
+			__set_data_seg(&scat[i], &wr->sg_list[i]);
+
+		/* Terminate a partially filled scatter list. */
+		if (i < qp->rq.max_gs) {
+			scat[i].byte_count = 0;
+			scat[i].lkey       = cpu_to_be32(MLX4_INVALID_LKEY);
+			scat[i].addr       = 0;
+		}
+
+		qp->rq.wrid[ind] = wr->wr_id;
+
+		ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
+
+		++nreq;
+		wr = wr->next;
+	}
+
+	if (likely(nreq)) {
+		qp->rq.head += nreq;
+
+		/*
+		 * Make sure that descriptors are written before
+		 * doorbell record.
+		 */
+		wmb();
+
+		*qp->db.db = cpu_to_be32(qp->rq.head & 0xffff);
+	}
+
+	spin_unlock_irqrestore(&qp->rq.lock, flags);
+
+	return err;
+}
+
+/*
+ * Map an mlx4 hardware QP state to the IB verbs QP state.  Both
+ * SQ_DRAINING and SQD map to IB_QPS_SQD; unknown values yield -1.
+ */
+static inline enum ib_qp_state to_ib_qp_state(enum mlx4_qp_state mlx4_state)
+{
+	if (mlx4_state == MLX4_QP_STATE_RST)
+		return IB_QPS_RESET;
+	if (mlx4_state == MLX4_QP_STATE_INIT)
+		return IB_QPS_INIT;
+	if (mlx4_state == MLX4_QP_STATE_RTR)
+		return IB_QPS_RTR;
+	if (mlx4_state == MLX4_QP_STATE_RTS)
+		return IB_QPS_RTS;
+	if (mlx4_state == MLX4_QP_STATE_SQ_DRAINING ||
+	    mlx4_state == MLX4_QP_STATE_SQD)
+		return IB_QPS_SQD;
+	if (mlx4_state == MLX4_QP_STATE_SQER)
+		return IB_QPS_SQE;
+	if (mlx4_state == MLX4_QP_STATE_ERR)
+		return IB_QPS_ERR;
+
+	return -1;
+}
+
+/*
+ * Map an mlx4 path-migration state to the IB verbs migration state;
+ * unknown values yield -1.
+ */
+static inline enum ib_mig_state to_ib_mig_state(int mlx4_mig_state)
+{
+	if (mlx4_mig_state == MLX4_QP_PM_ARMED)
+		return IB_MIG_ARMED;
+	if (mlx4_mig_state == MLX4_QP_PM_REARM)
+		return IB_MIG_REARM;
+	if (mlx4_mig_state == MLX4_QP_PM_MIGRATED)
+		return IB_MIG_MIGRATED;
+
+	return -1;
+}
+
+/*
+ * Convert the RRE/RWE/RAE bits of an mlx4 QP context word into the
+ * corresponding IB_ACCESS_REMOTE_{READ,WRITE,ATOMIC} flags.
+ */
+static int to_ib_qp_access_flags(int mlx4_flags)
+{
+	return ((mlx4_flags & MLX4_QP_BIT_RRE) ? IB_ACCESS_REMOTE_READ   : 0) |
+	       ((mlx4_flags & MLX4_QP_BIT_RWE) ? IB_ACCESS_REMOTE_WRITE  : 0) |
+	       ((mlx4_flags & MLX4_QP_BIT_RAE) ? IB_ACCESS_REMOTE_ATOMIC : 0);
+}
+
+/*
+ * Decode a hardware QP path into an ib_ah_attr.
+ *
+ * The attribute is zeroed first.  The port is 2 when bit 0x40 of
+ * sched_queue is set, else 1; if that port is outside the device's port
+ * range only port_num is filled in.  The service level is extracted from
+ * sched_queue differently for Ethernet and IB link layers.  GRH fields
+ * are filled only when bit 7 of grh_mylmc is set.
+ */
+static void to_ib_ah_attr(struct mlx4_ib_dev *ib_dev, struct ib_ah_attr *ib_ah_attr,
+			  struct mlx4_qp_path *path)
+{
+	struct mlx4_dev *dev = ib_dev->dev;
+	int is_eth;
+
+	memset(ib_ah_attr, 0, sizeof *ib_ah_attr);
+
+	if (path->sched_queue & 0x40)
+		ib_ah_attr->port_num = 2;
+	else
+		ib_ah_attr->port_num = 1;
+
+	if (ib_ah_attr->port_num == 0 || ib_ah_attr->port_num > dev->caps.num_ports)
+		return;
+
+	is_eth = rdma_port_get_link_layer(&ib_dev->ib_dev, ib_ah_attr->port_num) ==
+		IB_LINK_LAYER_ETHERNET;
+	if (!is_eth)
+		ib_ah_attr->sl = (path->sched_queue >> 2) & 0xf;
+	else
+		ib_ah_attr->sl = ((path->sched_queue >> 3) & 0x7) |
+				 ((path->sched_queue & 4) << 1);
+
+	ib_ah_attr->dlid	  = be16_to_cpu(path->rlid);
+	ib_ah_attr->src_path_bits = path->grh_mylmc & 0x7f;
+
+	/* A non-zero hardware rate is stored with an offset of 5. */
+	if (path->static_rate)
+		ib_ah_attr->static_rate = path->static_rate - 5;
+	else
+		ib_ah_attr->static_rate = 0;
+
+	ib_ah_attr->ah_flags = (path->grh_mylmc & (1 << 7)) ? IB_AH_GRH : 0;
+	if (!ib_ah_attr->ah_flags)
+		return;
+
+	ib_ah_attr->grh.sgid_index    = path->mgid_index;
+	ib_ah_attr->grh.hop_limit     = path->hop_limit;
+	ib_ah_attr->grh.traffic_class =
+		(be32_to_cpu(path->tclass_flowlabel) >> 20) & 0xff;
+	ib_ah_attr->grh.flow_label    =
+		be32_to_cpu(path->tclass_flowlabel) & 0xfffff;
+	memcpy(ib_ah_attr->grh.dgid.raw,
+	       path->rgid, sizeof ib_ah_attr->grh.dgid.raw);
+}
+
+/*
+ * mlx4_ib_query_qp - report a QP's current attributes.
+ *
+ * Under qp->mutex.  A QP whose cached state is RESET is answered without
+ * querying the device: only the state and the capability/init fields are
+ * filled in.  Otherwise the hardware context is fetched with
+ * mlx4_qp_query(), the cached qp->state is refreshed from it, and the
+ * context fields are decoded into *qp_attr.  *qp_init_attr receives the
+ * capabilities and creation flags.
+ *
+ * Returns 0 on success or -EINVAL if the device query fails.
+ * qp_attr_mask is not consulted.
+ */
+int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask,
+		     struct ib_qp_init_attr *qp_init_attr)
+{
+	struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
+	struct mlx4_ib_qp *qp = to_mqp(ibqp);
+	struct mlx4_qp_context context;
+	int mlx4_state;
+	int err = 0;
+
+	mutex_lock(&qp->mutex);
+
+	/* Nothing to read back for a QP in RESET; skip the device query. */
+	if (qp->state == IB_QPS_RESET) {
+		qp_attr->qp_state = IB_QPS_RESET;
+		goto done;
+	}
+
+	err = mlx4_qp_query(dev->dev, &qp->mqp, &context);
+	if (err) {
+		/* Any query failure is reported to the caller as -EINVAL. */
+		err = -EINVAL;
+		goto out;
+	}
+
+	/* The hardware state lives in the top four bits of the flags word. */
+	mlx4_state = be32_to_cpu(context.flags) >> 28;
+
+	qp->state		     = to_ib_qp_state(mlx4_state);
+	qp_attr->qp_state	     = qp->state;
+	qp_attr->path_mtu	     = context.mtu_msgmax >> 5;
+	qp_attr->path_mig_state	     =
+		to_ib_mig_state((be32_to_cpu(context.flags) >> 11) & 0x3);
+	qp_attr->qkey		     = be32_to_cpu(context.qkey);
+	/* PSNs and the remote QPN are 24-bit quantities. */
+	qp_attr->rq_psn		     = be32_to_cpu(context.rnr_nextrecvpsn) & 0xffffff;
+	qp_attr->sq_psn		     = be32_to_cpu(context.next_send_psn) & 0xffffff;
+	qp_attr->dest_qp_num	     = be32_to_cpu(context.remote_qpn) & 0xffffff;
+	qp_attr->qp_access_flags     =
+		to_ib_qp_access_flags(be32_to_cpu(context.params2));
+
+	/* Path attributes are decoded only for RC, UC and XRC QPs. */
+	if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC ||
+	    qp->ibqp.qp_type == IB_QPT_XRC) {
+		to_ib_ah_attr(dev, &qp_attr->ah_attr, &context.pri_path);
+		to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context.alt_path);
+		qp_attr->alt_pkey_index = context.alt_path.pkey_index & 0x7f;
+		qp_attr->alt_port_num	= qp_attr->alt_ah_attr.port_num;
+	}
+
+	qp_attr->pkey_index = context.pri_path.pkey_index & 0x7f;
+	/* In INIT the cached port is reported; otherwise it is decoded from the path. */
+	if (qp_attr->qp_state == IB_QPS_INIT)
+		qp_attr->port_num = qp->port;
+	else
+		qp_attr->port_num = context.pri_path.sched_queue & 0x40 ? 2 : 1;
+
+	/* qp_attr->en_sqd_async_notify is only applicable in modify qp */
+	qp_attr->sq_draining = mlx4_state == MLX4_QP_STATE_SQ_DRAINING;
+
+	/* The atomic depths are stored as a 3-bit log2 value. */
+	qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context.params1) >> 21) & 0x7);
+
+	qp_attr->max_dest_rd_atomic =
+		1 << ((be32_to_cpu(context.params2) >> 21) & 0x7);
+	qp_attr->min_rnr_timer	    =
+		(be32_to_cpu(context.rnr_nextrecvpsn) >> 24) & 0x1f;
+	qp_attr->timeout	    = context.pri_path.ackto >> 3;
+	qp_attr->retry_cnt	    = (be32_to_cpu(context.params1) >> 16) & 0x7;
+	qp_attr->rnr_retry	    = (be32_to_cpu(context.params1) >> 13) & 0x7;
+	qp_attr->alt_timeout	    = context.alt_path.ackto >> 3;
+
+done:
+	/* Fields below come from driver state and are filled in for RESET too. */
+	qp_attr->cur_qp_state	     = qp_attr->qp_state;
+	qp_attr->cap.max_recv_wr     = qp->rq.wqe_cnt;
+	qp_attr->cap.max_recv_sge    = qp->rq.max_gs;
+
+	/* Send-queue sizes are reported only for QPs without a user object. */
+	if (!ibqp->uobject) {
+		qp_attr->cap.max_send_wr  = qp->sq.wqe_cnt;
+		qp_attr->cap.max_send_sge = qp->sq.max_gs;
+	} else {
+		qp_attr->cap.max_send_wr  = 0;
+		qp_attr->cap.max_send_sge = 0;
+	}
+
+	/*
+	 * We don't support inline sends for kernel QPs (yet), and we
+	 * don't know what userspace's value should be.
+	 */
+	qp_attr->cap.max_inline_data = 0;
+
+	qp_init_attr->cap	     = qp_attr->cap;
+
+	/* Rebuild the creation flags from the driver's QP flags. */
+	qp_init_attr->create_flags = 0;
+	if (qp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK)
+		qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK;
+
+	if (qp->flags & MLX4_IB_QP_LSO)
+		qp_init_attr->create_flags |= IB_QP_CREATE_IPOIB_UD_LSO;
+
+out:
+	mutex_unlock(&qp->mutex);
+	return err;
+}
+
+int mlx4_ib_create_xrc_rcv_qp(struct ib_qp_init_attr *init_attr,
+			      u32 *qp_num)
+{	/* Create an XRC receive QP in init_attr->xrc_domain; 0 and *qp_num on success, else -errno. */
+	struct mlx4_ib_dev *dev = to_mdev(init_attr->xrc_domain->device);
+	struct mlx4_ib_xrcd *xrcd = to_mxrcd(init_attr->xrc_domain);
+	struct mlx4_ib_qp *qp;
+	struct ib_qp *ibqp;
+	struct mlx4_ib_xrc_reg_entry *ctx_entry;	/* first entry of the QP's registration list */
+	unsigned long flags;
+	int err;
+
+	if (!(dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC))
+		return -ENOSYS;	/* device does not advertise XRC */
+
+	if (init_attr->qp_type != IB_QPT_XRC)
+		return -EINVAL;	/* this entry point only creates IB_QPT_XRC QPs */
+	/* Both allocations are made before xrc_reg_mutex is taken. */
+	ctx_entry = kmalloc(sizeof *ctx_entry, GFP_KERNEL);
+	if (!ctx_entry)
+		return -ENOMEM;
+
+	qp = kzalloc(sizeof *qp, GFP_KERNEL);
+	if (!qp) {
+		kfree(ctx_entry);
+		return -ENOMEM;
+	}
+	mutex_lock(&dev->xrc_reg_mutex);
+	qp->flags = MLX4_IB_XRC_RCV;	/* the modify/query/reg/unreg entry points test this flag */
+	qp->xrcdn = to_mxrcd(init_attr->xrc_domain)->xrcdn;
+	INIT_LIST_HEAD(&qp->xrc_reg_list);
+	err = create_qp_common(dev, xrcd->pd, init_attr, NULL, 0, qp);
+	if (err) {
+		mutex_unlock(&dev->xrc_reg_mutex);
+		kfree(ctx_entry);
+		kfree(qp);
+		return err;
+	}
+
+	ibqp = &qp->ibqp;
+	/* set the ibqp attributes which will be used by the mlx4 module */
+	ibqp->qp_num = qp->mqp.qpn;
+	ibqp->device = init_attr->xrc_domain->device;
+	ibqp->pd = xrcd->pd;
+	ibqp->send_cq = ibqp->recv_cq = xrcd->cq;
+	ibqp->event_handler = init_attr->event_handler;
+	ibqp->qp_context = init_attr->qp_context;
+	ibqp->qp_type = init_attr->qp_type;
+	ibqp->xrcd = init_attr->xrc_domain;
+	/* Record the creator's qp_context as the first registration on this QP. */
+	mutex_lock(&qp->mutex);
+	ctx_entry->context = init_attr->qp_context;
+	spin_lock_irqsave(&qp->xrc_reg_list_lock, flags);
+	list_add_tail(&ctx_entry->list, &qp->xrc_reg_list);
+	spin_unlock_irqrestore(&qp->xrc_reg_list_lock, flags);
+	mutex_unlock(&qp->mutex);
+	mutex_unlock(&dev->xrc_reg_mutex);
+	*qp_num = qp->mqp.qpn;
+	return 0;
+}
+
+int mlx4_ib_modify_xrc_rcv_qp(struct ib_xrcd *ibxrcd, u32 qp_num,
+			      struct ib_qp_attr *attr, int attr_mask)
+{	/* Modify XRC receive QP qp_num on behalf of XRC domain ibxrcd; returns 0 or -errno. */
+	struct mlx4_ib_dev *dev = to_mdev(ibxrcd->device);
+	struct mlx4_ib_xrcd *xrcd = to_mxrcd(ibxrcd);
+	struct mlx4_qp *mqp;
+	struct mlx4_ib_qp *mibqp;
+	int err = -EINVAL;	/* unknown QPN, or QP is not an XRC receive QP of this domain */
+
+	if (!(dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC))
+		return -ENOSYS;	/* device does not advertise XRC */
+
+	/* xrc_reg_mutex is held across the lookup, the checks and the modify. */
+	mutex_lock(&dev->xrc_reg_mutex);
+	mqp = mlx4_qp_lookup_lock(dev->dev, qp_num);
+	if (unlikely(!mqp)) {
+		printk(KERN_WARNING "mlx4_ib_modify_xrc_rcv_qp: "
+		       "unknown QPN %06x\n", qp_num);
+		goto out;
+	}
+
+	mibqp = to_mibqp(mqp);
+	/* Only an XRC receive QP bound to the caller's XRC domain may be modified. */
+	if (!(mibqp->flags & MLX4_IB_XRC_RCV) || !mibqp->ibqp.xrcd ||
+	    xrcd->xrcdn != to_mxrcd(mibqp->ibqp.xrcd)->xrcdn)
+		goto out;
+
+	/* The modify result is returned unchanged through the common exit. */
+	err = mlx4_ib_modify_qp(&mibqp->ibqp, attr, attr_mask, NULL);
+
+out:	/* reached on success and on every failure after the lock was taken */
+	mutex_unlock(&dev->xrc_reg_mutex);
+	return err;
+}
+
+int mlx4_ib_query_xrc_rcv_qp(struct ib_xrcd *ibxrcd, u32 qp_num,
+			     struct ib_qp_attr *qp_attr, int qp_attr_mask,
+			     struct ib_qp_init_attr *qp_init_attr)
+{	/* Report the attributes of XRC receive QP qp_num; qp_attr_mask is not consulted. 0 or -errno. */
+	struct mlx4_ib_dev *dev = to_mdev(ibxrcd->device);
+	struct mlx4_ib_xrcd *xrcd = to_mxrcd(ibxrcd);
+	struct mlx4_ib_qp *qp;
+	struct mlx4_qp *mqp;
+	struct mlx4_qp_context context;	/* QP context filled in by mlx4_qp_query() */
+	int mlx4_state;
+	int err = -EINVAL;	/* unknown QPN, or QP is not an XRC receive QP of this domain */
+
+	if (!(dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC))
+		return -ENOSYS;	/* device does not advertise XRC */
+
+	mutex_lock(&dev->xrc_reg_mutex);
+	mqp = mlx4_qp_lookup_lock(dev->dev, qp_num);
+	if (unlikely(!mqp)) {
+		printk(KERN_WARNING "mlx4_ib_query_xrc_rcv_qp: "
+		       "unknown QPN %06x\n", qp_num);
+		goto err_out;
+	}
+
+	qp = to_mibqp(mqp);
+	if (!(qp->flags & MLX4_IB_XRC_RCV) || !(qp->ibqp.xrcd) ||
+	    xrcd->xrcdn != to_mxrcd(qp->ibqp.xrcd)->xrcdn)
+		goto err_out;
+	/* A QP the driver still tracks as RESET is reported without querying the device. */
+	if (qp->state == IB_QPS_RESET) {
+		qp_attr->qp_state = IB_QPS_RESET;
+		goto done;
+	}
+
+	err = mlx4_qp_query(dev->dev, mqp, &context);
+	if (err)
+		goto err_out;
+
+	mlx4_state = be32_to_cpu(context.flags) >> 28;	/* state is the top four bits of flags */
+
+	qp_attr->qp_state = to_ib_qp_state(mlx4_state);
+	qp_attr->path_mtu = context.mtu_msgmax >> 5;
+	qp_attr->path_mig_state =
+		to_ib_mig_state((be32_to_cpu(context.flags) >> 11) & 0x3);
+	qp_attr->qkey = be32_to_cpu(context.qkey);
+	qp_attr->rq_psn = be32_to_cpu(context.rnr_nextrecvpsn) & 0xffffff;
+	qp_attr->sq_psn = be32_to_cpu(context.next_send_psn) & 0xffffff;
+	qp_attr->dest_qp_num = be32_to_cpu(context.remote_qpn) & 0xffffff;
+	qp_attr->qp_access_flags =
+		to_ib_qp_access_flags(be32_to_cpu(context.params2));
+	/* Primary and alternate path attributes are only decoded for RC, UC and XRC QPs. */
+	if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC ||
+	    qp->ibqp.qp_type == IB_QPT_XRC) {
+		to_ib_ah_attr(dev, &qp_attr->ah_attr, &context.pri_path);
+		to_ib_ah_attr(dev, &qp_attr->alt_ah_attr,
+			      &context.alt_path);
+		qp_attr->alt_pkey_index = context.alt_path.pkey_index & 0x7f;
+		qp_attr->alt_port_num	= qp_attr->alt_ah_attr.port_num;
+	}
+
+	qp_attr->pkey_index = context.pri_path.pkey_index & 0x7f;
+	if (qp_attr->qp_state == IB_QPS_INIT)
+		qp_attr->port_num = qp->port;	/* in INIT the port comes from driver state */
+	else
+		qp_attr->port_num = context.pri_path.sched_queue & 0x40 ? 2 : 1;
+
+	/* qp_attr->en_sqd_async_notify is only applicable in modify qp */
+	qp_attr->sq_draining = mlx4_state == MLX4_QP_STATE_SQ_DRAINING;
+	/* Both rd_atomic limits are stored as a log2 value in a three-bit field. */
+	qp_attr->max_rd_atomic =
+		1 << ((be32_to_cpu(context.params1) >> 21) & 0x7);
+
+	qp_attr->max_dest_rd_atomic =
+		1 << ((be32_to_cpu(context.params2) >> 21) & 0x7);
+	qp_attr->min_rnr_timer =
+		(be32_to_cpu(context.rnr_nextrecvpsn) >> 24) & 0x1f;
+	qp_attr->timeout = context.pri_path.ackto >> 3;
+	qp_attr->retry_cnt = (be32_to_cpu(context.params1) >> 16) & 0x7;
+	qp_attr->rnr_retry = (be32_to_cpu(context.params1) >> 13) & 0x7;
+	qp_attr->alt_timeout = context.alt_path.ackto >> 3;
+
+done:	/* all work-queue capabilities of an XRC receive QP are reported as zero */
+	qp_attr->cur_qp_state	     = qp_attr->qp_state;
+	qp_attr->cap.max_recv_wr     = 0;
+	qp_attr->cap.max_recv_sge    = 0;
+	qp_attr->cap.max_send_wr     = 0;
+	qp_attr->cap.max_send_sge    = 0;
+	qp_attr->cap.max_inline_data = 0;
+	qp_init_attr->cap	     = qp_attr->cap;
+
+	mutex_unlock(&dev->xrc_reg_mutex);
+	return 0;
+
+err_out:
+	mutex_unlock(&dev->xrc_reg_mutex);
+	return err;
+}
+
+int mlx4_ib_reg_xrc_rcv_qp(struct ib_xrcd *xrcd, void *context, u32 qp_num)
+{
+	/* Add context to the registration list of XRC receive QP qp_num; returns 0 or -errno. */
+	struct mlx4_ib_xrcd *mxrcd = to_mxrcd(xrcd);
+
+	struct mlx4_qp *mqp;
+	struct mlx4_ib_qp *mibqp;
+	struct mlx4_ib_xrc_reg_entry *ctx_entry, *tmp;
+	unsigned long flags;
+	int err = -EINVAL;	/* unknown QPN, or QP is not an XRC receive QP of this domain */
+
+	mutex_lock(&to_mdev(xrcd->device)->xrc_reg_mutex);
+	mqp = mlx4_qp_lookup_lock(to_mdev(xrcd->device)->dev, qp_num);
+	if (unlikely(!mqp)) {
+		printk(KERN_WARNING "mlx4_ib_reg_xrc_rcv_qp: "
+		       "unknown QPN %06x\n", qp_num);
+		goto err_out;
+	}
+
+	mibqp = to_mibqp(mqp);
+	/* The QP must be an XRC receive QP that belongs to the caller's XRC domain. */
+	if (!(mibqp->flags & MLX4_IB_XRC_RCV) || !(mibqp->ibqp.xrcd) ||
+	    mxrcd->xrcdn != to_mxrcd(mibqp->ibqp.xrcd)->xrcdn)
+		goto err_out;
+	/* Allocated up front; freed again below if context turns out to be registered already. */
+	ctx_entry = kmalloc(sizeof *ctx_entry, GFP_KERNEL);
+	if (!ctx_entry) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+
+	mutex_lock(&mibqp->mutex);
+	list_for_each_entry(tmp, &mibqp->xrc_reg_list, list)
+		if (tmp->context == context) {	/* already registered: succeed without a duplicate */
+			mutex_unlock(&mibqp->mutex);
+			kfree(ctx_entry);
+			mutex_unlock(&to_mdev(xrcd->device)->xrc_reg_mutex);
+			return 0;
+		}
+
+	ctx_entry->context = context;
+	spin_lock_irqsave(&mibqp->xrc_reg_list_lock, flags);	/* list updates also take the list spinlock */
+	list_add_tail(&ctx_entry->list, &mibqp->xrc_reg_list);
+	spin_unlock_irqrestore(&mibqp->xrc_reg_list_lock, flags);
+	mutex_unlock(&mibqp->mutex);
+	mutex_unlock(&to_mdev(xrcd->device)->xrc_reg_mutex);
+	return 0;
+
+err_out:
+	mutex_unlock(&to_mdev(xrcd->device)->xrc_reg_mutex);
+	return err;
+}
+
+int mlx4_ib_unreg_xrc_rcv_qp(struct ib_xrcd *xrcd, void *context, u32 qp_num)
+{
+	/* Drop context's registration on XRC receive QP qp_num; the QP is destroyed with its last one. */
+	struct mlx4_ib_xrcd *mxrcd = to_mxrcd(xrcd);
+
+	struct mlx4_qp *mqp;
+	struct mlx4_ib_qp *mibqp;
+	struct mlx4_ib_xrc_reg_entry *ctx_entry, *tmp;
+	unsigned long flags;
+	int found = 0;	/* set once the entry for context has been unlinked */
+	int err = -EINVAL;	/* unknown QPN, wrong QP kind/domain, or context not registered */
+
+	mutex_lock(&to_mdev(xrcd->device)->xrc_reg_mutex);
+	mqp = mlx4_qp_lookup_lock(to_mdev(xrcd->device)->dev, qp_num);
+	if (unlikely(!mqp)) {
+		printk(KERN_WARNING "mlx4_ib_unreg_xrc_rcv_qp: "
+		       "unknown QPN %06x\n", qp_num);
+		goto err_out;
+	}
+
+	mibqp = to_mibqp(mqp);
+	/* Unlike the other entry points, the domain is matched against the low 16 bits of qp->xrcdn. */
+	if (!(mibqp->flags & MLX4_IB_XRC_RCV) ||
+	    mxrcd->xrcdn != (mibqp->xrcdn & 0xffff))
+		goto err_out;
+
+	mutex_lock(&mibqp->mutex);
+	spin_lock_irqsave(&mibqp->xrc_reg_list_lock, flags);
+	list_for_each_entry_safe(ctx_entry, tmp, &mibqp->xrc_reg_list, list)
+		if (ctx_entry->context == context) {
+			found = 1;
+			list_del(&ctx_entry->list);
+			spin_unlock_irqrestore(&mibqp->xrc_reg_list_lock, flags);	/* dropped before kfree() */
+			kfree(ctx_entry);
+			break;
+		}
+	/* When an entry was found the spinlock has already been released inside the loop. */
+	if (!found)
+		spin_unlock_irqrestore(&mibqp->xrc_reg_list_lock, flags);
+	mutex_unlock(&mibqp->mutex);
+	if (!found)
+		goto err_out;
+
+	/* destroy the QP if the registration list is empty */
+	if (list_empty(&mibqp->xrc_reg_list))
+		mlx4_ib_destroy_qp(&mibqp->ibqp);	/* return value is not checked */
+
+	mutex_unlock(&to_mdev(xrcd->device)->xrc_reg_mutex);
+	return 0;
+
+err_out:
+	mutex_unlock(&to_mdev(xrcd->device)->xrc_reg_mutex);
+	return err;
+}
+
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/srq.c b/sys/ofed/drivers/infiniband/hw/mlx4/srq.c
new file mode 100644
index 0000000..90918c7
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/srq.c
@@ -0,0 +1,401 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx4/qp.h>
+#include <linux/mlx4/srq.h>
+
+#include "mlx4_ib.h"
+#include "user.h"
+
+static void *get_wqe(struct mlx4_ib_srq *srq, int n)
+{	/* Address of WQE n: byte offset (n << wqe_shift) into the SRQ buffer. */
+	return mlx4_buf_offset(&srq->buf, n << srq->msrq.wqe_shift);
+}
+
+static void mlx4_ib_srq_event(struct mlx4_srq *srq, enum mlx4_event type)
+{	/* SRQ event callback: translate an mlx4 event and pass it to the consumer's handler. */
+	struct ib_srq *ibsrq = &to_mibsrq(srq)->ibsrq;
+	struct ib_event ev;
+
+	if (!ibsrq->event_handler)
+		return;	/* no handler registered: nothing to deliver */
+
+	switch (type) {
+	case MLX4_EVENT_TYPE_SRQ_LIMIT:
+		ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
+		break;
+	case MLX4_EVENT_TYPE_SRQ_CATAS_ERROR:
+		ev.event = IB_EVENT_SRQ_ERR;
+		break;
+	default:
+		printk(KERN_WARNING "mlx4_ib: Unexpected event type %d "
+		       "on SRQ %06x\n", type, srq->srqn);
+		return;
+	}
+	ev.device      = ibsrq->device;
+	ev.element.srq = ibsrq;
+	ibsrq->event_handler(&ev, ibsrq->srq_context);
+}
+
+struct ib_srq *mlx4_ib_create_xrc_srq(struct ib_pd *pd,
+				      struct ib_cq *xrc_cq,
+				      struct ib_xrcd *xrcd,
+				      struct ib_srq_init_attr *init_attr,
+				      struct ib_udata *udata)
+{	/* Create an SRQ on pd (xrc_cq and xrcd may be NULL); returns the SRQ or an ERR_PTR(). */
+	struct mlx4_ib_dev *dev = to_mdev(pd->device);
+	struct mlx4_ib_srq *srq;
+	struct mlx4_wqe_srq_next_seg *next;
+	u32	cqn;
+	u16	xrcdn;
+	int desc_size, buf_size;
+	int err, i;
+
+	/* Sanity check SRQ size before proceeding */
+	if (init_attr->attr.max_wr  >= dev->dev->caps.max_srq_wqes ||
+	    init_attr->attr.max_sge >  dev->dev->caps.max_srq_sge) {
+		mlx4_ib_dbg("a size param is out of range. "
+			    "max_wr = 0x%x, max_sge = 0x%x",
+			    init_attr->attr.max_wr, init_attr->attr.max_sge);
+		return ERR_PTR(-EINVAL);
+	}
+
+	srq = kzalloc(sizeof *srq, GFP_KERNEL);
+	if (!srq)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&srq->mutex);
+	spin_lock_init(&srq->lock);
+	srq->msrq.max    = roundup_pow_of_two(init_attr->attr.max_wr + 1);
+	srq->msrq.max_gs = init_attr->attr.max_sge;
+	/* A WQE is the next-segment header plus max_gs data segments: a power of two, at least 32 bytes. */
+	desc_size = max(32UL,
+			roundup_pow_of_two(sizeof (struct mlx4_wqe_srq_next_seg) +
+					   srq->msrq.max_gs *
+					   sizeof (struct mlx4_wqe_data_seg)));
+	srq->msrq.wqe_shift = ilog2(desc_size);
+	buf_size = srq->msrq.max * desc_size;
+
+	if (pd->uobject) {	/* userspace SRQ: buffer and doorbell addresses arrive in ucmd */
+		struct mlx4_ib_create_srq ucmd;
+
+		if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
+			err = -EFAULT;
+			goto err_srq;
+		}
+
+		srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr,
+					buf_size, 0, 0);
+		if (IS_ERR(srq->umem)) {
+			err = PTR_ERR(srq->umem);
+			goto err_srq;
+		}
+
+		err = mlx4_mtt_init(dev->dev, ib_umem_page_count(srq->umem),
+				    ilog2(srq->umem->page_size), &srq->mtt);
+		if (err)
+			goto err_buf;
+
+		err = mlx4_ib_umem_write_mtt(dev, &srq->mtt, srq->umem);
+		if (err)
+			goto err_mtt;
+
+		err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context),
+					  ucmd.db_addr, &srq->db);
+		if (err)
+			goto err_mtt;
+	} else {	/* kernel SRQ: the driver allocates doorbell, WQE buffer and wrid array itself */
+		struct mlx4_wqe_data_seg *scatter;
+
+		err = mlx4_db_alloc(dev->dev, &srq->db, 0);
+		if (err)
+			goto err_srq;
+
+		*srq->db.db = 0;
+
+		if (mlx4_buf_alloc(dev->dev, buf_size, PAGE_SIZE * 2, &srq->buf)) {
+			err = -ENOMEM;
+			goto err_db;
+		}
+
+		srq->head    = 0;
+		srq->tail    = srq->msrq.max - 1;
+		srq->wqe_ctr = 0;
+		/* Chain each WQE to its successor (wrapping) and mark every scatter entry invalid. */
+		for (i = 0; i < srq->msrq.max; ++i) {
+			next = get_wqe(srq, i);
+			next->next_wqe_index =
+				cpu_to_be16((i + 1) & (srq->msrq.max - 1));
+			for (scatter = (void *) (next + 1);
+			     (void *) scatter < (void *) next + desc_size;
+			     ++scatter)
+				scatter->lkey = cpu_to_be32(MLX4_INVALID_LKEY);
+		}
+
+		err = mlx4_mtt_init(dev->dev, srq->buf.npages, srq->buf.page_shift,
+				    &srq->mtt);
+		if (err)
+			goto err_buf;
+
+		err = mlx4_buf_write_mtt(dev->dev, &srq->mtt, &srq->buf);
+		if (err)
+			goto err_mtt;
+
+		srq->wrid = kmalloc(srq->msrq.max * sizeof (u64), GFP_KERNEL);
+		if (!srq->wrid) {
+			err = -ENOMEM;
+			goto err_mtt;
+		}
+	}
+
+	cqn = xrc_cq ? (u32) (to_mcq(xrc_cq)->mcq.cqn) : 0;
+	xrcdn = xrcd ? (u16) (to_mxrcd(xrcd)->xrcdn) :
+		(u16) dev->dev->caps.reserved_xrcds;
+
+	err = mlx4_srq_alloc(dev->dev, to_mpd(pd)->pdn, cqn, xrcdn, &srq->mtt,
+			     srq->db.dma, &srq->msrq);
+	if (err)
+		goto err_wrid;
+
+	srq->msrq.event = mlx4_ib_srq_event;
+	if (pd->uobject) {
+		if (ib_copy_to_udata(udata, &srq->msrq.srqn, sizeof (__u32))) {
+			err = -EFAULT;
+			goto err_srq_hw;
+		}
+	} else
+		srq->ibsrq.xrc_srq_num = srq->msrq.srqn;
+
+	init_attr->attr.max_wr = srq->msrq.max - 1;
+	return &srq->ibsrq;
+
+err_srq_hw:
+	/* mlx4_srq_alloc() succeeded: release the SRQ the same way mlx4_ib_destroy_srq() does. */
+	mlx4_srq_invalidate(dev->dev, &srq->msrq);
+	mlx4_srq_remove(dev->dev, &srq->msrq);
+	mlx4_srq_free(dev->dev, &srq->msrq);
+
+err_wrid:
+	if (pd->uobject)
+		mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &srq->db);
+	else
+		kfree(srq->wrid);
+
+err_mtt:
+	mlx4_mtt_cleanup(dev->dev, &srq->mtt);
+
+err_buf:
+	if (pd->uobject)
+		ib_umem_release(srq->umem);
+	else
+		mlx4_buf_free(dev->dev, buf_size, &srq->buf);
+
+err_db:
+	if (!pd->uobject)
+		mlx4_db_free(dev->dev, &srq->db);
+
+err_srq:
+	kfree(srq);
+
+	return ERR_PTR(err);
+}
+
+int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+		       enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
+{	/* Apply attr to the SRQ; only the limit can be changed (udata is unused). 0 or -errno. */
+	struct mlx4_ib_dev *dev = to_mdev(ibsrq->device);
+	struct mlx4_ib_srq *srq = to_msrq(ibsrq);
+	int rc;
+
+	/* Resizing is not implemented: refuse any request that touches max_wr. */
+	if (attr_mask & IB_SRQ_MAX_WR) {
+		mlx4_ib_dbg("resize not yet supported");
+		return -EINVAL;
+	}
+
+	/* Without a limit change there is nothing left to do. */
+	if (!(attr_mask & IB_SRQ_LIMIT))
+		return 0;
+
+	if (attr->srq_limit >= srq->msrq.max) {
+		mlx4_ib_dbg("limit (0x%x) too high", attr->srq_limit);
+		return -EINVAL;
+	}
+
+	/* Arm the new limit while holding the SRQ mutex. */
+	mutex_lock(&srq->mutex);
+	rc = mlx4_srq_arm(dev->dev, &srq->msrq, attr->srq_limit);
+	mutex_unlock(&srq->mutex);
+
+	return rc;
+}
+
+struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
+				  struct ib_srq_init_attr *init_attr,
+				  struct ib_udata *udata)
+{	/* Plain SRQ: forwards to the XRC variant with no XRC CQ and no XRC domain. */
+	return mlx4_ib_create_xrc_srq(pd, NULL, NULL, init_attr, udata);
+}
+
+int mlx4_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr)
+{	/* Fill *srq_attr with the SRQ's current limit and its sizes; returns 0 or the query error. */
+	struct mlx4_ib_srq *srq = to_msrq(ibsrq);
+	int watermark;
+	int rc;
+
+	/* The current limit watermark is obtained from the mlx4 core. */
+	rc = mlx4_srq_query(to_mdev(ibsrq->device)->dev, &srq->msrq, &watermark);
+	if (rc != 0)
+		return rc;
+
+	/* The sizes come from the values recorded when the SRQ was created. */
+	srq_attr->max_sge   = srq->msrq.max_gs;
+	srq_attr->max_wr    = srq->msrq.max - 1;
+	srq_attr->srq_limit = watermark;
+	return 0;
+}
+
+int mlx4_ib_destroy_srq(struct ib_srq *srq)
+{	/* Tear down an SRQ and release its resources; always returns 0. */
+	struct mlx4_ib_dev *dev = to_mdev(srq->device);
+	struct mlx4_ib_srq *msrq = to_msrq(srq);
+	struct mlx4_ib_cq *cq;
+
+	mlx4_srq_invalidate(dev->dev, &msrq->msrq);
+
+	if (srq->xrc_cq && !srq->uobject) {	/* kernel SRQ with an XRC CQ attached */
+		cq = to_mcq(srq->xrc_cq);
+		spin_lock_irq(&cq->lock);	/* CQ clean-up and SRQ removal happen under the CQ lock */
+		__mlx4_ib_cq_clean(cq, -1, msrq);
+		mlx4_srq_remove(dev->dev, &msrq->msrq);
+		spin_unlock_irq(&cq->lock);
+	} else
+		mlx4_srq_remove(dev->dev, &msrq->msrq);
+
+	mlx4_srq_free(dev->dev, &msrq->msrq);
+	mlx4_mtt_cleanup(dev->dev, &msrq->mtt);
+
+	if (srq->uobject) {	/* userspace SRQ: unmap the doorbell and release the user buffer */
+		mlx4_ib_db_unmap_user(to_mucontext(srq->uobject->context), &msrq->db);
+		ib_umem_release(msrq->umem);
+	} else {	/* kernel SRQ: free the wrid array, the WQE buffer and the doorbell */
+		kfree(msrq->wrid);
+		mlx4_buf_free(dev->dev, msrq->msrq.max << msrq->msrq.wqe_shift,
+			      &msrq->buf);
+		mlx4_db_free(dev->dev, &msrq->db);
+	}
+
+	kfree(msrq);
+
+	return 0;
+}
+
+void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index)
+{	/* Return WQE wqe_index to the SRQ free list by linking it after the current tail. */
+	struct mlx4_wqe_srq_next_seg *next;
+
+	/* Callers always have interrupts disabled, so the plain spin_lock() is used. */
+	spin_lock(&srq->lock);
+
+	next = get_wqe(srq, srq->tail);
+	next->next_wqe_index = cpu_to_be16(wqe_index);	/* old tail now points at the freed WQE */
+	srq->tail = wqe_index;	/* the freed WQE becomes the new tail */
+
+	spin_unlock(&srq->lock);
+}
+
+int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+			  struct ib_recv_wr **bad_wr)
+{	/* Post the chain of receive WRs starting at wr; on error *bad_wr is the first WR not posted. */
+	struct mlx4_ib_srq *srq = to_msrq(ibsrq);
+	struct mlx4_wqe_srq_next_seg *next;
+	struct mlx4_wqe_data_seg *scat;
+	unsigned long flags;
+	int err = 0;
+	int nreq;	/* number of WRs written into the SRQ so far */
+	int i;
+
+	spin_lock_irqsave(&srq->lock, flags);
+
+	for (nreq = 0; wr; ++nreq, wr = wr->next) {
+		if (unlikely(wr->num_sge > srq->msrq.max_gs)) {
+			mlx4_ib_dbg("srq num 0x%x: num s/g entries too large (%d)",
+				    srq->msrq.srqn, wr->num_sge);
+			err = -EINVAL;
+			*bad_wr = wr;
+			break;
+		}
+
+		if (unlikely(srq->head == srq->tail)) {	/* no free WQE left */
+			mlx4_ib_dbg("srq num 0x%x: No entries available to post.",
+				    srq->msrq.srqn);
+			err = -ENOMEM;
+			*bad_wr = wr;
+			break;
+		}
+
+		srq->wrid[srq->head] = wr->wr_id;
+		/* Take the WQE at head off the free list; its data segments follow the header. */
+		next      = get_wqe(srq, srq->head);
+		srq->head = be16_to_cpu(next->next_wqe_index);
+		scat      = (struct mlx4_wqe_data_seg *) (next + 1);
+
+		for (i = 0; i < wr->num_sge; ++i) {
+			scat[i].byte_count = cpu_to_be32(wr->sg_list[i].length);
+			scat[i].lkey       = cpu_to_be32(wr->sg_list[i].lkey);
+			scat[i].addr       = cpu_to_be64(wr->sg_list[i].addr);
+		}
+
+		if (i < srq->msrq.max_gs) {	/* end a short scatter list with an invalid-lkey entry */
+			scat[i].byte_count = 0;
+			scat[i].lkey       = cpu_to_be32(MLX4_INVALID_LKEY);
+			scat[i].addr       = 0;
+		}
+	}
+
+	if (likely(nreq)) {	/* WRs written before an error are still published below */
+		srq->wqe_ctr += nreq;
+
+		/*
+		 * Make sure that descriptors are written before
+		 * doorbell record.
+		 */
+		wmb();
+
+		*srq->db.db = cpu_to_be32(srq->wqe_ctr);
+	}
+
+	spin_unlock_irqrestore(&srq->lock, flags);
+
+	return err;
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/user.h b/sys/ofed/drivers/infiniband/hw/mlx4/user.h
new file mode 100644
index 0000000..13beede
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/user.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_IB_USER_H
+#define MLX4_IB_USER_H
+
+#include <linux/types.h>
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define MLX4_IB_UVERBS_ABI_VERSION	3
+
+/*
+ * Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ * In particular do not use pointer types -- pass pointers in __u64
+ * instead.
+ */
+
+struct mlx4_ib_alloc_ucontext_resp {	/* presumably the alloc-ucontext reply; its user is not visible here */
+	__u32	qp_tab_size;
+	__u16	bf_reg_size;
+	__u16	bf_regs_per_page;
+};	/* 4 + 2 + 2 bytes, so no implicit padding on any architecture */
+
+struct mlx4_ib_alloc_pd_resp {	/* presumably the alloc-PD reply; its user is not visible here */
+	__u32	pdn;
+	__u32	reserved;	/* keeps the struct at eight bytes */
+};
+
+struct mlx4_ib_create_cq {	/* presumably the create-CQ command; its user is not visible here */
+	__u64	buf_addr;	/* carried as __u64 rather than a pointer type, per the layout rule above */
+	__u64	db_addr;
+};
+
+struct mlx4_ib_create_cq_resp {	/* presumably the create-CQ reply; its user is not visible here */
+	__u32	cqn;
+	__u32	reserved;	/* keeps the struct at eight bytes */
+};
+
+struct mlx4_ib_resize_cq {	/* presumably the resize-CQ command; its user is not visible here */
+	__u64	buf_addr;	/* carried as __u64 rather than a pointer type, per the layout rule above */
+};
+
+struct mlx4_ib_create_srq {	/* create-SRQ command read from udata by mlx4_ib_create_xrc_srq() */
+	__u64	buf_addr;	/* user address of the SRQ WQE buffer (handed to ib_umem_get()) */
+	__u64	db_addr;	/* user address of the doorbell record (handed to mlx4_ib_db_map_user()) */
+};
+
+struct mlx4_ib_create_srq_resp {	/* presumably matches the __u32 SRQ number copied to udata at creation -- TODO confirm */
+	__u32	srqn;
+	__u32	reserved;	/* keeps the struct at eight bytes */
+};
+
+struct mlx4_ib_create_qp {	/* presumably the create-QP command; its user is not visible here */
+	__u64	buf_addr;	/* carried as __u64 rather than a pointer type, per the layout rule above */
+	__u64	db_addr;
+	__u8	log_sq_bb_count;
+	__u8	log_sq_stride;
+	__u8	sq_no_prefetch;
+	__u8	reserved[5];	/* pads the three __u8 fields out to eight bytes */
+};
+
+#endif /* MLX4_IB_USER_H */
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/wc.c b/sys/ofed/drivers/infiniband/hw/mlx4/wc.c
new file mode 100644
index 0000000..827de14
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/wc.c
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2006-2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/pci.h>
+#include "wc.h"
+
+#if defined(__i386__) || defined(__x86_64__)	/* x86: use the generic write-combine helper */
+
+pgprot_t pgprot_wc(pgprot_t _prot)
+{
+	return pgprot_writecombine(_prot);
+}
+
+int mlx4_wc_enabled(void)
+{
+	return 1;
+}
+
+#elif defined(CONFIG_PPC64)
+
+pgprot_t pgprot_wc(pgprot_t _prot)
+{	/* Set _PAGE_NO_CACHE and clear _PAGE_GUARDED in the protection bits. */
+	return __pgprot((pgprot_val(_prot) | _PAGE_NO_CACHE) &
+				     ~(pgprot_t)_PAGE_GUARDED);
+}
+
+int mlx4_wc_enabled(void)
+{
+	return 1;
+}
+
+#else	/* neither x86 nor CONFIG_PPC64 */
+
+pgprot_t pgprot_wc(pgprot_t _prot)
+{	/* No write-combining variant here: fall back to a non-cached mapping. */
+	return pgprot_noncached(_prot);
+}
+
+int mlx4_wc_enabled(void)
+{
+	return 0;	/* callers are told that write combining is not in effect */
+}
+
+#endif
+
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/wc.h b/sys/ofed/drivers/infiniband/hw/mlx4/wc.h
new file mode 100644
index 0000000..f32fe1e
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/wc.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2006-2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef mlx4_WC_H
+#define mlx4_WC_H
+
+#include <asm/pgtable.h>
+
+int mlx4_wc_enabled(void);	/* 1 on x86 and CONFIG_PPC64 builds, 0 elsewhere (see wc.c) */
+pgprot_t pgprot_wc(pgprot_t _prot);	/* page protection derived from _prot for the build's architecture (see wc.c) */
+
+#endif
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/Kconfig b/sys/ofed/drivers/infiniband/hw/mthca/Kconfig
new file mode 100644
index 0000000..03efc07
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/Kconfig
@@ -0,0 +1,17 @@
+config INFINIBAND_MTHCA
+	tristate "Mellanox HCA support"
+	depends on PCI
+	---help---
+	  This is a low-level driver for Mellanox InfiniHost host
+	  channel adapters (HCAs), including the MT23108 PCI-X HCA
+	  ("Tavor") and the MT25208 PCI Express HCA ("Arbel").
+
+config INFINIBAND_MTHCA_DEBUG
+	bool "Verbose debugging output" if EMBEDDED
+	depends on INFINIBAND_MTHCA
+	default y
+	---help---
+	  This option causes debugging code to be compiled into the
+	  mthca driver.  The output can be turned on via the
+	  debug_level module parameter (which can also be set after
+	  the driver is loaded through sysfs).
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/Makefile b/sys/ofed/drivers/infiniband/hw/mthca/Makefile
new file mode 100644
index 0000000..e388d95
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/Makefile
@@ -0,0 +1,7 @@
+obj-$(CONFIG_INFINIBAND_MTHCA) += ib_mthca.o
+
+ib_mthca-y :=	mthca_main.o mthca_cmd.o mthca_profile.o mthca_reset.o \
+		mthca_allocator.o mthca_eq.o mthca_pd.o mthca_cq.o \
+		mthca_mr.o mthca_qp.o mthca_av.o mthca_mcg.o mthca_mad.o \
+		mthca_provider.o mthca_memfree.o mthca_uar.o mthca_srq.o \
+		mthca_catas.o
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_allocator.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_allocator.c
new file mode 100644
index 0000000..c5ccc2d
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_allocator.c
@@ -0,0 +1,301 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/bitmap.h>
+
+#include "mthca_dev.h"
+
+/* Trivial bitmap-based allocator: returns an object number, or -1 if full */
+u32 mthca_alloc(struct mthca_alloc *alloc)
+{
+	unsigned long irq_flags;
+	u32 slot;
+
+	spin_lock_irqsave(&alloc->lock, irq_flags);
+	/* Search from the 'last' hint; on a miss advance 'top' and rescan. */
+	slot = find_next_zero_bit(alloc->table, alloc->max, alloc->last);
+	if (slot >= alloc->max) {
+		alloc->top = (alloc->top + alloc->max) & alloc->mask;
+		slot = find_first_zero_bit(alloc->table, alloc->max);
+	}
+
+	if (slot >= alloc->max) {
+		slot = -1;
+	} else {
+		set_bit(slot, alloc->table);
+		slot |= alloc->top;
+	}
+	spin_unlock_irqrestore(&alloc->lock, irq_flags);
+
+	return slot;
+}
+
+void mthca_free(struct mthca_alloc *alloc, u32 obj)
+{
+	unsigned long irq_flags;
+	/* Strip the 'top' bits so only the bitmap index remains. */
+	u32 slot = obj & (alloc->max - 1);
+
+	spin_lock_irqsave(&alloc->lock, irq_flags);
+
+	clear_bit(slot, alloc->table);
+	alloc->last = min(alloc->last, slot);
+	alloc->top = (alloc->top + alloc->max) & alloc->mask;
+
+	spin_unlock_irqrestore(&alloc->lock, irq_flags);
+}
+
+int mthca_alloc_init(struct mthca_alloc *alloc, u32 num, u32 mask,
+		     u32 reserved)
+{
+	u32 i;
+
+	/* num must be a nonzero power of 2 (ffs(0) - 1 would shift by -1) */
+	if (!num || (num & (num - 1)))
+		return -EINVAL;
+
+	alloc->last = 0;
+	alloc->top  = 0;
+	alloc->max  = num;
+	alloc->mask = mask;
+	spin_lock_init(&alloc->lock);
+	alloc->table = kmalloc(BITS_TO_LONGS(num) * sizeof (long),
+			       GFP_KERNEL);
+	if (!alloc->table)
+		return -ENOMEM;
+
+	bitmap_zero(alloc->table, num);
+	for (i = 0; i < reserved; ++i)
+		set_bit(i, alloc->table);
+
+	return 0;
+}
+
+void mthca_alloc_cleanup(struct mthca_alloc *alloc)
+{
+	kfree(alloc->table);	/* bitmap allocated by mthca_alloc_init() */
+}
+
+/*
+ * Array of pointers with lazy allocation of leaf pages.  Callers of
+ * _get, _set and _clear methods must use a lock or otherwise
+ * serialize access to the array.
+ */
+
+#define MTHCA_ARRAY_MASK (PAGE_SIZE / sizeof (void *) - 1)
+
+void *mthca_array_get(struct mthca_array *array, int index)
+{
+	/* Which leaf page holds this index. */
+	int leaf = (index * sizeof (void *)) >> PAGE_SHIFT;
+	void **slots = array->page_list[leaf].page;
+
+	/* Leaf pages are allocated lazily; a missing page means no entry. */
+	return slots ? slots[index & MTHCA_ARRAY_MASK] : NULL;
+}
+
+int mthca_array_set(struct mthca_array *array, int index, void *value)
+{
+	int leaf = (index * sizeof (void *)) >> PAGE_SHIFT;
+
+	if (!array->page_list[leaf].page) {
+		/* GFP_ATOMIC: callers invoke this with locks held. */
+		array->page_list[leaf].page =
+			(void **) get_zeroed_page(GFP_ATOMIC);
+		if (!array->page_list[leaf].page)
+			return -ENOMEM;
+	}
+
+	array->page_list[leaf].page[index & MTHCA_ARRAY_MASK] = value;
+	array->page_list[leaf].used++;
+	return 0;
+}
+
+void mthca_array_clear(struct mthca_array *array, int index)
+{
+	int leaf = (index * sizeof (void *)) >> PAGE_SHIFT;
+
+	if (--array->page_list[leaf].used != 0) {
+		array->page_list[leaf].page[index & MTHCA_ARRAY_MASK] = NULL;
+	} else {	/* use count hit zero: release the leaf page */
+		free_page((unsigned long) array->page_list[leaf].page);
+		array->page_list[leaf].page = NULL;
+	}
+	if (array->page_list[leaf].used < 0)
+		pr_debug("Array %p index %d page %d with ref count %d < 0\n",
+			 array, index, leaf, array->page_list[leaf].used);
+}
+
+int mthca_array_init(struct mthca_array *array, int nent)
+{
+	int leaves = (nent * sizeof (void *) + PAGE_SIZE - 1) / PAGE_SIZE;
+	int n;
+
+	array->page_list = kmalloc(leaves * sizeof *array->page_list, GFP_KERNEL);
+	if (array->page_list == NULL)
+		return -ENOMEM;
+
+	/* Leaf pages themselves are allocated by mthca_array_set(). */
+	for (n = 0; n < leaves; n++) {
+		array->page_list[n].used = 0;
+		array->page_list[n].page = NULL;
+	}
+	return 0;
+}
+
+void mthca_array_cleanup(struct mthca_array *array, int nent)
+{
+	size_t leaves = (nent * sizeof (void *) + PAGE_SIZE - 1) / PAGE_SIZE;
+	int n;
+
+	for (n = 0; n < leaves; ++n)
+		free_page((unsigned long) array->page_list[n].page);
+	kfree(array->page_list);
+}
+
+/*
+ * Handling for queue buffers -- we allocate a bunch of memory and
+ * register it in a memory region at HCA virtual address 0.  If the
+ * requested size is > max_direct, we split the allocation into
+ * multiple pages, so we don't require too much contiguous memory.
+ */
+
+int mthca_buf_alloc(struct mthca_dev *dev, int size, int max_direct,
+		    union mthca_buf *buf, int *is_direct, struct mthca_pd *pd,
+		    int hca_write, struct mthca_mr *mr)
+{
+	int err = -ENOMEM;
+	int npages, shift;
+	u64 *dma_list = NULL;
+	dma_addr_t t;
+	int i;
+	/* size <= max_direct: one contiguous coherent allocation. */
+	if (size <= max_direct) {
+		*is_direct = 1;
+		npages     = 1;
+		shift      = get_order(size) + PAGE_SHIFT;
+
+		buf->direct.buf = dma_alloc_coherent(&dev->pdev->dev,
+						     size, &t, GFP_KERNEL);
+		if (!buf->direct.buf)
+			return -ENOMEM;
+
+		pci_unmap_addr_set(&buf->direct, mapping, t);
+
+		memset(buf->direct.buf, 0, size);
+		/* Halve the page size until DMA address t is aligned to it. */
+		while (t & ((1 << shift) - 1)) {
+			--shift;
+			npages *= 2;
+		}
+
+		dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL);
+		if (!dma_list)
+			goto err_free;
+
+		for (i = 0; i < npages; ++i)
+			dma_list[i] = t + i * (1 << shift);
+	} else {	/* larger than max_direct: allocate page by page */
+		*is_direct = 0;
+		npages     = (size + PAGE_SIZE - 1) / PAGE_SIZE;
+		shift      = PAGE_SHIFT;
+
+		dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL);
+		if (!dma_list)
+			return -ENOMEM;
+
+		buf->page_list = kmalloc(npages * sizeof *buf->page_list,
+					 GFP_KERNEL);
+		if (!buf->page_list)
+			goto err_out;
+		/* NULL every slot first so a partial failure is recognisable. */
+		for (i = 0; i < npages; ++i)
+			buf->page_list[i].buf = NULL;
+
+		for (i = 0; i < npages; ++i) {
+			buf->page_list[i].buf =
+				dma_alloc_coherent(&dev->pdev->dev, PAGE_SIZE,
+						   &t, GFP_KERNEL);
+			if (!buf->page_list[i].buf)
+				goto err_free;
+
+			dma_list[i] = t;
+			pci_unmap_addr_set(&buf->page_list[i], mapping, t);
+
+			clear_page(buf->page_list[i].buf);
+		}
+	}
+	/* Register the DMA addresses as a memory region starting at 0. */
+	err = mthca_mr_alloc_phys(dev, pd->pd_num,
+				  dma_list, shift, npages,
+				  0, size,
+				  MTHCA_MPT_FLAG_LOCAL_READ |
+				  (hca_write ? MTHCA_MPT_FLAG_LOCAL_WRITE : 0),
+				  mr);
+	if (err)
+		goto err_free;
+
+	kfree(dma_list);	/* only needed while registering */
+
+	return 0;
+
+err_free:
+	mthca_buf_free(dev, size, buf, *is_direct, NULL);	/* no MR to free */
+
+err_out:
+	kfree(dma_list);
+
+	return err;
+}
+
+void mthca_buf_free(struct mthca_dev *dev, int size, union mthca_buf *buf,
+		    int is_direct, struct mthca_mr *mr)
+{
+	int i;
+
+	if (mr)
+		mthca_free_mr(dev, mr);
+	/* Paged buffers may hold NULL slots after a failed mthca_buf_alloc(). */
+	if (is_direct)
+		dma_free_coherent(&dev->pdev->dev, size, buf->direct.buf,
+				  pci_unmap_addr(&buf->direct, mapping));
+	else {
+		for (i = 0; i < (size + PAGE_SIZE - 1) / PAGE_SIZE; ++i)
+			if (buf->page_list[i].buf)
+				dma_free_coherent(&dev->pdev->dev, PAGE_SIZE,
+				    buf->page_list[i].buf,
+				    pci_unmap_addr(&buf->page_list[i], mapping));
+		kfree(buf->page_list);
+	}
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_av.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_av.c
new file mode 100644
index 0000000..32f6c63
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_av.c
@@ -0,0 +1,374 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/string.h>
+#include <linux/slab.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_cache.h>
+
+#include "mthca_dev.h"
+
+enum {
+      MTHCA_RATE_TAVOR_FULL   = 0,
+      MTHCA_RATE_TAVOR_1X     = 1,
+      MTHCA_RATE_TAVOR_4X     = 2,
+      MTHCA_RATE_TAVOR_1X_DDR = 3
+};
+
+enum {
+      MTHCA_RATE_MEMFREE_FULL    = 0,
+      MTHCA_RATE_MEMFREE_QUARTER = 1,
+      MTHCA_RATE_MEMFREE_EIGHTH  = 2,
+      MTHCA_RATE_MEMFREE_HALF    = 3
+};
+
+struct mthca_av {
+	__be32 port_pd;		/* PD number | (port number << 24) */
+	u8     reserved1;
+	u8     g_slid;		/* bit 7: GRH present; low 7 bits: source path bits */
+	__be16 dlid;
+	u8     reserved2;
+	u8     gid_index;	/* (port - 1) * gid_table_len + sgid_index */
+	u8     msg_sr;		/* (3 << 4) for 2K messages | static rate encoding */
+	u8     hop_limit;
+	__be32 sl_tclass_flowlabel;	/* SL << 28 | traffic class << 20 | flow label */
+	__be32 dgid[4];		/* destination GID; dgid[3] = 2 when no GRH is used */
+};
+
+static enum ib_rate memfree_rate_to_ib(u8 mthca_rate, u8 port_rate)
+{
+	int shift;	/* right shift applied to the port rate multiplier */
+
+	if (mthca_rate == MTHCA_RATE_MEMFREE_EIGHTH)
+		shift = 3;
+	else if (mthca_rate == MTHCA_RATE_MEMFREE_QUARTER)
+		shift = 2;
+	else if (mthca_rate == MTHCA_RATE_MEMFREE_HALF)
+		shift = 1;
+	else	/* MTHCA_RATE_MEMFREE_FULL and any unrecognized value */
+		shift = 0;
+	return mult_to_ib_rate(port_rate >> shift);
+}
+
+static enum ib_rate tavor_rate_to_ib(u8 mthca_rate, u8 port_rate)
+{
+	if (mthca_rate == MTHCA_RATE_TAVOR_1X)
+		return IB_RATE_2_5_GBPS;
+	if (mthca_rate == MTHCA_RATE_TAVOR_1X_DDR)
+		return IB_RATE_5_GBPS;
+	return mthca_rate == MTHCA_RATE_TAVOR_4X ?
+		IB_RATE_10_GBPS : mult_to_ib_rate(port_rate);
+}
+
+enum ib_rate mthca_rate_to_ib(struct mthca_dev *dev, u8 mthca_rate, u8 port)
+{
+	/* Non-mem-free devices use the Tavor static-rate encoding. */
+	if (!mthca_is_memfree(dev))
+		return tavor_rate_to_ib(mthca_rate, dev->rate[port - 1]);
+
+	/* Old Arbel FW: any nonzero static rate is reported as 2.5 Gb/s. */
+	if (mthca_rate && dev->limits.stat_rate_support == 0x3)
+		return IB_RATE_2_5_GBPS;
+	return memfree_rate_to_ib(mthca_rate, dev->rate[port - 1]);
+}
+
+static u8 ib_rate_to_memfree(u8 req_rate, u8 cur_rate)
+{
+	int ipd;
+
+	if (cur_rate <= req_rate)
+		return 0;
+
+	/* IPD needed to slow rate X to at most rate Y is (X - 1) / Y. */
+	ipd = (cur_rate - 1) / req_rate;
+	if (ipd == 0)
+		return MTHCA_RATE_MEMFREE_FULL;
+	if (ipd == 1)
+		return MTHCA_RATE_MEMFREE_HALF;
+	if (ipd <= 3)
+		return MTHCA_RATE_MEMFREE_QUARTER;
+	return MTHCA_RATE_MEMFREE_EIGHTH;
+}
+
+static u8 ib_rate_to_tavor(u8 static_rate)
+{
+	if (static_rate == IB_RATE_2_5_GBPS)
+		return MTHCA_RATE_TAVOR_1X;
+	if (static_rate == IB_RATE_5_GBPS)
+		return MTHCA_RATE_TAVOR_1X_DDR;
+	return static_rate == IB_RATE_10_GBPS ?
+		MTHCA_RATE_TAVOR_4X : MTHCA_RATE_TAVOR_FULL;
+}
+
+u8 mthca_get_rate(struct mthca_dev *dev, int static_rate, u8 port)
+{
+	u8 enc;
+
+	/* No limit requested, or the request is not below the port rate. */
+	if (static_rate == 0 || ib_rate_to_mult(static_rate) >= dev->rate[port - 1])
+		return 0;
+
+	enc = mthca_is_memfree(dev) ?
+		ib_rate_to_memfree(ib_rate_to_mult(static_rate),
+				   dev->rate[port - 1]) :
+		ib_rate_to_tavor(static_rate);
+
+	/* Use encoding 1 when stat_rate_support has no bit for this one. */
+	if ((dev->limits.stat_rate_support & (1 << enc)) == 0)
+		enc = 1;
+	return enc;
+}
+
+/* Build the address vector for ah_attr in 'ah'; returns 0 or -ENOMEM. */
+int mthca_create_ah(struct mthca_dev *dev, struct mthca_pd *pd,
+		    struct ib_ah_attr *ah_attr, struct mthca_ah *ah)
+{
+	u32 index = -1;
+	struct mthca_av *av = NULL;
+
+	/* Default to the PCI pool unless one of the branches below succeeds. */
+	ah->type = MTHCA_AH_PCI_POOL;
+
+	if (mthca_is_memfree(dev)) {
+		ah->av   = kmalloc(sizeof *ah->av, GFP_ATOMIC);
+		if (!ah->av)
+			return -ENOMEM;
+
+		ah->type = MTHCA_AH_KMALLOC;
+		av       = ah->av;
+	} else if (!atomic_read(&pd->sqp_count) &&
+		 !(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) {
+		index = mthca_alloc(&dev->av_table.alloc);
+		/* fall back to allocate in host memory */
+		if (index == -1)
+			goto on_hca_fail;
+
+		av = kmalloc(sizeof *av, GFP_ATOMIC);
+		if (!av) {
+			/* Give the slot back, else it leaks on fallback. */
+			mthca_free(&dev->av_table.alloc, index);
+			goto on_hca_fail;
+		}
+		ah->type = MTHCA_AH_ON_HCA;
+		ah->avdma  = dev->av_table.ddr_av_base + index * MTHCA_AV_SIZE;
+	}
+
+on_hca_fail:
+	if (ah->type == MTHCA_AH_PCI_POOL) {
+		ah->av = pci_pool_alloc(dev->av_table.pool,
+					GFP_ATOMIC, &ah->avdma);
+		if (!ah->av)
+			return -ENOMEM;
+
+		av = ah->av;
+	}
+
+	ah->key = pd->ntmr.ibmr.lkey;
+
+	memset(av, 0, MTHCA_AV_SIZE);
+
+	av->port_pd = cpu_to_be32(pd->pd_num | (ah_attr->port_num << 24));
+	av->g_slid  = ah_attr->src_path_bits;
+	av->dlid    = cpu_to_be16(ah_attr->dlid);
+	av->msg_sr  = (3 << 4) | /* 2K message */
+		mthca_get_rate(dev, ah_attr->static_rate, ah_attr->port_num);
+	av->sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 28);
+	if (ah_attr->ah_flags & IB_AH_GRH) {
+		av->g_slid |= 0x80;
+		av->gid_index = (ah_attr->port_num - 1) * dev->limits.gid_table_len +
+			ah_attr->grh.sgid_index;
+		av->hop_limit = ah_attr->grh.hop_limit;
+		av->sl_tclass_flowlabel |=
+			cpu_to_be32((ah_attr->grh.traffic_class << 20) |
+				    ah_attr->grh.flow_label);
+		memcpy(av->dgid, ah_attr->grh.dgid.raw, 16);
+	} else {
+		/* Arbel workaround -- low byte of GID must be 2 */
+		av->dgid[3] = cpu_to_be32(2);
+	}
+
+	if (0) {
+		int j;
+
+		mthca_dbg(dev, "Created UDAV at %p/%08lx:\n",
+			  av, (unsigned long) ah->avdma);
+		for (j = 0; j < 8; ++j)
+			printk(KERN_DEBUG "  [%2x] %08x\n",
+			       j * 4, be32_to_cpu(((__be32 *) av)[j]));
+	}
+
+	if (ah->type == MTHCA_AH_ON_HCA) {
+		memcpy_toio(dev->av_table.av_map + index * MTHCA_AV_SIZE,
+			    av, MTHCA_AV_SIZE);
+		kfree(av);
+	}
+
+	return 0;
+}
+
+int mthca_destroy_ah(struct mthca_dev *dev, struct mthca_ah *ah)
+{
+	u32 slot;
+
+	/* Release whichever backing store ah->type says is in use. */
+	if (ah->type == MTHCA_AH_ON_HCA) {
+		/* Recover the AV table slot from the on-HCA address. */
+		slot = (ah->avdma - dev->av_table.ddr_av_base) / MTHCA_AV_SIZE;
+		mthca_free(&dev->av_table.alloc, slot);
+	} else if (ah->type == MTHCA_AH_PCI_POOL) {
+		/* Host-memory AV taken from the PCI pool. */
+		pci_pool_free(dev->av_table.pool, ah->av, ah->avdma);
+	} else if (ah->type == MTHCA_AH_KMALLOC) {
+		/* Host-memory AV taken from kmalloc(). */
+		kfree(ah->av);
+	}
+	/* Any other type value is ignored. */
+
+	return 0;
+}
+
+int mthca_ah_grh_present(struct mthca_ah *ah)
+{
+	return (ah->av->g_slid & 0x80) != 0;	/* bit 7 is the GRH flag */
+}
+
+int mthca_read_ah(struct mthca_dev *dev, struct mthca_ah *ah,
+		  struct ib_ud_header *header)
+{
+	struct mthca_av *av;
+	u32 sl_tc_fl;
+
+	if (ah->type == MTHCA_AH_ON_HCA)
+		return -EINVAL;
+	av = ah->av;
+	sl_tc_fl = be32_to_cpu(av->sl_tclass_flowlabel);
+	header->lrh.service_level   = sl_tc_fl >> 28;
+	header->lrh.destination_lid = av->dlid;
+	header->lrh.source_lid      = cpu_to_be16(av->g_slid & 0x7f);
+	if (!mthca_ah_grh_present(ah))
+		return 0;
+
+	header->grh.traffic_class = (sl_tc_fl >> 20) & 0xff;
+	header->grh.flow_label = av->sl_tclass_flowlabel & cpu_to_be32(0xfffff);
+	header->grh.hop_limit = av->hop_limit;
+	ib_get_cached_gid(&dev->ib_dev, be32_to_cpu(av->port_pd) >> 24,
+			  av->gid_index % dev->limits.gid_table_len,
+			  &header->grh.source_gid);
+	memcpy(header->grh.destination_gid.raw, av->dgid, 16);
+	return 0;
+}
+
+int mthca_ah_query(struct ib_ah *ibah, struct ib_ah_attr *attr)
+{
+	struct mthca_dev *dev = to_mdev(ibah->device);
+	struct mthca_ah *ah = to_mah(ibah);
+	struct mthca_av *av;
+	u32 sl_tc_fl;
+
+	/* Only implement for MAD and memfree ah for now. */
+	if (ah->type == MTHCA_AH_ON_HCA)
+		return -ENOSYS;
+
+	av = ah->av;
+	sl_tc_fl = be32_to_cpu(av->sl_tclass_flowlabel);
+	memset(attr, 0, sizeof *attr);
+	attr->dlid          = be16_to_cpu(av->dlid);
+	attr->sl            = sl_tc_fl >> 28;
+	attr->port_num      = be32_to_cpu(av->port_pd) >> 24;
+	attr->static_rate   = mthca_rate_to_ib(dev, av->msg_sr & 0x7,
+					       attr->port_num);
+	attr->src_path_bits = av->g_slid & 0x7F;
+	attr->ah_flags      = mthca_ah_grh_present(ah) ? IB_AH_GRH : 0;
+	if (!attr->ah_flags)
+		return 0;
+
+	attr->grh.traffic_class = sl_tc_fl >> 20;
+	attr->grh.flow_label = sl_tc_fl & 0xfffff;
+	attr->grh.hop_limit = av->hop_limit;
+	attr->grh.sgid_index = av->gid_index & (dev->limits.gid_table_len - 1);
+	memcpy(attr->grh.dgid.raw, av->dgid, 16);
+	return 0;
+}
+
+int mthca_init_av_table(struct mthca_dev *dev)
+{
+	int rc;
+
+	/* Nothing to set up here for mem-free HCAs. */
+	if (mthca_is_memfree(dev))
+		return 0;
+
+	rc = mthca_alloc_init(&dev->av_table.alloc, dev->av_table.num_ddr_avs,
+			      dev->av_table.num_ddr_avs - 1, 0);
+	if (rc != 0)
+		return rc;
+
+	dev->av_table.pool = pci_pool_create("mthca_av", dev->pdev,
+					     MTHCA_AV_SIZE, MTHCA_AV_SIZE, 0);
+	if (dev->av_table.pool == NULL)
+		goto out_free_alloc;
+
+	if (dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN) {
+		dev->av_table.av_map = NULL;	/* DDR hidden: skip the ioremap */
+		return 0;
+	}
+
+	dev->av_table.av_map = ioremap(pci_resource_start(dev->pdev, 4) +
+				       dev->av_table.ddr_av_base -
+				       dev->ddr_start,
+				       dev->av_table.num_ddr_avs *
+				       MTHCA_AV_SIZE);
+	if (dev->av_table.av_map == NULL)
+		goto out_free_pool;
+
+	return 0;
+
+ out_free_pool:
+	pci_pool_destroy(dev->av_table.pool);
+
+ out_free_alloc:
+	mthca_alloc_cleanup(&dev->av_table.alloc);
+	return -ENOMEM;
+}
+
+void mthca_cleanup_av_table(struct mthca_dev *dev)
+{
+	/* Undo mthca_init_av_table() in reverse order; no-op when mem-free. */
+	if (!mthca_is_memfree(dev)) {
+		if (dev->av_table.av_map != NULL)
+			iounmap(dev->av_table.av_map);
+		pci_pool_destroy(dev->av_table.pool);
+		mthca_alloc_cleanup(&dev->av_table.alloc);
+	}
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_catas.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_catas.c
new file mode 100644
index 0000000..b200170
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_catas.c
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2005 Cisco Systems.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <linux/workqueue.h>
+
+#include "mthca_dev.h"
+
+enum {
+	MTHCA_CATAS_TYPE_INTERNAL	= 0,	/* logged as "internal error" */
+	MTHCA_CATAS_TYPE_UPLINK		= 3,	/* logged as "uplink bus error" */
+	MTHCA_CATAS_TYPE_DDR		= 4,	/* logged as "DDR data error" */
+	MTHCA_CATAS_TYPE_PARITY		= 5,	/* logged as "internal parity error" */
+};
+
+#define	MTHCA_CATAS_POLL_INTERVAL	(5 * HZ)
+
+static DEFINE_SPINLOCK(catas_lock);
+
+static LIST_HEAD(catas_list);
+static struct workqueue_struct *catas_wq;
+static struct work_struct catas_work;
+
+static int catas_reset_disable;
+module_param_named(catas_reset_disable, catas_reset_disable, int, 0644);
+MODULE_PARM_DESC(catas_reset_disable, "disable reset on catastrophic event if nonzero");
+
+static void catas_reset(struct work_struct *work)
+{
+	struct mthca_dev *dev, *next;
+	LIST_HEAD(pending);
+	int rc;
+
+	mutex_lock(&mthca_device_mutex);
+	/* Take every queued device off the shared list in one step. */
+	spin_lock_irq(&catas_lock);
+	list_splice_init(&catas_list, &pending);
+	spin_unlock_irq(&catas_lock);
+
+	list_for_each_entry_safe(dev, next, &pending, catas_err.list) {
+		/* Remember pdev: 'dev' is not valid once the restart ran. */
+		struct pci_dev *pdev = dev->pdev;
+
+		rc = __mthca_restart_one(pdev);
+		if (rc) {
+			printk(KERN_ERR "mthca %s: Reset failed (%d)\n",
+			       pci_name(pdev), rc);
+		} else {
+			struct mthca_dev *fresh = pci_get_drvdata(pdev);
+			mthca_dbg(fresh, "Reset succeeded\n");
+		}
+	}
+	mutex_unlock(&mthca_device_mutex);
+}
+
+static void handle_catas(struct mthca_dev *dev)
+{
+	struct ib_event event;
+	unsigned long flags;
+	const char *type;
+	int i;
+	/* Mark the device inactive and dispatch IB_EVENT_DEVICE_FATAL. */
+	event.device = &dev->ib_dev;
+	event.event  = IB_EVENT_DEVICE_FATAL;
+	event.element.port_num = 0;
+	dev->active = 0;
+
+	ib_dispatch_event(&event);
+	/* Top byte of the first byte-swapped error word selects the type. */
+	switch (swab32(readl(dev->catas_err.map)) >> 24) {
+	case MTHCA_CATAS_TYPE_INTERNAL:
+		type = "internal error";
+		break;
+	case MTHCA_CATAS_TYPE_UPLINK:
+		type = "uplink bus error";
+		break;
+	case MTHCA_CATAS_TYPE_DDR:
+		type = "DDR data error";
+		break;
+	case MTHCA_CATAS_TYPE_PARITY:
+		type = "internal parity error";
+		break;
+	default:
+		type = "unknown error";
+		break;
+	}
+
+	mthca_err(dev, "Catastrophic error detected: %s\n", type);
+	for (i = 0; i < dev->catas_err.size; ++i)	/* dump the whole buffer */
+		mthca_err(dev, "  buf[%02x]: %08x\n",
+			  i, swab32(readl(dev->catas_err.map + i)));
+
+	if (catas_reset_disable)	/* module parameter: report only */
+		return;
+	/* Queue the device on catas_list and kick the catas_reset() work. */
+	spin_lock_irqsave(&catas_lock, flags);
+	list_add(&dev->catas_err.list, &catas_list);
+	queue_work(catas_wq, &catas_work);
+	spin_unlock_irqrestore(&catas_lock, flags);
+}
+
+static void poll_catas(unsigned long dev_ptr)
+{
+	struct mthca_dev *dev = (struct mthca_dev *) dev_ptr;
+	int word;
+
+	/* Any nonzero word in the error buffer is handed to handle_catas(). */
+	for (word = 0; word < dev->catas_err.size; word++)
+		if (readl(dev->catas_err.map + word) != 0) {
+			handle_catas(dev);
+			return;	/* the timer is not re-armed on this path */
+		}
+	mod_timer(&dev->catas_err.timer,
+		  round_jiffies(jiffies + MTHCA_CATAS_POLL_INTERVAL));
+}
+
+void mthca_start_catas_poll(struct mthca_dev *dev)
+{
+	unsigned long addr;
+
+	/* mthca_stop_catas_poll() uses the timer and list even if we bail out. */
+	init_timer(&dev->catas_err.timer);
+	INIT_LIST_HEAD(&dev->catas_err.list);
+	dev->catas_err.map  = NULL;
+
+	addr = pci_resource_start(dev->pdev, 0) +
+		((pci_resource_len(dev->pdev, 0) - 1) &
+		 dev->catas_err.addr);
+	dev->catas_err.map = ioremap(addr, dev->catas_err.size * 4);
+	if (!dev->catas_err.map) {
+		mthca_warn(dev, "couldn't map catastrophic error region "
+			   "at 0x%lx/0x%x\n", addr, dev->catas_err.size * 4);
+		return;
+	}
+
+	dev->catas_err.timer.data     = (unsigned long) dev;
+	dev->catas_err.timer.function = poll_catas;
+	dev->catas_err.timer.expires  = jiffies + MTHCA_CATAS_POLL_INTERVAL;
+	add_timer(&dev->catas_err.timer);
+}
+
+void mthca_stop_catas_poll(struct mthca_dev *dev)
+{
+	del_timer_sync(&dev->catas_err.timer);	/* cancel the poll_catas() timer */
+
+	if (dev->catas_err.map)
+		iounmap(dev->catas_err.map);
+	/* Unlink the device from whichever catas list it is on. */
+	spin_lock_irq(&catas_lock);
+	list_del(&dev->catas_err.list);
+	spin_unlock_irq(&catas_lock);
+}
+
+int __init mthca_catas_init(void)
+{
+	/* One shared work item; handle_catas() queues it on catas_wq. */
+	INIT_WORK(&catas_work, catas_reset);
+
+	/* Single-threaded queue on which catas_reset() runs. */
+	catas_wq = create_singlethread_workqueue("mthcacatas");
+
+	return catas_wq ? 0 : -ENOMEM;
+}
+
+void mthca_catas_cleanup(void)
+{
+	destroy_workqueue(catas_wq);	/* created by mthca_catas_init() */
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_cmd.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_cmd.c
new file mode 100644
index 0000000..81e2838
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_cmd.c
@@ -0,0 +1,1931 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/pci.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <asm/io.h>
+#include <rdma/ib_mad.h>
+
+#include "mthca_dev.h"
+#include "mthca_config_reg.h"
+#include "mthca_cmd.h"
+#include "mthca_memfree.h"
+
+#define CMD_POLL_TOKEN 0xffff
+
+enum {
+	HCR_IN_PARAM_OFFSET    = 0x00,
+	HCR_IN_MODIFIER_OFFSET = 0x08,
+	HCR_OUT_PARAM_OFFSET   = 0x0c,
+	HCR_TOKEN_OFFSET       = 0x14,
+	HCR_STATUS_OFFSET      = 0x18,	/* word read by go_bit() */
+
+	HCR_OPMOD_SHIFT        = 12,	/* op_modifier position in the command word */
+	HCA_E_BIT              = 22,	/* set when posting in event mode */
+	HCR_GO_BIT             = 23	/* set when posting; tested by go_bit() */
+};
+
+enum {
+	/* initialization and general commands */
+	CMD_SYS_EN          = 0x1,
+	CMD_SYS_DIS         = 0x2,
+	CMD_MAP_FA          = 0xfff,
+	CMD_UNMAP_FA        = 0xffe,
+	CMD_RUN_FW          = 0xff6,
+	CMD_MOD_STAT_CFG    = 0x34,
+	CMD_QUERY_DEV_LIM   = 0x3,
+	CMD_QUERY_FW        = 0x4,
+	CMD_ENABLE_LAM      = 0xff8,
+	CMD_DISABLE_LAM     = 0xff7,
+	CMD_QUERY_DDR       = 0x5,
+	CMD_QUERY_ADAPTER   = 0x6,
+	CMD_INIT_HCA        = 0x7,
+	CMD_CLOSE_HCA       = 0x8,
+	CMD_INIT_IB         = 0x9,
+	CMD_CLOSE_IB        = 0xa,
+	CMD_QUERY_HCA       = 0xb,
+	CMD_SET_IB          = 0xc,
+	CMD_ACCESS_DDR      = 0x2e,
+	CMD_MAP_ICM         = 0xffa,
+	CMD_UNMAP_ICM       = 0xff9,
+	CMD_MAP_ICM_AUX     = 0xffc,
+	CMD_UNMAP_ICM_AUX   = 0xffb,
+	CMD_SET_ICM_SIZE    = 0xffd,
+
+	/* TPT commands */
+	CMD_SW2HW_MPT 	    = 0xd,
+	CMD_QUERY_MPT 	    = 0xe,
+	CMD_HW2SW_MPT 	    = 0xf,
+	CMD_READ_MTT        = 0x10,
+	CMD_WRITE_MTT       = 0x11,
+	CMD_SYNC_TPT        = 0x2f,
+
+	/* EQ commands */
+	CMD_MAP_EQ          = 0x12,
+	CMD_SW2HW_EQ 	    = 0x13,
+	CMD_HW2SW_EQ 	    = 0x14,
+	CMD_QUERY_EQ        = 0x15,
+
+	/* CQ commands */
+	CMD_SW2HW_CQ 	    = 0x16,
+	CMD_HW2SW_CQ 	    = 0x17,
+	CMD_QUERY_CQ 	    = 0x18,
+	CMD_RESIZE_CQ       = 0x2c,
+
+	/* SRQ commands */
+	CMD_SW2HW_SRQ 	    = 0x35,
+	CMD_HW2SW_SRQ 	    = 0x36,
+	CMD_QUERY_SRQ       = 0x37,
+	CMD_ARM_SRQ         = 0x40,
+
+	/* QP/EE commands */
+	CMD_RST2INIT_QPEE   = 0x19,
+	CMD_INIT2RTR_QPEE   = 0x1a,
+	CMD_RTR2RTS_QPEE    = 0x1b,
+	CMD_RTS2RTS_QPEE    = 0x1c,
+	CMD_SQERR2RTS_QPEE  = 0x1d,
+	CMD_2ERR_QPEE       = 0x1e,
+	CMD_RTS2SQD_QPEE    = 0x1f,
+	CMD_SQD2SQD_QPEE    = 0x38,
+	CMD_SQD2RTS_QPEE    = 0x20,
+	CMD_ERR2RST_QPEE    = 0x21,
+	CMD_QUERY_QPEE      = 0x22,
+	CMD_INIT2INIT_QPEE  = 0x2d,
+	CMD_SUSPEND_QPEE    = 0x32,
+	CMD_UNSUSPEND_QPEE  = 0x33,
+	/* special QPs and management commands */
+	CMD_CONF_SPECIAL_QP = 0x23,
+	CMD_MAD_IFC         = 0x24,
+
+	/* multicast commands */
+	CMD_READ_MGM        = 0x25,
+	CMD_WRITE_MGM       = 0x26,
+	CMD_MGID_HASH       = 0x27,
+
+	/* miscellaneous commands */
+	CMD_DIAG_RPRT       = 0x30,
+	CMD_NOP             = 0x31,
+
+	/* debug commands */
+	CMD_QUERY_DEBUG_MSG = 0x2a,
+	CMD_SET_DEBUG_MSG   = 0x2b,
+};
+
+/*
+ * According to Mellanox code, FW may be starved and never complete
+ * commands.  So we can't use strict timeouts described in PRM -- we
+ * just arbitrarily select 60 seconds for now.
+ */
+#if 0
+/*
+ * Round up and add 1 to make sure we get the full wait time (since we
+ * will be starting in the middle of a jiffy)
+ */
+enum {
+	CMD_TIME_CLASS_A = (HZ + 999) / 1000 + 1,
+	CMD_TIME_CLASS_B = (HZ +  99) /  100 + 1,
+	CMD_TIME_CLASS_C = (HZ +   9) /   10 + 1,
+	CMD_TIME_CLASS_D = 60 * HZ
+};
+#else
+#define	CMD_TIME_CLASS_A	(60 * HZ)
+#define	CMD_TIME_CLASS_B	(60 * HZ)
+#define	CMD_TIME_CLASS_C	(60 * HZ)
+#define	CMD_TIME_CLASS_D	(60 * HZ)
+#endif
+
+#define	GO_BIT_TIMEOUT		(HZ * 10)
+
+struct mthca_cmd_context {
+	struct completion done;
+	int               result;
+	int               next;
+	u64               out_param;
+	u16               token;
+	u8                status;
+};
+
+static int fw_cmd_doorbell = 0;
+module_param(fw_cmd_doorbell, int, 0644);
+MODULE_PARM_DESC(fw_cmd_doorbell, "post FW commands through doorbell page if nonzero "
+		 "(and supported by FW)");
+
+static inline int go_bit(struct mthca_dev *dev)
+{
+	u32 status = readl(dev->hcr + HCR_STATUS_OFFSET);
+	return status & swab32(1 << HCR_GO_BIT);	/* nonzero while GO is set */
+}
+
+static void mthca_cmd_post_dbell(struct mthca_dev *dev,
+				 u64 in_param,
+				 u64 out_param,
+				 u32 in_modifier,
+				 u8 op_modifier,
+				 u16 op,
+				 u16 token)
+{
+	void __iomem *ptr = dev->cmd.dbell_map;
+	u16 *offs = dev->cmd.dbell_offsets;
+	/* Eight big-endian 32-bit words, each write followed by wmb(). */
+	__raw_writel((__force u32) cpu_to_be32(in_param >> 32),           ptr + offs[0]);
+	wmb();
+	__raw_writel((__force u32) cpu_to_be32(in_param & 0xfffffffful),  ptr + offs[1]);
+	wmb();
+	__raw_writel((__force u32) cpu_to_be32(in_modifier),              ptr + offs[2]);
+	wmb();
+	__raw_writel((__force u32) cpu_to_be32(out_param >> 32),          ptr + offs[3]);
+	wmb();
+	__raw_writel((__force u32) cpu_to_be32(out_param & 0xfffffffful), ptr + offs[4]);
+	wmb();
+	__raw_writel((__force u32) cpu_to_be32(token << 16),              ptr + offs[5]);
+	wmb();
+	__raw_writel((__force u32) cpu_to_be32((1 << HCR_GO_BIT)                |
+					       (1 << HCA_E_BIT)                 |
+					       (op_modifier << HCR_OPMOD_SHIFT) |
+						op),			  ptr + offs[6]);
+	wmb();
+	__raw_writel((__force u32) 0,                                     ptr + offs[7]);
+	wmb();
+}
+
+static int mthca_cmd_post_hcr(struct mthca_dev *dev,
+			      u64 in_param,
+			      u64 out_param,
+			      u32 in_modifier,
+			      u8 op_modifier,
+			      u16 op,
+			      u16 token,
+			      int event)
+{
+	if (event) {	/* event mode: wait up to GO_BIT_TIMEOUT for GO to clear */
+		unsigned long end = jiffies + GO_BIT_TIMEOUT;
+
+		while (go_bit(dev) && time_before(jiffies, end))
+			sched_yield();
+	}
+	/* GO still set: fail with -EAGAIN before writing anything to the HCR. */
+	if (go_bit(dev))
+		return -EAGAIN;
+
+	/*
+	 * We use writel (instead of something like memcpy_toio)
+	 * because writes of less than 32 bits to the HCR don't work
+	 * (and some architectures such as ia64 implement memcpy_toio
+	 * in terms of writeb).
+	 */
+	__raw_writel((__force u32) cpu_to_be32(in_param >> 32),           dev->hcr + 0 * 4);
+	__raw_writel((__force u32) cpu_to_be32(in_param & 0xfffffffful),  dev->hcr + 1 * 4);
+	__raw_writel((__force u32) cpu_to_be32(in_modifier),              dev->hcr + 2 * 4);
+	__raw_writel((__force u32) cpu_to_be32(out_param >> 32),          dev->hcr + 3 * 4);
+	__raw_writel((__force u32) cpu_to_be32(out_param & 0xfffffffful), dev->hcr + 4 * 4);
+	__raw_writel((__force u32) cpu_to_be32(token << 16),              dev->hcr + 5 * 4);
+
+	/* __raw_writel may not order writes. */
+	wmb();
+	/* The command word is written last: it carries the GO bit. */
+	__raw_writel((__force u32) cpu_to_be32((1 << HCR_GO_BIT)                |
+					       (event ? (1 << HCA_E_BIT) : 0)   |
+					       (op_modifier << HCR_OPMOD_SHIFT) |
+					       op),                       dev->hcr + 6 * 4);
+
+	return 0;
+}
+
+/*
+ * Post a FW command while holding dev->cmd.hcr_mutex.
+ *
+ * The doorbell path is taken only for event-mode commands, and only when
+ * the device has MTHCA_CMD_POST_DOORBELLS set and the fw_cmd_doorbell
+ * parameter is nonzero; every other command is written to the HCR.
+ * Returns 0 on the doorbell path, otherwise the result of
+ * mthca_cmd_post_hcr().
+ */
+static int mthca_cmd_post(struct mthca_dev *dev,
+			  u64 in_param,
+			  u64 out_param,
+			  u32 in_modifier,
+			  u8 op_modifier,
+			  u16 op,
+			  u16 token,
+			  int event)
+{
+	int rc;
+
+	mutex_lock(&dev->cmd.hcr_mutex);
+
+	if (!event ||
+	    !(dev->cmd.flags & MTHCA_CMD_POST_DOORBELLS) ||
+	    !fw_cmd_doorbell) {
+		rc = mthca_cmd_post_hcr(dev, in_param, out_param, in_modifier,
+					op_modifier, op, token, event);
+	} else {
+		mthca_cmd_post_dbell(dev, in_param, out_param, in_modifier,
+				     op_modifier, op, token);
+		rc = 0;
+	}
+
+	/*
+	 * Keep our HCR writes from interleaving with those of another
+	 * CPU that starts a FW command right after we drop the mutex.
+	 */
+	mmiowb();
+
+	mutex_unlock(&dev->cmd.hcr_mutex);
+	return rc;
+}
+
+/*
+ * Issue a FW command and poll the GO bit for its completion.
+ *
+ * Serialized by dev->cmd.poll_sem.  After posting, yields until the GO
+ * bit clears or @timeout jiffies pass; returns -EBUSY if it is still set.
+ * On completion, the status byte is read from the top 8 bits of the HCR
+ * status dword into *@status, and when @out_is_imm is nonzero the 64-bit
+ * immediate result is read back from the HCR into *@out_param.
+ *
+ * Returns 0 on completion, the posting error, or -EBUSY on timeout.
+ */
+static int mthca_cmd_poll(struct mthca_dev *dev,
+			  u64 in_param,
+			  u64 *out_param,
+			  int out_is_imm,
+			  u32 in_modifier,
+			  u8 op_modifier,
+			  u16 op,
+			  unsigned long timeout,
+			  u8 *status)
+{
+	unsigned long deadline;
+	u64 out_arg;
+	int rc;
+
+	down(&dev->cmd.poll_sem);
+
+	out_arg = out_param ? *out_param : 0;
+	rc = mthca_cmd_post(dev, in_param, out_arg, in_modifier, op_modifier,
+			    op, CMD_POLL_TOKEN, 0);
+	if (rc)
+		goto unlock;
+
+	deadline = jiffies + timeout;
+	while (go_bit(dev) && time_before(jiffies, deadline))
+		sched_yield();
+
+	if (go_bit(dev)) {
+		rc = -EBUSY;
+		goto unlock;
+	}
+
+	if (out_is_imm) {
+		u64 hi = be32_to_cpu((__force __be32)
+				     __raw_readl(dev->hcr + HCR_OUT_PARAM_OFFSET));
+		u64 lo = be32_to_cpu((__force __be32)
+				     __raw_readl(dev->hcr + HCR_OUT_PARAM_OFFSET + 4));
+
+		*out_param = hi << 32 | lo;
+	}
+
+	*status = be32_to_cpu((__force __be32) __raw_readl(dev->hcr + HCR_STATUS_OFFSET)) >> 24;
+
+unlock:
+	up(&dev->cmd.poll_sem);
+	return rc;
+}
+
+/*
+ * Deliver a command-completion event to the waiter that owns @token.
+ *
+ * The context slot is selected by the low bits of @token (masked with
+ * dev->cmd.token_mask).  If the slot's current token differs, the event
+ * is dropped; otherwise the result, status and output parameter are
+ * stored and the slot's completion is signalled.
+ */
+void mthca_cmd_event(struct mthca_dev *dev,
+		     u16 token,
+		     u8  status,
+		     u64 out_param)
+{
+	struct mthca_cmd_context *ctx;
+
+	ctx = &dev->cmd.context[token & dev->cmd.token_mask];
+
+	/* A mismatch means a command that already timed out finally completed. */
+	if (ctx->token == token) {
+		ctx->result    = 0;
+		ctx->status    = status;
+		ctx->out_param = out_param;
+
+		complete(&ctx->done);
+	}
+}
+
+/*
+ * Issue a FW command in event mode and sleep until it completes.
+ *
+ * Takes a slot from the free list of command contexts (guarded by
+ * dev->cmd.context_lock, throttled by dev->cmd.event_sem), advances the
+ * slot's token by token_mask + 1, posts the command with that token and
+ * waits up to @timeout for the completion signalled by mthca_cmd_event().
+ * On success *@status receives the completion status and, when
+ * @out_is_imm is nonzero, *@out_param receives the immediate result.
+ * The slot is returned to the free list on every path.
+ *
+ * Returns 0, the posting error, the context result, or -EBUSY on timeout.
+ */
+static int mthca_cmd_wait(struct mthca_dev *dev,
+			  u64 in_param,
+			  u64 *out_param,
+			  int out_is_imm,
+			  u32 in_modifier,
+			  u8 op_modifier,
+			  u16 op,
+			  unsigned long timeout,
+			  u8 *status)
+{
+	struct mthca_cmd_context *ctx;
+	int rc;
+
+	down(&dev->cmd.event_sem);
+
+	/* Pop the head of the free list and give it a fresh token. */
+	spin_lock(&dev->cmd.context_lock);
+	BUG_ON(dev->cmd.free_head < 0);
+	ctx = &dev->cmd.context[dev->cmd.free_head];
+	ctx->token += dev->cmd.token_mask + 1;
+	dev->cmd.free_head = ctx->next;
+	spin_unlock(&dev->cmd.context_lock);
+
+	init_completion(&ctx->done);
+
+	rc = mthca_cmd_post(dev, in_param,
+			    out_param ? *out_param : 0,
+			    in_modifier, op_modifier,
+			    op, ctx->token, 1);
+	if (rc)
+		goto release;
+
+	if (!wait_for_completion_timeout(&ctx->done, timeout)) {
+		rc = -EBUSY;
+		goto release;
+	}
+
+	rc = ctx->result;
+	if (!rc) {
+		*status = ctx->status;
+		if (*status)
+			mthca_dbg(dev, "Command %02x completed with status %02x\n",
+				  op, *status);
+
+		if (out_is_imm)
+			*out_param = ctx->out_param;
+	}
+
+release:
+	/* Push the slot back onto the free list. */
+	spin_lock(&dev->cmd.context_lock);
+	ctx->next = dev->cmd.free_head;
+	dev->cmd.free_head = ctx - dev->cmd.context;
+	spin_unlock(&dev->cmd.context_lock);
+
+	up(&dev->cmd.event_sem);
+	return rc;
+}
+
+/*
+ * Invoke a command whose output goes to a mailbox: @out_param is passed
+ * to the firmware and nothing is copied back to the caller.  Dispatches
+ * to the event-driven or the polling path depending on whether
+ * MTHCA_CMD_USE_EVENTS is set.
+ */
+static int mthca_cmd_box(struct mthca_dev *dev,
+			 u64 in_param,
+			 u64 out_param,
+			 u32 in_modifier,
+			 u8 op_modifier,
+			 u16 op,
+			 unsigned long timeout,
+			 u8 *status)
+{
+	if (!(dev->cmd.flags & MTHCA_CMD_USE_EVENTS))
+		return mthca_cmd_poll(dev, in_param, &out_param, 0,
+				      in_modifier, op_modifier, op,
+				      timeout, status);
+
+	return mthca_cmd_wait(dev, in_param, &out_param, 0,
+			      in_modifier, op_modifier, op,
+			      timeout, status);
+}
+
+/*
+ * Invoke a command that has no output parameter: a thin wrapper around
+ * mthca_cmd_box() that passes 0 as the output parameter.
+ */
+static int mthca_cmd(struct mthca_dev *dev,
+		     u64 in_param,
+		     u32 in_modifier,
+		     u8 op_modifier,
+		     u16 op,
+		     unsigned long timeout,
+		     u8 *status)
+{
+	const u64 no_out_param = 0;
+
+	return mthca_cmd_box(dev, in_param, no_out_param, in_modifier,
+			     op_modifier, op, timeout, status);
+}
+
+/*
+ * Invoke a command that returns an immediate 64-bit result.  After the
+ * command executes, the result is stored through the caller's
+ * @out_param pointer.  Dispatches to the event-driven or the polling
+ * path depending on whether MTHCA_CMD_USE_EVENTS is set.
+ */
+static int mthca_cmd_imm(struct mthca_dev *dev,
+			 u64 in_param,
+			 u64 *out_param,
+			 u32 in_modifier,
+			 u8 op_modifier,
+			 u16 op,
+			 unsigned long timeout,
+			 u8 *status)
+{
+	if (!(dev->cmd.flags & MTHCA_CMD_USE_EVENTS))
+		return mthca_cmd_poll(dev, in_param, out_param, 1,
+				      in_modifier, op_modifier, op,
+				      timeout, status);
+
+	return mthca_cmd_wait(dev, in_param, out_param, 1,
+			      in_modifier, op_modifier, op,
+			      timeout, status);
+}
+
+/*
+ * Set up the command interface: initialize the HCR mutex and the polling
+ * semaphore, clear the command flags, map the HCR from PCI resource 0
+ * and create the DMA pool that backs command mailboxes.
+ *
+ * Returns 0 on success or -ENOMEM if the HCR cannot be mapped or the
+ * pool cannot be created (the HCR mapping is undone in the latter case).
+ */
+int mthca_cmd_init(struct mthca_dev *dev)
+{
+	mutex_init(&dev->cmd.hcr_mutex);
+	sema_init(&dev->cmd.poll_sem, 1);
+	dev->cmd.flags = 0;
+
+	dev->hcr = ioremap(pci_resource_start(dev->pdev, 0) + MTHCA_HCR_BASE,
+			   MTHCA_HCR_SIZE);
+	if (!dev->hcr) {
+		/* Terminate the message so it does not run into the next log line. */
+		mthca_err(dev, "Couldn't map command register.\n");
+		return -ENOMEM;
+	}
+
+	/* Mailboxes are MTHCA_MAILBOX_SIZE bytes, aligned to their own size. */
+	dev->cmd.pool = pci_pool_create("mthca_cmd", dev->pdev,
+					MTHCA_MAILBOX_SIZE,
+					MTHCA_MAILBOX_SIZE, 0);
+	if (!dev->cmd.pool) {
+		iounmap(dev->hcr);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
+ * Tear down the command interface: destroy the mailbox pool, unmap the
+ * HCR and, if a doorbell region was mapped (MTHCA_CMD_POST_DOORBELLS),
+ * unmap that as well.
+ */
+void mthca_cmd_cleanup(struct mthca_dev *dev)
+{
+	pci_pool_destroy(dev->cmd.pool);
+	iounmap(dev->hcr);
+
+	if (!(dev->cmd.flags & MTHCA_CMD_POST_DOORBELLS))
+		return;
+
+	iounmap(dev->cmd.dbell_map);
+}
+
+/*
+ * Switch FW command completion from polling to events.  Call this only
+ * after the event queue that carries command events has been set up.
+ *
+ * Allocates dev->cmd.max_cmds command contexts chained into a free list,
+ * derives token_mask (the smallest power of two >= max_cmds, minus one),
+ * sets MTHCA_CMD_USE_EVENTS and finally takes poll_sem.
+ *
+ * Returns 0 on success or -ENOMEM if the context array cannot be
+ * allocated.
+ */
+int mthca_cmd_use_events(struct mthca_dev *dev)
+{
+	int slot;
+
+	dev->cmd.context = kmalloc(dev->cmd.max_cmds *
+				   sizeof *dev->cmd.context,
+				   GFP_KERNEL);
+	if (!dev->cmd.context)
+		return -ENOMEM;
+
+	/* Each slot starts with its own index as token and links to the next. */
+	for (slot = 0; slot < dev->cmd.max_cmds; ++slot) {
+		dev->cmd.context[slot].token = slot;
+		dev->cmd.context[slot].next = slot + 1;
+	}
+
+	/* Terminate the list at the last slot and start allocating from slot 0. */
+	dev->cmd.context[dev->cmd.max_cmds - 1].next = -1;
+	dev->cmd.free_head = 0;
+
+	sema_init(&dev->cmd.event_sem, dev->cmd.max_cmds);
+	spin_lock_init(&dev->cmd.context_lock);
+
+	dev->cmd.token_mask = 1;
+	while (dev->cmd.token_mask < dev->cmd.max_cmds)
+		dev->cmd.token_mask <<= 1;
+	--dev->cmd.token_mask;
+
+	dev->cmd.flags |= MTHCA_CMD_USE_EVENTS;
+
+	down(&dev->cmd.poll_sem);
+
+	return 0;
+}
+
+/*
+ * Switch FW command completion back to polling (used when shutting the
+ * device down).  Clears MTHCA_CMD_USE_EVENTS, takes event_sem once per
+ * command slot, frees the context array and releases poll_sem.
+ */
+void mthca_cmd_use_polling(struct mthca_dev *dev)
+{
+	int remaining;
+
+	dev->cmd.flags &= ~MTHCA_CMD_USE_EVENTS;
+
+	for (remaining = dev->cmd.max_cmds; remaining > 0; --remaining)
+		down(&dev->cmd.event_sem);
+
+	kfree(dev->cmd.context);
+
+	up(&dev->cmd.poll_sem);
+}
+
+/*
+ * Allocate a command mailbox: the descriptor with kmalloc() and its
+ * buffer from dev->cmd.pool, both using @gfp_mask.  The buffer's DMA
+ * address is stored in the descriptor.
+ *
+ * Returns the mailbox, or ERR_PTR(-ENOMEM) if either allocation fails.
+ */
+struct mthca_mailbox *mthca_alloc_mailbox(struct mthca_dev *dev,
+					  gfp_t gfp_mask)
+{
+	struct mthca_mailbox *mbox = kmalloc(sizeof *mbox, gfp_mask);
+
+	if (!mbox)
+		return ERR_PTR(-ENOMEM);
+
+	mbox->buf = pci_pool_alloc(dev->cmd.pool, gfp_mask, &mbox->dma);
+	if (mbox->buf)
+		return mbox;
+
+	kfree(mbox);
+	return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * Release a mailbox obtained from mthca_alloc_mailbox(): return its
+ * buffer to dev->cmd.pool and free the descriptor.  A NULL @mailbox is
+ * ignored.
+ */
+void mthca_free_mailbox(struct mthca_dev *dev, struct mthca_mailbox *mailbox)
+{
+	if (mailbox) {
+		pci_pool_free(dev->cmd.pool, mailbox->buf, mailbox->dma);
+		kfree(mailbox);
+	}
+}
+
+/*
+ * Issue the SYS_EN command.
+ *
+ * When the command completes with status MTHCA_CMD_STAT_DDR_MEM_ERR, the
+ * immediate output is decoded into a warning.  *@status and the output
+ * are only examined when the command was actually delivered: on a
+ * posting error or timeout neither has been written by the command path.
+ *
+ * Returns 0 if the command completed (check *@status for the firmware
+ * result), or a negative errno from the command path.
+ */
+int mthca_SYS_EN(struct mthca_dev *dev, u8 *status)
+{
+	/* Initialized because the command path forwards *out_param to the HCR. */
+	u64 out = 0;
+	int ret;
+
+	ret = mthca_cmd_imm(dev, 0, &out, 0, 0, CMD_SYS_EN, CMD_TIME_CLASS_D, status);
+	if (ret)
+		return ret;
+
+	if (*status == MTHCA_CMD_STAT_DDR_MEM_ERR)
+		mthca_warn(dev, "SYS_EN DDR error: syn=%x, sock=%d, "
+			   "sladdr=%d, SPD source=%s\n",
+			   (int) (out >> 6) & 0xf, (int) (out >> 4) & 3,
+			   (int) (out >> 1) & 7, (int) out & 1 ? "NVMEM" : "DIMM");
+
+	return 0;
+}
+
+/* Issue the SYS_DIS command (no parameters, time class C). */
+int mthca_SYS_DIS(struct mthca_dev *dev, u8 *status)
+{
+	int ret;
+
+	ret = mthca_cmd(dev, 0, 0, 0, CMD_SYS_DIS, CMD_TIME_CLASS_C, status);
+	return ret;
+}
+
+/*
+ * Issue the mapping command @op for every page of @icm.
+ *
+ * Each ICM chunk is broken into naturally aligned pages of 2^lg bytes
+ * (lg taken from the lowest set bit of address | size).  Every page
+ * becomes a 16-byte mailbox entry: the virtual address (left zero when
+ * @virt is -1) followed by the physical address ORed with
+ * lg - MTHCA_ICM_PAGE_SHIFT.  The mailbox is flushed to the firmware
+ * whenever it fills up and once more at the end for any remainder.
+ *
+ * Returns 0 if all commands were delivered (check *@status for the
+ * firmware result), -EINVAL if a chunk is not aligned to
+ * MTHCA_ICM_PAGE_SIZE, or a negative errno from the mailbox allocation
+ * or the command path.
+ */
+static int mthca_map_cmd(struct mthca_dev *dev, u16 op, struct mthca_icm *icm,
+			 u64 virt, u8 *status)
+{
+	struct mthca_mailbox *mailbox;
+	struct mthca_icm_iter iter;
+	__be64 *pages;
+	int lg;
+	int nent = 0;
+	int i;
+	int err = 0;
+	int ts = 0, tc = 0;
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	memset(mailbox->buf, 0, MTHCA_MAILBOX_SIZE);
+	pages = mailbox->buf;
+
+	for (mthca_icm_first(icm, &iter);
+	     !mthca_icm_last(&iter);
+	     mthca_icm_next(&iter)) {
+		/*
+		 * We have to pass pages that are aligned to their
+		 * size, so find the least significant 1 in the
+		 * address or size and use that as our log2 size.
+		 */
+		lg = ffs(mthca_icm_addr(&iter) | mthca_icm_size(&iter)) - 1;
+		if (lg < MTHCA_ICM_PAGE_SHIFT) {
+			mthca_warn(dev, "Got FW area not aligned to %d (%llx/%lx).\n",
+				   MTHCA_ICM_PAGE_SIZE,
+				   (unsigned long long) mthca_icm_addr(&iter),
+				   mthca_icm_size(&iter));
+			err = -EINVAL;
+			goto out;
+		}
+		for (i = 0; i < mthca_icm_size(&iter) >> lg; ++i) {
+			if (virt != -1) {
+				pages[nent * 2] = cpu_to_be64(virt);
+				/* 64-bit shift: 1 << lg would overflow int at lg == 31. */
+				virt += 1ULL << lg;
+			}
+
+			/* Widen the page index before shifting it into a byte offset. */
+			pages[nent * 2 + 1] =
+				cpu_to_be64((mthca_icm_addr(&iter) + ((u64) i << lg)) |
+					    (lg - MTHCA_ICM_PAGE_SHIFT));
+			ts += 1 << (lg - 10);
+			++tc;
+
+			/* Mailbox full (16 bytes per entry): flush it to the firmware. */
+			if (++nent == MTHCA_MAILBOX_SIZE / 16) {
+				err = mthca_cmd(dev, mailbox->dma, nent, 0, op,
+						CMD_TIME_CLASS_B, status);
+				if (err || *status)
+					goto out;
+				nent = 0;
+			}
+		}
+	}
+
+	if (nent) {
+		err = mthca_cmd(dev, mailbox->dma, nent, 0, op,
+				CMD_TIME_CLASS_B, status);
+		/* Do not report a successful mapping if the last flush failed. */
+		if (err)
+			goto out;
+	}
+
+	switch (op) {
+	case CMD_MAP_FA:
+		mthca_dbg(dev, "Mapped %d chunks/%d KB for FW.\n", tc, ts);
+		break;
+	case CMD_MAP_ICM_AUX:
+		mthca_dbg(dev, "Mapped %d chunks/%d KB for ICM aux.\n", tc, ts);
+		break;
+	case CMD_MAP_ICM:
+		mthca_dbg(dev, "Mapped %d chunks/%d KB at %llx for ICM.\n",
+			  tc, ts, (unsigned long long) virt - (ts << 10));
+		break;
+	}
+
+out:
+	mthca_free_mailbox(dev, mailbox);
+	return err;
+}
+
+/*
+ * Map the firmware area described by @icm.  No virtual address is
+ * supplied: -1 tells mthca_map_cmd() to leave the virtual-address slots
+ * of the mailbox zeroed.
+ */
+int mthca_MAP_FA(struct mthca_dev *dev, struct mthca_icm *icm, u8 *status)
+{
+	const u64 no_virt = -1;
+
+	return mthca_map_cmd(dev, CMD_MAP_FA, icm, no_virt, status);
+}
+
+/* Issue the UNMAP_FA command (no parameters, time class B). */
+int mthca_UNMAP_FA(struct mthca_dev *dev, u8 *status)
+{
+	int ret;
+
+	ret = mthca_cmd(dev, 0, 0, 0, CMD_UNMAP_FA, CMD_TIME_CLASS_B, status);
+	return ret;
+}
+
+/* Issue the RUN_FW command (no parameters, time class A). */
+int mthca_RUN_FW(struct mthca_dev *dev, u8 *status)
+{
+	int ret;
+
+	ret = mthca_cmd(dev, 0, 0, 0, CMD_RUN_FW, CMD_TIME_CLASS_A, status);
+	return ret;
+}
+
+/*
+ * Map the firmware command doorbell region that starts at @base.
+ *
+ * The region must fit in one page: if the largest offset in
+ * dev->cmd.dbell_offsets[] falls on a different page than @base, a
+ * warning is logged and doorbells stay disabled.  Otherwise the region
+ * is mapped from PCI resource 2 and MTHCA_CMD_POST_DOORBELLS is set.
+ * A failed ioremap() silently leaves doorbells disabled.
+ */
+static void mthca_setup_cmd_doorbells(struct mthca_dev *dev, u64 base)
+{
+	unsigned long addr;
+	u16 max_off = 0;
+	int i;
+
+	/* Walk every doorbell dword filled in by QUERY_FW, not a literal count. */
+	for (i = 0; i < MTHCA_CMD_NUM_DBELL_DWORDS; ++i)
+		max_off = max(max_off, dev->cmd.dbell_offsets[i]);
+
+	if ((base & PAGE_MASK) != ((base + max_off) & PAGE_MASK)) {
+		mthca_warn(dev, "Firmware doorbell region at 0x%016llx, "
+			   "length 0x%x crosses a page boundary\n",
+			   (unsigned long long) base, max_off);
+		return;
+	}
+
+	/* @base is reduced to an offset inside resource 2 by masking with len - 1. */
+	addr = pci_resource_start(dev->pdev, 2) +
+		((pci_resource_len(dev->pdev, 2) - 1) & base);
+	dev->cmd.dbell_map = ioremap(addr, max_off + sizeof(u32));
+	if (!dev->cmd.dbell_map)
+		return;
+
+	dev->cmd.flags |= MTHCA_CMD_POST_DOORBELLS;
+	mthca_dbg(dev, "Mapped doorbell page for posting FW commands\n");
+}
+
+/*
+ * Issue QUERY_FW and record the results in @dev.
+ *
+ * Fills in dev->fw_ver (with the minor and subminor fields swapped),
+ * dev->cmd.max_cmds (1 << the log2 value reported by the firmware) and
+ * the catastrophic error buffer address/size.  If the firmware sets bit 0
+ * of the doorbell-enable field, the doorbell offsets are read and
+ * mthca_setup_cmd_doorbells() is called.  Mem-free devices additionally
+ * get the dev->fw.arbel fields; others get dev->fw.tavor.fw_start/fw_end.
+ *
+ * Returns 0 if the command was delivered (check *@status for the
+ * firmware result), or a negative errno from the mailbox allocation or
+ * the command path.
+ */
+int mthca_QUERY_FW(struct mthca_dev *dev, u8 *status)
+{
+	struct mthca_mailbox *mailbox;
+	u32 *outbox;
+	u64 base;
+	u32 tmp;
+	int err = 0;
+	u8 lg;
+	int i;
+
+/* Byte offsets into the QUERY_FW output mailbox. */
+#define QUERY_FW_OUT_SIZE             0x100
+#define QUERY_FW_VER_OFFSET            0x00
+#define QUERY_FW_MAX_CMD_OFFSET        0x0f
+#define QUERY_FW_ERR_START_OFFSET      0x30
+#define QUERY_FW_ERR_SIZE_OFFSET       0x38
+
+#define QUERY_FW_CMD_DB_EN_OFFSET      0x10
+#define QUERY_FW_CMD_DB_OFFSET         0x50
+#define QUERY_FW_CMD_DB_BASE           0x60
+
+/* Used on the non-mem-free path below. */
+#define QUERY_FW_START_OFFSET          0x20
+#define QUERY_FW_END_OFFSET            0x28
+
+/* Used on the mem-free path below. */
+#define QUERY_FW_SIZE_OFFSET           0x00
+#define QUERY_FW_CLR_INT_BASE_OFFSET   0x20
+#define QUERY_FW_EQ_ARM_BASE_OFFSET    0x40
+#define QUERY_FW_EQ_SET_CI_BASE_OFFSET 0x48
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	outbox = mailbox->buf;
+
+	err = mthca_cmd_box(dev, 0, mailbox->dma, 0, 0, CMD_QUERY_FW,
+			    CMD_TIME_CLASS_A, status);
+
+	if (err)
+		goto out;
+
+	MTHCA_GET(dev->fw_ver,   outbox, QUERY_FW_VER_OFFSET);
+	/*
+	 * FW subminor version is at more significant bits than minor
+	 * version, so swap here.
+	 */
+	dev->fw_ver = (dev->fw_ver & 0xffff00000000ull) |
+		((dev->fw_ver & 0xffff0000ull) >> 16) |
+		((dev->fw_ver & 0x0000ffffull) << 16);
+
+	/* The firmware reports the command limit as a log2 value. */
+	MTHCA_GET(lg, outbox, QUERY_FW_MAX_CMD_OFFSET);
+	dev->cmd.max_cmds = 1 << lg;
+
+	mthca_dbg(dev, "FW version %012llx, max commands %d\n",
+		  (unsigned long long) dev->fw_ver, dev->cmd.max_cmds);
+
+	MTHCA_GET(dev->catas_err.addr, outbox, QUERY_FW_ERR_START_OFFSET);
+	MTHCA_GET(dev->catas_err.size, outbox, QUERY_FW_ERR_SIZE_OFFSET);
+
+	mthca_dbg(dev, "Catastrophic error buffer at 0x%llx, size 0x%x\n",
+		  (unsigned long long) dev->catas_err.addr, dev->catas_err.size);
+
+	MTHCA_GET(tmp, outbox, QUERY_FW_CMD_DB_EN_OFFSET);
+	if (tmp & 0x1) {
+		mthca_dbg(dev, "FW supports commands through doorbells\n");
+
+		/* Doorbell offsets are consecutive 2-byte fields (i << 1). */
+		MTHCA_GET(base, outbox, QUERY_FW_CMD_DB_BASE);
+		for (i = 0; i < MTHCA_CMD_NUM_DBELL_DWORDS; ++i)
+			MTHCA_GET(dev->cmd.dbell_offsets[i], outbox,
+				  QUERY_FW_CMD_DB_OFFSET + (i << 1));
+
+		mthca_setup_cmd_doorbells(dev, base);
+	}
+
+	if (mthca_is_memfree(dev)) {
+		MTHCA_GET(dev->fw.arbel.fw_pages,       outbox, QUERY_FW_SIZE_OFFSET);
+		MTHCA_GET(dev->fw.arbel.clr_int_base,   outbox, QUERY_FW_CLR_INT_BASE_OFFSET);
+		MTHCA_GET(dev->fw.arbel.eq_arm_base,    outbox, QUERY_FW_EQ_ARM_BASE_OFFSET);
+		MTHCA_GET(dev->fw.arbel.eq_set_ci_base, outbox, QUERY_FW_EQ_SET_CI_BASE_OFFSET);
+		mthca_dbg(dev, "FW size %d KB\n", dev->fw.arbel.fw_pages << 2);
+
+		/*
+		 * Round up number of system pages needed in case
+		 * MTHCA_ICM_PAGE_SIZE < PAGE_SIZE.
+		 */
+		dev->fw.arbel.fw_pages =
+			ALIGN(dev->fw.arbel.fw_pages, PAGE_SIZE / MTHCA_ICM_PAGE_SIZE) >>
+				(PAGE_SHIFT - MTHCA_ICM_PAGE_SHIFT);
+
+		mthca_dbg(dev, "Clear int @ %llx, EQ arm @ %llx, EQ set CI @ %llx\n",
+			  (unsigned long long) dev->fw.arbel.clr_int_base,
+			  (unsigned long long) dev->fw.arbel.eq_arm_base,
+			  (unsigned long long) dev->fw.arbel.eq_set_ci_base);
+	} else {
+		MTHCA_GET(dev->fw.tavor.fw_start, outbox, QUERY_FW_START_OFFSET);
+		MTHCA_GET(dev->fw.tavor.fw_end,   outbox, QUERY_FW_END_OFFSET);
+
+		mthca_dbg(dev, "FW size %d KB (start %llx, end %llx)\n",
+			  (int) ((dev->fw.tavor.fw_end - dev->fw.tavor.fw_start) >> 10),
+			  (unsigned long long) dev->fw.tavor.fw_start,
+			  (unsigned long long) dev->fw.tavor.fw_end);
+	}
+
+out:
+	mthca_free_mailbox(dev, mailbox);
+	return err;
+}
+
+/*
+ * Issue ENABLE_LAM and record the HCA-attached memory range.
+ *
+ * If the command completes with MTHCA_CMD_STAT_LAM_NOT_PRE nothing is
+ * decoded.  Otherwise dev->ddr_start/ddr_end are read from the output
+ * mailbox and the firmware's "hidden" flag is compared with
+ * MTHCA_FLAG_DDR_HIDDEN; a mismatch is logged.
+ *
+ * Returns 0 if the command was delivered (check *@status), or a negative
+ * errno from the mailbox allocation or the command path.
+ */
+int mthca_ENABLE_LAM(struct mthca_dev *dev, u8 *status)
+{
+	struct mthca_mailbox *mailbox;
+	u32 *outbox;
+	u8 info;
+	int fw_hidden;
+	int err;
+
+#define ENABLE_LAM_OUT_SIZE         0x100
+#define ENABLE_LAM_START_OFFSET     0x00
+#define ENABLE_LAM_END_OFFSET       0x08
+#define ENABLE_LAM_INFO_OFFSET      0x13
+
+#define ENABLE_LAM_INFO_HIDDEN_FLAG (1 << 4)
+#define ENABLE_LAM_INFO_ECC_MASK    0x3
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	outbox = mailbox->buf;
+
+	err = mthca_cmd_box(dev, 0, mailbox->dma, 0, 0, CMD_ENABLE_LAM,
+			    CMD_TIME_CLASS_C, status);
+
+	/* *status is only inspected when the command itself was delivered. */
+	if (err || *status == MTHCA_CMD_STAT_LAM_NOT_PRE)
+		goto out;
+
+	MTHCA_GET(dev->ddr_start, outbox, ENABLE_LAM_START_OFFSET);
+	MTHCA_GET(dev->ddr_end,   outbox, ENABLE_LAM_END_OFFSET);
+	MTHCA_GET(info,           outbox, ENABLE_LAM_INFO_OFFSET);
+
+	fw_hidden = !!(info & ENABLE_LAM_INFO_HIDDEN_FLAG);
+
+	if (fw_hidden != !!(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) {
+		mthca_info(dev, "FW reports that HCA-attached memory "
+			   "is %s hidden; does not match PCI config\n",
+			   fw_hidden ? "" : "not");
+	}
+	if (fw_hidden)
+		mthca_dbg(dev, "HCA-attached memory is hidden.\n");
+
+	mthca_dbg(dev, "HCA memory size %d KB (start %llx, end %llx)\n",
+		  (int) ((dev->ddr_end - dev->ddr_start) >> 10),
+		  (unsigned long long) dev->ddr_start,
+		  (unsigned long long) dev->ddr_end);
+
+out:
+	mthca_free_mailbox(dev, mailbox);
+	return err;
+}
+
+/*
+ * Issue the DISABLE_LAM command, the counterpart of mthca_ENABLE_LAM().
+ *
+ * The opcode sent is CMD_DISABLE_LAM; sending CMD_SYS_DIS here would
+ * duplicate mthca_SYS_DIS() and never disable LAM.
+ * NOTE(review): CMD_DISABLE_LAM is assumed to be defined alongside
+ * CMD_ENABLE_LAM in this file's opcode enum -- confirm before merging.
+ *
+ * Returns 0 if the command was delivered (check *@status), or a negative
+ * errno from the command path.
+ */
+int mthca_DISABLE_LAM(struct mthca_dev *dev, u8 *status)
+{
+	return mthca_cmd(dev, 0, 0, 0, CMD_DISABLE_LAM, CMD_TIME_CLASS_C, status);
+}
+
+/*
+ * Issue QUERY_DDR and record the HCA-attached memory range.
+ *
+ * dev->ddr_start/ddr_end are read from the output mailbox and the
+ * firmware's "hidden" flag is compared with MTHCA_FLAG_DDR_HIDDEN; a
+ * mismatch is logged.
+ *
+ * Returns 0 if the command was delivered (check *@status), or a negative
+ * errno from the mailbox allocation or the command path.
+ */
+int mthca_QUERY_DDR(struct mthca_dev *dev, u8 *status)
+{
+	struct mthca_mailbox *mailbox;
+	u32 *outbox;
+	u8 info;
+	int fw_hidden;
+	int err;
+
+#define QUERY_DDR_OUT_SIZE         0x100
+#define QUERY_DDR_START_OFFSET     0x00
+#define QUERY_DDR_END_OFFSET       0x08
+#define QUERY_DDR_INFO_OFFSET      0x13
+
+#define QUERY_DDR_INFO_HIDDEN_FLAG (1 << 4)
+#define QUERY_DDR_INFO_ECC_MASK    0x3
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	outbox = mailbox->buf;
+
+	err = mthca_cmd_box(dev, 0, mailbox->dma, 0, 0, CMD_QUERY_DDR,
+			    CMD_TIME_CLASS_A, status);
+	if (err)
+		goto out;
+
+	MTHCA_GET(dev->ddr_start, outbox, QUERY_DDR_START_OFFSET);
+	MTHCA_GET(dev->ddr_end,   outbox, QUERY_DDR_END_OFFSET);
+	MTHCA_GET(info,           outbox, QUERY_DDR_INFO_OFFSET);
+
+	fw_hidden = !!(info & QUERY_DDR_INFO_HIDDEN_FLAG);
+
+	if (fw_hidden != !!(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) {
+		mthca_info(dev, "FW reports that HCA-attached memory "
+			   "is %s hidden; does not match PCI config\n",
+			   fw_hidden ? "" : "not");
+	}
+	if (fw_hidden)
+		mthca_dbg(dev, "HCA-attached memory is hidden.\n");
+
+	mthca_dbg(dev, "HCA memory size %d KB (start %llx, end %llx)\n",
+		  (int) ((dev->ddr_end - dev->ddr_start) >> 10),
+		  (unsigned long long) dev->ddr_start,
+		  (unsigned long long) dev->ddr_end);
+
+out:
+	mthca_free_mailbox(dev, mailbox);
+	return err;
+}
+
+/*
+ * Issue QUERY_DEV_LIM and decode the output mailbox into @dev_lim.
+ *
+ * Most limits are reported by the firmware as log2 values and are
+ * expanded here with 1 << field; a few are raw counts or packed
+ * nibbles.  Mem-free devices and others decode a different set of
+ * fields at the end.  The decoded limits are then logged at debug level.
+ *
+ * @dev_lim is only written when the command was delivered.
+ *
+ * Returns 0 if the command was delivered (check *@status for the
+ * firmware result), or a negative errno from the mailbox allocation or
+ * the command path.
+ */
+int mthca_QUERY_DEV_LIM(struct mthca_dev *dev,
+			struct mthca_dev_lim *dev_lim, u8 *status)
+{
+	struct mthca_mailbox *mailbox;
+	u32 *outbox;
+	u8 field;
+	u16 size;
+	u16 stat_rate;
+	int err;
+
+/* Byte offsets into the QUERY_DEV_LIM output mailbox. */
+#define QUERY_DEV_LIM_OUT_SIZE             0x100
+#define QUERY_DEV_LIM_MAX_SRQ_SZ_OFFSET     0x10
+#define QUERY_DEV_LIM_MAX_QP_SZ_OFFSET      0x11
+#define QUERY_DEV_LIM_RSVD_QP_OFFSET        0x12
+#define QUERY_DEV_LIM_MAX_QP_OFFSET         0x13
+#define QUERY_DEV_LIM_RSVD_SRQ_OFFSET       0x14
+#define QUERY_DEV_LIM_MAX_SRQ_OFFSET        0x15
+#define QUERY_DEV_LIM_RSVD_EEC_OFFSET       0x16
+#define QUERY_DEV_LIM_MAX_EEC_OFFSET        0x17
+#define QUERY_DEV_LIM_MAX_CQ_SZ_OFFSET      0x19
+#define QUERY_DEV_LIM_RSVD_CQ_OFFSET        0x1a
+#define QUERY_DEV_LIM_MAX_CQ_OFFSET         0x1b
+#define QUERY_DEV_LIM_MAX_MPT_OFFSET        0x1d
+#define QUERY_DEV_LIM_RSVD_EQ_OFFSET        0x1e
+#define QUERY_DEV_LIM_MAX_EQ_OFFSET         0x1f
+#define QUERY_DEV_LIM_RSVD_MTT_OFFSET       0x20
+#define QUERY_DEV_LIM_MAX_MRW_SZ_OFFSET     0x21
+#define QUERY_DEV_LIM_RSVD_MRW_OFFSET       0x22
+#define QUERY_DEV_LIM_MAX_MTT_SEG_OFFSET    0x23
+#define QUERY_DEV_LIM_MAX_AV_OFFSET         0x27
+#define QUERY_DEV_LIM_MAX_REQ_QP_OFFSET     0x29
+#define QUERY_DEV_LIM_MAX_RES_QP_OFFSET     0x2b
+#define QUERY_DEV_LIM_MAX_RDMA_OFFSET       0x2f
+#define QUERY_DEV_LIM_RSZ_SRQ_OFFSET        0x33
+#define QUERY_DEV_LIM_ACK_DELAY_OFFSET      0x35
+#define QUERY_DEV_LIM_MTU_WIDTH_OFFSET      0x36
+#define QUERY_DEV_LIM_VL_PORT_OFFSET        0x37
+#define QUERY_DEV_LIM_MAX_GID_OFFSET        0x3b
+#define QUERY_DEV_LIM_RATE_SUPPORT_OFFSET   0x3c
+#define QUERY_DEV_LIM_MAX_PKEY_OFFSET       0x3f
+#define QUERY_DEV_LIM_FLAGS_OFFSET          0x44
+#define QUERY_DEV_LIM_RSVD_UAR_OFFSET       0x48
+#define QUERY_DEV_LIM_UAR_SZ_OFFSET         0x49
+#define QUERY_DEV_LIM_PAGE_SZ_OFFSET        0x4b
+#define QUERY_DEV_LIM_MAX_SG_OFFSET         0x51
+#define QUERY_DEV_LIM_MAX_DESC_SZ_OFFSET    0x52
+#define QUERY_DEV_LIM_MAX_SG_RQ_OFFSET      0x55
+#define QUERY_DEV_LIM_MAX_DESC_SZ_RQ_OFFSET 0x56
+#define QUERY_DEV_LIM_MAX_QP_MCG_OFFSET     0x61
+#define QUERY_DEV_LIM_RSVD_MCG_OFFSET       0x62
+#define QUERY_DEV_LIM_MAX_MCG_OFFSET        0x63
+#define QUERY_DEV_LIM_RSVD_PD_OFFSET        0x64
+#define QUERY_DEV_LIM_MAX_PD_OFFSET         0x65
+#define QUERY_DEV_LIM_RSVD_RDD_OFFSET       0x66
+#define QUERY_DEV_LIM_MAX_RDD_OFFSET        0x67
+#define QUERY_DEV_LIM_EEC_ENTRY_SZ_OFFSET   0x80
+#define QUERY_DEV_LIM_QPC_ENTRY_SZ_OFFSET   0x82
+#define QUERY_DEV_LIM_EEEC_ENTRY_SZ_OFFSET  0x84
+#define QUERY_DEV_LIM_EQPC_ENTRY_SZ_OFFSET  0x86
+#define QUERY_DEV_LIM_EQC_ENTRY_SZ_OFFSET   0x88
+#define QUERY_DEV_LIM_CQC_ENTRY_SZ_OFFSET   0x8a
+#define QUERY_DEV_LIM_SRQ_ENTRY_SZ_OFFSET   0x8c
+#define QUERY_DEV_LIM_UAR_ENTRY_SZ_OFFSET   0x8e
+#define QUERY_DEV_LIM_MTT_ENTRY_SZ_OFFSET   0x90
+#define QUERY_DEV_LIM_MPT_ENTRY_SZ_OFFSET   0x92
+#define QUERY_DEV_LIM_PBL_SZ_OFFSET         0x96
+#define QUERY_DEV_LIM_BMME_FLAGS_OFFSET     0x97
+#define QUERY_DEV_LIM_RSVD_LKEY_OFFSET      0x98
+#define QUERY_DEV_LIM_LAMR_OFFSET           0x9f
+#define QUERY_DEV_LIM_MAX_ICM_SZ_OFFSET     0xa0
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	outbox = mailbox->buf;
+
+	err = mthca_cmd_box(dev, 0, mailbox->dma, 0, 0, CMD_QUERY_DEV_LIM,
+			    CMD_TIME_CLASS_A, status);
+
+	if (err)
+		goto out;
+
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_QP_OFFSET);
+	dev_lim->reserved_qps = 1 << (field & 0xf);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_OFFSET);
+	dev_lim->max_qps = 1 << (field & 0x1f);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_SRQ_OFFSET);
+	dev_lim->reserved_srqs = 1 << (field >> 4);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SRQ_OFFSET);
+	dev_lim->max_srqs = 1 << (field & 0x1f);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_EEC_OFFSET);
+	dev_lim->reserved_eecs = 1 << (field & 0xf);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_EEC_OFFSET);
+	dev_lim->max_eecs = 1 << (field & 0x1f);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_CQ_SZ_OFFSET);
+	dev_lim->max_cq_sz = 1 << field;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_CQ_OFFSET);
+	dev_lim->reserved_cqs = 1 << (field & 0xf);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_CQ_OFFSET);
+	dev_lim->max_cqs = 1 << (field & 0x1f);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MPT_OFFSET);
+	dev_lim->max_mpts = 1 << (field & 0x3f);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_EQ_OFFSET);
+	dev_lim->reserved_eqs = 1 << (field & 0xf);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_EQ_OFFSET);
+	dev_lim->max_eqs = 1 << (field & 0x7);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MTT_OFFSET);
+	/* On mem-free devices the reserved MTT count is expressed in MTT segments. */
+	if (mthca_is_memfree(dev))
+		dev_lim->reserved_mtts = ALIGN((1 << (field >> 4)) * sizeof(u64),
+					       dev->limits.mtt_seg_size) / dev->limits.mtt_seg_size;
+	else
+		dev_lim->reserved_mtts = 1 << (field >> 4);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MRW_SZ_OFFSET);
+	dev_lim->max_mrw_sz = 1 << field;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MRW_OFFSET);
+	dev_lim->reserved_mrws = 1 << (field & 0xf);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MTT_SEG_OFFSET);
+	dev_lim->max_mtt_seg = 1 << (field & 0x3f);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_REQ_QP_OFFSET);
+	dev_lim->max_requester_per_qp = 1 << (field & 0x3f);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_RES_QP_OFFSET);
+	dev_lim->max_responder_per_qp = 1 << (field & 0x3f);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_RDMA_OFFSET);
+	dev_lim->max_rdma_global = 1 << (field & 0x3f);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_ACK_DELAY_OFFSET);
+	dev_lim->local_ca_ack_delay = field & 0x1f;
+	/* The next two bytes each pack two nibble-sized values. */
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MTU_WIDTH_OFFSET);
+	dev_lim->max_mtu        = field >> 4;
+	dev_lim->max_port_width = field & 0xf;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_VL_PORT_OFFSET);
+	dev_lim->max_vl    = field >> 4;
+	dev_lim->num_ports = field & 0xf;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_GID_OFFSET);
+	dev_lim->max_gids = 1 << (field & 0xf);
+	MTHCA_GET(stat_rate, outbox, QUERY_DEV_LIM_RATE_SUPPORT_OFFSET);
+	dev_lim->stat_rate_support = stat_rate;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_PKEY_OFFSET);
+	dev_lim->max_pkeys = 1 << (field & 0xf);
+	MTHCA_GET(dev_lim->flags, outbox, QUERY_DEV_LIM_FLAGS_OFFSET);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_UAR_OFFSET);
+	dev_lim->reserved_uars = field >> 4;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_UAR_SZ_OFFSET);
+	dev_lim->uar_size = 1 << ((field & 0x3f) + 20);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_PAGE_SZ_OFFSET);
+	dev_lim->min_page_sz = 1 << field;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SG_OFFSET);
+	dev_lim->max_sg = field;
+
+	MTHCA_GET(size, outbox, QUERY_DEV_LIM_MAX_DESC_SZ_OFFSET);
+	dev_lim->max_desc_sz = size;
+
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_MCG_OFFSET);
+	dev_lim->max_qp_per_mcg = 1 << field;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MCG_OFFSET);
+	dev_lim->reserved_mgms = field & 0xf;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MCG_OFFSET);
+	dev_lim->max_mcgs = 1 << field;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_PD_OFFSET);
+	dev_lim->reserved_pds = field >> 4;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_PD_OFFSET);
+	dev_lim->max_pds = 1 << (field & 0x3f);
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_RDD_OFFSET);
+	dev_lim->reserved_rdds = field >> 4;
+	MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_RDD_OFFSET);
+	dev_lim->max_rdds = 1 << (field & 0x3f);
+
+	MTHCA_GET(size, outbox, QUERY_DEV_LIM_EEC_ENTRY_SZ_OFFSET);
+	dev_lim->eec_entry_sz = size;
+	MTHCA_GET(size, outbox, QUERY_DEV_LIM_QPC_ENTRY_SZ_OFFSET);
+	dev_lim->qpc_entry_sz = size;
+	MTHCA_GET(size, outbox, QUERY_DEV_LIM_EEEC_ENTRY_SZ_OFFSET);
+	dev_lim->eeec_entry_sz = size;
+	MTHCA_GET(size, outbox, QUERY_DEV_LIM_EQPC_ENTRY_SZ_OFFSET);
+	dev_lim->eqpc_entry_sz = size;
+	MTHCA_GET(size, outbox, QUERY_DEV_LIM_EQC_ENTRY_SZ_OFFSET);
+	dev_lim->eqc_entry_sz = size;
+	MTHCA_GET(size, outbox, QUERY_DEV_LIM_CQC_ENTRY_SZ_OFFSET);
+	dev_lim->cqc_entry_sz = size;
+	MTHCA_GET(size, outbox, QUERY_DEV_LIM_SRQ_ENTRY_SZ_OFFSET);
+	dev_lim->srq_entry_sz = size;
+	MTHCA_GET(size, outbox, QUERY_DEV_LIM_UAR_ENTRY_SZ_OFFSET);
+	dev_lim->uar_scratch_entry_sz = size;
+
+	if (mthca_is_memfree(dev)) {
+		MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SRQ_SZ_OFFSET);
+		dev_lim->max_srq_sz = 1 << field;
+		MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_SZ_OFFSET);
+		dev_lim->max_qp_sz = 1 << field;
+		MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSZ_SRQ_OFFSET);
+		dev_lim->hca.arbel.resize_srq = field & 1;
+		/* The receive-queue limits cap the values decoded above. */
+		MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SG_RQ_OFFSET);
+		dev_lim->max_sg = min_t(int, field, dev_lim->max_sg);
+		MTHCA_GET(size, outbox, QUERY_DEV_LIM_MAX_DESC_SZ_RQ_OFFSET);
+		dev_lim->max_desc_sz = min_t(int, size, dev_lim->max_desc_sz);
+		MTHCA_GET(size, outbox, QUERY_DEV_LIM_MPT_ENTRY_SZ_OFFSET);
+		dev_lim->mpt_entry_sz = size;
+		MTHCA_GET(field, outbox, QUERY_DEV_LIM_PBL_SZ_OFFSET);
+		dev_lim->hca.arbel.max_pbl_sz = 1 << (field & 0x3f);
+		MTHCA_GET(dev_lim->hca.arbel.bmme_flags, outbox,
+			  QUERY_DEV_LIM_BMME_FLAGS_OFFSET);
+		MTHCA_GET(dev_lim->hca.arbel.reserved_lkey, outbox,
+			  QUERY_DEV_LIM_RSVD_LKEY_OFFSET);
+		MTHCA_GET(field, outbox, QUERY_DEV_LIM_LAMR_OFFSET);
+		dev_lim->hca.arbel.lam_required = field & 1;
+		MTHCA_GET(dev_lim->hca.arbel.max_icm_sz, outbox,
+			  QUERY_DEV_LIM_MAX_ICM_SZ_OFFSET);
+
+		if (dev_lim->hca.arbel.bmme_flags & 1)
+			mthca_dbg(dev, "Base MM extensions: yes "
+				  "(flags %d, max PBL %d, rsvd L_Key %08x)\n",
+				  dev_lim->hca.arbel.bmme_flags,
+				  dev_lim->hca.arbel.max_pbl_sz,
+				  dev_lim->hca.arbel.reserved_lkey);
+		else
+			mthca_dbg(dev, "Base MM extensions: no\n");
+
+		mthca_dbg(dev, "Max ICM size %lld MB\n",
+			  (unsigned long long) dev_lim->hca.arbel.max_icm_sz >> 20);
+	} else {
+		MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SRQ_SZ_OFFSET);
+		dev_lim->max_srq_sz = (1 << field) - 1;
+		MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_SZ_OFFSET);
+		dev_lim->max_qp_sz = (1 << field) - 1;
+		MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_AV_OFFSET);
+		dev_lim->hca.tavor.max_avs = 1 << (field & 0x3f);
+		dev_lim->mpt_entry_sz = MTHCA_MPT_ENTRY_SIZE;
+	}
+
+	mthca_dbg(dev, "Max QPs: %d, reserved QPs: %d, entry size: %d\n",
+		  dev_lim->max_qps, dev_lim->reserved_qps, dev_lim->qpc_entry_sz);
+	mthca_dbg(dev, "Max SRQs: %d, reserved SRQs: %d, entry size: %d\n",
+		  dev_lim->max_srqs, dev_lim->reserved_srqs, dev_lim->srq_entry_sz);
+	mthca_dbg(dev, "Max CQs: %d, reserved CQs: %d, entry size: %d\n",
+		  dev_lim->max_cqs, dev_lim->reserved_cqs, dev_lim->cqc_entry_sz);
+	mthca_dbg(dev, "Max EQs: %d, reserved EQs: %d, entry size: %d\n",
+		  dev_lim->max_eqs, dev_lim->reserved_eqs, dev_lim->eqc_entry_sz);
+	mthca_dbg(dev, "reserved MPTs: %d, reserved MTTs: %d\n",
+		  dev_lim->reserved_mrws, dev_lim->reserved_mtts);
+	mthca_dbg(dev, "Max PDs: %d, reserved PDs: %d, reserved UARs: %d\n",
+		  dev_lim->max_pds, dev_lim->reserved_pds, dev_lim->reserved_uars);
+	/* Report the per-MCG QP limit here, not the PD limit logged just above. */
+	mthca_dbg(dev, "Max QP/MCG: %d, reserved MGMs: %d\n",
+		  dev_lim->max_qp_per_mcg, dev_lim->reserved_mgms);
+	mthca_dbg(dev, "Max CQEs: %d, max WQEs: %d, max SRQ WQEs: %d\n",
+		  dev_lim->max_cq_sz, dev_lim->max_qp_sz, dev_lim->max_srq_sz);
+
+	mthca_dbg(dev, "Flags: %08x\n", dev_lim->flags);
+
+out:
+	mthca_free_mailbox(dev, mailbox);
+	return err;
+}
+
+/*
+ * Extract the board ID from the vendor-specific data block @vsd into
+ * @board_id (MTHCA_BOARD_ID_LEN bytes, zeroed first).
+ *
+ * If both 16-bit signatures in the VSD equal VSD_SIGNATURE_TOPSPIN the
+ * ID is copied as a string from VSD_OFFSET_TS_BOARD_ID.  Otherwise four
+ * 32-bit words are taken from VSD_OFFSET_MLX_BOARD_ID and byte-swapped.
+ */
+static void get_board_id(void *vsd, char *board_id)
+{
+	u32 *dst = (u32 *) board_id;
+	int word;
+
+#define VSD_OFFSET_SIG1		0x00
+#define VSD_OFFSET_SIG2		0xde
+#define VSD_OFFSET_MLX_BOARD_ID	0xd0
+#define VSD_OFFSET_TS_BOARD_ID	0x20
+
+#define VSD_SIGNATURE_TOPSPIN	0x5ad
+
+	memset(board_id, 0, MTHCA_BOARD_ID_LEN);
+
+	if (be16_to_cpup(vsd + VSD_OFFSET_SIG1) != VSD_SIGNATURE_TOPSPIN ||
+	    be16_to_cpup(vsd + VSD_OFFSET_SIG2) != VSD_SIGNATURE_TOPSPIN) {
+		/*
+		 * The board ID is a string, but the firmware hands it
+		 * back with every 4-byte word byte-swapped, so undo
+		 * that swap word by word.
+		 */
+		for (word = 0; word < 4; ++word)
+			dst[word] =
+				swab32(*(u32 *) (vsd + VSD_OFFSET_MLX_BOARD_ID + word * 4));
+		return;
+	}
+
+	strlcpy(board_id, vsd + VSD_OFFSET_TS_BOARD_ID, MTHCA_BOARD_ID_LEN);
+}
+
+/*
+ * Issue QUERY_ADAPTER and decode the output mailbox into @adapter.
+ *
+ * The vendor, device and revision IDs are read only on devices that are
+ * not mem-free; the INTA pin and the board ID (from the VSD area) are
+ * read on all devices.
+ *
+ * Returns 0 if the command was delivered (check *@status), or a negative
+ * errno from the mailbox allocation or the command path.
+ */
+int mthca_QUERY_ADAPTER(struct mthca_dev *dev,
+			struct mthca_adapter *adapter, u8 *status)
+{
+	struct mthca_mailbox *mailbox;
+	u32 *outbox;
+	int err;
+
+#define QUERY_ADAPTER_OUT_SIZE             0x100
+#define QUERY_ADAPTER_VENDOR_ID_OFFSET     0x00
+#define QUERY_ADAPTER_DEVICE_ID_OFFSET     0x04
+#define QUERY_ADAPTER_REVISION_ID_OFFSET   0x08
+#define QUERY_ADAPTER_INTA_PIN_OFFSET      0x10
+#define QUERY_ADAPTER_VSD_OFFSET           0x20
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	outbox = mailbox->buf;
+
+	err = mthca_cmd_box(dev, 0, mailbox->dma, 0, 0, CMD_QUERY_ADAPTER,
+			    CMD_TIME_CLASS_A, status);
+	if (!err) {
+		if (!mthca_is_memfree(dev)) {
+			MTHCA_GET(adapter->vendor_id, outbox,
+				  QUERY_ADAPTER_VENDOR_ID_OFFSET);
+			MTHCA_GET(adapter->device_id, outbox,
+				  QUERY_ADAPTER_DEVICE_ID_OFFSET);
+			MTHCA_GET(adapter->revision_id, outbox,
+				  QUERY_ADAPTER_REVISION_ID_OFFSET);
+		}
+		MTHCA_GET(adapter->inta_pin, outbox,    QUERY_ADAPTER_INTA_PIN_OFFSET);
+
+		/* outbox is a u32 pointer, so the byte offset is divided by 4. */
+		get_board_id(outbox + QUERY_ADAPTER_VSD_OFFSET / 4,
+			     adapter->board_id);
+	}
+
+	mthca_free_mailbox(dev, mailbox);
+	return err;
+}
+
+/*
+ * Build the INIT_HCA input mailbox from *param and issue CMD_INIT_HCA.
+ *
+ * The mailbox is zeroed for INIT_HCA_IN_SIZE bytes and then filled field
+ * by field at the byte offsets defined below.  Returns the mailbox
+ * allocation error or the return value of mthca_cmd(); status is passed
+ * through to mthca_cmd().
+ */
+int mthca_INIT_HCA(struct mthca_dev *dev,
+		   struct mthca_init_hca_param *param,
+		   u8 *status)
+{
+	struct mthca_mailbox *mailbox;
+	__be32 *inbox;
+	int err;
+
+/* Byte offsets into the input mailbox; indented names are sub-fields. */
+#define INIT_HCA_IN_SIZE             	 0x200
+#define INIT_HCA_FLAGS1_OFFSET           0x00c
+#define INIT_HCA_FLAGS2_OFFSET           0x014
+#define INIT_HCA_QPC_OFFSET          	 0x020
+#define  INIT_HCA_QPC_BASE_OFFSET    	 (INIT_HCA_QPC_OFFSET + 0x10)
+#define  INIT_HCA_LOG_QP_OFFSET      	 (INIT_HCA_QPC_OFFSET + 0x17)
+#define  INIT_HCA_EEC_BASE_OFFSET    	 (INIT_HCA_QPC_OFFSET + 0x20)
+#define  INIT_HCA_LOG_EEC_OFFSET     	 (INIT_HCA_QPC_OFFSET + 0x27)
+#define  INIT_HCA_SRQC_BASE_OFFSET   	 (INIT_HCA_QPC_OFFSET + 0x28)
+#define  INIT_HCA_LOG_SRQ_OFFSET     	 (INIT_HCA_QPC_OFFSET + 0x2f)
+#define  INIT_HCA_CQC_BASE_OFFSET    	 (INIT_HCA_QPC_OFFSET + 0x30)
+#define  INIT_HCA_LOG_CQ_OFFSET      	 (INIT_HCA_QPC_OFFSET + 0x37)
+#define  INIT_HCA_EQPC_BASE_OFFSET   	 (INIT_HCA_QPC_OFFSET + 0x40)
+#define  INIT_HCA_EEEC_BASE_OFFSET   	 (INIT_HCA_QPC_OFFSET + 0x50)
+#define  INIT_HCA_EQC_BASE_OFFSET    	 (INIT_HCA_QPC_OFFSET + 0x60)
+#define  INIT_HCA_LOG_EQ_OFFSET      	 (INIT_HCA_QPC_OFFSET + 0x67)
+#define  INIT_HCA_RDB_BASE_OFFSET    	 (INIT_HCA_QPC_OFFSET + 0x70)
+#define INIT_HCA_UDAV_OFFSET         	 0x0b0
+#define  INIT_HCA_UDAV_LKEY_OFFSET   	 (INIT_HCA_UDAV_OFFSET + 0x0)
+#define  INIT_HCA_UDAV_PD_OFFSET     	 (INIT_HCA_UDAV_OFFSET + 0x4)
+#define INIT_HCA_MCAST_OFFSET        	 0x0c0
+#define  INIT_HCA_MC_BASE_OFFSET         (INIT_HCA_MCAST_OFFSET + 0x00)
+#define  INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x12)
+#define  INIT_HCA_MC_HASH_SZ_OFFSET      (INIT_HCA_MCAST_OFFSET + 0x16)
+#define  INIT_HCA_LOG_MC_TABLE_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x1b)
+#define INIT_HCA_TPT_OFFSET              0x0f0
+#define  INIT_HCA_MPT_BASE_OFFSET        (INIT_HCA_TPT_OFFSET + 0x00)
+#define  INIT_HCA_MTT_SEG_SZ_OFFSET      (INIT_HCA_TPT_OFFSET + 0x09)
+#define  INIT_HCA_LOG_MPT_SZ_OFFSET      (INIT_HCA_TPT_OFFSET + 0x0b)
+#define  INIT_HCA_MTT_BASE_OFFSET        (INIT_HCA_TPT_OFFSET + 0x10)
+#define INIT_HCA_UAR_OFFSET              0x120
+#define  INIT_HCA_UAR_BASE_OFFSET        (INIT_HCA_UAR_OFFSET + 0x00)
+#define  INIT_HCA_UARC_SZ_OFFSET         (INIT_HCA_UAR_OFFSET + 0x09)
+#define  INIT_HCA_LOG_UAR_SZ_OFFSET      (INIT_HCA_UAR_OFFSET + 0x0a)
+#define  INIT_HCA_UAR_PAGE_SZ_OFFSET     (INIT_HCA_UAR_OFFSET + 0x0b)
+#define  INIT_HCA_UAR_SCATCH_BASE_OFFSET (INIT_HCA_UAR_OFFSET + 0x10)
+#define  INIT_HCA_UAR_CTX_BASE_OFFSET    (INIT_HCA_UAR_OFFSET + 0x18)
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	inbox = mailbox->buf;
+
+	memset(inbox, 0, INIT_HCA_IN_SIZE);
+
+	if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT)
+		MTHCA_PUT(inbox, 0x1, INIT_HCA_FLAGS1_OFFSET);
+
+	/*
+	 * Bit 1 of the FLAGS2 word follows the host byte order: cleared on
+	 * little-endian hosts, set on big-endian hosts.  inbox is a __be32
+	 * pointer, hence the byte offset / 4 and the cpu_to_be32() masks.
+	 */
+#if defined(__LITTLE_ENDIAN)
+	*(inbox + INIT_HCA_FLAGS2_OFFSET / 4) &= ~cpu_to_be32(1 << 1);
+#elif defined(__BIG_ENDIAN)
+	*(inbox + INIT_HCA_FLAGS2_OFFSET / 4) |= cpu_to_be32(1 << 1);
+#else
+#error Host endianness not defined
+#endif
+	/* Check port for UD address vector: */
+	*(inbox + INIT_HCA_FLAGS2_OFFSET / 4) |= cpu_to_be32(1);
+
+	/* Enable IPoIB checksumming if we can: */
+	if (dev->device_cap_flags & IB_DEVICE_UD_IP_CSUM)
+		*(inbox + INIT_HCA_FLAGS2_OFFSET / 4) |= cpu_to_be32(7 << 3);
+
+	/* We leave wqe_quota, responder_exu, etc as 0 (default) */
+
+	/* QPC/EEC/CQC/EQC/RDB attributes */
+
+	MTHCA_PUT(inbox, param->qpc_base,     INIT_HCA_QPC_BASE_OFFSET);
+	MTHCA_PUT(inbox, param->log_num_qps,  INIT_HCA_LOG_QP_OFFSET);
+	MTHCA_PUT(inbox, param->eec_base,     INIT_HCA_EEC_BASE_OFFSET);
+	MTHCA_PUT(inbox, param->log_num_eecs, INIT_HCA_LOG_EEC_OFFSET);
+	MTHCA_PUT(inbox, param->srqc_base,    INIT_HCA_SRQC_BASE_OFFSET);
+	MTHCA_PUT(inbox, param->log_num_srqs, INIT_HCA_LOG_SRQ_OFFSET);
+	MTHCA_PUT(inbox, param->cqc_base,     INIT_HCA_CQC_BASE_OFFSET);
+	MTHCA_PUT(inbox, param->log_num_cqs,  INIT_HCA_LOG_CQ_OFFSET);
+	MTHCA_PUT(inbox, param->eqpc_base,    INIT_HCA_EQPC_BASE_OFFSET);
+	MTHCA_PUT(inbox, param->eeec_base,    INIT_HCA_EEEC_BASE_OFFSET);
+	MTHCA_PUT(inbox, param->eqc_base,     INIT_HCA_EQC_BASE_OFFSET);
+	MTHCA_PUT(inbox, param->log_num_eqs,  INIT_HCA_LOG_EQ_OFFSET);
+	MTHCA_PUT(inbox, param->rdb_base,     INIT_HCA_RDB_BASE_OFFSET);
+
+	/* UD AV attributes: nothing is written here, the fields stay zero */
+
+	/* multicast attributes */
+
+	MTHCA_PUT(inbox, param->mc_base,         INIT_HCA_MC_BASE_OFFSET);
+	MTHCA_PUT(inbox, param->log_mc_entry_sz, INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET);
+	MTHCA_PUT(inbox, param->mc_hash_sz,      INIT_HCA_MC_HASH_SZ_OFFSET);
+	MTHCA_PUT(inbox, param->log_mc_table_sz, INIT_HCA_LOG_MC_TABLE_SZ_OFFSET);
+
+	/* TPT attributes */
+
+	MTHCA_PUT(inbox, param->mpt_base,   INIT_HCA_MPT_BASE_OFFSET);
+	/* The MTT segment size is only written for non-memfree devices. */
+	if (!mthca_is_memfree(dev))
+		MTHCA_PUT(inbox, param->mtt_seg_sz, INIT_HCA_MTT_SEG_SZ_OFFSET);
+	MTHCA_PUT(inbox, param->log_mpt_sz, INIT_HCA_LOG_MPT_SZ_OFFSET);
+	MTHCA_PUT(inbox, param->mtt_base,   INIT_HCA_MTT_BASE_OFFSET);
+
+	/* UAR attributes */
+	{
+		/* Page size is passed as a shift relative to 1 << 12. */
+		u8 uar_page_sz = PAGE_SHIFT - 12;
+		MTHCA_PUT(inbox, uar_page_sz, INIT_HCA_UAR_PAGE_SZ_OFFSET);
+	}
+
+	MTHCA_PUT(inbox, param->uar_scratch_base, INIT_HCA_UAR_SCATCH_BASE_OFFSET);
+
+	/* The UAR context fields are only written for memfree devices. */
+	if (mthca_is_memfree(dev)) {
+		MTHCA_PUT(inbox, param->log_uarc_sz, INIT_HCA_UARC_SZ_OFFSET);
+		MTHCA_PUT(inbox, param->log_uar_sz,  INIT_HCA_LOG_UAR_SZ_OFFSET);
+		MTHCA_PUT(inbox, param->uarc_base,   INIT_HCA_UAR_CTX_BASE_OFFSET);
+	}
+
+	err = mthca_cmd(dev, mailbox->dma, 0, 0, CMD_INIT_HCA, CMD_TIME_CLASS_D, status);
+
+	mthca_free_mailbox(dev, mailbox);
+	return err;
+}
+
+/*
+ * Build the INIT_IB input mailbox for 'port' from *param and issue
+ * CMD_INIT_IB.  Returns the mailbox allocation error or the return
+ * value of mthca_cmd(); status is passed through to mthca_cmd().
+ */
+int mthca_INIT_IB(struct mthca_dev *dev,
+		  struct mthca_init_ib_param *param,
+		  int port, u8 *status)
+{
+	struct mthca_mailbox *mailbox;
+	u32 *inbox;
+	u32 flags = 0;
+	int err;
+
+#define INIT_IB_IN_SIZE          56
+#define INIT_IB_FLAGS_OFFSET     0x00
+#define INIT_IB_FLAG_SIG         (1 << 18)
+#define INIT_IB_FLAG_NG          (1 << 17)
+#define INIT_IB_FLAG_G0          (1 << 16)
+#define INIT_IB_VL_SHIFT         4
+#define INIT_IB_PORT_WIDTH_SHIFT 8
+#define INIT_IB_MTU_SHIFT        12
+#define INIT_IB_MAX_GID_OFFSET   0x06
+#define INIT_IB_MAX_PKEY_OFFSET  0x0a
+#define INIT_IB_GUID0_OFFSET     0x10
+#define INIT_IB_NODE_GUID_OFFSET 0x18
+#define INIT_IB_SI_GUID_OFFSET   0x20
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	inbox = mailbox->buf;
+	memset(inbox, 0, INIT_IB_IN_SIZE);
+
+	/* One flag bit per GUID the caller asked to have set. */
+	if (param->set_guid0)
+		flags |= INIT_IB_FLAG_G0;
+	if (param->set_node_guid)
+		flags |= INIT_IB_FLAG_NG;
+	if (param->set_si_guid)
+		flags |= INIT_IB_FLAG_SIG;
+
+	/* Capability fields share the same flags word. */
+	flags |= param->vl_cap << INIT_IB_VL_SHIFT;
+	flags |= param->port_width << INIT_IB_PORT_WIDTH_SHIFT;
+	flags |= param->mtu_cap << INIT_IB_MTU_SHIFT;
+
+	MTHCA_PUT(inbox, flags,            INIT_IB_FLAGS_OFFSET);
+	MTHCA_PUT(inbox, param->gid_cap,   INIT_IB_MAX_GID_OFFSET);
+	MTHCA_PUT(inbox, param->pkey_cap,  INIT_IB_MAX_PKEY_OFFSET);
+	MTHCA_PUT(inbox, param->guid0,     INIT_IB_GUID0_OFFSET);
+	MTHCA_PUT(inbox, param->node_guid, INIT_IB_NODE_GUID_OFFSET);
+	MTHCA_PUT(inbox, param->si_guid,   INIT_IB_SI_GUID_OFFSET);
+
+	err = mthca_cmd(dev, mailbox->dma, port, 0, CMD_INIT_IB,
+			CMD_TIME_CLASS_A, status);
+
+	mthca_free_mailbox(dev, mailbox);
+	return err;
+}
+
+/* Thin wrapper: issue CMD_CLOSE_IB for 'port' through mthca_cmd(). */
+int mthca_CLOSE_IB(struct mthca_dev *dev, int port, u8 *status)
+{
+	int err;
+
+	err = mthca_cmd(dev, 0, port, 0, CMD_CLOSE_IB, CMD_TIME_CLASS_A, status);
+	return err;
+}
+
+/*
+ * Thin wrapper: issue CMD_CLOSE_HCA through mthca_cmd(), forwarding
+ * 'panic' as the argument that precedes the opcode.
+ */
+int mthca_CLOSE_HCA(struct mthca_dev *dev, int panic, u8 *status)
+{
+	int err;
+
+	err = mthca_cmd(dev, 0, 0, panic, CMD_CLOSE_HCA, CMD_TIME_CLASS_C, status);
+	return err;
+}
+
+/*
+ * Build the SET_IB input mailbox for 'port' from *param and issue
+ * CMD_SET_IB.  Returns the mailbox allocation error or the return value
+ * of mthca_cmd(); status is passed through to mthca_cmd().
+ */
+int mthca_SET_IB(struct mthca_dev *dev, struct mthca_set_ib_param *param,
+		 int port, u8 *status)
+{
+	struct mthca_mailbox *mailbox;
+	u32 *inbox;
+	u32 flags = 0;
+	int err;
+
+#define SET_IB_IN_SIZE         0x40
+#define SET_IB_FLAGS_OFFSET    0x00
+#define SET_IB_FLAG_SIG        (1 << 18)
+#define SET_IB_FLAG_RQK        (1 <<  0)
+#define SET_IB_CAP_MASK_OFFSET 0x04
+#define SET_IB_SI_GUID_OFFSET  0x08
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	inbox = mailbox->buf;
+	memset(inbox, 0, SET_IB_IN_SIZE);
+
+	if (param->set_si_guid)
+		flags |= SET_IB_FLAG_SIG;
+	if (param->reset_qkey_viol)
+		flags |= SET_IB_FLAG_RQK;
+
+	MTHCA_PUT(inbox, flags,           SET_IB_FLAGS_OFFSET);
+	MTHCA_PUT(inbox, param->cap_mask, SET_IB_CAP_MASK_OFFSET);
+	MTHCA_PUT(inbox, param->si_guid,  SET_IB_SI_GUID_OFFSET);
+
+	err = mthca_cmd(dev, mailbox->dma, port, 0, CMD_SET_IB,
+			CMD_TIME_CLASS_B, status);
+
+	mthca_free_mailbox(dev, mailbox);
+	return err;
+}
+
+/* Thin wrapper: hand 'icm' and 'virt' to mthca_map_cmd() with CMD_MAP_ICM. */
+int mthca_MAP_ICM(struct mthca_dev *dev, struct mthca_icm *icm, u64 virt, u8 *status)
+{
+	int err;
+
+	err = mthca_map_cmd(dev, CMD_MAP_ICM, icm, virt, status);
+	return err;
+}
+
+/*
+ * Issue CMD_MAP_ICM for a single (virt, dma_addr) pair.  The pair is
+ * written big-endian into a freshly allocated mailbox, virt first.
+ * Returns the mailbox allocation error or the return value of
+ * mthca_cmd(); a debug line is emitted only when that value is 0.
+ */
+int mthca_MAP_ICM_page(struct mthca_dev *dev, u64 dma_addr, u64 virt, u8 *status)
+{
+	struct mthca_mailbox *mailbox;
+	__be64 *entry;
+	int err;
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	entry = mailbox->buf;
+	entry[0] = cpu_to_be64(virt);
+	entry[1] = cpu_to_be64(dma_addr);
+
+	err = mthca_cmd(dev, mailbox->dma, 1, 0, CMD_MAP_ICM,
+			CMD_TIME_CLASS_B, status);
+
+	mthca_free_mailbox(dev, mailbox);
+
+	if (err)
+		return err;
+
+	mthca_dbg(dev, "Mapped page at %llx to %llx for ICM.\n",
+		  (unsigned long long) dma_addr, (unsigned long long) virt);
+	return 0;
+}
+
+/*
+ * Log the request, then issue CMD_UNMAP_ICM through mthca_cmd() with
+ * 'virt' and 'page_count' as its first two command arguments.
+ */
+int mthca_UNMAP_ICM(struct mthca_dev *dev, u64 virt, u32 page_count, u8 *status)
+{
+	int err;
+
+	mthca_dbg(dev, "Unmapping %d pages at %llx from ICM.\n",
+		  page_count, (unsigned long long) virt);
+
+	err = mthca_cmd(dev, virt, page_count, 0, CMD_UNMAP_ICM, CMD_TIME_CLASS_B, status);
+	return err;
+}
+
+/*
+ * Thin wrapper: hand 'icm' to mthca_map_cmd() with CMD_MAP_ICM_AUX,
+ * passing -1 where mthca_MAP_ICM() passes a virtual address.
+ */
+int mthca_MAP_ICM_AUX(struct mthca_dev *dev, struct mthca_icm *icm, u8 *status)
+{
+	int err;
+
+	err = mthca_map_cmd(dev, CMD_MAP_ICM_AUX, icm, -1, status);
+	return err;
+}
+
+/* Thin wrapper: issue CMD_UNMAP_ICM_AUX with all-zero arguments. */
+int mthca_UNMAP_ICM_AUX(struct mthca_dev *dev, u8 *status)
+{
+	int err;
+
+	err = mthca_cmd(dev, 0, 0, 0, CMD_UNMAP_ICM_AUX, CMD_TIME_CLASS_B, status);
+	return err;
+}
+
+/*
+ * Issue CMD_SET_ICM_SIZE with icm_size as input; the command's immediate
+ * result is stored in *aux_pages.
+ *
+ * When the command succeeds (return value 0 and *status == 0),
+ * *aux_pages is converted from MTHCA_ICM_PAGE_SIZE units to system
+ * pages, rounding up.  Otherwise the return value of mthca_cmd_imm() is
+ * returned and *aux_pages is left as the command left it.
+ */
+int mthca_SET_ICM_SIZE(struct mthca_dev *dev, u64 icm_size, u64 *aux_pages,
+		       u8 *status)
+{
+	int ret = mthca_cmd_imm(dev, icm_size, aux_pages, 0, 0, CMD_SET_ICM_SIZE,
+				CMD_TIME_CLASS_A, status);
+
+	/*
+	 * Test the status byte, not the pointer: 'status' itself is never
+	 * NULL here (mthca_cmd_imm() has just been given it), so checking
+	 * the pointer would make the conversion below unreachable.
+	 */
+	if (ret || *status)
+		return ret;
+
+	/*
+	 * Round up number of system pages needed in case
+	 * MTHCA_ICM_PAGE_SIZE < PAGE_SIZE.
+	 */
+	*aux_pages = ALIGN(*aux_pages, PAGE_SIZE / MTHCA_ICM_PAGE_SIZE) >>
+		(PAGE_SHIFT - MTHCA_ICM_PAGE_SHIFT);
+
+	return 0;
+}
+
+/*
+ * Thin wrapper: issue CMD_SW2HW_MPT for entry 'mpt_index', with the
+ * caller's mailbox as input.  The mailbox is not freed here.
+ */
+int mthca_SW2HW_MPT(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+		    int mpt_index, u8 *status)
+{
+	int err;
+
+	err = mthca_cmd(dev, mailbox->dma, mpt_index, 0, CMD_SW2HW_MPT,
+			CMD_TIME_CLASS_B, status);
+	return err;
+}
+
+/*
+ * Issue CMD_HW2SW_MPT for entry 'mpt_index'.  'mailbox' may be NULL: in
+ * that case 0 is passed as the output address and the argument that
+ * precedes the opcode becomes 1 instead of 0.
+ */
+int mthca_HW2SW_MPT(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+		    int mpt_index, u8 *status)
+{
+	dma_addr_t out_dma = mailbox ? mailbox->dma : 0;
+	int no_outbox = !mailbox;
+
+	return mthca_cmd_box(dev, 0, out_dma, mpt_index,
+			     no_outbox, CMD_HW2SW_MPT,
+			     CMD_TIME_CLASS_B, status);
+}
+
+/*
+ * Thin wrapper: issue CMD_WRITE_MTT with the caller's mailbox as input
+ * and 'num_mtt' as the following argument.
+ */
+int mthca_WRITE_MTT(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+		    int num_mtt, u8 *status)
+{
+	int err;
+
+	err = mthca_cmd(dev, mailbox->dma, num_mtt, 0, CMD_WRITE_MTT,
+			CMD_TIME_CLASS_B, status);
+	return err;
+}
+
+/* Thin wrapper: issue CMD_SYNC_TPT with all-zero arguments. */
+int mthca_SYNC_TPT(struct mthca_dev *dev, u8 *status)
+{
+	int err;
+
+	err = mthca_cmd(dev, 0, 0, 0, CMD_SYNC_TPT, CMD_TIME_CLASS_B, status);
+	return err;
+}
+
+/*
+ * Log the request, then issue CMD_MAP_EQ with 'event_mask' as input.
+ * 'unmap' is placed in bit 31 of the modifier word next to 'eq_num'.
+ */
+int mthca_MAP_EQ(struct mthca_dev *dev, u64 event_mask, int unmap,
+		 int eq_num, u8 *status)
+{
+	const char *action = unmap ? "Clearing" : "Setting";
+
+	mthca_dbg(dev, "%s mask %016llx for eqn %d\n",
+		  action, (unsigned long long) event_mask, eq_num);
+
+	return mthca_cmd(dev, event_mask, (unmap << 31) | eq_num,
+			 0, CMD_MAP_EQ, CMD_TIME_CLASS_B, status);
+}
+
+/*
+ * Thin wrapper: issue CMD_SW2HW_EQ for 'eq_num', with the caller's
+ * mailbox as input.  The mailbox is not freed here.
+ */
+int mthca_SW2HW_EQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+		   int eq_num, u8 *status)
+{
+	int err;
+
+	err = mthca_cmd(dev, mailbox->dma, eq_num, 0, CMD_SW2HW_EQ,
+			CMD_TIME_CLASS_A, status);
+	return err;
+}
+
+/*
+ * Thin wrapper: issue CMD_HW2SW_EQ for 'eq_num', with the caller's
+ * mailbox as the output buffer.  The mailbox is not freed here.
+ */
+int mthca_HW2SW_EQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+		   int eq_num, u8 *status)
+{
+	int err;
+
+	err = mthca_cmd_box(dev, 0, mailbox->dma, eq_num, 0,
+			    CMD_HW2SW_EQ, CMD_TIME_CLASS_A, status);
+	return err;
+}
+
+/*
+ * Thin wrapper: issue CMD_SW2HW_CQ for 'cq_num', with the caller's
+ * mailbox as input.  The mailbox is not freed here.
+ */
+int mthca_SW2HW_CQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+		   int cq_num, u8 *status)
+{
+	int err;
+
+	err = mthca_cmd(dev, mailbox->dma, cq_num, 0, CMD_SW2HW_CQ,
+			CMD_TIME_CLASS_A, status);
+	return err;
+}
+
+/*
+ * Thin wrapper: issue CMD_HW2SW_CQ for 'cq_num', with the caller's
+ * mailbox as the output buffer.  The mailbox is not freed here.
+ */
+int mthca_HW2SW_CQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+		   int cq_num, u8 *status)
+{
+	int err;
+
+	err = mthca_cmd_box(dev, 0, mailbox->dma, cq_num, 0,
+			    CMD_HW2SW_CQ, CMD_TIME_CLASS_A, status);
+	return err;
+}
+
+/*
+ * Issue CMD_RESIZE_CQ for 'cq_num'.  Only the log size and lkey are
+ * written into the zeroed input mailbox.  Returns the mailbox
+ * allocation error or the return value of mthca_cmd().
+ */
+int mthca_RESIZE_CQ(struct mthca_dev *dev, int cq_num, u32 lkey, u8 log_size,
+		    u8 *status)
+{
+	struct mthca_mailbox *mailbox;
+	__be32 *params;
+	int err;
+
+#define RESIZE_CQ_IN_SIZE		0x40
+#define RESIZE_CQ_LOG_SIZE_OFFSET	0x0c
+#define RESIZE_CQ_LKEY_OFFSET		0x1c
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	/*
+	 * The start address fields stay zero after the memset -- mthca
+	 * assumes that MRs for CQs always start at virtual address 0.
+	 */
+	params = mailbox->buf;
+	memset(params, 0, RESIZE_CQ_IN_SIZE);
+
+	MTHCA_PUT(params, log_size, RESIZE_CQ_LOG_SIZE_OFFSET);
+	MTHCA_PUT(params, lkey,     RESIZE_CQ_LKEY_OFFSET);
+
+	err = mthca_cmd(dev, mailbox->dma, cq_num, 1, CMD_RESIZE_CQ,
+			CMD_TIME_CLASS_B, status);
+
+	mthca_free_mailbox(dev, mailbox);
+	return err;
+}
+
+/*
+ * Thin wrapper: issue CMD_SW2HW_SRQ for 'srq_num', with the caller's
+ * mailbox as input.  The mailbox is not freed here.
+ */
+int mthca_SW2HW_SRQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+		    int srq_num, u8 *status)
+{
+	int err;
+
+	err = mthca_cmd(dev, mailbox->dma, srq_num, 0, CMD_SW2HW_SRQ,
+			CMD_TIME_CLASS_A, status);
+	return err;
+}
+
+/*
+ * Thin wrapper: issue CMD_HW2SW_SRQ for 'srq_num', with the caller's
+ * mailbox as the output buffer.  The mailbox is not freed here.
+ */
+int mthca_HW2SW_SRQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+		    int srq_num, u8 *status)
+{
+	int err;
+
+	err = mthca_cmd_box(dev, 0, mailbox->dma, srq_num, 0,
+			    CMD_HW2SW_SRQ, CMD_TIME_CLASS_A, status);
+	return err;
+}
+
+/*
+ * Thin wrapper: issue CMD_QUERY_SRQ for SRQ 'num', with the caller's
+ * mailbox as the output buffer.  The mailbox is not freed here.
+ */
+int mthca_QUERY_SRQ(struct mthca_dev *dev, u32 num,
+		    struct mthca_mailbox *mailbox, u8 *status)
+{
+	int err;
+
+	err = mthca_cmd_box(dev, 0, mailbox->dma, num, 0,
+			    CMD_QUERY_SRQ, CMD_TIME_CLASS_A, status);
+	return err;
+}
+
+/*
+ * Thin wrapper: issue CMD_ARM_SRQ through mthca_cmd() with 'limit' as
+ * the input value and 'srq_num' as the following argument.
+ */
+int mthca_ARM_SRQ(struct mthca_dev *dev, int srq_num, int limit, u8 *status)
+{
+	int err;
+
+	err = mthca_cmd(dev, limit, srq_num, 0, CMD_ARM_SRQ,
+			CMD_TIME_CLASS_B, status);
+	return err;
+}
+
+/*
+ * Issue the firmware command that moves QP/EE 'num' from state 'cur' to
+ * state 'next'.
+ *
+ * The opcode is looked up in the op[cur][next] table below.  Transitions
+ * to reset (CMD_ERR2RST_QPEE) go through mthca_cmd_box() and tolerate a
+ * NULL 'mailbox'; every other transition goes through mthca_cmd() and
+ * dereferences 'mailbox', so it must be non-NULL there.  'optmask' is
+ * only used for the non-reset transitions.
+ *
+ * Returns the value returned by mthca_cmd_box() / mthca_cmd().
+ */
+int mthca_MODIFY_QP(struct mthca_dev *dev, enum ib_qp_state cur,
+		    enum ib_qp_state next, u32 num, int is_ee,
+		    struct mthca_mailbox *mailbox, u32 optmask,
+		    u8 *status)
+{
+	/*
+	 * op[cur][next]: opcode per transition.  Pairs not listed are left
+	 * as 0 by the initializer; this function does not check for them.
+	 */
+	static const u16 op[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = {
+		[IB_QPS_RESET] = {
+			[IB_QPS_RESET]	= CMD_ERR2RST_QPEE,
+			[IB_QPS_ERR]	= CMD_2ERR_QPEE,
+			[IB_QPS_INIT]	= CMD_RST2INIT_QPEE,
+		},
+		[IB_QPS_INIT]  = {
+			[IB_QPS_RESET]	= CMD_ERR2RST_QPEE,
+			[IB_QPS_ERR]	= CMD_2ERR_QPEE,
+			[IB_QPS_INIT]	= CMD_INIT2INIT_QPEE,
+			[IB_QPS_RTR]	= CMD_INIT2RTR_QPEE,
+		},
+		[IB_QPS_RTR]   = {
+			[IB_QPS_RESET]	= CMD_ERR2RST_QPEE,
+			[IB_QPS_ERR]	= CMD_2ERR_QPEE,
+			[IB_QPS_RTS]	= CMD_RTR2RTS_QPEE,
+		},
+		[IB_QPS_RTS]   = {
+			[IB_QPS_RESET]	= CMD_ERR2RST_QPEE,
+			[IB_QPS_ERR]	= CMD_2ERR_QPEE,
+			[IB_QPS_RTS]	= CMD_RTS2RTS_QPEE,
+			[IB_QPS_SQD]	= CMD_RTS2SQD_QPEE,
+		},
+		[IB_QPS_SQD] = {
+			[IB_QPS_RESET]	= CMD_ERR2RST_QPEE,
+			[IB_QPS_ERR]	= CMD_2ERR_QPEE,
+			[IB_QPS_RTS]	= CMD_SQD2RTS_QPEE,
+			[IB_QPS_SQD]	= CMD_SQD2SQD_QPEE,
+		},
+		[IB_QPS_SQE] = {
+			[IB_QPS_RESET]	= CMD_ERR2RST_QPEE,
+			[IB_QPS_ERR]	= CMD_2ERR_QPEE,
+			[IB_QPS_RTS]	= CMD_SQERR2RTS_QPEE,
+		},
+		[IB_QPS_ERR] = {
+			[IB_QPS_RESET]	= CMD_ERR2RST_QPEE,
+			[IB_QPS_ERR]	= CMD_2ERR_QPEE,
+		}
+	};
+
+	u8 op_mod = 0;
+	int my_mailbox = 0;	/* set when the mailbox was allocated here */
+	int err;
+
+	/* NOTE(review): cur/next index op[][] without a range check. */
+	if (op[cur][next] == CMD_ERR2RST_QPEE) {
+		op_mod = 3;	/* don't write outbox, any->reset */
+
+		/* For debugging */
+		if (!mailbox) {
+			mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+			if (!IS_ERR(mailbox)) {
+				my_mailbox = 1;
+				op_mod     = 2;	/* write outbox, any->reset */
+			} else
+				/* Allocation failed: carry on without an outbox. */
+				mailbox = NULL;
+		}
+
+		err = mthca_cmd_box(dev, 0, mailbox ? mailbox->dma : 0,
+				    (!!is_ee << 24) | num, op_mod,
+				    op[cur][next], CMD_TIME_CLASS_C, status);
+
+		/* Compiled-out dump of the returned context ("0 &&"). */
+		if (0 && mailbox) {
+			int i;
+			mthca_dbg(dev, "Dumping QP context:\n");
+			printk(" %08x\n", be32_to_cpup(mailbox->buf));
+			for (i = 0; i < 0x100 / 4; ++i) {
+				if (i % 8 == 0)
+					printk("[%02x] ", i * 4);
+				printk(" %08x",
+				       be32_to_cpu(((__be32 *) mailbox->buf)[i + 2]));
+				if ((i + 1) % 8 == 0)
+					printk("\n");
+			}
+		}
+
+		/* Only free a mailbox that was allocated in this function. */
+		if (my_mailbox)
+			mthca_free_mailbox(dev, mailbox);
+	} else {
+		/* Compiled-out dump of the context being sent ("if (0)"). */
+		if (0) {
+			int i;
+			mthca_dbg(dev, "Dumping QP context:\n");
+			printk("  opt param mask: %08x\n", be32_to_cpup(mailbox->buf));
+			for (i = 0; i < 0x100 / 4; ++i) {
+				if (i % 8 == 0)
+					printk("  [%02x] ", i * 4);
+				printk(" %08x",
+				       be32_to_cpu(((__be32 *) mailbox->buf)[i + 2]));
+				if ((i + 1) % 8 == 0)
+					printk("\n");
+			}
+		}
+
+		/* mailbox is dereferenced unconditionally on this path. */
+		err = mthca_cmd(dev, mailbox->dma, optmask | (!!is_ee << 24) | num,
+				op_mod, op[cur][next], CMD_TIME_CLASS_C, status);
+	}
+
+	return err;
+}
+
+/*
+ * Issue CMD_QUERY_QPEE for QP/EE 'num', with the caller's mailbox as
+ * the output buffer.  A nonzero 'is_ee' sets bit 24 of the modifier.
+ */
+int mthca_QUERY_QP(struct mthca_dev *dev, u32 num, int is_ee,
+		   struct mthca_mailbox *mailbox, u8 *status)
+{
+	u32 in_mod = (!!is_ee << 24) | num;
+
+	return mthca_cmd_box(dev, 0, mailbox->dma, in_mod, 0,
+			     CMD_QUERY_QPEE, CMD_TIME_CLASS_A, status);
+}
+
+/*
+ * Issue CMD_CONF_SPECIAL_QP for 'qpn'.  The QP type selects the value
+ * passed just before the opcode: SMI 0, GSI 1, RAW_IPV6 2, RAW_ETY 3.
+ * Any other type returns -EINVAL without issuing a command.
+ */
+int mthca_CONF_SPECIAL_QP(struct mthca_dev *dev, int type, u32 qpn,
+			  u8 *status)
+{
+	u8 op_mod;
+
+	if (type == IB_QPT_SMI)
+		op_mod = 0;
+	else if (type == IB_QPT_GSI)
+		op_mod = 1;
+	else if (type == IB_QPT_RAW_IPV6)
+		op_mod = 2;
+	else if (type == IB_QPT_RAW_ETY)
+		op_mod = 3;
+	else
+		return -EINVAL;
+
+	return mthca_cmd(dev, 0, qpn, op_mod, CMD_CONF_SPECIAL_QP,
+			 CMD_TIME_CLASS_B, status);
+}
+
+/*
+ * Issue CMD_MAD_IFC for 'port'.
+ *
+ * The first 256 bytes of the input mailbox are a copy of in_mad.  When
+ * in_wc is non-NULL, the next 256 bytes are zeroed and filled with
+ * fields taken from in_wc (and in_grh, if non-NULL) at the
+ * MAD_IFC_*_OFFSET byte offsets below.  When the command returns 0 and
+ * *status is 0, 256 bytes of the output mailbox are copied to
+ * response_mad; otherwise response_mad is not touched.
+ *
+ * Returns a mailbox allocation error or the return value of
+ * mthca_cmd_box().  Both mailboxes are freed before returning.
+ */
+int mthca_MAD_IFC(struct mthca_dev *dev, int ignore_mkey, int ignore_bkey,
+		  int port, struct ib_wc *in_wc, struct ib_grh *in_grh,
+		  void *in_mad, void *response_mad, u8 *status)
+{
+	struct mthca_mailbox *inmailbox, *outmailbox;
+	void *inbox;
+	int err;
+	u32 in_modifier = port;
+	u8 op_modifier = 0;
+
+#define MAD_IFC_BOX_SIZE      0x400
+#define MAD_IFC_MY_QPN_OFFSET 0x100
+#define MAD_IFC_RQPN_OFFSET   0x108
+#define MAD_IFC_SL_OFFSET     0x10c
+#define MAD_IFC_G_PATH_OFFSET 0x10d
+#define MAD_IFC_RLID_OFFSET   0x10e
+#define MAD_IFC_PKEY_OFFSET   0x112
+#define MAD_IFC_GRH_OFFSET    0x140
+
+	inmailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(inmailbox))
+		return PTR_ERR(inmailbox);
+	inbox = inmailbox->buf;
+
+	outmailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(outmailbox)) {
+		/* Don't leak the input mailbox allocated above. */
+		mthca_free_mailbox(dev, inmailbox);
+		return PTR_ERR(outmailbox);
+	}
+
+	memcpy(inbox, in_mad, 256);
+
+	/*
+	 * Key check traps can't be generated unless we have in_wc to
+	 * tell us where to send the trap.
+	 */
+	if (ignore_mkey || !in_wc)
+		op_modifier |= 0x1;
+	if (ignore_bkey || !in_wc)
+		op_modifier |= 0x2;
+
+	if (in_wc) {
+		u8 val;
+
+		/* Clear bytes 256..511, which hold the fields set below. */
+		memset(inbox + 256, 0, 256);
+
+		MTHCA_PUT(inbox, in_wc->qp->qp_num, MAD_IFC_MY_QPN_OFFSET);
+		MTHCA_PUT(inbox, in_wc->src_qp,     MAD_IFC_RQPN_OFFSET);
+
+		/* The SL goes into the upper four bits of its byte. */
+		val = in_wc->sl << 4;
+		MTHCA_PUT(inbox, val,               MAD_IFC_SL_OFFSET);
+
+		/* Path bits, with bit 7 set when the WC carries a GRH. */
+		val = in_wc->dlid_path_bits |
+			(in_wc->wc_flags & IB_WC_GRH ? 0x80 : 0);
+		MTHCA_PUT(inbox, val,               MAD_IFC_G_PATH_OFFSET);
+
+		MTHCA_PUT(inbox, in_wc->slid,       MAD_IFC_RLID_OFFSET);
+		MTHCA_PUT(inbox, in_wc->pkey_index, MAD_IFC_PKEY_OFFSET);
+
+		/* 40 bytes are copied from in_grh when one is supplied. */
+		if (in_grh)
+			memcpy(inbox + MAD_IFC_GRH_OFFSET, in_grh, 40);
+
+		/* 0x4 marks that the in_wc-derived fields were filled in. */
+		op_modifier |= 0x4;
+
+		/* The source LID rides in the upper half, above the port. */
+		in_modifier |= in_wc->slid << 16;
+	}
+
+	err = mthca_cmd_box(dev, inmailbox->dma, outmailbox->dma,
+			    in_modifier, op_modifier,
+			    CMD_MAD_IFC, CMD_TIME_CLASS_C, status);
+
+	if (!err && !*status)
+		memcpy(response_mad, outmailbox->buf, 256);
+
+	mthca_free_mailbox(dev, inmailbox);
+	mthca_free_mailbox(dev, outmailbox);
+	return err;
+}
+
+/*
+ * Thin wrapper: issue CMD_READ_MGM for entry 'index', with the caller's
+ * mailbox as the output buffer.  The mailbox is not freed here.
+ */
+int mthca_READ_MGM(struct mthca_dev *dev, int index,
+		   struct mthca_mailbox *mailbox, u8 *status)
+{
+	int err;
+
+	err = mthca_cmd_box(dev, 0, mailbox->dma, index, 0,
+			    CMD_READ_MGM, CMD_TIME_CLASS_A, status);
+	return err;
+}
+
+/*
+ * Thin wrapper: issue CMD_WRITE_MGM for entry 'index', with the
+ * caller's mailbox as input.  The mailbox is not freed here.
+ */
+int mthca_WRITE_MGM(struct mthca_dev *dev, int index,
+		    struct mthca_mailbox *mailbox, u8 *status)
+{
+	int err;
+
+	err = mthca_cmd(dev, mailbox->dma, index, 0, CMD_WRITE_MGM,
+			CMD_TIME_CLASS_A, status);
+	return err;
+}
+
+/*
+ * Issue CMD_MGID_HASH with the caller's mailbox as input and store the
+ * command's immediate result, truncated to 16 bits, in *hash.
+ *
+ * *hash is written whether or not the command succeeded; callers must
+ * check the return value (and *status) before trusting it.  Returns the
+ * value returned by mthca_cmd_imm().
+ */
+int mthca_MGID_HASH(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+		    u16 *hash, u8 *status)
+{
+	/*
+	 * Start from a defined value: nothing visible here guarantees that
+	 * mthca_cmd_imm() writes imm when it fails, and *hash is assigned
+	 * from it unconditionally below.
+	 */
+	u64 imm = 0;
+	int err;
+
+	err = mthca_cmd_imm(dev, mailbox->dma, &imm, 0, 0, CMD_MGID_HASH,
+			    CMD_TIME_CLASS_A, status);
+
+	*hash = imm;
+	return err;
+}
+
+/*
+ * Thin wrapper: issue CMD_NOP with 0x1f as the second command argument
+ * and a timeout of msecs_to_jiffies(100) instead of a CMD_TIME_CLASS_*.
+ */
+int mthca_NOP(struct mthca_dev *dev, u8 *status)
+{
+	int err;
+
+	err = mthca_cmd(dev, 0, 0x1f, 0, CMD_NOP, msecs_to_jiffies(100), status);
+	return err;
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_cmd.h b/sys/ofed/drivers/infiniband/hw/mthca/mthca_cmd.h
new file mode 100644
index 0000000..6efd326
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_cmd.h
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006 Cisco Systems.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MTHCA_CMD_H
+#define MTHCA_CMD_H
+
+#include <rdma/ib_verbs.h>
+
+#define MTHCA_MAILBOX_SIZE 4096
+
+/*
+ * Command status codes.
+ * NOTE(review): presumably these are the values reported through the
+ * "u8 *status" argument of the mthca_*() command wrappers -- confirm.
+ */
+enum {
+	/* command completed successfully: */
+	MTHCA_CMD_STAT_OK 	      = 0x00,
+	/* Internal error (such as a bus error) occurred while processing command: */
+	MTHCA_CMD_STAT_INTERNAL_ERR   = 0x01,
+	/* Operation/command not supported or opcode modifier not supported: */
+	MTHCA_CMD_STAT_BAD_OP 	      = 0x02,
+	/* Parameter not supported or parameter out of range: */
+	MTHCA_CMD_STAT_BAD_PARAM      = 0x03,
+	/* System not enabled or bad system state: */
+	MTHCA_CMD_STAT_BAD_SYS_STATE  = 0x04,
+	/* Attempt to access reserved or unallocated resource: */
+	MTHCA_CMD_STAT_BAD_RESOURCE   = 0x05,
+	/* Requested resource is currently executing a command, or is otherwise busy: */
+	MTHCA_CMD_STAT_RESOURCE_BUSY  = 0x06,
+	/* memory error: */
+	MTHCA_CMD_STAT_DDR_MEM_ERR    = 0x07,
+	/* Required capability exceeds device limits: */
+	MTHCA_CMD_STAT_EXCEED_LIM     = 0x08,
+	/* Resource is not in the appropriate state or ownership: */
+	MTHCA_CMD_STAT_BAD_RES_STATE  = 0x09,
+	/* Index out of range: */
+	MTHCA_CMD_STAT_BAD_INDEX      = 0x0a,
+	/* FW image corrupted: */
+	MTHCA_CMD_STAT_BAD_NVMEM      = 0x0b,
+	/* Attempt to modify a QP/EE which is not in the presumed state: */
+	MTHCA_CMD_STAT_BAD_QPEE_STATE = 0x10,
+	/* Bad segment parameters (Address/Size): */
+	MTHCA_CMD_STAT_BAD_SEG_PARAM  = 0x20,
+	/* Memory Region has Memory Windows bound to: */
+	MTHCA_CMD_STAT_REG_BOUND      = 0x21,
+	/* HCA local attached memory not present: */
+	MTHCA_CMD_STAT_LAM_NOT_PRE    = 0x22,
+	/* Bad management packet (silently discarded): */
+	MTHCA_CMD_STAT_BAD_PKT 	      = 0x30,
+	/* More outstanding CQEs in CQ than new CQ size: */
+	MTHCA_CMD_STAT_BAD_SIZE       = 0x40
+};
+
+/*
+ * Identifiers for state transitions, numbered consecutively from 0
+ * (MTHCA_TRANS_INVALID).
+ * NOTE(review): the names mirror the CMD_*_QPEE transitions; where these
+ * values are consumed is not visible from this header -- confirm.
+ */
+enum {
+	MTHCA_TRANS_INVALID = 0,
+	MTHCA_TRANS_RST2INIT,
+	MTHCA_TRANS_INIT2INIT,
+	MTHCA_TRANS_INIT2RTR,
+	MTHCA_TRANS_RTR2RTS,
+	MTHCA_TRANS_RTS2RTS,
+	MTHCA_TRANS_SQERR2RTS,
+	MTHCA_TRANS_ANY2ERR,
+	MTHCA_TRANS_RTS2SQD,
+	MTHCA_TRANS_SQD2SQD,
+	MTHCA_TRANS_SQD2RTS,
+	MTHCA_TRANS_ANY2RST,
+};
+
+/*
+ * Single-bit flag values.  Bits 10-15 are not assigned here.
+ * NOTE(review): presumably the bits of mthca_dev_lim.flags -- confirm.
+ */
+enum {
+	DEV_LIM_FLAG_RC                 = 1 << 0,
+	DEV_LIM_FLAG_UC                 = 1 << 1,
+	DEV_LIM_FLAG_UD                 = 1 << 2,
+	DEV_LIM_FLAG_RD                 = 1 << 3,
+	DEV_LIM_FLAG_RAW_IPV6           = 1 << 4,
+	DEV_LIM_FLAG_RAW_ETHER          = 1 << 5,
+	DEV_LIM_FLAG_SRQ                = 1 << 6,
+	DEV_LIM_FLAG_IPOIB_CSUM		= 1 << 7,
+	DEV_LIM_FLAG_BAD_PKEY_CNTR      = 1 << 8,
+	DEV_LIM_FLAG_BAD_QKEY_CNTR      = 1 << 9,
+	DEV_LIM_FLAG_MW                 = 1 << 16,
+	DEV_LIM_FLAG_AUTO_PATH_MIG      = 1 << 17,
+	DEV_LIM_FLAG_ATOMIC             = 1 << 18,
+	DEV_LIM_FLAG_RAW_MULTI          = 1 << 19,
+	DEV_LIM_FLAG_UD_AV_PORT_ENFORCE = 1 << 20,
+	DEV_LIM_FLAG_UD_MULTI           = 1 << 21,
+};
+
+/*
+ * Command mailbox, obtained from mthca_alloc_mailbox() and released with
+ * mthca_free_mailbox().
+ */
+struct mthca_mailbox {
+	dma_addr_t dma;	/* address handed to the command functions */
+	void      *buf;	/* CPU pointer used to fill in / read the contents */
+};
+
+/*
+ * Device limits, passed to mthca_QUERY_DEV_LIM() to be filled in.
+ * NOTE(review): units and encodings of the individual fields are not
+ * visible from this header -- check mthca_QUERY_DEV_LIM() before relying
+ * on them.
+ */
+struct mthca_dev_lim {
+	int max_srq_sz;
+	int max_qp_sz;
+	int reserved_qps;
+	int max_qps;
+	int reserved_srqs;
+	int max_srqs;
+	int reserved_eecs;
+	int max_eecs;
+	int max_cq_sz;
+	int reserved_cqs;
+	int max_cqs;
+	int max_mpts;
+	int reserved_eqs;
+	int max_eqs;
+	int reserved_mtts;
+	int max_mrw_sz;
+	int reserved_mrws;
+	int max_mtt_seg;
+	int max_requester_per_qp;
+	int max_responder_per_qp;
+	int max_rdma_global;
+	int local_ca_ack_delay;
+	int max_mtu;
+	int max_port_width;
+	int max_vl;
+	int num_ports;
+	int max_gids;
+	u16 stat_rate_support;
+	int max_pkeys;
+	u32 flags;
+	int reserved_uars;
+	int uar_size;
+	int min_page_sz;
+	int max_sg;
+	int max_desc_sz;
+	int max_qp_per_mcg;
+	int reserved_mgms;
+	int max_mcgs;
+	int reserved_pds;
+	int max_pds;
+	int reserved_rdds;
+	int max_rdds;
+	int eec_entry_sz;
+	int qpc_entry_sz;
+	int eeec_entry_sz;
+	int eqpc_entry_sz;
+	int eqc_entry_sz;
+	int cqc_entry_sz;
+	int srq_entry_sz;
+	int uar_scratch_entry_sz;
+	int mpt_entry_sz;
+	/*
+	 * Family-specific limits; only one member is meaningful.
+	 * NOTE(review): presumably selected by the HCA family (tavor vs.
+	 * arbel/memfree) -- confirm against mthca_QUERY_DEV_LIM().
+	 */
+	union {
+		struct {
+			int max_avs;
+		} tavor;
+		struct {
+			int resize_srq;
+			int max_pbl_sz;
+			u8  bmme_flags;
+			u32 reserved_lkey;
+			int lam_required;
+			u64 max_icm_sz;
+		} arbel;
+	} hca;
+};
+
+/*
+ * Adapter identification, filled in by mthca_QUERY_ADAPTER().
+ * vendor_id, device_id and revision_id are only written by that function
+ * when the device is not memfree; inta_pin and board_id are always
+ * written on success.
+ */
+struct mthca_adapter {
+	u32  vendor_id;
+	u32  device_id;
+	u32  revision_id;
+	char board_id[MTHCA_BOARD_ID_LEN];
+	u8   inta_pin;
+};
+
+/*
+ * Input parameters for mthca_INIT_HCA(), which copies each field into
+ * the INIT_HCA command mailbox.  The field widths matter: they determine
+ * how many bytes are written for each field.
+ */
+struct mthca_init_hca_param {
+	u64 qpc_base;
+	u64 eec_base;
+	u64 srqc_base;
+	u64 cqc_base;
+	u64 eqpc_base;
+	u64 eeec_base;
+	u64 eqc_base;
+	u64 rdb_base;
+	u64 mc_base;
+	u64 mpt_base;
+	u64 mtt_base;
+	u64 uar_scratch_base;
+	u64 uarc_base;		/* used only for memfree devices */
+	u16 log_mc_entry_sz;
+	u16 mc_hash_sz;
+	u8  log_num_qps;
+	u8  log_num_eecs;
+	u8  log_num_srqs;
+	u8  log_num_cqs;
+	u8  log_num_eqs;
+	u8  log_mc_table_sz;
+	u8  mtt_seg_sz;		/* used only for non-memfree devices */
+	u8  log_mpt_sz;
+	u8  log_uar_sz;		/* used only for memfree devices */
+	u8  log_uarc_sz;	/* used only for memfree devices */
+};
+
+/*
+ * Input parameters for mthca_INIT_IB().  port_width, vl_cap and mtu_cap
+ * are shifted into the command's flags word; each set_* field, when
+ * nonzero, turns on the flag bit for the GUID that follows it.  The GUID
+ * values themselves are written to the mailbox regardless of set_*.
+ */
+struct mthca_init_ib_param {
+	int port_width;
+	int vl_cap;
+	int mtu_cap;
+	u16 gid_cap;
+	u16 pkey_cap;
+	int set_guid0;
+	u64 guid0;
+	int set_node_guid;
+	u64 node_guid;
+	int set_si_guid;
+	u64 si_guid;
+};
+
+/*
+ * Input parameters for mthca_SET_IB().  set_si_guid and reset_qkey_viol
+ * are treated as booleans that turn on one flag bit each; si_guid and
+ * cap_mask are written to the command mailbox as given.
+ */
+struct mthca_set_ib_param {
+	int set_si_guid;
+	int reset_qkey_viol;
+	u64 si_guid;
+	u32 cap_mask;
+};
+
+/* Command interface setup, teardown and event/polling mode switches. */
+int mthca_cmd_init(struct mthca_dev *dev);
+void mthca_cmd_cleanup(struct mthca_dev *dev);
+int mthca_cmd_use_events(struct mthca_dev *dev);
+void mthca_cmd_use_polling(struct mthca_dev *dev);
+void mthca_cmd_event(struct mthca_dev *dev, u16 token,
+		     u8  status, u64 out_param);
+
+/* Mailbox allocation; every allocated mailbox must be freed again. */
+struct mthca_mailbox *mthca_alloc_mailbox(struct mthca_dev *dev,
+					  gfp_t gfp_mask);
+void mthca_free_mailbox(struct mthca_dev *dev, struct mthca_mailbox *mailbox);
+
+/*
+ * Command wrappers.  Each returns an int and takes a "u8 *status"
+ * argument as its last parameter.
+ * NOTE(review): callers such as mthca_MAD_IFC() treat "return value 0
+ * and *status == 0" as success; presumably *status holds one of the
+ * MTHCA_CMD_STAT_* values -- confirm.
+ */
+int mthca_SYS_EN(struct mthca_dev *dev, u8 *status);
+int mthca_SYS_DIS(struct mthca_dev *dev, u8 *status);
+int mthca_MAP_FA(struct mthca_dev *dev, struct mthca_icm *icm, u8 *status);
+int mthca_UNMAP_FA(struct mthca_dev *dev, u8 *status);
+int mthca_RUN_FW(struct mthca_dev *dev, u8 *status);
+int mthca_QUERY_FW(struct mthca_dev *dev, u8 *status);
+int mthca_ENABLE_LAM(struct mthca_dev *dev, u8 *status);
+int mthca_DISABLE_LAM(struct mthca_dev *dev, u8 *status);
+int mthca_QUERY_DDR(struct mthca_dev *dev, u8 *status);
+int mthca_QUERY_DEV_LIM(struct mthca_dev *dev,
+			struct mthca_dev_lim *dev_lim, u8 *status);
+int mthca_QUERY_ADAPTER(struct mthca_dev *dev,
+			struct mthca_adapter *adapter, u8 *status);
+int mthca_INIT_HCA(struct mthca_dev *dev,
+		   struct mthca_init_hca_param *param,
+		   u8 *status);
+int mthca_INIT_IB(struct mthca_dev *dev,
+		  struct mthca_init_ib_param *param,
+		  int port, u8 *status);
+int mthca_CLOSE_IB(struct mthca_dev *dev, int port, u8 *status);
+int mthca_CLOSE_HCA(struct mthca_dev *dev, int panic, u8 *status);
+int mthca_SET_IB(struct mthca_dev *dev, struct mthca_set_ib_param *param,
+		 int port, u8 *status);
+int mthca_MAP_ICM(struct mthca_dev *dev, struct mthca_icm *icm, u64 virt, u8 *status);
+int mthca_MAP_ICM_page(struct mthca_dev *dev, u64 dma_addr, u64 virt, u8 *status);
+int mthca_UNMAP_ICM(struct mthca_dev *dev, u64 virt, u32 page_count, u8 *status);
+int mthca_MAP_ICM_AUX(struct mthca_dev *dev, struct mthca_icm *icm, u8 *status);
+int mthca_UNMAP_ICM_AUX(struct mthca_dev *dev, u8 *status);
+int mthca_SET_ICM_SIZE(struct mthca_dev *dev, u64 icm_size, u64 *aux_pages,
+		       u8 *status);
+int mthca_SW2HW_MPT(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+		    int mpt_index, u8 *status);
+int mthca_HW2SW_MPT(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+		    int mpt_index, u8 *status);
+int mthca_WRITE_MTT(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+		    int num_mtt, u8 *status);
+int mthca_SYNC_TPT(struct mthca_dev *dev, u8 *status);
+int mthca_MAP_EQ(struct mthca_dev *dev, u64 event_mask, int unmap,
+		 int eq_num, u8 *status);
+int mthca_SW2HW_EQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+		   int eq_num, u8 *status);
+int mthca_HW2SW_EQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+		   int eq_num, u8 *status);
+int mthca_SW2HW_CQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+		   int cq_num, u8 *status);
+int mthca_HW2SW_CQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+		   int cq_num, u8 *status);
+int mthca_RESIZE_CQ(struct mthca_dev *dev, int cq_num, u32 lkey, u8 log_size,
+		    u8 *status);
+int mthca_SW2HW_SRQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+		    int srq_num, u8 *status);
+int mthca_HW2SW_SRQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+		    int srq_num, u8 *status);
+int mthca_QUERY_SRQ(struct mthca_dev *dev, u32 num,
+		    struct mthca_mailbox *mailbox, u8 *status);
+int mthca_ARM_SRQ(struct mthca_dev *dev, int srq_num, int limit, u8 *status);
+int mthca_MODIFY_QP(struct mthca_dev *dev, enum ib_qp_state cur,
+		    enum ib_qp_state next, u32 num, int is_ee,
+		    struct mthca_mailbox *mailbox, u32 optmask,
+		    u8 *status);
+int mthca_QUERY_QP(struct mthca_dev *dev, u32 num, int is_ee,
+		   struct mthca_mailbox *mailbox, u8 *status);
+int mthca_CONF_SPECIAL_QP(struct mthca_dev *dev, int type, u32 qpn,
+			  u8 *status);
+int mthca_MAD_IFC(struct mthca_dev *dev, int ignore_mkey, int ignore_bkey,
+		  int port, struct ib_wc *in_wc, struct ib_grh *in_grh,
+		  void *in_mad, void *response_mad, u8 *status);
+int mthca_READ_MGM(struct mthca_dev *dev, int index,
+		   struct mthca_mailbox *mailbox, u8 *status);
+int mthca_WRITE_MGM(struct mthca_dev *dev, int index,
+		    struct mthca_mailbox *mailbox, u8 *status);
+int mthca_MGID_HASH(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+		    u16 *hash, u8 *status);
+int mthca_NOP(struct mthca_dev *dev, u8 *status);
+
+#endif /* MTHCA_CMD_H */
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_config_reg.h b/sys/ofed/drivers/infiniband/hw/mthca/mthca_config_reg.h
new file mode 100644
index 0000000..75671f7
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_config_reg.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MTHCA_CONFIG_REG_H
+#define MTHCA_CONFIG_REG_H
+
+#include <asm/page.h>
+
+/*
+ * Offsets and sizes of the register windows used by the driver.
+ * NOTE(review): presumably these offsets are relative to the start of
+ * a device register BAR -- confirm against the code that maps them.
+ */
+#define MTHCA_HCR_BASE         0x80680
+#define MTHCA_HCR_SIZE         0x0001c
+#define MTHCA_ECR_BASE         0x80700
+#define MTHCA_ECR_SIZE         0x00008
+/* Starts immediately after the ECR window (0x80700 + 0x8). */
+#define MTHCA_ECR_CLR_BASE     0x80708
+#define MTHCA_ECR_CLR_SIZE     0x00008
+/* Size of one mapping that covers both the ECR and ECR clear windows. */
+#define MTHCA_MAP_ECR_SIZE     (MTHCA_ECR_SIZE + MTHCA_ECR_CLR_SIZE)
+#define MTHCA_CLR_INT_BASE     0xf00d8
+#define MTHCA_CLR_INT_SIZE     0x00008
+#define MTHCA_EQ_SET_CI_SIZE   (8 * 32)
+
+#endif /* MTHCA_CONFIG_REG_H */
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_cq.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_cq.c
new file mode 100644
index 0000000..aa75d26
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_cq.c
@@ -0,0 +1,992 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/hardirq.h>
+#include <linux/sched.h>
+
+#include <asm/io.h>
+
+#include <rdma/ib_pack.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+#include "mthca_memfree.h"
+
+/* Sizing constants for completion queue buffers and their entries. */
+enum {
+	/* Limit passed to mthca_buf_alloc() as the maximum direct buffer size. */
+	MTHCA_MAX_DIRECT_CQ_SIZE = 4 * PAGE_SIZE,
+
+	/* Number of bytes one completion queue entry occupies. */
+	MTHCA_CQ_ENTRY_SIZE      = 0x20,
+
+	/* byte_len reported in work completions for atomic operations. */
+	MTHCA_ATOMIC_BYTE_LEN    = 8
+};
+
+/*
+ * CQ context as written into the command mailbox by mthca_init_cq()
+ * and handed to the device with SW2HW_CQ.  All multi-byte fields are
+ * big-endian.
+ *
+ * Fields filled in by mthca_init_cq():
+ *   flags            - status, state and flag bits (MTHCA_CQ_*)
+ *   logsize_usrpage  - log2 of the number of entries in bits 24 and up,
+ *                      OR'ed with the UAR index
+ *   error_eqn        - number of the async event queue
+ *   comp_eqn         - number of the completion event queue
+ *   pd, lkey, cqn    - protection domain, CQ buffer lkey and CQ number
+ *   ci_db, state_db  - set-CI and arm doorbell record indices; only
+ *                      written for mem-free HCAs
+ * Everything else is zeroed.
+ *
+ * Must be packed because start is 64 bits but only aligned to 32 bits.
+ */
+struct mthca_cq_context {
+	__be32 flags;
+	__be64 start;
+	__be32 logsize_usrpage;
+	__be32 error_eqn;	/* Tavor only */
+	__be32 comp_eqn;
+	__be32 pd;
+	__be32 lkey;
+	__be32 last_notified_index;
+	__be32 solicit_producer_index;
+	__be32 consumer_index;
+	__be32 producer_index;
+	__be32 cqn;
+	__be32 ci_db;		/* Arbel only */
+	__be32 state_db;	/* Arbel only */
+	u32    reserved;
+} __attribute__((packed));
+
+/*
+ * Bit fields for mthca_cq_context.flags.  mthca_init_cq() programs
+ * STATUS_OK | STATE_DISARMED | FLAG_TR; the other values are not
+ * referenced in this file.
+ * NOTE(review): MTHCA_EQ_STATE_FIRED carries an "EQ" prefix although
+ * it sits with the CQ state values -- confirm whether that is intended.
+ */
+#define MTHCA_CQ_STATUS_OK          ( 0 << 28)
+#define MTHCA_CQ_STATUS_OVERFLOW    ( 9 << 28)
+#define MTHCA_CQ_STATUS_WRITE_FAIL  (10 << 28)
+#define MTHCA_CQ_FLAG_TR            ( 1 << 18)
+#define MTHCA_CQ_FLAG_OI            ( 1 << 17)
+#define MTHCA_CQ_STATE_DISARMED     ( 0 <<  8)
+#define MTHCA_CQ_STATE_ARMED        ( 1 <<  8)
+#define MTHCA_CQ_STATE_ARMED_SOL    ( 4 <<  8)
+#define MTHCA_EQ_STATE_FIRED        (10 <<  8)
+
+/*
+ * A CQE is treated as an error CQE when all bits of this mask are set
+ * in its opcode byte.  For such CQEs bit 0 of the opcode tells send (1)
+ * from receive (0).
+ */
+enum {
+	MTHCA_ERROR_CQE_OPCODE_MASK = 0xfe
+};
+
+/*
+ * Values of mthca_err_cqe.syndrome.  handle_error_cqe() translates
+ * each one into the matching IB_WC_* status; any other value is
+ * reported as IB_WC_GENERAL_ERR.
+ */
+enum {
+	SYNDROME_LOCAL_LENGTH_ERR 	 = 0x01,
+	SYNDROME_LOCAL_QP_OP_ERR  	 = 0x02,
+	SYNDROME_LOCAL_EEC_OP_ERR 	 = 0x03,
+	SYNDROME_LOCAL_PROT_ERR   	 = 0x04,
+	SYNDROME_WR_FLUSH_ERR     	 = 0x05,
+	SYNDROME_MW_BIND_ERR      	 = 0x06,
+	SYNDROME_BAD_RESP_ERR     	 = 0x10,
+	SYNDROME_LOCAL_ACCESS_ERR 	 = 0x11,
+	SYNDROME_REMOTE_INVAL_REQ_ERR 	 = 0x12,
+	SYNDROME_REMOTE_ACCESS_ERR 	 = 0x13,
+	SYNDROME_REMOTE_OP_ERR     	 = 0x14,
+	SYNDROME_RETRY_EXC_ERR 		 = 0x15,
+	SYNDROME_RNR_RETRY_EXC_ERR 	 = 0x16,
+	SYNDROME_LOCAL_RDD_VIOL_ERR 	 = 0x20,
+	SYNDROME_REMOTE_INVAL_RD_REQ_ERR = 0x21,
+	SYNDROME_REMOTE_ABORTED_ERR 	 = 0x22,
+	SYNDROME_INVAL_EECN_ERR 	 = 0x23,
+	SYNDROME_INVAL_EEC_STATE_ERR 	 = 0x24
+};
+
+/*
+ * Layout of a normal (non-error) completion queue entry as read by
+ * mthca_poll_one().  How the fields are used there:
+ *   my_qpn   - QP the completion belongs to
+ *   my_ee    - (value >> 16) & 0xff00 forms the upper checksum byte
+ *   rqpn     - low 24 bits: source QP; top 8 bits: lower checksum byte
+ *   sl_ipok  - upper nibble: service level; bit 0 feeds csum_ok
+ *   g_mlpath - bit 7: GRH present; low 7 bits: DLID path bits
+ *   rlid     - source LID
+ *   imm_etype_pkey_eec - immediate data, or pkey index in the top 16 bits
+ *   byte_cnt - byte length of the transfer
+ *   wqe      - address/offset of the completed WQE
+ *   is_send  - bit 7 set for send completions
+ *   owner    - bit 7 set while the entry belongs to hardware
+ */
+struct mthca_cqe {
+	__be32 my_qpn;
+	__be32 my_ee;
+	__be32 rqpn;
+	u8     sl_ipok;
+	u8     g_mlpath;
+	__be16 rlid;
+	__be32 imm_etype_pkey_eec;
+	__be32 byte_cnt;
+	__be32 wqe;
+	u8     opcode;
+	u8     is_send;
+	u8     reserved;
+	u8     owner;
+};
+
+/*
+ * Alternate view of a CQ entry used for completions in error;
+ * mthca_poll_one() casts a struct mthca_cqe pointer to this type
+ * before calling handle_error_cqe().
+ *   syndrome   - error code (SYNDROME_*)
+ *   vendor_err - copied verbatim into ib_wc.vendor_err
+ *   db_cnt     - doorbell count, adjusted by handle_error_cqe() when
+ *                the entry is kept for another poll
+ *   wqe        - address/offset of the failing WQE
+ */
+struct mthca_err_cqe {
+	__be32 my_qpn;
+	u32    reserved1[3];
+	u8     syndrome;
+	u8     vendor_err;
+	__be16 db_cnt;
+	u32    reserved2;
+	__be32 wqe;
+	u8     opcode;
+	u8     reserved3[2];
+	u8     owner;
+};
+
+/* Ownership bit (bit 7) kept in the last byte of every CQ entry. */
+#define MTHCA_CQ_ENTRY_OWNER_SW      (0 << 7)
+#define MTHCA_CQ_ENTRY_OWNER_HW      (1 << 7)
+
+/*
+ * Command codes OR'ed with the CQ number to form the first word of a
+ * Tavor-mode CQ doorbell.  This file uses INC_CI, REQ_NOT and
+ * REQ_NOT_SOL.
+ */
+#define MTHCA_TAVOR_CQ_DB_INC_CI       (1 << 24)
+#define MTHCA_TAVOR_CQ_DB_REQ_NOT      (2 << 24)
+#define MTHCA_TAVOR_CQ_DB_REQ_NOT_SOL  (3 << 24)
+#define MTHCA_TAVOR_CQ_DB_SET_CI       (4 << 24)
+#define MTHCA_TAVOR_CQ_DB_REQ_NOT_MULT (5 << 24)
+
+/* Same role for Arbel (mem-free) mode; see mthca_arbel_arm_cq(). */
+#define MTHCA_ARBEL_CQ_DB_REQ_NOT_SOL  (1 << 24)
+#define MTHCA_ARBEL_CQ_DB_REQ_NOT      (2 << 24)
+#define MTHCA_ARBEL_CQ_DB_REQ_NOT_MULT (3 << 24)
+
+/*
+ * Return the address of CQ entry number 'entry' inside 'buf'.  A
+ * direct buffer is indexed linearly; otherwise the byte offset is
+ * split into a page_list index and an offset within that page.
+ */
+static inline struct mthca_cqe *get_cqe_from_buf(struct mthca_cq_buf *buf,
+						 int entry)
+{
+	int byte_off = entry * MTHCA_CQ_ENTRY_SIZE;
+
+	if (!buf->is_direct)
+		return buf->queue.page_list[byte_off / PAGE_SIZE].buf
+			+ byte_off % PAGE_SIZE;
+
+	return buf->queue.direct.buf + byte_off;
+}
+
+/* Look up entry number 'entry' in the CQ's current buffer. */
+static inline struct mthca_cqe *get_cqe(struct mthca_cq *cq, int entry)
+{
+	struct mthca_cq_buf *cur_buf = &cq->buf;
+
+	return get_cqe_from_buf(cur_buf, entry);
+}
+
+/*
+ * Return 'cqe' if it is owned by software, or NULL while the hardware
+ * ownership bit is still set.
+ */
+static inline struct mthca_cqe *cqe_sw(struct mthca_cqe *cqe)
+{
+	if (cqe->owner & MTHCA_CQ_ENTRY_OWNER_HW)
+		return NULL;
+
+	return cqe;
+}
+
+/*
+ * Return the entry at the current consumer index if software owns it,
+ * NULL otherwise.  ibcq.cqe is used as the index mask.
+ */
+static inline struct mthca_cqe *next_cqe_sw(struct mthca_cq *cq)
+{
+	struct mthca_cqe *head = get_cqe(cq, cq->cons_index & cq->ibcq.cqe);
+
+	return cqe_sw(head);
+}
+
+/*
+ * Hand a CQ entry back to hardware.  The whole owner byte is
+ * overwritten, not just the ownership bit.
+ */
+static inline void set_cqe_hw(struct mthca_cqe *cqe)
+{
+	cqe->owner = MTHCA_CQ_ENTRY_OWNER_HW;
+}
+
+/* Log the eight 32-bit words of a CQ entry via mthca_dbg(). */
+static void dump_cqe(struct mthca_dev *dev, void *cqe_ptr)
+{
+	__be32 *word = cqe_ptr;
+
+	/* keeps 'word' referenced when mthca_dbg() expands to nothing */
+	(void) word;
+	mthca_dbg(dev, "CQE contents %08x %08x %08x %08x %08x %08x %08x %08x\n",
+		  be32_to_cpu(word[0]), be32_to_cpu(word[1]),
+		  be32_to_cpu(word[2]), be32_to_cpu(word[3]),
+		  be32_to_cpu(word[4]), be32_to_cpu(word[5]),
+		  be32_to_cpu(word[6]), be32_to_cpu(word[7]));
+}
+
+/*
+ * Tell the HCA how far software has consumed the CQ.
+ *
+ * Mem-free (Arbel) mode publishes the absolute cq->cons_index through
+ * the set_ci doorbell record and ignores 'incr', so cq->cons_index
+ * must already be up to date when this is called.  Tavor mode rings
+ * an INC_CI doorbell carrying incr - 1.
+ */
+static inline void update_cons_index(struct mthca_dev *dev, struct mthca_cq *cq,
+				     int incr)
+{
+	if (!mthca_is_memfree(dev)) {
+		mthca_write64(MTHCA_TAVOR_CQ_DB_INC_CI | cq->cqn, incr - 1,
+			      dev->kar + MTHCA_CQ_DOORBELL,
+			      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+		/*
+		 * Keep doorbells from leaking out of the CQ spinlock
+		 * and arriving at the HCA out of order.
+		 */
+		mmiowb();
+		return;
+	}
+
+	*cq->set_ci_db = cpu_to_be32(cq->cons_index);
+	wmb();
+}
+
+/*
+ * Handle a completion event for CQ number 'cqn': bump the arm sequence
+ * number and call the consumer's completion handler.  An unknown CQ is
+ * logged and otherwise ignored.
+ */
+void mthca_cq_completion(struct mthca_dev *dev, u32 cqn)
+{
+	struct mthca_cq *cq = mthca_array_get(&dev->cq_table.cq,
+					      cqn & (dev->limits.num_cqs - 1));
+
+	if (cq == NULL) {
+		mthca_warn(dev, "Completion event for bogus CQ %08x\n", cqn);
+		return;
+	}
+
+	cq->arm_sn++;
+	cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
+}
+
+/*
+ * Deliver an asynchronous event of type 'event_type' to the consumer
+ * of CQ number 'cqn'.  A reference is held on the CQ while the handler
+ * runs; dropping the last reference wakes anyone sleeping on cq->wait.
+ */
+void mthca_cq_event(struct mthca_dev *dev, u32 cqn,
+		    enum ib_event_type event_type)
+{
+	struct ib_event ev;
+	struct mthca_cq *cq;
+
+	/* Look the CQ up and pin it under the table lock. */
+	spin_lock(&dev->cq_table.lock);
+	cq = mthca_array_get(&dev->cq_table.cq, cqn & (dev->limits.num_cqs - 1));
+	if (cq != NULL)
+		cq->refcount++;
+	spin_unlock(&dev->cq_table.lock);
+
+	if (cq == NULL) {
+		mthca_warn(dev, "Async event for bogus CQ %08x\n", cqn);
+		return;
+	}
+
+	ev.device     = &dev->ib_dev;
+	ev.event      = event_type;
+	ev.element.cq = &cq->ibcq;
+	if (cq->ibcq.event_handler != NULL)
+		cq->ibcq.event_handler(&ev, cq->ibcq.cq_context);
+
+	/* Drop our reference; the last one out wakes the waiter. */
+	spin_lock(&dev->cq_table.lock);
+	if (--cq->refcount == 0)
+		wake_up(&cq->wait);
+	spin_unlock(&dev->cq_table.lock);
+}
+
+/*
+ * Return non-zero if 'cqe' describes a receive completion.  Error CQEs
+ * encode the direction in bit 0 of the opcode; all others use bit 7 of
+ * is_send.
+ */
+static inline int is_recv_cqe(struct mthca_cqe *cqe)
+{
+	int error_cqe = (cqe->opcode & MTHCA_ERROR_CQE_OPCODE_MASK) ==
+		MTHCA_ERROR_CQE_OPCODE_MASK;
+
+	return error_cqe ? !(cqe->opcode & 0x01) : !(cqe->is_send & 0x80);
+}
+
+/*
+ * Remove every software-owned CQ entry that belongs to QP 'qpn' from
+ * 'cq', compacting the remaining entries.  If 'srq' is non-NULL, the
+ * SRQ WQE referenced by each removed receive completion is freed too.
+ * Runs with cq->lock held and interrupts disabled; the consumer index
+ * is advanced by the number of entries removed.
+ */
+void mthca_cq_clean(struct mthca_dev *dev, struct mthca_cq *cq, u32 qpn,
+		    struct mthca_srq *srq)
+{
+	struct mthca_cqe *cqe;
+	u32 prod_index;
+	int i, nfreed = 0;
+
+	spin_lock_irq(&cq->lock);
+
+	/*
+	 * First we need to find the current producer index, so we
+	 * know where to start cleaning from.  It doesn't matter if HW
+	 * adds new entries after this loop -- the QP we're worried
+	 * about is already in RESET, so the new entries won't come
+	 * from our QP and therefore don't need to be checked.
+	 */
+	for (prod_index = cq->cons_index;
+	     cqe_sw(get_cqe(cq, prod_index & cq->ibcq.cqe));
+	     ++prod_index)
+		if (prod_index == cq->cons_index + cq->ibcq.cqe)
+			break;
+
+	if (0)
+		mthca_dbg(dev, "Cleaning QPN %06x from CQN %06x; ci %d, pi %d\n",
+			  qpn, cq->cqn, cq->cons_index, prod_index);
+
+	/*
+	 * Now sweep backwards through the CQ, removing CQ entries
+	 * that match our QP by copying older entries on top of them.
+	 */
+	while ((int) --prod_index - (int) cq->cons_index >= 0) {
+		cqe = get_cqe(cq, prod_index & cq->ibcq.cqe);
+		if (cqe->my_qpn == cpu_to_be32(qpn)) {
+			if (srq && is_recv_cqe(cqe))
+				mthca_free_srq_wqe(srq, be32_to_cpu(cqe->wqe));
+			++nfreed;
+		} else if (nfreed)
+			/* slide the surviving entry up over the gap */
+			memcpy(get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe),
+			       cqe, MTHCA_CQ_ENTRY_SIZE);
+	}
+
+	if (nfreed) {
+		/* the nfreed oldest slots are now stale: return them to HW */
+		for (i = 0; i < nfreed; ++i)
+			set_cqe_hw(get_cqe(cq, (cq->cons_index + i) & cq->ibcq.cqe));
+		wmb();
+		cq->cons_index += nfreed;
+		update_cons_index(dev, cq, nfreed);
+	}
+
+	spin_unlock_irq(&cq->lock);
+}
+
+/*
+ * Copy all software-owned entries, starting at the consumer index,
+ * from the CQ's current buffer into cq->resize_buf, placing each one
+ * at its index masked with the new buffer's size.
+ * NOTE(review): no locking is taken here; presumably the caller holds
+ * cq->lock -- confirm against the resize path.
+ */
+void mthca_cq_resize_copy_cqes(struct mthca_cq *cq)
+{
+	int i;
+
+	/*
+	 * In Tavor mode, the hardware keeps the consumer and producer
+	 * indices mod the CQ size.  Since we might be making the CQ
+	 * bigger, we need to deal with the case where the producer
+	 * index wrapped around before the CQ was resized.
+	 */
+	if (!mthca_is_memfree(to_mdev(cq->ibcq.device)) &&
+	    cq->ibcq.cqe < cq->resize_buf->cqe) {
+		cq->cons_index &= cq->ibcq.cqe;
+		if (cqe_sw(get_cqe(cq, cq->ibcq.cqe)))
+			cq->cons_index -= cq->ibcq.cqe + 1;
+	}
+
+	for (i = cq->cons_index; cqe_sw(get_cqe(cq, i & cq->ibcq.cqe)); ++i)
+		memcpy(get_cqe_from_buf(&cq->resize_buf->buf,
+					i & cq->resize_buf->cqe),
+		       get_cqe(cq, i & cq->ibcq.cqe), MTHCA_CQ_ENTRY_SIZE);
+}
+
+/*
+ * Allocate a CQ buffer with room for 'nent' entries and mark every
+ * entry as hardware-owned.  Returns 0 on success or the error reported
+ * by mthca_buf_alloc().
+ */
+int mthca_alloc_cq_buf(struct mthca_dev *dev, struct mthca_cq_buf *buf, int nent)
+{
+	int idx;
+	int err;
+
+	err = mthca_buf_alloc(dev, nent * MTHCA_CQ_ENTRY_SIZE,
+			      MTHCA_MAX_DIRECT_CQ_SIZE,
+			      &buf->queue, &buf->is_direct,
+			      &dev->driver_pd, 1, &buf->mr);
+	if (err != 0)
+		return err;
+
+	idx = 0;
+	while (idx < nent) {
+		set_cqe_hw(get_cqe_from_buf(buf, idx));
+		idx++;
+	}
+
+	return 0;
+}
+
+/*
+ * Release a CQ buffer.  'cqe' is the highest valid entry index, so the
+ * buffer holds cqe + 1 entries.
+ */
+void mthca_free_cq_buf(struct mthca_dev *dev, struct mthca_cq_buf *buf, int cqe)
+{
+	int nbytes = (cqe + 1) * MTHCA_CQ_ENTRY_SIZE;
+
+	mthca_buf_free(dev, nbytes, &buf->queue, buf->is_direct, &buf->mr);
+}
+
+/*
+ * Fill in 'entry' for a completion in error and decide whether the CQ
+ * entry can be released.
+ *
+ * entry->status is derived from the CQE syndrome and entry->vendor_err
+ * is copied from the CQE.  On mem-free HCAs nothing else happens.  On
+ * Tavor HCAs the CQE may stand for further WQEs: in that case it is
+ * rewritten in place (doorbell count, next WQE, flush syndrome) and
+ * *free_cqe is cleared so that the caller leaves it in the queue for
+ * the next poll.  *free_cqe is never set here; the caller initialises
+ * it.
+ */
+static void handle_error_cqe(struct mthca_dev *dev, struct mthca_cq *cq,
+			     struct mthca_qp *qp, int wqe_index, int is_send,
+			     struct mthca_err_cqe *cqe,
+			     struct ib_wc *entry, int *free_cqe)
+{
+	int dbd;
+	__be32 new_wqe;
+
+	if (cqe->syndrome == SYNDROME_LOCAL_QP_OP_ERR) {
+		mthca_dbg(dev, "local QP operation err "
+			  "(QPN %06x, WQE @ %08x, CQN %06x, index %d)\n",
+			  be32_to_cpu(cqe->my_qpn), be32_to_cpu(cqe->wqe),
+			  cq->cqn, cq->cons_index);
+		dump_cqe(dev, cqe);
+	}
+
+	/*
+	 * For completions in error, only work request ID, status, vendor error
+	 * (and freed resource count for RD) have to be set.
+	 */
+	switch (cqe->syndrome) {
+	case SYNDROME_LOCAL_LENGTH_ERR:
+		entry->status = IB_WC_LOC_LEN_ERR;
+		break;
+	case SYNDROME_LOCAL_QP_OP_ERR:
+		entry->status = IB_WC_LOC_QP_OP_ERR;
+		break;
+	case SYNDROME_LOCAL_EEC_OP_ERR:
+		entry->status = IB_WC_LOC_EEC_OP_ERR;
+		break;
+	case SYNDROME_LOCAL_PROT_ERR:
+		entry->status = IB_WC_LOC_PROT_ERR;
+		break;
+	case SYNDROME_WR_FLUSH_ERR:
+		entry->status = IB_WC_WR_FLUSH_ERR;
+		break;
+	case SYNDROME_MW_BIND_ERR:
+		entry->status = IB_WC_MW_BIND_ERR;
+		break;
+	case SYNDROME_BAD_RESP_ERR:
+		entry->status = IB_WC_BAD_RESP_ERR;
+		break;
+	case SYNDROME_LOCAL_ACCESS_ERR:
+		entry->status = IB_WC_LOC_ACCESS_ERR;
+		break;
+	case SYNDROME_REMOTE_INVAL_REQ_ERR:
+		entry->status = IB_WC_REM_INV_REQ_ERR;
+		break;
+	case SYNDROME_REMOTE_ACCESS_ERR:
+		entry->status = IB_WC_REM_ACCESS_ERR;
+		break;
+	case SYNDROME_REMOTE_OP_ERR:
+		entry->status = IB_WC_REM_OP_ERR;
+		break;
+	case SYNDROME_RETRY_EXC_ERR:
+		entry->status = IB_WC_RETRY_EXC_ERR;
+		break;
+	case SYNDROME_RNR_RETRY_EXC_ERR:
+		entry->status = IB_WC_RNR_RETRY_EXC_ERR;
+		break;
+	case SYNDROME_LOCAL_RDD_VIOL_ERR:
+		entry->status = IB_WC_LOC_RDD_VIOL_ERR;
+		break;
+	case SYNDROME_REMOTE_INVAL_RD_REQ_ERR:
+		entry->status = IB_WC_REM_INV_RD_REQ_ERR;
+		break;
+	case SYNDROME_REMOTE_ABORTED_ERR:
+		entry->status = IB_WC_REM_ABORT_ERR;
+		break;
+	case SYNDROME_INVAL_EECN_ERR:
+		entry->status = IB_WC_INV_EECN_ERR;
+		break;
+	case SYNDROME_INVAL_EEC_STATE_ERR:
+		entry->status = IB_WC_INV_EEC_STATE_ERR;
+		break;
+	default:
+		entry->status = IB_WC_GENERAL_ERR;
+		break;
+	}
+
+	entry->vendor_err = cqe->vendor_err;
+
+	/*
+	 * Mem-free HCAs always generate one CQE per WQE, even in the
+	 * error case, so we don't have to check the doorbell count, etc.
+	 */
+	if (mthca_is_memfree(dev))
+		return;
+
+	mthca_free_err_wqe(dev, qp, is_send, wqe_index, &dbd, &new_wqe);
+
+	/*
+	 * If we're at the end of the WQE chain, or we've used up our
+	 * doorbell count, free the CQE.  Otherwise just update it for
+	 * the next poll operation.
+	 */
+	if (!(new_wqe & cpu_to_be32(0x3f)) || (!cqe->db_cnt && dbd))
+		return;
+
+	/* Reuse this CQE as the flush completion for the next WQE. */
+	be16_add_cpu(&cqe->db_cnt, -dbd);
+	cqe->wqe      = new_wqe;
+	cqe->syndrome = SYNDROME_WR_FLUSH_ERR;
+
+	*free_cqe = 0;
+}
+
+/*
+ * Consume at most one completion from 'cq' and translate it into
+ * 'entry'.
+ *
+ * *cur_qp caches the QP of the previous completion so that consecutive
+ * completions for the same QP skip the table lookup.  *freed is
+ * incremented whenever a CQ entry is handed back to hardware.
+ *
+ * Returns 0 on success, -EAGAIN if the entry at the consumer index is
+ * still owned by hardware, or -EINVAL if the CQE names a QP that is
+ * not in the QP table (the CQE is still consumed in that case).
+ */
+static inline int mthca_poll_one(struct mthca_dev *dev,
+				 struct mthca_cq *cq,
+				 struct mthca_qp **cur_qp,
+				 int *freed,
+				 struct ib_wc *entry)
+{
+	struct mthca_wq *wq;
+	struct mthca_cqe *cqe;
+	int wqe_index;
+	int is_error;
+	int is_send;
+	int free_cqe = 1;
+	int err = 0;
+	u16 checksum;
+
+	cqe = next_cqe_sw(cq);
+	if (!cqe)
+		return -EAGAIN;
+
+	/*
+	 * Make sure we read CQ entry contents after we've checked the
+	 * ownership bit.
+	 */
+	rmb();
+
+	if (0) {
+		mthca_dbg(dev, "%x/%d: CQE -> QPN %06x, WQE @ %08x\n",
+			  cq->cqn, cq->cons_index, be32_to_cpu(cqe->my_qpn),
+			  be32_to_cpu(cqe->wqe));
+		dump_cqe(dev, cqe);
+	}
+
+	/* Error CQEs carry the send/receive flag in bit 0 of the opcode. */
+	is_error = (cqe->opcode & MTHCA_ERROR_CQE_OPCODE_MASK) ==
+		MTHCA_ERROR_CQE_OPCODE_MASK;
+	is_send  = is_error ? cqe->opcode & 0x01 : cqe->is_send & 0x80;
+
+	if (!*cur_qp || be32_to_cpu(cqe->my_qpn) != (*cur_qp)->qpn) {
+		/*
+		 * We do not have to take the QP table lock here,
+		 * because CQs will be locked while QPs are removed
+		 * from the table.
+		 */
+		*cur_qp = mthca_array_get(&dev->qp_table.qp,
+					  be32_to_cpu(cqe->my_qpn) &
+					  (dev->limits.num_qps - 1));
+		if (!*cur_qp) {
+			mthca_warn(dev, "CQ entry for unknown QP %06x\n",
+				   be32_to_cpu(cqe->my_qpn) & 0xffffff);
+			err = -EINVAL;
+			goto out;
+		}
+	}
+
+	entry->qp = &(*cur_qp)->ibqp;
+
+	/*
+	 * Work out which work queue and WQE index the completion refers
+	 * to and fetch the matching work request ID.  Receive wrids are
+	 * stored after the sq.max send wrids in the QP's wrid array.
+	 */
+	if (is_send) {
+		wq = &(*cur_qp)->sq;
+		wqe_index = ((be32_to_cpu(cqe->wqe) - (*cur_qp)->send_wqe_offset)
+			     >> wq->wqe_shift);
+		entry->wr_id = (*cur_qp)->wrid[wqe_index];
+	} else if ((*cur_qp)->ibqp.srq) {
+		struct mthca_srq *srq = to_msrq((*cur_qp)->ibqp.srq);
+		u32 wqe = be32_to_cpu(cqe->wqe);
+		wq = NULL;
+		wqe_index = wqe >> srq->wqe_shift;
+		entry->wr_id = srq->wrid[wqe_index];
+		mthca_free_srq_wqe(srq, wqe);
+	} else {
+		s32 wqe;
+		wq = &(*cur_qp)->rq;
+		wqe = be32_to_cpu(cqe->wqe);
+		wqe_index = wqe >> wq->wqe_shift;
+		/*
+		 * WQE addr == base - 1 might be reported in receive completion
+		 * with error instead of (rq size - 1) by Sinai FW 1.0.800 and
+		 * Arbel FW 5.1.400.  This bug should be fixed in later FW revs.
+		 */
+		if (unlikely(wqe_index < 0))
+			wqe_index = wq->max - 1;
+		entry->wr_id = (*cur_qp)->wrid[wqe_index + (*cur_qp)->sq.max];
+	}
+
+	/*
+	 * Advance the queue tail by the distance from the previously
+	 * completed WQE to this one, allowing for wrap-around.  SRQ
+	 * completions have no work queue (wq == NULL) and skip this.
+	 */
+	if (wq) {
+		if (wq->last_comp < wqe_index)
+			wq->tail += wqe_index - wq->last_comp;
+		else
+			wq->tail += wqe_index + wq->max - wq->last_comp;
+
+		wq->last_comp = wqe_index;
+	}
+
+	if (is_error) {
+		/* may clear free_cqe to keep the CQE for the next poll */
+		handle_error_cqe(dev, cq, *cur_qp, wqe_index, is_send,
+				 (struct mthca_err_cqe *) cqe,
+				 entry, &free_cqe);
+		goto out;
+	}
+
+	if (is_send) {
+		entry->wc_flags = 0;
+		switch (cqe->opcode) {
+		case MTHCA_OPCODE_RDMA_WRITE:
+			entry->opcode    = IB_WC_RDMA_WRITE;
+			break;
+		case MTHCA_OPCODE_RDMA_WRITE_IMM:
+			entry->opcode    = IB_WC_RDMA_WRITE;
+			entry->wc_flags |= IB_WC_WITH_IMM;
+			break;
+		case MTHCA_OPCODE_SEND:
+			entry->opcode    = IB_WC_SEND;
+			break;
+		case MTHCA_OPCODE_SEND_IMM:
+			entry->opcode    = IB_WC_SEND;
+			entry->wc_flags |= IB_WC_WITH_IMM;
+			break;
+		case MTHCA_OPCODE_RDMA_READ:
+			entry->opcode    = IB_WC_RDMA_READ;
+			entry->byte_len  = be32_to_cpu(cqe->byte_cnt);
+			break;
+		case MTHCA_OPCODE_ATOMIC_CS:
+			entry->opcode    = IB_WC_COMP_SWAP;
+			entry->byte_len  = MTHCA_ATOMIC_BYTE_LEN;
+			break;
+		case MTHCA_OPCODE_ATOMIC_FA:
+			entry->opcode    = IB_WC_FETCH_ADD;
+			entry->byte_len  = MTHCA_ATOMIC_BYTE_LEN;
+			break;
+		case MTHCA_OPCODE_BIND_MW:
+			entry->opcode    = IB_WC_BIND_MW;
+			break;
+		default:
+			entry->opcode    = MTHCA_OPCODE_INVALID;
+			break;
+		}
+	} else {
+		entry->byte_len = be32_to_cpu(cqe->byte_cnt);
+		switch (cqe->opcode & 0x1f) {
+		case IB_OPCODE_SEND_LAST_WITH_IMMEDIATE:
+		case IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE:
+			entry->wc_flags = IB_WC_WITH_IMM;
+			entry->ex.imm_data = cqe->imm_etype_pkey_eec;
+			entry->opcode = IB_WC_RECV;
+			break;
+		case IB_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE:
+		case IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE:
+			entry->wc_flags = IB_WC_WITH_IMM;
+			entry->ex.imm_data = cqe->imm_etype_pkey_eec;
+			entry->opcode = IB_WC_RECV_RDMA_WITH_IMM;
+			break;
+		default:
+			entry->wc_flags = 0;
+			entry->opcode = IB_WC_RECV;
+			break;
+		}
+		entry->slid 	   = be16_to_cpu(cqe->rlid);
+		entry->sl   	   = cqe->sl_ipok >> 4;
+		entry->src_qp 	   = be32_to_cpu(cqe->rqpn) & 0xffffff;
+		entry->dlid_path_bits = cqe->g_mlpath & 0x7f;
+		entry->pkey_index  = be32_to_cpu(cqe->imm_etype_pkey_eec) >> 16;
+		entry->wc_flags   |= cqe->g_mlpath & 0x80 ? IB_WC_GRH : 0;
+		/*
+		 * The 16-bit checksum is split across two fields: low byte
+		 * in the top byte of rqpn, high byte in the top byte of
+		 * my_ee.
+		 */
+		checksum = (be32_to_cpu(cqe->rqpn) >> 24) |
+				((be32_to_cpu(cqe->my_ee) >> 16) & 0xff00);
+		entry->csum_ok = (cqe->sl_ipok & 1 && checksum == 0xffff);
+	}
+
+	entry->status = IB_WC_SUCCESS;
+
+ out:
+	/* Return the entry to hardware unless the error path kept it. */
+	if (likely(free_cqe)) {
+		set_cqe_hw(cqe);
+		++(*freed);
+		++cq->cons_index;
+	}
+
+	return err;
+}
+
+/*
+ * Poll up to 'num_entries' completions from 'ibcq' into the array
+ * 'entry'.
+ *
+ * Returns the number of completions stored when polling finished
+ * normally or ran out of entries (-EAGAIN from mthca_poll_one()).  Any
+ * other error from mthca_poll_one() is returned as is, even if some
+ * completions had already been stored.
+ */
+int mthca_poll_cq(struct ib_cq *ibcq, int num_entries,
+		  struct ib_wc *entry)
+{
+	struct mthca_dev *dev = to_mdev(ibcq->device);
+	struct mthca_cq *cq = to_mcq(ibcq);
+	struct mthca_qp *qp = NULL;
+	unsigned long flags;
+	int err = 0;
+	int freed = 0;
+	int npolled;
+
+	spin_lock_irqsave(&cq->lock, flags);
+
+	npolled = 0;
+repoll:
+	while (npolled < num_entries) {
+		err = mthca_poll_one(dev, cq, &qp,
+				     &freed, entry + npolled);
+		if (err)
+			break;
+		++npolled;
+	}
+
+	/* Publish the new consumer index once for the whole batch. */
+	if (freed) {
+		wmb();
+		update_cons_index(dev, cq, freed);
+	}
+
+	/*
+	 * If a CQ resize is in progress and we discovered that the
+	 * old buffer is empty, then peek in the new buffer, and if
+	 * it's not empty, switch to the new buffer and continue
+	 * polling there.
+	 */
+	if (unlikely(err == -EAGAIN && cq->resize_buf &&
+		     cq->resize_buf->state == CQ_RESIZE_READY)) {
+		/*
+		 * In Tavor mode, the hardware keeps the producer
+		 * index modulo the CQ size.  Since we might be making
+		 * the CQ bigger, we need to mask our consumer index
+		 * using the size of the old CQ buffer before looking
+		 * in the new CQ buffer.
+		 */
+		if (!mthca_is_memfree(dev))
+			cq->cons_index &= cq->ibcq.cqe;
+
+		if (cqe_sw(get_cqe_from_buf(&cq->resize_buf->buf,
+					    cq->cons_index & cq->resize_buf->cqe))) {
+			struct mthca_cq_buf tbuf;
+			int tcqe;
+
+			/*
+			 * Swap old and new buffers; the old one stays in
+			 * resize_buf, now marked CQ_RESIZE_SWAPPED.
+			 */
+			tbuf         = cq->buf;
+			tcqe         = cq->ibcq.cqe;
+			cq->buf      = cq->resize_buf->buf;
+			cq->ibcq.cqe = cq->resize_buf->cqe;
+
+			cq->resize_buf->buf   = tbuf;
+			cq->resize_buf->cqe   = tcqe;
+			cq->resize_buf->state = CQ_RESIZE_SWAPPED;
+
+			goto repoll;
+		}
+	}
+
+	spin_unlock_irqrestore(&cq->lock, flags);
+
+	return err == 0 || err == -EAGAIN ? npolled : err;
+}
+
+/*
+ * Request a completion notification on a Tavor-mode CQ by ringing the
+ * CQ doorbell.  The solicited-only variant is used when 'flags' asks
+ * for IB_CQ_SOLICITED.  Always returns 0.
+ */
+int mthca_tavor_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags)
+{
+	struct mthca_dev *mdev = to_mdev(cq->device);
+	u32 dbhi;
+
+	if ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED)
+		dbhi = MTHCA_TAVOR_CQ_DB_REQ_NOT_SOL;
+	else
+		dbhi = MTHCA_TAVOR_CQ_DB_REQ_NOT;
+	dbhi |= to_mcq(cq)->cqn;
+
+	mthca_write64(dbhi, 0xffffffff, mdev->kar + MTHCA_CQ_DOORBELL,
+		      MTHCA_GET_DOORBELL_LOCK(&mdev->doorbell_lock));
+
+	return 0;
+}
+
+/*
+ * Request a completion notification on an Arbel (mem-free) CQ.  The
+ * arm doorbell record in host memory is written first, then the MMIO
+ * doorbell is rung.  Both carry the low two bits of the CQ's arm
+ * sequence number.  Always returns 0.
+ */
+int mthca_arbel_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
+{
+	struct mthca_dev *mdev = to_mdev(ibcq->device);
+	struct mthca_cq *cq = to_mcq(ibcq);
+	int solicited = (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED;
+	u32 sn = cq->arm_sn & 3;
+	__be32 db_rec[2];
+	u32 dbhi;
+
+	db_rec[0] = cpu_to_be32(cq->cons_index);
+	db_rec[1] = cpu_to_be32((cq->cqn << 8) | (2 << 5) | (sn << 3) |
+				(solicited ? 1 : 2));
+
+	mthca_write_db_rec(db_rec, cq->arm_db);
+
+	/*
+	 * The doorbell record in host memory has to be visible before
+	 * the doorbell is rung over PCI MMIO.
+	 */
+	wmb();
+
+	dbhi = (sn << 28) |
+		(solicited ? MTHCA_ARBEL_CQ_DB_REQ_NOT_SOL :
+			     MTHCA_ARBEL_CQ_DB_REQ_NOT) |
+		cq->cqn;
+
+	mthca_write64(dbhi, cq->cons_index,
+		      mdev->kar + MTHCA_CQ_DOORBELL,
+		      MTHCA_GET_DOORBELL_LOCK(&mdev->doorbell_lock));
+
+	return 0;
+}
+
+/*
+ * Create a completion queue with 'nent' entries and hand it to the
+ * HCA.
+ *
+ * dev  - device the CQ is created on
+ * nent - number of entries; cq->ibcq.cqe is set to nent - 1 and used
+ *        as an index mask elsewhere in this file
+ * ctx  - userspace context, or NULL for a kernel CQ.  Only kernel CQs
+ *        get their buffer and (on mem-free HCAs) doorbell records
+ *        allocated here.
+ * pdn  - protection domain number written into the CQ context
+ * cq   - CQ structure to initialise
+ *
+ * Returns 0 on success or a negative errno.  On failure everything
+ * acquired here is released again.
+ */
+int mthca_init_cq(struct mthca_dev *dev, int nent,
+		  struct mthca_ucontext *ctx, u32 pdn,
+		  struct mthca_cq *cq)
+{
+	struct mthca_mailbox *mailbox;
+	struct mthca_cq_context *cq_context;
+	int err = -ENOMEM;
+	u8 status;
+
+	cq->ibcq.cqe  = nent - 1;
+	cq->is_kernel = !ctx;
+
+	cq->cqn = mthca_alloc(&dev->cq_table.alloc);
+	if (cq->cqn == -1)
+		return -ENOMEM;
+
+	if (mthca_is_memfree(dev)) {
+		err = mthca_table_get(dev, dev->cq_table.table, cq->cqn);
+		if (err)
+			goto err_out;
+
+		if (cq->is_kernel) {
+			cq->arm_sn = 1;
+
+			err = -ENOMEM;
+
+			cq->set_ci_db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_CQ_SET_CI,
+							     cq->cqn, &cq->set_ci_db);
+			if (cq->set_ci_db_index < 0)
+				goto err_out_icm;
+
+			cq->arm_db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_CQ_ARM,
+							  cq->cqn, &cq->arm_db);
+			if (cq->arm_db_index < 0)
+				goto err_out_ci;
+		}
+	}
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox)) {
+		/*
+		 * err can be 0 at this point (mem-free HCA, userspace CQ:
+		 * mthca_table_get() just succeeded), so take the error
+		 * from the mailbox pointer instead of reporting success.
+		 */
+		err = PTR_ERR(mailbox);
+		goto err_out_arm;
+	}
+
+	cq_context = mailbox->buf;
+
+	if (cq->is_kernel) {
+		err = mthca_alloc_cq_buf(dev, &cq->buf, nent);
+		if (err)
+			goto err_out_mailbox;
+	}
+
+	spin_lock_init(&cq->lock);
+	cq->refcount = 1;
+	init_waitqueue_head(&cq->wait);
+	mutex_init(&cq->mutex);
+
+	memset(cq_context, 0, sizeof *cq_context);
+	cq_context->flags           = cpu_to_be32(MTHCA_CQ_STATUS_OK      |
+						  MTHCA_CQ_STATE_DISARMED |
+						  MTHCA_CQ_FLAG_TR);
+	/* log2(nent) lives in the top byte, the UAR index below it */
+	cq_context->logsize_usrpage = cpu_to_be32((ffs(nent) - 1) << 24);
+	if (ctx)
+		cq_context->logsize_usrpage |= cpu_to_be32(ctx->uar.index);
+	else
+		cq_context->logsize_usrpage |= cpu_to_be32(dev->driver_uar.index);
+	cq_context->error_eqn       = cpu_to_be32(dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn);
+	cq_context->comp_eqn        = cpu_to_be32(dev->eq_table.eq[MTHCA_EQ_COMP].eqn);
+	cq_context->pd              = cpu_to_be32(pdn);
+	cq_context->lkey            = cpu_to_be32(cq->buf.mr.ibmr.lkey);
+	cq_context->cqn             = cpu_to_be32(cq->cqn);
+
+	if (mthca_is_memfree(dev)) {
+		cq_context->ci_db    = cpu_to_be32(cq->set_ci_db_index);
+		cq_context->state_db = cpu_to_be32(cq->arm_db_index);
+	}
+
+	err = mthca_SW2HW_CQ(dev, mailbox, cq->cqn, &status);
+	if (err) {
+		mthca_warn(dev, "SW2HW_CQ failed (%d)\n", err);
+		goto err_out_free_mr;
+	}
+
+	if (status) {
+		mthca_warn(dev, "SW2HW_CQ returned status 0x%02x\n",
+			   status);
+		err = -EINVAL;
+		goto err_out_free_mr;
+	}
+
+	/*
+	 * Publish the CQ in the lookup table.  Capture the result in
+	 * err: it still holds the 0 from SW2HW_CQ, so jumping to the
+	 * error path without updating it would report success.
+	 */
+	spin_lock_irq(&dev->cq_table.lock);
+	err = mthca_array_set(&dev->cq_table.cq,
+			      cq->cqn & (dev->limits.num_cqs - 1),
+			      cq);
+	if (err) {
+		spin_unlock_irq(&dev->cq_table.lock);
+		goto err_out_free_mr;
+	}
+	spin_unlock_irq(&dev->cq_table.lock);
+
+	cq->cons_index = 0;
+
+	mthca_free_mailbox(dev, mailbox);
+
+	return 0;
+
+err_out_free_mr:
+	if (cq->is_kernel)
+		mthca_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe);
+
+err_out_mailbox:
+	mthca_free_mailbox(dev, mailbox);
+
+err_out_arm:
+	if (cq->is_kernel && mthca_is_memfree(dev))
+		mthca_free_db(dev, MTHCA_DB_TYPE_CQ_ARM, cq->arm_db_index);
+
+err_out_ci:
+	if (cq->is_kernel && mthca_is_memfree(dev))
+		mthca_free_db(dev, MTHCA_DB_TYPE_CQ_SET_CI, cq->set_ci_db_index);
+
+err_out_icm:
+	/*
+	 * NOTE(review): called for Tavor HCAs as well, as in
+	 * mthca_free_cq(); presumably mthca_table_put() is a no-op
+	 * there -- confirm.
+	 */
+	mthca_table_put(dev, dev->cq_table.table, cq->cqn);
+
+err_out:
+	mthca_free(&dev->cq_table.alloc, cq->cqn);
+
+	return err;
+}
+
+/* Read the CQ's reference count under the CQ table lock. */
+static inline int get_cq_refcount(struct mthca_dev *dev, struct mthca_cq *cq)
+{
+	int refs;
+
+	spin_lock_irq(&dev->cq_table.lock);
+	refs = cq->refcount;
+	spin_unlock_irq(&dev->cq_table.lock);
+
+	return refs;
+}
+
+/*
+ * Tear down a CQ: take it back from the HCA (HW2SW_CQ), remove it from
+ * the lookup table, wait until no interrupt handler or event callback
+ * still references it, then release its buffer, doorbell records, ICM
+ * table reference and CQ number.
+ *
+ * NOTE(review): if the mailbox cannot be allocated the function only
+ * logs a warning and returns, leaving the CQ in hardware and all of
+ * its resources allocated -- confirm this is acceptable.
+ */
+void mthca_free_cq(struct mthca_dev *dev,
+		   struct mthca_cq *cq)
+{
+	struct mthca_mailbox *mailbox;
+	int err;
+	u8 status;
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox)) {
+		mthca_warn(dev, "No memory for mailbox to free CQ.\n");
+		return;
+	}
+
+	/* A failure here is only logged; teardown continues regardless. */
+	err = mthca_HW2SW_CQ(dev, mailbox, cq->cqn, &status);
+	if (err)
+		mthca_warn(dev, "HW2SW_CQ failed (%d)\n", err);
+	else if (status)
+		mthca_warn(dev, "HW2SW_CQ returned status 0x%02x\n", status);
+
+	/* Disabled debug dump of the context returned in the mailbox. */
+	if (0) {
+		__be32 *ctx = mailbox->buf;
+		int j;
+
+		printk(KERN_ERR "context for CQN %x (cons index %x, next sw %d)\n",
+		       cq->cqn, cq->cons_index,
+		       cq->is_kernel ? !!next_cqe_sw(cq) : 0);
+		for (j = 0; j < 16; ++j)
+			printk(KERN_ERR "[%2x] %08x\n", j * 4, be32_to_cpu(ctx[j]));
+	}
+
+	/* Unpublish the CQ and drop the reference taken in mthca_init_cq(). */
+	spin_lock_irq(&dev->cq_table.lock);
+	mthca_array_clear(&dev->cq_table.cq,
+			  cq->cqn & (dev->limits.num_cqs - 1));
+	--cq->refcount;
+	spin_unlock_irq(&dev->cq_table.lock);
+
+	/* Let any interrupt handler that is still running finish. */
+	if (dev->mthca_flags & MTHCA_FLAG_MSI_X)
+		synchronize_irq(dev->eq_table.eq[MTHCA_EQ_COMP].msi_x_vector);
+	else
+		synchronize_irq(dev->pdev->irq);
+
+	/* Wait for references held by mthca_cq_event() to be dropped. */
+	wait_event(cq->wait, !get_cq_refcount(dev, cq));
+
+	if (cq->is_kernel) {
+		mthca_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe);
+		if (mthca_is_memfree(dev)) {
+			mthca_free_db(dev, MTHCA_DB_TYPE_CQ_ARM,    cq->arm_db_index);
+			mthca_free_db(dev, MTHCA_DB_TYPE_CQ_SET_CI, cq->set_ci_db_index);
+		}
+	}
+
+	mthca_table_put(dev, dev->cq_table.table, cq->cqn);
+	mthca_free(&dev->cq_table.alloc, cq->cqn);
+	mthca_free_mailbox(dev, mailbox);
+}
+
+/*
+ * Set up the per-device CQ bookkeeping: the table lock, the CQ number
+ * allocator and the CQ lookup array.  Returns 0 on success or the
+ * error from whichever initialisation step failed.
+ */
+int mthca_init_cq_table(struct mthca_dev *dev)
+{
+	int rc;
+
+	spin_lock_init(&dev->cq_table.lock);
+
+	rc = mthca_alloc_init(&dev->cq_table.alloc,
+			      dev->limits.num_cqs,
+			      (1 << 24) - 1,
+			      dev->limits.reserved_cqs);
+	if (rc)
+		return rc;
+
+	rc = mthca_array_init(&dev->cq_table.cq,
+			      dev->limits.num_cqs);
+	if (!rc)
+		return 0;
+
+	/* undo the allocator set-up if the array could not be created */
+	mthca_alloc_cleanup(&dev->cq_table.alloc);
+
+	return rc;
+}
+
+/*
+ * Release what mthca_init_cq_table() set up: the CQ lookup array and
+ * the CQ number allocator.
+ */
+void mthca_cleanup_cq_table(struct mthca_dev *dev)
+{
+	mthca_array_cleanup(&dev->cq_table.cq, dev->limits.num_cqs);
+	mthca_alloc_cleanup(&dev->cq_table.alloc);
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_dev.h b/sys/ofed/drivers/infiniband/hw/mthca/mthca_dev.h
new file mode 100644
index 0000000..14e3f62
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_dev.h
@@ -0,0 +1,596 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MTHCA_DEV_H
+#define MTHCA_DEV_H
+
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/timer.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/semaphore.h>
+
+#include "mthca_provider.h"
+#include "mthca_doorbell.h"
+
+#define DRV_NAME	"ib_mthca"
+#define PFX		DRV_NAME ": "
+#define DRV_VERSION	"1.0-ofed1.5.2"
+#define DRV_RELDATE	"August 4, 2010"
+
+enum {	/* bit values tested against mthca_dev.mthca_flags */
+	MTHCA_FLAG_DDR_HIDDEN = 1 << 1,
+	MTHCA_FLAG_SRQ        = 1 << 2,	/* adds the SRQ events in async_mask() (mthca_eq.c) */
+	MTHCA_FLAG_MSI_X      = 1 << 3,
+	MTHCA_FLAG_NO_LAM     = 1 << 4,
+	MTHCA_FLAG_FMR        = 1 << 5,
+	MTHCA_FLAG_MEMFREE    = 1 << 6,	/* the bit mthca_is_memfree() checks */
+	MTHCA_FLAG_PCIE       = 1 << 7,
+	MTHCA_FLAG_SINAI_OPT  = 1 << 8
+};
+
+enum {
+	MTHCA_MAX_PORTS = 2	/* bound of the per-port arrays in struct mthca_dev */
+};
+
+enum {
+	MTHCA_BOARD_ID_LEN = 64	/* size of mthca_dev.board_id[] */
+};
+
+enum {
+	MTHCA_EQ_CONTEXT_SIZE =  0x40,
+	MTHCA_CQ_CONTEXT_SIZE =  0x40,
+	MTHCA_QP_CONTEXT_SIZE = 0x200,
+	MTHCA_RDB_ENTRY_SIZE  =  0x20,
+	MTHCA_AV_SIZE         =  0x20,
+	MTHCA_MGM_ENTRY_SIZE  = 0x100,
+
+	/* Arbel FW gives us these, but we need them for Tavor */
+	MTHCA_MPT_ENTRY_SIZE  =  0x40,
+	MTHCA_MTT_SEG_SIZE    =  0x40,
+
+	MTHCA_QP_PER_MGM      = 4 * (MTHCA_MGM_ENTRY_SIZE / 16 - 2)	/* 4 * (0x100 / 16 - 2) = 56 */
+};
+
+enum {	/* indices into mthca_eq_table.eq[] */
+	MTHCA_EQ_CMD,
+	MTHCA_EQ_ASYNC,
+	MTHCA_EQ_COMP,
+	MTHCA_NUM_EQ	/* number of entries; used as the array bound */
+};
+
+enum {
+	MTHCA_OPCODE_NOP            = 0x00,
+	MTHCA_OPCODE_RDMA_WRITE     = 0x08,
+	MTHCA_OPCODE_RDMA_WRITE_IMM = 0x09,
+	MTHCA_OPCODE_SEND           = 0x0a,
+	MTHCA_OPCODE_SEND_IMM       = 0x0b,
+	MTHCA_OPCODE_RDMA_READ      = 0x10,
+	MTHCA_OPCODE_ATOMIC_CS      = 0x11,
+	MTHCA_OPCODE_ATOMIC_FA      = 0x12,
+	MTHCA_OPCODE_BIND_MW        = 0x18,
+	MTHCA_OPCODE_INVALID        = 0xff
+};
+
+enum {
+	MTHCA_CMD_USE_EVENTS         = 1 << 0,
+	MTHCA_CMD_POST_DOORBELLS     = 1 << 1
+};
+
+enum {
+	MTHCA_CMD_NUM_DBELL_DWORDS = 8	/* entries in mthca_cmd.dbell_offsets[] */
+};
+
+struct mthca_cmd {
+	struct pci_pool          *pool;
+	struct mutex              hcr_mutex;
+	struct semaphore 	  poll_sem;
+	struct semaphore 	  event_sem;
+	int              	  max_cmds;
+	spinlock_t                context_lock;
+	int                       free_head;
+	struct mthca_cmd_context *context;
+	u16                       token_mask;
+	u32                       flags;
+	void __iomem             *dbell_map;
+	u16                       dbell_offsets[MTHCA_CMD_NUM_DBELL_DWORDS];
+};
+
+struct mthca_limits {
+	int      num_ports;
+	int      vl_cap;
+	int      mtu_cap;
+	int      gid_table_len;
+	int      pkey_table_len;
+	int      local_ca_ack_delay;
+	int      num_uars;
+	int      max_sg;
+	int      num_qps;
+	int      max_wqes;
+	int	 max_desc_sz;
+	int	 max_qp_init_rdma;
+	int      reserved_qps;
+	int      num_srqs;
+	int      max_srq_wqes;
+	int      max_srq_sge;
+	int      reserved_srqs;
+	int      num_eecs;
+	int      reserved_eecs;
+	int      num_cqs;
+	int      max_cqes;
+	int      reserved_cqs;
+	int      num_eqs;
+	int      reserved_eqs;
+	int      num_mpts;
+	int      num_mtt_segs;
+	int	 mtt_seg_size;
+	int      fmr_reserved_mtts;
+	int      reserved_mtts;
+	int      reserved_mrws;
+	int      reserved_uars;
+	int      num_mgms;
+	int      num_amgms;
+	int      reserved_mcgs;
+	int      num_pds;
+	int      reserved_pds;
+	u32      page_size_cap;
+	u32      flags;
+	u16      stat_rate_support;
+	u8       port_width_cap;
+};
+
+struct mthca_alloc {
+	u32            last;
+	u32            top;
+	u32            max;
+	u32            mask;
+	spinlock_t     lock;
+	unsigned long *table;
+};
+
+struct mthca_array {
+	struct {
+		void    **page;
+		int       used;
+	} *page_list;
+};
+
+struct mthca_uar_table {
+	struct mthca_alloc alloc;
+	u64                uarc_base;
+	int                uarc_size;
+};
+
+struct mthca_pd_table {
+	struct mthca_alloc alloc;
+};
+
+struct mthca_buddy {
+	unsigned long **bits;
+	int	       *num_free;
+	int             max_order;
+	spinlock_t      lock;
+};
+
+struct mthca_mr_table {
+	struct mthca_alloc      mpt_alloc;
+	struct mthca_buddy      mtt_buddy;
+	struct mthca_buddy     *fmr_mtt_buddy;
+	u64                     mtt_base;
+	u64                     mpt_base;
+	struct mthca_icm_table *mtt_table;
+	struct mthca_icm_table *mpt_table;
+	struct {
+		void __iomem   *mpt_base;
+		void __iomem   *mtt_base;
+		struct mthca_buddy mtt_buddy;
+	} tavor_fmr;
+};
+
+struct mthca_eq_table {
+	struct mthca_alloc alloc;
+	void __iomem      *clr_int;
+	u32                clr_mask;
+	u32                arm_mask;
+	struct mthca_eq    eq[MTHCA_NUM_EQ];	/* indexed by MTHCA_EQ_CMD/ASYNC/COMP */
+	u64                icm_virt;
+	struct page       *icm_page;
+	dma_addr_t         icm_dma;
+	int                have_irq;
+	u8                 inta_pin;
+};
+
+struct mthca_cq_table {
+	struct mthca_alloc 	alloc;	/* CQ numbers are returned here via mthca_free() */
+	spinlock_t         	lock;	/* held around updates of cq[] in mthca_free_cq() */
+	struct mthca_array      cq;	/* indexed by cqn & (limits.num_cqs - 1) */
+	struct mthca_icm_table *table;	/* passed to mthca_table_put() per CQN */
+};
+
+struct mthca_srq_table {
+	struct mthca_alloc 	alloc;
+	spinlock_t         	lock;
+	struct mthca_array      srq;
+	struct mthca_icm_table *table;
+};
+
+struct mthca_qp_table {
+	struct mthca_alloc     	alloc;
+	u32                    	rdb_base;
+	int                    	rdb_shift;
+	int                    	sqp_start;
+	spinlock_t             	lock;
+	struct mthca_array     	qp;
+	struct mthca_icm_table *qp_table;
+	struct mthca_icm_table *eqp_table;
+	struct mthca_icm_table *rdb_table;
+};
+
+struct mthca_av_table {
+	struct pci_pool   *pool;
+	int                num_ddr_avs;
+	u64                ddr_av_base;
+	void __iomem      *av_map;
+	struct mthca_alloc alloc;
+};
+
+struct mthca_mcg_table {
+	struct mutex		mutex;
+	struct mthca_alloc 	alloc;
+	struct mthca_icm_table *table;
+};
+
+struct mthca_catas_err {
+	u64			addr;
+	u32 __iomem	       *map;
+	u32			size;
+	struct timer_list	timer;
+	struct list_head	list;
+};
+
+extern struct mutex mthca_device_mutex;
+
+struct mthca_dev {
+	struct ib_device  ib_dev;	/* to_mdev() recovers the mthca_dev from this member */
+	struct pci_dev   *pdev;	/* its ->dev is what the mthca_err/info/warn macros log against */
+
+	int          	 hca_type;
+	unsigned long	 mthca_flags;	/* MTHCA_FLAG_* bits */
+	unsigned long    device_cap_flags;
+
+	u32              rev_id;
+	char             board_id[MTHCA_BOARD_ID_LEN];
+
+	/* firmware info */
+	u64              fw_ver;
+	union {
+		struct {
+			u64 fw_start;
+			u64 fw_end;
+		}        tavor;
+		struct {
+			u64 clr_int_base;
+			u64 eq_arm_base;
+			u64 eq_set_ci_base;
+			struct mthca_icm *fw_icm;
+			struct mthca_icm *aux_icm;
+			u16 fw_pages;
+		}        arbel;
+	}                fw;
+
+	u64              ddr_start;
+	u64              ddr_end;
+
+	MTHCA_DECLARE_DOORBELL_LOCK(doorbell_lock)	/* expands to nothing when BITS_PER_LONG == 64 */
+	struct mutex cap_mask_mutex;
+
+	void __iomem    *hcr;
+	void __iomem    *kar;	/* doorbells are written at kar + MTHCA_*_DOORBELL */
+	void __iomem    *clr_base;
+	union {
+		struct {
+			void __iomem *ecr_base;
+		} tavor;
+		struct {
+			void __iomem *eq_arm;
+			void __iomem *eq_set_ci_base;
+		} arbel;
+	} eq_regs;
+
+	struct mthca_cmd    cmd;
+	struct mthca_limits limits;
+
+	struct mthca_uar_table uar_table;
+	struct mthca_pd_table  pd_table;
+	struct mthca_mr_table  mr_table;
+	struct mthca_eq_table  eq_table;
+	struct mthca_cq_table  cq_table;
+	struct mthca_srq_table srq_table;
+	struct mthca_qp_table  qp_table;
+	struct mthca_av_table  av_table;
+	struct mthca_mcg_table mcg_table;
+
+	struct mthca_catas_err catas_err;
+
+	struct mthca_uar       driver_uar;
+	struct mthca_db_table *db_tab;
+	struct mthca_pd        driver_pd;
+	struct mthca_mr        driver_mr;
+
+	struct ib_mad_agent  *send_agent[MTHCA_MAX_PORTS][2];
+	struct ib_ah         *sm_ah[MTHCA_MAX_PORTS];
+	spinlock_t            sm_lock;
+	u8                    rate[MTHCA_MAX_PORTS];
+	int		      active;
+};
+
+#ifdef CONFIG_INFINIBAND_MTHCA_DEBUG
+extern int mthca_debug_level;
+/* Debug output: emitted only while mthca_debug_level is nonzero. */
+#define mthca_dbg(mdev, format, arg...)					\
+	do {								\
+		if (mthca_debug_level)					\
+			dev_printk(KERN_DEBUG, &(mdev)->pdev->dev, format, ## arg); \
+	} while (0)
+
+#else /* CONFIG_INFINIBAND_MTHCA_DEBUG */
+
+#define mthca_dbg(mdev, format, arg...) do { (void) (mdev); } while (0)
+
+#endif /* CONFIG_INFINIBAND_MTHCA_DEBUG */
+/* mdev is parenthesized so any pointer expression (e.g. "p + 1") is accepted. */
+#define mthca_err(mdev, format, arg...) \
+	dev_err(&(mdev)->pdev->dev, format, ## arg)
+#define mthca_info(mdev, format, arg...) \
+	dev_info(&(mdev)->pdev->dev, format, ## arg)
+#define mthca_warn(mdev, format, arg...) \
+	dev_warn(&(mdev)->pdev->dev, format, ## arg)
+
+extern void __buggy_use_of_MTHCA_GET(void);
+extern void __buggy_use_of_MTHCA_PUT(void);
+
+#define MTHCA_GET(dest, source, offset) /* big-endian load; width = sizeof(dest) */ \
+	do {                                                          \
+		void *__p = (char *) (source) + (offset);             \
+		switch (sizeof (dest)) {                              \
+		case 1: (dest) = *(u8 *) __p;       break;	      \
+		case 2: (dest) = be16_to_cpup(__p); break;	      \
+		case 4: (dest) = be32_to_cpup(__p); break;	      \
+		case 8: (dest) = be64_to_cpup(__p); break;	      \
+		default: __buggy_use_of_MTHCA_GET();		      \
+		}                                                     \
+	} while (0)
+
+#define MTHCA_PUT(dest, source, offset) /* big-endian store; width = sizeof(source) */ \
+	do {                                                          \
+		void *__d = ((char *) (dest) + (offset));	      \
+		switch (sizeof(source)) {                             \
+		case 1: *(u8 *) __d = (source);                break; \
+		case 2:	*(__be16 *) __d = cpu_to_be16(source); break; \
+		case 4:	*(__be32 *) __d = cpu_to_be32(source); break; \
+		case 8:	*(__be64 *) __d = cpu_to_be64(source); break; \
+		default: __buggy_use_of_MTHCA_PUT();		      \
+		}                                                     \
+	} while (0)
+
+int mthca_reset(struct mthca_dev *mdev);
+
+u32 mthca_alloc(struct mthca_alloc *alloc);
+void mthca_free(struct mthca_alloc *alloc, u32 obj);
+int mthca_alloc_init(struct mthca_alloc *alloc, u32 num, u32 mask,
+		     u32 reserved);
+void mthca_alloc_cleanup(struct mthca_alloc *alloc);
+void *mthca_array_get(struct mthca_array *array, int index);
+int mthca_array_set(struct mthca_array *array, int index, void *value);
+void mthca_array_clear(struct mthca_array *array, int index);
+int mthca_array_init(struct mthca_array *array, int nent);
+void mthca_array_cleanup(struct mthca_array *array, int nent);
+int mthca_buf_alloc(struct mthca_dev *dev, int size, int max_direct,
+		    union mthca_buf *buf, int *is_direct, struct mthca_pd *pd,
+		    int hca_write, struct mthca_mr *mr);
+void mthca_buf_free(struct mthca_dev *dev, int size, union mthca_buf *buf,
+		    int is_direct, struct mthca_mr *mr);
+
+int mthca_init_uar_table(struct mthca_dev *dev);
+int mthca_init_pd_table(struct mthca_dev *dev);
+int mthca_init_mr_table(struct mthca_dev *dev);
+int mthca_init_eq_table(struct mthca_dev *dev);
+int mthca_init_cq_table(struct mthca_dev *dev);
+int mthca_init_srq_table(struct mthca_dev *dev);
+int mthca_init_qp_table(struct mthca_dev *dev);
+int mthca_init_av_table(struct mthca_dev *dev);
+int mthca_init_mcg_table(struct mthca_dev *dev);
+
+void mthca_cleanup_uar_table(struct mthca_dev *dev);
+void mthca_cleanup_pd_table(struct mthca_dev *dev);
+void mthca_cleanup_mr_table(struct mthca_dev *dev);
+void mthca_cleanup_eq_table(struct mthca_dev *dev);
+void mthca_cleanup_cq_table(struct mthca_dev *dev);
+void mthca_cleanup_srq_table(struct mthca_dev *dev);
+void mthca_cleanup_qp_table(struct mthca_dev *dev);
+void mthca_cleanup_av_table(struct mthca_dev *dev);
+void mthca_cleanup_mcg_table(struct mthca_dev *dev);
+
+int mthca_register_device(struct mthca_dev *dev);
+void mthca_unregister_device(struct mthca_dev *dev);
+
+void mthca_start_catas_poll(struct mthca_dev *dev);
+void mthca_stop_catas_poll(struct mthca_dev *dev);
+int __mthca_restart_one(struct pci_dev *pdev);
+int mthca_catas_init(void);
+void mthca_catas_cleanup(void);
+
+int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar);
+void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar);
+
+int mthca_pd_alloc(struct mthca_dev *dev, int privileged, struct mthca_pd *pd);
+void mthca_pd_free(struct mthca_dev *dev, struct mthca_pd *pd);
+
+int mthca_write_mtt_size(struct mthca_dev *dev);
+
+struct mthca_mtt *mthca_alloc_mtt(struct mthca_dev *dev, int size);
+void mthca_free_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt);
+int mthca_write_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt,
+		    int start_index, u64 *buffer_list, int list_len);
+int mthca_mr_alloc(struct mthca_dev *dev, u32 pd, int buffer_size_shift,
+		   u64 iova, u64 total_size, u32 access, struct mthca_mr *mr);
+int mthca_mr_alloc_notrans(struct mthca_dev *dev, u32 pd,
+			   u32 access, struct mthca_mr *mr);
+int mthca_mr_alloc_phys(struct mthca_dev *dev, u32 pd,
+			u64 *buffer_list, int buffer_size_shift,
+			int list_len, u64 iova, u64 total_size,
+			u32 access, struct mthca_mr *mr);
+void mthca_free_mr(struct mthca_dev *dev,  struct mthca_mr *mr);
+
+int mthca_fmr_alloc(struct mthca_dev *dev, u32 pd,
+		    u32 access, struct mthca_fmr *fmr);
+int mthca_tavor_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+			     int list_len, u64 iova);
+void mthca_tavor_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr);
+int mthca_arbel_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+			     int list_len, u64 iova);
+void mthca_arbel_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr);
+int mthca_free_fmr(struct mthca_dev *dev,  struct mthca_fmr *fmr);
+
+int mthca_map_eq_icm(struct mthca_dev *dev, u64 icm_virt);
+void mthca_unmap_eq_icm(struct mthca_dev *dev);
+
+int mthca_poll_cq(struct ib_cq *ibcq, int num_entries,
+		  struct ib_wc *entry);
+int mthca_tavor_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
+int mthca_arbel_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
+int mthca_init_cq(struct mthca_dev *dev, int nent,
+		  struct mthca_ucontext *ctx, u32 pdn,
+		  struct mthca_cq *cq);
+void mthca_free_cq(struct mthca_dev *dev,
+		   struct mthca_cq *cq);
+void mthca_cq_completion(struct mthca_dev *dev, u32 cqn);
+void mthca_cq_event(struct mthca_dev *dev, u32 cqn,
+		    enum ib_event_type event_type);
+void mthca_cq_clean(struct mthca_dev *dev, struct mthca_cq *cq, u32 qpn,
+		    struct mthca_srq *srq);
+void mthca_cq_resize_copy_cqes(struct mthca_cq *cq);
+int mthca_alloc_cq_buf(struct mthca_dev *dev, struct mthca_cq_buf *buf, int nent);
+void mthca_free_cq_buf(struct mthca_dev *dev, struct mthca_cq_buf *buf, int cqe);
+
+int mthca_alloc_srq(struct mthca_dev *dev, struct mthca_pd *pd,
+		    struct ib_srq_attr *attr, struct mthca_srq *srq);
+void mthca_free_srq(struct mthca_dev *dev, struct mthca_srq *srq);
+int mthca_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+		     enum ib_srq_attr_mask attr_mask, struct ib_udata *udata);
+int mthca_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr);
+int mthca_max_srq_sge(struct mthca_dev *dev);
+void mthca_srq_event(struct mthca_dev *dev, u32 srqn,
+		     enum ib_event_type event_type);
+void mthca_free_srq_wqe(struct mthca_srq *srq, u32 wqe_addr);
+int mthca_tavor_post_srq_recv(struct ib_srq *srq, struct ib_recv_wr *wr,
+			      struct ib_recv_wr **bad_wr);
+int mthca_arbel_post_srq_recv(struct ib_srq *srq, struct ib_recv_wr *wr,
+			      struct ib_recv_wr **bad_wr);
+
+void mthca_qp_event(struct mthca_dev *dev, u32 qpn,
+		    enum ib_event_type event_type);
+int mthca_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask,
+		   struct ib_qp_init_attr *qp_init_attr);
+int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
+		    struct ib_udata *udata);
+int mthca_tavor_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+			  struct ib_send_wr **bad_wr);
+int mthca_tavor_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+			     struct ib_recv_wr **bad_wr);
+int mthca_arbel_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+			  struct ib_send_wr **bad_wr);
+int mthca_arbel_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+			     struct ib_recv_wr **bad_wr);
+void mthca_free_err_wqe(struct mthca_dev *dev, struct mthca_qp *qp, int is_send,
+			int index, int *dbd, __be32 *new_wqe);
+int mthca_alloc_qp(struct mthca_dev *dev,
+		   struct mthca_pd *pd,
+		   struct mthca_cq *send_cq,
+		   struct mthca_cq *recv_cq,
+		   enum ib_qp_type type,
+		   enum ib_sig_type send_policy,
+		   struct ib_qp_cap *cap,
+		   struct mthca_qp *qp);
+int mthca_alloc_sqp(struct mthca_dev *dev,
+		    struct mthca_pd *pd,
+		    struct mthca_cq *send_cq,
+		    struct mthca_cq *recv_cq,
+		    enum ib_sig_type send_policy,
+		    struct ib_qp_cap *cap,
+		    int qpn,
+		    int port,
+		    struct mthca_sqp *sqp);
+void mthca_free_qp(struct mthca_dev *dev, struct mthca_qp *qp);
+int mthca_create_ah(struct mthca_dev *dev,
+		    struct mthca_pd *pd,
+		    struct ib_ah_attr *ah_attr,
+		    struct mthca_ah *ah);
+int mthca_destroy_ah(struct mthca_dev *dev, struct mthca_ah *ah);
+int mthca_read_ah(struct mthca_dev *dev, struct mthca_ah *ah,
+		  struct ib_ud_header *header);
+int mthca_ah_query(struct ib_ah *ibah, struct ib_ah_attr *attr);
+int mthca_ah_grh_present(struct mthca_ah *ah);
+u8 mthca_get_rate(struct mthca_dev *dev, int static_rate, u8 port);
+enum ib_rate mthca_rate_to_ib(struct mthca_dev *dev, u8 mthca_rate, u8 port);
+
+int mthca_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
+int mthca_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
+
+int mthca_process_mad(struct ib_device *ibdev,
+		      int mad_flags,
+		      u8 port_num,
+		      struct ib_wc *in_wc,
+		      struct ib_grh *in_grh,
+		      struct ib_mad *in_mad,
+		      struct ib_mad *out_mad);
+int mthca_create_agents(struct mthca_dev *dev);
+void mthca_free_agents(struct mthca_dev *dev);
+
+static inline struct mthca_dev *to_mdev(struct ib_device *ibdev)
+{
+	return container_of(ibdev, struct mthca_dev, ib_dev);	/* ibdev must be the ib_dev member of a mthca_dev */
+}
+
+static inline int mthca_is_memfree(struct mthca_dev *dev)
+{
+	return dev->mthca_flags & MTHCA_FLAG_MEMFREE;	/* yields the flag bit itself (nonzero), not 1 */
+}
+
+#endif /* MTHCA_DEV_H */
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_doorbell.h b/sys/ofed/drivers/infiniband/hw/mthca/mthca_doorbell.h
new file mode 100644
index 0000000..14f51ef
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_doorbell.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/types.h>
+
+#define MTHCA_RD_DOORBELL      0x00
+#define MTHCA_SEND_DOORBELL    0x10
+#define MTHCA_RECEIVE_DOORBELL 0x18
+#define MTHCA_CQ_DOORBELL      0x20
+#define MTHCA_EQ_DOORBELL      0x28
+
+#if BITS_PER_LONG == 64
+/*
+ * Assume that we can just write a 64-bit doorbell atomically.  s390
+ * actually doesn't have writeq() but S/390 systems don't even have
+ * PCI so we won't worry about it.
+ */
+
+#define MTHCA_DECLARE_DOORBELL_LOCK(name)
+#define MTHCA_INIT_DOORBELL_LOCK(ptr)    do { } while (0)
+#define MTHCA_GET_DOORBELL_LOCK(ptr)      (NULL)
+
+static inline void mthca_write64_raw(__be64 val, void __iomem *dest)
+{
+	__raw_writeq((__force u64) val, dest);	/* val is already big-endian; stored without swapping */
+}
+
+static inline void mthca_write64(u32 hi, u32 lo, void __iomem *dest,
+				 spinlock_t *doorbell_lock)	/* lock is not used in this variant */
+{
+	__raw_writeq((__force u64) cpu_to_be64((u64) hi << 32 | lo), dest);	/* hi:lo as one big-endian 64-bit store */
+}
+
+static inline void mthca_write_db_rec(__be32 val[2], __be32 *db)
+{
+	*(u64 *) db = *(u64 *) val;	/* both words in one 64-bit copy; NOTE(review): assumes val and db are 8-byte aligned -- confirm */
+}
+
+#else
+
+/*
+ * Just fall back to a spinlock to protect the doorbell if
+ * BITS_PER_LONG is 32 -- there's no portable way to do atomic 64-bit
+ * MMIO writes.
+ */
+
+#define MTHCA_DECLARE_DOORBELL_LOCK(name) spinlock_t name;
+#define MTHCA_INIT_DOORBELL_LOCK(ptr)     spin_lock_init(ptr)
+#define MTHCA_GET_DOORBELL_LOCK(ptr)      (ptr)
+
+static inline void mthca_write64_raw(__be64 val, void __iomem *dest)
+{
+	__raw_writel(((__force u32 *) &val)[0], dest);	/* first 4 bytes of val */
+	__raw_writel(((__force u32 *) &val)[1], dest + 4);	/* second 4 bytes; no lock is taken here */
+}
+
+static inline void mthca_write64(u32 hi, u32 lo, void __iomem *dest,
+				 spinlock_t *doorbell_lock)
+{
+	const u32 be_hi = (__force u32) cpu_to_be32(hi);
+	const u32 be_lo = (__force u32) cpu_to_be32(lo);
+	unsigned long irq_state;
+
+	/* Hold the lock so the two 32-bit halves are written back to back. */
+	spin_lock_irqsave(doorbell_lock, irq_state);
+	__raw_writel(be_hi, dest);
+	__raw_writel(be_lo, dest + 4);
+	spin_unlock_irqrestore(doorbell_lock, irq_state);
+}
+
+static inline void mthca_write_db_rec(__be32 val[2], __be32 *db)
+{
+	db[0] = val[0];
+	wmb();	/* order the store to db[0] before the store to db[1] */
+	db[1] = val[1];
+}
+
+#endif
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_eq.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_eq.c
new file mode 100644
index 0000000..90e4e45
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_eq.c
@@ -0,0 +1,920 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+#include "mthca_config_reg.h"
+
+enum {
+	MTHCA_NUM_ASYNC_EQE = 0x80,
+	MTHCA_NUM_CMD_EQE   = 0x80,
+	MTHCA_NUM_SPARE_EQE = 0x80,
+	MTHCA_EQ_ENTRY_SIZE = 0x20	/* bytes per entry; the stride get_eqe() multiplies by */
+};
+
+/*
+ * Must be packed because start is 64 bits but only aligned to 32 bits.
+ */
+struct mthca_eq_context {
+	__be32 flags;
+	__be64 start;
+	__be32 logsize_usrpage;
+	__be32 tavor_pd;	/* reserved for Arbel */
+	u8     reserved1[3];
+	u8     intr;
+	__be32 arbel_pd;	/* lost_count for Tavor */
+	__be32 lkey;
+	u32    reserved2[2];
+	__be32 consumer_index;
+	__be32 producer_index;
+	u32    reserved3[4];
+} __attribute__((packed));
+
+#define MTHCA_EQ_STATUS_OK          ( 0 << 28)
+#define MTHCA_EQ_STATUS_OVERFLOW    ( 9U << 28)	/* unsigned: 9 << 28 does not fit in int */
+#define MTHCA_EQ_STATUS_WRITE_FAIL  (10U << 28)	/* unsigned: 10 << 28 does not fit in int */
+#define MTHCA_EQ_OWNER_SW           ( 0 << 24)
+#define MTHCA_EQ_OWNER_HW           ( 1 << 24)
+#define MTHCA_EQ_FLAG_TR            ( 1 << 18)
+#define MTHCA_EQ_FLAG_OI            ( 1 << 17)
+#define MTHCA_EQ_STATE_ARMED        ( 1 <<  8)
+#define MTHCA_EQ_STATE_FIRED        ( 2 <<  8)
+#define MTHCA_EQ_STATE_ALWAYS_ARMED ( 3 <<  8)
+#define MTHCA_EQ_STATE_ARBEL        ( 8 <<  8)
+
+enum {	/* values of mthca_eqe.type, dispatched on in mthca_eq_int() */
+	MTHCA_EVENT_TYPE_COMP       	    = 0x00,
+	MTHCA_EVENT_TYPE_PATH_MIG   	    = 0x01,
+	MTHCA_EVENT_TYPE_COMM_EST   	    = 0x02,
+	MTHCA_EVENT_TYPE_SQ_DRAINED 	    = 0x03,
+	MTHCA_EVENT_TYPE_SRQ_QP_LAST_WQE    = 0x13,
+	MTHCA_EVENT_TYPE_SRQ_LIMIT	    = 0x14,
+	MTHCA_EVENT_TYPE_CQ_ERROR   	    = 0x04,
+	MTHCA_EVENT_TYPE_WQ_CATAS_ERROR     = 0x05,
+	MTHCA_EVENT_TYPE_EEC_CATAS_ERROR    = 0x06,
+	MTHCA_EVENT_TYPE_PATH_MIG_FAILED    = 0x07,
+	MTHCA_EVENT_TYPE_WQ_INVAL_REQ_ERROR = 0x10,
+	MTHCA_EVENT_TYPE_WQ_ACCESS_ERROR    = 0x11,
+	MTHCA_EVENT_TYPE_SRQ_CATAS_ERROR    = 0x12,
+	MTHCA_EVENT_TYPE_LOCAL_CATAS_ERROR  = 0x08,
+	MTHCA_EVENT_TYPE_PORT_CHANGE        = 0x09,
+	MTHCA_EVENT_TYPE_EQ_OVERFLOW        = 0x0f,
+	MTHCA_EVENT_TYPE_ECC_DETECT         = 0x0e,
+	MTHCA_EVENT_TYPE_CMD                = 0x0a	/* also used as bit indices in the *_EVENT_MASK macros */
+};
+
+#define MTHCA_ASYNC_EVENT_MASK ((1ULL << MTHCA_EVENT_TYPE_PATH_MIG)           | \
+				(1ULL << MTHCA_EVENT_TYPE_COMM_EST)           | \
+				(1ULL << MTHCA_EVENT_TYPE_SQ_DRAINED)         | \
+				(1ULL << MTHCA_EVENT_TYPE_CQ_ERROR)           | \
+				(1ULL << MTHCA_EVENT_TYPE_WQ_CATAS_ERROR)     | \
+				(1ULL << MTHCA_EVENT_TYPE_EEC_CATAS_ERROR)    | \
+				(1ULL << MTHCA_EVENT_TYPE_PATH_MIG_FAILED)    | \
+				(1ULL << MTHCA_EVENT_TYPE_WQ_INVAL_REQ_ERROR) | \
+				(1ULL << MTHCA_EVENT_TYPE_WQ_ACCESS_ERROR)    | \
+				(1ULL << MTHCA_EVENT_TYPE_LOCAL_CATAS_ERROR)  | \
+				(1ULL << MTHCA_EVENT_TYPE_PORT_CHANGE)        | \
+				(1ULL << MTHCA_EVENT_TYPE_ECC_DETECT))
+#define MTHCA_SRQ_EVENT_MASK   ((1ULL << MTHCA_EVENT_TYPE_SRQ_CATAS_ERROR)    | \
+				(1ULL << MTHCA_EVENT_TYPE_SRQ_QP_LAST_WQE)    | \
+				(1ULL << MTHCA_EVENT_TYPE_SRQ_LIMIT))
+#define MTHCA_CMD_EVENT_MASK    (1ULL << MTHCA_EVENT_TYPE_CMD)
+
+#define MTHCA_EQ_DB_INC_CI     (1 << 24)
+#define MTHCA_EQ_DB_REQ_NOT    (2 << 24)	/* OR'ed with the EQ number in tavor_eq_req_not() */
+#define MTHCA_EQ_DB_DISARM_CQ  (3 << 24)	/* OR'ed with the EQ number in disarm_cq() */
+#define MTHCA_EQ_DB_SET_CI     (4 << 24)	/* OR'ed with the EQ number in tavor_set_eq_ci() */
+#define MTHCA_EQ_DB_ALWAYS_ARM (5 << 24)
+
+struct mthca_eqe {
+	u8 reserved1;
+	u8 type;	/* MTHCA_EVENT_TYPE_*; decides which member of event is read */
+	u8 reserved2;
+	u8 subtype;
+	union {
+		u32 raw[6];	/* 24 bytes, so the packed struct totals 32 = MTHCA_EQ_ENTRY_SIZE */
+		struct {
+			__be32 cqn;
+		} __attribute__((packed)) comp;
+		struct {
+			u16    reserved1;
+			__be16 token;
+			u32    reserved2;
+			u8     reserved3[3];
+			u8     status;
+			__be64 out_param;
+		} __attribute__((packed)) cmd;
+		struct {
+			__be32 qpn;
+		} __attribute__((packed)) qp;
+		struct {
+			__be32 srqn;
+		} __attribute__((packed)) srq;
+		struct {
+			__be32 cqn;
+			u32    reserved1;
+			u8     reserved2[3];
+			u8     syndrome;
+		} __attribute__((packed)) cq_err;
+		struct {
+			u32    reserved1[2];
+			__be32 port;
+		} __attribute__((packed)) port_change;
+	} event;
+	u8 reserved3[3];
+	u8 owner;	/* MTHCA_EQ_ENTRY_OWNER_* (bit 7), tested by next_eqe_sw() */
+} __attribute__((packed));
+
+#define  MTHCA_EQ_ENTRY_OWNER_SW      (0 << 7)	/* bit 7 clear: next_eqe_sw() returns the entry */
+#define  MTHCA_EQ_ENTRY_OWNER_HW      (1 << 7)	/* bit 7 set: next_eqe_sw() returns NULL */
+
+static inline u64 async_mask(struct mthca_dev *dev)
+{
+	/* The SRQ event bits are added only when MTHCA_FLAG_SRQ is set. */
+	return MTHCA_ASYNC_EVENT_MASK |
+		(dev->mthca_flags & MTHCA_FLAG_SRQ ? MTHCA_SRQ_EVENT_MASK : 0);
+}
+
+static inline void tavor_set_eq_ci(struct mthca_dev *dev, struct mthca_eq *eq, u32 ci)
+{
+	const u32 db_hi = MTHCA_EQ_DB_SET_CI | eq->eqn;
+	const u32 db_lo = ci & (eq->nent - 1);
+
+	/*
+	 * Ownership bits written by set_eqe_hw() must reach memory before
+	 * the consumer index moves: once it does, the HCA may write new
+	 * EQ entries, and a late set_eqe_hw() store could then clobber
+	 * the owner field of a freshly written entry.
+	 */
+	wmb();
+	mthca_write64(db_hi, db_lo, dev->kar + MTHCA_EQ_DOORBELL,
+		      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+}
+
+static inline void arbel_set_eq_ci(struct mthca_dev *dev, struct mthca_eq *eq, u32 ci)
+{
+	/* See comment in tavor_set_eq_ci() above. */
+	wmb();
+	__raw_writel((__force u32) cpu_to_be32(ci),
+		     dev->eq_regs.arbel.eq_set_ci_base + eq->eqn * 8);	/* 8 bytes of register space per EQ */
+	/* ci was swabbed by hand and written raw, so mb() supplies the ordering. */
+	mb();
+}
+
+static inline void set_eq_ci(struct mthca_dev *dev, struct mthca_eq *eq, u32 ci)
+{
+	if (!mthca_is_memfree(dev))
+		tavor_set_eq_ci(dev, eq, ci);	/* doorbell written through dev->kar */
+	else
+		arbel_set_eq_ci(dev, eq, ci);	/* direct write to the eq_set_ci register */
+}
+
+static inline void tavor_eq_req_not(struct mthca_dev *dev, int eqn)
+{
+	mthca_write64(MTHCA_EQ_DB_REQ_NOT | eqn, 0,	/* command | EQ number; second word is 0 */
+		      dev->kar + MTHCA_EQ_DOORBELL,
+		      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+}
+
+static inline void arbel_eq_req_not(struct mthca_dev *dev, u32 eqn_mask)
+{
+	writel(eqn_mask, dev->eq_regs.arbel.eq_arm);	/* one register write; takes eqn_mask where the Tavor variant takes eqn */
+}
+
+static inline void disarm_cq(struct mthca_dev *dev, int eqn, int cqn)
+{
+	if (mthca_is_memfree(dev))
+		return;	/* mem-free HCAs are not sent DISARM_CQ */
+	mthca_write64(MTHCA_EQ_DB_DISARM_CQ | eqn, cqn,
+		      dev->kar + MTHCA_EQ_DOORBELL,
+		      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+}
+
+static inline struct mthca_eqe *get_eqe(struct mthca_eq *eq, u32 entry)
+{
+	unsigned long off = (entry & (eq->nent - 1)) * MTHCA_EQ_ENTRY_SIZE;	/* byte offset of the wrapped entry */
+	return eq->page_list[off / PAGE_SIZE].buf + off % PAGE_SIZE;	/* pick the page, then the offset within it */
+}
+
+static inline struct mthca_eqe *next_eqe_sw(struct mthca_eq *eq)
+{
+	struct mthca_eqe *eqe = get_eqe(eq, eq->cons_index);
+	/* An entry whose owner bit is still set belongs to hardware: report none. */
+	return (eqe->owner & MTHCA_EQ_ENTRY_OWNER_HW) ? NULL : eqe;
+}
+
+static inline void set_eqe_hw(struct mthca_eqe *eqe)
+{
+	eqe->owner =  MTHCA_EQ_ENTRY_OWNER_HW;	/* overwrites the whole owner byte with 0x80 */
+}
+
+static void port_change(struct mthca_dev *dev, int port, int active)
+{
+	struct ib_event ev;
+
+	mthca_dbg(dev, "Port change to %s for port %d\n",
+		  active ? "active" : "down", port);
+
+	/* Fill in and dispatch an IB port event for the new link state. */
+	ev.element.port_num = port;
+	ev.event  = active ? IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
+	ev.device = &dev->ib_dev;
+	ib_dispatch_event(&ev);
+}
+
+/*
+ * Drain every software-owned entry currently in @eq and dispatch each
+ * one according to its type.
+ *
+ * Each processed entry is handed back to hardware and eq->cons_index
+ * is advanced.  The consumer index is pushed to the HCA from here only
+ * once per MTHCA_NUM_SPARE_EQE processed entries; the final update is
+ * left to the caller.
+ *
+ * Returns 1 if at least one entry was processed, 0 otherwise.
+ */
+static int mthca_eq_int(struct mthca_dev *dev, struct mthca_eq *eq)
+{
+	struct mthca_eqe *eqe;
+	int disarm_cqn;
+	int eqes_found = 0;
+	int set_ci = 0;	/* entries processed since the last CI update */
+
+	while ((eqe = next_eqe_sw(eq))) {
+		/*
+		 * Make sure we read EQ entry contents after we've
+		 * checked the ownership bit.
+		 */
+		rmb();
+
+		switch (eqe->type) {
+		case MTHCA_EVENT_TYPE_COMP:
+			/* Object numbers occupy the low 24 bits of the field. */
+			disarm_cqn = be32_to_cpu(eqe->event.comp.cqn) & 0xffffff;
+			disarm_cq(dev, eq->eqn, disarm_cqn);
+			mthca_cq_completion(dev, disarm_cqn);
+			break;
+
+		case MTHCA_EVENT_TYPE_PATH_MIG:
+			mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+				       IB_EVENT_PATH_MIG);
+			break;
+
+		case MTHCA_EVENT_TYPE_COMM_EST:
+			mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+				       IB_EVENT_COMM_EST);
+			break;
+
+		case MTHCA_EVENT_TYPE_SQ_DRAINED:
+			mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+				       IB_EVENT_SQ_DRAINED);
+			break;
+
+		case MTHCA_EVENT_TYPE_SRQ_QP_LAST_WQE:
+			mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+				       IB_EVENT_QP_LAST_WQE_REACHED);
+			break;
+
+		case MTHCA_EVENT_TYPE_SRQ_LIMIT:
+			mthca_srq_event(dev, be32_to_cpu(eqe->event.srq.srqn) & 0xffffff,
+					IB_EVENT_SRQ_LIMIT_REACHED);
+			break;
+
+		case MTHCA_EVENT_TYPE_WQ_CATAS_ERROR:
+			mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+				       IB_EVENT_QP_FATAL);
+			break;
+
+		case MTHCA_EVENT_TYPE_PATH_MIG_FAILED:
+			mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+				       IB_EVENT_PATH_MIG_ERR);
+			break;
+
+		case MTHCA_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+			mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+				       IB_EVENT_QP_REQ_ERR);
+			break;
+
+		case MTHCA_EVENT_TYPE_WQ_ACCESS_ERROR:
+			mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+				       IB_EVENT_QP_ACCESS_ERR);
+			break;
+
+		case MTHCA_EVENT_TYPE_CMD:
+			mthca_cmd_event(dev,
+					be16_to_cpu(eqe->event.cmd.token),
+					eqe->event.cmd.status,
+					be64_to_cpu(eqe->event.cmd.out_param));
+			break;
+
+		case MTHCA_EVENT_TYPE_PORT_CHANGE:
+			/* Port number is in bits 29:28; subtype 0x4 means "active". */
+			port_change(dev,
+				    (be32_to_cpu(eqe->event.port_change.port) >> 28) & 3,
+				    eqe->subtype == 0x4);
+			break;
+
+		case MTHCA_EVENT_TYPE_CQ_ERROR:
+			mthca_warn(dev, "CQ %s on CQN %06x\n",
+				   eqe->event.cq_err.syndrome == 1 ?
+				   "overrun" : "access violation",
+				   be32_to_cpu(eqe->event.cq_err.cqn) & 0xffffff);
+			/*
+			 * NOTE(review): unlike the other cases the CQN is
+			 * passed on unmasked here -- confirm that
+			 * mthca_cq_event() masks it itself.
+			 */
+			mthca_cq_event(dev, be32_to_cpu(eqe->event.cq_err.cqn),
+				       IB_EVENT_CQ_ERR);
+			break;
+
+		case MTHCA_EVENT_TYPE_EQ_OVERFLOW:
+			mthca_warn(dev, "EQ overrun on EQN %d\n", eq->eqn);
+			break;
+
+		/* Known but unhandled types share the default warning. */
+		case MTHCA_EVENT_TYPE_EEC_CATAS_ERROR:
+		case MTHCA_EVENT_TYPE_SRQ_CATAS_ERROR:
+		case MTHCA_EVENT_TYPE_LOCAL_CATAS_ERROR:
+		case MTHCA_EVENT_TYPE_ECC_DETECT:
+		default:
+			mthca_warn(dev, "Unhandled event %02x(%02x) on EQ %d\n",
+				   eqe->type, eqe->subtype, eq->eqn);
+			break;
+		};
+
+		/* Return the slot to hardware and move on to the next one. */
+		set_eqe_hw(eqe);
+		++eq->cons_index;
+		eqes_found = 1;
+		++set_ci;
+
+		/*
+		 * The HCA will think the queue has overflowed if we
+		 * don't tell it we've been processing events.  We
+		 * create our EQs with MTHCA_NUM_SPARE_EQE extra
+		 * entries, so we must update our consumer index at
+		 * least that often.
+		 */
+		if (unlikely(set_ci >= MTHCA_NUM_SPARE_EQE)) {
+			/*
+			 * Conditional on hca_type is OK here because
+			 * this is a rare case, not the fast path.
+			 */
+			set_eq_ci(dev, eq, eq->cons_index);
+			set_ci = 0;
+		}
+	}
+
+	/*
+	 * Rely on caller to set consumer index so that we don't have
+	 * to test hca_type in our interrupt handling fast path.
+	 */
+	return eqes_found;
+}
+
+/*
+ * Line-interrupt handler used on non-mem-free HCAs (@dev_ptr is the
+ * struct mthca_dev).
+ *
+ * Writes the interrupt-clear mask if one is configured, reads the
+ * pending-EQ word from the ECR and returns IRQ_NONE if it is zero.
+ * Otherwise the pending bits are written to the ECR clear register
+ * and every EQ whose mask bit is set is processed: its consumer index
+ * is updated if any entries were handled, and a REQ_NOT doorbell is
+ * always rung for it.
+ */
+static irqreturn_t mthca_tavor_interrupt(int irq, void *dev_ptr)
+{
+	struct mthca_dev *dev = dev_ptr;
+	struct mthca_eq *eq;
+	u32 pending;
+	int n;
+
+	if (dev->eq_table.clr_mask)
+		writel(dev->eq_table.clr_mask, dev->eq_table.clr_int);
+
+	pending = readl(dev->eq_regs.tavor.ecr_base + 4);
+	if (!pending)
+		return IRQ_NONE;
+
+	writel(pending, dev->eq_regs.tavor.ecr_base +
+	       MTHCA_ECR_CLR_BASE - MTHCA_ECR_BASE + 4);
+
+	for (n = 0; n < MTHCA_NUM_EQ; ++n) {
+		eq = &dev->eq_table.eq[n];
+		if (!(pending & eq->eqn_mask))
+			continue;
+
+		if (mthca_eq_int(dev, eq))
+			tavor_set_eq_ci(dev, eq, eq->cons_index);
+		tavor_eq_req_not(dev, eq->eqn);
+	}
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Per-EQ MSI-X handler used on non-mem-free HCAs; @eq_ptr is the
+ * struct mthca_eq the vector was requested for.  Processes pending
+ * entries, then unconditionally writes the consumer index and rings a
+ * REQ_NOT doorbell for the EQ.
+ */
+static irqreturn_t mthca_tavor_msi_x_interrupt(int irq, void *eq_ptr)
+{
+	struct mthca_eq  *eq  = eq_ptr;
+	struct mthca_dev *dev = eq->dev;
+
+	mthca_eq_int(dev, eq);
+	tavor_set_eq_ci(dev, eq, eq->cons_index);
+	tavor_eq_req_not(dev, eq->eqn);
+
+	/* MSI-X vectors always belong to us */
+	return IRQ_HANDLED;
+}
+
+/*
+ * Line-interrupt handler used on mem-free HCAs (@dev_ptr is the
+ * struct mthca_dev).
+ *
+ * Writes the interrupt-clear mask if one is configured, then polls
+ * every EQ; each EQ that had entries gets its consumer index updated.
+ * All EQs in dev->eq_table.arm_mask are re-armed afterwards.  The
+ * return value is IRQ_RETVAL() of whether any EQ had work.
+ */
+static irqreturn_t mthca_arbel_interrupt(int irq, void *dev_ptr)
+{
+	struct mthca_dev *dev = dev_ptr;
+	struct mthca_eq *eq;
+	int handled = 0;
+	int n;
+
+	if (dev->eq_table.clr_mask)
+		writel(dev->eq_table.clr_mask, dev->eq_table.clr_int);
+
+	for (n = 0; n < MTHCA_NUM_EQ; ++n) {
+		eq = &dev->eq_table.eq[n];
+		if (!mthca_eq_int(dev, eq))
+			continue;
+
+		handled = 1;
+		arbel_set_eq_ci(dev, eq, eq->cons_index);
+	}
+
+	arbel_eq_req_not(dev, dev->eq_table.arm_mask);
+
+	return IRQ_RETVAL(handled);
+}
+
+/*
+ * Per-EQ MSI-X handler used on mem-free HCAs; @eq_ptr is the
+ * struct mthca_eq the vector was requested for.  Processes pending
+ * entries, then unconditionally writes the consumer index and re-arms
+ * this EQ through the arm register.
+ */
+static irqreturn_t mthca_arbel_msi_x_interrupt(int irq, void *eq_ptr)
+{
+	struct mthca_eq  *eq  = eq_ptr;
+	struct mthca_dev *dev = eq->dev;
+
+	mthca_eq_int(dev, eq);
+	arbel_set_eq_ci(dev, eq, eq->cons_index);
+	arbel_eq_req_not(dev, eq->eqn_mask);
+
+	/* MSI-X vectors always belong to us */
+	return IRQ_HANDLED;
+}
+
+/*
+ * Allocate an event queue with room for at least @nent entries and
+ * hand it to the HCA.
+ *
+ * The entry count is rounded up to a power of two (minimum 2).  The
+ * ring is backed by PAGE_SIZE DMA-coherent pages, all entries start
+ * out hardware-owned, the pages are registered as a memory region and
+ * an EQ context describing them is passed to SW2HW_EQ.  @intr is
+ * stored in the context's intr field.
+ *
+ * On success @eq is filled in, its bit is added to
+ * dev->eq_table.arm_mask and 0 is returned.  On failure everything
+ * acquired so far is released and a negative errno is returned:
+ * -ENOMEM for allocation failures, -EINVAL for a non-zero SW2HW_EQ
+ * status, otherwise the error from the failing call.
+ */
+static int mthca_create_eq(struct mthca_dev *dev,
+			   int nent,
+			   u8 intr,
+			   struct mthca_eq *eq)
+{
+	int npages;
+	u64 *dma_list = NULL;
+	dma_addr_t t;
+	struct mthca_mailbox *mailbox;
+	struct mthca_eq_context *eq_context;
+	int err = -ENOMEM;
+	int i;
+	u8 status;
+
+	eq->dev  = dev;
+	eq->nent = roundup_pow_of_two(max(nent, 2));
+	npages = ALIGN(eq->nent * MTHCA_EQ_ENTRY_SIZE, PAGE_SIZE) / PAGE_SIZE;
+
+	eq->page_list = kmalloc(npages * sizeof *eq->page_list,
+				GFP_KERNEL);
+	if (!eq->page_list)
+		goto err_out;
+
+	/* NULL every slot so the error path can tell which pages exist. */
+	for (i = 0; i < npages; ++i)
+		eq->page_list[i].buf = NULL;
+
+	dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL);
+	if (!dma_list)
+		goto err_out_free;
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox))
+		goto err_out_free;
+	eq_context = mailbox->buf;
+
+	for (i = 0; i < npages; ++i) {
+		eq->page_list[i].buf = dma_alloc_coherent(&dev->pdev->dev,
+							  PAGE_SIZE, &t, GFP_KERNEL);
+		if (!eq->page_list[i].buf)
+			goto err_out_free_pages;
+
+		dma_list[i] = t;
+		pci_unmap_addr_set(&eq->page_list[i], mapping, t);
+
+		clear_page(eq->page_list[i].buf);
+	}
+
+	for (i = 0; i < eq->nent; ++i)
+		set_eqe_hw(get_eqe(eq, i));
+
+	eq->eqn = mthca_alloc(&dev->eq_table.alloc);
+	if (eq->eqn == -1)
+		goto err_out_free_pages;
+
+	err = mthca_mr_alloc_phys(dev, dev->driver_pd.pd_num,
+				  dma_list, PAGE_SHIFT, npages,
+				  0, npages * PAGE_SIZE,
+				  MTHCA_MPT_FLAG_LOCAL_WRITE |
+				  MTHCA_MPT_FLAG_LOCAL_READ,
+				  &eq->mr);
+	if (err)
+		goto err_out_free_eq;
+
+	memset(eq_context, 0, sizeof *eq_context);
+	eq_context->flags           = cpu_to_be32(MTHCA_EQ_STATUS_OK   |
+						  MTHCA_EQ_OWNER_HW    |
+						  MTHCA_EQ_STATE_ARMED |
+						  MTHCA_EQ_FLAG_TR);
+	if (mthca_is_memfree(dev))
+		eq_context->flags  |= cpu_to_be32(MTHCA_EQ_STATE_ARBEL);
+
+	/* log2 of the ring size goes in the top byte. */
+	eq_context->logsize_usrpage = cpu_to_be32((ffs(eq->nent) - 1) << 24);
+	if (mthca_is_memfree(dev)) {
+		eq_context->arbel_pd = cpu_to_be32(dev->driver_pd.pd_num);
+	} else {
+		eq_context->logsize_usrpage |= cpu_to_be32(dev->driver_uar.index);
+		eq_context->tavor_pd         = cpu_to_be32(dev->driver_pd.pd_num);
+	}
+	eq_context->intr            = intr;
+	eq_context->lkey            = cpu_to_be32(eq->mr.ibmr.lkey);
+
+	err = mthca_SW2HW_EQ(dev, mailbox, eq->eqn, &status);
+	if (err) {
+		mthca_warn(dev, "SW2HW_EQ failed (%d)\n", err);
+		goto err_out_free_mr;
+	}
+	if (status) {
+		mthca_warn(dev, "SW2HW_EQ returned status 0x%02x\n",
+			   status);
+		err = -EINVAL;
+		goto err_out_free_mr;
+	}
+
+	kfree(dma_list);
+	mthca_free_mailbox(dev, mailbox);
+
+	/*
+	 * Shift an unsigned 1: EQ numbers go up to 31, and shifting a
+	 * signed 1 into the sign bit is undefined behaviour.
+	 */
+	eq->eqn_mask   = swab32(1U << eq->eqn);
+	eq->cons_index = 0;
+
+	dev->eq_table.arm_mask |= eq->eqn_mask;
+
+	mthca_dbg(dev, "Allocated EQ %d with %d entries\n",
+		  eq->eqn, eq->nent);
+
+	return err;
+
+ err_out_free_mr:
+	mthca_free_mr(dev, &eq->mr);
+
+ err_out_free_eq:
+	mthca_free(&dev->eq_table.alloc, eq->eqn);
+
+ err_out_free_pages:
+	for (i = 0; i < npages; ++i)
+		if (eq->page_list[i].buf)
+			dma_free_coherent(&dev->pdev->dev, PAGE_SIZE,
+					  eq->page_list[i].buf,
+					  pci_unmap_addr(&eq->page_list[i],
+							 mapping));
+
+	mthca_free_mailbox(dev, mailbox);
+
+ err_out_free:
+	kfree(eq->page_list);
+	kfree(dma_list);
+
+ err_out:
+	return err;
+}
+
+/*
+ * Take @eq back from the HCA with HW2SW_EQ and release its memory
+ * region, ring pages and page list.
+ *
+ * If no command mailbox can be allocated the function returns without
+ * releasing anything.  A failing HW2SW_EQ is only logged; teardown
+ * continues regardless.
+ */
+static void mthca_free_eq(struct mthca_dev *dev,
+			  struct mthca_eq *eq)
+{
+	struct mthca_mailbox *mailbox;
+	int err;
+	u8 status;
+	int npages = (eq->nent * MTHCA_EQ_ENTRY_SIZE + PAGE_SIZE - 1) /
+		PAGE_SIZE;
+	int i;
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox))
+		return;
+
+	/*
+	 * Only look at status when the command itself succeeded, as
+	 * mthca_create_eq() does for SW2HW_EQ; after a failed command
+	 * its value is not meaningful.
+	 */
+	err = mthca_HW2SW_EQ(dev, mailbox, eq->eqn, &status);
+	if (err)
+		mthca_warn(dev, "HW2SW_EQ failed (%d)\n", err);
+	else if (status)
+		mthca_warn(dev, "HW2SW_EQ returned status 0x%02x\n", status);
+
+	dev->eq_table.arm_mask &= ~eq->eqn_mask;
+
+	/* Disabled debugging aid: dump the mailbox contents word by word. */
+	if (0) {
+		mthca_dbg(dev, "Dumping EQ context %02x:\n", eq->eqn);
+		for (i = 0; i < sizeof (struct mthca_eq_context) / 4; ++i) {
+			if (i % 4 == 0)
+				printk("[%02x] ", i * 4);
+			printk(" %08x", be32_to_cpup(mailbox->buf + i * 4));
+			if ((i + 1) % 4 == 0)
+				printk("\n");
+		}
+	}
+
+	mthca_free_mr(dev, &eq->mr);
+	for (i = 0; i < npages; ++i)
+		pci_free_consistent(dev->pdev, PAGE_SIZE,
+				    eq->page_list[i].buf,
+				    pci_unmap_addr(&eq->page_list[i], mapping));
+
+	kfree(eq->page_list);
+	mthca_free_mailbox(dev, mailbox);
+}
+
+/*
+ * Release every interrupt this device has requested: the PCI line
+ * interrupt if dev->eq_table.have_irq is set, and the MSI-X vector of
+ * each EQ whose have_irq flag is set (the per-EQ flag is cleared
+ * afterwards).
+ */
+static void mthca_free_irqs(struct mthca_dev *dev)
+{
+	struct mthca_eq *eq;
+	int n;
+
+	if (dev->eq_table.have_irq)
+		free_irq(dev->pdev->irq, dev);
+
+	for (n = 0; n < MTHCA_NUM_EQ; ++n) {
+		eq = &dev->eq_table.eq[n];
+		if (!eq->have_irq)
+			continue;
+
+		free_irq(eq->msi_x_vector, eq);
+		eq->have_irq = 0;
+	}
+}
+
+/*
+ * ioremap() @size bytes starting @offset bytes into PCI resource 0 of
+ * the device and store the mapping in *@map.
+ *
+ * Returns 0 on success or -ENOMEM if the mapping failed (*@map is
+ * NULL in that case).
+ */
+static int mthca_map_reg(struct mthca_dev *dev,
+			 unsigned long offset, unsigned long size,
+			 void __iomem **map)
+{
+	unsigned long bar0 = pci_resource_start(dev->pdev, 0);
+	void __iomem *va = ioremap(bar0 + offset, size);
+
+	*map = va;
+
+	return va ? 0 : -ENOMEM;
+}
+
+/*
+ * Map the registers needed for EQ handling.
+ *
+ * Non-mem-free HCAs use fixed offsets for the interrupt-clear and ECR
+ * registers.  Mem-free HCAs take the offsets of the interrupt-clear,
+ * EQ arm and EQ set-CI registers from firmware-reported addresses,
+ * reduced to offsets within the first BAR.
+ *
+ * Returns 0 on success, or -ENOMEM after undoing any mappings already
+ * made.
+ */
+static int mthca_map_eq_regs(struct mthca_dev *dev)
+{
+	if (!mthca_is_memfree(dev)) {
+		if (mthca_map_reg(dev, MTHCA_CLR_INT_BASE, MTHCA_CLR_INT_SIZE,
+				  &dev->clr_base)) {
+			mthca_err(dev, "Couldn't map interrupt clear register, "
+				  "aborting.\n");
+			return -ENOMEM;
+		}
+
+		if (mthca_map_reg(dev, MTHCA_ECR_BASE,
+				  MTHCA_ECR_SIZE + MTHCA_ECR_CLR_SIZE,
+				  &dev->eq_regs.tavor.ecr_base)) {
+			mthca_err(dev, "Couldn't map ecr register, "
+				  "aborting.\n");
+			goto unmap_clr;
+		}
+
+		return 0;
+	}
+
+	/*
+	 * We assume that the EQ arm and EQ set CI registers
+	 * fall within the first BAR.  We can't trust the
+	 * values firmware gives us, since those addresses are
+	 * valid on the HCA's side of the PCI bus but not
+	 * necessarily the host side.
+	 */
+	if (mthca_map_reg(dev, (pci_resource_len(dev->pdev, 0) - 1) &
+			  dev->fw.arbel.clr_int_base, MTHCA_CLR_INT_SIZE,
+			  &dev->clr_base)) {
+		mthca_err(dev, "Couldn't map interrupt clear register, "
+			  "aborting.\n");
+		return -ENOMEM;
+	}
+
+	/*
+	 * Add 4 because we limit ourselves to EQs 0 ... 31,
+	 * so we only need the low word of the register.
+	 */
+	if (mthca_map_reg(dev, ((pci_resource_len(dev->pdev, 0) - 1) &
+				dev->fw.arbel.eq_arm_base) + 4, 4,
+			  &dev->eq_regs.arbel.eq_arm)) {
+		mthca_err(dev, "Couldn't map EQ arm register, aborting.\n");
+		goto unmap_clr;
+	}
+
+	if (mthca_map_reg(dev, (pci_resource_len(dev->pdev, 0) - 1) &
+			  dev->fw.arbel.eq_set_ci_base,
+			  MTHCA_EQ_SET_CI_SIZE,
+			  &dev->eq_regs.arbel.eq_set_ci_base)) {
+		mthca_err(dev, "Couldn't map EQ CI register, aborting.\n");
+		goto unmap_arm;
+	}
+
+	return 0;
+
+unmap_arm:
+	iounmap(dev->eq_regs.arbel.eq_arm);
+
+unmap_clr:
+	iounmap(dev->clr_base);
+	return -ENOMEM;
+}
+
+/*
+ * Undo mthca_map_eq_regs(): unmap the HCA-type-specific EQ registers
+ * first and the interrupt-clear register last.
+ */
+static void mthca_unmap_eq_regs(struct mthca_dev *dev)
+{
+	if (!mthca_is_memfree(dev)) {
+		iounmap(dev->eq_regs.tavor.ecr_base);
+	} else {
+		iounmap(dev->eq_regs.arbel.eq_set_ci_base);
+		iounmap(dev->eq_regs.arbel.eq_arm);
+	}
+
+	iounmap(dev->clr_base);
+}
+
+/*
+ * Allocate one page, DMA-map it and map it into the HCA's ICM at
+ * virtual address @icm_virt to hold the EQ context table.
+ *
+ * Returns 0 on success.  On failure the page (and its DMA mapping, if
+ * made) is released and a negative errno is returned: -ENOMEM for
+ * allocation or mapping failures, -EINVAL for a non-zero MAP_ICM_page
+ * status, otherwise the error from mthca_MAP_ICM_page().
+ */
+int mthca_map_eq_icm(struct mthca_dev *dev, u64 icm_virt)
+{
+	int ret;
+	u8 status;
+
+	/*
+	 * We assume that mapping one page is enough for the whole EQ
+	 * context table.  This is fine with all current HCAs, because
+	 * we only use 32 EQs and each EQ uses 32 bytes of context
+	 * memory, or 1 KB total.
+	 */
+	dev->eq_table.icm_virt = icm_virt;
+	dev->eq_table.icm_page = alloc_page(GFP_HIGHUSER);
+	if (!dev->eq_table.icm_page)
+		return -ENOMEM;
+
+	dev->eq_table.icm_dma  = pci_map_page(dev->pdev, dev->eq_table.icm_page, 0,
+					      PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
+	if (pci_dma_mapping_error(dev->pdev, dev->eq_table.icm_dma)) {
+		ret = -ENOMEM;
+		goto free_page;
+	}
+
+	ret = mthca_MAP_ICM_page(dev, dev->eq_table.icm_dma, icm_virt, &status);
+	if (ret == 0 && status != 0)
+		ret = -EINVAL;
+	if (ret == 0)
+		return 0;
+
+	pci_unmap_page(dev->pdev, dev->eq_table.icm_dma, PAGE_SIZE,
+		       PCI_DMA_BIDIRECTIONAL);
+
+free_page:
+	__free_page(dev->eq_table.icm_page);
+	return ret;
+}
+
+/*
+ * Undo mthca_map_eq_icm(): issue UNMAP_ICM at the saved ICM virtual
+ * address, remove the DMA mapping and free the page.  The result of
+ * UNMAP_ICM (return value and status) is ignored.
+ */
+void mthca_unmap_eq_icm(struct mthca_dev *dev)
+{
+	u8 status;
+
+	/* NOTE(review): the literal 1 is presumably a page count -- confirm. */
+	mthca_UNMAP_ICM(dev, dev->eq_table.icm_virt, 1, &status);
+	pci_unmap_page(dev->pdev, dev->eq_table.icm_dma, PAGE_SIZE,
+		       PCI_DMA_BIDIRECTIONAL);
+	__free_page(dev->eq_table.icm_page);
+}
+
+/*
+ * Set up the device's event queues.
+ *
+ * Initializes the EQ number allocator, maps the EQ registers, creates
+ * the completion, async and command EQs, requests either one MSI-X
+ * vector per EQ or the shared PCI interrupt, maps the async and
+ * command event classes to their EQs and finally requests
+ * notification on every EQ.
+ *
+ * Returns 0 on success.  Failures up to and including the interrupt
+ * request unwind everything and return a negative errno.  MAP_EQ
+ * failures are only logged and do not fail initialization.
+ */
+int mthca_init_eq_table(struct mthca_dev *dev)
+{
+	int err;
+	u8 status;
+	u8 intr;
+	int i;
+
+	err = mthca_alloc_init(&dev->eq_table.alloc,
+			       dev->limits.num_eqs,
+			       dev->limits.num_eqs - 1,
+			       dev->limits.reserved_eqs);
+	if (err)
+		return err;
+
+	err = mthca_map_eq_regs(dev);
+	if (err)
+		goto err_out_free;
+
+	if (dev->mthca_flags & MTHCA_FLAG_MSI_X) {
+		dev->eq_table.clr_mask = 0;
+	} else {
+		/*
+		 * Shift an unsigned 1: the shift count can be 31, and
+		 * shifting a signed 1 into the sign bit is undefined.
+		 */
+		dev->eq_table.clr_mask =
+			swab32(1U << (dev->eq_table.inta_pin & 31));
+		/* Pins 0..31 are cleared through the word at offset 4. */
+		dev->eq_table.clr_int  = dev->clr_base +
+			(dev->eq_table.inta_pin < 32 ? 4 : 0);
+	}
+
+	dev->eq_table.arm_mask = 0;
+
+	intr = dev->eq_table.inta_pin;
+
+	/* With MSI-X each EQ gets its own interrupt number (128..130). */
+	err = mthca_create_eq(dev, dev->limits.num_cqs + MTHCA_NUM_SPARE_EQE,
+			      (dev->mthca_flags & MTHCA_FLAG_MSI_X) ? 128 : intr,
+			      &dev->eq_table.eq[MTHCA_EQ_COMP]);
+	if (err)
+		goto err_out_unmap;
+
+	err = mthca_create_eq(dev, MTHCA_NUM_ASYNC_EQE + MTHCA_NUM_SPARE_EQE,
+			      (dev->mthca_flags & MTHCA_FLAG_MSI_X) ? 129 : intr,
+			      &dev->eq_table.eq[MTHCA_EQ_ASYNC]);
+	if (err)
+		goto err_out_comp;
+
+	err = mthca_create_eq(dev, MTHCA_NUM_CMD_EQE + MTHCA_NUM_SPARE_EQE,
+			      (dev->mthca_flags & MTHCA_FLAG_MSI_X) ? 130 : intr,
+			      &dev->eq_table.eq[MTHCA_EQ_CMD]);
+	if (err)
+		goto err_out_async;
+
+	if (dev->mthca_flags & MTHCA_FLAG_MSI_X) {
+		static const char *eq_name[] = {
+			[MTHCA_EQ_COMP]  = DRV_NAME " (comp)",
+			[MTHCA_EQ_ASYNC] = DRV_NAME " (async)",
+			[MTHCA_EQ_CMD]   = DRV_NAME " (cmd)"
+		};
+
+		for (i = 0; i < MTHCA_NUM_EQ; ++i) {
+			err = request_irq(dev->eq_table.eq[i].msi_x_vector,
+					  mthca_is_memfree(dev) ?
+					  mthca_arbel_msi_x_interrupt :
+					  mthca_tavor_msi_x_interrupt,
+					  0, eq_name[i], dev->eq_table.eq + i);
+			if (err)
+				goto err_out_cmd;
+			dev->eq_table.eq[i].have_irq = 1;
+		}
+	} else {
+		err = request_irq(dev->pdev->irq,
+				  mthca_is_memfree(dev) ?
+				  mthca_arbel_interrupt :
+				  mthca_tavor_interrupt,
+				  IRQF_SHARED, DRV_NAME, dev);
+		if (err)
+			goto err_out_cmd;
+		dev->eq_table.have_irq = 1;
+	}
+
+	/*
+	 * For both MAP_EQ commands, status is only examined when the
+	 * command itself succeeded; after a failed command its value
+	 * is not meaningful.
+	 */
+	err = mthca_MAP_EQ(dev, async_mask(dev),
+			   0, dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn, &status);
+	if (err)
+		mthca_warn(dev, "MAP_EQ for async EQ %d failed (%d)\n",
+			   dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn, err);
+	else if (status)
+		mthca_warn(dev, "MAP_EQ for async EQ %d returned status 0x%02x\n",
+			   dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn, status);
+
+	err = mthca_MAP_EQ(dev, MTHCA_CMD_EVENT_MASK,
+			   0, dev->eq_table.eq[MTHCA_EQ_CMD].eqn, &status);
+	if (err)
+		mthca_warn(dev, "MAP_EQ for cmd EQ %d failed (%d)\n",
+			   dev->eq_table.eq[MTHCA_EQ_CMD].eqn, err);
+	else if (status)
+		mthca_warn(dev, "MAP_EQ for cmd EQ %d returned status 0x%02x\n",
+			   dev->eq_table.eq[MTHCA_EQ_CMD].eqn, status);
+
+	for (i = 0; i < MTHCA_NUM_EQ; ++i)
+		if (mthca_is_memfree(dev))
+			arbel_eq_req_not(dev, dev->eq_table.eq[i].eqn_mask);
+		else
+			tavor_eq_req_not(dev, dev->eq_table.eq[i].eqn);
+
+	return 0;
+
+err_out_cmd:
+	mthca_free_irqs(dev);
+	mthca_free_eq(dev, &dev->eq_table.eq[MTHCA_EQ_CMD]);
+
+err_out_async:
+	mthca_free_eq(dev, &dev->eq_table.eq[MTHCA_EQ_ASYNC]);
+
+err_out_comp:
+	mthca_free_eq(dev, &dev->eq_table.eq[MTHCA_EQ_COMP]);
+
+err_out_unmap:
+	mthca_unmap_eq_regs(dev);
+
+err_out_free:
+	mthca_alloc_cleanup(&dev->eq_table.alloc);
+	return err;
+}
+
+/*
+ * Tear down everything mthca_init_eq_table() set up: release the
+ * interrupts, issue MAP_EQ again for the async and command event
+ * classes with the second argument set to 1 instead of 0, free every
+ * EQ, unmap the EQ registers and destroy the EQ number allocator.
+ * The results of the MAP_EQ commands are ignored.
+ */
+void mthca_cleanup_eq_table(struct mthca_dev *dev)
+{
+	u8 status;
+	int i;
+
+	mthca_free_irqs(dev);
+
+	/* NOTE(review): the 1 presumably selects "unmap" -- confirm. */
+	mthca_MAP_EQ(dev, async_mask(dev),
+		     1, dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn, &status);
+	mthca_MAP_EQ(dev, MTHCA_CMD_EVENT_MASK,
+		     1, dev->eq_table.eq[MTHCA_EQ_CMD].eqn, &status);
+
+	for (i = 0; i < MTHCA_NUM_EQ; ++i)
+		mthca_free_eq(dev, &dev->eq_table.eq[i]);
+
+	mthca_unmap_eq_regs(dev);
+
+	mthca_alloc_cleanup(&dev->eq_table.alloc);
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_mad.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_mad.c
new file mode 100644
index 0000000..5648659
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_mad.c
@@ -0,0 +1,346 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/string.h>
+#include <linux/slab.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_mad.h>
+#include <rdma/ib_smi.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+/*
+ * Management class values that mthca_process_mad() treats as Mellanox
+ * vendor-specific classes (accepted for GET and SET only).
+ */
+enum {
+	MTHCA_VENDOR_CLASS1 = 0x9,
+	MTHCA_VENDOR_CLASS2 = 0xa
+};
+
+/*
+ * Query port @port_num (1-based) and cache its rate, computed as
+ * active_speed multiplied by the numeric link width, in
+ * dev->rate[port_num - 1].
+ *
+ * Returns 0 on success, -ENOMEM if the temporary attribute buffer
+ * cannot be allocated, or the error from ib_query_port() (which is
+ * also logged).
+ */
+static int mthca_update_rate(struct mthca_dev *dev, u8 port_num)
+{
+	struct ib_port_attr *attr;
+	int                  err;
+
+	attr = kmalloc(sizeof *attr, GFP_KERNEL);
+	if (!attr)
+		return -ENOMEM;
+
+	err = ib_query_port(&dev->ib_dev, port_num, attr);
+	if (!err)
+		dev->rate[port_num - 1] = attr->active_speed *
+					  ib_width_enum_to_int(attr->active_width);
+	else
+		printk(KERN_WARNING "ib_query_port failed (%d) for %s port %d\n",
+		       err, dev->ib_dev.name, port_num);
+
+	kfree(attr);
+	return err;
+}
+
+/*
+ * Replace the cached address handle used to reach the subnet manager
+ * on port @port_num (1-based) with one addressing LID @lid at service
+ * level @sl.
+ *
+ * Does nothing if the port has no QP0 send agent or if the new
+ * address handle cannot be created.  The swap, including destruction
+ * of any previous handle, happens under dev->sm_lock.
+ */
+static void update_sm_ah(struct mthca_dev *dev,
+			 u8 port_num, u16 lid, u8 sl)
+{
+	struct ib_mad_agent *smi_agent = dev->send_agent[port_num - 1][0];
+	struct ib_ah_attr attr;
+	struct ib_ah *ah;
+	unsigned long flags;
+
+	if (!smi_agent)
+		return;
+
+	memset(&attr, 0, sizeof attr);
+	attr.dlid     = lid;
+	attr.sl       = sl;
+	attr.port_num = port_num;
+
+	ah = ib_create_ah(smi_agent->qp->pd, &attr);
+	if (IS_ERR(ah))
+		return;
+
+	spin_lock_irqsave(&dev->sm_lock, flags);
+	if (dev->sm_ah[port_num - 1])
+		ib_destroy_ah(dev->sm_ah[port_num - 1]);
+	dev->sm_ah[port_num - 1] = ah;
+	spin_unlock_irqrestore(&dev->sm_lock, flags);
+}
+
+/*
+ * Inspect a successfully processed subnet-management SET so that LID
+ * change, client re-register and P_Key change events can be
+ * synthesized for the IB core.
+ *
+ * For PortInfo the cached port rate and SM address handle are also
+ * refreshed.  @prev_lid is the port LID as it was before the SET.
+ * MADs of any other class or method are ignored.
+ */
+static void smp_snoop(struct ib_device *ibdev,
+		      u8 port_num,
+		      struct ib_mad *mad,
+		      u16 prev_lid)
+{
+	struct ib_event event;
+	struct ib_port_info *pinfo;
+	u16 lid;
+
+	if (mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED &&
+	    mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+		return;
+
+	if (mad->mad_hdr.method != IB_MGMT_METHOD_SET)
+		return;
+
+	if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO) {
+		pinfo = (struct ib_port_info *) ((struct ib_smp *) mad)->data;
+		lid = be16_to_cpu(pinfo->lid);
+
+		mthca_update_rate(to_mdev(ibdev), port_num);
+		update_sm_ah(to_mdev(ibdev), port_num,
+			     be16_to_cpu(pinfo->sm_lid),
+			     pinfo->neighbormtu_mastersmsl & 0xf);
+
+		event.device           = ibdev;
+		event.element.port_num = port_num;
+
+		/* Top bit of this byte is the ClientReregister flag. */
+		if (pinfo->clientrereg_resv_subnetto & 0x80) {
+			event.event    = IB_EVENT_CLIENT_REREGISTER;
+			ib_dispatch_event(&event);
+		}
+
+		if (prev_lid != lid) {
+			event.event    = IB_EVENT_LID_CHANGE;
+			ib_dispatch_event(&event);
+		}
+	}
+
+	if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PKEY_TABLE) {
+		event.device           = ibdev;
+		event.event            = IB_EVENT_PKEY_CHANGE;
+		event.element.port_num = port_num;
+		ib_dispatch_event(&event);
+	}
+}
+
+/*
+ * If @mad is a subnet-management GetResp for the NodeDescription
+ * attribute, overwrite its 64-byte payload with dev->node_desc.  The
+ * copy is made while holding the device's cap_mask_mutex.
+ */
+static void node_desc_override(struct ib_device *dev,
+			       struct ib_mad *mad)
+{
+	if (mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED &&
+	    mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+		return;
+
+	if (mad->mad_hdr.method != IB_MGMT_METHOD_GET_RESP)
+		return;
+
+	if (mad->mad_hdr.attr_id != IB_SMP_ATTR_NODE_DESC)
+		return;
+
+	mutex_lock(&to_mdev(dev)->cap_mask_mutex);
+	memcpy(((struct ib_smp *) mad)->data, dev->node_desc, 64);
+	mutex_unlock(&to_mdev(dev)->cap_mask_mutex);
+}
+
+/*
+ * Forward a locally generated trap MAD to the subnet manager.
+ *
+ * The MAD is copied into a freshly allocated send buffer and posted
+ * on the port's QP0 agent for LID-routed subnet-management MADs, or
+ * its QP1 agent for any other class, using the cached SM address
+ * handle.  The trap is silently dropped if the agent does not exist,
+ * the send buffer cannot be allocated, no SM address handle is cached,
+ * or posting fails.
+ */
+static void forward_trap(struct mthca_dev *dev,
+			 u8 port_num,
+			 struct ib_mad *mad)
+{
+	int qpn = mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED;
+	struct ib_mad_send_buf *send_buf;
+	struct ib_mad_agent *agent = dev->send_agent[port_num - 1][qpn];
+	int ret;
+	unsigned long flags;
+
+	if (!agent)
+		return;
+
+	send_buf = ib_create_send_mad(agent, qpn, 0, 0, IB_MGMT_MAD_HDR,
+				      IB_MGMT_MAD_DATA, GFP_ATOMIC);
+	/*
+	 * The allocation is GFP_ATOMIC and may fail; the result is then
+	 * an error pointer that must not be dereferenced.
+	 */
+	if (IS_ERR(send_buf))
+		return;
+
+	/*
+	 * We rely here on the fact that MLX QPs don't use the
+	 * address handle after the send is posted (this is
+	 * wrong following the IB spec strictly, but we know
+	 * it's OK for our devices).
+	 */
+	spin_lock_irqsave(&dev->sm_lock, flags);
+	memcpy(send_buf->mad, mad, sizeof *mad);
+	if ((send_buf->ah = dev->sm_ah[port_num - 1]))
+		ret = ib_post_send_mad(send_buf, NULL);
+	else
+		ret = -EINVAL;
+	spin_unlock_irqrestore(&dev->sm_lock, flags);
+
+	/* On success the buffer is freed later by send_handler(). */
+	if (ret)
+		ib_free_send_mad(send_buf);
+}
+
+/*
+ * Process an incoming MAD for port @port_num.
+ *
+ * Locally generated traps (method TRAP with source LID 0) are handed
+ * to forward_trap() and reported as consumed.  Subnet-management
+ * class MADs are accepted only for GET, SET and TRAP_REPRESS, and not
+ * for the SMInfo attribute or vendor-range attributes.  Performance
+ * management and the two MTHCA_VENDOR_CLASS* classes are accepted
+ * only for GET and SET.  Everything else returns
+ * IB_MAD_RESULT_SUCCESS without being processed.
+ *
+ * Accepted MADs are passed to mthca_MAD_IFC(), which fills @out_mad.
+ *
+ * Returns IB_MAD_RESULT_FAILURE if mthca_MAD_IFC() fails or reports a
+ * status other than MTHCA_CMD_STAT_BAD_PKT; IB_MAD_RESULT_SUCCESS
+ * alone for MTHCA_CMD_STAT_BAD_PKT; SUCCESS | CONSUMED for trap
+ * repress; and SUCCESS | REPLY otherwise.
+ */
+int mthca_process_mad(struct ib_device *ibdev,
+		      int mad_flags,
+		      u8 port_num,
+		      struct ib_wc *in_wc,
+		      struct ib_grh *in_grh,
+		      struct ib_mad *in_mad,
+		      struct ib_mad *out_mad)
+{
+	int err;
+	u8 status;
+	u16 slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE);
+	u16 prev_lid = 0;
+	struct ib_port_attr pattr;
+
+	/* Forward locally generated traps to the SM */
+	if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP &&
+	    slid == 0) {
+		forward_trap(to_mdev(ibdev), port_num, in_mad);
+		return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+	}
+
+	/*
+	 * Only handle SM gets, sets and trap represses for SM class
+	 *
+	 * Only handle PMA and Mellanox vendor-specific class gets and
+	 * sets for other classes.
+	 */
+	if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+	    in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+		if (in_mad->mad_hdr.method   != IB_MGMT_METHOD_GET &&
+		    in_mad->mad_hdr.method   != IB_MGMT_METHOD_SET &&
+		    in_mad->mad_hdr.method   != IB_MGMT_METHOD_TRAP_REPRESS)
+			return IB_MAD_RESULT_SUCCESS;
+
+		/*
+		 * Don't process SMInfo queries or vendor-specific
+		 * MADs -- the SMA can't handle them.
+		 */
+		if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO ||
+		    ((in_mad->mad_hdr.attr_id & IB_SMP_ATTR_VENDOR_MASK) ==
+		     IB_SMP_ATTR_VENDOR_MASK))
+			return IB_MAD_RESULT_SUCCESS;
+	} else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT ||
+		   in_mad->mad_hdr.mgmt_class == MTHCA_VENDOR_CLASS1     ||
+		   in_mad->mad_hdr.mgmt_class == MTHCA_VENDOR_CLASS2) {
+		if (in_mad->mad_hdr.method  != IB_MGMT_METHOD_GET &&
+		    in_mad->mad_hdr.method  != IB_MGMT_METHOD_SET)
+			return IB_MAD_RESULT_SUCCESS;
+	} else
+		return IB_MAD_RESULT_SUCCESS;
+	/*
+	 * For a PortInfo SET, remember the current LID so smp_snoop()
+	 * can detect a change afterwards.  prev_lid stays 0 if the
+	 * query fails.
+	 */
+	if ((in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+	     in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) &&
+	    in_mad->mad_hdr.method == IB_MGMT_METHOD_SET &&
+	    in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO &&
+	    !ib_query_port(ibdev, port_num, &pattr))
+		prev_lid = pattr.lid;
+
+	err = mthca_MAD_IFC(to_mdev(ibdev),
+			    mad_flags & IB_MAD_IGNORE_MKEY,
+			    mad_flags & IB_MAD_IGNORE_BKEY,
+			    port_num, in_wc, in_grh, in_mad, out_mad,
+			    &status);
+	if (err) {
+		mthca_err(to_mdev(ibdev), "MAD_IFC failed\n");
+		return IB_MAD_RESULT_FAILURE;
+	}
+	if (status == MTHCA_CMD_STAT_BAD_PKT)
+		return IB_MAD_RESULT_SUCCESS;
+	if (status) {
+		mthca_err(to_mdev(ibdev), "MAD_IFC returned status %02x\n",
+			  status);
+		return IB_MAD_RESULT_FAILURE;
+	}
+
+	/* Only snoop and patch responses whose MAD status is zero. */
+	if (!out_mad->mad_hdr.status) {
+		smp_snoop(ibdev, port_num, in_mad, prev_lid);
+		node_desc_override(ibdev, out_mad);
+	}
+
+	/* set return bit in status of directed route responses */
+	if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+		out_mad->mad_hdr.status |= cpu_to_be16(1 << 15);
+
+	if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS)
+		/* no response for trap repress */
+		return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+
+	return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+}
+
+/*
+ * Send-completion callback registered for every agent created in
+ * mthca_create_agents(): frees the send buffer of the completed MAD.
+ */
+static void send_handler(struct ib_mad_agent *agent,
+			 struct ib_mad_send_wc *mad_send_wc)
+{
+	ib_free_send_mad(mad_send_wc->send_buf);
+}
+
+/*
+ * Register an SMI (QP0) and a GSI (QP1) MAD send agent for every port
+ * and cache each port's rate.
+ *
+ * Returns 0 on success.  On failure every agent registered so far is
+ * unregistered, its slot in dev->send_agent is cleared, and the error
+ * from the failing call is returned.
+ *
+ * NOTE(review): the error path tests slots that may not have been
+ * assigned yet, so it assumes dev->send_agent starts out zeroed --
+ * confirm at the allocation site.
+ */
+int mthca_create_agents(struct mthca_dev *dev)
+{
+	struct ib_mad_agent *agent;
+	int p, q;
+	int ret;
+
+	spin_lock_init(&dev->sm_lock);
+
+	for (p = 0; p < dev->limits.num_ports; ++p)
+		for (q = 0; q <= 1; ++q) {
+			agent = ib_register_mad_agent(&dev->ib_dev, p + 1,
+						      q ? IB_QPT_GSI : IB_QPT_SMI,
+						      NULL, 0, send_handler,
+						      NULL, NULL);
+			if (IS_ERR(agent)) {
+				ret = PTR_ERR(agent);
+				goto err;
+			}
+			dev->send_agent[p][q] = agent;
+		}
+
+
+	for (p = 1; p <= dev->limits.num_ports; ++p) {
+		ret = mthca_update_rate(dev, p);
+		if (ret) {
+			mthca_err(dev, "Failed to obtain port %d rate."
+				  " aborting.\n", p);
+			goto err;
+		}
+	}
+
+	return 0;
+
+err:
+	for (p = 0; p < dev->limits.num_ports; ++p)
+		for (q = 0; q <= 1; ++q)
+			if (dev->send_agent[p][q]) {
+				ib_unregister_mad_agent(dev->send_agent[p][q]);
+				/*
+				 * Clear the slot, as mthca_free_agents()
+				 * does, so that no stale agent pointer is
+				 * left for update_sm_ah()/forward_trap().
+				 */
+				dev->send_agent[p][q] = NULL;
+			}
+
+	return ret;
+}
+
+/*
+ * Unregister both MAD send agents of every port, clearing each slot
+ * in dev->send_agent before the agent is unregistered, and destroy
+ * the port's cached SM address handle if there is one.
+ */
+void mthca_free_agents(struct mthca_dev *dev)
+{
+	struct ib_mad_agent **slot;
+	struct ib_mad_agent *agent;
+	int p, q;
+
+	for (p = 0; p < dev->limits.num_ports; ++p) {
+		for (q = 0; q <= 1; ++q) {
+			slot = &dev->send_agent[p][q];
+			agent = *slot;
+			*slot = NULL;
+			ib_unregister_mad_agent(agent);
+		}
+
+		if (dev->sm_ah[p])
+			ib_destroy_ah(dev->sm_ah[p]);
+	}
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_main.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_main.c
new file mode 100644
index 0000000..772cf8c
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_main.c
@@ -0,0 +1,1360 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+
+#include "mthca_dev.h"
+#include "mthca_config_reg.h"
+#include "mthca_cmd.h"
+#include "mthca_profile.h"
+#include "mthca_memfree.h"
+#include "mthca_wqe.h"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("Mellanox InfiniBand HCA low-level driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+/* Debug tracing level; only compiled in when CONFIG_INFINIBAND_MTHCA_DEBUG is defined. */
+#ifdef CONFIG_INFINIBAND_MTHCA_DEBUG
+
+int mthca_debug_level = 0;
+module_param_named(debug_level, mthca_debug_level, int, 0644);
+MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
+
+#endif /* CONFIG_INFINIBAND_MTHCA_DEBUG */
+/* msi_x defaults to 1 when CONFIG_PCI_MSI is defined; otherwise it is the constant 0. */
+#ifdef CONFIG_PCI_MSI
+
+static int msi_x = 1;
+module_param(msi_x, int, 0444);
+MODULE_PARM_DESC(msi_x, "attempt to use MSI-X if nonzero");
+
+#else /* CONFIG_PCI_MSI */
+
+#define msi_x (0)
+
+#endif /* CONFIG_PCI_MSI */
+/* Off by default; mthca_tune_pci() returns immediately unless this is nonzero. */
+static int tune_pci = 0;
+module_param(tune_pci, int, 0444);
+MODULE_PARM_DESC(tune_pci, "increase PCI burst from the default set by BIOS if nonzero");
+
+DEFINE_MUTEX(mthca_device_mutex);
+/* Built-in defaults used to seed hca_profile below. */
+#define MTHCA_DEFAULT_NUM_QP            (1 << 16)
+#define MTHCA_DEFAULT_RDB_PER_QP        (1 << 2)
+#define MTHCA_DEFAULT_NUM_CQ            (1 << 16)
+#define MTHCA_DEFAULT_NUM_MCG           (1 << 13)
+#define MTHCA_DEFAULT_NUM_MPT           (1 << 17)
+#define MTHCA_DEFAULT_NUM_MTT           (1 << 20)
+#define MTHCA_DEFAULT_NUM_UDAV          (1 << 15)
+#define MTHCA_DEFAULT_NUM_RESERVED_MTTS (1 << 18)
+#define MTHCA_DEFAULT_NUM_UARC_SIZE     (1 << 18)
+/* Module-wide profile; mthca_init_tavor()/mthca_init_arbel() copy it and adjust the chip-specific fields. */
+static struct mthca_profile hca_profile = {
+	.num_qp             = MTHCA_DEFAULT_NUM_QP,
+	.rdb_per_qp         = MTHCA_DEFAULT_RDB_PER_QP,
+	.num_cq             = MTHCA_DEFAULT_NUM_CQ,
+	.num_mcg            = MTHCA_DEFAULT_NUM_MCG,
+	.num_mpt            = MTHCA_DEFAULT_NUM_MPT,
+	.num_mtt            = MTHCA_DEFAULT_NUM_MTT,
+	.num_udav           = MTHCA_DEFAULT_NUM_UDAV,          /* Tavor only */
+	.fmr_reserved_mtts  = MTHCA_DEFAULT_NUM_RESERVED_MTTS, /* Tavor only */
+	.uarc_size          = MTHCA_DEFAULT_NUM_UARC_SIZE,     /* Arbel only */
+};
+/* The parameters below expose individual hca_profile fields with mode 0444. */
+module_param_named(num_qp, hca_profile.num_qp, int, 0444);
+MODULE_PARM_DESC(num_qp, "maximum number of QPs per HCA");
+
+module_param_named(rdb_per_qp, hca_profile.rdb_per_qp, int, 0444);
+MODULE_PARM_DESC(rdb_per_qp, "number of RDB buffers per QP");
+
+module_param_named(num_cq, hca_profile.num_cq, int, 0444);
+MODULE_PARM_DESC(num_cq, "maximum number of CQs per HCA");
+
+module_param_named(num_mcg, hca_profile.num_mcg, int, 0444);
+MODULE_PARM_DESC(num_mcg, "maximum number of multicast groups per HCA");
+
+module_param_named(num_mpt, hca_profile.num_mpt, int, 0444);
+MODULE_PARM_DESC(num_mpt,
+		"maximum number of memory protection table entries per HCA");
+
+module_param_named(num_mtt, hca_profile.num_mtt, int, 0444);
+MODULE_PARM_DESC(num_mtt,
+		 "maximum number of memory translation table segments per HCA");
+
+module_param_named(num_udav, hca_profile.num_udav, int, 0444);
+MODULE_PARM_DESC(num_udav, "maximum number of UD address vectors per HCA");
+
+module_param_named(fmr_reserved_mtts, hca_profile.fmr_reserved_mtts, int, 0444);
+MODULE_PARM_DESC(fmr_reserved_mtts,
+		 "number of memory translation table segments reserved for FMR");
+/* NOTE(review): no initializer, so this starts at 0 although the description says 1-5; presumably validated or defaulted elsewhere - confirm. */
+static int log_mtts_per_seg;
+module_param_named(log_mtts_per_seg, log_mtts_per_seg, int, 0444);
+MODULE_PARM_DESC(log_mtts_per_seg, "Log2 number of MTT entries per segment (1-5)");
+/* Banner text assembled from DRV_NAME, DRV_VERSION and DRV_RELDATE. */
+static char mthca_version[] __devinitdata =
+	DRV_NAME ": Mellanox InfiniBand HCA driver v"
+	DRV_VERSION " (" DRV_RELDATE ")\n";
+/* When tune_pci is nonzero, raise the PCI-X max read byte count and the PCI Express read request size; returns 0 or -ENODEV. */
+static int mthca_tune_pci(struct mthca_dev *mdev)
+{
+	if (tune_pci == 0)
+		return 0;
+
+	/* PCI-X: set the read byte count to the maximum the device reports. */
+	if (!pci_find_capability(mdev->pdev, PCI_CAP_ID_PCIX)) {
+		if (!(mdev->mthca_flags & MTHCA_FLAG_PCIE))
+			mthca_info(mdev, "No PCI-X capability, not setting RBC.\n");
+	} else if (pcix_set_mmrbc(mdev->pdev, pcix_get_max_mmrbc(mdev->pdev))) {
+		mthca_err(mdev, "Couldn't set PCI-X max read count, aborting.\n");
+		return -ENODEV;
+	}
+
+	/* PCI Express: ask for 4096-byte read requests; only complain if the HCA is flagged as PCIe. */
+	if (!pci_find_capability(mdev->pdev, PCI_CAP_ID_EXP)) {
+		if (mdev->mthca_flags & MTHCA_FLAG_PCIE)
+			mthca_info(mdev, "No PCI Express capability, "
+				   "not setting Max Read Request Size.\n");
+	} else if (pcie_set_readrq(mdev->pdev, 4096)) {
+		mthca_err(mdev, "Couldn't write PCI Express read request, "
+			"aborting.\n");
+		return -ENODEV;
+	}
+
+	return 0;
+}
+/* Run QUERY_DEV_LIM, reject devices this driver/host cannot handle, and copy the reported limits and capability flags into mdev; returns 0 or a negative errno. */
+static int mthca_dev_lim(struct mthca_dev *mdev, struct mthca_dev_lim *dev_lim)
+{
+	int err;
+	u8 status;
+	/* NOTE(review): the shift below assumes log_mtts_per_seg is already in a sane range - confirm it is validated before this runs. */
+	mdev->limits.mtt_seg_size = (1 << log_mtts_per_seg) * 8;
+	err = mthca_QUERY_DEV_LIM(mdev, dev_lim, &status);
+	if (err) {
+		mthca_err(mdev, "QUERY_DEV_LIM command failed, aborting.\n");
+		return err;
+	}
+	if (status) {
+		mthca_err(mdev, "QUERY_DEV_LIM returned status 0x%02x, "
+			  "aborting.\n", status);
+		return -EINVAL;
+	}
+	if (dev_lim->min_page_sz > PAGE_SIZE) {
+		mthca_err(mdev, "HCA minimum page size of %d bigger than "
+			  "kernel PAGE_SIZE of %d, aborting.\n",
+			  dev_lim->min_page_sz, PAGE_SIZE);
+		return -ENODEV;
+	}
+	if (dev_lim->num_ports > MTHCA_MAX_PORTS) {
+		mthca_err(mdev, "HCA has %d ports, but we only support %d, "
+			  "aborting.\n",
+			  dev_lim->num_ports, MTHCA_MAX_PORTS);
+		return -ENODEV;
+	}
+	/* The UAR region must fit inside PCI resource 2. */
+	if (dev_lim->uar_size > pci_resource_len(mdev->pdev, 2)) {
+		mthca_err(mdev, "HCA reported UAR size of 0x%x bigger than "
+			  "PCI resource 2 size of 0x%llx, aborting.\n",
+			  dev_lim->uar_size,
+			  (unsigned long long)pci_resource_len(mdev->pdev, 2));
+		return -ENODEV;
+	}
+	/* All checks passed: record the reported limits. */
+	mdev->limits.num_ports      	= dev_lim->num_ports;
+	mdev->limits.vl_cap             = dev_lim->max_vl;
+	mdev->limits.mtu_cap            = dev_lim->max_mtu;
+	mdev->limits.gid_table_len  	= dev_lim->max_gids;
+	mdev->limits.pkey_table_len 	= dev_lim->max_pkeys;
+	mdev->limits.local_ca_ack_delay = dev_lim->local_ca_ack_delay;
+	/*
+	 * Need to allow for worst case send WQE overhead and check
+	 * whether max_desc_sz imposes a lower limit than max_sg; UD
+	 * send has the biggest overhead.
+	 */
+	mdev->limits.max_sg		= min_t(int, dev_lim->max_sg,
+					      (dev_lim->max_desc_sz -
+					       sizeof (struct mthca_next_seg) -
+					       (mthca_is_memfree(mdev) ?
+						sizeof (struct mthca_arbel_ud_seg) :
+						sizeof (struct mthca_tavor_ud_seg))) /
+						sizeof (struct mthca_data_seg));
+	mdev->limits.max_wqes           = dev_lim->max_qp_sz;
+	mdev->limits.max_qp_init_rdma   = dev_lim->max_requester_per_qp;
+	mdev->limits.reserved_qps       = dev_lim->reserved_qps;
+	mdev->limits.max_srq_wqes       = dev_lim->max_srq_sz;
+	mdev->limits.reserved_srqs      = dev_lim->reserved_srqs;
+	mdev->limits.reserved_eecs      = dev_lim->reserved_eecs;
+	mdev->limits.max_desc_sz        = dev_lim->max_desc_sz;
+	mdev->limits.max_srq_sge	= mthca_max_srq_sge(mdev);
+	/*
+	 * Subtract 1 from the limit because we need to allocate a
+	 * spare CQE so the HCA HW can tell the difference between an
+	 * empty CQ and a full CQ.
+	 */
+	mdev->limits.max_cqes           = dev_lim->max_cq_sz - 1;
+	mdev->limits.reserved_cqs       = dev_lim->reserved_cqs;
+	mdev->limits.reserved_eqs       = dev_lim->reserved_eqs;
+	mdev->limits.reserved_mtts      = dev_lim->reserved_mtts;
+	mdev->limits.reserved_mrws      = dev_lim->reserved_mrws;
+	mdev->limits.reserved_uars      = dev_lim->reserved_uars;
+	mdev->limits.reserved_pds       = dev_lim->reserved_pds;
+	mdev->limits.port_width_cap     = dev_lim->max_port_width;
+	mdev->limits.page_size_cap      = ~(u32) (dev_lim->min_page_sz - 1);
+	mdev->limits.flags              = dev_lim->flags;
+	/*
+	 * For old FW that doesn't return static rate support, use a
+	 * value of 0x3 (only static rate values of 0 or 1 are handled),
+	 * except on Sinai, where even old FW can handle static rate
+	 * values of 2 and 3.
+	 */
+	if (dev_lim->stat_rate_support)
+		mdev->limits.stat_rate_support = dev_lim->stat_rate_support;
+	else if (mdev->mthca_flags & MTHCA_FLAG_SINAI_OPT)
+		mdev->limits.stat_rate_support = 0xf;
+	else
+		mdev->limits.stat_rate_support = 0x3;
+
+	/* IB_DEVICE_RESIZE_MAX_WR not supported by driver.
+	   May be doable since hardware supports it for SRQ.
+
+	   IB_DEVICE_N_NOTIFY_CQ is supported by hardware but not by driver.
+
+	   IB_DEVICE_SRQ_RESIZE is supported by hardware but SRQ is not
+	   supported by driver. */
+	mdev->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT |
+		IB_DEVICE_PORT_ACTIVE_EVENT |
+		IB_DEVICE_SYS_IMAGE_GUID |
+		IB_DEVICE_RC_RNR_NAK_GEN;
+	/* Translate the individual DEV_LIM flag bits into device capability / driver flags. */
+	if (dev_lim->flags & DEV_LIM_FLAG_BAD_PKEY_CNTR)
+		mdev->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
+
+	if (dev_lim->flags & DEV_LIM_FLAG_BAD_QKEY_CNTR)
+		mdev->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
+
+	if (dev_lim->flags & DEV_LIM_FLAG_RAW_MULTI)
+		mdev->device_cap_flags |= IB_DEVICE_RAW_MULTI;
+
+	if (dev_lim->flags & DEV_LIM_FLAG_AUTO_PATH_MIG)
+		mdev->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
+
+	if (dev_lim->flags & DEV_LIM_FLAG_UD_AV_PORT_ENFORCE)
+		mdev->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE;
+
+	if (dev_lim->flags & DEV_LIM_FLAG_SRQ)
+		mdev->mthca_flags |= MTHCA_FLAG_SRQ;
+	/* The IPoIB checksum capability is only advertised on memfree HCAs. */
+	if (mthca_is_memfree(mdev))
+		if (dev_lim->flags & DEV_LIM_FLAG_IPOIB_CSUM)
+			mdev->device_cap_flags |= IB_DEVICE_UD_IP_CSUM;
+
+	return 0;
+}
+/* Bring-up path taken when mthca_is_memfree() is false: SYS_EN, QUERY_FW, QUERY_DDR, QUERY_DEV_LIM, profile build, INIT_HCA; returns 0 or a negative errno. */
+static int mthca_init_tavor(struct mthca_dev *mdev)
+{
+	s64 size;
+	u8 status;
+	int err;
+	struct mthca_dev_lim        dev_lim;
+	struct mthca_profile        profile;
+	struct mthca_init_hca_param init_hca;
+	/* A SYS_EN failure returns directly; every later failure unwinds through err_disable (SYS_DIS). */
+	err = mthca_SYS_EN(mdev, &status);
+	if (err) {
+		mthca_err(mdev, "SYS_EN command failed, aborting.\n");
+		return err;
+	}
+	if (status) {
+		mthca_err(mdev, "SYS_EN returned status 0x%02x, "
+			  "aborting.\n", status);
+		return -EINVAL;
+	}
+
+	err = mthca_QUERY_FW(mdev, &status);
+	if (err) {
+		mthca_err(mdev, "QUERY_FW command failed, aborting.\n");
+		goto err_disable;
+	}
+	if (status) {
+		mthca_err(mdev, "QUERY_FW returned status 0x%02x, "
+			  "aborting.\n", status);
+		err = -EINVAL;
+		goto err_disable;
+	}
+	err = mthca_QUERY_DDR(mdev, &status);
+	if (err) {
+		mthca_err(mdev, "QUERY_DDR command failed, aborting.\n");
+		goto err_disable;
+	}
+	if (status) {
+		mthca_err(mdev, "QUERY_DDR returned status 0x%02x, "
+			  "aborting.\n", status);
+		err = -EINVAL;
+		goto err_disable;
+	}
+
+	err = mthca_dev_lim(mdev, &dev_lim);
+	if (err) {
+		mthca_err(mdev, "QUERY_DEV_LIM command failed, aborting.\n");
+		goto err_disable;
+	}
+	/* Start from the module-wide profile; uarc_size is forced to 0 on this path. */
+	profile = hca_profile;
+	profile.num_uar   = dev_lim.uar_size / PAGE_SIZE;
+	profile.uarc_size = 0;
+	if (mdev->mthca_flags & MTHCA_FLAG_SRQ)
+		profile.num_srq = dev_lim.max_srqs;
+
+	size = mthca_make_profile(mdev, &profile, &dev_lim, &init_hca);
+	if (size < 0) {	/* a negative return is treated as an errno */
+		err = size;
+		goto err_disable;
+	}
+
+	err = mthca_INIT_HCA(mdev, &init_hca, &status);
+	if (err) {
+		mthca_err(mdev, "INIT_HCA command failed, aborting.\n");
+		goto err_disable;
+	}
+	if (status) {
+		mthca_err(mdev, "INIT_HCA returned status 0x%02x, "
+			  "aborting.\n", status);
+		err = -EINVAL;
+		goto err_disable;
+	}
+
+	return 0;
+
+err_disable:
+	mthca_SYS_DIS(mdev, &status);
+
+	return err;
+}
+/* Allocate the firmware area (sized from fw.arbel.fw_pages), map it with MAP_FA and start the firmware with RUN_FW; returns 0 or a negative errno with the area freed. */
+static int mthca_load_fw(struct mthca_dev *mdev)
+{
+	u8 status;
+	int err;
+
+	/* FIXME: use HCA-attached memory for FW if present */
+
+	mdev->fw.arbel.fw_icm =
+		mthca_alloc_icm(mdev, mdev->fw.arbel.fw_pages,
+				GFP_HIGHUSER | __GFP_NOWARN, 0);
+	if (!mdev->fw.arbel.fw_icm) {
+		mthca_err(mdev, "Couldn't allocate FW area, aborting.\n");
+		return -ENOMEM;
+	}
+
+	err = mthca_MAP_FA(mdev, mdev->fw.arbel.fw_icm, &status);
+	if (err) {
+		mthca_err(mdev, "MAP_FA command failed, aborting.\n");
+		goto err_free;
+	}
+	if (status) {
+		mthca_err(mdev, "MAP_FA returned status 0x%02x, aborting.\n", status);
+		err = -EINVAL;
+		goto err_free;
+	}
+	err = mthca_RUN_FW(mdev, &status);
+	if (err) {
+		mthca_err(mdev, "RUN_FW command failed, aborting.\n");
+		goto err_unmap_fa;
+	}
+	if (status) {
+		mthca_err(mdev, "RUN_FW returned status 0x%02x, aborting.\n", status);
+		err = -EINVAL;
+		goto err_unmap_fa;
+	}
+
+	return 0;
+	/* Unwind: undo MAP_FA only if it succeeded, then free the area. */
+err_unmap_fa:
+	mthca_UNMAP_FA(mdev, &status);
+
+err_free:
+	mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0);
+	return err;
+}
+/* Set the ICM size, allocate and map the aux area, map EQ context, then allocate the MTT, MPT, QP, EQP, RDB, CQ, optional SRQ and MCG ICM tables; returns 0 or a negative errno after unwinding. */
+static int mthca_init_icm(struct mthca_dev *mdev,
+			  struct mthca_dev_lim *dev_lim,
+			  struct mthca_init_hca_param *init_hca,
+			  u64 icm_size)
+{
+	u64 aux_pages;
+	u8 status;
+	int err;
+	/* SET_ICM_SIZE fills in aux_pages, which sizes the aux allocation below. */
+	err = mthca_SET_ICM_SIZE(mdev, icm_size, &aux_pages, &status);
+	if (err) {
+		mthca_err(mdev, "SET_ICM_SIZE command failed, aborting.\n");
+		return err;
+	}
+	if (status) {
+		mthca_err(mdev, "SET_ICM_SIZE returned status 0x%02x, "
+			  "aborting.\n", status);
+		return -EINVAL;
+	}
+
+	mthca_dbg(mdev, "%lld KB of HCA context requires %lld KB aux memory.\n",
+		  (unsigned long long) icm_size >> 10,
+		  (unsigned long long) aux_pages << 2);
+
+	mdev->fw.arbel.aux_icm = mthca_alloc_icm(mdev, aux_pages,
+						 GFP_HIGHUSER | __GFP_NOWARN, 0);
+	if (!mdev->fw.arbel.aux_icm) {
+		mthca_err(mdev, "Couldn't allocate aux memory, aborting.\n");
+		return -ENOMEM;
+	}
+
+	err = mthca_MAP_ICM_AUX(mdev, mdev->fw.arbel.aux_icm, &status);
+	if (err) {
+		mthca_err(mdev, "MAP_ICM_AUX command failed, aborting.\n");
+		goto err_free_aux;
+	}
+	if (status) {
+		mthca_err(mdev, "MAP_ICM_AUX returned status 0x%02x, aborting.\n", status);
+		err = -EINVAL;
+		goto err_free_aux;
+	}
+
+	err = mthca_map_eq_icm(mdev, init_hca->eqc_base);
+	if (err) {
+		mthca_err(mdev, "Failed to map EQ context memory, aborting.\n");
+		goto err_unmap_aux;
+	}
+
+	/* CPU writes to non-reserved MTTs, while HCA might DMA to reserved mtts */
+	mdev->limits.reserved_mtts = ALIGN(mdev->limits.reserved_mtts * mdev->limits.mtt_seg_size,
+					   dma_get_cache_alignment()) / mdev->limits.mtt_seg_size;
+
+	mdev->mr_table.mtt_table = mthca_alloc_icm_table(mdev, init_hca->mtt_base,
+							 mdev->limits.mtt_seg_size,
+							 mdev->limits.num_mtt_segs,
+							 mdev->limits.reserved_mtts,
+							 1, 0);
+	if (!mdev->mr_table.mtt_table) {
+		mthca_err(mdev, "Failed to map MTT context memory, aborting.\n");
+		err = -ENOMEM;
+		goto err_unmap_eq;
+	}
+
+	mdev->mr_table.mpt_table = mthca_alloc_icm_table(mdev, init_hca->mpt_base,
+							 dev_lim->mpt_entry_sz,
+							 mdev->limits.num_mpts,
+							 mdev->limits.reserved_mrws,
+							 1, 1);
+	if (!mdev->mr_table.mpt_table) {
+		mthca_err(mdev, "Failed to map MPT context memory, aborting.\n");
+		err = -ENOMEM;
+		goto err_unmap_mtt;
+	}
+
+	mdev->qp_table.qp_table = mthca_alloc_icm_table(mdev, init_hca->qpc_base,
+							dev_lim->qpc_entry_sz,
+							mdev->limits.num_qps,
+							mdev->limits.reserved_qps,
+							0, 0);
+	if (!mdev->qp_table.qp_table) {
+		mthca_err(mdev, "Failed to map QP context memory, aborting.\n");
+		err = -ENOMEM;
+		goto err_unmap_mpt;
+	}
+
+	mdev->qp_table.eqp_table = mthca_alloc_icm_table(mdev, init_hca->eqpc_base,
+							 dev_lim->eqpc_entry_sz,
+							 mdev->limits.num_qps,
+							 mdev->limits.reserved_qps,
+							 0, 0);
+	if (!mdev->qp_table.eqp_table) {
+		mthca_err(mdev, "Failed to map EQP context memory, aborting.\n");
+		err = -ENOMEM;
+		goto err_unmap_qp;
+	}
+
+	mdev->qp_table.rdb_table = mthca_alloc_icm_table(mdev, init_hca->rdb_base,
+							 MTHCA_RDB_ENTRY_SIZE,
+							 mdev->limits.num_qps <<
+							 mdev->qp_table.rdb_shift, 0,
+							 0, 0);
+	if (!mdev->qp_table.rdb_table) {
+		mthca_err(mdev, "Failed to map RDB context memory, aborting\n");
+		err = -ENOMEM;
+		goto err_unmap_eqp;
+	}
+
+       mdev->cq_table.table = mthca_alloc_icm_table(mdev, init_hca->cqc_base,
+						    dev_lim->cqc_entry_sz,
+						    mdev->limits.num_cqs,
+						    mdev->limits.reserved_cqs,
+						    0, 0);
+	if (!mdev->cq_table.table) {
+		mthca_err(mdev, "Failed to map CQ context memory, aborting.\n");
+		err = -ENOMEM;
+		goto err_unmap_rdb;
+	}
+	/* The SRQ table exists only when the device reported SRQ support. */
+	if (mdev->mthca_flags & MTHCA_FLAG_SRQ) {
+		mdev->srq_table.table =
+			mthca_alloc_icm_table(mdev, init_hca->srqc_base,
+					      dev_lim->srq_entry_sz,
+					      mdev->limits.num_srqs,
+					      mdev->limits.reserved_srqs,
+					      0, 0);
+		if (!mdev->srq_table.table) {
+			mthca_err(mdev, "Failed to map SRQ context memory, "
+				  "aborting.\n");
+			err = -ENOMEM;
+			goto err_unmap_cq;
+		}
+	}
+
+	/*
+	 * It's not strictly required, but for simplicity just map the
+	 * whole multicast group table now.  The table isn't very big
+	 * and it's a lot easier than trying to track ref counts.
+	 */
+	mdev->mcg_table.table = mthca_alloc_icm_table(mdev, init_hca->mc_base,
+						      MTHCA_MGM_ENTRY_SIZE,
+						      mdev->limits.num_mgms +
+						      mdev->limits.num_amgms,
+						      mdev->limits.num_mgms +
+						      mdev->limits.num_amgms,
+						      0, 0);
+	if (!mdev->mcg_table.table) {
+		mthca_err(mdev, "Failed to map MCG context memory, aborting.\n");
+		err = -ENOMEM;
+		goto err_unmap_srq;
+	}
+
+	return 0;
+	/* Error unwinding: each label releases what was set up before the failing step, newest first. */
+err_unmap_srq:
+	if (mdev->mthca_flags & MTHCA_FLAG_SRQ)
+		mthca_free_icm_table(mdev, mdev->srq_table.table);
+
+err_unmap_cq:
+	mthca_free_icm_table(mdev, mdev->cq_table.table);
+
+err_unmap_rdb:
+	mthca_free_icm_table(mdev, mdev->qp_table.rdb_table);
+
+err_unmap_eqp:
+	mthca_free_icm_table(mdev, mdev->qp_table.eqp_table);
+
+err_unmap_qp:
+	mthca_free_icm_table(mdev, mdev->qp_table.qp_table);
+
+err_unmap_mpt:
+	mthca_free_icm_table(mdev, mdev->mr_table.mpt_table);
+
+err_unmap_mtt:
+	mthca_free_icm_table(mdev, mdev->mr_table.mtt_table);
+
+err_unmap_eq:
+	mthca_unmap_eq_icm(mdev);
+
+err_unmap_aux:
+	mthca_UNMAP_ICM_AUX(mdev, &status);
+
+err_free_aux:
+	mthca_free_icm(mdev, mdev->fw.arbel.aux_icm, 0);
+
+	return err;
+}
+/* Tear down everything mthca_init_icm() set up, in the reverse of its allocation order. */
+static void mthca_free_icms(struct mthca_dev *mdev)
+{
+	u8 status;
+
+	mthca_free_icm_table(mdev, mdev->mcg_table.table);
+	if (mdev->mthca_flags & MTHCA_FLAG_SRQ)	/* SRQ table was only allocated with this flag set */
+		mthca_free_icm_table(mdev, mdev->srq_table.table);
+	mthca_free_icm_table(mdev, mdev->cq_table.table);
+	mthca_free_icm_table(mdev, mdev->qp_table.rdb_table);
+	mthca_free_icm_table(mdev, mdev->qp_table.eqp_table);
+	mthca_free_icm_table(mdev, mdev->qp_table.qp_table);
+	mthca_free_icm_table(mdev, mdev->mr_table.mpt_table);
+	mthca_free_icm_table(mdev, mdev->mr_table.mtt_table);
+	mthca_unmap_eq_icm(mdev);
+	/* The aux area goes last; the returned status is not checked. */
+	mthca_UNMAP_ICM_AUX(mdev, &status);
+	mthca_free_icm(mdev, mdev->fw.arbel.aux_icm, 0);
+}
+/* Bring-up path taken when mthca_is_memfree() is true: QUERY_FW, ENABLE_LAM, load/run firmware, QUERY_DEV_LIM, profile build, ICM setup, INIT_HCA; returns 0 or a negative errno. */
+static int mthca_init_arbel(struct mthca_dev *mdev)
+{
+	struct mthca_dev_lim        dev_lim;
+	struct mthca_profile        profile;
+	struct mthca_init_hca_param init_hca;
+	s64 icm_size;
+	u8 status;
+	int err;
+
+	err = mthca_QUERY_FW(mdev, &status);
+	if (err) {
+		mthca_err(mdev, "QUERY_FW command failed, aborting.\n");
+		return err;
+	}
+	if (status) {
+		mthca_err(mdev, "QUERY_FW returned status 0x%02x, "
+			  "aborting.\n", status);
+		return -EINVAL;
+	}
+	/* A LAM_NOT_PRE status is not an error: it is recorded as MTHCA_FLAG_NO_LAM. */
+	err = mthca_ENABLE_LAM(mdev, &status);
+	if (err) {
+		mthca_err(mdev, "ENABLE_LAM command failed, aborting.\n");
+		return err;
+	}
+	if (status == MTHCA_CMD_STAT_LAM_NOT_PRE) {
+		mthca_dbg(mdev, "No HCA-attached memory (running in MemFree mode)\n");
+		mdev->mthca_flags |= MTHCA_FLAG_NO_LAM;
+	} else if (status) {
+		mthca_err(mdev, "ENABLE_LAM returned status 0x%02x, "
+			  "aborting.\n", status);
+		return -EINVAL;
+	}
+
+	err = mthca_load_fw(mdev);
+	if (err) {
+		mthca_err(mdev, "Failed to start FW, aborting.\n");
+		goto err_disable;
+	}
+
+	err = mthca_dev_lim(mdev, &dev_lim);
+	if (err) {
+		mthca_err(mdev, "QUERY_DEV_LIM command failed, aborting.\n");
+		goto err_stop_fw;
+	}
+	/* Start from the module-wide profile; num_udav is forced to 0 on this path. */
+	profile = hca_profile;
+	profile.num_uar  = dev_lim.uar_size / PAGE_SIZE;
+	profile.num_udav = 0;
+	if (mdev->mthca_flags & MTHCA_FLAG_SRQ)
+		profile.num_srq = dev_lim.max_srqs;
+
+	icm_size = mthca_make_profile(mdev, &profile, &dev_lim, &init_hca);
+	if (icm_size < 0) {	/* a negative return is treated as an errno */
+		err = icm_size;
+		goto err_stop_fw;
+	}
+
+	err = mthca_init_icm(mdev, &dev_lim, &init_hca, icm_size);
+	if (err)
+		goto err_stop_fw;
+
+	err = mthca_INIT_HCA(mdev, &init_hca, &status);
+	if (err) {
+		mthca_err(mdev, "INIT_HCA command failed, aborting.\n");
+		goto err_free_icm;
+	}
+	if (status) {
+		mthca_err(mdev, "INIT_HCA returned status 0x%02x, "
+			  "aborting.\n", status);
+		err = -EINVAL;
+		goto err_free_icm;
+	}
+
+	return 0;
+	/* Unwind in reverse order: ICM tables, firmware area, then LAM (only if it was enabled). */
+err_free_icm:
+	mthca_free_icms(mdev);
+
+err_stop_fw:
+	mthca_UNMAP_FA(mdev, &status);
+	mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0);
+
+err_disable:
+	if (!(mdev->mthca_flags & MTHCA_FLAG_NO_LAM))
+		mthca_DISABLE_LAM(mdev, &status);
+
+	return err;
+}
+/* Issue CLOSE_HCA, then undo the chip-specific bring-up: SYS_DIS for non-memfree HCAs, otherwise ICM, firmware area and (if enabled) LAM. */
+static void mthca_close_hca(struct mthca_dev *mdev)
+{
+	u8 status;
+
+	mthca_CLOSE_HCA(mdev, 0, &status);
+
+	if (!mthca_is_memfree(mdev)) {
+		mthca_SYS_DIS(mdev, &status);
+		return;
+	}
+
+	mthca_free_icms(mdev);
+	mthca_UNMAP_FA(mdev, &status);
+	mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0);
+	if (!(mdev->mthca_flags & MTHCA_FLAG_NO_LAM))
+		mthca_DISABLE_LAM(mdev, &status);
+}
+/* Run the chip-specific bring-up, then QUERY_ADAPTER and cache its results in mdev; returns 0 or a negative errno with the HCA closed again. */
+static int mthca_init_hca(struct mthca_dev *mdev)
+{
+	struct mthca_adapter info;
+	u8 fw_status;
+	int rc;
+
+	/* Chip-specific bring-up comes first. */
+	rc = mthca_is_memfree(mdev) ? mthca_init_arbel(mdev)
+				    : mthca_init_tavor(mdev);
+	if (rc)
+		return rc;
+
+	/* The firmware status is only examined when the command itself succeeded. */
+	rc = mthca_QUERY_ADAPTER(mdev, &info, &fw_status);
+	if (rc) {
+		mthca_err(mdev, "QUERY_ADAPTER command failed, aborting.\n");
+	} else if (fw_status) {
+		mthca_err(mdev, "QUERY_ADAPTER returned status 0x%02x, "
+			  "aborting.\n", fw_status);
+		rc = -EINVAL;
+	}
+
+	if (rc) {
+		/* The HCA was brought up above, so shut it down again. */
+		mthca_close_hca(mdev);
+		return rc;
+	}
+
+	/* Cache the adapter properties. */
+	/* rev_id is only taken from the adapter query on non-memfree HCAs. */
+	mdev->eq_table.inta_pin = info.inta_pin;
+	if (!mthca_is_memfree(mdev))
+		mdev->rev_id = info.revision_id;
+	memcpy(mdev->board_id, info.board_id, sizeof mdev->board_id);
+
+	return 0;
+}
+/* Set up the driver-side resources (UAR, KAR mapping, PD/MR/EQ tables, event-driven commands, CQ/SRQ/QP/AV/MCG tables); returns 0 or a negative errno with everything undone. */
+static int mthca_setup_hca(struct mthca_dev *dev)
+{
+	int err;
+	u8 status;
+
+	MTHCA_INIT_DOORBELL_LOCK(&dev->doorbell_lock);
+
+	err = mthca_init_uar_table(dev);
+	if (err) {
+		mthca_err(dev, "Failed to initialize "
+			  "user access region table, aborting.\n");
+		return err;
+	}
+
+	err = mthca_uar_alloc(dev, &dev->driver_uar);
+	if (err) {
+		mthca_err(dev, "Failed to allocate driver access region, "
+			  "aborting.\n");
+		goto err_uar_table_free;
+	}
+
+	dev->kar = ioremap(dev->driver_uar.pfn << PAGE_SHIFT, PAGE_SIZE);
+	if (!dev->kar) {
+		mthca_err(dev, "Couldn't map kernel access region, "
+			  "aborting.\n");
+		err = -ENOMEM;
+		goto err_uar_free;
+	}
+
+	err = mthca_init_pd_table(dev);
+	if (err) {
+		mthca_err(dev, "Failed to initialize "
+			  "protection domain table, aborting.\n");
+		goto err_kar_unmap;
+	}
+
+	err = mthca_init_mr_table(dev);
+	if (err) {
+		mthca_err(dev, "Failed to initialize "
+			  "memory region table, aborting.\n");
+		goto err_pd_table_free;
+	}
+
+	err = mthca_pd_alloc(dev, 1, &dev->driver_pd);
+	if (err) {
+		mthca_err(dev, "Failed to create driver PD, "
+			  "aborting.\n");
+		goto err_mr_table_free;
+	}
+
+	err = mthca_init_eq_table(dev);
+	if (err) {
+		mthca_err(dev, "Failed to initialize "
+			  "event queue table, aborting.\n");
+		goto err_pd_free;
+	}
+
+	err = mthca_cmd_use_events(dev);
+	if (err) {
+		mthca_err(dev, "Failed to switch to event-driven "
+			  "firmware commands, aborting.\n");
+		goto err_eq_table_free;
+	}
+	/* NOP is issued as an interrupt-delivery test now that commands are event-driven. */
+	err = mthca_NOP(dev, &status);
+	if (err || status) {
+		if (dev->mthca_flags & MTHCA_FLAG_MSI_X) {
+			mthca_warn(dev, "NOP command failed to generate interrupt "
+				   "(IRQ %d).\n",
+				   dev->eq_table.eq[MTHCA_EQ_CMD].msi_x_vector);
+			mthca_warn(dev, "Trying again with MSI-X disabled.\n");
+		} else {
+			mthca_err(dev, "NOP command failed to generate interrupt "
+				  "(IRQ %d), aborting.\n",
+				  dev->pdev->irq);
+			mthca_err(dev, "BIOS or ACPI interrupt routing problem?\n");
+		}
+		err = err ? err : -EINVAL;	/* a bad FW status with err == 0 must not be reported as success */
+		goto err_cmd_poll;
+	}
+
+	mthca_dbg(dev, "NOP command IRQ test passed\n");
+
+	err = mthca_init_cq_table(dev);
+	if (err) {
+		mthca_err(dev, "Failed to initialize "
+			  "completion queue table, aborting.\n");
+		goto err_cmd_poll;
+	}
+
+	err = mthca_init_srq_table(dev);
+	if (err) {
+		mthca_err(dev, "Failed to initialize "
+			  "shared receive queue table, aborting.\n");
+		goto err_cq_table_free;
+	}
+
+	err = mthca_init_qp_table(dev);
+	if (err) {
+		mthca_err(dev, "Failed to initialize "
+			  "queue pair table, aborting.\n");
+		goto err_srq_table_free;
+	}
+
+	err = mthca_init_av_table(dev);
+	if (err) {
+		mthca_err(dev, "Failed to initialize "
+			  "address vector table, aborting.\n");
+		goto err_qp_table_free;
+	}
+
+	err = mthca_init_mcg_table(dev);
+	if (err) {
+		mthca_err(dev, "Failed to initialize "
+			  "multicast group table, aborting.\n");
+		goto err_av_table_free;
+	}
+
+	return 0;
+	/* Error unwinding: each label undoes the steps completed before the failing one, newest first. */
+err_av_table_free:
+	mthca_cleanup_av_table(dev);
+
+err_qp_table_free:
+	mthca_cleanup_qp_table(dev);
+
+err_srq_table_free:
+	mthca_cleanup_srq_table(dev);
+
+err_cq_table_free:
+	mthca_cleanup_cq_table(dev);
+
+err_cmd_poll:
+	mthca_cmd_use_polling(dev);
+
+err_eq_table_free:
+	mthca_cleanup_eq_table(dev);
+
+err_pd_free:
+	mthca_pd_free(dev, &dev->driver_pd);
+
+err_mr_table_free:
+	mthca_cleanup_mr_table(dev);
+
+err_pd_table_free:
+	mthca_cleanup_pd_table(dev);
+
+err_kar_unmap:
+	iounmap(dev->kar);
+
+err_uar_free:
+	mthca_uar_free(dev, &dev->driver_uar);
+
+err_uar_table_free:
+	mthca_cleanup_uar_table(dev);
+	return err;
+}
+/* Try to enable three MSI-X vectors and record them in the EQ table; returns 0 on success, otherwise the nonzero pci_enable_msix() result. */
+static int mthca_enable_msi_x(struct mthca_dev *mdev)
+{
+	struct msix_entry vec[3];
+	int i, rc;
+
+	/* Ask for MSI-X table entries 0, 1 and 2. */
+	for (i = 0; i < (int) ARRAY_SIZE(vec); ++i)
+		vec[i].entry = i;
+
+	rc = pci_enable_msix(mdev->pdev, vec, ARRAY_SIZE(vec));
+	if (rc > 0)
+		mthca_info(mdev, "Only %d MSI-X vectors available, "
+			   "not using MSI-X\n", rc);
+	if (rc)
+		return rc;
+
+	/* Entry 0 -> MTHCA_EQ_COMP, entry 1 -> MTHCA_EQ_ASYNC, entry 2 -> MTHCA_EQ_CMD. */
+	mdev->eq_table.eq[MTHCA_EQ_COMP ].msi_x_vector = vec[0].vector;
+	mdev->eq_table.eq[MTHCA_EQ_ASYNC].msi_x_vector = vec[1].vector;
+	mdev->eq_table.eq[MTHCA_EQ_CMD  ].msi_x_vector = vec[2].vector;
+
+	return 0;
+}
+
+/* Types of supported HCA */
+enum {
+	TAVOR,			/* MT23108                        */
+	ARBEL_COMPAT,		/* MT25208 in Tavor compat mode   */
+	ARBEL_NATIVE,		/* MT25208 with extended features */
+	SINAI			/* MT25204 */
+};
+/* Pack a firmware version as major << 32 | minor << 16 | subminor, so versions compare as plain u64 values. */
+#define MTHCA_FW_VER(major, minor, subminor) \
+	(((u64) (major) << 32) | ((u64) (minor) << 16) | (u64) (subminor))
+/* Per-HCA-type data indexed by the enum above: the firmware version below which a warning is printed, and the initial mthca_flags. */
+static struct {
+	u64 latest_fw;
+	u32 flags;
+} mthca_hca_table[] = {
+	[TAVOR]        = { .latest_fw = MTHCA_FW_VER(3, 5, 0),
+			   .flags     = 0 },
+	[ARBEL_COMPAT] = { .latest_fw = MTHCA_FW_VER(4, 8, 200),
+			   .flags     = MTHCA_FLAG_PCIE },
+	[ARBEL_NATIVE] = { .latest_fw = MTHCA_FW_VER(5, 3, 0),
+			   .flags     = MTHCA_FLAG_MEMFREE |
+					MTHCA_FLAG_PCIE },
+	[SINAI]        = { .latest_fw = MTHCA_FW_VER(1, 2, 0),
+			   .flags     = MTHCA_FLAG_MEMFREE |
+					MTHCA_FLAG_PCIE    |
+					MTHCA_FLAG_SINAI_OPT }
+};
+/* Bring up one HCA: enable PCI, claim BARs, set DMA masks, reset, init commands and firmware, set up tables, register with the IB core, create MAD agents; returns 0 or a negative errno with everything undone. */
+static int __mthca_init_one(struct pci_dev *pdev, int hca_type)
+{
+	int ddr_hidden = 0;
+	int err;
+	struct mthca_dev *mdev;
+
+	printk(KERN_INFO PFX "Initializing %s\n",
+	       pci_name(pdev));
+
+	err = pci_enable_device(pdev);
+	if (err) {
+		dev_err(&pdev->dev, "Cannot enable PCI device, "
+			"aborting.\n");
+		return err;
+	}
+
+	/*
+	 * Check for BARs.  We expect 0: 1MB, 2: 8MB, 4: DDR (may not
+	 * be present)
+	 */
+	if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM) ||
+	    pci_resource_len(pdev, 0) != 1 << 20) {
+		dev_err(&pdev->dev, "Missing DCS, aborting.\n");
+		err = -ENODEV;
+		goto err_disable_pdev;
+	}
+	if (!(pci_resource_flags(pdev, 2) & IORESOURCE_MEM)) {
+		dev_err(&pdev->dev, "Missing UAR, aborting.\n");
+		err = -ENODEV;
+		goto err_disable_pdev;
+	}
+	if (!(pci_resource_flags(pdev, 4) & IORESOURCE_MEM))
+		ddr_hidden = 1;
+
+	err = pci_request_regions(pdev, DRV_NAME);
+	if (err) {
+		dev_err(&pdev->dev, "Cannot obtain PCI resources, "
+			"aborting.\n");
+		goto err_disable_pdev;
+	}
+
+	pci_set_master(pdev);
+	/* Prefer 64-bit DMA masks and fall back to 32-bit ones. */
+	err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (err) {
+		dev_warn(&pdev->dev, "Warning: couldn't set 64-bit PCI DMA mask.\n");
+		err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+		if (err) {
+			dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting.\n");
+			goto err_free_res;
+		}
+	}
+	err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (err) {
+		dev_warn(&pdev->dev, "Warning: couldn't set 64-bit "
+			 "consistent PCI DMA mask.\n");
+		err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
+		if (err) {
+			dev_err(&pdev->dev, "Can't set consistent PCI DMA mask, "
+				"aborting.\n");
+			goto err_free_res;
+		}
+	}
+
+	mdev = (struct mthca_dev *) ib_alloc_device(sizeof *mdev);
+	if (!mdev) {
+		dev_err(&pdev->dev, "Device struct alloc failed, "
+			"aborting.\n");
+		err = -ENOMEM;
+		goto err_free_res;
+	}
+
+	mdev->pdev = pdev;
+
+	mdev->mthca_flags = mthca_hca_table[hca_type].flags;
+	if (ddr_hidden)
+		mdev->mthca_flags |= MTHCA_FLAG_DDR_HIDDEN;
+
+	/*
+	 * Now reset the HCA before we touch the PCI capabilities or
+	 * attempt a firmware command, since a boot ROM may have left
+	 * the HCA in an undefined state.
+	 */
+	err = mthca_reset(mdev);
+	if (err) {
+		mthca_err(mdev, "Failed to reset HCA, aborting.\n");
+		goto err_free_dev;
+	}
+
+	err = mthca_cmd_init(mdev);	/* capture the result: err is 0 here after the successful reset */
+	if (err) {
+		mthca_err(mdev, "Failed to init command interface, aborting.\n");
+		goto err_free_dev;
+	}
+
+	err = mthca_tune_pci(mdev);
+	if (err)
+		goto err_cmd;
+
+	err = mthca_init_hca(mdev);
+	if (err)
+		goto err_cmd;
+
+	if (mdev->fw_ver < mthca_hca_table[hca_type].latest_fw) {
+		mthca_warn(mdev, "HCA FW version %d.%d.%03d is old (%d.%d.%03d is current).\n",
+			   (int) (mdev->fw_ver >> 32), (int) (mdev->fw_ver >> 16) & 0xffff,
+			   (int) (mdev->fw_ver & 0xffff),
+			   (int) (mthca_hca_table[hca_type].latest_fw >> 32),
+			   (int) (mthca_hca_table[hca_type].latest_fw >> 16) & 0xffff,
+			   (int) (mthca_hca_table[hca_type].latest_fw & 0xffff));
+		mthca_warn(mdev, "If you have problems, try updating your HCA FW.\n");
+	}
+
+	if (msi_x && !mthca_enable_msi_x(mdev))
+		mdev->mthca_flags |= MTHCA_FLAG_MSI_X;
+
+	err = mthca_setup_hca(mdev);
+	if (err == -EBUSY && (mdev->mthca_flags & MTHCA_FLAG_MSI_X)) {
+		pci_disable_msix(pdev);	/* -EBUSY with MSI-X on: retry once without MSI-X */
+		mdev->mthca_flags &= ~MTHCA_FLAG_MSI_X;
+
+		err = mthca_setup_hca(mdev);
+	}
+
+	if (err)
+		goto err_close;
+
+	err = mthca_register_device(mdev);
+	if (err)
+		goto err_cleanup;
+
+	err = mthca_create_agents(mdev);
+	if (err)
+		goto err_unregister;
+
+	pci_set_drvdata(pdev, mdev);
+	mdev->hca_type = hca_type;
+
+	mdev->active = 1;
+
+	return 0;
+
+err_unregister:
+	mthca_unregister_device(mdev);
+
+err_cleanup:
+	mthca_cleanup_mcg_table(mdev);
+	mthca_cleanup_av_table(mdev);
+	mthca_cleanup_qp_table(mdev);
+	mthca_cleanup_srq_table(mdev);
+	mthca_cleanup_cq_table(mdev);
+	mthca_cmd_use_polling(mdev);
+	mthca_cleanup_eq_table(mdev);
+	mthca_pd_free(mdev, &mdev->driver_pd);
+	mthca_cleanup_mr_table(mdev);
+	mthca_cleanup_pd_table(mdev);
+	iounmap(mdev->kar);	/* mapped by mthca_setup_hca(); also released in __mthca_remove_one() */
+	mthca_uar_free(mdev, &mdev->driver_uar);	/* allocated by mthca_setup_hca() */
+	mthca_cleanup_uar_table(mdev);
+
+err_close:
+	if (mdev->mthca_flags & MTHCA_FLAG_MSI_X)
+		pci_disable_msix(pdev);
+
+	mthca_close_hca(mdev);
+
+err_cmd:
+	mthca_cmd_cleanup(mdev);
+
+err_free_dev:
+	ib_dealloc_device(&mdev->ib_dev);
+
+err_free_res:
+	pci_release_regions(pdev);
+
+err_disable_pdev:
+	pci_disable_device(pdev);
+	pci_set_drvdata(pdev, NULL);
+	return err;
+}
+
+/*
+ * Tear down the mthca device attached to pdev, if there is one.
+ *
+ * The device pointer is fetched from the PCI driver data; when it is NULL
+ * nothing is done.  Otherwise the IB agents and device registration are
+ * dropped, every port is closed, the resource tables are cleaned up, the
+ * HCA and command interface are shut down and finally the PCI resources
+ * are released and the driver data pointer is cleared.
+ */
+static void __mthca_remove_one(struct pci_dev *pdev)
+{
+	struct mthca_dev *mdev = pci_get_drvdata(pdev);
+	u8 status;
+	int port;
+
+	if (!mdev)
+		return;
+
+	mthca_free_agents(mdev);
+	mthca_unregister_device(mdev);
+
+	/* Ports are numbered starting at 1. */
+	for (port = 1; port <= mdev->limits.num_ports; port++) {
+		mthca_CLOSE_IB(mdev, port, &status);
+	}
+
+	mthca_cleanup_mcg_table(mdev);
+	mthca_cleanup_av_table(mdev);
+	mthca_cleanup_qp_table(mdev);
+	mthca_cleanup_srq_table(mdev);
+	mthca_cleanup_cq_table(mdev);
+	mthca_cmd_use_polling(mdev);
+	mthca_cleanup_eq_table(mdev);
+
+	mthca_pd_free(mdev, &mdev->driver_pd);
+
+	mthca_cleanup_mr_table(mdev);
+	mthca_cleanup_pd_table(mdev);
+
+	iounmap(mdev->kar);
+	mthca_uar_free(mdev, &mdev->driver_uar);
+	mthca_cleanup_uar_table(mdev);
+	mthca_close_hca(mdev);
+	mthca_cmd_cleanup(mdev);
+
+	if (mdev->mthca_flags & MTHCA_FLAG_MSI_X) {
+		pci_disable_msix(pdev);
+	}
+
+	/* mdev must not be touched after it is handed back here. */
+	ib_dealloc_device(&mdev->ib_dev);
+	pci_release_regions(pdev);
+	pci_disable_device(pdev);
+	pci_set_drvdata(pdev, NULL);
+}
+
+/*
+ * Remove and re-initialize the mthca device attached to pdev.
+ *
+ * Returns -ENODEV when no device is attached, otherwise whatever
+ * __mthca_init_one() returns for the re-initialization.
+ */
+int __mthca_restart_one(struct pci_dev *pdev)
+{
+	struct mthca_dev *mdev = pci_get_drvdata(pdev);
+	int saved_hca_type;
+
+	if (mdev == NULL)
+		return -ENODEV;
+
+	/*
+	 * Save the HCA type first: __mthca_remove_one() passes the device
+	 * to ib_dealloc_device(), so mdev cannot be read afterwards.
+	 */
+	saved_hca_type = mdev->hca_type;
+
+	__mthca_remove_one(pdev);
+
+	return __mthca_init_one(pdev, saved_hca_type);
+}
+
+/*
+ * PCI .probe callback.
+ *
+ * Prints the driver version banner the first time it runs, checks that
+ * id->driver_data is a valid index into mthca_hca_table and then runs
+ * __mthca_init_one(), all while holding mthca_device_mutex.
+ *
+ * Returns -ENODEV for out-of-range driver data, otherwise the result of
+ * __mthca_init_one().
+ */
+static int __devinit mthca_init_one(struct pci_dev *pdev,
+				    const struct pci_device_id *id)
+{
+	static int mthca_version_printed = 0;
+	int ret;
+
+	mutex_lock(&mthca_device_mutex);
+
+	if (mthca_version_printed == 0) {
+		printk(KERN_INFO "%s", mthca_version);
+		mthca_version_printed++;
+	}
+
+	if (id->driver_data < ARRAY_SIZE(mthca_hca_table)) {
+		ret = __mthca_init_one(pdev, id->driver_data);
+	} else {
+		printk(KERN_ERR PFX "%s has invalid driver data %lx\n",
+		       pci_name(pdev), id->driver_data);
+		ret = -ENODEV;
+	}
+
+	mutex_unlock(&mthca_device_mutex);
+
+	return ret;
+}
+
+/*
+ * PCI .remove callback: runs __mthca_remove_one() for pdev while holding
+ * mthca_device_mutex, the same mutex mthca_init_one() holds around
+ * __mthca_init_one().
+ */
+static void __devexit mthca_remove_one(struct pci_dev *pdev)
+{
+	mutex_lock(&mthca_device_mutex);
+	__mthca_remove_one(pdev);
+	mutex_unlock(&mthca_device_mutex);
+}
+
+/*
+ * PCI IDs claimed by this driver.  Each device ID is listed twice, once
+ * under the Mellanox vendor ID and once under the Topspin vendor ID.
+ * driver_data holds the HCA type; mthca_init_one() range-checks it against
+ * mthca_hca_table before passing it to __mthca_init_one().  The table is
+ * terminated by an all-zero entry.
+ */
+static struct pci_device_id mthca_pci_table[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_TAVOR),
+	  .driver_data = TAVOR },
+	{ PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_TAVOR),
+	  .driver_data = TAVOR },
+	{ PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT),
+	  .driver_data = ARBEL_COMPAT },
+	{ PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT),
+	  .driver_data = ARBEL_COMPAT },
+	{ PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_ARBEL),
+	  .driver_data = ARBEL_NATIVE },
+	{ PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_ARBEL),
+	  .driver_data = ARBEL_NATIVE },
+	{ PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_SINAI),
+	  .driver_data = SINAI },
+	{ PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_SINAI),
+	  .driver_data = SINAI },
+	{ PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_SINAI_OLD),
+	  .driver_data = SINAI },
+	{ PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_SINAI_OLD),
+	  .driver_data = SINAI },
+	{ 0, }
+};
+
+MODULE_DEVICE_TABLE(pci, mthca_pci_table);
+
+/*
+ * PCI driver descriptor: binds the IDs in mthca_pci_table to the
+ * mthca_init_one()/mthca_remove_one() probe and remove callbacks.  It is
+ * registered in mthca_init() and unregistered in mthca_cleanup().
+ */
+static struct pci_driver mthca_driver = {
+	.name		= DRV_NAME,
+	.id_table	= mthca_pci_table,
+	.probe		= mthca_init_one,
+	.remove		= __devexit_p(mthca_remove_one)
+};
+
+/*
+ * Force one profile value to be a positive power of two.
+ *
+ * name:         parameter name, used only in the warning messages
+ * pval:         value to check; rewritten in place
+ * pval_default: value substituted when *pval is zero or negative
+ *
+ * A positive value is rounded up to the next power of two.  A warning pair
+ * is printed whenever the stored value ends up different from the input.
+ */
+static void __init __mthca_check_profile_val(const char *name, int *pval,
+					     int pval_default)
+{
+	const int requested = *pval;
+
+	if (requested > 0)
+		*pval = roundup_pow_of_two(requested);
+	else
+		*pval = pval_default;
+
+	/* Value was already acceptable: stay quiet. */
+	if (*pval == requested)
+		return;
+
+	printk(KERN_WARNING PFX "Invalid value %d for %s in module parameter.\n",
+	       requested, name);
+	printk(KERN_WARNING PFX "Corrected %s to %d.\n", name, *pval);
+}
+
+/*
+ * Convenience wrapper: validates the hca_profile field called `name`,
+ * passing the stringified field name for the warning messages and
+ * `default` as the fallback value.
+ */
+#define mthca_check_profile_val(name, default)				\
+	__mthca_check_profile_val(#name, &hca_profile.name, default)
+
+/*
+ * Sanitize the module-parameter driven settings in hca_profile and
+ * log_mtts_per_seg.  Called from mthca_init() before anything else.
+ * Out-of-range values are replaced and a warning is printed; nothing is
+ * reported back to the caller.
+ */
+static void __init mthca_validate_profile(void)
+{
+	/* Each of these must be a positive power of two. */
+	mthca_check_profile_val(num_qp,            MTHCA_DEFAULT_NUM_QP);
+	mthca_check_profile_val(rdb_per_qp,        MTHCA_DEFAULT_RDB_PER_QP);
+	mthca_check_profile_val(num_cq,            MTHCA_DEFAULT_NUM_CQ);
+	mthca_check_profile_val(num_mcg, 	   MTHCA_DEFAULT_NUM_MCG);
+	mthca_check_profile_val(num_mpt, 	   MTHCA_DEFAULT_NUM_MPT);
+	mthca_check_profile_val(num_mtt, 	   MTHCA_DEFAULT_NUM_MTT);
+	mthca_check_profile_val(num_udav,          MTHCA_DEFAULT_NUM_UDAV);
+	mthca_check_profile_val(fmr_reserved_mtts, MTHCA_DEFAULT_NUM_RESERVED_MTTS);
+
+	/* The FMR reservation must be strictly smaller than num_mtt; fall
+	 * back to half of num_mtt otherwise. */
+	if (hca_profile.fmr_reserved_mtts >= hca_profile.num_mtt) {
+		printk(KERN_WARNING PFX "Invalid fmr_reserved_mtts module parameter %d.\n",
+		       hca_profile.fmr_reserved_mtts);
+		printk(KERN_WARNING PFX "(Must be smaller than num_mtt %d)\n",
+		       hca_profile.num_mtt);
+		hca_profile.fmr_reserved_mtts = hca_profile.num_mtt / 2;
+		printk(KERN_WARNING PFX "Corrected fmr_reserved_mtts to %d.\n",
+		       hca_profile.fmr_reserved_mtts);
+	}
+	/* A value of 0 selects the default silently. */
+	if (log_mtts_per_seg == 0)
+		log_mtts_per_seg = ilog2(MTHCA_MTT_SEG_SIZE / 8);
+	/* Anything outside 1..5 is replaced by the default, with a warning. */
+	if ((log_mtts_per_seg < 1) || (log_mtts_per_seg > 5)) {
+		printk(KERN_WARNING PFX "bad log_mtts_per_seg (%d). Using default - %ld\n",
+		       log_mtts_per_seg, ilog2(MTHCA_MTT_SEG_SIZE / 8));
+		log_mtts_per_seg = ilog2(MTHCA_MTT_SEG_SIZE / 8);
+	}
+}
+
+/*
+ * Module entry point: validates the profile parameters, sets up the
+ * catastrophic-error support via mthca_catas_init() and registers the PCI
+ * driver.  If registration fails, mthca_catas_cleanup() undoes the earlier
+ * step.  Returns 0 on success or the failing call's error value.
+ */
+static int __init mthca_init(void)
+{
+	int ret;
+
+	mthca_validate_profile();
+
+	ret = mthca_catas_init();
+	if (ret != 0)
+		return ret;
+
+	ret = pci_register_driver(&mthca_driver);
+	if (ret >= 0)
+		return 0;
+
+	mthca_catas_cleanup();
+	return ret;
+}
+
+/*
+ * Module exit point: undoes mthca_init() in reverse order by unregistering
+ * the PCI driver and then calling mthca_catas_cleanup().
+ */
+static void __exit mthca_cleanup(void)
+{
+	pci_unregister_driver(&mthca_driver);
+	mthca_catas_cleanup();
+}
+
+module_init_order(mthca_init, SI_ORDER_MIDDLE);
+module_exit(mthca_cleanup);
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_mcg.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_mcg.c
new file mode 100644
index 0000000..d4c8105
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_mcg.c
@@ -0,0 +1,372 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/string.h>
+#include <linux/slab.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+/*
+ * In-memory image of one multicast group entry as read and written with
+ * mthca_READ_MGM()/mthca_WRITE_MGM() through a command mailbox.
+ */
+struct mthca_mgm {
+	/* big-endian; the code in this file stores the index of the next
+	 * entry in the hash chain shifted left by 6 (0 = end of chain) */
+	__be32 next_gid_index;
+	u32    reserved[3];
+	/* multicast GID of the group; all zero marks an unused entry */
+	u8     gid[16];
+	/* member QP numbers, big-endian; bit 31 marks a slot as in use */
+	__be32 qp[MTHCA_QP_PER_MGM];
+};
+
+static const u8 zero_gid[16];	/* automatically initialized to 0 */
+
+/*
+ * Look up the multicast group entry for gid.
+ *
+ * Caller must hold the MCG table mutex (dev->mcg_table.mutex).  gid and
+ * mgm parameters must be properly aligned for command interface.
+ *
+ * dev:         device to query
+ * gid:         16-byte multicast GID to look for
+ * mgm_mailbox: mailbox that receives the entry that was read last
+ * hash:        receives the hash bucket computed by the firmware
+ * prev, index: receive the chain position, as described below
+ *
+ * Returns 0 unless the lookup mailbox cannot be allocated or a firmware
+ * command error occurs.
+ *
+ * If GID is found in MGM or MGM is empty, *index = *hash, *prev = -1
+ * and *mgm holds MGM entry.
+ *
+ * If GID is found in AMGM, *index = index in AMGM, *prev = index of
+ * previous entry in hash chain and *mgm holds AMGM entry.
+ *
+ * If no AMGM exists for given gid, *index = -1, *prev = index of last
+ * entry in hash chain and *mgm holds end of hash chain.
+ */
+static int find_mgm(struct mthca_dev *dev,
+		    u8 *gid, struct mthca_mailbox *mgm_mailbox,
+		    u16 *hash, int *prev, int *index)
+{
+	struct mthca_mailbox *mailbox;
+	struct mthca_mgm *mgm = mgm_mailbox->buf;
+	u8 *mgid;
+	int err;
+	u8 status;
+
+	/*
+	 * Scratch mailbox used only to hand the GID to MGID_HASH.  Propagate
+	 * the allocator's own error code, as the callers in this file do for
+	 * their mailboxes, instead of assuming it is always -ENOMEM.
+	 */
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	mgid = mailbox->buf;
+
+	memcpy(mgid, gid, 16);
+
+	err = mthca_MGID_HASH(dev, mailbox, hash, &status);
+	if (err)
+		goto out;
+	if (status) {
+		mthca_err(dev, "MGID_HASH returned status %02x\n", status);
+		err = -EINVAL;
+		goto out;
+	}
+
+	/* Debug print kept disabled but still compiled. */
+	if (0)
+		mthca_dbg(dev, "Hash for %pI6 is %04x\n", gid, *hash);
+
+	/* Start at the hash bucket itself, then walk the chain. */
+	*index = *hash;
+	*prev  = -1;
+
+	do {
+		err = mthca_READ_MGM(dev, *index, mgm_mailbox, &status);
+		if (err)
+			goto out;
+		if (status) {
+			mthca_err(dev, "READ_MGM returned status %02x\n", status);
+			err = -EINVAL;
+			goto out;
+		}
+
+		/* A zero GID is only legal in the bucket head (empty MGM). */
+		if (!memcmp(mgm->gid, zero_gid, 16)) {
+			if (*index != *hash) {
+				mthca_err(dev, "Found zero MGID in AMGM.\n");
+				err = -EINVAL;
+			}
+			goto out;
+		}
+
+		if (!memcmp(mgm->gid, gid, 16))
+			goto out;
+
+		*prev = *index;
+		*index = be32_to_cpu(mgm->next_gid_index) >> 6;
+	} while (*index);
+
+	/* Reached the end of the chain without a match. */
+	*index = -1;
+
+ out:
+	mthca_free_mailbox(dev, mailbox);
+	return err;
+}
+
+/*
+ * Add the QP ibqp to the multicast group identified by gid.
+ *
+ * find_mgm() locates the group's entry.  When it reports an entry
+ * (index != -1) that entry is reused, and its GID is filled in if it is
+ * still all zero.  Otherwise a fresh entry is taken from
+ * dev->mcg_table.alloc, the QP is written into it, and the entry at prev
+ * is then rewritten so that its next_gid_index points at the new entry.
+ *
+ * lid is not used by this function.
+ *
+ * Returns 0 on success (including when the QP is already a member) or an
+ * error code.
+ */
+int mthca_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	struct mthca_dev *dev = to_mdev(ibqp->device);
+	struct mthca_mailbox *mailbox;
+	struct mthca_mgm *mgm;
+	u16 hash;
+	int index, prev;
+	int link = 0;	/* set when a new entry must be linked into the chain */
+	int i;
+	int err;
+	u8 status;
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	mgm = mailbox->buf;
+
+	mutex_lock(&dev->mcg_table.mutex);
+
+	err = find_mgm(dev, gid->raw, mailbox, &hash, &prev, &index);
+	if (err)
+		goto out;
+
+	if (index != -1) {
+		/* Existing entry; claim it if it is the empty bucket head. */
+		if (!memcmp(mgm->gid, zero_gid, 16))
+			memcpy(mgm->gid, gid->raw, 16);
+	} else {
+		link = 1;
+
+		index = mthca_alloc(&dev->mcg_table.alloc);
+		if (index == -1) {
+			mthca_err(dev, "No AMGM entries left\n");
+			err = -ENOMEM;
+			goto out;
+		}
+
+		err = mthca_READ_MGM(dev, index, mailbox, &status);
+		if (err)
+			goto out;
+		if (status) {
+			mthca_err(dev, "READ_MGM returned status %02x\n", status);
+			err = -EINVAL;
+			goto out;
+		}
+		/* Start the new entry from a clean slate. */
+		memset(mgm, 0, sizeof *mgm);
+		memcpy(mgm->gid, gid->raw, 16);
+	}
+
+	/* Find the QP if already present, else take the first free slot
+	 * (a slot is in use when bit 31 is set). */
+	for (i = 0; i < MTHCA_QP_PER_MGM; ++i)
+		if (mgm->qp[i] == cpu_to_be32(ibqp->qp_num | (1 << 31))) {
+			mthca_dbg(dev, "QP %06x already a member of MGM\n",
+				  ibqp->qp_num);
+			err = 0;
+			goto out;
+		} else if (!(mgm->qp[i] & cpu_to_be32(1 << 31))) {
+			mgm->qp[i] = cpu_to_be32(ibqp->qp_num | (1 << 31));
+			break;
+		}
+
+	if (i == MTHCA_QP_PER_MGM) {
+		mthca_err(dev, "MGM at index %x is full.\n", index);
+		err = -ENOMEM;
+		goto out;
+	}
+
+	err = mthca_WRITE_MGM(dev, index, mailbox, &status);
+	if (err)
+		goto out;
+	if (status) {
+		mthca_err(dev, "WRITE_MGM returned status %02x\n", status);
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (!link)
+		goto out;
+
+	/* Hook the new entry behind the last entry of the hash chain. */
+	err = mthca_READ_MGM(dev, prev, mailbox, &status);
+	if (err)
+		goto out;
+	if (status) {
+		mthca_err(dev, "READ_MGM returned status %02x\n", status);
+		err = -EINVAL;
+		goto out;
+	}
+
+	mgm->next_gid_index = cpu_to_be32(index << 6);
+
+	err = mthca_WRITE_MGM(dev, prev, mailbox, &status);
+	if (err)
+		goto out;
+	if (status) {
+		mthca_err(dev, "WRITE_MGM returned status %02x\n", status);
+		err = -EINVAL;
+	}
+
+ out:
+	/* Give back a freshly allocated entry if anything failed. */
+	if (err && link && index != -1) {
+		BUG_ON(index < dev->limits.num_mgms);
+		mthca_free(&dev->mcg_table.alloc, index);
+	}
+	mutex_unlock(&dev->mcg_table.mutex);
+
+	mthca_free_mailbox(dev, mailbox);
+	return err;
+}
+
+/*
+ * Remove the QP ibqp from the multicast group identified by gid.
+ *
+ * The QP's slot is filled with the last used slot of the entry and that
+ * last slot is cleared.  When the removed QP was the only member, the
+ * entry itself is retired: a bucket-head entry is overwritten with its
+ * successor in the chain (or gets its GID zeroed when there is none), and
+ * a chained entry is unlinked from its predecessor and returned to
+ * dev->mcg_table.alloc.
+ *
+ * lid is not used by this function.
+ *
+ * Returns 0 on success, -EINVAL when the group or the QP is not found, or
+ * another error code from the mailbox/firmware commands.
+ */
+int mthca_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	struct mthca_dev *dev = to_mdev(ibqp->device);
+	struct mthca_mailbox *mailbox;
+	struct mthca_mgm *mgm;
+	u16 hash;
+	int prev, index;
+	int i, loc;
+	int err;
+	u8 status;
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	mgm = mailbox->buf;
+
+	mutex_lock(&dev->mcg_table.mutex);
+
+	err = find_mgm(dev, gid->raw, mailbox, &hash, &prev, &index);
+	if (err)
+		goto out;
+
+	if (index == -1) {
+		mthca_err(dev, "MGID %pI6 not found\n", gid->raw);
+		err = -EINVAL;
+		goto out;
+	}
+
+	/* loc = slot holding this QP; i stops at the first unused slot
+	 * (bit 31 clear) or at MTHCA_QP_PER_MGM. */
+	for (loc = -1, i = 0; i < MTHCA_QP_PER_MGM; ++i) {
+		if (mgm->qp[i] == cpu_to_be32(ibqp->qp_num | (1 << 31)))
+			loc = i;
+		if (!(mgm->qp[i] & cpu_to_be32(1 << 31)))
+			break;
+	}
+
+	if (loc == -1) {
+		mthca_err(dev, "QP %06x not found in MGM\n", ibqp->qp_num);
+		err = -EINVAL;
+		goto out;
+	}
+
+	/* Move the last used slot into the hole and clear the last slot. */
+	mgm->qp[loc]   = mgm->qp[i - 1];
+	mgm->qp[i - 1] = 0;
+
+	err = mthca_WRITE_MGM(dev, index, mailbox, &status);
+	if (err)
+		goto out;
+	if (status) {
+		mthca_err(dev, "WRITE_MGM returned status %02x\n", status);
+		err = -EINVAL;
+		goto out;
+	}
+
+	/* Other members remain: the entry stays in place. */
+	if (i != 1)
+		goto out;
+
+	if (prev == -1) {
+		/* Remove entry from MGM */
+		int amgm_index_to_free = be32_to_cpu(mgm->next_gid_index) >> 6;
+		if (amgm_index_to_free) {
+			/* Pull the successor's contents into the mailbox so
+			 * they can be written over the bucket head. */
+			err = mthca_READ_MGM(dev, amgm_index_to_free,
+					     mailbox, &status);
+			if (err)
+				goto out;
+			if (status) {
+				mthca_err(dev, "READ_MGM returned status %02x\n",
+					  status);
+				err = -EINVAL;
+				goto out;
+			}
+		} else
+			memset(mgm->gid, 0, 16);
+
+		err = mthca_WRITE_MGM(dev, index, mailbox, &status);
+		if (err)
+			goto out;
+		if (status) {
+			mthca_err(dev, "WRITE_MGM returned status %02x\n", status);
+			err = -EINVAL;
+			goto out;
+		}
+		if (amgm_index_to_free) {
+			BUG_ON(amgm_index_to_free < dev->limits.num_mgms);
+			mthca_free(&dev->mcg_table.alloc, amgm_index_to_free);
+		}
+	} else {
+		/* Remove entry from AMGM */
+		int curr_next_index = be32_to_cpu(mgm->next_gid_index) >> 6;
+		err = mthca_READ_MGM(dev, prev, mailbox, &status);
+		if (err)
+			goto out;
+		if (status) {
+			mthca_err(dev, "READ_MGM returned status %02x\n", status);
+			err = -EINVAL;
+			goto out;
+		}
+
+		/* Make the predecessor skip over the entry being removed. */
+		mgm->next_gid_index = cpu_to_be32(curr_next_index << 6);
+
+		err = mthca_WRITE_MGM(dev, prev, mailbox, &status);
+		if (err)
+			goto out;
+		if (status) {
+			mthca_err(dev, "WRITE_MGM returned status %02x\n", status);
+			err = -EINVAL;
+			goto out;
+		}
+		BUG_ON(index < dev->limits.num_mgms);
+		mthca_free(&dev->mcg_table.alloc, index);
+	}
+
+ out:
+	mutex_unlock(&dev->mcg_table.mutex);
+
+	mthca_free_mailbox(dev, mailbox);
+	return err;
+}
+
+/*
+ * Set up the allocator and mutex of the multicast group table.
+ *
+ * The allocator spans num_mgms + num_amgms entries, with the first
+ * num_mgms entries passed as the reserved count.  Returns 0 on success or
+ * the error from mthca_alloc_init(); the mutex is only initialized on
+ * success.
+ */
+int mthca_init_mcg_table(struct mthca_dev *dev)
+{
+	int table_size = dev->limits.num_mgms + dev->limits.num_amgms;
+	int err;
+
+	err = mthca_alloc_init(&dev->mcg_table.alloc, table_size,
+			       table_size - 1, dev->limits.num_mgms);
+	if (!err)
+		mutex_init(&dev->mcg_table.mutex);
+
+	return err;
+}
+
+/*
+ * Release the multicast group table allocator set up by
+ * mthca_init_mcg_table().
+ */
+void mthca_cleanup_mcg_table(struct mthca_dev *dev)
+{
+	mthca_alloc_cleanup(&dev->mcg_table.alloc);
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_memfree.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_memfree.c
new file mode 100644
index 0000000..783da4b
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_memfree.c
@@ -0,0 +1,879 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mm.h>
+#include <linux/scatterlist.h>
+#include <linux/sched.h>
+
+#include <asm/page.h>
+
+#include "mthca_memfree.h"
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+/*
+ * We allocate in as big chunks as we can, up to a maximum of 256 KB
+ * per chunk.
+ *
+ * MTHCA_ICM_ALLOC_SIZE is the size mthca_alloc_icm() tries first for each
+ * allocation; MTHCA_TABLE_CHUNK_SIZE is the number of bytes of an ICM
+ * table covered by one table->icm[] slot.
+ */
+enum {
+	MTHCA_ICM_ALLOC_SIZE   = 1 << 18,
+	MTHCA_TABLE_CHUNK_SIZE = 1 << 18
+};
+
+/*
+ * Per-user-context table of doorbell pages.  page[] is a trailing array
+ * whose length is fixed when the table is allocated in
+ * mthca_init_user_db_tab().
+ */
+struct mthca_user_db_table {
+	struct mutex mutex;	/* held while page[] entries are updated */
+	struct {
+		/* user virtual address of the page; 0 while unused */
+		u64                uvirt;
+		/* single-entry scatterlist describing the mapped page */
+		struct scatterlist mem;
+		/* doorbell records currently in use on this page */
+		int                refcount;
+	}                page[0];
+};
+
+/*
+ * Release the pages of a non-coherent ICM chunk: undo the DMA mapping if
+ * the chunk was mapped (nsg > 0), then free every page allocation.
+ */
+static void mthca_free_icm_pages(struct mthca_dev *dev, struct mthca_icm_chunk *chunk)
+{
+	struct scatterlist *sg;
+	int n;
+
+	if (chunk->nsg > 0) {
+		pci_unmap_sg(dev->pdev, chunk->mem, chunk->npages,
+			     PCI_DMA_BIDIRECTIONAL);
+	}
+
+	for (n = 0; n < chunk->npages; n++) {
+		sg = &chunk->mem[n];
+		__free_pages(sg_page(sg), get_order(sg->length));
+	}
+}
+
+/*
+ * Release the buffers of a coherent ICM chunk, handing each one back to
+ * dma_free_coherent() with its length, kernel address and DMA address.
+ */
+static void mthca_free_icm_coherent(struct mthca_dev *dev, struct mthca_icm_chunk *chunk)
+{
+	struct scatterlist *sg;
+	int n;
+
+	for (n = 0; n < chunk->npages; n++) {
+		sg = &chunk->mem[n];
+		dma_free_coherent(&dev->pdev->dev, sg->length,
+				  lowmem_page_address(sg_page(sg)),
+				  sg_dma_address(sg));
+	}
+}
+
+/*
+ * Free an ICM allocation made by mthca_alloc_icm(): every chunk on the
+ * list is released with the helper matching `coherent`, then the chunk
+ * and finally the icm structure itself are freed.  A NULL icm is ignored.
+ */
+void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm, int coherent)
+{
+	struct mthca_icm_chunk *cur, *next;
+
+	if (icm == NULL)
+		return;
+
+	/* _safe variant: the current chunk is freed inside the loop. */
+	list_for_each_entry_safe(cur, next, &icm->chunk_list, list) {
+		if (!coherent)
+			mthca_free_icm_pages(dev, cur);
+		else
+			mthca_free_icm_coherent(dev, cur);
+
+		kfree(cur);
+	}
+
+	kfree(icm);
+}
+
+/*
+ * Allocate 2^order zeroed pages and record them in the scatterlist entry
+ * mem.  Returns 0 on success or -ENOMEM.
+ */
+static int mthca_alloc_icm_pages(struct scatterlist *mem, int order, gfp_t gfp_mask)
+{
+	/*
+	 * Use __GFP_ZERO because buggy firmware assumes ICM pages are
+	 * cleared, and subtle failures are seen if they aren't.
+	 */
+	struct page *pg = alloc_pages(gfp_mask | __GFP_ZERO, order);
+
+	if (pg == NULL)
+		return -ENOMEM;
+
+	sg_set_page(mem, pg, PAGE_SIZE << order, 0);
+
+	return 0;
+}
+
+/*
+ * Allocate a coherent DMA buffer of 2^order pages and record it in the
+ * scatterlist entry mem: the DMA address is written by
+ * dma_alloc_coherent(), the kernel buffer and DMA length are set here.
+ * Returns 0 on success or -ENOMEM.
+ */
+static int mthca_alloc_icm_coherent(struct device *dev, struct scatterlist *mem,
+				    int order, gfp_t gfp_mask)
+{
+	void *vaddr;
+
+	vaddr = dma_alloc_coherent(dev, PAGE_SIZE << order,
+				   &sg_dma_address(mem), gfp_mask);
+	if (vaddr == NULL)
+		return -ENOMEM;
+
+	sg_set_buf(mem, vaddr, PAGE_SIZE << order);
+	/* The buffer is expected to start on a page boundary. */
+	BUG_ON(mem->offset);
+	sg_dma_len(mem) = PAGE_SIZE << order;
+
+	return 0;
+}
+
+/*
+ * Allocate npages pages of ICM backing memory.
+ *
+ * The memory is gathered into a list of chunks, each holding up to
+ * MTHCA_ICM_CHUNK_LEN scatterlist entries.  Each entry is allocated at the
+ * largest order that still fits the remaining page count, starting from
+ * the order of MTHCA_ICM_ALLOC_SIZE; when an allocation fails the order is
+ * lowered and the allocation retried, giving up once order 0 has failed.
+ *
+ * For coherent allocations every entry is already DMA-addressable; for
+ * non-coherent ones a chunk is DMA-mapped with pci_map_sg() once it is
+ * full, and the last, partially filled chunk is mapped after the loop.
+ *
+ * Returns the new mthca_icm, or NULL on any failure (everything allocated
+ * so far is released through mthca_free_icm()).
+ */
+struct mthca_icm *mthca_alloc_icm(struct mthca_dev *dev, int npages,
+				  gfp_t gfp_mask, int coherent)
+{
+	struct mthca_icm *icm;
+	struct mthca_icm_chunk *chunk = NULL;
+	int cur_order;
+	int ret;
+
+	/* We use sg_set_buf for coherent allocs, which assumes low memory */
+	BUG_ON(coherent && (gfp_mask & __GFP_HIGHMEM));
+
+	/* Bookkeeping structures never come from highmem. */
+	icm = kmalloc(sizeof *icm, gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN));
+	if (!icm)
+		return icm;
+
+	icm->refcount = 0;
+	INIT_LIST_HEAD(&icm->chunk_list);
+
+	cur_order = get_order(MTHCA_ICM_ALLOC_SIZE);
+
+	while (npages > 0) {
+		/* Open a new chunk when the previous one has been filled. */
+		if (!chunk) {
+			chunk = kmalloc(sizeof *chunk,
+					gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN));
+			if (!chunk)
+				goto fail;
+
+			sg_init_table(chunk->mem, MTHCA_ICM_CHUNK_LEN);
+			chunk->npages = 0;
+			chunk->nsg    = 0;
+			list_add_tail(&chunk->list, &icm->chunk_list);
+		}
+
+		/* Never allocate more pages than are still needed. */
+		while (1 << cur_order > npages)
+			--cur_order;
+
+		if (coherent)
+			ret = mthca_alloc_icm_coherent(&dev->pdev->dev,
+						       &chunk->mem[chunk->npages],
+						       cur_order, gfp_mask);
+		else
+			ret = mthca_alloc_icm_pages(&chunk->mem[chunk->npages],
+						    cur_order, gfp_mask);
+
+		if (!ret) {
+			++chunk->npages;
+
+			if (coherent)
+				++chunk->nsg;
+			else if (chunk->npages == MTHCA_ICM_CHUNK_LEN) {
+				/* Chunk is full: map it for DMA now. */
+				chunk->nsg = pci_map_sg(dev->pdev, chunk->mem,
+							chunk->npages,
+							PCI_DMA_BIDIRECTIONAL);
+
+				if (chunk->nsg <= 0)
+					goto fail;
+			}
+
+			if (chunk->npages == MTHCA_ICM_CHUNK_LEN)
+				chunk = NULL;
+
+			npages -= 1 << cur_order;
+		} else {
+			/* Retry with a smaller allocation. */
+			--cur_order;
+			if (cur_order < 0)
+				goto fail;
+		}
+	}
+
+	/* Map the trailing, partially filled chunk. */
+	if (!coherent && chunk) {
+		chunk->nsg = pci_map_sg(dev->pdev, chunk->mem,
+					chunk->npages,
+					PCI_DMA_BIDIRECTIONAL);
+
+		if (chunk->nsg <= 0)
+			goto fail;
+	}
+
+	return icm;
+
+fail:
+	mthca_free_icm(dev, icm, coherent);
+	return NULL;
+}
+
+/*
+ * Take a reference on the ICM chunk that backs object obj of table,
+ * allocating and mapping the chunk first when it is not present yet.
+ *
+ * Returns 0 on success or -ENOMEM when the chunk cannot be allocated or
+ * the MAP_ICM firmware command fails.
+ */
+int mthca_table_get(struct mthca_dev *dev, struct mthca_icm_table *table, int obj)
+{
+	int i = (obj & (table->num_obj - 1)) * table->obj_size / MTHCA_TABLE_CHUNK_SIZE;
+	int ret = 0;
+	u8 status;
+
+	mutex_lock(&table->mutex);
+
+	if (!table->icm[i]) {
+		table->icm[i] = mthca_alloc_icm(dev, MTHCA_TABLE_CHUNK_SIZE >> PAGE_SHIFT,
+						(table->lowmem ? GFP_KERNEL : GFP_HIGHUSER) |
+						__GFP_NOWARN, table->coherent);
+		if (!table->icm[i]) {
+			ret = -ENOMEM;
+		} else if (mthca_MAP_ICM(dev, table->icm[i],
+					 table->virt + i * MTHCA_TABLE_CHUNK_SIZE,
+					 &status) || status) {
+			/* Mapping failed: drop the fresh allocation again. */
+			mthca_free_icm(dev, table->icm[i], table->coherent);
+			table->icm[i] = NULL;
+			ret = -ENOMEM;
+		}
+	}
+
+	/* Existing or newly mapped chunk: count this user. */
+	if (ret == 0)
+		++table->icm[i]->refcount;
+
+	mutex_unlock(&table->mutex);
+	return ret;
+}
+
+/*
+ * Drop one reference on the ICM chunk that backs object obj of table.
+ * When the last reference goes away the chunk is unmapped from the device
+ * and freed.  Does nothing on devices that are not mem-free.
+ */
+void mthca_table_put(struct mthca_dev *dev, struct mthca_icm_table *table, int obj)
+{
+	int slot;
+	u8 status;
+
+	if (!mthca_is_memfree(dev))
+		return;
+
+	slot = (obj & (table->num_obj - 1)) * table->obj_size / MTHCA_TABLE_CHUNK_SIZE;
+
+	mutex_lock(&table->mutex);
+
+	table->icm[slot]->refcount -= 1;
+	if (table->icm[slot]->refcount == 0) {
+		mthca_UNMAP_ICM(dev, table->virt + slot * MTHCA_TABLE_CHUNK_SIZE,
+				MTHCA_TABLE_CHUNK_SIZE / MTHCA_ICM_PAGE_SIZE,
+				&status);
+		mthca_free_icm(dev, table->icm[slot], table->coherent);
+		table->icm[slot] = NULL;
+	}
+
+	mutex_unlock(&table->mutex);
+}
+
+/*
+ * Return the kernel virtual address of object obj inside table and, when
+ * dma_handle is not NULL, store the object's DMA address there.
+ *
+ * Returns NULL when the table is not in low memory, when the chunk that
+ * would hold the object is not allocated, or when the object's offset lies
+ * beyond the memory recorded for the chunk.
+ */
+void *mthca_table_find(struct mthca_icm_table *table, int obj, dma_addr_t *dma_handle)
+{
+	int idx, offset, dma_offset, i;
+	struct mthca_icm_chunk *chunk;
+	struct mthca_icm *icm;
+	struct page *page = NULL;
+
+	if (!table->lowmem)
+		return NULL;
+
+	mutex_lock(&table->mutex);
+
+	/* Byte offset of the object in the table, then split into the
+	 * table chunk and the offset inside that chunk. */
+	idx = (obj & (table->num_obj - 1)) * table->obj_size;
+	icm = table->icm[idx / MTHCA_TABLE_CHUNK_SIZE];
+	dma_offset = offset = idx % MTHCA_TABLE_CHUNK_SIZE;
+
+	if (!icm)
+		goto out;
+
+	/* Walk the scatterlist entries, consuming offset (by CPU length)
+	 * and dma_offset (by DMA length) independently. */
+	list_for_each_entry(chunk, &icm->chunk_list, list) {
+		for (i = 0; i < chunk->npages; ++i) {
+			if (dma_handle && dma_offset >= 0) {
+				if (sg_dma_len(&chunk->mem[i]) > dma_offset)
+					*dma_handle = sg_dma_address(&chunk->mem[i]) +
+						dma_offset;
+				dma_offset -= sg_dma_len(&chunk->mem[i]);
+			}
+			/* DMA mapping can merge pages but not split them,
+			 * so if we found the page, dma_handle has already
+			 * been assigned to. */
+			if (chunk->mem[i].length > offset) {
+				page = sg_page(&chunk->mem[i]);
+				goto out;
+			}
+			offset -= chunk->mem[i].length;
+		}
+	}
+
+out:
+	mutex_unlock(&table->mutex);
+	return page ? lowmem_page_address(page) + offset : NULL;
+}
+
+/*
+ * Take references on the ICM chunks backing objects start..end of table,
+ * stepping one table chunk's worth of objects at a time.
+ *
+ * On failure every reference taken so far is dropped again and the error
+ * from mthca_table_get() is returned; on success returns 0.
+ */
+int mthca_table_get_range(struct mthca_dev *dev, struct mthca_icm_table *table,
+			  int start, int end)
+{
+	int inc = MTHCA_TABLE_CHUNK_SIZE / table->obj_size;
+	int i, err;
+
+	for (i = start; i <= end; i += inc) {
+		err = mthca_table_get(dev, table, i);
+		if (!err)
+			continue;
+
+		/* Unwind the chunks acquired before the failing one. */
+		while (i > start) {
+			i -= inc;
+			mthca_table_put(dev, table, i);
+		}
+
+		return err;
+	}
+
+	return 0;
+}
+
+/*
+ * Drop the references taken by mthca_table_get_range() for objects
+ * start..end of table.  Does nothing on devices that are not mem-free.
+ */
+void mthca_table_put_range(struct mthca_dev *dev, struct mthca_icm_table *table,
+			   int start, int end)
+{
+	int obj;
+
+	if (!mthca_is_memfree(dev))
+		return;
+
+	obj = start;
+	while (obj <= end) {
+		mthca_table_put(dev, table, obj);
+		/* advance by the number of objects held in one table chunk */
+		obj += MTHCA_TABLE_CHUNK_SIZE / table->obj_size;
+	}
+}
+
+/*
+ * Create an ICM table description and pre-populate its reserved part.
+ *
+ * virt:         device virtual address at which the table starts
+ * obj_size:     size of one object in bytes
+ * nobj:         number of objects in the table
+ * reserved:     number of leading objects whose backing chunks are
+ *               allocated and mapped right away
+ * use_lowmem:   allocate with GFP_KERNEL instead of GFP_HIGHUSER
+ * use_coherent: back the table with coherent DMA memory
+ *
+ * Returns the new table, or NULL on allocation or MAP_ICM failure.
+ */
+struct mthca_icm_table *mthca_alloc_icm_table(struct mthca_dev *dev,
+					      u64 virt, int obj_size,
+					      int nobj, int reserved,
+					      int use_lowmem, int use_coherent)
+{
+	struct mthca_icm_table *table;
+	int obj_per_chunk;
+	int num_icm;
+	unsigned chunk_size;
+	int i;
+	u8 status;
+
+	/* One icm[] slot per MTHCA_TABLE_CHUNK_SIZE bytes of table. */
+	obj_per_chunk = MTHCA_TABLE_CHUNK_SIZE / obj_size;
+	num_icm = DIV_ROUND_UP(nobj, obj_per_chunk);
+
+	table = kmalloc(sizeof *table + num_icm * sizeof *table->icm, GFP_KERNEL);
+	if (!table)
+		return NULL;
+
+	table->virt     = virt;
+	table->num_icm  = num_icm;
+	table->num_obj  = nobj;
+	table->obj_size = obj_size;
+	table->lowmem   = use_lowmem;
+	table->coherent = use_coherent;
+	mutex_init(&table->mutex);
+
+	for (i = 0; i < num_icm; ++i)
+		table->icm[i] = NULL;
+
+	/* Allocate and map every chunk that overlaps the reserved objects. */
+	for (i = 0; i * MTHCA_TABLE_CHUNK_SIZE < reserved * obj_size; ++i) {
+		chunk_size = MTHCA_TABLE_CHUNK_SIZE;
+		/* The last chunk may be cut short by the end of the table. */
+		if ((i + 1) * MTHCA_TABLE_CHUNK_SIZE > nobj * obj_size)
+			chunk_size = nobj * obj_size - i * MTHCA_TABLE_CHUNK_SIZE;
+
+		table->icm[i] = mthca_alloc_icm(dev, chunk_size >> PAGE_SHIFT,
+						(use_lowmem ? GFP_KERNEL : GFP_HIGHUSER) |
+						__GFP_NOWARN, use_coherent);
+		if (!table->icm[i])
+			goto err;
+		if (mthca_MAP_ICM(dev, table->icm[i], virt + i * MTHCA_TABLE_CHUNK_SIZE,
+				  &status) || status) {
+			mthca_free_icm(dev, table->icm[i], table->coherent);
+			table->icm[i] = NULL;
+			goto err;
+		}
+
+		/*
+		 * Add a reference to this ICM chunk so that it never
+		 * gets freed (since it contains reserved firmware objects).
+		 */
+		++table->icm[i]->refcount;
+	}
+
+	return table;
+
+err:
+	/* Unmap and free whatever was set up before the failure. */
+	for (i = 0; i < num_icm; ++i)
+		if (table->icm[i]) {
+			mthca_UNMAP_ICM(dev, virt + i * MTHCA_TABLE_CHUNK_SIZE,
+					MTHCA_TABLE_CHUNK_SIZE / MTHCA_ICM_PAGE_SIZE,
+					&status);
+			mthca_free_icm(dev, table->icm[i], table->coherent);
+		}
+
+	kfree(table);
+
+	return NULL;
+}
+
+/*
+ * Destroy an ICM table: every chunk that is still present is unmapped
+ * from the device and freed, then the table structure itself is freed.
+ */
+void mthca_free_icm_table(struct mthca_dev *dev, struct mthca_icm_table *table)
+{
+	int slot;
+	u8 status;
+
+	for (slot = 0; slot < table->num_icm; slot++) {
+		if (!table->icm[slot])
+			continue;
+
+		mthca_UNMAP_ICM(dev, table->virt + slot * MTHCA_TABLE_CHUNK_SIZE,
+				MTHCA_TABLE_CHUNK_SIZE / MTHCA_ICM_PAGE_SIZE,
+				&status);
+		mthca_free_icm(dev, table->icm[slot], table->coherent);
+	}
+
+	kfree(table);
+}
+
+/*
+ * Compute the device virtual address of page `page` in the UAR context
+ * area belonging to uar: the area base, plus one uarc_size region per UAR
+ * index, plus the page offset in units of MTHCA_ICM_PAGE_SIZE.
+ */
+static u64 mthca_uarc_virt(struct mthca_dev *dev, struct mthca_uar *uar, int page)
+{
+	return dev->uar_table.uarc_base +
+		uar->index * dev->uar_table.uarc_size +
+		page * MTHCA_ICM_PAGE_SIZE;
+}
+
+#include <vm/vm_map.h>
+#include <vm/vm_pageout.h>
+#include <vm/pmap.h>
+
+#include <sys/resource.h>
+#include <sys/resourcevar.h>
+
+/*
+ * Map the user page at uaddr as the doorbell page that holds doorbell
+ * record `index` of the user context described by uar/db_tab.
+ *
+ * dev:    device; on non-mem-free devices this is a no-op returning 0
+ * uar:    UAR whose context area receives the mapping
+ * db_tab: per-context doorbell page table
+ * index:  doorbell record index supplied for the user context
+ * uaddr:  user virtual address of the page; must be 4 KB aligned
+ *
+ * The first record mapped on a page pins (Linux) or wires (FreeBSD) the
+ * user page, DMA-maps it and maps it into the device's ICM; later records
+ * on the same page only bump the page's reference count.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar,
+		      struct mthca_user_db_table *db_tab, int index, u64 uaddr)
+{
+#ifdef __linux__
+	struct page *pages[1];
+	int ret = 0;
+	u8 status;
+	int i;
+
+	if (!mthca_is_memfree(dev))
+		return 0;
+
+	/*
+	 * Valid record indices are 0 .. uarc_size / 8 - 1.  The bound must
+	 * be exclusive: db_tab->page[] only has
+	 * uarc_size / MTHCA_ICM_PAGE_SIZE entries (see
+	 * mthca_init_user_db_tab()), so index == uarc_size / 8 would select
+	 * a page slot one past the end of the table.
+	 */
+	if (index < 0 || index >= dev->uar_table.uarc_size / 8)
+		return -EINVAL;
+
+	mutex_lock(&db_tab->mutex);
+
+	i = index / MTHCA_DB_REC_PER_PAGE;
+
+	/* Reject a full page, a page already bound to another address,
+	 * and misaligned addresses. */
+	if ((db_tab->page[i].refcount >= MTHCA_DB_REC_PER_PAGE)       ||
+	    (db_tab->page[i].uvirt && db_tab->page[i].uvirt != uaddr) ||
+	    (uaddr & 4095)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* Page already mapped: just count the additional record. */
+	if (db_tab->page[i].refcount) {
+		++db_tab->page[i].refcount;
+		goto out;
+	}
+
+	ret = get_user_pages(current, current->mm, uaddr & PAGE_MASK, 1, 1, 0,
+			     pages, NULL);
+	if (ret < 0)
+		goto out;
+
+	sg_set_page(&db_tab->page[i].mem, pages[0], MTHCA_ICM_PAGE_SIZE,
+			uaddr & ~PAGE_MASK);
+
+	ret = pci_map_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE);
+	if (ret < 0) {
+		put_page(pages[0]);
+		goto out;
+	}
+
+	ret = mthca_MAP_ICM_page(dev, sg_dma_address(&db_tab->page[i].mem),
+				 mthca_uarc_virt(dev, uar, i), &status);
+	if (!ret && status)
+		ret = -EINVAL;
+	if (ret) {
+		pci_unmap_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE);
+		put_page(sg_page(&db_tab->page[i].mem));
+		goto out;
+	}
+
+	db_tab->page[i].uvirt    = uaddr;
+	db_tab->page[i].refcount = 1;
+
+out:
+	mutex_unlock(&db_tab->mutex);
+	return ret;
+#else
+	struct proc *proc;
+	vm_offset_t start;
+	vm_paddr_t paddr;
+	pmap_t pmap;
+	vm_page_t m;
+	int ret = 0;
+	u8 status;
+	int i;
+
+	if (!mthca_is_memfree(dev))
+		return 0;
+
+	/*
+	 * Valid record indices are 0 .. uarc_size / 8 - 1.  The bound must
+	 * be exclusive: db_tab->page[] only has
+	 * uarc_size / MTHCA_ICM_PAGE_SIZE entries (see
+	 * mthca_init_user_db_tab()), so index == uarc_size / 8 would select
+	 * a page slot one past the end of the table.
+	 */
+	if (index < 0 || index >= dev->uar_table.uarc_size / 8)
+		return -EINVAL;
+
+	mutex_lock(&db_tab->mutex);
+
+	i = index / MTHCA_DB_REC_PER_PAGE;
+	/* start != 0 later tells the exit path that the page was wired. */
+	start = 0;
+
+	/* Reject a full page, a page already bound to another address,
+	 * and misaligned addresses. */
+	if ((db_tab->page[i].refcount >= MTHCA_DB_REC_PER_PAGE)       ||
+	    (db_tab->page[i].uvirt && db_tab->page[i].uvirt != uaddr) ||
+	    (uaddr & 4095)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* Page already mapped: just count the additional record. */
+	if (db_tab->page[i].refcount) {
+		++db_tab->page[i].refcount;
+		goto out;
+	}
+
+	/* Honour the per-process and global wired-memory limits. */
+	proc = curproc;
+	pmap = vm_map_pmap(&proc->p_vmspace->vm_map);
+	PROC_LOCK(proc);
+	if (ptoa(pmap_wired_count(pmap) + 1) > lim_cur(proc, RLIMIT_MEMLOCK)) {
+		PROC_UNLOCK(proc);
+		ret = -ENOMEM;
+		goto out;
+	}
+	PROC_UNLOCK(proc);
+	if (cnt.v_wire_count + 1 > vm_page_max_wired) {
+		ret = -EAGAIN;
+		goto out;
+	}
+	start = uaddr & PAGE_MASK;
+	ret = vm_map_wire(&proc->p_vmspace->vm_map, start, start + PAGE_SIZE,
+	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES | VM_MAP_WIRE_WRITE);
+        if (ret != KERN_SUCCESS) {
+		/* Nothing was wired, so nothing to unwire on exit. */
+		start = 0;
+		ret = -ENOMEM;
+		goto out;
+	}
+	paddr = pmap_extract(pmap, uaddr);
+	if (paddr == 0) {
+		ret = -EFAULT;
+		goto out;
+	}
+	m = PHYS_TO_VM_PAGE(paddr);
+
+	sg_set_page(&db_tab->page[i].mem, m, MTHCA_ICM_PAGE_SIZE,
+			uaddr & ~PAGE_MASK);
+
+	ret = pci_map_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE);
+	if (ret < 0)
+		goto out;
+
+	ret = mthca_MAP_ICM_page(dev, sg_dma_address(&db_tab->page[i].mem),
+				 mthca_uarc_virt(dev, uar, i), &status);
+	if (!ret && status)
+		ret = -EINVAL;
+	if (ret) {
+		pci_unmap_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE);
+		goto out;
+	}
+
+	db_tab->page[i].uvirt    = uaddr;
+	db_tab->page[i].refcount = 1;
+
+out:
+	/* Undo the wiring when a later step failed. */
+	if (ret < 0 && start)
+		vm_map_unwire(&curthread->td_proc->p_vmspace->vm_map,
+		    start, start + PAGE_SIZE,
+		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
+	mutex_unlock(&db_tab->mutex);
+	return ret;
+#endif
+}
+
+/*
+ * Drop the reference that doorbell record `index` holds on its doorbell
+ * page.  Does nothing on devices that are not mem-free.
+ */
+void mthca_unmap_user_db(struct mthca_dev *dev, struct mthca_uar *uar,
+			 struct mthca_user_db_table *db_tab, int index)
+{
+	int page_idx;
+
+	if (!mthca_is_memfree(dev))
+		return;
+
+	page_idx = index / MTHCA_DB_REC_PER_PAGE;
+
+	/*
+	 * Only the count is adjusted here.  To keep the bookkeeping simple,
+	 * DB pages stay mapped until the whole db table is cleaned up.
+	 */
+	mutex_lock(&db_tab->mutex);
+	db_tab->page[page_idx].refcount -= 1;
+	mutex_unlock(&db_tab->mutex);
+}
+
+/*
+ * Allocate and initialise a user doorbell table with
+ * uar_table.uarc_size / MTHCA_ICM_PAGE_SIZE page slots.
+ *
+ * Returns NULL on HCAs that are not mem-free, ERR_PTR(-ENOMEM) when
+ * the allocation fails, and the new table otherwise.
+ */
+struct mthca_user_db_table *mthca_init_user_db_tab(struct mthca_dev *dev)
+{
+	struct mthca_user_db_table *tab;
+	int count;
+	int n;
+
+	if (!mthca_is_memfree(dev))
+		return NULL;
+
+	count = dev->uar_table.uarc_size / MTHCA_ICM_PAGE_SIZE;
+
+	/* Table header and its 'count' page slots share one allocation. */
+	tab = kmalloc(sizeof *tab + count * sizeof *tab->page, GFP_KERNEL);
+	if (tab == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&tab->mutex);
+
+	n = 0;
+	while (n < count) {
+		tab->page[n].refcount = 0;
+		tab->page[n].uvirt    = 0;
+		sg_init_table(&tab->page[n].mem, 1);
+		++n;
+	}
+
+	return tab;
+}
+
+/*
+ * Tear down a user doorbell table: every slot with a recorded user
+ * address is unmapped from the HCA and from DMA, its user page is
+ * released (put_page() on Linux, vm_map_unwire() otherwise), and the
+ * table itself is freed.  Does nothing on HCAs that are not mem-free.
+ */
+void mthca_cleanup_user_db_tab(struct mthca_dev *dev, struct mthca_uar *uar,
+			       struct mthca_user_db_table *db_tab)
+{
+	int pg;
+	u8 status;
+
+	if (!mthca_is_memfree(dev))
+		return;
+
+	for (pg = 0; pg < dev->uar_table.uarc_size / MTHCA_ICM_PAGE_SIZE; ++pg) {
+#ifndef __linux__
+		vm_offset_t base;
+#endif
+
+		/* Slots that were never mapped have uvirt == 0. */
+		if (!db_tab->page[pg].uvirt)
+			continue;
+
+		mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, uar, pg), 1, &status);
+		pci_unmap_sg(dev->pdev, &db_tab->page[pg].mem, 1, PCI_DMA_TODEVICE);
+#ifdef __linux__
+		put_page(sg_page(&db_tab->page[pg].mem));
+#else
+		base = db_tab->page[pg].uvirt & PAGE_MASK;
+		vm_map_unwire(&curthread->td_proc->p_vmspace->vm_map,
+		    base, base + PAGE_SIZE,
+		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
+#endif
+	}
+
+	kfree(db_tab);
+}
+
+/*
+ * Allocate one kernel doorbell record of type 'type' for queue 'qn'.
+ *
+ * CQ_ARM and SQ records (group 0) are placed in pages growing upward
+ * from page 0; CQ_SET_CI, RQ and SRQ records (group 1) are placed in
+ * pages growing downward from the last page, and are numbered from the
+ * top of each page down.  A partly used page of the right group is
+ * preferred, then an unallocated page inside the group's current
+ * range, and finally the range is extended by one page if the two
+ * groups would not collide.
+ *
+ * On success the record is initialised with qn and type, *db points at
+ * it, and the non-negative record index is returned.  On failure a
+ * negative errno is returned: -EINVAL for an unknown type or a failed
+ * MAP_ICM_page status, -ENOMEM when no page or memory is available.
+ */
+int mthca_alloc_db(struct mthca_dev *dev, enum mthca_db_type type,
+		   u32 qn, __be32 **db)
+{
+	int group;
+	int start, end, dir;
+	int i, j;
+	struct mthca_db_page *page;
+	int ret = 0;
+	u8 status;
+
+	mutex_lock(&dev->db_tab->mutex);
+
+	switch (type) {
+	case MTHCA_DB_TYPE_CQ_ARM:
+	case MTHCA_DB_TYPE_SQ:
+		group = 0;
+		start = 0;
+		end   = dev->db_tab->max_group1;
+		dir   = 1;
+		break;
+
+	case MTHCA_DB_TYPE_CQ_SET_CI:
+	case MTHCA_DB_TYPE_RQ:
+	case MTHCA_DB_TYPE_SRQ:
+		group = 1;
+		start = dev->db_tab->npages - 1;
+		end   = dev->db_tab->min_group2;
+		dir   = -1;
+		break;
+
+	default:
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* First choice: an allocated page that still has a free record. */
+	for (i = start; i != end; i += dir)
+		if (dev->db_tab->page[i].db_rec &&
+		    !bitmap_full(dev->db_tab->page[i].used,
+				 MTHCA_DB_REC_PER_PAGE)) {
+			page = dev->db_tab->page + i;
+			goto found;
+		}
+
+	/* Second choice: an unallocated page inside the group's range. */
+	for (i = start; i != end; i += dir)
+		if (!dev->db_tab->page[i].db_rec) {
+			page = dev->db_tab->page + i;
+			goto alloc;
+		}
+
+	/* Last resort: grow the group, unless it would meet the other one. */
+	if (dev->db_tab->max_group1 >= dev->db_tab->min_group2 - 1) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (group == 0)
+		++dev->db_tab->max_group1;
+	else
+		--dev->db_tab->min_group2;
+
+	/* Both search loops ran to completion, so i == end here. */
+	page = dev->db_tab->page + end;
+
+alloc:
+	page->db_rec = dma_alloc_coherent(&dev->pdev->dev, MTHCA_ICM_PAGE_SIZE,
+					  &page->mapping, GFP_KERNEL);
+	if (!page->db_rec) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	memset(page->db_rec, 0, MTHCA_ICM_PAGE_SIZE);
+
+	ret = mthca_MAP_ICM_page(dev, page->mapping,
+				 mthca_uarc_virt(dev, &dev->driver_uar, i), &status);
+	if (!ret && status)
+		ret = -EINVAL;
+	if (ret) {
+		dma_free_coherent(&dev->pdev->dev, MTHCA_ICM_PAGE_SIZE,
+				  page->db_rec, page->mapping);
+		/*
+		 * Forget the freed buffer.  A stale db_rec would make
+		 * later calls treat this page as allocated and would be
+		 * freed a second time by mthca_cleanup_db_tab().
+		 */
+		page->db_rec = NULL;
+		goto out;
+	}
+
+	bitmap_zero(page->used, MTHCA_DB_REC_PER_PAGE);
+
+found:
+	j = find_first_zero_bit(page->used, MTHCA_DB_REC_PER_PAGE);
+	set_bit(j, page->used);
+
+	/* Group 1 records are laid out from the end of the page backwards. */
+	if (group == 1)
+		j = MTHCA_DB_REC_PER_PAGE - 1 - j;
+
+	ret = i * MTHCA_DB_REC_PER_PAGE + j;
+
+	page->db_rec[j] = cpu_to_be64((qn << 8) | (type << 5));
+
+	*db = (__be32 *) &page->db_rec[j];
+
+out:
+	mutex_unlock(&dev->db_tab->mutex);
+
+	return ret;
+}
+
+/*
+ * Release the kernel doorbell record with index 'db_index'.
+ *
+ * The record is zeroed and its bit cleared.  If that leaves the page
+ * empty and the page index is at least max_group1 - 1, the page is
+ * unmapped from the HCA and freed, and the group boundary that sat on
+ * this page is pulled back by one.
+ */
+void mthca_free_db(struct mthca_dev *dev, int type, int db_index)
+{
+	struct mthca_db_page *page;
+	int pg, slot;
+	u8 status;
+
+	pg   = db_index / MTHCA_DB_REC_PER_PAGE;
+	slot = db_index % MTHCA_DB_REC_PER_PAGE;
+	page = &dev->db_tab->page[pg];
+
+	mutex_lock(&dev->db_tab->mutex);
+
+	page->db_rec[slot] = 0;
+
+	/* Pages at or above min_group2 track their records in reverse order. */
+	if (pg >= dev->db_tab->min_group2)
+		slot = MTHCA_DB_REC_PER_PAGE - 1 - slot;
+	clear_bit(slot, page->used);
+
+	if (!bitmap_empty(page->used, MTHCA_DB_REC_PER_PAGE) ||
+	    pg < dev->db_tab->max_group1 - 1)
+		goto done;
+
+	mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, &dev->driver_uar, pg), 1, &status);
+
+	dma_free_coherent(&dev->pdev->dev, MTHCA_ICM_PAGE_SIZE,
+			  page->db_rec, page->mapping);
+	page->db_rec = NULL;
+
+	if (pg == dev->db_tab->max_group1) {
+		/* XXX may be able to unmap more pages now */
+		dev->db_tab->max_group1 -= 1;
+	}
+	if (pg == dev->db_tab->min_group2)
+		dev->db_tab->min_group2 += 1;
+
+done:
+	mutex_unlock(&dev->db_tab->mutex);
+}
+
+/*
+ * Create the kernel doorbell table for a mem-free HCA.
+ *
+ * Group 0 starts at page 0 and group 1 at the last page; no page has a
+ * record buffer yet.  Returns 0 on success (or when the HCA is not
+ * mem-free) and -ENOMEM if either allocation fails.
+ */
+int mthca_init_db_tab(struct mthca_dev *dev)
+{
+	struct mthca_db_table *tab;
+	int n;
+
+	if (!mthca_is_memfree(dev))
+		return 0;
+
+	tab = kmalloc(sizeof *tab, GFP_KERNEL);
+	dev->db_tab = tab;
+	if (tab == NULL)
+		return -ENOMEM;
+
+	mutex_init(&tab->mutex);
+
+	tab->npages     = dev->uar_table.uarc_size / MTHCA_ICM_PAGE_SIZE;
+	tab->max_group1 = 0;
+	tab->min_group2 = tab->npages - 1;
+
+	tab->page = kmalloc(tab->npages * sizeof *tab->page, GFP_KERNEL);
+	if (tab->page == NULL) {
+		kfree(tab);
+		return -ENOMEM;
+	}
+
+	/* A NULL db_rec marks a page that has no backing buffer. */
+	n = 0;
+	while (n < tab->npages) {
+		tab->page[n].db_rec = NULL;
+		++n;
+	}
+
+	return 0;
+}
+
+/*
+ * Destroy the kernel doorbell table of a mem-free HCA, releasing every
+ * page that still has a record buffer.
+ */
+void mthca_cleanup_db_tab(struct mthca_dev *dev)
+{
+	struct mthca_db_page *pg;
+	int n;
+	u8 status;
+
+	if (!mthca_is_memfree(dev))
+		return;
+
+	/*
+	 * mthca_free_db() does not always release a UARC page when it
+	 * becomes empty, so sweep all doorbell pages here and free
+	 * whatever is left.
+	 */
+	for (n = 0; n < dev->db_tab->npages; ++n) {
+		pg = &dev->db_tab->page[n];
+
+		if (pg->db_rec == NULL)
+			continue;
+
+		if (!bitmap_empty(pg->used, MTHCA_DB_REC_PER_PAGE))
+			mthca_warn(dev, "Kernel UARC page %d not empty\n", n);
+
+		mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, &dev->driver_uar, n), 1, &status);
+
+		dma_free_coherent(&dev->pdev->dev, MTHCA_ICM_PAGE_SIZE,
+				  pg->db_rec, pg->mapping);
+	}
+
+	kfree(dev->db_tab->page);
+	kfree(dev->db_tab);
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_memfree.h b/sys/ofed/drivers/infiniband/hw/mthca/mthca_memfree.h
new file mode 100644
index 0000000..da9b8f9
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_memfree.h
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MTHCA_MEMFREE_H
+#define MTHCA_MEMFREE_H
+
+#include <linux/list.h>
+#include <linux/mutex.h>
+
+/*
+ * Number of scatterlist entries in one struct mthca_icm_chunk: what
+ * fits in 256 bytes after a list_head and two ints are subtracted
+ * (structure padding is not accounted for).
+ */
+#define MTHCA_ICM_CHUNK_LEN \
+	((256 - sizeof (struct list_head) - 2 * sizeof (int)) /		\
+	 (sizeof (struct scatterlist)))
+
+/*
+ * ICM page geometry: pages are 1 << 12 = 4096 bytes, and a doorbell
+ * page holds one 8-byte record per 8 bytes of page (512 records).
+ */
+enum {
+	MTHCA_ICM_PAGE_SHIFT	= 12,
+	MTHCA_ICM_PAGE_SIZE	= 1 << MTHCA_ICM_PAGE_SHIFT,
+	MTHCA_DB_REC_PER_PAGE	= MTHCA_ICM_PAGE_SIZE / 8
+};
+
+/*
+ * One node on an mthca_icm chunk list, carrying up to
+ * MTHCA_ICM_CHUNK_LEN scatterlist entries.
+ */
+struct mthca_icm_chunk {
+	/* Linkage on mthca_icm.chunk_list. */
+	struct list_head   list;
+	/* NOTE(review): presumably the number of mem[] entries filled in; confirm in mthca_memfree.c. */
+	int                npages;
+	/* Upper bound used by mthca_icm_next() when stepping through mem[]. */
+	int                nsg;
+	struct scatterlist mem[MTHCA_ICM_CHUNK_LEN];
+};
+
+/* An ICM allocation: a list of mthca_icm_chunk nodes. */
+struct mthca_icm {
+	/* Head of the list of struct mthca_icm_chunk (linked through .list). */
+	struct list_head chunk_list;
+	/* NOTE(review): reference count; its users are outside this header. */
+	int              refcount;
+};
+
+/*
+ * An ICM table header followed by a variable number of mthca_icm
+ * pointers in icm[].
+ * NOTE(review): field meanings below are not established by this
+ * header; confirm against mthca_alloc_icm_table().
+ */
+struct mthca_icm_table {
+	u64               virt;
+	/* presumably the number of entries in icm[] -- TODO confirm */
+	int               num_icm;
+	int               num_obj;
+	int               obj_size;
+	int               lowmem;
+	int               coherent;
+	struct mutex      mutex;
+	/* Trailing array; storage is allocated together with the table. */
+	struct mthca_icm *icm[0];
+};
+
+/*
+ * Cursor over the scatterlist entries of an mthca_icm, driven by
+ * mthca_icm_first(), mthca_icm_next() and mthca_icm_last().
+ */
+struct mthca_icm_iter {
+	/* The allocation being walked. */
+	struct mthca_icm       *icm;
+	/* Current chunk, or NULL once the walk is finished. */
+	struct mthca_icm_chunk *chunk;
+	/* Index into chunk->mem[] of the current entry. */
+	int                     page_idx;
+};
+
+/*
+ * ICM and ICM-table interface.  The definitions are not part of this
+ * header; the notes below record only what callers visibly rely on.
+ */
+struct mthca_dev;
+
+struct mthca_icm *mthca_alloc_icm(struct mthca_dev *dev, int npages,
+				  gfp_t gfp_mask, int coherent);
+void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm, int coherent);
+
+struct mthca_icm_table *mthca_alloc_icm_table(struct mthca_dev *dev,
+					      u64 virt, int obj_size,
+					      int nobj, int reserved,
+					      int use_lowmem, int use_coherent);
+void mthca_free_icm_table(struct mthca_dev *dev, struct mthca_icm_table *table);
+/* Callers treat a non-zero return from mthca_table_get() as an error code. */
+int mthca_table_get(struct mthca_dev *dev, struct mthca_icm_table *table, int obj);
+void mthca_table_put(struct mthca_dev *dev, struct mthca_icm_table *table, int obj);
+/*
+ * Callers check the result of mthca_table_find() against NULL and may
+ * pass NULL for dma_handle.
+ */
+void *mthca_table_find(struct mthca_icm_table *table, int obj, dma_addr_t *dma_handle);
+/* Callers in mthca_mr.c pass 'end' as the last object of the range (inclusive). */
+int mthca_table_get_range(struct mthca_dev *dev, struct mthca_icm_table *table,
+			  int start, int end);
+void mthca_table_put_range(struct mthca_dev *dev, struct mthca_icm_table *table,
+			   int start, int end);
+
+/*
+ * Point 'iter' at the first scatterlist entry of 'icm'.  If the chunk
+ * list is empty the iterator starts out finished (chunk == NULL).
+ */
+static inline void mthca_icm_first(struct mthca_icm *icm,
+				   struct mthca_icm_iter *iter)
+{
+	iter->icm = icm;
+
+	if (list_empty(&icm->chunk_list))
+		iter->chunk = NULL;
+	else
+		iter->chunk = list_entry(icm->chunk_list.next,
+					 struct mthca_icm_chunk, list);
+
+	iter->page_idx = 0;
+}
+
+/* Return non-zero once the iterator has run past the final entry. */
+static inline int mthca_icm_last(struct mthca_icm_iter *iter)
+{
+	return iter->chunk == NULL;
+}
+
+/*
+ * Advance 'iter' by one scatterlist entry, moving on to the next chunk
+ * when the current one is exhausted and setting chunk to NULL after the
+ * last chunk.  Must not be called once mthca_icm_last() is true.
+ */
+static inline void mthca_icm_next(struct mthca_icm_iter *iter)
+{
+	struct list_head *succ;
+
+	iter->page_idx += 1;
+	if (iter->page_idx < iter->chunk->nsg)
+		return;
+
+	succ = iter->chunk->list.next;
+	if (succ == &iter->icm->chunk_list) {
+		/* Wrapped back to the list head: nothing left. */
+		iter->chunk = NULL;
+		return;
+	}
+
+	iter->chunk    = list_entry(succ, struct mthca_icm_chunk, list);
+	iter->page_idx = 0;
+}
+
+/* DMA address of the scatterlist entry the iterator currently points at. */
+static inline dma_addr_t mthca_icm_addr(struct mthca_icm_iter *iter)
+{
+	struct mthca_icm_chunk *cur = iter->chunk;
+
+	return sg_dma_address(&cur->mem[iter->page_idx]);
+}
+
+/* DMA length of the scatterlist entry the iterator currently points at. */
+static inline unsigned long mthca_icm_size(struct mthca_icm_iter *iter)
+{
+	struct mthca_icm_chunk *cur = iter->chunk;
+
+	return sg_dma_len(&cur->mem[iter->page_idx]);
+}
+
+/* One page of kernel doorbell records. */
+struct mthca_db_page {
+	/* One bit per record; set while the record is allocated. */
+	DECLARE_BITMAP(used, MTHCA_DB_REC_PER_PAGE);
+	/* Record buffer from dma_alloc_coherent(), or NULL if the page is unallocated. */
+	__be64    *db_rec;
+	/* DMA address of db_rec. */
+	dma_addr_t mapping;
+};
+
+/*
+ * Kernel doorbell table.  Pages are handed out from both ends: one
+ * group of record types grows upward from page 0, the other downward
+ * from page npages - 1.
+ */
+struct mthca_db_table {
+	/* Number of entries in page[]. */
+	int 	       	      npages;
+	/* Upper boundary of the upward-growing group; initialised to 0. */
+	int 	       	      max_group1;
+	/* Lower boundary of the downward-growing group; initialised to npages - 1. */
+	int 	       	      min_group2;
+	struct mthca_db_page *page;
+	/* Held by mthca_alloc_db() and mthca_free_db() around table updates. */
+	struct mutex          mutex;
+};
+
+/*
+ * Doorbell record types.  mthca_alloc_db() stores the value in the
+ * record (shifted left by 5) and uses it to pick the page group:
+ * CQ_ARM and SQ share one group, CQ_SET_CI, RQ and SRQ the other.
+ * Other values are rejected by mthca_alloc_db() with -EINVAL.
+ */
+enum mthca_db_type {
+	MTHCA_DB_TYPE_INVALID   = 0x0,
+	MTHCA_DB_TYPE_CQ_SET_CI = 0x1,
+	MTHCA_DB_TYPE_CQ_ARM    = 0x2,
+	MTHCA_DB_TYPE_SQ        = 0x3,
+	MTHCA_DB_TYPE_RQ        = 0x4,
+	MTHCA_DB_TYPE_SRQ       = 0x5,
+	MTHCA_DB_TYPE_GROUP_SEP = 0x7
+};
+
+/* User doorbell table interface (per-UAR tables of user-space pages). */
+struct mthca_user_db_table;
+struct mthca_uar;
+
+int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar,
+		      struct mthca_user_db_table *db_tab, int index, u64 uaddr);
+/* Drops a page reference only; pages stay mapped until table cleanup. */
+void mthca_unmap_user_db(struct mthca_dev *dev, struct mthca_uar *uar,
+			 struct mthca_user_db_table *db_tab, int index);
+/* Returns NULL on HCAs that are not mem-free and an ERR_PTR() on failure. */
+struct mthca_user_db_table *mthca_init_user_db_tab(struct mthca_dev *dev);
+void mthca_cleanup_user_db_tab(struct mthca_dev *dev, struct mthca_uar *uar,
+			       struct mthca_user_db_table *db_tab);
+
+/* Kernel doorbell table interface (dev->db_tab). */
+int mthca_init_db_tab(struct mthca_dev *dev);
+void mthca_cleanup_db_tab(struct mthca_dev *dev);
+/* Returns the record index (>= 0) or a negative errno; *db is set on success. */
+int mthca_alloc_db(struct mthca_dev *dev, enum mthca_db_type type,
+		   u32 qn, __be32 **db);
+void mthca_free_db(struct mthca_dev *dev, int type, int db_index);
+
+#endif /* MTHCA_MEMFREE_H */
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_mr.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_mr.c
new file mode 100644
index 0000000..d606edf
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_mr.c
@@ -0,0 +1,985 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/slab.h>
+#include <linux/errno.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+#include "mthca_memfree.h"
+
+/* A range of MTT segments obtained from a buddy allocator. */
+struct mthca_mtt {
+	/* Allocator the range came from; used again when it is freed. */
+	struct mthca_buddy *buddy;
+	/* The range covers 1 << order segments. */
+	int                 order;
+	/* Index of the first segment in the range. */
+	u32                 first_seg;
+};
+
+/*
+ * MPT entry as written into a command mailbox or directly into the MPT
+ * by the FMR paths; all multi-byte fields are big-endian.
+ *
+ * Must be packed because mtt_seg is 64 bits but only aligned to 32 bits.
+ */
+struct mthca_mpt_entry {
+	/* MTHCA_MPT_FLAG_* bits ORed with the caller's access flags. */
+	__be32 flags;
+	/* Written as (page shift - 12). */
+	__be32 page_size;
+	__be32 key;
+	__be32 pd;
+	__be64 start;
+	__be64 length;
+	__be32 lkey;
+	__be32 window_count;
+	__be32 window_count_limit;
+	/* Written as mtt_base + first_seg * mtt_seg_size when an MTT is attached. */
+	__be64 mtt_seg;
+	__be32 mtt_sz;		/* Arbel only */
+	u32    reserved[2];
+} __attribute__((packed));
+
+/* Bits ORed into mthca_mpt_entry.flags. */
+#define MTHCA_MPT_FLAG_SW_OWNS       (0xfUL << 28)
+#define MTHCA_MPT_FLAG_MIO           (1 << 17)
+#define MTHCA_MPT_FLAG_BIND_ENABLE   (1 << 15)
+#define MTHCA_MPT_FLAG_PHYSICAL      (1 <<  9)
+#define MTHCA_MPT_FLAG_REGION        (1 <<  8)
+
+/* ORed into every MTT entry written by this file. */
+#define MTHCA_MTT_FLAG_PRESENT       1
+
+/* Values the FMR map/unmap paths store in the first byte of an MPT entry. */
+#define MTHCA_MPT_STATUS_SW 0xF0
+#define MTHCA_MPT_STATUS_HW 0x00
+
+/* Key increment applied per FMR remap when MTHCA_FLAG_SINAI_OPT is set. */
+#define SINAI_FMR_KEY_INC 0x1000000
+
+/*
+ * Buddy allocator for MTT segments (currently not very efficient
+ * since it doesn't keep a free list and just searches linearly
+ * through the bitmaps)
+ */
+
+/*
+ * Take a free block of 1 << order segments from 'buddy'.
+ *
+ * Returns the index of the block's first segment, or (u32) -1 when no
+ * block of the requested or any larger order is free.
+ */
+static u32 mthca_buddy_alloc(struct mthca_buddy *buddy, int order)
+{
+	int lvl;
+	int nbits;
+	u32 seg;
+
+	spin_lock(&buddy->lock);
+
+	/* Look at the requested order first, then at ever larger blocks. */
+	for (lvl = order; lvl <= buddy->max_order; ++lvl) {
+		if (!buddy->num_free[lvl])
+			continue;
+
+		nbits = 1 << (buddy->max_order - lvl);
+		seg = find_first_bit(buddy->bits[lvl], nbits);
+		if (seg < nbits)
+			goto found;
+	}
+
+	spin_unlock(&buddy->lock);
+	return -1;
+
+ found:
+	clear_bit(seg, buddy->bits[lvl]);
+	--buddy->num_free[lvl];
+
+	/*
+	 * Split the block down to the requested order, returning the
+	 * unused half to the free bitmap at each level.
+	 */
+	for (--lvl; lvl >= order; --lvl) {
+		seg <<= 1;
+		set_bit(seg ^ 1, buddy->bits[lvl]);
+		++buddy->num_free[lvl];
+	}
+
+	spin_unlock(&buddy->lock);
+
+	/* Convert the block number at 'order' into a segment index. */
+	return seg << order;
+}
+
+/*
+ * Return the block of 1 << order segments starting at 'seg' to
+ * 'buddy', merging it with its free neighbour at each level for as
+ * long as that neighbour is free.
+ */
+static void mthca_buddy_free(struct mthca_buddy *buddy, u32 seg, int order)
+{
+	u32 blk = seg >> order;
+	int lvl = order;
+
+	spin_lock(&buddy->lock);
+
+	for (; test_bit(blk ^ 1, buddy->bits[lvl]); blk >>= 1, ++lvl) {
+		/* Neighbour is free: absorb it and retry one level up. */
+		clear_bit(blk ^ 1, buddy->bits[lvl]);
+		--buddy->num_free[lvl];
+	}
+
+	set_bit(blk, buddy->bits[lvl]);
+	++buddy->num_free[lvl];
+
+	spin_unlock(&buddy->lock);
+}
+
+/*
+ * Initialise 'buddy' to manage 1 << max_order segments, all of them
+ * initially free as a single block of the maximum order.
+ *
+ * Returns 0 on success or -ENOMEM if any allocation fails, in which
+ * case everything allocated here has been freed again.
+ */
+static int mthca_buddy_init(struct mthca_buddy *buddy, int max_order)
+{
+	int i, s;
+
+	buddy->max_order = max_order;
+	spin_lock_init(&buddy->lock);
+
+	/*
+	 * Size each array by its own element type.  num_free was sized
+	 * with sizeof (int *) before, which is not the element size.
+	 */
+	buddy->bits = kzalloc((buddy->max_order + 1) * sizeof *buddy->bits,
+			      GFP_KERNEL);
+	buddy->num_free = kzalloc((buddy->max_order + 1) *
+				  sizeof *buddy->num_free, GFP_KERNEL);
+	if (!buddy->bits || !buddy->num_free)
+		goto err_out;
+
+	/* One bitmap per order, with one bit per block of that order. */
+	for (i = 0; i <= buddy->max_order; ++i) {
+		s = BITS_TO_LONGS(1 << (buddy->max_order - i));
+		buddy->bits[i] = kmalloc(s * sizeof (long), GFP_KERNEL);
+		if (!buddy->bits[i])
+			goto err_out_free;
+		bitmap_zero(buddy->bits[i],
+			    1 << (buddy->max_order - i));
+	}
+
+	set_bit(0, buddy->bits[buddy->max_order]);
+	buddy->num_free[buddy->max_order] = 1;
+
+	return 0;
+
+err_out_free:
+	/* bits[] was zero-filled, so unallocated slots are NULL here. */
+	for (i = 0; i <= buddy->max_order; ++i)
+		kfree(buddy->bits[i]);
+
+err_out:
+	kfree(buddy->bits);
+	kfree(buddy->num_free);
+
+	return -ENOMEM;
+}
+
+/* Free all memory owned by a buddy allocator set up by mthca_buddy_init(). */
+static void mthca_buddy_cleanup(struct mthca_buddy *buddy)
+{
+	int lvl = 0;
+
+	while (lvl <= buddy->max_order) {
+		kfree(buddy->bits[lvl]);
+		++lvl;
+	}
+
+	kfree(buddy->bits);
+	kfree(buddy->num_free);
+}
+
+/*
+ * Allocate 1 << order MTT segments from 'buddy' and, on mem-free HCAs,
+ * take references on the matching range of the MTT ICM table.
+ *
+ * Returns the first segment index, or (u32) -1 on failure.
+ */
+static u32 mthca_alloc_mtt_range(struct mthca_dev *dev, int order,
+				 struct mthca_buddy *buddy)
+{
+	u32 seg;
+
+	seg = mthca_buddy_alloc(buddy, order);
+	if (seg == -1)
+		return -1;
+
+	if (!mthca_is_memfree(dev))
+		return seg;
+
+	if (mthca_table_get_range(dev, dev->mr_table.mtt_table, seg,
+				  seg + (1 << order) - 1)) {
+		/* No ICM backing available: give the segments back. */
+		mthca_buddy_free(buddy, seg, order);
+		return -1;
+	}
+
+	return seg;
+}
+
+/*
+ * Allocate an MTT able to hold 'size' entries from 'buddy'.
+ *
+ * The order is the smallest one for which (mtt_seg_size / 8) << order
+ * is at least 'size'.  Returns the new MTT, ERR_PTR(-EINVAL) for a
+ * non-positive size, or ERR_PTR(-ENOMEM) when memory or segments run
+ * out.
+ */
+static struct mthca_mtt *__mthca_alloc_mtt(struct mthca_dev *dev, int size,
+					   struct mthca_buddy *buddy)
+{
+	struct mthca_mtt *mtt;
+	int span;
+
+	if (size <= 0)
+		return ERR_PTR(-EINVAL);
+
+	mtt = kmalloc(sizeof *mtt, GFP_KERNEL);
+	if (mtt == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	mtt->buddy = buddy;
+	mtt->order = 0;
+
+	/* Double the span until it covers the request. */
+	span = dev->limits.mtt_seg_size / 8;
+	while (span < size) {
+		++mtt->order;
+		span <<= 1;
+	}
+
+	mtt->first_seg = mthca_alloc_mtt_range(dev, mtt->order, buddy);
+	if (mtt->first_seg == -1) {
+		kfree(mtt);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	return mtt;
+}
+
+/* Allocate an MTT for 'size' entries from the device's main MTT buddy allocator. */
+struct mthca_mtt *mthca_alloc_mtt(struct mthca_dev *dev, int size)
+{
+	struct mthca_buddy *main_buddy = &dev->mr_table.mtt_buddy;
+
+	return __mthca_alloc_mtt(dev, size, main_buddy);
+}
+
+/*
+ * Release an MTT: return its segments to the buddy allocator they came
+ * from, drop the MTT table references for the range, and free the
+ * descriptor.  A NULL 'mtt' is ignored.
+ */
+void mthca_free_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt)
+{
+	u32 first, last;
+
+	if (mtt == NULL)
+		return;
+
+	first = mtt->first_seg;
+	last  = first + (1 << mtt->order) - 1;
+
+	mthca_buddy_free(mtt->buddy, first, mtt->order);
+
+	mthca_table_put_range(dev, dev->mr_table.mtt_table, first, last);
+
+	kfree(mtt);
+}
+
+/*
+ * Write 'list_len' MTT entries from 'buffer_list' into 'mtt', starting
+ * at entry 'start_index', using the WRITE_MTT firmware command.
+ *
+ * Entries are sent in batches of at most MTHCA_MAILBOX_SIZE / 8 - 2,
+ * since the first two mailbox slots carry the target address and a
+ * reserved word.  Returns 0 on success, the mailbox allocation or
+ * command error, or -EINVAL when the firmware reports a status.
+ */
+static int __mthca_write_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt,
+			     int start_index, u64 *buffer_list, int list_len)
+{
+	struct mthca_mailbox *mailbox;
+	__be64 *slot;
+	int err = 0;
+	u8 status;
+	int i;
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	slot = mailbox->buf;
+
+	while (list_len > 0) {
+		slot[0] = cpu_to_be64(dev->mr_table.mtt_base +
+				      mtt->first_seg * dev->limits.mtt_seg_size +
+				      start_index * 8);
+		slot[1] = 0;
+
+		for (i = 0; i < MTHCA_MAILBOX_SIZE / 8 - 2 && i < list_len; ++i)
+			slot[i + 2] = cpu_to_be64(buffer_list[i] |
+						  MTHCA_MTT_FLAG_PRESENT);
+
+		/*
+		 * The firmware works more efficiently on an even number
+		 * of entries, so pad an odd batch with a dummy entry.
+		 */
+		if (i & 1)
+			slot[i + 2] = 0;
+
+		err = mthca_WRITE_MTT(dev, mailbox, (i + 1) & ~1, &status);
+		if (err) {
+			mthca_warn(dev, "WRITE_MTT failed (%d)\n", err);
+			break;
+		}
+		if (status) {
+			mthca_warn(dev, "WRITE_MTT returned status 0x%02x\n",
+				   status);
+			err = -EINVAL;
+			break;
+		}
+
+		buffer_list += i;
+		start_index += i;
+		list_len    -= i;
+	}
+
+	mthca_free_mailbox(dev, mailbox);
+	return err;
+}
+
+/*
+ * Return the largest number of MTT entries mthca_write_mtt() handles
+ * in one step for this device.
+ */
+int mthca_write_mtt_size(struct mthca_dev *dev)
+{
+	if (dev->mr_table.fmr_mtt_buddy == &dev->mr_table.mtt_buddy &&
+	    (dev->mthca_flags & MTHCA_FLAG_FMR)) {
+		/* For Arbel, all MTTs must fit in the same page. */
+		if (mthca_is_memfree(dev))
+			return PAGE_SIZE / sizeof (u64);
+		return 0x7ffffff;
+	}
+
+	/*
+	 * WRITE_MTT command path: two mailbox slots are taken by the
+	 * index and reserved fields, so leave room for them.
+	 */
+	return PAGE_SIZE / sizeof (u64) - 2;
+}
+
+/*
+ * Tavor: write 'list_len' MTT entries straight into the I/O-mapped MTT
+ * area, starting at entry 'start_index' of 'mtt'.
+ */
+static void mthca_tavor_write_mtt_seg(struct mthca_dev *dev,
+				      struct mthca_mtt *mtt, int start_index,
+				      u64 *buffer_list, int list_len)
+{
+	u64 __iomem *mtts;
+	int n = 0;
+
+	mtts = dev->mr_table.tavor_fmr.mtt_base + mtt->first_seg * dev->limits.mtt_seg_size +
+		start_index * sizeof (u64);
+
+	while (n < list_len) {
+		mthca_write64_raw(cpu_to_be64(buffer_list[n] | MTHCA_MTT_FLAG_PRESENT),
+				  &mtts[n]);
+		++n;
+	}
+}
+
+/*
+ * Arbel: write 'list_len' MTT entries into the MTT ICM table memory,
+ * starting at entry 'start_index' of 'mtt', then sync the written
+ * range for the device.
+ */
+static void mthca_arbel_write_mtt_seg(struct mthca_dev *dev,
+				      struct mthca_mtt *mtt, int start_index,
+				      u64 *buffer_list, int list_len)
+{
+	dma_addr_t dma_handle;
+	__be64 *mtts;
+	int byte_off = start_index * sizeof (u64);
+	int n;
+
+	/* For Arbel, all MTTs must fit in the same page. */
+	BUG_ON(byte_off / PAGE_SIZE != (byte_off + list_len * sizeof(u64) - 1) / PAGE_SIZE);
+	/* Require full segments */
+	BUG_ON(byte_off % dev->limits.mtt_seg_size);
+
+	mtts = mthca_table_find(dev->mr_table.mtt_table, mtt->first_seg +
+				byte_off / dev->limits.mtt_seg_size, &dma_handle);
+
+	BUG_ON(!mtts);
+
+	n = 0;
+	while (n < list_len) {
+		mtts[n] = cpu_to_be64(buffer_list[n] | MTHCA_MTT_FLAG_PRESENT);
+		++n;
+	}
+
+	dma_sync_single(&dev->pdev->dev, dma_handle, list_len * sizeof (u64), DMA_TO_DEVICE);
+}
+
+/*
+ * Write 'list_len' MTT entries from 'buffer_list' into 'mtt', starting
+ * at entry 'start_index'.
+ *
+ * When FMRs are enabled and share the main MTT buddy allocator, the
+ * entries are written directly in chunks of mthca_write_mtt_size();
+ * otherwise the WRITE_MTT command path is used.  Returns 0 on the
+ * direct path, or the result of __mthca_write_mtt().
+ */
+int mthca_write_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt,
+		    int start_index, u64 *buffer_list, int list_len)
+{
+	int size = mthca_write_mtt_size(dev);
+	int chunk;
+
+	if (!(dev->mr_table.fmr_mtt_buddy == &dev->mr_table.mtt_buddy &&
+	      (dev->mthca_flags & MTHCA_FLAG_FMR)))
+		return __mthca_write_mtt(dev, mtt, start_index, buffer_list, list_len);
+
+	for (; list_len > 0; list_len -= chunk) {
+		chunk = min(size, list_len);
+
+		if (mthca_is_memfree(dev))
+			mthca_arbel_write_mtt_seg(dev, mtt, start_index,
+						  buffer_list, chunk);
+		else
+			mthca_tavor_write_mtt_seg(dev, mtt, start_index,
+						  buffer_list, chunk);
+
+		start_index += chunk;
+		buffer_list += chunk;
+	}
+
+	return 0;
+}
+
+/* Tavor: the key is the hardware index unchanged. */
+static inline u32 tavor_hw_index_to_key(u32 ind)
+{
+	u32 key = ind;
+
+	return key;
+}
+
+/* Tavor: the hardware index is the key unchanged. */
+static inline u32 tavor_key_to_hw_index(u32 key)
+{
+	u32 ind = key;
+
+	return ind;
+}
+
+/* Arbel: the key is the hardware index rotated left by 8 bits. */
+static inline u32 arbel_hw_index_to_key(u32 ind)
+{
+	u32 top = ind >> 24;
+	u32 rest = ind << 8;
+
+	return top | rest;
+}
+
+/* Arbel: the hardware index is the key rotated right by 8 bits. */
+static inline u32 arbel_key_to_hw_index(u32 key)
+{
+	u32 low = key << 24;
+	u32 rest = key >> 8;
+
+	return low | rest;
+}
+
+/* Convert a hardware MPT index to a key using the rule for this HCA type. */
+static inline u32 hw_index_to_key(struct mthca_dev *dev, u32 ind)
+{
+	return mthca_is_memfree(dev) ? arbel_hw_index_to_key(ind)
+				     : tavor_hw_index_to_key(ind);
+}
+
+/* Convert a key to a hardware MPT index using the rule for this HCA type. */
+static inline u32 key_to_hw_index(struct mthca_dev *dev, u32 key)
+{
+	return mthca_is_memfree(dev) ? arbel_key_to_hw_index(key)
+				     : tavor_key_to_hw_index(key);
+}
+
+/*
+ * With MTHCA_FLAG_SINAI_OPT set, keep the low 23 bits of 'key' and copy
+ * bit 3 into bit 23; otherwise return 'key' unchanged.
+ */
+static inline u32 adjust_key(struct mthca_dev *dev, u32 key)
+{
+	if (!(dev->mthca_flags & MTHCA_FLAG_SINAI_OPT))
+		return key;
+
+	return (key & 0x7fffff) | ((key << 20) & 0x800000);
+}
+
+/*
+ * Allocate an MPT entry for 'mr', build it in a command mailbox and
+ * hand it to the HCA with SW2HW_MPT.
+ *
+ * The caller must set mr->mtt first: when it is NULL the entry is
+ * flagged MTHCA_MPT_FLAG_PHYSICAL and no MTT address is written.
+ * buffer_size_shift is the log2 page size (stored as shift - 12).
+ *
+ * On success mr->ibmr.lkey and mr->ibmr.rkey hold the new key and 0 is
+ * returned.  On failure a negative errno is returned (-ENOMEM when no
+ * MPT index is free, -EINVAL when the firmware reports a status) and
+ * everything acquired here is released; mr->mtt is left to the caller.
+ */
+int mthca_mr_alloc(struct mthca_dev *dev, u32 pd, int buffer_size_shift,
+		   u64 iova, u64 total_size, u32 access, struct mthca_mr *mr)
+{
+	struct mthca_mailbox *mailbox;
+	struct mthca_mpt_entry *mpt_entry;
+	u32 key;
+	int i;
+	int err;
+	u8 status;
+
+	WARN_ON(buffer_size_shift >= 32);
+
+	key = mthca_alloc(&dev->mr_table.mpt_alloc);
+	if (key == -1)
+		return -ENOMEM;
+	key = adjust_key(dev, key);
+	mr->ibmr.rkey = mr->ibmr.lkey = hw_index_to_key(dev, key);
+
+	/* Mem-free HCAs need ICM backing for the MPT entry before it is used. */
+	if (mthca_is_memfree(dev)) {
+		err = mthca_table_get(dev, dev->mr_table.mpt_table, key);
+		if (err)
+			goto err_out_mpt_free;
+	}
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox)) {
+		err = PTR_ERR(mailbox);
+		goto err_out_table;
+	}
+	mpt_entry = mailbox->buf;
+
+	mpt_entry->flags = cpu_to_be32(MTHCA_MPT_FLAG_SW_OWNS     |
+				       MTHCA_MPT_FLAG_MIO         |
+				       MTHCA_MPT_FLAG_REGION      |
+				       access);
+	if (!mr->mtt)
+		mpt_entry->flags |= cpu_to_be32(MTHCA_MPT_FLAG_PHYSICAL);
+
+	mpt_entry->page_size = cpu_to_be32(buffer_size_shift - 12);
+	mpt_entry->key       = cpu_to_be32(key);
+	mpt_entry->pd        = cpu_to_be32(pd);
+	mpt_entry->start     = cpu_to_be64(iova);
+	mpt_entry->length    = cpu_to_be64(total_size);
+
+	/*
+	 * Zero everything from lkey to the end of the entry.  mtt_seg
+	 * lies inside that range, so it must be filled in afterwards.
+	 */
+	memset(&mpt_entry->lkey, 0,
+	       sizeof *mpt_entry - offsetof(struct mthca_mpt_entry, lkey));
+
+	if (mr->mtt)
+		mpt_entry->mtt_seg =
+			cpu_to_be64(dev->mr_table.mtt_base +
+				    mr->mtt->first_seg * dev->limits.mtt_seg_size);
+
+	/* Debug dump of the MPT entry; compiled out. */
+	if (0) {
+		mthca_dbg(dev, "Dumping MPT entry %08x:\n", mr->ibmr.lkey);
+		for (i = 0; i < sizeof (struct mthca_mpt_entry) / 4; ++i) {
+			if (i % 4 == 0)
+				printk("[%02x] ", i * 4);
+			printk(" %08x", be32_to_cpu(((__be32 *) mpt_entry)[i]));
+			if ((i + 1) % 4 == 0)
+				printk("\n");
+		}
+	}
+
+	err = mthca_SW2HW_MPT(dev, mailbox,
+			      key & (dev->limits.num_mpts - 1),
+			      &status);
+	if (err) {
+		mthca_warn(dev, "SW2HW_MPT failed (%d)\n", err);
+		goto err_out_mailbox;
+	} else if (status) {
+		mthca_warn(dev, "SW2HW_MPT returned status 0x%02x\n",
+			   status);
+		err = -EINVAL;
+		goto err_out_mailbox;
+	}
+
+	mthca_free_mailbox(dev, mailbox);
+	return err;
+
+	/* Unwind in reverse order of acquisition. */
+err_out_mailbox:
+	mthca_free_mailbox(dev, mailbox);
+
+err_out_table:
+	mthca_table_put(dev, dev->mr_table.mpt_table, key);
+
+err_out_mpt_free:
+	mthca_free(&dev->mr_table.mpt_alloc, key);
+	return err;
+}
+
+/*
+ * Allocate a memory region without an MTT: starting at address 0, of
+ * length ~0ULL, with a page shift of 12.
+ */
+int mthca_mr_alloc_notrans(struct mthca_dev *dev, u32 pd,
+			   u32 access, struct mthca_mr *mr)
+{
+	const u64 whole_range = ~0ULL;
+
+	mr->mtt = NULL;
+
+	return mthca_mr_alloc(dev, pd, 12, 0, whole_range, access, mr);
+}
+
+/*
+ * Allocate a memory region backed by the 'list_len' buffers in
+ * 'buffer_list': allocate an MTT, write the buffer addresses into it,
+ * then create the MPT entry.  Returns 0 or a negative errno; on
+ * failure after the MTT was allocated, the MTT is freed again.
+ */
+int mthca_mr_alloc_phys(struct mthca_dev *dev, u32 pd,
+			u64 *buffer_list, int buffer_size_shift,
+			int list_len, u64 iova, u64 total_size,
+			u32 access, struct mthca_mr *mr)
+{
+	int err;
+
+	mr->mtt = mthca_alloc_mtt(dev, list_len);
+	if (IS_ERR(mr->mtt))
+		return PTR_ERR(mr->mtt);
+
+	err = mthca_write_mtt(dev, mr->mtt, 0, buffer_list, list_len);
+	if (!err)
+		err = mthca_mr_alloc(dev, pd, buffer_size_shift, iova,
+				     total_size, access, mr);
+
+	/* Either step failing leaves the MTT unused. */
+	if (err)
+		mthca_free_mtt(dev, mr->mtt);
+
+	return err;
+}
+
+/*
+ * Release the MPT resources of an mr or fmr identified by 'lkey': drop
+ * the MPT table reference and return the index to the MPT allocator.
+ */
+static void mthca_free_region(struct mthca_dev *dev, u32 lkey)
+{
+	u32 hw_idx = key_to_hw_index(dev, lkey);
+
+	mthca_table_put(dev, dev->mr_table.mpt_table, hw_idx);
+
+	mthca_free(&dev->mr_table.mpt_alloc, hw_idx);
+}
+
+/*
+ * Destroy a memory region: take the MPT entry back from the HCA with
+ * HW2SW_MPT (failures are only logged), then free the MPT resources
+ * and the MTT.
+ */
+void mthca_free_mr(struct mthca_dev *dev, struct mthca_mr *mr)
+{
+	u32 mpt_idx;
+	int err;
+	u8 status;
+
+	mpt_idx = key_to_hw_index(dev, mr->ibmr.lkey) &
+		  (dev->limits.num_mpts - 1);
+
+	err = mthca_HW2SW_MPT(dev, NULL, mpt_idx, &status);
+	if (err) {
+		mthca_warn(dev, "HW2SW_MPT failed (%d)\n", err);
+	} else if (status) {
+		mthca_warn(dev, "HW2SW_MPT returned status 0x%02x\n",
+			   status);
+	}
+
+	mthca_free_region(dev, mr->ibmr.lkey);
+	mthca_free_mtt(dev, mr->mtt);
+}
+
+/*
+ * Allocate a fast memory region described by mr->attr.
+ *
+ * Allocates an MPT index and an MTT from the FMR buddy allocator,
+ * records where the MPT entry and MTT entries live so the map routines
+ * can write them directly (ICM memory on mem-free HCAs, the Tavor FMR
+ * areas otherwise), and passes an initial MPT entry to the HCA with
+ * SW2HW_MPT.
+ *
+ * Returns 0 on success.  Returns -EINVAL if attr.page_shift is outside
+ * [12, 32), if max_pages MTT entries would not fit in one page on a
+ * mem-free HCA, or if the firmware reports a status; -ENOMEM or another
+ * negative errno if a resource cannot be obtained.  On failure
+ * everything acquired here is released.
+ */
+int mthca_fmr_alloc(struct mthca_dev *dev, u32 pd,
+		    u32 access, struct mthca_fmr *mr)
+{
+	struct mthca_mpt_entry *mpt_entry;
+	struct mthca_mailbox *mailbox;
+	u64 mtt_seg;
+	u32 key, idx;
+	u8 status;
+	int list_len = mr->attr.max_pages;
+	int err = -ENOMEM;
+	int i;
+
+	if (mr->attr.page_shift < 12 || mr->attr.page_shift >= 32)
+		return -EINVAL;
+
+	/* For Arbel, all MTTs must fit in the same page. */
+	if (mthca_is_memfree(dev) &&
+	    mr->attr.max_pages * sizeof *mr->mem.arbel.mtts > PAGE_SIZE)
+		return -EINVAL;
+
+	mr->maps = 0;
+
+	key = mthca_alloc(&dev->mr_table.mpt_alloc);
+	if (key == -1)
+		return -ENOMEM;
+	key = adjust_key(dev, key);
+
+	idx = key & (dev->limits.num_mpts - 1);
+	mr->ibmr.rkey = mr->ibmr.lkey = hw_index_to_key(dev, key);
+
+	/* Remember where the MPT entry lives for the later direct writes. */
+	if (mthca_is_memfree(dev)) {
+		err = mthca_table_get(dev, dev->mr_table.mpt_table, key);
+		if (err)
+			goto err_out_mpt_free;
+
+		mr->mem.arbel.mpt = mthca_table_find(dev->mr_table.mpt_table, key, NULL);
+		BUG_ON(!mr->mem.arbel.mpt);
+	} else
+		mr->mem.tavor.mpt = dev->mr_table.tavor_fmr.mpt_base +
+			sizeof *(mr->mem.tavor.mpt) * idx;
+
+	mr->mtt = __mthca_alloc_mtt(dev, list_len, dev->mr_table.fmr_mtt_buddy);
+	if (IS_ERR(mr->mtt)) {
+		err = PTR_ERR(mr->mtt);
+		goto err_out_table;
+	}
+
+	/* Byte offset of this region's first MTT segment from the MTT base. */
+	mtt_seg = mr->mtt->first_seg * dev->limits.mtt_seg_size;
+
+	/* Likewise remember where the MTT entries live. */
+	if (mthca_is_memfree(dev)) {
+		mr->mem.arbel.mtts = mthca_table_find(dev->mr_table.mtt_table,
+						      mr->mtt->first_seg,
+						      &mr->mem.arbel.dma_handle);
+		BUG_ON(!mr->mem.arbel.mtts);
+	} else
+		mr->mem.tavor.mtts = dev->mr_table.tavor_fmr.mtt_base + mtt_seg;
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox)) {
+		err = PTR_ERR(mailbox);
+		goto err_out_free_mtt;
+	}
+
+	mpt_entry = mailbox->buf;
+
+	mpt_entry->flags = cpu_to_be32(MTHCA_MPT_FLAG_SW_OWNS     |
+				       MTHCA_MPT_FLAG_MIO         |
+				       MTHCA_MPT_FLAG_REGION      |
+				       access);
+
+	mpt_entry->page_size = cpu_to_be32(mr->attr.page_shift - 12);
+	mpt_entry->key       = cpu_to_be32(key);
+	mpt_entry->pd        = cpu_to_be32(pd);
+	/*
+	 * Zero everything from start to the end of the entry; mtt_seg is
+	 * inside that range and is therefore set afterwards.
+	 */
+	memset(&mpt_entry->start, 0,
+	       sizeof *mpt_entry - offsetof(struct mthca_mpt_entry, start));
+	mpt_entry->mtt_seg   = cpu_to_be64(dev->mr_table.mtt_base + mtt_seg);
+
+	/* Debug dump of the MPT entry; compiled out. */
+	if (0) {
+		mthca_dbg(dev, "Dumping MPT entry %08x:\n", mr->ibmr.lkey);
+		for (i = 0; i < sizeof (struct mthca_mpt_entry) / 4; ++i) {
+			if (i % 4 == 0)
+				printk("[%02x] ", i * 4);
+			printk(" %08x", be32_to_cpu(((__be32 *) mpt_entry)[i]));
+			if ((i + 1) % 4 == 0)
+				printk("\n");
+		}
+	}
+
+	err = mthca_SW2HW_MPT(dev, mailbox,
+			      key & (dev->limits.num_mpts - 1),
+			      &status);
+	if (err) {
+		mthca_warn(dev, "SW2HW_MPT failed (%d)\n", err);
+		goto err_out_mailbox_free;
+	}
+	if (status) {
+		mthca_warn(dev, "SW2HW_MPT returned status 0x%02x\n",
+			   status);
+		err = -EINVAL;
+		goto err_out_mailbox_free;
+	}
+
+	mthca_free_mailbox(dev, mailbox);
+	return 0;
+
+	/* Unwind in reverse order of acquisition. */
+err_out_mailbox_free:
+	mthca_free_mailbox(dev, mailbox);
+
+err_out_free_mtt:
+	mthca_free_mtt(dev, mr->mtt);
+
+err_out_table:
+	mthca_table_put(dev, dev->mr_table.mpt_table, key);
+
+err_out_mpt_free:
+	mthca_free(&dev->mr_table.mpt_alloc, key);
+	return err;
+}
+
+/*
+ * Free a fast memory region.  Returns -EBUSY, freeing nothing, while
+ * the FMR still has outstanding mappings; returns 0 otherwise.
+ */
+int mthca_free_fmr(struct mthca_dev *dev, struct mthca_fmr *fmr)
+{
+	if (fmr->maps != 0)
+		return -EBUSY;
+
+	mthca_free_region(dev, fmr->ibmr.lkey);
+
+	mthca_free_mtt(dev, fmr->mtt);
+
+	return 0;
+}
+
+/*
+ * Validate a request to map 'list_len' pages at 'iova' through 'fmr'.
+ *
+ * Returns -EINVAL when the list is longer than attr.max_pages, when
+ * 'iova' is not aligned to the FMR page size, or when the FMR has
+ * already been mapped attr.max_maps times; returns 0 otherwise.
+ */
+static inline int mthca_check_fmr(struct mthca_fmr *fmr, u64 *page_list,
+				  int list_len, u64 iova)
+{
+	u64 page_mask;
+	int i;
+
+	if (list_len > fmr->attr.max_pages)
+		return -EINVAL;
+
+	/*
+	 * Build the mask in 64 bits: page_shift may be as large as 31,
+	 * for which a signed int shift would overflow.
+	 */
+	page_mask = (1ULL << fmr->attr.page_shift) - 1;
+
+	/* We are getting page lists, so va must be page aligned. */
+	if (iova & page_mask)
+		return -EINVAL;
+
+	/* Trust the user not to pass misaligned data in page_list */
+	if (0)
+		for (i = 0; i < list_len; ++i) {
+			/* Misaligned means bits set below the page size. */
+			if (page_list[i] & page_mask)
+				return -EINVAL;
+		}
+
+	if (fmr->maps >= fmr->attr.max_maps)
+		return -EINVAL;
+
+	return 0;
+}
+
+
+/*
+ * Tavor: map 'list_len' pages from 'page_list' at 'iova' through an
+ * FMR by writing the MTT and MPT entries directly to I/O-mapped memory.
+ *
+ * Each successful call counts one mapping and advances the FMR's
+ * lkey/rkey by dev->limits.num_mpts.  Returns 0, or the error from
+ * mthca_check_fmr().
+ */
+int mthca_tavor_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+			     int list_len, u64 iova)
+{
+	struct mthca_fmr *fmr = to_mfmr(ibfmr);
+	struct mthca_dev *dev = to_mdev(ibfmr->device);
+	struct mthca_mpt_entry mpt_entry;
+	u32 key;
+	int i, err;
+
+	err = mthca_check_fmr(fmr, page_list, list_len, iova);
+	if (err)
+		return err;
+
+	++fmr->maps;
+
+	key = tavor_key_to_hw_index(fmr->ibmr.lkey);
+	key += dev->limits.num_mpts;
+	fmr->ibmr.lkey = fmr->ibmr.rkey = tavor_hw_index_to_key(key);
+
+	/*
+	 * Set the entry's status byte to the software value before
+	 * touching it.
+	 * NOTE(review): presumably this keeps the HCA from using the
+	 * entry while it is rewritten -- confirm against the hardware
+	 * documentation.
+	 */
+	writeb(MTHCA_MPT_STATUS_SW, fmr->mem.tavor.mpt);
+
+	for (i = 0; i < list_len; ++i) {
+		__be64 mtt_entry = cpu_to_be64(page_list[i] |
+					       MTHCA_MTT_FLAG_PRESENT);
+		mthca_write64_raw(mtt_entry, fmr->mem.tavor.mtts + i);
+	}
+
+	/* Only lkey, length and start of the local copy are filled in and used. */
+	mpt_entry.lkey   = cpu_to_be32(key);
+	mpt_entry.length = cpu_to_be64(list_len * (1ull << fmr->attr.page_shift));
+	mpt_entry.start  = cpu_to_be64(iova);
+
+	__raw_writel((__force u32) mpt_entry.lkey, &fmr->mem.tavor.mpt->key);
+	/* Copies the contiguous start, length and lkey fields in one go. */
+	memcpy_toio(&fmr->mem.tavor.mpt->start, &mpt_entry.start,
+		    offsetof(struct mthca_mpt_entry, window_count) -
+		    offsetof(struct mthca_mpt_entry, start));
+
+	/* Entry is complete: set the status byte to the hardware value. */
+	writeb(MTHCA_MPT_STATUS_HW, fmr->mem.tavor.mpt);
+
+	return 0;
+}
+
+int mthca_arbel_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+			     int list_len, u64 iova)
+{
+	struct mthca_fmr *fmr = to_mfmr(ibfmr);
+	struct mthca_dev *dev = to_mdev(ibfmr->device);
+	u32 key;
+	int i, err;
+	/* Map page_list at iova through an Arbel FMR; 0 or the check's error. */
+	err = mthca_check_fmr(fmr, page_list, list_len, iova);
+	if (err)
+		return err;
+
+	++fmr->maps;
+	/* New key per mapping: step by SINAI_FMR_KEY_INC or by num_mpts. */
+	key = arbel_key_to_hw_index(fmr->ibmr.lkey);
+	if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT)
+		key += SINAI_FMR_KEY_INC;
+	else
+		key += dev->limits.num_mpts;
+	fmr->ibmr.lkey = fmr->ibmr.rkey = arbel_hw_index_to_key(key);
+	/* Store the SW status value in the first byte of the MPT entry. */
+	*(u8 *) fmr->mem.arbel.mpt = MTHCA_MPT_STATUS_SW;
+	/* Write barrier: status store is ordered before the updates below. */
+	wmb();
+
+	for (i = 0; i < list_len; ++i)
+		fmr->mem.arbel.mtts[i] = cpu_to_be64(page_list[i] |
+						     MTHCA_MTT_FLAG_PRESENT);
+	/* Sync the first list_len MTT entries towards the device. */
+	dma_sync_single(&dev->pdev->dev, fmr->mem.arbel.dma_handle,
+			list_len * sizeof(u64), DMA_TO_DEVICE);
+
+	fmr->mem.arbel.mpt->key    = cpu_to_be32(key);
+	fmr->mem.arbel.mpt->lkey   = cpu_to_be32(key);
+	fmr->mem.arbel.mpt->length = cpu_to_be64(list_len * (1ull << fmr->attr.page_shift));
+	fmr->mem.arbel.mpt->start  = cpu_to_be64(iova);
+	/* Write barrier: entry contents are ordered before the HW status. */
+	wmb();
+
+	*(u8 *) fmr->mem.arbel.mpt = MTHCA_MPT_STATUS_HW;
+
+	wmb();
+
+	return 0;
+}
+
+/* Reset the map count of a mapped Tavor FMR and write SW status to its MPT. */
+void mthca_tavor_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr)
+{
+	/* An FMR with no outstanding mappings is left untouched. */
+	if (fmr->maps) {
+		fmr->maps = 0;
+		writeb(MTHCA_MPT_STATUS_SW, fmr->mem.tavor.mpt);
+	}
+}
+
+/* Reset the map count of a mapped Arbel FMR and store SW status in its MPT. */
+void mthca_arbel_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr)
+{
+	/* An FMR with no outstanding mappings is left untouched. */
+	if (fmr->maps) {
+		fmr->maps = 0;
+		*(u8 *) fmr->mem.arbel.mpt = MTHCA_MPT_STATUS_SW;
+	}
+}
+
+int mthca_init_mr_table(struct mthca_dev *dev)
+{
+	unsigned long addr;
+	int mpts, mtts, err, i;
+	/* Set up the MPT allocator, MTT buddies and Tavor FMR mappings. */
+	err = mthca_alloc_init(&dev->mr_table.mpt_alloc,
+			       dev->limits.num_mpts,
+			       ~0, dev->limits.reserved_mrws);
+	if (err)
+		return err;
+	/* FMRs stay off only for a non-memfree HCA whose DDR is hidden. */
+	if (!mthca_is_memfree(dev) &&
+	    (dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN))
+		dev->limits.fmr_reserved_mtts = 0;
+	else
+		dev->mthca_flags |= MTHCA_FLAG_FMR;
+
+	if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT)
+		mthca_dbg(dev, "Memory key throughput optimization activated.\n");
+
+	err = mthca_buddy_init(&dev->mr_table.mtt_buddy,
+			       fls(dev->limits.num_mtt_segs - 1));
+
+	if (err)
+		goto err_mtt_buddy;
+
+	dev->mr_table.tavor_fmr.mpt_base = NULL;
+	dev->mr_table.tavor_fmr.mtt_base = NULL;
+	/* Round the FMR MTT reservation up to a power of two (1 << i). */
+	if (dev->limits.fmr_reserved_mtts) {
+		i = fls(dev->limits.fmr_reserved_mtts - 1);
+
+		if (i >= 31) {
+			mthca_warn(dev, "Unable to reserve 2^31 FMR MTTs.\n");
+			err = -EINVAL;
+			goto err_fmr_mpt;
+		}
+		mpts = mtts = 1 << i;
+	} else {
+		mtts = dev->limits.num_mtt_segs;
+		mpts = dev->limits.num_mpts;
+	}
+	/* Non-memfree FMRs: ioremap the MPT and MTT tables in PCI resource 4. */
+	if (!mthca_is_memfree(dev) &&
+	    (dev->mthca_flags & MTHCA_FLAG_FMR)) {
+
+		addr = pci_resource_start(dev->pdev, 4) +
+			((pci_resource_len(dev->pdev, 4) - 1) &
+			 dev->mr_table.mpt_base);
+
+		dev->mr_table.tavor_fmr.mpt_base =
+			ioremap(addr, mpts * sizeof(struct mthca_mpt_entry));
+
+		if (!dev->mr_table.tavor_fmr.mpt_base) {
+			mthca_warn(dev, "MPT ioremap for FMR failed.\n");
+			err = -ENOMEM;
+			goto err_fmr_mpt;
+		}
+
+		addr = pci_resource_start(dev->pdev, 4) +
+			((pci_resource_len(dev->pdev, 4) - 1) &
+			 dev->mr_table.mtt_base);
+
+		dev->mr_table.tavor_fmr.mtt_base =
+			ioremap(addr, mtts * dev->limits.mtt_seg_size);
+		if (!dev->mr_table.tavor_fmr.mtt_base) {
+			mthca_warn(dev, "MTT ioremap for FMR failed.\n");
+			err = -ENOMEM;
+			goto err_fmr_mtt;
+		}
+	}
+	/* With reserved FMR MTTs, FMRs get a buddy allocator of their own. */
+	if (dev->limits.fmr_reserved_mtts) {
+		err = mthca_buddy_init(&dev->mr_table.tavor_fmr.mtt_buddy, fls(mtts - 1));
+		if (err)
+			goto err_fmr_mtt_buddy;
+
+		/* Prevent regular MRs from using FMR keys */
+		err = mthca_buddy_alloc(&dev->mr_table.mtt_buddy, fls(mtts - 1));
+		if (err)
+			goto err_reserve_fmr;
+
+		dev->mr_table.fmr_mtt_buddy =
+			&dev->mr_table.tavor_fmr.mtt_buddy;
+	} else
+		dev->mr_table.fmr_mtt_buddy = &dev->mr_table.mtt_buddy;
+
+	/* FMR table is always the first, take reserved MTTs out of there */
+	if (dev->limits.reserved_mtts) {
+		i = fls(dev->limits.reserved_mtts - 1);
+
+		if (mthca_alloc_mtt_range(dev, i,
+					  dev->mr_table.fmr_mtt_buddy) == -1) {
+			mthca_warn(dev, "MTT table of order %d is too small.\n",
+				  dev->mr_table.fmr_mtt_buddy->max_order);
+			err = -ENOMEM;
+			goto err_reserve_mtts;
+		}
+	}
+
+	return 0;
+	/* NOTE(review): confirm mthca_buddy_alloc() failures are errnos. */
+err_reserve_mtts:
+err_reserve_fmr:
+	if (dev->limits.fmr_reserved_mtts)
+		mthca_buddy_cleanup(&dev->mr_table.tavor_fmr.mtt_buddy);
+
+err_fmr_mtt_buddy:
+	if (dev->mr_table.tavor_fmr.mtt_base)
+		iounmap(dev->mr_table.tavor_fmr.mtt_base);
+
+err_fmr_mtt:
+	if (dev->mr_table.tavor_fmr.mpt_base)
+		iounmap(dev->mr_table.tavor_fmr.mpt_base);
+
+err_fmr_mpt:
+	mthca_buddy_cleanup(&dev->mr_table.mtt_buddy);
+
+err_mtt_buddy:
+	mthca_alloc_cleanup(&dev->mr_table.mpt_alloc);
+
+	return err;
+}
+
+/* Release the MTT buddies, any ioremap()ed FMR tables and the MPT allocator. */
+void mthca_cleanup_mr_table(struct mthca_dev *dev)
+{
+	/* XXX check if any MRs are still allocated? */
+	if (dev->limits.fmr_reserved_mtts != 0)
+		mthca_buddy_cleanup(&dev->mr_table.tavor_fmr.mtt_buddy);
+	mthca_buddy_cleanup(&dev->mr_table.mtt_buddy);
+
+	/* The FMR tables are unmapped only when their base pointer is set. */
+	if (dev->mr_table.tavor_fmr.mtt_base != NULL)
+		iounmap(dev->mr_table.tavor_fmr.mtt_base);
+	if (dev->mr_table.tavor_fmr.mpt_base != NULL)
+		iounmap(dev->mr_table.tavor_fmr.mpt_base);
+	mthca_alloc_cleanup(&dev->mr_table.mpt_alloc);
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_pd.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_pd.c
new file mode 100644
index 0000000..266f14e
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_pd.c
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+
+#include "mthca_dev.h"
+
+/*
+ * Allocate a PD number; a privileged PD also gets the local read/write MR
+ * pd->ntmr.  Returns 0, -ENOMEM if mthca_alloc() fails, or the MR error.
+ */
+int mthca_pd_alloc(struct mthca_dev *dev, int privileged, struct mthca_pd *pd)
+{
+	int ret;
+
+	pd->privileged = privileged;
+	atomic_set(&pd->sqp_count, 0);
+	pd->pd_num = mthca_alloc(&dev->pd_table.alloc);
+	if (pd->pd_num == -1)
+		return -ENOMEM;
+	if (!privileged)
+		return 0;
+	ret = mthca_mr_alloc_notrans(dev, pd->pd_num,
+				     MTHCA_MPT_FLAG_LOCAL_READ |
+				     MTHCA_MPT_FLAG_LOCAL_WRITE, &pd->ntmr);
+	if (ret)
+		mthca_free(&dev->pd_table.alloc, pd->pd_num);
+	return ret;
+}
+
+void mthca_pd_free(struct mthca_dev *dev, struct mthca_pd *pd)
+{
+	if (pd->privileged)	/* only a privileged PD has pd->ntmr set up */
+		mthca_free_mr(dev, &pd->ntmr);
+	mthca_free(&dev->pd_table.alloc, pd->pd_num);	/* release PD number */
+}
+
+/* Initialise dev->pd_table.alloc from the num_pds and reserved_pds limits, */
+/* passing the constant 2^24 - 1 as the third mthca_alloc_init() argument. */
+int mthca_init_pd_table(struct mthca_dev *dev)
+{
+	return mthca_alloc_init(&dev->pd_table.alloc, dev->limits.num_pds,
+				(1 << 24) - 1, dev->limits.reserved_pds);
+}
+
+void mthca_cleanup_pd_table(struct mthca_dev *dev)
+{
+	/* Tear down the PD allocator. XXX check if any PDs are still allocated? */
+	mthca_alloc_cleanup(&dev->pd_table.alloc);
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_profile.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_profile.c
new file mode 100644
index 0000000..8edb28a
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_profile.c
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+
+#include "mthca_profile.h"
+
+enum {	/* Resource types laid out by mthca_make_profile() */
+	MTHCA_RES_QP,		/* QP context table (qpc_base) */
+	MTHCA_RES_EEC,		/* EEC table (eec_base) */
+	MTHCA_RES_SRQ,		/* SRQ context table (srqc_base) */
+	MTHCA_RES_CQ,		/* CQ context table (cqc_base) */
+	MTHCA_RES_EQP,		/* EQPC table (eqpc_base) */
+	MTHCA_RES_EEEC,		/* EEEC table (eeec_base) */
+	MTHCA_RES_EQ,		/* EQ context table (eqc_base) */
+	MTHCA_RES_RDB,		/* RDB table (rdb_base) */
+	MTHCA_RES_MCG,		/* multicast group table (mc_base) */
+	MTHCA_RES_MPT,		/* MPT table (mpt_base) */
+	MTHCA_RES_MTT,		/* MTT segments (mtt_base) */
+	MTHCA_RES_UAR,		/* UAR scratch area (uar_scratch_base) */
+	MTHCA_RES_UDAV,		/* DDR address vectors (ddr_av_base) */
+	MTHCA_RES_UARC,		/* UAR context area (uarc_base) */
+	MTHCA_RES_NUM		/* number of resource types above */
+};
+
+enum {
+	MTHCA_NUM_EQS = 32,	/* entry count used for MTHCA_RES_EQ */
+	MTHCA_NUM_PDS = 1 << 15	/* value assigned to dev->limits.num_pds */
+};
+
+s64 mthca_make_profile(struct mthca_dev *dev,
+		       struct mthca_profile *request,
+		       struct mthca_dev_lim *dev_lim,
+		       struct mthca_init_hca_param *init_hca)
+{
+	struct mthca_resource {
+		u64 size;
+		u64 start;
+		int type;
+		int num;
+		int log_num;
+	};
+
+	u64 mem_base, mem_avail;
+	s64 total_size = 0;
+	struct mthca_resource *profile;
+	struct mthca_resource tmp;
+	int i, j;
+	/* Lay out HCA context memory for *request; returns its size or -ENOMEM. */
+	profile = kzalloc(MTHCA_RES_NUM * sizeof *profile, GFP_KERNEL);
+	if (!profile)
+		return -ENOMEM;
+	/* Per-entry size of each resource type. */
+	profile[MTHCA_RES_QP].size   = dev_lim->qpc_entry_sz;
+	profile[MTHCA_RES_EEC].size  = dev_lim->eec_entry_sz;
+	profile[MTHCA_RES_SRQ].size  = dev_lim->srq_entry_sz;
+	profile[MTHCA_RES_CQ].size   = dev_lim->cqc_entry_sz;
+	profile[MTHCA_RES_EQP].size  = dev_lim->eqpc_entry_sz;
+	profile[MTHCA_RES_EEEC].size = dev_lim->eeec_entry_sz;
+	profile[MTHCA_RES_EQ].size   = dev_lim->eqc_entry_sz;
+	profile[MTHCA_RES_RDB].size  = MTHCA_RDB_ENTRY_SIZE;
+	profile[MTHCA_RES_MCG].size  = MTHCA_MGM_ENTRY_SIZE;
+	profile[MTHCA_RES_MPT].size  = dev_lim->mpt_entry_sz;
+	profile[MTHCA_RES_MTT].size  = dev->limits.mtt_seg_size;
+	profile[MTHCA_RES_UAR].size  = dev_lim->uar_scratch_entry_sz;
+	profile[MTHCA_RES_UDAV].size = MTHCA_AV_SIZE;
+	profile[MTHCA_RES_UARC].size = request->uarc_size;
+	/* Requested entry count of each resource type (others stay 0). */
+	profile[MTHCA_RES_QP].num    = request->num_qp;
+	profile[MTHCA_RES_SRQ].num   = request->num_srq;
+	profile[MTHCA_RES_EQP].num   = request->num_qp;
+	profile[MTHCA_RES_RDB].num   = request->num_qp * request->rdb_per_qp;
+	profile[MTHCA_RES_CQ].num    = request->num_cq;
+	profile[MTHCA_RES_EQ].num    = MTHCA_NUM_EQS;
+	profile[MTHCA_RES_MCG].num   = request->num_mcg;
+	profile[MTHCA_RES_MPT].num   = request->num_mpt;
+	profile[MTHCA_RES_MTT].num   = request->num_mtt;
+	profile[MTHCA_RES_UAR].num   = request->num_uar;
+	profile[MTHCA_RES_UARC].num  = request->num_uar;
+	profile[MTHCA_RES_UDAV].num  = request->num_udav;
+	/* Turn entry sizes into table sizes (at least one page if memfree). */
+	for (i = 0; i < MTHCA_RES_NUM; ++i) {
+		profile[i].type     = i;
+		profile[i].log_num  = max(ffs(profile[i].num) - 1, 0);
+		profile[i].size    *= profile[i].num;
+		if (mthca_is_memfree(dev))
+			profile[i].size = max(profile[i].size, (u64) PAGE_SIZE);
+	}
+
+	if (mthca_is_memfree(dev)) {
+		mem_base  = 0;
+		mem_avail = dev_lim->hca.arbel.max_icm_sz;
+	} else {
+		mem_base  = dev->ddr_start;
+		mem_avail = dev->fw.tavor.fw_start - dev->ddr_start;
+	}
+
+	/*
+	 * Sort the resources in decreasing order of size.  Since they
+	 * all have sizes that are powers of 2, we'll be able to keep
+	 * resources aligned to their size and pack them without gaps
+	 * using the sorted order.
+	 */
+	for (i = MTHCA_RES_NUM; i > 0; --i)
+		for (j = 1; j < i; ++j) {
+			if (profile[j].size > profile[j - 1].size) {
+				tmp            = profile[j];
+				profile[j]     = profile[j - 1];
+				profile[j - 1] = tmp;
+			}
+		}
+	/* Assign start addresses in sorted order; fail once memory runs out. */
+	for (i = 0; i < MTHCA_RES_NUM; ++i) {
+		if (profile[i].size) {
+			profile[i].start = mem_base + total_size;
+			total_size      += profile[i].size;
+		}
+		if (total_size > mem_avail) {
+			mthca_err(dev, "Profile requires 0x%llx bytes; "
+				  "won't fit in 0x%llx bytes of context memory.\n",
+				  (unsigned long long) total_size,
+				  (unsigned long long) mem_avail);
+			kfree(profile);
+			return -ENOMEM;
+		}
+
+		if (profile[i].size)
+			mthca_dbg(dev, "profile[%2d]--%2d/%2d @ 0x%16llx "
+				  "(size 0x%8llx)\n",
+				  i, profile[i].type, profile[i].log_num,
+				  (unsigned long long) profile[i].start,
+				  (unsigned long long) profile[i].size);
+	}
+
+	if (mthca_is_memfree(dev))
+		mthca_dbg(dev, "HCA context memory: reserving %d KB\n",
+			  (int) (total_size >> 10));
+	else
+		mthca_dbg(dev, "HCA memory: allocated %d KB/%d KB (%d KB free)\n",
+			  (int) (total_size >> 10), (int) (mem_avail >> 10),
+			  (int) ((mem_avail - total_size) >> 10));
+	/* Publish each table's count and base into dev and init_hca. */
+	for (i = 0; i < MTHCA_RES_NUM; ++i) {
+		switch (profile[i].type) {
+		case MTHCA_RES_QP:
+			dev->limits.num_qps   = profile[i].num;
+			init_hca->qpc_base    = profile[i].start;
+			init_hca->log_num_qps = profile[i].log_num;
+			break;
+		case MTHCA_RES_EEC:
+			dev->limits.num_eecs   = profile[i].num;
+			init_hca->eec_base     = profile[i].start;
+			init_hca->log_num_eecs = profile[i].log_num;
+			break;
+		case MTHCA_RES_SRQ:
+			dev->limits.num_srqs   = profile[i].num;
+			init_hca->srqc_base    = profile[i].start;
+			init_hca->log_num_srqs = profile[i].log_num;
+			break;
+		case MTHCA_RES_CQ:
+			dev->limits.num_cqs   = profile[i].num;
+			init_hca->cqc_base    = profile[i].start;
+			init_hca->log_num_cqs = profile[i].log_num;
+			break;
+		case MTHCA_RES_EQP:
+			init_hca->eqpc_base = profile[i].start;
+			break;
+		case MTHCA_RES_EEEC:
+			init_hca->eeec_base = profile[i].start;
+			break;
+		case MTHCA_RES_EQ:
+			dev->limits.num_eqs   = profile[i].num;
+			init_hca->eqc_base    = profile[i].start;
+			init_hca->log_num_eqs = profile[i].log_num;
+			break;
+		case MTHCA_RES_RDB:
+			for (dev->qp_table.rdb_shift = 0;
+			     request->num_qp << dev->qp_table.rdb_shift < profile[i].num;
+			     ++dev->qp_table.rdb_shift)
+				; /* nothing */
+			dev->qp_table.rdb_base    = (u32) profile[i].start;
+			init_hca->rdb_base        = profile[i].start;
+			break;
+		case MTHCA_RES_MCG:
+			dev->limits.num_mgms      = profile[i].num >> 1;
+			dev->limits.num_amgms     = profile[i].num >> 1;
+			init_hca->mc_base         = profile[i].start;
+			init_hca->log_mc_entry_sz = ffs(MTHCA_MGM_ENTRY_SIZE) - 1;
+			init_hca->log_mc_table_sz = profile[i].log_num;
+			init_hca->mc_hash_sz      = 1 << (profile[i].log_num - 1);
+			break;
+		case MTHCA_RES_MPT:
+			dev->limits.num_mpts   = profile[i].num;
+			dev->mr_table.mpt_base = profile[i].start;
+			init_hca->mpt_base     = profile[i].start;
+			init_hca->log_mpt_sz   = profile[i].log_num;
+			break;
+		case MTHCA_RES_MTT:
+			dev->limits.num_mtt_segs = profile[i].num;
+			dev->mr_table.mtt_base   = profile[i].start;
+			init_hca->mtt_base       = profile[i].start;
+			init_hca->mtt_seg_sz     = ffs(dev->limits.mtt_seg_size) - 7;
+			break;
+		case MTHCA_RES_UAR:
+			dev->limits.num_uars       = profile[i].num;
+			init_hca->uar_scratch_base = profile[i].start;
+			break;
+		case MTHCA_RES_UDAV:
+			dev->av_table.ddr_av_base = profile[i].start;
+			dev->av_table.num_ddr_avs = profile[i].num;
+			break;
+		case MTHCA_RES_UARC:
+			dev->uar_table.uarc_size = request->uarc_size;
+			dev->uar_table.uarc_base = profile[i].start;
+			init_hca->uarc_base   	 = profile[i].start;
+			init_hca->log_uarc_sz 	 = ffs(request->uarc_size) - 13;
+			init_hca->log_uar_sz  	 = ffs(request->num_uar) - 1;
+			break;
+		default:
+			break;
+		}
+	}
+	/* NOTE(review): mc_hash_sz shifts by log_num - 1; assumes num_mcg >= 2. */
+	/*
+	 * PDs don't take any HCA memory, but we assign them as part
+	 * of the HCA profile anyway.
+	 */
+	dev->limits.num_pds = MTHCA_NUM_PDS;
+
+	if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT &&
+	    init_hca->log_mpt_sz > 23) {
+		mthca_warn(dev, "MPT table too large (requested size 2^%d >= 2^24)\n",
+			   init_hca->log_mpt_sz);
+		mthca_warn(dev, "Disabling memory key throughput optimization.\n");
+		dev->mthca_flags &= ~MTHCA_FLAG_SINAI_OPT;
+	}
+
+	/*
+	 * For Tavor, FMRs use ioremapped PCI memory. For 32 bit
+	 * systems it may use too much vmalloc space to map all MTT
+	 * memory, so we reserve some MTTs for FMR access, taking them
+	 * out of the MR pool. They don't use additional memory, but
+	 * we assign them as part of the HCA profile anyway.
+	 */
+	if (mthca_is_memfree(dev) || BITS_PER_LONG == 64)
+		dev->limits.fmr_reserved_mtts = 0;
+	else
+		dev->limits.fmr_reserved_mtts = request->fmr_reserved_mtts;
+
+	kfree(profile);
+	return total_size;
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_profile.h b/sys/ofed/drivers/infiniband/hw/mthca/mthca_profile.h
new file mode 100644
index 0000000..62b009c
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_profile.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MTHCA_PROFILE_H
+#define MTHCA_PROFILE_H
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+struct mthca_profile {	/* Resource counts requested from mthca_make_profile() */
+	int num_qp;		/* QP and EQP table entries */
+	int rdb_per_qp;		/* RDB entries = num_qp * rdb_per_qp */
+	int num_srq;		/* SRQ table entries */
+	int num_cq;		/* CQ table entries */
+	int num_mcg;		/* multicast entries (half MGM, half AMGM) */
+	int num_mpt;		/* MPT table entries */
+	int num_mtt;		/* MTT segments */
+	int num_udav;		/* DDR address vectors */
+	int num_uar;		/* UAR scratch and UARC entries */
+	int uarc_size;		/* size of one UARC entry */
+	int fmr_reserved_mtts;	/* FMR MTTs; forced to 0 if memfree or 64-bit */
+};
+
+s64 mthca_make_profile(struct mthca_dev *mdev,
+		       struct mthca_profile *request,
+		       struct mthca_dev_lim *dev_lim,
+		       struct mthca_init_hca_param *init_hca);
+
+#endif /* MTHCA_PROFILE_H */
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.c
new file mode 100644
index 0000000..e547739
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.c
@@ -0,0 +1,1427 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_smi.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_user_verbs.h>
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+#include "mthca_user.h"
+#include "mthca_memfree.h"
+
+/* Fill in the header fields shared by every LID-routed Get query MAD. */
+static void init_query_mad(struct ib_smp *mad)
+{
+	mad->base_version = mad->class_version = 1;
+	mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+	mad->method = IB_MGMT_METHOD_GET;
+}
+
+/*
+ * Fill *props with the device attributes.  Vendor id, part id, hardware
+ * version and system image GUID are decoded from a NodeInfo query MAD;
+ * the remaining fields are copied or derived from mdev.  Returns 0,
+ * -ENOMEM when a MAD buffer cannot be allocated, -EINVAL when the command
+ * completes with a non-zero status, or the error from mthca_MAD_IFC().
+ */
+static int mthca_query_device(struct ib_device *ibdev,
+			      struct ib_device_attr *props)
+{
+	struct mthca_dev *mdev = to_mdev(ibdev);
+	struct ib_smp *in_mad, *out_mad;
+	int err = -ENOMEM;
+	u8 status;
+
+	in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
+	out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	memset(props, 0, sizeof *props);
+	props->fw_ver = mdev->fw_ver;
+
+	init_query_mad(in_mad);
+	in_mad->attr_id = IB_SMP_ATTR_NODE_INFO;
+	err = mthca_MAD_IFC(mdev, 1, 1, 1, NULL, NULL, in_mad, out_mad,
+			    &status);
+	if (!err && status)
+		err = -EINVAL;
+	if (err)
+		goto out;
+
+	/* Fields decoded from the NodeInfo reply payload. */
+	props->vendor_id           = be32_to_cpup((__be32 *) (out_mad->data + 36)) &
+		0xffffff;
+	props->vendor_part_id      = be16_to_cpup((__be16 *) (out_mad->data + 30));
+	props->hw_ver              = be32_to_cpup((__be32 *) (out_mad->data + 32));
+	memcpy(&props->sys_image_guid, out_mad->data +  4, 8);
+
+	/* Capabilities and limits held in mdev. */
+	props->device_cap_flags    = mdev->device_cap_flags;
+	props->max_mr_size         = ~0ull;
+	props->page_size_cap       = mdev->limits.page_size_cap;
+	props->max_qp              = mdev->limits.num_qps - mdev->limits.reserved_qps;
+	props->max_qp_wr           = mdev->limits.max_wqes;
+	props->max_sge             = mdev->limits.max_sg;
+	props->max_cq              = mdev->limits.num_cqs - mdev->limits.reserved_cqs;
+	props->max_cqe             = mdev->limits.max_cqes;
+	props->max_mr              = mdev->limits.num_mpts - mdev->limits.reserved_mrws;
+	props->max_pd              = mdev->limits.num_pds - mdev->limits.reserved_pds;
+	props->max_qp_rd_atom      = 1 << mdev->qp_table.rdb_shift;
+	props->max_qp_init_rd_atom = mdev->limits.max_qp_init_rdma;
+	props->max_res_rd_atom     = props->max_qp_rd_atom * props->max_qp;
+	props->max_srq             = mdev->limits.num_srqs - mdev->limits.reserved_srqs;
+	props->max_srq_wr          = mdev->limits.max_srq_wqes;
+	props->max_srq_sge         = mdev->limits.max_srq_sge;
+	props->local_ca_ack_delay  = mdev->limits.local_ca_ack_delay;
+	props->atomic_cap          = mdev->limits.flags & DEV_LIM_FLAG_ATOMIC ?
+					IB_ATOMIC_HCA : IB_ATOMIC_NONE;
+	props->max_pkeys           = mdev->limits.pkey_table_len;
+	props->max_mcast_grp       = mdev->limits.num_mgms + mdev->limits.num_amgms;
+	props->max_mcast_qp_attach = MTHCA_QP_PER_MGM;
+	props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
+					   props->max_mcast_grp;
+
+	/*
+	 * If Sinai memory key optimization is being used, then only
+	 * the 8-bit key portion will change.  For other HCAs, the
+	 * unused index bits will also be used for FMR remapping.
+	 */
+	props->max_map_per_fmr =
+		(mdev->mthca_flags & MTHCA_FLAG_SINAI_OPT) ? 255 :
+		(1 << (32 - ilog2(mdev->limits.num_mpts))) - 1;
+
+ out:
+	kfree(in_mad);
+	kfree(out_mad);
+	return err;
+}
+
+/*
+ * Fill *props from a PortInfo query MAD plus the table lengths in
+ * mdev->limits.  Returns 0, -ENOMEM, -EINVAL (bad status) or the MAD error.
+ */
+static int mthca_query_port(struct ib_device *ibdev,
+			    u8 port, struct ib_port_attr *props)
+{
+	struct mthca_dev *mdev = to_mdev(ibdev);
+	struct ib_smp *in_mad, *out_mad;
+	int err = -ENOMEM;
+	u8 status;
+
+	in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
+	out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	memset(props, 0, sizeof *props);
+
+	init_query_mad(in_mad);
+	in_mad->attr_id  = IB_SMP_ATTR_PORT_INFO;
+	in_mad->attr_mod = cpu_to_be32(port);
+	err = mthca_MAD_IFC(mdev, 1, 1, port, NULL, NULL, in_mad, out_mad,
+			    &status);
+	if (!err && status)
+		err = -EINVAL;
+	if (err)
+		goto out;
+
+	props->gid_tbl_len       = mdev->limits.gid_table_len;
+	props->pkey_tbl_len      = mdev->limits.pkey_table_len;
+	props->max_msg_sz        = 0x80000000;
+	props->lid               = be16_to_cpup((__be16 *) (out_mad->data + 16));
+	props->sm_lid            = be16_to_cpup((__be16 *) (out_mad->data + 18));
+	props->port_cap_flags    = be32_to_cpup((__be32 *) (out_mad->data + 20));
+	props->bad_pkey_cntr     = be16_to_cpup((__be16 *) (out_mad->data + 46));
+	props->qkey_viol_cntr    = be16_to_cpup((__be16 *) (out_mad->data + 48));
+	props->active_width      = out_mad->data[31] & 0xf;
+	props->state             = out_mad->data[32] & 0xf;
+	props->phys_state        = out_mad->data[33] >> 4;
+	props->lmc               = out_mad->data[34] & 0x7;
+	props->active_speed      = out_mad->data[35] >> 4;
+	props->sm_sl             = out_mad->data[36] & 0xf;
+	props->active_mtu        = out_mad->data[36] >> 4;
+	props->max_vl_num        = out_mad->data[37] >> 4;
+	props->max_mtu           = out_mad->data[41] & 0xf;
+	props->init_type_reply   = out_mad->data[41] >> 4;
+	props->subnet_timeout    = out_mad->data[51] & 0x1f;
+
+ out:
+	kfree(in_mad);
+	kfree(out_mad);
+	return err;
+}
+
+/*
+ * Copy a new node description under cap_mask_mutex; other mask bits fail.
+ */
+static int mthca_modify_device(struct ib_device *ibdev, int mask,
+			       struct ib_device_modify *props)
+{
+	if (mask & ~IB_DEVICE_MODIFY_NODE_DESC)
+		return -EOPNOTSUPP;
+	if (!(mask & IB_DEVICE_MODIFY_NODE_DESC))
+		return 0;
+	if (mutex_lock_interruptible(&to_mdev(ibdev)->cap_mask_mutex))
+		return -ERESTARTSYS;
+	memcpy(ibdev->node_desc, props->node_desc, 64);
+	mutex_unlock(&to_mdev(ibdev)->cap_mask_mutex);
+	return 0;
+}
+
+/*
+ * Under cap_mask_mutex, set/clear port capability bits on top of the
+ * current port_cap_flags and apply them with SET_IB.  Returns 0,
+ * -ERESTARTSYS, -EINVAL (non-zero status) or the query/SET_IB error.
+ */
+static int mthca_modify_port(struct ib_device *ibdev,
+			     u8 port, int port_modify_mask,
+			     struct ib_port_modify *props)
+{
+	struct mthca_dev *mdev = to_mdev(ibdev);
+	struct mthca_set_ib_param set_ib;
+	struct ib_port_attr attr;
+	int err;
+	u8 status;
+
+	if (mutex_lock_interruptible(&mdev->cap_mask_mutex))
+		return -ERESTARTSYS;
+
+	err = mthca_query_port(ibdev, port, &attr);
+	if (err)
+		goto out;
+
+	set_ib.set_si_guid     = 0;
+	set_ib.reset_qkey_viol = !!(port_modify_mask & IB_PORT_RESET_QKEY_CNTR);
+	set_ib.cap_mask        = (attr.port_cap_flags | props->set_port_cap_mask) &
+		~props->clr_port_cap_mask;
+
+	err = mthca_SET_IB(mdev, &set_ib, port, &status);
+	if (!err && status)
+		err = -EINVAL;
+out:
+	mutex_unlock(&mdev->cap_mask_mutex);
+	return err;
+}
+
+/*
+ * Read one P_Key: request P_Key table block index / 32 with a query MAD
+ * and store entry index % 32 of the reply in *pkey.  Returns 0, -ENOMEM
+ * when a MAD buffer cannot be allocated, -EINVAL on a non-zero command
+ * status, or the error from mthca_MAD_IFC().
+ */
+static int mthca_query_pkey(struct ib_device *ibdev,
+			    u8 port, u16 index, u16 *pkey)
+{
+	struct ib_smp *in_mad, *out_mad;
+	int err = -ENOMEM;
+	u8 status;
+
+	in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
+	out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	init_query_mad(in_mad);
+	in_mad->attr_id  = IB_SMP_ATTR_PKEY_TABLE;
+	in_mad->attr_mod = cpu_to_be32(index / 32);
+
+	err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL,
+			    in_mad, out_mad, &status);
+	if (!err && status)
+		err = -EINVAL;
+	if (!err)
+		*pkey = be16_to_cpu(((__be16 *) out_mad->data)[index % 32]);
+
+ out:
+	kfree(in_mad);
+	kfree(out_mad);
+	return err;
+}
+
+/*
+ * Build the GID at the given index: bytes 0-7 are copied from offset 8 of
+ * a PortInfo reply, bytes 8-15 from entry index % 8 of GUIDInfo block
+ * index / 8.  Returns 0, -ENOMEM when a MAD buffer cannot be allocated,
+ * -EINVAL on a non-zero command status, or the error from
+ * mthca_MAD_IFC().
+ */
+static int mthca_query_gid(struct ib_device *ibdev, u8 port,
+			   int index, union ib_gid *gid)
+{
+	struct mthca_dev *mdev = to_mdev(ibdev);
+	struct ib_smp *in_mad, *out_mad;
+	int err = -ENOMEM;
+	u8 status;
+
+	in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
+	out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	/* First half of the GID, taken from the PortInfo reply. */
+	init_query_mad(in_mad);
+	in_mad->attr_id  = IB_SMP_ATTR_PORT_INFO;
+	in_mad->attr_mod = cpu_to_be32(port);
+
+	err = mthca_MAD_IFC(mdev, 1, 1, port, NULL, NULL, in_mad, out_mad,
+			    &status);
+	if (!err && status)
+		err = -EINVAL;
+	if (err)
+		goto out;
+	memcpy(gid->raw, out_mad->data + 8, 8);
+
+	/* Second half of the GID, taken from the GUIDInfo reply. */
+	init_query_mad(in_mad);
+	in_mad->attr_id  = IB_SMP_ATTR_GUID_INFO;
+	in_mad->attr_mod = cpu_to_be32(index / 8);
+
+	err = mthca_MAD_IFC(mdev, 1, 1, port, NULL, NULL, in_mad, out_mad,
+			    &status);
+	if (!err && status)
+		err = -EINVAL;
+	if (!err)
+		memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8);
+
+ out:
+	kfree(in_mad);
+	kfree(out_mad);
+	return err;
+}
+
+/*
+ * Create a user context (UAR + user doorbell table) and copy the QP table
+ * and UARC sizes to udata.  Returns the context or an ERR_PTR() value.
+ */
+static struct ib_ucontext *mthca_alloc_ucontext(struct ib_device *ibdev,
+						struct ib_udata *udata)
+{
+	struct mthca_dev                *mdev = to_mdev(ibdev);
+	struct mthca_alloc_ucontext_resp uresp;
+	struct mthca_ucontext           *context;
+	int                              err;
+
+	if (!mdev->active)
+		return ERR_PTR(-EAGAIN);
+
+	/* The UARC size is reported as 0 unless the HCA is memfree. */
+	memset(&uresp, 0, sizeof uresp);
+	uresp.qp_tab_size = mdev->limits.num_qps;
+	uresp.uarc_size   = mthca_is_memfree(mdev) ?
+		mdev->uar_table.uarc_size : 0;
+
+	context = kmalloc(sizeof *context, GFP_KERNEL);
+	if (!context)
+		return ERR_PTR(-ENOMEM);
+	err = mthca_uar_alloc(mdev, &context->uar);
+	if (err)
+		goto err_free;
+	context->db_tab = mthca_init_user_db_tab(mdev);
+	if (IS_ERR(context->db_tab)) {
+		err = PTR_ERR(context->db_tab);
+		goto err_uar;
+	}
+	if (ib_copy_to_udata(udata, &uresp, sizeof uresp)) {
+		err = -EFAULT;
+		goto err_db_tab;
+	}
+
+	context->reg_mr_warned = 0;
+	return &context->ibucontext;
+
+err_db_tab:
+	mthca_cleanup_user_db_tab(mdev, &context->uar, context->db_tab);
+err_uar:
+	mthca_uar_free(mdev, &context->uar);
+err_free:
+	kfree(context);
+	return ERR_PTR(err);
+}
+
+/*
+ * Tear down a userspace context: clean up its doorbell table, release its
+ * UAR and free the context structure.  Always returns 0.
+ */
+static int mthca_dealloc_ucontext(struct ib_ucontext *context)
+{
+	struct mthca_dev      *dev  = to_mdev(context->device);
+	struct mthca_ucontext *uctx = to_mucontext(context);
+
+	mthca_cleanup_user_db_tab(dev, &uctx->uar, uctx->db_tab);
+	mthca_uar_free(dev, &uctx->uar);
+	kfree(uctx);
+
+	return 0;
+}
+
+/*
+ * Map the context's UAR page into the given VMA with non-cached page
+ * protection.  Only a mapping of exactly PAGE_SIZE bytes is accepted
+ * (-EINVAL otherwise); a failed remap is reported as -EAGAIN.
+ */
+static int mthca_mmap_uar(struct ib_ucontext *context,
+			  struct vm_area_struct *vma)
+{
+	struct mthca_ucontext *uctx = to_mucontext(context);
+	int rc;
+
+	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+		return -EINVAL;
+
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+	rc = io_remap_pfn_range(vma, vma->vm_start, uctx->uar.pfn,
+				PAGE_SIZE, vma->vm_page_prot);
+
+	return rc ? -EAGAIN : 0;
+}
+
+/*
+ * Allocate a protection domain.  When a user context is supplied the PD
+ * number is copied back to userspace through udata.
+ *
+ * Returns the new ib_pd, or an ERR_PTR: -ENOMEM, the error from
+ * mthca_pd_alloc(), or -EFAULT if the copy to userspace fails.
+ */
+static struct ib_pd *mthca_alloc_pd(struct ib_device *ibdev,
+				    struct ib_ucontext *context,
+				    struct ib_udata *udata)
+{
+	struct mthca_dev *dev = to_mdev(ibdev);
+	struct mthca_pd *mpd;
+	int err;
+
+	mpd = kmalloc(sizeof *mpd, GFP_KERNEL);
+	if (!mpd)
+		return ERR_PTR(-ENOMEM);
+
+	/* Second argument is non-zero only when there is no user context. */
+	err = mthca_pd_alloc(dev, !context, mpd);
+	if (err)
+		goto err_free;
+
+	if (context &&
+	    ib_copy_to_udata(udata, &mpd->pd_num, sizeof (__u32))) {
+		mthca_pd_free(dev, mpd);
+		err = -EFAULT;
+		goto err_free;
+	}
+
+	return &mpd->ibpd;
+
+err_free:
+	kfree(mpd);
+	return ERR_PTR(err);
+}
+
+/* Release the PD via mthca_pd_free() and free its memory.  Always returns 0. */
+static int mthca_dealloc_pd(struct ib_pd *pd)
+{
+	struct mthca_dev *dev = to_mdev(pd->device);
+
+	mthca_pd_free(dev, to_mpd(pd));
+	kfree(pd);
+
+	return 0;
+}
+
+/*
+ * Allocate an address handle (with GFP_ATOMIC) and initialise it from
+ * ah_attr.  Returns the new ib_ah, or ERR_PTR(-ENOMEM) / ERR_PTR(error
+ * from mthca_create_ah()).
+ */
+static struct ib_ah *mthca_ah_create(struct ib_pd *pd,
+				     struct ib_ah_attr *ah_attr)
+{
+	struct mthca_ah *mah;
+	int rc;
+
+	mah = kmalloc(sizeof *mah, GFP_ATOMIC);
+	if (mah == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	rc = mthca_create_ah(to_mdev(pd->device), to_mpd(pd), ah_attr, mah);
+	if (!rc)
+		return &mah->ibah;
+
+	kfree(mah);
+	return ERR_PTR(rc);
+}
+
+/* Destroy an address handle and free its memory.  Always returns 0. */
+static int mthca_ah_destroy(struct ib_ah *ah)
+{
+	struct mthca_dev *dev = to_mdev(ah->device);
+
+	mthca_destroy_ah(dev, to_mah(ah));
+	kfree(ah);
+
+	return 0;
+}
+
+/*
+ * Create a shared receive queue on 'pd'.
+ *
+ * For a userspace PD (pd->uobject set) the create command is read from
+ * udata, the user's doorbell page is mapped, and the SRQ number is copied
+ * back to userspace once the SRQ exists.
+ *
+ * Returns the new ib_srq, or an ERR_PTR: -ENOMEM, -EFAULT when a copy
+ * from/to userspace fails, or the error from mthca_map_user_db() /
+ * mthca_alloc_srq().
+ */
+static struct ib_srq *mthca_create_srq(struct ib_pd *pd,
+				       struct ib_srq_init_attr *init_attr,
+				       struct ib_udata *udata)
+{
+	struct mthca_create_srq ucmd;
+	struct mthca_ucontext *context = NULL;
+	struct mthca_srq *srq;
+	int err;
+
+	srq = kmalloc(sizeof *srq, GFP_KERNEL);
+	if (!srq)
+		return ERR_PTR(-ENOMEM);
+
+	if (pd->uobject) {
+		context = to_mucontext(pd->uobject->context);
+
+		if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
+			err = -EFAULT;
+			goto err_free;
+		}
+
+		err = mthca_map_user_db(to_mdev(pd->device), &context->uar,
+					context->db_tab, ucmd.db_index,
+					ucmd.db_page);
+
+		if (err)
+			goto err_free;
+
+		srq->mr.ibmr.lkey = ucmd.lkey;
+		srq->db_index     = ucmd.db_index;
+	}
+
+	err = mthca_alloc_srq(to_mdev(pd->device), to_mpd(pd),
+			      &init_attr->attr, srq);
+	if (err)
+		goto err_unmap;
+
+	if (context && ib_copy_to_udata(udata, &srq->srqn, sizeof (__u32))) {
+		mthca_free_srq(to_mdev(pd->device), srq);
+		err = -EFAULT;
+		/*
+		 * The doorbell mapped above must be released on this path
+		 * too, as mthca_destroy_srq() does before freeing the SRQ.
+		 */
+		goto err_unmap;
+	}
+
+	return &srq->ibsrq;
+
+err_unmap:
+	/* context is non-NULL exactly when the user doorbell was mapped. */
+	if (context)
+		mthca_unmap_user_db(to_mdev(pd->device), &context->uar,
+				    context->db_tab, ucmd.db_index);
+
+err_free:
+	kfree(srq);
+
+	return ERR_PTR(err);
+}
+
+/*
+ * Destroy a shared receive queue.  For a userspace SRQ the doorbell mapping
+ * is released first; then the SRQ is freed.  Always returns 0.
+ */
+static int mthca_destroy_srq(struct ib_srq *srq)
+{
+	struct mthca_dev *dev  = to_mdev(srq->device);
+	struct mthca_srq *msrq = to_msrq(srq);
+
+	if (srq->uobject) {
+		struct mthca_ucontext *uctx =
+			to_mucontext(srq->uobject->context);
+
+		mthca_unmap_user_db(dev, &uctx->uar, uctx->db_tab,
+				    msrq->db_index);
+	}
+
+	mthca_free_srq(dev, msrq);
+	kfree(srq);
+
+	return 0;
+}
+
+/*
+ * Create a queue pair on 'pd'.
+ *
+ * RC/UC/UD QPs may be created for kernel or userspace consumers; for
+ * userspace (pd->uobject set) the create command is read from udata and the
+ * send/receive doorbells are mapped first.  SMI/GSI (special) QPs are
+ * kernel-only.  Any other QP type is rejected with -ENOSYS, and any
+ * non-zero create_flags with -EINVAL.
+ *
+ * On success the capabilities actually granted are written back into
+ * init_attr->cap.  Returns the new ib_qp or an ERR_PTR.
+ */
+static struct ib_qp *mthca_create_qp(struct ib_pd *pd,
+				     struct ib_qp_init_attr *init_attr,
+				     struct ib_udata *udata)
+{
+	struct mthca_create_qp ucmd;
+	struct mthca_qp *qp;
+	int err;
+
+	if (init_attr->create_flags)
+		return ERR_PTR(-EINVAL);
+
+	switch (init_attr->qp_type) {
+	case IB_QPT_RC:
+	case IB_QPT_UC:
+	case IB_QPT_UD:
+	{
+		struct mthca_ucontext *context;
+
+		qp = kmalloc(sizeof *qp, GFP_KERNEL);
+		if (!qp)
+			return ERR_PTR(-ENOMEM);
+
+		if (pd->uobject) {
+			context = to_mucontext(pd->uobject->context);
+
+			if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
+				kfree(qp);
+				return ERR_PTR(-EFAULT);
+			}
+
+			/* Map the send-queue doorbell first ... */
+			err = mthca_map_user_db(to_mdev(pd->device), &context->uar,
+						context->db_tab,
+						ucmd.sq_db_index, ucmd.sq_db_page);
+			if (err) {
+				kfree(qp);
+				return ERR_PTR(err);
+			}
+
+			/* ... then the receive-queue doorbell, undoing the SQ one on failure. */
+			err = mthca_map_user_db(to_mdev(pd->device), &context->uar,
+						context->db_tab,
+						ucmd.rq_db_index, ucmd.rq_db_page);
+			if (err) {
+				mthca_unmap_user_db(to_mdev(pd->device),
+						    &context->uar,
+						    context->db_tab,
+						    ucmd.sq_db_index);
+				kfree(qp);
+				return ERR_PTR(err);
+			}
+
+			qp->mr.ibmr.lkey = ucmd.lkey;
+			qp->sq.db_index  = ucmd.sq_db_index;
+			qp->rq.db_index  = ucmd.rq_db_index;
+		}
+
+		err = mthca_alloc_qp(to_mdev(pd->device), to_mpd(pd),
+				     to_mcq(init_attr->send_cq),
+				     to_mcq(init_attr->recv_cq),
+				     init_attr->qp_type, init_attr->sq_sig_type,
+				     &init_attr->cap, qp);
+
+		/* Allocation failed for a user QP: release both doorbell mappings. */
+		if (err && pd->uobject) {
+			context = to_mucontext(pd->uobject->context);
+
+			mthca_unmap_user_db(to_mdev(pd->device),
+					    &context->uar,
+					    context->db_tab,
+					    ucmd.sq_db_index);
+			mthca_unmap_user_db(to_mdev(pd->device),
+					    &context->uar,
+					    context->db_tab,
+					    ucmd.rq_db_index);
+		}
+
+		/*
+		 * NOTE(review): this assignment also runs when err != 0; the
+		 * qp is freed right after the switch in that case.
+		 */
+		qp->ibqp.qp_num = qp->qpn;
+		break;
+	}
+	case IB_QPT_SMI:
+	case IB_QPT_GSI:
+	{
+		/* Don't allow userspace to create special QPs */
+		if (pd->uobject)
+			return ERR_PTR(-EINVAL);
+
+		/*
+		 * The allocation is sized for struct mthca_sqp but held through
+		 * a struct mthca_qp pointer -- presumably mthca_sqp embeds
+		 * mthca_qp as its first member; confirm in the header.
+		 */
+		qp = kmalloc(sizeof (struct mthca_sqp), GFP_KERNEL);
+		if (!qp)
+			return ERR_PTR(-ENOMEM);
+
+		/* QP number 0 is used for SMI, 1 for GSI. */
+		qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 : 1;
+
+		err = mthca_alloc_sqp(to_mdev(pd->device), to_mpd(pd),
+				      to_mcq(init_attr->send_cq),
+				      to_mcq(init_attr->recv_cq),
+				      init_attr->sq_sig_type, &init_attr->cap,
+				      qp->ibqp.qp_num, init_attr->port_num,
+				      to_msqp(qp));
+		break;
+	}
+	default:
+		/* Don't support raw QPs */
+		return ERR_PTR(-ENOSYS);
+	}
+
+	/* Common failure exit for both allocation paths above. */
+	if (err) {
+		kfree(qp);
+		return ERR_PTR(err);
+	}
+
+	/* Report the limits stored in the QP back to the caller. */
+	init_attr->cap.max_send_wr     = qp->sq.max;
+	init_attr->cap.max_recv_wr     = qp->rq.max;
+	init_attr->cap.max_send_sge    = qp->sq.max_gs;
+	init_attr->cap.max_recv_sge    = qp->rq.max_gs;
+	init_attr->cap.max_inline_data = qp->max_inline_data;
+
+	return &qp->ibqp;
+}
+
+/*
+ * Destroy a queue pair.  For a userspace QP the send and receive doorbell
+ * mappings are released (in that order) before the QP is freed.
+ * Always returns 0.
+ */
+static int mthca_destroy_qp(struct ib_qp *qp)
+{
+	struct mthca_dev *dev = to_mdev(qp->device);
+	struct mthca_qp  *mqp = to_mqp(qp);
+
+	if (qp->uobject) {
+		struct mthca_ucontext *uctx =
+			to_mucontext(qp->uobject->context);
+
+		mthca_unmap_user_db(dev, &uctx->uar, uctx->db_tab,
+				    mqp->sq.db_index);
+		mthca_unmap_user_db(dev, &uctx->uar, uctx->db_tab,
+				    mqp->rq.db_index);
+	}
+
+	mthca_free_qp(dev, mqp);
+	kfree(qp);
+
+	return 0;
+}
+
+/*
+ * Create a completion queue able to hold at least 'entries' CQEs.
+ *
+ * For a userspace CQ (context != NULL) the create command is read from
+ * udata, the "set CI" and "arm" doorbells are mapped, and the CQ number is
+ * copied back to userspace once the CQ exists.
+ *
+ * Returns the new ib_cq, or an ERR_PTR: -EINVAL when 'entries' is outside
+ * [1, limits.max_cqes], -ENOMEM, -EFAULT when a copy from/to userspace
+ * fails, or the error from mthca_map_user_db() / mthca_init_cq().
+ */
+static struct ib_cq *mthca_create_cq(struct ib_device *ibdev, int entries,
+				     int comp_vector,
+				     struct ib_ucontext *context,
+				     struct ib_udata *udata)
+{
+	struct mthca_create_cq ucmd;
+	struct mthca_cq *cq;
+	int nent;
+	int err;
+
+	if (entries < 1 || entries > to_mdev(ibdev)->limits.max_cqes)
+		return ERR_PTR(-EINVAL);
+
+	if (context) {
+		if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd))
+			return ERR_PTR(-EFAULT);
+
+		err = mthca_map_user_db(to_mdev(ibdev), &to_mucontext(context)->uar,
+					to_mucontext(context)->db_tab,
+					ucmd.set_db_index, ucmd.set_db_page);
+		if (err)
+			return ERR_PTR(err);
+
+		err = mthca_map_user_db(to_mdev(ibdev), &to_mucontext(context)->uar,
+					to_mucontext(context)->db_tab,
+					ucmd.arm_db_index, ucmd.arm_db_page);
+		if (err)
+			goto err_unmap_set;
+	}
+
+	cq = kmalloc(sizeof *cq, GFP_KERNEL);
+	if (!cq) {
+		err = -ENOMEM;
+		goto err_unmap_arm;
+	}
+
+	if (context) {
+		cq->buf.mr.ibmr.lkey = ucmd.lkey;
+		cq->set_ci_db_index  = ucmd.set_db_index;
+		cq->arm_db_index     = ucmd.arm_db_index;
+	}
+
+	/* nent becomes the smallest power of two strictly above 'entries'. */
+	for (nent = 1; nent <= entries; nent <<= 1)
+		; /* nothing */
+
+	err = mthca_init_cq(to_mdev(ibdev), nent,
+			    context ? to_mucontext(context) : NULL,
+			    context ? ucmd.pdn : to_mdev(ibdev)->driver_pd.pd_num,
+			    cq);
+	if (err)
+		goto err_free;
+
+	if (context && ib_copy_to_udata(udata, &cq->cqn, sizeof (__u32))) {
+		mthca_free_cq(to_mdev(ibdev), cq);
+		/*
+		 * err is still 0 from mthca_init_cq() here; without this the
+		 * function would return ERR_PTR(0), which an IS_ERR() check
+		 * in the caller treats as success even though the CQ is gone.
+		 */
+		err = -EFAULT;
+		goto err_free;
+	}
+
+	cq->resize_buf = NULL;
+
+	return &cq->ibcq;
+
+	/* Error unwinding: each label falls through to the ones below it. */
+err_free:
+	kfree(cq);
+
+err_unmap_arm:
+	if (context)
+		mthca_unmap_user_db(to_mdev(ibdev), &to_mucontext(context)->uar,
+				    to_mucontext(context)->db_tab, ucmd.arm_db_index);
+
+err_unmap_set:
+	if (context)
+		mthca_unmap_user_db(to_mdev(ibdev), &to_mucontext(context)->uar,
+				    to_mucontext(context)->db_tab, ucmd.set_db_index);
+
+	return ERR_PTR(err);
+}
+
+/*
+ * Attach a resize buffer with room for 'entries' CQEs to 'cq'.
+ *
+ * The descriptor is installed under cq->lock in state CQ_RESIZE_ALLOC, the
+ * CQ buffer itself is allocated with the lock dropped, and the state is
+ * then advanced to CQ_RESIZE_READY under the lock again.
+ *
+ * Returns 0 on success, -EBUSY if the CQ already has a resize buffer,
+ * -ENOMEM if the descriptor cannot be allocated, or the error from
+ * mthca_alloc_cq_buf().
+ */
+static int mthca_alloc_resize_buf(struct mthca_dev *dev, struct mthca_cq *cq,
+				  int entries)
+{
+	int ret;
+
+	spin_lock_irq(&cq->lock);
+	if (cq->resize_buf) {
+		ret = -EBUSY;
+		goto unlock;
+	}
+
+	/* Allocated with GFP_ATOMIC while cq->lock is held. */
+	cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_ATOMIC);
+	if (!cq->resize_buf) {
+		ret = -ENOMEM;
+		goto unlock;
+	}
+
+	cq->resize_buf->state = CQ_RESIZE_ALLOC;
+
+	ret = 0;
+
+unlock:
+	spin_unlock_irq(&cq->lock);
+
+	if (ret)
+		return ret;
+
+	/* The buffer allocation itself runs without the spinlock held. */
+	ret = mthca_alloc_cq_buf(dev, &cq->resize_buf->buf, entries);
+	if (ret) {
+		/* Detach and free the descriptor again, under the lock. */
+		spin_lock_irq(&cq->lock);
+		kfree(cq->resize_buf);
+		cq->resize_buf = NULL;
+		spin_unlock_irq(&cq->lock);
+		return ret;
+	}
+
+	/* cqe holds the highest valid index, i.e. entries - 1. */
+	cq->resize_buf->cqe = entries - 1;
+
+	spin_lock_irq(&cq->lock);
+	cq->resize_buf->state = CQ_RESIZE_READY;
+	spin_unlock_irq(&cq->lock);
+
+	return 0;
+}
+
+/*
+ * Resize a completion queue to hold at least 'entries' CQEs.
+ *
+ * The requested size is rounded up to a power of two above 'entries'; if
+ * that equals the current size nothing is done.  Kernel CQs get a new
+ * buffer via mthca_alloc_resize_buf(); userspace CQs pass the lkey of
+ * their new buffer in udata.  The whole operation runs under cq->mutex.
+ *
+ * Returns 0 on success, -EINVAL when 'entries' is outside
+ * [1, limits.max_cqes] or the RESIZE_CQ command completes with a non-zero
+ * status, -EFAULT when the user command cannot be read, or the error from
+ * mthca_alloc_resize_buf() / mthca_RESIZE_CQ().
+ */
+static int mthca_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata)
+{
+	struct mthca_dev *dev = to_mdev(ibcq->device);
+	struct mthca_cq *cq = to_mcq(ibcq);
+	struct mthca_resize_cq ucmd;
+	u32 lkey;
+	u8 status;
+	int ret;
+
+	if (entries < 1 || entries > dev->limits.max_cqes)
+		return -EINVAL;
+
+	mutex_lock(&cq->mutex);
+
+	entries = roundup_pow_of_two(entries + 1);
+	if (entries == ibcq->cqe + 1) {
+		/* Already the requested size. */
+		ret = 0;
+		goto out;
+	}
+
+	if (cq->is_kernel) {
+		ret = mthca_alloc_resize_buf(dev, cq, entries);
+		if (ret)
+			goto out;
+		lkey = cq->resize_buf->buf.mr.ibmr.lkey;
+	} else {
+		if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
+			ret = -EFAULT;
+			goto out;
+		}
+		lkey = ucmd.lkey;
+	}
+
+	ret = mthca_RESIZE_CQ(dev, cq->cqn, lkey, ilog2(entries), &status);
+	/*
+	 * Only consult status when the command itself succeeded, as the
+	 * other command callers in this file do; this keeps the original
+	 * error code and avoids reading a status that may not have been set.
+	 */
+	if (!ret && status)
+		ret = -EINVAL;
+
+	if (ret) {
+		/* Drop the resize buffer allocated above (kernel CQs only). */
+		if (cq->resize_buf) {
+			mthca_free_cq_buf(dev, &cq->resize_buf->buf,
+					  cq->resize_buf->cqe);
+			kfree(cq->resize_buf);
+			spin_lock_irq(&cq->lock);
+			cq->resize_buf = NULL;
+			spin_unlock_irq(&cq->lock);
+		}
+		goto out;
+	}
+
+	if (cq->is_kernel) {
+		struct mthca_cq_buf tbuf;
+		int tcqe;
+
+		spin_lock_irq(&cq->lock);
+		if (cq->resize_buf->state == CQ_RESIZE_READY) {
+			/*
+			 * Copy the CQEs over and swap buffers; tbuf/tcqe then
+			 * describe the old buffer, which is freed below.
+			 */
+			mthca_cq_resize_copy_cqes(cq);
+			tbuf         = cq->buf;
+			tcqe         = cq->ibcq.cqe;
+			cq->buf      = cq->resize_buf->buf;
+			cq->ibcq.cqe = cq->resize_buf->cqe;
+		} else {
+			/*
+			 * State is no longer READY: leave cq->buf alone and
+			 * free the resize buffer itself instead.
+			 */
+			tbuf = cq->resize_buf->buf;
+			tcqe = cq->resize_buf->cqe;
+		}
+
+		kfree(cq->resize_buf);
+		cq->resize_buf = NULL;
+		spin_unlock_irq(&cq->lock);
+
+		/* Free whichever buffer is no longer in use, outside the lock. */
+		mthca_free_cq_buf(dev, &tbuf, tcqe);
+	} else
+		ibcq->cqe = entries - 1;
+
+out:
+	mutex_unlock(&cq->mutex);
+
+	return ret;
+}
+
+/*
+ * Destroy a completion queue.  For a userspace CQ the arm and set-CI
+ * doorbell mappings are released (in that order) before the CQ is freed.
+ * Always returns 0.
+ */
+static int mthca_destroy_cq(struct ib_cq *cq)
+{
+	struct mthca_dev *dev = to_mdev(cq->device);
+	struct mthca_cq  *mcq = to_mcq(cq);
+
+	if (cq->uobject) {
+		struct mthca_ucontext *uctx =
+			to_mucontext(cq->uobject->context);
+
+		mthca_unmap_user_db(dev, &uctx->uar, uctx->db_tab,
+				    mcq->arm_db_index);
+		mthca_unmap_user_db(dev, &uctx->uar, uctx->db_tab,
+				    mcq->set_ci_db_index);
+	}
+
+	mthca_free_cq(dev, mcq);
+	kfree(cq);
+
+	return 0;
+}
+
+/*
+ * Translate IB_ACCESS_* flags into MTHCA_MPT_FLAG_* bits.
+ * MTHCA_MPT_FLAG_LOCAL_READ is always set, whatever 'acc' contains.
+ */
+static inline u32 convert_access(int acc)
+{
+	u32 flags = MTHCA_MPT_FLAG_LOCAL_READ;
+
+	if (acc & IB_ACCESS_REMOTE_ATOMIC)
+		flags |= MTHCA_MPT_FLAG_ATOMIC;
+	if (acc & IB_ACCESS_REMOTE_WRITE)
+		flags |= MTHCA_MPT_FLAG_REMOTE_WRITE;
+	if (acc & IB_ACCESS_REMOTE_READ)
+		flags |= MTHCA_MPT_FLAG_REMOTE_READ;
+	if (acc & IB_ACCESS_LOCAL_WRITE)
+		flags |= MTHCA_MPT_FLAG_LOCAL_WRITE;
+
+	return flags;
+}
+
+/*
+ * Allocate a memory region through mthca_mr_alloc_notrans() with the
+ * access rights in 'acc'.  The resulting MR has no umem attached.
+ * Returns the new ib_mr, or ERR_PTR(-ENOMEM) / ERR_PTR(allocation error).
+ */
+static struct ib_mr *mthca_get_dma_mr(struct ib_pd *pd, int acc)
+{
+	struct mthca_mr *mmr;
+	int rc;
+
+	mmr = kmalloc(sizeof *mmr, GFP_KERNEL);
+	if (mmr == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	rc = mthca_mr_alloc_notrans(to_mdev(pd->device), to_mpd(pd)->pd_num,
+				    convert_access(acc), mmr);
+	if (rc) {
+		kfree(mmr);
+		return ERR_PTR(rc);
+	}
+
+	mmr->umem = NULL;
+
+	return &mmr->ibmr;
+}
+
+/*
+ * Register a memory region described by a list of physical buffers.
+ *
+ * A common page shift is derived from the alignment of the buffers and of
+ * *iova_start, the buffers are expanded into a flat list of page addresses,
+ * and the list is handed to mthca_mr_alloc_phys().  Note that
+ * buffer_list[0] is modified in place (rounded down to the chosen page
+ * size).
+ *
+ * Returns the new ib_mr, or an ERR_PTR: -EINVAL for an empty buffer list,
+ * buffers that are not suitably aligned, or a region of zero pages;
+ * -ENOMEM; or the error from mthca_mr_alloc_phys().
+ */
+static struct ib_mr *mthca_reg_phys_mr(struct ib_pd       *pd,
+				       struct ib_phys_buf *buffer_list,
+				       int                 num_phys_buf,
+				       int                 acc,
+				       u64                *iova_start)
+{
+	struct mthca_mr *mr;
+	u64 *page_list;
+	u64 total_size;
+	unsigned long mask;
+	int shift;
+	int npages;
+	int err;
+	int i, j, n;
+
+	/* buffer_list[0] is dereferenced below, so the list must be non-empty. */
+	if (num_phys_buf < 1)
+		return ERR_PTR(-EINVAL);
+
+	/*
+	 * Collect every address bit that limits the usable page size: the
+	 * offset between the first buffer and the iova, the start of every
+	 * buffer but the first, and the end of every buffer but the last.
+	 */
+	mask = buffer_list[0].addr ^ *iova_start;
+	total_size = 0;
+	for (i = 0; i < num_phys_buf; ++i) {
+		if (i != 0)
+			mask |= buffer_list[i].addr;
+		if (i != num_phys_buf - 1)
+			mask |= buffer_list[i].addr + buffer_list[i].size;
+
+		total_size += buffer_list[i].size;
+	}
+
+	if (mask & ~PAGE_MASK)
+		return ERR_PTR(-EINVAL);
+
+	/*
+	 * Lowest set bit of the mask, capped at bit 31.  The constant is
+	 * unsigned so the shift is well defined; the lowest set bit found is
+	 * the same as with the former signed (1 << 31).
+	 */
+	shift = __ffs(mask | 1UL << 31);
+
+	buffer_list[0].size += buffer_list[0].addr & ((1ULL << shift) - 1);
+	buffer_list[0].addr &= ~0ull << shift;
+
+	npages = 0;
+	for (i = 0; i < num_phys_buf; ++i)
+		npages += (buffer_list[i].size + (1ULL << shift) - 1) >> shift;
+
+	/*
+	 * A region with no pages cannot be registered.  Fail explicitly
+	 * rather than hand back an MR that was never passed to
+	 * mthca_mr_alloc_phys() and whose umem field is uninitialised.
+	 */
+	if (!npages)
+		return ERR_PTR(-EINVAL);
+
+	mr = kmalloc(sizeof *mr, GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
+	page_list = kmalloc(npages * sizeof *page_list, GFP_KERNEL);
+	if (!page_list) {
+		kfree(mr);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/* Expand each buffer into its individual page addresses. */
+	n = 0;
+	for (i = 0; i < num_phys_buf; ++i)
+		for (j = 0;
+		     j < (buffer_list[i].size + (1ULL << shift) - 1) >> shift;
+		     ++j)
+			page_list[n++] = buffer_list[i].addr + ((u64) j << shift);
+
+	mthca_dbg(to_mdev(pd->device), "Registering memory at %llx (iova %llx) "
+		  "in PD %x; shift %d, npages %d.\n",
+		  (unsigned long long) buffer_list[0].addr,
+		  (unsigned long long) *iova_start,
+		  to_mpd(pd)->pd_num,
+		  shift, npages);
+
+	err = mthca_mr_alloc_phys(to_mdev(pd->device),
+				  to_mpd(pd)->pd_num,
+				  page_list, shift, npages,
+				  *iova_start, total_size,
+				  convert_access(acc), mr);
+
+	/* The page list is only needed for the call above. */
+	kfree(page_list);
+
+	if (err) {
+		kfree(mr);
+		return ERR_PTR(err);
+	}
+
+	mr->umem = NULL;
+
+	return &mr->ibmr;
+}
+
+/*
+ * Register a userspace memory region.
+ *
+ * The umem for [start, start + length) is obtained with ib_umem_get(), an
+ * MTT sized for its chunks is allocated, the DMA page addresses are written
+ * into the MTT in batches, and finally the MR is allocated with
+ * mthca_mr_alloc().
+ *
+ * Returns the new ib_mr, or an ERR_PTR: -EFAULT if the user command cannot
+ * be read, -ENOMEM, or the error from ib_umem_get(), mthca_alloc_mtt(),
+ * mthca_write_mtt() or mthca_mr_alloc().
+ */
+static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+				       u64 virt, int acc, struct ib_udata *udata)
+{
+	struct mthca_dev *dev = to_mdev(pd->device);
+	struct ib_umem_chunk *chunk;
+	struct mthca_mr *mr;
+	struct mthca_reg_mr ucmd;
+	u64 *pages;
+	int shift, n, len;
+	int i, j, k;
+	int err = 0;
+	int write_mtt_size;
+
+	/*
+	 * A command too short to carry struct mthca_reg_mr is accepted with
+	 * mr_attrs = 0; the warning is printed only while the per-context
+	 * reg_mr_warned counter is still zero.
+	 * NOTE(review): if udata->inlen is unsigned and smaller than the
+	 * header size, this subtraction would wrap -- confirm callers
+	 * guarantee inlen >= sizeof (struct ib_uverbs_cmd_hdr).
+	 */
+	if (udata->inlen - sizeof (struct ib_uverbs_cmd_hdr) < sizeof ucmd) {
+		if (!to_mucontext(pd->uobject->context)->reg_mr_warned) {
+			mthca_warn(dev, "Process '%s' did not pass in MR attrs.\n",
+				   curproc->p_comm);
+			mthca_warn(dev, "  Update libmthca to fix this.\n");
+		}
+		++to_mucontext(pd->uobject->context)->reg_mr_warned;
+		ucmd.mr_attrs = 0;
+	} else if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd))
+		return ERR_PTR(-EFAULT);
+
+	mr = kmalloc(sizeof *mr, GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
+	mr->umem = ib_umem_get(pd->uobject->context, start, length, acc,
+			       ucmd.mr_attrs & MTHCA_MR_DMASYNC);
+
+	if (IS_ERR(mr->umem)) {
+		err = PTR_ERR(mr->umem);
+		goto err;
+	}
+
+	/* Index of the lowest set bit of the umem page size. */
+	shift = ffs(mr->umem->page_size) - 1;
+
+	/* Size the MTT from the total entry count over all chunks. */
+	n = 0;
+	list_for_each_entry(chunk, &mr->umem->chunk_list, list)
+		n += chunk->nents;
+
+	mr->mtt = mthca_alloc_mtt(dev, n);
+	if (IS_ERR(mr->mtt)) {
+		err = PTR_ERR(mr->mtt);
+		goto err_umem;
+	}
+
+	/* One scratch page used to stage addresses for mthca_write_mtt(). */
+	pages = (u64 *) __get_free_page(GFP_KERNEL);
+	if (!pages) {
+		err = -ENOMEM;
+		goto err_mtt;
+	}
+
+	/* i: entries staged in 'pages'; n: entries already written to the MTT. */
+	i = n = 0;
+
+	/* Batch size: never more entries than fit in the scratch page. */
+	write_mtt_size = min(mthca_write_mtt_size(dev), (int) (PAGE_SIZE / sizeof *pages));
+
+	list_for_each_entry(chunk, &mr->umem->chunk_list, list)
+		for (j = 0; j < chunk->nmap; ++j) {
+			/* Number of pages covered by this DMA-mapped entry. */
+			len = sg_dma_len(&chunk->page_list[j]) >> shift;
+			for (k = 0; k < len; ++k) {
+				pages[i++] = sg_dma_address(&chunk->page_list[j]) +
+					mr->umem->page_size * k;
+				/*
+				 * Be friendly to write_mtt and pass it chunks
+				 * of appropriate size.
+				 */
+				if (i == write_mtt_size) {
+					err = mthca_write_mtt(dev, mr->mtt, n, pages, i);
+					if (err)
+						goto mtt_done;
+					n += i;
+					i = 0;
+				}
+			}
+		}
+
+	/* Flush the final, partially filled batch. */
+	if (i)
+		err = mthca_write_mtt(dev, mr->mtt, n, pages, i);
+mtt_done:
+	/* The scratch page is released on both the success and error paths. */
+	free_page((unsigned long) pages);
+	if (err)
+		goto err_mtt;
+
+	err = mthca_mr_alloc(dev, to_mpd(pd)->pd_num, shift, virt, length,
+			     convert_access(acc), mr);
+
+	if (err)
+		goto err_mtt;
+
+	return &mr->ibmr;
+
+	/* Error unwinding: each label falls through to the ones below it. */
+err_mtt:
+	mthca_free_mtt(dev, mr->mtt);
+
+err_umem:
+	ib_umem_release(mr->umem);
+
+err:
+	kfree(mr);
+	return ERR_PTR(err);
+}
+
+/*
+ * Deregister a memory region: free the MR, release its umem if it has one,
+ * and free the structure.  Always returns 0.
+ */
+static int mthca_dereg_mr(struct ib_mr *mr)
+{
+	struct mthca_dev *dev   = to_mdev(mr->device);
+	struct mthca_mr  *entry = to_mmr(mr);
+
+	mthca_free_mr(dev, entry);
+	if (entry->umem != NULL)
+		ib_umem_release(entry->umem);
+	kfree(entry);
+
+	return 0;
+}
+
+/*
+ * Allocate a fast memory region on 'pd'.  The caller's attributes are
+ * copied into the FMR before mthca_fmr_alloc() is called.
+ * Returns the new ib_fmr, or ERR_PTR(-ENOMEM) / ERR_PTR(allocation error).
+ */
+static struct ib_fmr *mthca_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
+				      struct ib_fmr_attr *fmr_attr)
+{
+	struct mthca_fmr *mfmr;
+	int rc;
+
+	mfmr = kmalloc(sizeof *mfmr, GFP_KERNEL);
+	if (mfmr == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	memcpy(&mfmr->attr, fmr_attr, sizeof *fmr_attr);
+
+	rc = mthca_fmr_alloc(to_mdev(pd->device), to_mpd(pd)->pd_num,
+			     convert_access(mr_access_flags), mfmr);
+	if (rc) {
+		kfree(mfmr);
+		return ERR_PTR(rc);
+	}
+
+	return &mfmr->ibmr;
+}
+
+/*
+ * Free a fast memory region.  If mthca_free_fmr() fails its error is
+ * returned and the structure is left allocated; otherwise the structure is
+ * freed and 0 is returned.
+ */
+static int mthca_dealloc_fmr(struct ib_fmr *fmr)
+{
+	struct mthca_fmr *entry = to_mfmr(fmr);
+	int rc = mthca_free_fmr(to_mdev(fmr->device), entry);
+
+	if (!rc)
+		kfree(entry);
+
+	return rc;
+}
+
+/*
+ * Unmap every FMR on 'fmr_list' and then issue SYNC_TPT.
+ *
+ * All FMRs must belong to the same device, otherwise -EINVAL is returned
+ * before anything is unmapped.  An empty list is a no-op returning 0.
+ * Returns the error from mthca_SYNC_TPT(), or -EINVAL if that command
+ * completes with a non-zero status.
+ */
+static int mthca_unmap_fmr(struct list_head *fmr_list)
+{
+	struct mthca_dev *dev = NULL;
+	struct ib_fmr *cur;
+	u8 status;
+	int rc;
+
+	/* First pass: find the device and check that all entries share it. */
+	list_for_each_entry(cur, fmr_list, list) {
+		struct mthca_dev *owner = to_mdev(cur->device);
+
+		if (dev != NULL && owner != dev)
+			return -EINVAL;
+		dev = owner;
+	}
+
+	if (dev == NULL)
+		return 0;
+
+	/* Second pass: use the unmap routine matching the HCA flavour. */
+	if (mthca_is_memfree(dev)) {
+		list_for_each_entry(cur, fmr_list, list)
+			mthca_arbel_fmr_unmap(dev, to_mfmr(cur));
+
+		wmb();
+	} else {
+		list_for_each_entry(cur, fmr_list, list)
+			mthca_tavor_fmr_unmap(dev, to_mfmr(cur));
+	}
+
+	rc = mthca_SYNC_TPT(dev, &status);
+	if (rc)
+		return rc;
+
+	return status ? -EINVAL : 0;
+}
+
+/* Device attribute "hw_rev": print the device's rev_id in hex. */
+static ssize_t show_rev(struct device *device, struct device_attribute *attr,
+			char *buf)
+{
+	struct mthca_dev *mdev;
+
+	mdev = container_of(device, struct mthca_dev, ib_dev.dev);
+
+	return sprintf(buf, "%x\n", mdev->rev_id);
+}
+
+/*
+ * Device attribute "fw_ver": print fw_ver as "major.minor.subminor", taken
+ * from bits 32 and up, bits 16-31 and bits 0-15 respectively.
+ */
+static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
+			   char *buf)
+{
+	struct mthca_dev *mdev;
+	int major, minor, subminor;
+
+	mdev = container_of(device, struct mthca_dev, ib_dev.dev);
+
+	major    = (int) (mdev->fw_ver >> 32);
+	minor    = (int) (mdev->fw_ver >> 16) & 0xffff;
+	subminor = (int) mdev->fw_ver & 0xffff;
+
+	return sprintf(buf, "%d.%d.%d\n", major, minor, subminor);
+}
+
+/*
+ * Device attribute "hca_type": print the HCA model name selected by the
+ * PCI device ID, or "unknown" for an unrecognised ID.
+ */
+static ssize_t show_hca(struct device *device, struct device_attribute *attr,
+			char *buf)
+{
+	struct mthca_dev *mdev;
+	ssize_t len;
+
+	mdev = container_of(device, struct mthca_dev, ib_dev.dev);
+
+	switch (mdev->pdev->device) {
+	case PCI_DEVICE_ID_MELLANOX_TAVOR:
+		len = sprintf(buf, "MT23108\n");
+		break;
+	case PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT:
+		len = sprintf(buf, "MT25208 (MT23108 compat mode)\n");
+		break;
+	case PCI_DEVICE_ID_MELLANOX_ARBEL:
+		len = sprintf(buf, "MT25208\n");
+		break;
+	case PCI_DEVICE_ID_MELLANOX_SINAI:
+	case PCI_DEVICE_ID_MELLANOX_SINAI_OLD:
+		len = sprintf(buf, "MT25204\n");
+		break;
+	default:
+		len = sprintf(buf, "unknown\n");
+		break;
+	}
+
+	return len;
+}
+
+/*
+ * Device attribute "board_id": print at most MTHCA_BOARD_ID_LEN characters
+ * of the device's board_id.
+ */
+static ssize_t show_board(struct device *device, struct device_attribute *attr,
+			  char *buf)
+{
+	struct mthca_dev *mdev;
+
+	mdev = container_of(device, struct mthca_dev, ib_dev.dev);
+
+	return sprintf(buf, "%.*s\n", MTHCA_BOARD_ID_LEN, mdev->board_id);
+}
+
+/*
+ * Read-only (S_IRUGO) device attributes; each one is backed by the show_*
+ * function named in its declaration and has no store handler.
+ */
+static DEVICE_ATTR(hw_rev,   S_IRUGO, show_rev,    NULL);
+static DEVICE_ATTR(fw_ver,   S_IRUGO, show_fw_ver, NULL);
+static DEVICE_ATTR(hca_type, S_IRUGO, show_hca,    NULL);
+static DEVICE_ATTR(board_id, S_IRUGO, show_board,  NULL);
+
+/* Attribute files created by mthca_register_device() after registration. */
+static struct device_attribute *mthca_dev_attributes[] = {
+	&dev_attr_hw_rev,
+	&dev_attr_fw_ver,
+	&dev_attr_hca_type,
+	&dev_attr_board_id
+};
+
+/*
+ * Fill in the node description and node GUID of dev->ib_dev from two SMP
+ * queries (NodeDesc, then NodeInfo).  When mthca_is_memfree() holds,
+ * dev->rev_id is also taken from the NodeInfo reply.
+ *
+ * Returns 0 on success, -ENOMEM if a MAD buffer cannot be allocated,
+ * -EINVAL if a command completes with a non-zero status, or the error
+ * returned by mthca_MAD_IFC().
+ */
+static int mthca_init_node_data(struct mthca_dev *dev)
+{
+	struct ib_smp *req;
+	struct ib_smp *rsp;
+	u8 status;
+	int err;
+
+	req = kzalloc(sizeof *req, GFP_KERNEL);
+	rsp = kmalloc(sizeof *rsp, GFP_KERNEL);
+	if (req == NULL || rsp == NULL) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	init_query_mad(req);
+	req->attr_id = IB_SMP_ATTR_NODE_DESC;
+
+	err = mthca_MAD_IFC(dev, 1, 1, 1, NULL, NULL, req, rsp, &status);
+	if (!err && status)
+		err = -EINVAL;
+	if (err)
+		goto out;
+
+	memcpy(dev->ib_dev.node_desc, rsp->data, 64);
+
+	/* Same request buffer, only the attribute ID changes. */
+	req->attr_id = IB_SMP_ATTR_NODE_INFO;
+
+	err = mthca_MAD_IFC(dev, 1, 1, 1, NULL, NULL, req, rsp, &status);
+	if (!err && status)
+		err = -EINVAL;
+	if (err)
+		goto out;
+
+	if (mthca_is_memfree(dev))
+		dev->rev_id = be32_to_cpup((__be32 *) (rsp->data + 32));
+	memcpy(&dev->ib_dev.node_guid, rsp->data + 12, 8);
+
+out:
+	kfree(req);
+	kfree(rsp);
+	return err;
+}
+
+/*
+ * Register the HCA with the IB core.
+ *
+ * Queries the node data, fills in the ib_device description and verb entry
+ * points (choosing between the arbel and tavor variants with
+ * mthca_is_memfree(), and installing SRQ/FMR verbs only when the matching
+ * MTHCA_FLAG_* bit is set), registers the device, creates the device
+ * attribute files and starts catastrophic-error polling.
+ *
+ * Returns 0 on success or the error from mthca_init_node_data(),
+ * ib_register_device() or device_create_file().
+ */
+int mthca_register_device(struct mthca_dev *dev)
+{
+	int ret;
+	int i;
+
+	ret = mthca_init_node_data(dev);
+	if (ret)
+		return ret;
+
+	strlcpy(dev->ib_dev.name, "mthca%d", IB_DEVICE_NAME_MAX);
+	dev->ib_dev.owner                = THIS_MODULE;
+
+	/* Userspace verbs commands this driver accepts. */
+	dev->ib_dev.uverbs_abi_ver	 = MTHCA_UVERBS_ABI_VERSION;
+	dev->ib_dev.uverbs_cmd_mask	 =
+		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT)		|
+		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)	|
+		(1ull << IB_USER_VERBS_CMD_QUERY_PORT)		|
+		(1ull << IB_USER_VERBS_CMD_ALLOC_PD)		|
+		(1ull << IB_USER_VERBS_CMD_DEALLOC_PD)		|
+		(1ull << IB_USER_VERBS_CMD_REG_MR)		|
+		(1ull << IB_USER_VERBS_CMD_DEREG_MR)		|
+		(1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL)	|
+		(1ull << IB_USER_VERBS_CMD_CREATE_CQ)		|
+		(1ull << IB_USER_VERBS_CMD_RESIZE_CQ)		|
+		(1ull << IB_USER_VERBS_CMD_DESTROY_CQ)		|
+		(1ull << IB_USER_VERBS_CMD_CREATE_QP)		|
+		(1ull << IB_USER_VERBS_CMD_QUERY_QP)		|
+		(1ull << IB_USER_VERBS_CMD_MODIFY_QP)		|
+		(1ull << IB_USER_VERBS_CMD_DESTROY_QP)		|
+		(1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)	|
+		(1ull << IB_USER_VERBS_CMD_DETACH_MCAST);
+	dev->ib_dev.node_type            = RDMA_NODE_IB_CA;
+	dev->ib_dev.phys_port_cnt        = dev->limits.num_ports;
+	dev->ib_dev.num_comp_vectors     = 1;
+	dev->ib_dev.dma_device           = &dev->pdev->dev;
+	dev->ib_dev.query_device         = mthca_query_device;
+	dev->ib_dev.query_port           = mthca_query_port;
+	dev->ib_dev.modify_device        = mthca_modify_device;
+	dev->ib_dev.modify_port          = mthca_modify_port;
+	dev->ib_dev.query_pkey           = mthca_query_pkey;
+	dev->ib_dev.query_gid            = mthca_query_gid;
+	dev->ib_dev.alloc_ucontext       = mthca_alloc_ucontext;
+	dev->ib_dev.dealloc_ucontext     = mthca_dealloc_ucontext;
+	dev->ib_dev.mmap                 = mthca_mmap_uar;
+	dev->ib_dev.alloc_pd             = mthca_alloc_pd;
+	dev->ib_dev.dealloc_pd           = mthca_dealloc_pd;
+	dev->ib_dev.create_ah            = mthca_ah_create;
+	dev->ib_dev.query_ah             = mthca_ah_query;
+	dev->ib_dev.destroy_ah           = mthca_ah_destroy;
+
+	/* SRQ verbs and their uverbs commands only with MTHCA_FLAG_SRQ. */
+	if (dev->mthca_flags & MTHCA_FLAG_SRQ) {
+		dev->ib_dev.create_srq           = mthca_create_srq;
+		dev->ib_dev.modify_srq           = mthca_modify_srq;
+		dev->ib_dev.query_srq            = mthca_query_srq;
+		dev->ib_dev.destroy_srq          = mthca_destroy_srq;
+		dev->ib_dev.uverbs_cmd_mask	|=
+			(1ull << IB_USER_VERBS_CMD_CREATE_SRQ)		|
+			(1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)		|
+			(1ull << IB_USER_VERBS_CMD_QUERY_SRQ)		|
+			(1ull << IB_USER_VERBS_CMD_DESTROY_SRQ);
+
+		if (mthca_is_memfree(dev))
+			dev->ib_dev.post_srq_recv = mthca_arbel_post_srq_recv;
+		else
+			dev->ib_dev.post_srq_recv = mthca_tavor_post_srq_recv;
+	}
+
+	dev->ib_dev.create_qp            = mthca_create_qp;
+	dev->ib_dev.modify_qp            = mthca_modify_qp;
+	dev->ib_dev.query_qp             = mthca_query_qp;
+	dev->ib_dev.destroy_qp           = mthca_destroy_qp;
+	dev->ib_dev.create_cq            = mthca_create_cq;
+	dev->ib_dev.resize_cq            = mthca_resize_cq;
+	dev->ib_dev.destroy_cq           = mthca_destroy_cq;
+	dev->ib_dev.poll_cq              = mthca_poll_cq;
+	dev->ib_dev.get_dma_mr           = mthca_get_dma_mr;
+	dev->ib_dev.reg_phys_mr          = mthca_reg_phys_mr;
+	dev->ib_dev.reg_user_mr          = mthca_reg_user_mr;
+	dev->ib_dev.dereg_mr             = mthca_dereg_mr;
+
+	/* FMR verbs only with MTHCA_FLAG_FMR. */
+	if (dev->mthca_flags & MTHCA_FLAG_FMR) {
+		dev->ib_dev.alloc_fmr            = mthca_alloc_fmr;
+		dev->ib_dev.unmap_fmr            = mthca_unmap_fmr;
+		dev->ib_dev.dealloc_fmr          = mthca_dealloc_fmr;
+		if (mthca_is_memfree(dev))
+			dev->ib_dev.map_phys_fmr = mthca_arbel_map_phys_fmr;
+		else
+			dev->ib_dev.map_phys_fmr = mthca_tavor_map_phys_fmr;
+	}
+
+	dev->ib_dev.attach_mcast         = mthca_multicast_attach;
+	dev->ib_dev.detach_mcast         = mthca_multicast_detach;
+	dev->ib_dev.process_mad          = mthca_process_mad;
+
+	/* CQ arming and work-request posting differ between arbel and tavor. */
+	if (mthca_is_memfree(dev)) {
+		dev->ib_dev.req_notify_cq = mthca_arbel_arm_cq;
+		dev->ib_dev.post_send     = mthca_arbel_post_send;
+		dev->ib_dev.post_recv     = mthca_arbel_post_receive;
+	} else {
+		dev->ib_dev.req_notify_cq = mthca_tavor_arm_cq;
+		dev->ib_dev.post_send     = mthca_tavor_post_send;
+		dev->ib_dev.post_recv     = mthca_tavor_post_receive;
+	}
+
+	mutex_init(&dev->cap_mask_mutex);
+
+	ret = ib_register_device(&dev->ib_dev);
+	if (ret)
+		return ret;
+
+	/* Create the attribute files; undo the registration on any failure. */
+	for (i = 0; i < ARRAY_SIZE(mthca_dev_attributes); ++i) {
+		ret = device_create_file(&dev->ib_dev.dev,
+					 mthca_dev_attributes[i]);
+		if (ret) {
+			ib_unregister_device(&dev->ib_dev);
+			return ret;
+		}
+	}
+
+	mthca_start_catas_poll(dev);
+
+	return 0;
+}
+
+/*
+ * Undo mthca_register_device(): stop catastrophic-error polling first,
+ * then unregister the device from the IB core.
+ */
+void mthca_unregister_device(struct mthca_dev *dev)
+{
+	struct ib_device *ibdev = &dev->ib_dev;
+
+	mthca_stop_catas_poll(dev);
+	ib_unregister_device(ibdev);
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.h b/sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.h
new file mode 100644
index 0000000..c621f87
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.h
@@ -0,0 +1,343 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MTHCA_PROVIDER_H
+#define MTHCA_PROVIDER_H
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_pack.h>
+
+#define MTHCA_MPT_FLAG_ATOMIC        (1 << 14)	/* NOTE(review): presumably MPT access-right bits -- confirm at use sites */
+#define MTHCA_MPT_FLAG_REMOTE_WRITE  (1 << 13)
+#define MTHCA_MPT_FLAG_REMOTE_READ   (1 << 12)
+#define MTHCA_MPT_FLAG_LOCAL_WRITE   (1 << 11)
+#define MTHCA_MPT_FLAG_LOCAL_READ    (1 << 10)
+
+struct mthca_buf_list {
+	void *buf;	/* CPU address of the chunk; WQE lookups add a byte offset to it */
+	DECLARE_PCI_UNMAP_ADDR(mapping)	/* NOTE(review): presumably the DMA unmap cookie, macro supplying its own ';' -- confirm */
+};
+
+union mthca_buf {
+	struct mthca_buf_list direct;	/* selected when the owner's is_direct is nonzero */
+	struct mthca_buf_list *page_list;	/* otherwise: array indexed by (byte offset >> PAGE_SHIFT) */
+};
+
+struct mthca_uar {
+	unsigned long pfn;	/* NOTE(review): page frame number, going by the name -- confirm */
+	int           index;	/* programmed into the QP context usr_page field on modify-QP */
+};
+
+struct mthca_user_db_table;
+
+struct mthca_ucontext {
+	struct ib_ucontext          ibucontext;	/* embedded core object; to_mucontext() maps back from it */
+	struct mthca_uar            uar;	/* uar.index becomes usr_page for QPs owned by this context */
+	struct mthca_user_db_table *db_tab;
+	int			    reg_mr_warned;	/* NOTE(review): looks like a warn-once flag -- confirm */
+};
+
+struct mthca_mtt;
+
+struct mthca_mr {
+	struct ib_mr      ibmr;	/* embedded core MR (see to_mmr()); a QP's mr.ibmr.lkey is its wqe_lkey */
+	struct ib_umem   *umem;
+	struct mthca_mtt *mtt;
+};
+
+struct mthca_fmr {
+	struct ib_fmr      ibmr;	/* embedded core FMR; to_mfmr() maps back from it */
+	struct ib_fmr_attr attr;
+	struct mthca_mtt  *mtt;
+	int                maps;
+	union {
+		struct {
+			struct mthca_mpt_entry __iomem *mpt;
+			u64 __iomem *mtts;
+		} tavor;	/* Tavor variant: both tables are reached through __iomem pointers */
+		struct {
+			struct mthca_mpt_entry *mpt;
+			__be64 *mtts;
+			dma_addr_t dma_handle;
+		} arbel;	/* Arbel variant: ordinary pointers plus a DMA handle */
+	} mem;
+};
+
+struct mthca_pd {
+	struct ib_pd    ibpd;	/* embedded core PD; to_mpd() maps back from it */
+	u32             pd_num;	/* written to the QP context pd field on modify-QP */
+	atomic_t        sqp_count;
+	struct mthca_mr ntmr;
+	int             privileged;
+};
+
+struct mthca_eq {	/* NOTE(review): per-event-queue state, going by the name -- confirm */
+	struct mthca_dev      *dev;
+	int                    eqn;
+	u32                    eqn_mask;
+	u32                    cons_index;
+	u16                    msi_x_vector;
+	u16                    msi_x_entry;
+	int                    have_irq;
+	int                    nent;
+	struct mthca_buf_list *page_list;
+	struct mthca_mr        mr;
+};
+
+struct mthca_av;
+
+enum mthca_ah_type {	/* stored in struct mthca_ah.type; NOTE(review): seems to name where the AV is allocated -- confirm */
+	MTHCA_AH_ON_HCA,
+	MTHCA_AH_PCI_POOL,
+	MTHCA_AH_KMALLOC
+};
+
+struct mthca_ah {
+	struct ib_ah       ibah;	/* embedded core AH; to_mah() maps back from it */
+	enum mthca_ah_type type;
+	u32                key;
+	struct mthca_av   *av;
+	dma_addr_t         avdma;	/* NOTE(review): presumably the DMA address matching av -- confirm */
+};
+
+/*
+ * Quick description of our CQ/QP locking scheme:
+ *
+ * We have one global lock that protects dev->cq/qp_table.  Each
+ * struct mthca_cq/qp also has its own lock.  An individual qp lock
+ * may be taken inside of an individual cq lock.  Both cqs attached to
+ * a qp may be locked, with the cq with the lower cqn locked first.
+ * No other nesting should be done.
+ *
+ * Each struct mthca_cq/qp also has a ref count, protected by the
+ * corresponding table lock.  The pointer from the cq/qp_table to the
+ * struct counts as one reference.  This reference also is good for
+ * access through the consumer API, so modifying the CQ/QP etc doesn't
+ * need to take another reference.  Access to a QP because of a
+ * completion being polled does not need a reference either.
+ *
+ * Finally, each struct mthca_cq/qp has a wait_queue_head_t for the
+ * destroy function to sleep on.
+ *
+ * This means that access from the consumer API requires nothing but
+ * taking the struct's lock.
+ *
+ * Access because of a completion event should go as follows:
+ * - lock cq/qp_table and look up struct
+ * - increment ref count in struct
+ * - drop cq/qp_table lock
+ * - lock struct, do your thing, and unlock struct
+ * - decrement ref count; if zero, wake up waiters
+ *
+ * To destroy a CQ/QP, we can do the following:
+ * - lock cq/qp_table
+ * - remove pointer and decrement ref count
+ * - unlock cq/qp_table lock
+ * - wait_event until ref count is zero
+ *
+ * It is the consumer's responsibility to make sure that no QP
+ * operations (WQE posting or state modification) are pending when a
+ * QP is destroyed.  Also, the consumer must make sure that calls to
+ * qp_modify are serialized.  Similarly, the consumer is responsible
+ * for ensuring that no CQ resize operations are pending when a CQ
+ * is destroyed.
+ *
+ * Possible optimizations (wait for profile data to see if/where we
+ * have locks bouncing between CPUs):
+ * - split cq/qp table lock into n separate (cache-aligned) locks,
+ *   indexed (say) by the page in the table
+ * - split QP struct lock into three (one for common info, one for the
+ *   send queue and one for the receive queue)
+ */
+
+struct mthca_cq_buf {
+	union mthca_buf		queue;
+	struct mthca_mr		mr;
+	int			is_direct;	/* NOTE(review): presumably picks queue.direct vs queue.page_list, as for QPs -- confirm */
+};
+
+struct mthca_cq_resize {	/* pointed to by struct mthca_cq.resize_buf */
+	struct mthca_cq_buf	buf;
+	int			cqe;
+	enum {
+		CQ_RESIZE_ALLOC,
+		CQ_RESIZE_READY,
+		CQ_RESIZE_SWAPPED
+	}			state;	/* NOTE(review): progress of a resize, going by the names -- confirm */
+};
+
+struct mthca_cq {
+	struct ib_cq		ibcq;	/* embedded core CQ; to_mcq() maps back from it */
+	spinlock_t		lock;	/* per-CQ lock from the locking scheme described above */
+	int			refcount;	/* protected by the CQ table lock, per the locking scheme above */
+	int			cqn;	/* written to cqn_snd / cqn_rcv of a QP context */
+	u32			cons_index;
+	struct mthca_cq_buf	buf;
+	struct mthca_cq_resize *resize_buf;
+	int			is_kernel;
+
+	/* Next fields are Arbel only */
+	int			set_ci_db_index;
+	__be32		       *set_ci_db;
+	int			arm_db_index;
+	__be32		       *arm_db;
+	int			arm_sn;
+
+	wait_queue_head_t	wait;	/* the destroy path sleeps here until refcount reaches zero */
+	struct mutex		mutex;
+};
+
+struct mthca_srq {
+	struct ib_srq		ibsrq;	/* embedded core SRQ; to_msrq() maps back from it */
+	spinlock_t		lock;
+	int			refcount;
+	int			srqn;	/* OR-ed with (1 << 24) into the srqn field of an attached QP's context */
+	int			max;
+	int			max_gs;
+	int			wqe_shift;
+	int			first_free;
+	int			last_free;
+	u16			counter;  /* Arbel only */
+	int			db_index; /* Arbel only */
+	__be32		       *db;       /* Arbel only */
+	void		       *last;
+
+	int			is_direct;	/* NOTE(review): presumably picks queue.direct vs queue.page_list, as for QPs -- confirm */
+	u64		       *wrid;
+	union mthca_buf		queue;
+	struct mthca_mr		mr;
+
+	wait_queue_head_t	wait;
+	struct mutex		mutex;
+};
+
+struct mthca_wq {
+	spinlock_t lock;
+	int        max;	/* reported as cap.max_send_wr / max_recv_wr; ilog2(max) is programmed on mem-free HCAs */
+	unsigned   next_ind;	/* set to 0 by mthca_wq_reset() */
+	unsigned   last_comp;	/* set to max - 1 by mthca_wq_reset() */
+	unsigned   head;	/* set to 0 by mthca_wq_reset() */
+	unsigned   tail;	/* set to 0 by mthca_wq_reset() */
+	void      *last;
+	int        max_gs;	/* reported as cap.max_send_sge / max_recv_sge */
+	int        wqe_shift;	/* entry n lives at byte offset n << wqe_shift within its queue */
+
+	int        db_index;	/* Arbel only */
+	__be32    *db;
+};
+
+struct mthca_qp {
+	struct ib_qp           ibqp;	/* embedded core QP; to_mqp() maps back from it */
+	int                    refcount;	/* changed only under dev->qp_table.lock */
+	u32                    qpn;
+	int                    is_direct;	/* nonzero: WQEs in queue.direct; zero: in queue.page_list */
+	u8                     port; /* for SQP and memfree use only */
+	u8                     alt_port; /* for memfree use only */
+	u8                     transport;	/* one of the RC/UC/UD/RD/RDEE/MLX ids from mthca_qp.c */
+	u8                     state;	/* cached enum ib_qp_state; refreshed by mthca_query_qp() */
+	u8                     atomic_rd_en;	/* cached access flags, used when IB_QP_ACCESS_FLAGS is not given */
+	u8                     resp_depth;	/* cached responder depth, used when IB_QP_MAX_DEST_RD_ATOMIC is not given */
+
+	struct mthca_mr        mr;	/* mr.ibmr.lkey is programmed as the context wqe_lkey */
+
+	struct mthca_wq        rq;
+	struct mthca_wq        sq;
+	enum ib_sig_type       sq_policy;
+	int                    send_wqe_offset;	/* byte offset of the first send WQE inside the WQE buffer */
+	int                    max_inline_data;
+
+	u64                   *wrid;
+	union mthca_buf	       queue;	/* one buffer holding receive WQEs, then send WQEs */
+
+	wait_queue_head_t      wait;	/* woken when refcount drops to zero */
+	struct mutex	       mutex;	/* held across mthca_query_qp() */
+};
+
+struct mthca_sqp {
+	struct mthca_qp qp;	/* embedded QP; to_msqp() maps back from it */
+	int             pkey_index;	/* cached by store_attrs() when IB_QP_PKEY_INDEX is set */
+	u32             qkey;	/* cached by store_attrs() when IB_QP_QKEY is set */
+	u32             send_psn;	/* cached by store_attrs() when IB_QP_SQ_PSN is set */
+	struct ib_ud_header ud_header;
+	int             header_buf_size;
+	void           *header_buf;
+	dma_addr_t      header_dma;	/* NOTE(review): presumably the DMA address of header_buf -- confirm */
+};
+
+static inline struct mthca_ucontext *to_mucontext(struct ib_ucontext *ibucontext)
+{	/* ibucontext must be the member embedded in a struct mthca_ucontext */
+	return container_of(ibucontext, struct mthca_ucontext, ibucontext);
+}
+
+static inline struct mthca_fmr *to_mfmr(struct ib_fmr *ibmr)
+{	/* ibmr must be the member embedded in a struct mthca_fmr */
+	return container_of(ibmr, struct mthca_fmr, ibmr);
+}
+
+static inline struct mthca_mr *to_mmr(struct ib_mr *ibmr)
+{	/* ibmr must be the member embedded in a struct mthca_mr */
+	return container_of(ibmr, struct mthca_mr, ibmr);
+}
+
+static inline struct mthca_pd *to_mpd(struct ib_pd *ibpd)
+{	/* ibpd must be the member embedded in a struct mthca_pd */
+	return container_of(ibpd, struct mthca_pd, ibpd);
+}
+
+static inline struct mthca_ah *to_mah(struct ib_ah *ibah)
+{	/* ibah must be the member embedded in a struct mthca_ah */
+	return container_of(ibah, struct mthca_ah, ibah);
+}
+
+static inline struct mthca_cq *to_mcq(struct ib_cq *ibcq)
+{	/* ibcq must be the member embedded in a struct mthca_cq */
+	return container_of(ibcq, struct mthca_cq, ibcq);
+}
+
+static inline struct mthca_srq *to_msrq(struct ib_srq *ibsrq)
+{	/* ibsrq must be the member embedded in a struct mthca_srq */
+	return container_of(ibsrq, struct mthca_srq, ibsrq);
+}
+
+static inline struct mthca_qp *to_mqp(struct ib_qp *ibqp)
+{	/* ibqp must be the member embedded in a struct mthca_qp */
+	return container_of(ibqp, struct mthca_qp, ibqp);
+}
+
+static inline struct mthca_sqp *to_msqp(struct mthca_qp *qp)
+{	/* only valid when qp is the member embedded in a struct mthca_sqp */
+	return container_of(qp, struct mthca_sqp, qp);
+}
+
+#endif /* MTHCA_PROVIDER_H */
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_qp.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_qp.c
new file mode 100644
index 0000000..4a4d133
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_qp.c
@@ -0,0 +1,2332 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+
+#include <asm/io.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_pack.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+#include "mthca_memfree.h"
+#include "mthca_wqe.h"
+
+enum {
+	MTHCA_MAX_DIRECT_QP_SIZE = 4 * PAGE_SIZE,
+	MTHCA_ACK_REQ_FREQ       = 10,	/* shifted to bit 28 of the context params1 word */
+	MTHCA_FLIGHT_LIMIT       = 9,	/* shifted to bit 24 of the context params1 word */
+	MTHCA_UD_HEADER_SIZE     = 72, /* largest UD header possible */
+	MTHCA_INLINE_HEADER_SIZE = 4,  /* data segment overhead for inline */
+	MTHCA_INLINE_CHUNK_SIZE  = 16  /* inline data segment chunk */
+};
+
+enum {	/* hardware QP state codes, carried in bits 31:28 of the context flags word */
+	MTHCA_QP_STATE_RST  = 0,
+	MTHCA_QP_STATE_INIT = 1,
+	MTHCA_QP_STATE_RTR  = 2,
+	MTHCA_QP_STATE_RTS  = 3,
+	MTHCA_QP_STATE_SQE  = 4,
+	MTHCA_QP_STATE_SQD  = 5,
+	MTHCA_QP_STATE_ERR  = 6,
+	MTHCA_QP_STATE_DRAINING = 7	/* reported to verbs as IB_QPS_SQD with sq_draining set */
+};
+
+enum {	/* transport codes returned by to_mthca_st(); shifted to bit 16 of the context flags word */
+	MTHCA_QP_ST_RC 	= 0x0,
+	MTHCA_QP_ST_UC 	= 0x1,
+	MTHCA_QP_ST_RD 	= 0x2,
+	MTHCA_QP_ST_UD 	= 0x3,
+	MTHCA_QP_ST_MLX = 0x7
+};
+
+enum {	/* path-migration state, carried in bits 12:11 of the context flags word */
+	MTHCA_QP_PM_MIGRATED = 0x3,
+	MTHCA_QP_PM_ARMED    = 0x0,
+	MTHCA_QP_PM_REARM    = 0x1
+};
+
+enum {
+	/* bit in qp_context->flags */
+	MTHCA_QP_BIT_DE  = 1 <<  8,
+	/* bits in qp_context->params1 */
+	MTHCA_QP_BIT_SRE = 1 << 15,
+	MTHCA_QP_BIT_SWE = 1 << 14,
+	MTHCA_QP_BIT_SAE = 1 << 13,
+	MTHCA_QP_BIT_SIC = 1 <<  4,
+	MTHCA_QP_BIT_SSC = 1 <<  3,
+	/* bits in qp_context->params2 (same bit positions as the params1 group) */
+	MTHCA_QP_BIT_RRE = 1 << 15,
+	MTHCA_QP_BIT_RWE = 1 << 14,
+	MTHCA_QP_BIT_RAE = 1 << 13,
+	MTHCA_QP_BIT_RIC = 1 <<  4,
+	MTHCA_QP_BIT_RSC = 1 <<  3
+};
+
+enum {
+	MTHCA_SEND_DOORBELL_FENCE = 1 << 5	/* NOTE(review): presumably OR-ed into a send doorbell word -- confirm at use site */
+};
+
+struct mthca_qp_path {
+	__be32 port_pkey;	/* port number from bit 24 up, P_Key index in the low bits */
+	u8     rnr_retry;	/* RNR retry count stored as (value << 5) */
+	u8     g_mylmc;	/* bit 7: GRH present; bits 6:0: source path bits */
+	__be16 rlid;	/* destination LID */
+	u8     ackto;	/* ACK timeout stored as (value << 3) */
+	u8     mgid_index;	/* source GID index */
+	u8     static_rate;	/* from mthca_get_rate(); low 4 bits are decoded by mthca_rate_to_ib() */
+	u8     hop_limit;
+	__be32 sl_tclass_flowlabel;	/* SL in 31:28, traffic class in 27:20, flow label in 19:0 */
+	u8     rgid[16];	/* destination GID, raw bytes */
+} __attribute__((packed));
+
+struct mthca_qp_context {
+	__be32 flags;	/* state 31:28, transport from bit 16, migration state 12:11, DE bit 8 */
+	__be32 tavor_sched_queue; /* Reserved on Arbel */
+	u8     mtu_msgmax;	/* path MTU in bits 7:5; low 5 bits are set to 11 or 31 on modify */
+	u8     rq_size_stride;	/* Reserved on Tavor */
+	u8     sq_size_stride;	/* Reserved on Tavor */
+	u8     rlkey_arbel_sched_queue;	/* Reserved on Tavor */
+	__be32 usr_page;	/* UAR index of the owning context, or of the driver UAR */
+	__be32 local_qpn;
+	__be32 remote_qpn;	/* destination QPN; only the low 24 bits are read back */
+	u32    reserved1[2];
+	struct mthca_qp_path pri_path;
+	struct mthca_qp_path alt_path;
+	__be32 rdd;
+	__be32 pd;
+	__be32 wqe_base;
+	__be32 wqe_lkey;
+	__be32 params1;	/* MTHCA_QP_BIT_S* bits, retry count at bit 16, log2 max_rd_atomic at bit 21 */
+	__be32 reserved2;
+	__be32 next_send_psn;	/* send PSN; low 24 bits are read back */
+	__be32 cqn_snd;
+	__be32 snd_wqe_base_l;	/* Next send WQE on Tavor */
+	__be32 snd_db_index;	/* (debugging only entries) */
+	__be32 last_acked_psn;
+	__be32 ssn;
+	__be32 params2;	/* MTHCA_QP_BIT_R* bits, log2 max_dest_rd_atomic at bit 21 */
+	__be32 rnr_nextrecvpsn;	/* min RNR timer from bit 24, receive PSN in the low 24 bits */
+	__be32 ra_buff_indx;
+	__be32 cqn_rcv;
+	__be32 rcv_wqe_base_l;	/* Next recv WQE on Tavor */
+	__be32 rcv_db_index;	/* (debugging only entries) */
+	__be32 qkey;
+	__be32 srqn;	/* (1 << 24) | SRQ number when an SRQ is attached */
+	__be32 rmsn;
+	__be16 rq_wqe_counter;	/* reserved on Tavor */
+	__be16 sq_wqe_counter;	/* reserved on Tavor */
+	u32    reserved3[18];
+} __attribute__((packed));
+
+struct mthca_qp_param {
+	__be32 opt_param_mask;	/* OR of MTHCA_QP_OPTPAR_* bits, stored big-endian */
+	u32    reserved1;
+	struct mthca_qp_context context;
+	u32    reserved2[62];
+} __attribute__((packed));	/* overlaid on mailbox->buf by the query and modify paths */
+
+enum {	/* bits of mthca_qp_param.opt_param_mask; each is set alongside the context field it covers */
+	MTHCA_QP_OPTPAR_ALT_ADDR_PATH     = 1 << 0,
+	MTHCA_QP_OPTPAR_RRE               = 1 << 1,
+	MTHCA_QP_OPTPAR_RAE               = 1 << 2,
+	MTHCA_QP_OPTPAR_RWE               = 1 << 3,
+	MTHCA_QP_OPTPAR_PKEY_INDEX        = 1 << 4,
+	MTHCA_QP_OPTPAR_Q_KEY             = 1 << 5,
+	MTHCA_QP_OPTPAR_RNR_TIMEOUT       = 1 << 6,
+	MTHCA_QP_OPTPAR_PRIMARY_ADDR_PATH = 1 << 7,
+	MTHCA_QP_OPTPAR_SRA_MAX           = 1 << 8,
+	MTHCA_QP_OPTPAR_RRA_MAX           = 1 << 9,
+	MTHCA_QP_OPTPAR_PM_STATE          = 1 << 10,
+	MTHCA_QP_OPTPAR_PORT_NUM          = 1 << 11,
+	MTHCA_QP_OPTPAR_RETRY_COUNT       = 1 << 12,
+	MTHCA_QP_OPTPAR_ALT_RNR_RETRY     = 1 << 13,
+	MTHCA_QP_OPTPAR_ACK_TIMEOUT       = 1 << 14,
+	MTHCA_QP_OPTPAR_RNR_RETRY         = 1 << 15,
+	MTHCA_QP_OPTPAR_SCHED_QUEUE       = 1 << 16
+};
+
+static const u8 mthca_opcode[] = {	/* indexed by IB_WR_* opcode; slots not listed below are zero */
+	[IB_WR_SEND]                 = MTHCA_OPCODE_SEND,
+	[IB_WR_SEND_WITH_IMM]        = MTHCA_OPCODE_SEND_IMM,
+	[IB_WR_RDMA_WRITE]           = MTHCA_OPCODE_RDMA_WRITE,
+	[IB_WR_RDMA_WRITE_WITH_IMM]  = MTHCA_OPCODE_RDMA_WRITE_IMM,
+	[IB_WR_RDMA_READ]            = MTHCA_OPCODE_RDMA_READ,
+	[IB_WR_ATOMIC_CMP_AND_SWP]   = MTHCA_OPCODE_ATOMIC_CS,
+	[IB_WR_ATOMIC_FETCH_AND_ADD] = MTHCA_OPCODE_ATOMIC_FA,
+};
+
+/* Nonzero iff the QP's number is one of the four QPNs starting at sqp_start. */
+static int is_sqp(struct mthca_dev *dev, struct mthca_qp *qp)
+{
+	return !(qp->qpn < dev->qp_table.sqp_start || qp->qpn > dev->qp_table.sqp_start + 3);
+}
+
+/* Nonzero iff the QP's number is one of the first two QPNs starting at sqp_start. */
+static int is_qp0(struct mthca_dev *dev, struct mthca_qp *qp)
+{
+	return !(qp->qpn < dev->qp_table.sqp_start || qp->qpn > dev->qp_table.sqp_start + 1);
+}
+
+static void *get_recv_wqe(struct mthca_qp *qp, int n)
+{
+	int wqe_off = n << qp->rq.wqe_shift;	/* byte offset of receive WQE n */
+	if (!qp->is_direct)	/* paged buffer: pick the page, then the offset inside it */
+		return qp->queue.page_list[wqe_off >> PAGE_SHIFT].buf +
+			(wqe_off & (PAGE_SIZE - 1));
+	return qp->queue.direct.buf + wqe_off;
+}
+
+/*
+ * Address of send WQE n: send entries start send_wqe_offset bytes into the
+ * QP's WQE buffer and are spaced (1 << sq.wqe_shift) bytes apart.
+ */
+static void *get_send_wqe(struct mthca_qp *qp, int n)
+{
+	int wqe_off = qp->send_wqe_offset + (n << qp->sq.wqe_shift);
+	if (!qp->is_direct)
+		return qp->queue.page_list[wqe_off >> PAGE_SHIFT].buf +
+			(wqe_off & (PAGE_SIZE - 1));
+	return qp->queue.direct.buf + wqe_off;
+}
+
+/* Return a work queue's indices to their initial values; last_comp starts at max - 1. */
+static void mthca_wq_reset(struct mthca_wq *wq)
+{
+	wq->head = wq->tail = 0;
+	wq->next_ind = 0;
+	wq->last_comp = wq->max - 1;
+}
+
+void mthca_qp_event(struct mthca_dev *dev, u32 qpn,
+		    enum ib_event_type event_type)
+{	/* Deliver an async event for QP number qpn to that QP's event handler, if it has one. */
+	struct mthca_qp *qp;
+	struct ib_event event;
+
+	spin_lock(&dev->qp_table.lock);	/* the table lock covers both the lookup and the refcount */
+	qp = mthca_array_get(&dev->qp_table.qp, qpn & (dev->limits.num_qps - 1));
+	if (qp)
+		++qp->refcount;	/* hold a reference while the handler runs without the lock */
+	spin_unlock(&dev->qp_table.lock);
+
+	if (!qp) {
+		mthca_warn(dev, "Async event %d for bogus QP %08x\n",
+			  (int) event_type, qpn);
+		return;
+	}
+
+	if (event_type == IB_EVENT_PATH_MIG)
+		qp->port = qp->alt_port;	/* on a path-migration event, alt_port becomes the recorded port */
+
+	event.device      = &dev->ib_dev;
+	event.event       = event_type;
+	event.element.qp  = &qp->ibqp;
+	if (qp->ibqp.event_handler)	/* the handler is optional */
+		qp->ibqp.event_handler(&event, qp->ibqp.qp_context);
+
+	spin_lock(&dev->qp_table.lock);
+	if (!--qp->refcount)
+		wake_up(&qp->wait);	/* last reference gone: wake whoever sleeps on qp->wait */
+	spin_unlock(&dev->qp_table.lock);
+}
+
+/*
+ * Translate a verbs QP state into the HCA state code; -1 if it has none.
+ */
+static int to_mthca_state(enum ib_qp_state ib_state)
+{
+	return ib_state == IB_QPS_RESET ? MTHCA_QP_STATE_RST  :
+	       ib_state == IB_QPS_INIT  ? MTHCA_QP_STATE_INIT :
+	       ib_state == IB_QPS_RTR   ? MTHCA_QP_STATE_RTR  :
+	       ib_state == IB_QPS_RTS   ? MTHCA_QP_STATE_RTS  :
+	       ib_state == IB_QPS_SQD   ? MTHCA_QP_STATE_SQD  :
+	       ib_state == IB_QPS_SQE   ? MTHCA_QP_STATE_SQE  :
+	       ib_state == IB_QPS_ERR   ? MTHCA_QP_STATE_ERR  : -1;
+}
+
+enum { RC, UC, UD, RD, RDEE, MLX, NUM_TRANS };	/* driver-internal transport ids, compared against qp->transport */
+
+/*
+ * Driver transport id -> MTHCA_QP_ST_* code; -1 for ids without one (e.g. RDEE).
+ */
+static int to_mthca_st(int transport)
+{
+	return transport == RC  ? MTHCA_QP_ST_RC  :
+	       transport == UC  ? MTHCA_QP_ST_UC  :
+	       transport == UD  ? MTHCA_QP_ST_UD  :
+	       transport == RD  ? MTHCA_QP_ST_RD  :
+	       transport == MLX ? MTHCA_QP_ST_MLX : -1;
+}
+
+static void store_attrs(struct mthca_sqp *sqp, const struct ib_qp_attr *attr,
+			int attr_mask)
+{
+	if (attr_mask & IB_QP_SQ_PSN)	/* cache only the attributes flagged in attr_mask */
+		sqp->send_psn = attr->sq_psn;
+	if (attr_mask & IB_QP_QKEY)
+		sqp->qkey = attr->qkey;
+	if (attr_mask & IB_QP_PKEY_INDEX)
+		sqp->pkey_index = attr->pkey_index;
+}
+
+static void init_port(struct mthca_dev *dev, int port)
+{
+	struct mthca_init_ib_param param;
+	u8 status = 0;	/* read below even when INIT_IB returns an error, so it must start defined */
+	int err;
+
+	/* Issue INIT_IB for @port using the capabilities in dev->limits; failures are only logged. */
+	memset(&param, 0, sizeof param);
+	param.port_width = dev->limits.port_width_cap;
+	param.vl_cap     = dev->limits.vl_cap;
+	param.mtu_cap    = dev->limits.mtu_cap;
+	param.gid_cap    = dev->limits.gid_table_len;
+	param.pkey_cap   = dev->limits.pkey_table_len;
+
+	err = mthca_INIT_IB(dev, &param, port, &status);
+	if (err)
+		mthca_warn(dev, "INIT_IB failed, return code %d.\n", err);
+	if (status)
+		mthca_warn(dev, "INIT_IB returned status %02x.\n", status);
+}
+
+/*
+ * Compute the RRE/RWE/RAE bits for the QP context params2 word, already
+ * converted to big-endian.  The responder depth and the access flags are
+ * taken from @attr when selected in @attr_mask and from the values cached
+ * in @qp otherwise.
+ */
+static __be32 get_hw_access_flags(struct mthca_qp *qp, const struct ib_qp_attr *attr,
+				  int attr_mask)
+{
+	u32 hw_bits = 0;
+	u32 ib_flags = (attr_mask & IB_QP_ACCESS_FLAGS) ?
+		attr->qp_access_flags : qp->atomic_rd_en;
+	u8 rd_depth = (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) ?
+		attr->max_dest_rd_atomic : qp->resp_depth;
+
+	/* A zero responder depth strips everything except remote write, */
+	/* so remote read and atomic are never enabled in that case. */
+	if (rd_depth == 0)
+		ib_flags &= IB_ACCESS_REMOTE_WRITE;
+
+	if (ib_flags & IB_ACCESS_REMOTE_WRITE)
+		hw_bits |= MTHCA_QP_BIT_RWE;
+	if (ib_flags & IB_ACCESS_REMOTE_READ)
+		hw_bits |= MTHCA_QP_BIT_RRE;
+	if (ib_flags & IB_ACCESS_REMOTE_ATOMIC)
+		hw_bits |= MTHCA_QP_BIT_RAE;
+
+	return cpu_to_be32(hw_bits);
+}
+
+/*
+ * HCA state code -> verbs QP state.  DRAINING maps to SQD; unknown codes give -1.
+ */
+static inline enum ib_qp_state to_ib_qp_state(int mthca_state)
+{
+	return mthca_state == MTHCA_QP_STATE_RST  ? IB_QPS_RESET :
+	       mthca_state == MTHCA_QP_STATE_INIT ? IB_QPS_INIT  :
+	       mthca_state == MTHCA_QP_STATE_RTR  ? IB_QPS_RTR   :
+	       mthca_state == MTHCA_QP_STATE_RTS  ? IB_QPS_RTS   :
+	       (mthca_state == MTHCA_QP_STATE_DRAINING ||
+		mthca_state == MTHCA_QP_STATE_SQD) ? IB_QPS_SQD  :
+	       mthca_state == MTHCA_QP_STATE_SQE  ? IB_QPS_SQE   :
+	       mthca_state == MTHCA_QP_STATE_ERR  ? IB_QPS_ERR   : -1;
+}
+
+/*
+ * Decode the migration-state code (0 armed, 1 rearm, 3 migrated); otherwise -1.
+ */
+static inline enum ib_mig_state to_ib_mig_state(int mthca_mig_state)
+{
+	return mthca_mig_state == 0 ? IB_MIG_ARMED :
+	       mthca_mig_state == 1 ? IB_MIG_REARM :
+	       mthca_mig_state == 3 ? IB_MIG_MIGRATED : -1;
+}
+
+/*
+ * Turn the RRE/RWE/RAE bits of a CPU-order params2 word into IB_ACCESS_REMOTE_* flags.
+ */
+static int to_ib_qp_access_flags(int mthca_flags)
+{
+	int ib_flags;
+
+	ib_flags = ((mthca_flags & MTHCA_QP_BIT_RRE) ? IB_ACCESS_REMOTE_READ : 0) |
+		   ((mthca_flags & MTHCA_QP_BIT_RWE) ? IB_ACCESS_REMOTE_WRITE : 0) |
+		   ((mthca_flags & MTHCA_QP_BIT_RAE) ? IB_ACCESS_REMOTE_ATOMIC : 0);
+
+	return ib_flags;
+}
+
+/* Decode a hardware QP path into *ib_ah_attr; for port 0 or a port above num_ports only port_num is set. */
+static void to_ib_ah_attr(struct mthca_dev *dev, struct ib_ah_attr *ib_ah_attr,
+				struct mthca_qp_path *path)
+{
+	u32 sl_tc_fl = be32_to_cpu(path->sl_tclass_flowlabel);
+
+	memset(ib_ah_attr, 0, sizeof *ib_ah_attr);
+	ib_ah_attr->port_num = (be32_to_cpu(path->port_pkey) >> 24) & 0x3;
+	if (ib_ah_attr->port_num == 0 || ib_ah_attr->port_num > dev->limits.num_ports)
+		return;
+
+	ib_ah_attr->dlid = be16_to_cpu(path->rlid);
+	ib_ah_attr->sl = sl_tc_fl >> 28;
+	ib_ah_attr->src_path_bits = path->g_mylmc & 0x7f;
+	ib_ah_attr->static_rate = mthca_rate_to_ib(dev, path->static_rate & 0xf,
+						   ib_ah_attr->port_num);
+	ib_ah_attr->ah_flags = (path->g_mylmc & (1 << 7)) ? IB_AH_GRH : 0;
+	if (!ib_ah_attr->ah_flags)
+		return;
+
+	ib_ah_attr->grh.sgid_index = path->mgid_index & (dev->limits.gid_table_len - 1);
+	ib_ah_attr->grh.hop_limit = path->hop_limit;
+	ib_ah_attr->grh.traffic_class = (sl_tc_fl >> 20) & 0xff;
+	ib_ah_attr->grh.flow_label = sl_tc_fl & 0xfffff;
+	memcpy(ib_ah_attr->grh.dgid.raw,
+		path->rgid, sizeof ib_ah_attr->grh.dgid.raw);
+}
+
+/*
+ * Fill @qp_attr and @qp_init_attr->cap for @ibqp.  A QP whose cached state
+ * is RESET is answered from software alone; otherwise the attributes are
+ * decoded from the context returned by QUERY_QP, and qp->state is refreshed.
+ * @qp_attr_mask is not consulted.  Returns 0, the mailbox/command error, or -EINVAL.
+ */
+int mthca_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask,
+		   struct ib_qp_init_attr *qp_init_attr)
+{
+	struct mthca_qp *qp = to_mqp(ibqp);
+	struct mthca_dev *dev = to_mdev(ibqp->device);
+	struct mthca_mailbox *mailbox = NULL;
+	struct mthca_qp_context *context;
+	struct mthca_qp_param *qp_param;
+	u32 flags, params1, params2, rnr_psn, pri_pkey;
+	int mthca_state, err = 0;
+	u8 status;
+
+	mutex_lock(&qp->mutex);
+
+	if (qp->state == IB_QPS_RESET) {
+		qp_attr->qp_state = IB_QPS_RESET;
+		goto done;
+	}
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox)) {
+		err = PTR_ERR(mailbox);
+		goto out;
+	}
+
+	err = mthca_QUERY_QP(dev, qp->qpn, 0, mailbox, &status);
+	if (err)
+		goto out_mailbox;
+	if (status) {
+		mthca_warn(dev, "QUERY_QP returned status %02x\n", status);
+		err = -EINVAL;
+		goto out_mailbox;
+	}
+
+	qp_param = mailbox->buf;
+	context = &qp_param->context;
+	flags = be32_to_cpu(context->flags);
+	params1 = be32_to_cpu(context->params1);
+	params2 = be32_to_cpu(context->params2);
+	rnr_psn = be32_to_cpu(context->rnr_nextrecvpsn);
+	pri_pkey = be32_to_cpu(context->pri_path.port_pkey);
+	mthca_state = flags >> 28;
+
+	qp->state = to_ib_qp_state(mthca_state);
+	qp_attr->qp_state = qp->state;
+	qp_attr->path_mtu = context->mtu_msgmax >> 5;
+	qp_attr->path_mig_state = to_ib_mig_state((flags >> 11) & 0x3);
+	qp_attr->qkey = be32_to_cpu(context->qkey);
+	qp_attr->rq_psn = rnr_psn & 0xffffff;
+	qp_attr->sq_psn = be32_to_cpu(context->next_send_psn) & 0xffffff;
+	qp_attr->dest_qp_num = be32_to_cpu(context->remote_qpn) & 0xffffff;
+	qp_attr->qp_access_flags = to_ib_qp_access_flags(params2);
+
+	if (qp->transport == RC || qp->transport == UC) {
+		to_ib_ah_attr(dev, &qp_attr->ah_attr, &context->pri_path);
+		to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context->alt_path);
+		qp_attr->alt_pkey_index = be32_to_cpu(context->alt_path.port_pkey) & 0x7f;
+		qp_attr->alt_port_num = qp_attr->alt_ah_attr.port_num;
+	}
+
+	qp_attr->pkey_index = pri_pkey & 0x7f;
+	qp_attr->port_num = (pri_pkey >> 24) & 0x3;
+	/* en_sqd_async_notify only matters to modify-QP, so it is left untouched here */
+	qp_attr->sq_draining = mthca_state == MTHCA_QP_STATE_DRAINING;
+	qp_attr->max_rd_atomic = 1 << ((params1 >> 21) & 0x7);
+	qp_attr->max_dest_rd_atomic = 1 << ((params2 >> 21) & 0x7);
+	qp_attr->min_rnr_timer = (rnr_psn >> 24) & 0x1f;
+	qp_attr->timeout = context->pri_path.ackto >> 3;
+	qp_attr->retry_cnt = (params1 >> 16) & 0x7;
+	qp_attr->rnr_retry = context->pri_path.rnr_retry >> 5;
+	qp_attr->alt_timeout = context->alt_path.ackto >> 3;
+
+done:
+	qp_attr->cur_qp_state = qp_attr->qp_state;
+	qp_attr->cap.max_send_wr = qp->sq.max;
+	qp_attr->cap.max_recv_wr = qp->rq.max;
+	qp_attr->cap.max_send_sge = qp->sq.max_gs;
+	qp_attr->cap.max_recv_sge = qp->rq.max_gs;
+	qp_attr->cap.max_inline_data = qp->max_inline_data;
+	qp_init_attr->cap = qp_attr->cap;
+
+out_mailbox:
+	mthca_free_mailbox(dev, mailbox);	/* mailbox is still NULL when the RESET shortcut was taken */
+out:
+	mutex_unlock(&qp->mutex);
+	return err;
+}
+
+/* Encode @ah into the hardware path @path; returns 0, or -1 when the GRH sgid_index is out of range. */
+static int mthca_path_set(struct mthca_dev *dev, const struct ib_ah_attr *ah,
+			  struct mthca_qp_path *path, u8 port)
+{
+	path->g_mylmc     = ah->src_path_bits & 0x7f;
+	path->rlid        = cpu_to_be16(ah->dlid);
+	path->static_rate = mthca_get_rate(dev, ah->static_rate, port);
+
+	if (!(ah->ah_flags & IB_AH_GRH)) {
+		path->sl_tclass_flowlabel = cpu_to_be32(ah->sl << 28);
+		return 0;
+	}
+
+	if (ah->grh.sgid_index >= dev->limits.gid_table_len) {
+		mthca_dbg(dev, "sgid_index (%u) too large. max is %d\n",
+			  ah->grh.sgid_index, dev->limits.gid_table_len-1);
+		return -1;
+	}
+	path->g_mylmc   |= 1 << 7;	/* bit 7 flags that the GRH fields below are valid */
+	path->mgid_index = ah->grh.sgid_index;
+	path->hop_limit  = ah->grh.hop_limit;
+	path->sl_tclass_flowlabel = cpu_to_be32((ah->sl << 28) |
+						(ah->grh.traffic_class << 20) |
+						(ah->grh.flow_label));
+	memcpy(path->rgid, ah->grh.dgid.raw, 16);
+	return 0;
+}
+
+static int __mthca_modify_qp(struct ib_qp *ibqp,
+			     const struct ib_qp_attr *attr, int attr_mask,
+			     enum ib_qp_state cur_state, enum ib_qp_state new_state)
+{
+	struct mthca_dev *dev = to_mdev(ibqp->device);
+	struct mthca_qp *qp = to_mqp(ibqp);
+	struct mthca_mailbox *mailbox;
+	struct mthca_qp_param *qp_param;
+	struct mthca_qp_context *qp_context;
+	u32 sqd_event = 0;
+	u8 status;
+	int err = -EINVAL;
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox)) {
+		err = PTR_ERR(mailbox);
+		goto out;
+	}
+	qp_param = mailbox->buf;
+	qp_context = &qp_param->context;
+	memset(qp_param, 0, sizeof *qp_param);
+
+	qp_context->flags      = cpu_to_be32((to_mthca_state(new_state) << 28) |
+					     (to_mthca_st(qp->transport) << 16));
+	qp_context->flags     |= cpu_to_be32(MTHCA_QP_BIT_DE);
+	if (!(attr_mask & IB_QP_PATH_MIG_STATE))
+		qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_MIGRATED << 11);
+	else {
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PM_STATE);
+		switch (attr->path_mig_state) {
+		case IB_MIG_MIGRATED:
+			qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_MIGRATED << 11);
+			break;
+		case IB_MIG_REARM:
+			qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_REARM << 11);
+			break;
+		case IB_MIG_ARMED:
+			qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_ARMED << 11);
+			break;
+		}
+	}
+
+	/* leave tavor_sched_queue as 0 */
+
+	if (qp->transport == MLX || qp->transport == UD)
+		qp_context->mtu_msgmax = (IB_MTU_2048 << 5) | 11;
+	else if (attr_mask & IB_QP_PATH_MTU) {
+		if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_2048) {
+			mthca_dbg(dev, "path MTU (%u) is invalid\n",
+				  attr->path_mtu);
+			goto out_mailbox;
+		}
+		qp_context->mtu_msgmax = (attr->path_mtu << 5) | 31;
+	}
+
+	if (mthca_is_memfree(dev)) {
+		if (qp->rq.max)
+			qp_context->rq_size_stride = ilog2(qp->rq.max) << 3;
+		qp_context->rq_size_stride |= qp->rq.wqe_shift - 4;
+
+		if (qp->sq.max)
+			qp_context->sq_size_stride = ilog2(qp->sq.max) << 3;
+		qp_context->sq_size_stride |= qp->sq.wqe_shift - 4;
+	}
+
+	/* leave arbel_sched_queue as 0 */
+
+	if (qp->ibqp.uobject)
+		qp_context->usr_page =
+			cpu_to_be32(to_mucontext(qp->ibqp.uobject->context)->uar.index);
+	else
+		qp_context->usr_page = cpu_to_be32(dev->driver_uar.index);
+	qp_context->local_qpn  = cpu_to_be32(qp->qpn);
+	if (attr_mask & IB_QP_DEST_QPN) {
+		qp_context->remote_qpn = cpu_to_be32(attr->dest_qp_num);
+	}
+
+	if (qp->transport == MLX)
+		qp_context->pri_path.port_pkey |=
+			cpu_to_be32(qp->port << 24);
+	else {
+		if (attr_mask & IB_QP_PORT) {
+			qp_context->pri_path.port_pkey |=
+				cpu_to_be32(attr->port_num << 24);
+			qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PORT_NUM);
+		}
+	}
+
+	if (attr_mask & IB_QP_PKEY_INDEX) {
+		qp_context->pri_path.port_pkey |=
+			cpu_to_be32(attr->pkey_index);
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PKEY_INDEX);
+	}
+
+	if (attr_mask & IB_QP_RNR_RETRY) {
+		qp_context->alt_path.rnr_retry = qp_context->pri_path.rnr_retry =
+			attr->rnr_retry << 5;
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RNR_RETRY |
+							MTHCA_QP_OPTPAR_ALT_RNR_RETRY);
+	}
+
+	if (attr_mask & IB_QP_AV) {
+		if (mthca_path_set(dev, &attr->ah_attr, &qp_context->pri_path,
+				   attr_mask & IB_QP_PORT ? attr->port_num : qp->port))
+			goto out_mailbox;
+
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PRIMARY_ADDR_PATH);
+	}
+
+	if (ibqp->qp_type == IB_QPT_RC &&
+	    cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) {
+		u8 sched_queue = ibqp->uobject ? 0x2 : 0x1;
+
+		if (mthca_is_memfree(dev))
+			qp_context->rlkey_arbel_sched_queue |= sched_queue;
+		else
+			qp_context->tavor_sched_queue |= cpu_to_be32(sched_queue);
+
+		qp_param->opt_param_mask |=
+			cpu_to_be32(MTHCA_QP_OPTPAR_SCHED_QUEUE);
+	}
+
+	if (attr_mask & IB_QP_TIMEOUT) {
+		qp_context->pri_path.ackto = attr->timeout << 3;
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_ACK_TIMEOUT);
+	}
+
+	if (attr_mask & IB_QP_ALT_PATH) {
+		if (attr->alt_pkey_index >= dev->limits.pkey_table_len) {
+			mthca_dbg(dev, "Alternate P_Key index (%u) too large. max is %d\n",
+				  attr->alt_pkey_index, dev->limits.pkey_table_len-1);
+			goto out_mailbox;
+		}
+
+		if (attr->alt_port_num == 0 || attr->alt_port_num > dev->limits.num_ports) {
+			mthca_dbg(dev, "Alternate port number (%u) is invalid\n",
+				attr->alt_port_num);
+			goto out_mailbox;
+		}
+
+		if (mthca_path_set(dev, &attr->alt_ah_attr, &qp_context->alt_path,
+				   attr->alt_ah_attr.port_num))
+			goto out_mailbox;
+
+		qp_context->alt_path.port_pkey |= cpu_to_be32(attr->alt_pkey_index |
+							      attr->alt_port_num << 24);
+		qp_context->alt_path.ackto = attr->alt_timeout << 3;
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_ALT_ADDR_PATH);
+	}
+
+	/* leave rdd as 0 */
+	qp_context->pd         = cpu_to_be32(to_mpd(ibqp->pd)->pd_num);
+	/* leave wqe_base as 0 (we always create an MR based at 0 for WQs) */
+	qp_context->wqe_lkey   = cpu_to_be32(qp->mr.ibmr.lkey);
+	qp_context->params1    = cpu_to_be32((MTHCA_ACK_REQ_FREQ << 28) |
+					     (MTHCA_FLIGHT_LIMIT << 24) |
+					     MTHCA_QP_BIT_SWE);
+	if (qp->sq_policy == IB_SIGNAL_ALL_WR)
+		qp_context->params1 |= cpu_to_be32(MTHCA_QP_BIT_SSC);
+	if (attr_mask & IB_QP_RETRY_CNT) {
+		qp_context->params1 |= cpu_to_be32(attr->retry_cnt << 16);
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RETRY_COUNT);
+	}
+
+	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) {
+		if (attr->max_rd_atomic) {
+			qp_context->params1 |=
+				cpu_to_be32(MTHCA_QP_BIT_SRE |
+					    MTHCA_QP_BIT_SAE);
+			qp_context->params1 |=
+				cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21);
+		}
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_SRA_MAX);
+	}
+
+	if (attr_mask & IB_QP_SQ_PSN)
+		qp_context->next_send_psn = cpu_to_be32(attr->sq_psn);
+	qp_context->cqn_snd = cpu_to_be32(to_mcq(ibqp->send_cq)->cqn);
+
+	if (mthca_is_memfree(dev)) {
+		qp_context->snd_wqe_base_l = cpu_to_be32(qp->send_wqe_offset);
+		qp_context->snd_db_index   = cpu_to_be32(qp->sq.db_index);
+	}
+
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
+		if (attr->max_dest_rd_atomic)
+			qp_context->params2 |=
+				cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21);
+
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RRA_MAX);
+	}
+
+	if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) {
+		qp_context->params2      |= get_hw_access_flags(qp, attr, attr_mask);
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RWE |
+							MTHCA_QP_OPTPAR_RRE |
+							MTHCA_QP_OPTPAR_RAE);
+	}
+
+	qp_context->params2 |= cpu_to_be32(MTHCA_QP_BIT_RSC);
+
+	if (ibqp->srq)
+		qp_context->params2 |= cpu_to_be32(MTHCA_QP_BIT_RIC);
+
+	if (attr_mask & IB_QP_MIN_RNR_TIMER) {
+		qp_context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24);
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RNR_TIMEOUT);
+	}
+	if (attr_mask & IB_QP_RQ_PSN)
+		qp_context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn);
+
+	qp_context->ra_buff_indx =
+		cpu_to_be32(dev->qp_table.rdb_base +
+			    ((qp->qpn & (dev->limits.num_qps - 1)) * MTHCA_RDB_ENTRY_SIZE <<
+			     dev->qp_table.rdb_shift));
+
+	qp_context->cqn_rcv = cpu_to_be32(to_mcq(ibqp->recv_cq)->cqn);
+
+	if (mthca_is_memfree(dev))
+		qp_context->rcv_db_index   = cpu_to_be32(qp->rq.db_index);
+
+	if (attr_mask & IB_QP_QKEY) {
+		qp_context->qkey = cpu_to_be32(attr->qkey);
+		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_Q_KEY);
+	}
+
+	if (ibqp->srq)
+		qp_context->srqn = cpu_to_be32(1 << 24 |
+					       to_msrq(ibqp->srq)->srqn);
+
+	if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD	&&
+	    attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY		&&
+	    attr->en_sqd_async_notify)
+		sqd_event = 1 << 31;
+
+	err = mthca_MODIFY_QP(dev, cur_state, new_state, qp->qpn, 0,
+			      mailbox, sqd_event, &status);
+	if (err)
+		goto out_mailbox;
+	if (status) {
+		mthca_warn(dev, "modify QP %d->%d returned status %02x.\n",
+			   cur_state, new_state, status);
+		err = -EINVAL;
+		goto out_mailbox;
+	}
+
+	qp->state = new_state;
+	if (attr_mask & IB_QP_ACCESS_FLAGS)
+		qp->atomic_rd_en = attr->qp_access_flags;
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+		qp->resp_depth = attr->max_dest_rd_atomic;
+	if (attr_mask & IB_QP_PORT)
+		qp->port = attr->port_num;
+	if (attr_mask & IB_QP_ALT_PATH)
+		qp->alt_port = attr->alt_port_num;
+
+	if (is_sqp(dev, qp))
+		store_attrs(to_msqp(qp), attr, attr_mask);
+
+	/*
+	 * If we moved QP0 to RTR, bring the IB link up; if we moved
+	 * QP0 to RESET or ERROR, bring the link back down.
+	 */
+	if (is_qp0(dev, qp)) {
+		if (cur_state != IB_QPS_RTR &&
+		    new_state == IB_QPS_RTR)
+			init_port(dev, qp->port);
+
+		if (cur_state != IB_QPS_RESET &&
+		    cur_state != IB_QPS_ERR &&
+		    (new_state == IB_QPS_RESET ||
+		     new_state == IB_QPS_ERR))
+			mthca_CLOSE_IB(dev, qp->port, &status);
+	}
+
+	/*
+	 * If we moved a kernel QP to RESET, clean up all old CQ
+	 * entries and reinitialize the QP.
+	 */
+	if (new_state == IB_QPS_RESET && !qp->ibqp.uobject) {
+		mthca_cq_clean(dev, to_mcq(qp->ibqp.recv_cq), qp->qpn,
+			       qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL);
+		if (qp->ibqp.send_cq != qp->ibqp.recv_cq)
+			mthca_cq_clean(dev, to_mcq(qp->ibqp.send_cq), qp->qpn, NULL);
+
+		mthca_wq_reset(&qp->sq);
+		qp->sq.last = get_send_wqe(qp, qp->sq.max - 1);
+
+		mthca_wq_reset(&qp->rq);
+		qp->rq.last = get_recv_wqe(qp, qp->rq.max - 1);
+
+		if (mthca_is_memfree(dev)) {
+			*qp->sq.db = 0;
+			*qp->rq.db = 0;
+		}
+	}
+
+out_mailbox:
+	mthca_free_mailbox(dev, mailbox);
+out:
+	return err;
+}
+
+/*
+ * Validate a modify-QP request and, when it is acceptable, pass it on to
+ * __mthca_modify_qp().
+ *
+ * Everything here runs with qp->mutex held.  The current state comes from
+ * attr->cur_qp_state when IB_QP_CUR_STATE is set in attr_mask, otherwise
+ * from qp->state; the new state is attr->qp_state when IB_QP_STATE is set
+ * and the current state otherwise.
+ *
+ * Returns 0 on success (a RESET -> RESET request is accepted without
+ * calling __mthca_modify_qp()), -EINVAL when the transition or one of the
+ * checked attributes is rejected, or whatever __mthca_modify_qp() returns.
+ * udata is not referenced.
+ */
+int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
+		    struct ib_udata *udata)
+{
+	struct mthca_dev *dev = to_mdev(ibqp->device);
+	struct mthca_qp *qp = to_mqp(ibqp);
+	enum ib_qp_state cur_state;
+	enum ib_qp_state new_state;
+	int ret = -EINVAL;
+
+	mutex_lock(&qp->mutex);
+
+	if (!(attr_mask & IB_QP_CUR_STATE)) {
+		/* Read the driver's recorded state under both WQ locks. */
+		spin_lock_irq(&qp->sq.lock);
+		spin_lock(&qp->rq.lock);
+		cur_state = qp->state;
+		spin_unlock(&qp->rq.lock);
+		spin_unlock_irq(&qp->sq.lock);
+	} else {
+		cur_state = attr->cur_qp_state;
+	}
+
+	if (attr_mask & IB_QP_STATE)
+		new_state = attr->qp_state;
+	else
+		new_state = cur_state;
+
+	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask)) {
+		mthca_dbg(dev, "Bad QP transition (transport %d) "
+			  "%d->%d with attr 0x%08x\n",
+			  qp->transport, cur_state, new_state,
+			  attr_mask);
+		goto unlock;
+	}
+
+	if (attr_mask & IB_QP_PKEY_INDEX) {
+		if (attr->pkey_index >= dev->limits.pkey_table_len) {
+			mthca_dbg(dev, "P_Key index (%u) too large. max is %d\n",
+				  attr->pkey_index, dev->limits.pkey_table_len-1);
+			goto unlock;
+		}
+	}
+
+	if (attr_mask & IB_QP_PORT) {
+		if (attr->port_num == 0 ||
+		    attr->port_num > dev->limits.num_ports) {
+			mthca_dbg(dev, "Port number (%u) is invalid\n", attr->port_num);
+			goto unlock;
+		}
+	}
+
+	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) {
+		if (attr->max_rd_atomic > dev->limits.max_qp_init_rdma) {
+			mthca_dbg(dev, "Max rdma_atomic as initiator %u too large (max is %d)\n",
+				  attr->max_rd_atomic, dev->limits.max_qp_init_rdma);
+			goto unlock;
+		}
+	}
+
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
+		if (attr->max_dest_rd_atomic > 1 << dev->qp_table.rdb_shift) {
+			mthca_dbg(dev, "Max rdma_atomic as responder %u too large (max %d)\n",
+				  attr->max_dest_rd_atomic, 1 << dev->qp_table.rdb_shift);
+			goto unlock;
+		}
+	}
+
+	/* RESET -> RESET needs no firmware command. */
+	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_RESET)
+		ret = 0;
+	else
+		ret = __mthca_modify_qp(ibqp, attr, attr_mask,
+					cur_state, new_state);
+
+unlock:
+	mutex_unlock(&qp->mutex);
+	return ret;
+}
+
+/*
+ * Return how many bytes of a descriptor of desc_sz bytes are left for
+ * scatter/gather data once the next segment and the transport-specific
+ * non-data segments have been subtracted:
+ *   MLX:    two data segments
+ *   UD:     the Arbel (mem-free) or Tavor UD segment
+ *   others: one remote address segment
+ */
+static int mthca_max_data_size(struct mthca_dev *dev, struct mthca_qp *qp, int desc_sz)
+{
+	int avail = desc_sz - sizeof (struct mthca_next_seg);
+
+	if (qp->transport == MLX)
+		avail -= 2 * sizeof (struct mthca_data_seg);
+	else if (qp->transport == UD)
+		avail -= mthca_is_memfree(dev) ?
+			sizeof (struct mthca_arbel_ud_seg) :
+			sizeof (struct mthca_tavor_ud_seg);
+	else
+		avail -= sizeof (struct mthca_raddr_seg);
+
+	return avail;
+}
+
+/*
+ * Maximum inline data for a QP on this PD: zero for kernel PDs (no
+ * uobject), otherwise max_data_size less the inline header.
+ */
+static inline int mthca_max_inline_data(struct mthca_pd *pd, int max_data_size)
+{
+	/* Inline data is not supported for kernel QPs (yet). */
+	if (!pd->ibpd.uobject)
+		return 0;
+
+	return max_data_size - MTHCA_INLINE_HEADER_SIZE;
+}
+
+/*
+ * Recompute qp->max_inline_data, qp->sq.max_gs and qp->rq.max_gs from the
+ * WQE sizes chosen for the QP (1 << wqe_shift), each capped by the
+ * device's maximum descriptor size and maximum s/g count.
+ */
+static void mthca_adjust_qp_caps(struct mthca_dev *dev,
+				 struct mthca_pd *pd,
+				 struct mthca_qp *qp)
+{
+	int sq_desc_sz = min(dev->limits.max_desc_sz, 1 << qp->sq.wqe_shift);
+	int rq_desc_sz = min(dev->limits.max_desc_sz, 1 << qp->rq.wqe_shift);
+	int max_data_size = mthca_max_data_size(dev, qp, sq_desc_sz);
+
+	qp->max_inline_data = mthca_max_inline_data(pd, max_data_size);
+
+	qp->sq.max_gs = min_t(int, dev->limits.max_sg,
+			      max_data_size / sizeof (struct mthca_data_seg));
+
+	/* A receive WQE only carries the next segment before its data segments. */
+	qp->rq.max_gs = min_t(int, dev->limits.max_sg,
+			      (rq_desc_sz - sizeof (struct mthca_next_seg)) /
+			      sizeof (struct mthca_data_seg));
+}
+
+/*
+ * Size the work queues and, for kernel QPs, allocate and register the
+ * WQE buffer.
+ *
+ * On entry qp->rq.max, qp->sq.max, qp->rq.max_gs and qp->sq.max_gs must
+ * already be set.  This function fills in rq.wqe_shift, sq.wqe_shift and
+ * send_wqe_offset, and for kernel QPs also wrid, queue, is_direct and mr.
+ *
+ * Returns 0 on success, -EINVAL if a WQE would exceed the device's
+ * maximum descriptor size, -ENOMEM if the wrid array cannot be allocated,
+ * or the error from mthca_buf_alloc().
+ */
+static int mthca_alloc_wqe_buf(struct mthca_dev *dev,
+			       struct mthca_pd *pd,
+			       struct mthca_qp *qp)
+{
+	int size;
+	int err;
+
+	/* Receive WQE: next segment followed by the scatter entries. */
+	size = sizeof (struct mthca_next_seg) +
+		qp->rq.max_gs * sizeof (struct mthca_data_seg);
+	if (size > dev->limits.max_desc_sz)
+		return -EINVAL;
+
+	/* Smallest power-of-two WQE size, 64 bytes minimum, that fits. */
+	qp->rq.wqe_shift = 6;
+	while (1 << qp->rq.wqe_shift < size)
+		qp->rq.wqe_shift++;
+
+	/* Send WQE: gather entries plus the transport-specific segments. */
+	size = qp->sq.max_gs * sizeof (struct mthca_data_seg);
+	switch (qp->transport) {
+	case MLX:
+		size += 2 * sizeof (struct mthca_data_seg);
+		break;
+
+	case UD:
+		if (mthca_is_memfree(dev))
+			size += sizeof (struct mthca_arbel_ud_seg);
+		else
+			size += sizeof (struct mthca_tavor_ud_seg);
+		break;
+
+	case UC:
+	case RC:
+		size += sizeof (struct mthca_raddr_seg);
+		/*
+		 * On RC an atomic op needs an atomic segment, a remote
+		 * address segment and one scatter entry.
+		 */
+		if (qp->transport == RC)
+			size = max_t(int, size,
+				     sizeof (struct mthca_atomic_seg) +
+				     sizeof (struct mthca_raddr_seg) +
+				     sizeof (struct mthca_data_seg));
+		break;
+
+	default:
+		break;
+	}
+
+	/* A bind request must fit as well. */
+	size = max_t(int, size, sizeof (struct mthca_bind_seg));
+	size += sizeof (struct mthca_next_seg);
+	if (size > dev->limits.max_desc_sz)
+		return -EINVAL;
+
+	qp->sq.wqe_shift = 6;
+	while (1 << qp->sq.wqe_shift < size)
+		qp->sq.wqe_shift++;
+
+	/* The send queue follows the receive queue, aligned to a send WQE. */
+	qp->send_wqe_offset = ALIGN(qp->rq.max << qp->rq.wqe_shift,
+				    1 << qp->sq.wqe_shift);
+
+	/*
+	 * Userspace QPs only need the WQE sizes and send_wqe_offset
+	 * computed above; nothing is allocated for them here.
+	 */
+	if (pd->ibpd.uobject)
+		return 0;
+
+	size = PAGE_ALIGN(qp->send_wqe_offset +
+			  (qp->sq.max << qp->sq.wqe_shift));
+
+	err = -ENOMEM;
+	qp->wrid = kmalloc((qp->rq.max + qp->sq.max) * sizeof (u64),
+			   GFP_KERNEL);
+	if (qp->wrid) {
+		err = mthca_buf_alloc(dev, size, MTHCA_MAX_DIRECT_QP_SIZE,
+				      &qp->queue, &qp->is_direct, pd, 0, &qp->mr);
+		if (!err)
+			return 0;
+	}
+
+	/* One of the two allocations failed; wrid is NULL in the first case. */
+	kfree(qp->wrid);
+	return err;
+}
+
+/*
+ * Release the WQE buffer (with its MR) and the wrid array of a QP.  The
+ * buffer size is recomputed with the same expression used when it was
+ * allocated in mthca_alloc_wqe_buf().
+ */
+static void mthca_free_wqe_buf(struct mthca_dev *dev,
+			       struct mthca_qp *qp)
+{
+	int buf_size = PAGE_ALIGN(qp->send_wqe_offset +
+				  (qp->sq.max << qp->sq.wqe_shift));
+
+	mthca_buf_free(dev, buf_size, &qp->queue, qp->is_direct, &qp->mr);
+	kfree(qp->wrid);
+}
+
+/*
+ * On mem-free devices, take a reference on the QP, EQP and RDB table
+ * entries for this QP number.  Does nothing on other devices.
+ *
+ * Returns 0 on success or the error from the failing mthca_table_get();
+ * references already taken are dropped again before returning an error.
+ */
+static int mthca_map_memfree(struct mthca_dev *dev,
+			     struct mthca_qp *qp)
+{
+	int ret;
+
+	if (!mthca_is_memfree(dev))
+		return 0;
+
+	ret = mthca_table_get(dev, dev->qp_table.qp_table, qp->qpn);
+	if (ret)
+		return ret;
+
+	ret = mthca_table_get(dev, dev->qp_table.eqp_table, qp->qpn);
+	if (ret) {
+		mthca_table_put(dev, dev->qp_table.qp_table, qp->qpn);
+		return ret;
+	}
+
+	ret = mthca_table_get(dev, dev->qp_table.rdb_table,
+			      qp->qpn << dev->qp_table.rdb_shift);
+	if (ret) {
+		mthca_table_put(dev, dev->qp_table.eqp_table, qp->qpn);
+		mthca_table_put(dev, dev->qp_table.qp_table, qp->qpn);
+		return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Drop the RDB, EQP and QP table references for this QP number, in the
+ * reverse of the order mthca_map_memfree() takes them.
+ */
+static void mthca_unmap_memfree(struct mthca_dev *dev,
+				struct mthca_qp *qp)
+{
+	int rdb_obj = qp->qpn << dev->qp_table.rdb_shift;
+
+	mthca_table_put(dev, dev->qp_table.rdb_table, rdb_obj);
+	mthca_table_put(dev, dev->qp_table.eqp_table, qp->qpn);
+	mthca_table_put(dev, dev->qp_table.qp_table, qp->qpn);
+}
+
+/*
+ * On mem-free devices, allocate the receive and send doorbell records
+ * for a QP.  Does nothing on other devices.
+ *
+ * Returns 0 on success or -ENOMEM if either allocation fails; the RQ
+ * doorbell is released again when the SQ doorbell cannot be allocated.
+ */
+static int mthca_alloc_memfree(struct mthca_dev *dev,
+			       struct mthca_qp *qp)
+{
+	if (!mthca_is_memfree(dev))
+		return 0;
+
+	qp->rq.db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_RQ,
+					 qp->qpn, &qp->rq.db);
+	if (qp->rq.db_index < 0)
+		return -ENOMEM;
+
+	qp->sq.db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_SQ,
+					 qp->qpn, &qp->sq.db);
+	if (qp->sq.db_index >= 0)
+		return 0;
+
+	mthca_free_db(dev, MTHCA_DB_TYPE_RQ, qp->rq.db_index);
+	return -ENOMEM;
+}
+
+/*
+ * On mem-free devices, release the send and receive doorbell records of
+ * a QP.  Does nothing on other devices.
+ */
+static void mthca_free_memfree(struct mthca_dev *dev,
+			       struct mthca_qp *qp)
+{
+	if (!mthca_is_memfree(dev))
+		return;
+
+	mthca_free_db(dev, MTHCA_DB_TYPE_SQ, qp->sq.db_index);
+	mthca_free_db(dev, MTHCA_DB_TYPE_RQ, qp->rq.db_index);
+}
+
+/*
+ * Initialization shared by regular and special QPs: reset the software
+ * state, take the mem-free table references, size and allocate the WQE
+ * buffer and, for kernel QPs, allocate doorbells and pre-link the WQEs.
+ *
+ * qp->qpn, qp->transport and the queue sizes must be set by the caller.
+ * send_cq and recv_cq are not referenced here.
+ *
+ * Returns 0 on success or the error from the failing step; anything
+ * acquired by this function is released again before an error return.
+ */
+static int mthca_alloc_qp_common(struct mthca_dev *dev,
+				 struct mthca_pd *pd,
+				 struct mthca_cq *send_cq,
+				 struct mthca_cq *recv_cq,
+				 enum ib_sig_type send_policy,
+				 struct mthca_qp *qp)
+{
+	struct mthca_next_seg *seg;
+	int idx;
+	int rc;
+
+	qp->refcount = 1;
+	init_waitqueue_head(&qp->wait);
+	mutex_init(&qp->mutex);
+	qp->state = IB_QPS_RESET;
+	qp->atomic_rd_en = 0;
+	qp->resp_depth = 0;
+	qp->sq_policy = send_policy;
+	mthca_wq_reset(&qp->sq);
+	mthca_wq_reset(&qp->rq);
+
+	spin_lock_init(&qp->sq.lock);
+	spin_lock_init(&qp->rq.lock);
+
+	rc = mthca_map_memfree(dev, qp);
+	if (rc)
+		return rc;
+
+	rc = mthca_alloc_wqe_buf(dev, pd, qp);
+	if (rc) {
+		mthca_unmap_memfree(dev, qp);
+		return rc;
+	}
+
+	mthca_adjust_qp_caps(dev, pd, qp);
+
+	/*
+	 * Userspace QPs stop here: their doorbells and buffers are set up
+	 * in userspace.
+	 */
+	if (pd->ibpd.uobject)
+		return 0;
+
+	rc = mthca_alloc_memfree(dev, qp);
+	if (rc) {
+		mthca_free_wqe_buf(dev, qp);
+		mthca_unmap_memfree(dev, qp);
+		return rc;
+	}
+
+	if (!mthca_is_memfree(dev)) {
+		/* Tavor: chain each receive WQE to the next, wrapping at the end. */
+		for (idx = 0; idx < qp->rq.max; ++idx) {
+			seg = get_recv_wqe(qp, idx);
+			seg->nda_op = htonl((((idx + 1) % qp->rq.max) <<
+					     qp->rq.wqe_shift) | 1);
+		}
+	} else {
+		struct mthca_data_seg *sge;
+		int rq_nds = (sizeof (struct mthca_next_seg) +
+			      qp->rq.max_gs * sizeof (struct mthca_data_seg)) / 16;
+
+		/*
+		 * Mem-free: link the receive WQEs into a ring, record their
+		 * size in 16-byte units, and mark every scatter entry with
+		 * the invalid lkey.
+		 */
+		for (idx = 0; idx < qp->rq.max; ++idx) {
+			void *wqe_end;
+
+			seg = get_recv_wqe(qp, idx);
+			seg->nda_op = cpu_to_be32(((idx + 1) & (qp->rq.max - 1)) <<
+						  qp->rq.wqe_shift);
+			seg->ee_nds = cpu_to_be32(rq_nds);
+
+			wqe_end = (void *) seg + (1 << qp->rq.wqe_shift);
+			sge = (void *) (seg + 1);
+			while ((void *) sge < wqe_end) {
+				sge->lkey = cpu_to_be32(MTHCA_INVAL_LKEY);
+				++sge;
+			}
+		}
+
+		/* Link the send WQEs into a ring inside the send area. */
+		for (idx = 0; idx < qp->sq.max; ++idx) {
+			seg = get_send_wqe(qp, idx);
+			seg->nda_op = cpu_to_be32((((idx + 1) & (qp->sq.max - 1)) <<
+						   qp->sq.wqe_shift) +
+						  qp->send_wqe_offset);
+		}
+	}
+
+	/* "last" starts at the final WQE of each queue. */
+	qp->sq.last = get_send_wqe(qp, qp->sq.max - 1);
+	qp->rq.last = get_recv_wqe(qp, qp->rq.max - 1);
+
+	return 0;
+}
+
+/*
+ * Check the requested capabilities against the device limits and derive
+ * the queue depths (rq.max, sq.max) and s/g counts (rq.max_gs, sq.max_gs)
+ * for the QP.  qp->transport must already be set.
+ *
+ * Returns 0 on success or -EINVAL if a requested value exceeds what the
+ * device supports.
+ */
+static int mthca_set_qp_size(struct mthca_dev *dev, struct ib_qp_cap *cap,
+			     struct mthca_pd *pd, struct mthca_qp *qp)
+{
+	int max_data_size = mthca_max_data_size(dev, qp, dev->limits.max_desc_sz);
+	u32 max_inline_data;
+	int inline_gs;
+
+	/* Sanity check QP size before proceeding */
+	if (cap->max_send_wr > dev->limits.max_wqes ||
+	    cap->max_recv_wr > dev->limits.max_wqes ||
+	    cap->max_send_sge > dev->limits.max_sg ||
+	    cap->max_recv_sge > dev->limits.max_sg)
+		return -EINVAL;
+
+	/* Inline data is only honoured for userspace QPs. */
+	if (pd->ibpd.uobject) {
+		if (cap->max_inline_data > mthca_max_inline_data(pd, max_data_size))
+			return -EINVAL;
+		max_inline_data = cap->max_inline_data;
+	} else {
+		max_inline_data = 0;
+	}
+
+	/*
+	 * MLX sends need two extra gather entries: one for the header
+	 * and one for the checksum at the end.
+	 */
+	if (qp->transport == MLX && cap->max_send_sge + 2 > dev->limits.max_sg)
+		return -EINVAL;
+
+	if (!mthca_is_memfree(dev)) {
+		qp->rq.max = cap->max_recv_wr;
+		qp->sq.max = cap->max_send_wr;
+	} else {
+		/* Mem-free queue depths are rounded up to a power of two. */
+		if (cap->max_recv_wr)
+			qp->rq.max = roundup_pow_of_two(cap->max_recv_wr);
+		else
+			qp->rq.max = 0;
+
+		if (cap->max_send_wr)
+			qp->sq.max = roundup_pow_of_two(cap->max_send_wr);
+		else
+			qp->sq.max = 0;
+	}
+
+	qp->rq.max_gs = cap->max_recv_sge;
+
+	/* The send WQE must also hold the inline data, counted in data segments. */
+	inline_gs = ALIGN(max_inline_data + MTHCA_INLINE_HEADER_SIZE,
+			  MTHCA_INLINE_CHUNK_SIZE) /
+		    sizeof (struct mthca_data_seg);
+	qp->sq.max_gs = max_t(int, cap->max_send_sge, inline_gs);
+
+	return 0;
+}
+
+/*
+ * Allocate a regular (RC, UC or UD) QP: pick the transport, size the
+ * queues, allocate a QP number, run the common initialization and
+ * publish the QP in the device's QP table.
+ *
+ * Returns 0 on success, -EINVAL for an unsupported type or capabilities,
+ * -ENOMEM if no QP number is available, or the error from
+ * mthca_alloc_qp_common().
+ */
+int mthca_alloc_qp(struct mthca_dev *dev,
+		   struct mthca_pd *pd,
+		   struct mthca_cq *send_cq,
+		   struct mthca_cq *recv_cq,
+		   enum ib_qp_type type,
+		   enum ib_sig_type send_policy,
+		   struct ib_qp_cap *cap,
+		   struct mthca_qp *qp)
+{
+	int err;
+
+	if (type == IB_QPT_RC)
+		qp->transport = RC;
+	else if (type == IB_QPT_UC)
+		qp->transport = UC;
+	else if (type == IB_QPT_UD)
+		qp->transport = UD;
+	else
+		return -EINVAL;
+
+	err = mthca_set_qp_size(dev, cap, pd, qp);
+	if (err)
+		return err;
+
+	qp->qpn = mthca_alloc(&dev->qp_table.alloc);
+	if (qp->qpn == -1)
+		return -ENOMEM;
+
+	/* Port 0 is a placeholder so that use before assignment is noticed. */
+	qp->port = 0;
+
+	err = mthca_alloc_qp_common(dev, pd, send_cq, recv_cq,
+				    send_policy, qp);
+	if (err) {
+		mthca_free(&dev->qp_table.alloc, qp->qpn);
+		return err;
+	}
+
+	/* Publish the QP in the lookup table. */
+	spin_lock_irq(&dev->qp_table.lock);
+	mthca_array_set(&dev->qp_table.qp,
+			qp->qpn & (dev->limits.num_qps - 1), qp);
+	spin_unlock_irq(&dev->qp_table.lock);
+
+	return 0;
+}
+
+/*
+ * Lock the send and receive CQs of a QP.  When they are distinct, the CQ
+ * with the lower CQ number is locked first (recv_cq first on a tie, as
+ * the comparison is strict); when they are the same CQ it is locked once.
+ * The first lock taken disables interrupts.
+ */
+static void mthca_lock_cqs(struct mthca_cq *send_cq, struct mthca_cq *recv_cq)
+{
+	struct mthca_cq *first;
+	struct mthca_cq *second;
+
+	if (send_cq == recv_cq) {
+		spin_lock_irq(&send_cq->lock);
+		return;
+	}
+
+	if (send_cq->cqn < recv_cq->cqn) {
+		first = send_cq;
+		second = recv_cq;
+	} else {
+		first = recv_cq;
+		second = send_cq;
+	}
+
+	spin_lock_irq(&first->lock);
+	spin_lock_nested(&second->lock, SINGLE_DEPTH_NESTING);
+}
+
+/*
+ * Undo mthca_lock_cqs(): release the two CQ locks in the reverse of the
+ * order they were taken, re-enabling interrupts with the last unlock.
+ */
+static void mthca_unlock_cqs(struct mthca_cq *send_cq, struct mthca_cq *recv_cq)
+{
+	struct mthca_cq *outer;
+	struct mthca_cq *inner;
+
+	if (send_cq == recv_cq) {
+		spin_unlock_irq(&send_cq->lock);
+		return;
+	}
+
+	if (send_cq->cqn < recv_cq->cqn) {
+		outer = send_cq;
+		inner = recv_cq;
+	} else {
+		outer = recv_cq;
+		inner = send_cq;
+	}
+
+	spin_unlock(&inner->lock);
+	spin_unlock_irq(&outer->lock);
+}
+
+/*
+ * Allocate a special QP on the given port.  The hardware QP number is
+ * derived from qpn, port and the device's sqp_start.
+ *
+ * A DMA-coherent buffer of MTHCA_UD_HEADER_SIZE bytes per send WQE is
+ * allocated for the headers built by build_mlx_header().
+ *
+ * Returns 0 on success, -EINVAL for unsupported capabilities, -ENOMEM if
+ * the header buffer cannot be allocated, -EBUSY if the QP table slot is
+ * already occupied, or the error from mthca_alloc_qp_common().
+ */
+int mthca_alloc_sqp(struct mthca_dev *dev,
+		    struct mthca_pd *pd,
+		    struct mthca_cq *send_cq,
+		    struct mthca_cq *recv_cq,
+		    enum ib_sig_type send_policy,
+		    struct ib_qp_cap *cap,
+		    int qpn,
+		    int port,
+		    struct mthca_sqp *sqp)
+{
+	u32 mqpn = qpn * 2 + dev->qp_table.sqp_start + port - 1;
+	int err;
+
+	/* mthca_set_qp_size() looks at the transport, so set it first. */
+	sqp->qp.transport = MLX;
+	err = mthca_set_qp_size(dev, cap, pd, &sqp->qp);
+	if (err)
+		return err;
+
+	sqp->header_buf_size = sqp->qp.sq.max * MTHCA_UD_HEADER_SIZE;
+	sqp->header_buf = dma_alloc_coherent(&dev->pdev->dev, sqp->header_buf_size,
+					     &sqp->header_dma, GFP_KERNEL);
+	if (!sqp->header_buf)
+		return -ENOMEM;
+
+	/* Claim the table slot unless somebody already owns it. */
+	spin_lock_irq(&dev->qp_table.lock);
+	if (!mthca_array_get(&dev->qp_table.qp, mqpn))
+		mthca_array_set(&dev->qp_table.qp, mqpn, sqp);
+	else
+		err = -EBUSY;
+	spin_unlock_irq(&dev->qp_table.lock);
+
+	if (err)
+		goto free_header;
+
+	sqp->qp.port = port;
+	sqp->qp.qpn = mqpn;
+	sqp->qp.transport = MLX;
+
+	err = mthca_alloc_qp_common(dev, pd, send_cq, recv_cq,
+				    send_policy, &sqp->qp);
+	if (!err) {
+		atomic_inc(&pd->sqp_count);
+		return 0;
+	}
+
+	/*
+	 * Take the CQ locks while clearing the table entry, so that the CQ
+	 * polling code can look up QPs without taking a lock.
+	 */
+	mthca_lock_cqs(send_cq, recv_cq);
+
+	spin_lock(&dev->qp_table.lock);
+	mthca_array_clear(&dev->qp_table.qp, mqpn);
+	spin_unlock(&dev->qp_table.lock);
+
+	mthca_unlock_cqs(send_cq, recv_cq);
+
+free_header:
+	dma_free_coherent(&dev->pdev->dev, sqp->header_buf_size,
+			  sqp->header_buf, sqp->header_dma);
+
+	return err;
+}
+
+/*
+ * Return a snapshot of qp->refcount, read with the QP table lock held.
+ */
+static inline int get_qp_refcount(struct mthca_dev *dev, struct mthca_qp *qp)
+{
+	int count;
+
+	spin_lock_irq(&dev->qp_table.lock);
+	count = qp->refcount;
+	spin_unlock_irq(&dev->qp_table.lock);
+
+	return count;
+}
+
+/*
+ * Tear down a QP: remove it from the QP table, wait until its reference
+ * count has dropped to zero, move the hardware QP to RESET if it is in
+ * any other state, and release the resources held for it.
+ *
+ * dev: device the QP belongs to
+ * qp:  QP to destroy.  For a special QP (is_sqp()) the header buffer of
+ *      the enclosing mthca_sqp is freed and the PD's sqp_count is
+ *      decremented; for any other QP the QP number is returned to the
+ *      allocator.
+ */
+void mthca_free_qp(struct mthca_dev *dev,
+		   struct mthca_qp *qp)
+{
+	u8 status;
+	struct mthca_cq *send_cq;
+	struct mthca_cq *recv_cq;
+
+	send_cq = to_mcq(qp->ibqp.send_cq);
+	recv_cq = to_mcq(qp->ibqp.recv_cq);
+
+	/*
+	 * Lock CQs here, so that CQ polling code can do QP lookup
+	 * without taking a lock.
+	 */
+	mthca_lock_cqs(send_cq, recv_cq);
+
+	spin_lock(&dev->qp_table.lock);
+	mthca_array_clear(&dev->qp_table.qp,
+			  qp->qpn & (dev->limits.num_qps - 1));
+	/* Drop the reference the QP was created with (refcount starts at 1). */
+	--qp->refcount;
+	spin_unlock(&dev->qp_table.lock);
+
+	mthca_unlock_cqs(send_cq, recv_cq);
+
+	/* Sleep until every remaining reference has been dropped. */
+	wait_event(qp->wait, !get_qp_refcount(dev, qp));
+
+	/*
+	 * NOTE(review): both the return value and the firmware status of
+	 * this command are ignored.
+	 */
+	if (qp->state != IB_QPS_RESET)
+		mthca_MODIFY_QP(dev, qp->state, IB_QPS_RESET, qp->qpn, 0,
+				NULL, 0, &status);
+
+	/*
+	 * If this is a userspace QP, the buffers, MR, CQs and so on
+	 * will be cleaned up in userspace, so all we have to do is
+	 * unref the mem-free tables and free the QPN in our table.
+	 */
+	if (!qp->ibqp.uobject) {
+		mthca_cq_clean(dev, recv_cq, qp->qpn,
+			       qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL);
+		if (send_cq != recv_cq)
+			mthca_cq_clean(dev, send_cq, qp->qpn, NULL);
+
+		mthca_free_memfree(dev, qp);
+		mthca_free_wqe_buf(dev, qp);
+	}
+
+	mthca_unmap_memfree(dev, qp);
+
+	if (is_sqp(dev, qp)) {
+		atomic_dec(&(to_mpd(qp->ibqp.pd)->sqp_count));
+		dma_free_coherent(&dev->pdev->dev,
+				  to_msqp(qp)->header_buf_size,
+				  to_msqp(qp)->header_buf,
+				  to_msqp(qp)->header_dma);
+	} else
+		mthca_free(&dev->qp_table.alloc, qp->qpn);
+}
+
+/*
+ * Create the UD header for an MLX send and build a data segment that
+ * points at it.
+ *
+ * dev:  device
+ * sqp:  special QP the send is posted on; sqp->ud_header is used as
+ *       scratch space and sqp->send_psn is advanced by one
+ * ind:  index of the send WQE; selects the MTHCA_UD_HEADER_SIZE-byte
+ *       slot in sqp->header_buf that receives the packed header
+ * wr:   work request (only IB_WR_SEND and IB_WR_SEND_WITH_IMM are
+ *       accepted)
+ * mlx:  MLX segment at the start of the WQE whose flags, rlid and vcrc
+ *       are filled in
+ * data: data segment that is pointed at the packed header
+ *
+ * Returns 0 on success, the error from mthca_read_ah(), or -EINVAL for
+ * an unsupported opcode.
+ */
+static int build_mlx_header(struct mthca_dev *dev, struct mthca_sqp *sqp,
+			    int ind, struct ib_send_wr *wr,
+			    struct mthca_mlx_seg *mlx,
+			    struct mthca_data_seg *data)
+{
+	int header_size;
+	int err;
+	u16 pkey;
+
+	ib_ud_header_init(256, /* assume a MAD */
+			  1, 0, 0,
+			  mthca_ah_grh_present(to_mah(wr->wr.ud.ah)),
+			  0,
+			  &sqp->ud_header);
+
+	/* Presumably fills the address fields of the header from the AH. */
+	err = mthca_read_ah(dev, to_mah(wr->wr.ud.ah), &sqp->ud_header);
+	if (err)
+		return err;
+	mlx->flags &= ~cpu_to_be32(MTHCA_NEXT_SOLICIT | 1);
+	/*
+	 * VL15 is requested when the QP number is 0, SLR when the
+	 * destination LID is the permissive LID; the service level goes
+	 * into bits 8 and up.
+	 */
+	mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MTHCA_MLX_VL15 : 0) |
+				  (sqp->ud_header.lrh.destination_lid ==
+				   IB_LID_PERMISSIVE ? MTHCA_MLX_SLR : 0) |
+				  (sqp->ud_header.lrh.service_level << 8));
+	mlx->rlid = sqp->ud_header.lrh.destination_lid;
+	mlx->vcrc = 0;
+
+	switch (wr->opcode) {
+	case IB_WR_SEND:
+		sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY;
+		sqp->ud_header.immediate_present = 0;
+		break;
+	case IB_WR_SEND_WITH_IMM:
+		sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
+		sqp->ud_header.immediate_present = 1;
+		sqp->ud_header.immediate_data = wr->ex.imm_data;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	sqp->ud_header.lrh.virtual_lane    = !sqp->qp.ibqp.qp_num ? 15 : 0;
+	if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE)
+		sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE;
+	sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED);
+	/*
+	 * QP number 0 uses the P_Key index stored in the sqp; any other
+	 * QP number uses the index carried in the work request.
+	 * NOTE(review): the return value of ib_get_cached_pkey() is not
+	 * checked, so pkey may be left unset if the lookup fails - confirm.
+	 */
+	if (!sqp->qp.ibqp.qp_num)
+		ib_get_cached_pkey(&dev->ib_dev, sqp->qp.port,
+				   sqp->pkey_index, &pkey);
+	else
+		ib_get_cached_pkey(&dev->ib_dev, sqp->qp.port,
+				   wr->wr.ud.pkey_index, &pkey);
+	sqp->ud_header.bth.pkey = cpu_to_be16(pkey);
+	sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+	/* The PSN is 24 bits wide; send_psn itself keeps counting. */
+	sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
+	/* A remote_qkey with the top bit set selects the QP's own qkey. */
+	sqp->ud_header.deth.qkey = cpu_to_be32(wr->wr.ud.remote_qkey & 0x80000000 ?
+					       sqp->qkey : wr->wr.ud.remote_qkey);
+	sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num);
+
+	/* Pack the header into this WQE's slot of the header buffer. */
+	header_size = ib_ud_header_pack(&sqp->ud_header,
+					sqp->header_buf +
+					ind * MTHCA_UD_HEADER_SIZE);
+
+	data->byte_count = cpu_to_be32(header_size);
+	data->lkey       = cpu_to_be32(to_mpd(sqp->qp.ibqp.pd)->ntmr.ibmr.lkey);
+	data->addr       = cpu_to_be64(sqp->header_dma +
+				       ind * MTHCA_UD_HEADER_SIZE);
+
+	return 0;
+}
+
+/*
+ * Return non-zero if posting nreq more requests would fill the work
+ * queue.  The first check is done without the CQ lock; only when it
+ * indicates a full queue is the count re-read under cq->lock before
+ * deciding.
+ */
+static inline int mthca_wq_overflow(struct mthca_wq *wq, int nreq,
+				    struct ib_cq *ib_cq)
+{
+	struct mthca_cq *cq;
+	unsigned used = wq->head - wq->tail;
+
+	if (likely(used + nreq < wq->max))
+		return 0;
+
+	/* Looks full: take another sample with the CQ lock held. */
+	cq = to_mcq(ib_cq);
+	spin_lock(&cq->lock);
+	used = wq->head - wq->tail;
+	spin_unlock(&cq->lock);
+
+	return used + nreq >= wq->max;
+}
+
+/*
+ * Fill a remote address segment with the given address and rkey, both
+ * converted to big-endian, and clear its reserved field.
+ */
+static __always_inline void set_raddr_seg(struct mthca_raddr_seg *rseg,
+					  u64 remote_addr, u32 rkey)
+{
+	rseg->reserved = 0;
+	rseg->rkey = cpu_to_be32(rkey);
+	rseg->raddr = cpu_to_be64(remote_addr);
+}
+
+/*
+ * Fill an atomic segment from a work request.  For compare-and-swap the
+ * swap and compare values are both written; for any other opcode the
+ * compare_add value goes into swap_add and compare is zeroed.
+ */
+static __always_inline void set_atomic_seg(struct mthca_atomic_seg *aseg,
+					   struct ib_send_wr *wr)
+{
+	if (wr->opcode != IB_WR_ATOMIC_CMP_AND_SWP) {
+		aseg->compare = 0;
+		aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add);
+		return;
+	}
+
+	aseg->compare = cpu_to_be64(wr->wr.atomic.compare_add);
+	aseg->swap_add = cpu_to_be64(wr->wr.atomic.swap);
+}
+
+/*
+ * Fill a Tavor UD segment: the address handle is referenced by its key
+ * and DMA address, followed by the destination QPN and Q_Key from the
+ * work request.
+ */
+static void set_tavor_ud_seg(struct mthca_tavor_ud_seg *useg,
+			     struct ib_send_wr *wr)
+{
+	useg->qkey = cpu_to_be32(wr->wr.ud.remote_qkey);
+	useg->dqpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+	useg->av_addr = cpu_to_be64(to_mah(wr->wr.ud.ah)->avdma);
+	useg->lkey = cpu_to_be32(to_mah(wr->wr.ud.ah)->key);
+}
+
+/*
+ * Fill an Arbel (mem-free) UD segment: MTHCA_AV_SIZE bytes of the
+ * address vector are copied into the segment itself, followed by the
+ * destination QPN and Q_Key from the work request.
+ */
+static void set_arbel_ud_seg(struct mthca_arbel_ud_seg *useg,
+			     struct ib_send_wr *wr)
+{
+	useg->qkey = cpu_to_be32(wr->wr.ud.remote_qkey);
+	useg->dqpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+	memcpy(useg->av, to_mah(wr->wr.ud.ah)->av, MTHCA_AV_SIZE);
+}
+
+/*
+ * Post a chain of send work requests to a QP on a Tavor (non-mem-free)
+ * device.
+ *
+ * For each work request a WQE is built at the next send index, the
+ * previously posted WQE is linked to it, and after the loop a single
+ * send doorbell is written describing the first new WQE.  The whole
+ * operation runs under qp->sq.lock with interrupts saved and disabled.
+ *
+ * ibqp:   QP to post on
+ * wr:     first work request of the chain (linked through wr->next)
+ * bad_wr: set to the work request that failed when an error is returned
+ *
+ * Returns 0 on success, -ENOMEM if the send queue is full, -EINVAL for
+ * too many gather entries or an opcode outside mthca_opcode[], or the
+ * error from build_mlx_header().  Work requests built before the
+ * failing one are still handed to the hardware.
+ */
+int mthca_tavor_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+			  struct ib_send_wr **bad_wr)
+{
+	struct mthca_dev *dev = to_mdev(ibqp->device);
+	struct mthca_qp *qp = to_mqp(ibqp);
+	void *wqe;
+	void *prev_wqe;
+	unsigned long flags;
+	int err = 0;
+	int nreq;
+	int i;
+	int size;
+	/*
+	 * f0 and size0 are only used if nreq != 0, and they will
+	 * always be initialized the first time through the main loop
+	 * before nreq is incremented.  So nreq cannot become non-zero
+	 * without initializing f0 and size0, and they are in fact
+	 * never used uninitialized.
+	 */
+	int uninitialized_var(size0);
+	u32 uninitialized_var(f0);
+	int ind;
+	u8 op0 = 0;
+
+	spin_lock_irqsave(&qp->sq.lock, flags);
+
+	/* XXX check that state is OK to post send */
+
+	ind = qp->sq.next_ind;
+
+	for (nreq = 0; wr; ++nreq, wr = wr->next) {
+		if (mthca_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
+			mthca_err(dev, "SQ %06x full (%u head, %u tail,"
+					" %d max, %d nreq)\n", qp->qpn,
+					qp->sq.head, qp->sq.tail,
+					qp->sq.max, nreq);
+			err = -ENOMEM;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		/*
+		 * Remember the previously posted WQE; its next segment is
+		 * pointed at the new WQE once the latter is complete.
+		 */
+		wqe = get_send_wqe(qp, ind);
+		prev_wqe = qp->sq.last;
+		qp->sq.last = wqe;
+
+		((struct mthca_next_seg *) wqe)->nda_op = 0;
+		((struct mthca_next_seg *) wqe)->ee_nds = 0;
+		((struct mthca_next_seg *) wqe)->flags =
+			((wr->send_flags & IB_SEND_SIGNALED) ?
+			 cpu_to_be32(MTHCA_NEXT_CQ_UPDATE) : 0) |
+			((wr->send_flags & IB_SEND_SOLICITED) ?
+			 cpu_to_be32(MTHCA_NEXT_SOLICIT) : 0)   |
+			cpu_to_be32(1);
+		if (wr->opcode == IB_WR_SEND_WITH_IMM ||
+		    wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM)
+			((struct mthca_next_seg *) wqe)->imm = wr->ex.imm_data;
+
+		/* size counts the WQE in 16-byte units. */
+		wqe += sizeof (struct mthca_next_seg);
+		size = sizeof (struct mthca_next_seg) / 16;
+
+		switch (qp->transport) {
+		case RC:
+			switch (wr->opcode) {
+			case IB_WR_ATOMIC_CMP_AND_SWP:
+			case IB_WR_ATOMIC_FETCH_AND_ADD:
+				set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
+					      wr->wr.atomic.rkey);
+				wqe += sizeof (struct mthca_raddr_seg);
+
+				set_atomic_seg(wqe, wr);
+				wqe += sizeof (struct mthca_atomic_seg);
+				size += (sizeof (struct mthca_raddr_seg) +
+					 sizeof (struct mthca_atomic_seg)) / 16;
+				break;
+
+			case IB_WR_RDMA_WRITE:
+			case IB_WR_RDMA_WRITE_WITH_IMM:
+			case IB_WR_RDMA_READ:
+				set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
+					      wr->wr.rdma.rkey);
+				wqe  += sizeof (struct mthca_raddr_seg);
+				size += sizeof (struct mthca_raddr_seg) / 16;
+				break;
+
+			default:
+				/* No extra segments required for sends */
+				break;
+			}
+
+			break;
+
+		case UC:
+			switch (wr->opcode) {
+			case IB_WR_RDMA_WRITE:
+			case IB_WR_RDMA_WRITE_WITH_IMM:
+				set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
+					      wr->wr.rdma.rkey);
+				wqe  += sizeof (struct mthca_raddr_seg);
+				size += sizeof (struct mthca_raddr_seg) / 16;
+				break;
+
+			default:
+				/* No extra segments required for sends */
+				break;
+			}
+
+			break;
+
+		case UD:
+			set_tavor_ud_seg(wqe, wr);
+			wqe  += sizeof (struct mthca_tavor_ud_seg);
+			size += sizeof (struct mthca_tavor_ud_seg) / 16;
+			break;
+
+		case MLX:
+			/*
+			 * The start of the WQE is handed over as the MLX
+			 * segment and the current position as the data
+			 * segment for the header.
+			 */
+			err = build_mlx_header(dev, to_msqp(qp), ind, wr,
+					       wqe - sizeof (struct mthca_next_seg),
+					       wqe);
+			if (err) {
+				*bad_wr = wr;
+				goto out;
+			}
+			wqe += sizeof (struct mthca_data_seg);
+			size += sizeof (struct mthca_data_seg) / 16;
+			break;
+		}
+
+		if (wr->num_sge > qp->sq.max_gs) {
+			mthca_err(dev, "too many gathers\n");
+			err = -EINVAL;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		for (i = 0; i < wr->num_sge; ++i) {
+			mthca_set_data_seg(wqe, wr->sg_list + i);
+			wqe  += sizeof (struct mthca_data_seg);
+			size += sizeof (struct mthca_data_seg) / 16;
+		}
+
+		/* Add one more inline data segment for ICRC */
+		if (qp->transport == MLX) {
+			((struct mthca_data_seg *) wqe)->byte_count =
+				cpu_to_be32((1 << 31) | 4);
+			((u32 *) wqe)[1] = 0;
+			wqe += sizeof (struct mthca_data_seg);
+			size += sizeof (struct mthca_data_seg) / 16;
+		}
+
+		qp->wrid[ind] = wr->wr_id;
+
+		if (wr->opcode >= ARRAY_SIZE(mthca_opcode)) {
+			mthca_err(dev, "opcode invalid\n");
+			err = -EINVAL;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		/*
+		 * Link the previous WQE to the new one: the address/opcode
+		 * word is written first and the size word only after the
+		 * write barrier.
+		 */
+		((struct mthca_next_seg *) prev_wqe)->nda_op =
+			cpu_to_be32(((ind << qp->sq.wqe_shift) +
+				     qp->send_wqe_offset) |
+				    mthca_opcode[wr->opcode]);
+		wmb();
+		((struct mthca_next_seg *) prev_wqe)->ee_nds =
+			cpu_to_be32((nreq ? 0 : MTHCA_NEXT_DBD) | size |
+				    ((wr->send_flags & IB_SEND_FENCE) ?
+				    MTHCA_NEXT_FENCE : 0));
+
+		/* The doorbell describes the first WQE of this batch. */
+		if (!nreq) {
+			size0 = size;
+			op0   = mthca_opcode[wr->opcode];
+			f0    = wr->send_flags & IB_SEND_FENCE ?
+				MTHCA_SEND_DOORBELL_FENCE : 0;
+		}
+
+		++ind;
+		if (unlikely(ind >= qp->sq.max))
+			ind -= qp->sq.max;
+	}
+
+	/*
+	 * Reached on success and on error alike; nreq is the number of
+	 * work requests that were completely built.
+	 */
+out:
+	if (likely(nreq)) {
+		wmb();
+
+		mthca_write64(((qp->sq.next_ind << qp->sq.wqe_shift) +
+			       qp->send_wqe_offset) | f0 | op0,
+			      (qp->qpn << 8) | size0,
+			      dev->kar + MTHCA_SEND_DOORBELL,
+			      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+		/*
+		 * Make sure doorbells don't leak out of SQ spinlock
+		 * and reach the HCA out of order:
+		 */
+		mmiowb();
+	}
+
+	qp->sq.next_ind = ind;
+	qp->sq.head    += nreq;
+
+	spin_unlock_irqrestore(&qp->sq.lock, flags);
+	return err;
+}
+
+/*
+ * Post a chain of receive work requests to a QP on a Tavor
+ * (non-mem-free) device.
+ *
+ * Each work request is written into the next receive WQE and the
+ * previously posted WQE's size word is updated.  A receive doorbell is
+ * written every MTHCA_TAVOR_MAX_WQES_PER_RECV_DB requests and once more
+ * after the loop for any remainder.  The whole operation runs under
+ * qp->rq.lock with interrupts saved and disabled.
+ *
+ * ibqp:   QP to post on
+ * wr:     first work request of the chain (linked through wr->next)
+ * bad_wr: set to the work request that failed when an error is returned
+ *
+ * Returns 0 on success, -ENOMEM if the receive queue is full, or
+ * -EINVAL if a request has more scatter entries than qp->rq.max_gs.
+ */
+int mthca_tavor_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+			     struct ib_recv_wr **bad_wr)
+{
+	struct mthca_dev *dev = to_mdev(ibqp->device);
+	struct mthca_qp *qp = to_mqp(ibqp);
+	unsigned long flags;
+	int err = 0;
+	int nreq;
+	int i;
+	int size;
+	/*
+	 * size0 is only used if nreq != 0, and it will always be
+	 * initialized the first time through the main loop before
+	 * nreq is incremented.  So nreq cannot become non-zero
+	 * without initializing size0, and it is in fact never used
+	 * uninitialized.
+	 */
+	int uninitialized_var(size0);
+	int ind;
+	void *wqe;
+	void *prev_wqe;
+
+	spin_lock_irqsave(&qp->rq.lock, flags);
+
+	/* XXX check that state is OK to post receive */
+
+	ind = qp->rq.next_ind;
+
+	for (nreq = 0; wr; wr = wr->next) {
+		if (mthca_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) {
+			mthca_err(dev, "RQ %06x full (%u head, %u tail,"
+					" %d max, %d nreq)\n", qp->qpn,
+					qp->rq.head, qp->rq.tail,
+					qp->rq.max, nreq);
+			err = -ENOMEM;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		wqe = get_recv_wqe(qp, ind);
+		prev_wqe = qp->rq.last;
+		qp->rq.last = wqe;
+
+		((struct mthca_next_seg *) wqe)->ee_nds =
+			cpu_to_be32(MTHCA_NEXT_DBD);
+		((struct mthca_next_seg *) wqe)->flags = 0;
+
+		/* size counts the WQE in 16-byte units. */
+		wqe += sizeof (struct mthca_next_seg);
+		size = sizeof (struct mthca_next_seg) / 16;
+
+		if (unlikely(wr->num_sge > qp->rq.max_gs)) {
+			err = -EINVAL;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		for (i = 0; i < wr->num_sge; ++i) {
+			mthca_set_data_seg(wqe, wr->sg_list + i);
+			wqe  += sizeof (struct mthca_data_seg);
+			size += sizeof (struct mthca_data_seg) / 16;
+		}
+
+		/* Receive wr_ids are stored sq.max entries into the wrid array. */
+		qp->wrid[ind + qp->sq.max] = wr->wr_id;
+
+		((struct mthca_next_seg *) prev_wqe)->ee_nds =
+			cpu_to_be32(MTHCA_NEXT_DBD | size);
+
+		/* The doorbell carries the size of the first WQE of the batch. */
+		if (!nreq)
+			size0 = size;
+
+		++ind;
+		if (unlikely(ind >= qp->rq.max))
+			ind -= qp->rq.max;
+
+		++nreq;
+		/* A full batch: ring the doorbell now and start a new batch. */
+		if (unlikely(nreq == MTHCA_TAVOR_MAX_WQES_PER_RECV_DB)) {
+			nreq = 0;
+
+			wmb();
+
+			mthca_write64((qp->rq.next_ind << qp->rq.wqe_shift) | size0,
+				      qp->qpn << 8, dev->kar + MTHCA_RECEIVE_DOORBELL,
+				      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+
+			qp->rq.next_ind = ind;
+			qp->rq.head += MTHCA_TAVOR_MAX_WQES_PER_RECV_DB;
+		}
+	}
+
+	/*
+	 * Reached on success and on error alike; nreq is the number of
+	 * requests built since the last doorbell.
+	 */
+out:
+	if (likely(nreq)) {
+		wmb();
+
+		mthca_write64((qp->rq.next_ind << qp->rq.wqe_shift) | size0,
+			      qp->qpn << 8 | nreq, dev->kar + MTHCA_RECEIVE_DOORBELL,
+			      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+	}
+
+	qp->rq.next_ind = ind;
+	qp->rq.head    += nreq;
+
+	/*
+	 * Make sure doorbells don't leak out of RQ spinlock and reach
+	 * the HCA out of order:
+	 */
+	mmiowb();
+
+	spin_unlock_irqrestore(&qp->rq.lock, flags);
+	return err;
+}
+
+int mthca_arbel_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+			  struct ib_send_wr **bad_wr)
+{
+	struct mthca_dev *dev = to_mdev(ibqp->device);
+	struct mthca_qp *qp = to_mqp(ibqp);
+	u32 dbhi;	/* first word handed to mthca_write64() for the send doorbell */
+	void *wqe;
+	void *prev_wqe;
+	unsigned long flags;
+	int err = 0;
+	int nreq;	/* WQEs built since the last doorbell write */
+	int i;
+	int size;	/* size of the current WQE in 16-byte units */
+	/*
+	 * f0, size0 (and op0) describe the first WQE of the current
+	 * doorbell batch.  f0 and size0 are only used if nreq != 0,
+	 * and they are always assigned on the loop pass where nreq is
+	 * still zero, before nreq is incremented.  So they are in
+	 * fact never used uninitialized.
+	 */
+	int uninitialized_var(size0);
+	u32 uninitialized_var(f0);
+	int ind;
+	u8 op0 = 0;
+
+	spin_lock_irqsave(&qp->sq.lock, flags);	/* whole walk of the wr chain runs under the SQ lock */
+
+	/* XXX check that state is OK to post send */
+
+	ind = qp->sq.head & (qp->sq.max - 1);	/* NOTE(review): mask form presumably relies on sq.max being a power of two */
+
+	for (nreq = 0; wr; ++nreq, wr = wr->next) {
+		if (unlikely(nreq == MTHCA_ARBEL_MAX_WQES_PER_SEND_DB)) {
+			nreq = 0;	/* batch limit reached: ring the doorbell for the full batch first */
+
+			dbhi = (MTHCA_ARBEL_MAX_WQES_PER_SEND_DB << 24) |
+				((qp->sq.head & 0xffff) << 8) | f0 | op0;
+
+			qp->sq.head += MTHCA_ARBEL_MAX_WQES_PER_SEND_DB;
+
+			/*
+			 * Make sure that descriptors are written before
+			 * doorbell record.
+			 */
+			wmb();
+			*qp->sq.db = cpu_to_be32(qp->sq.head & 0xffff);
+
+			/*
+			 * Make sure doorbell record is written before we
+			 * write MMIO send doorbell.
+			 */
+			wmb();
+
+			mthca_write64(dbhi, (qp->qpn << 8) | size0,
+				      dev->kar + MTHCA_SEND_DOORBELL,
+				      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+		}
+
+		if (mthca_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
+			mthca_err(dev, "SQ %06x full (%u head, %u tail,"
+					" %d max, %d nreq)\n", qp->qpn,
+					qp->sq.head, qp->sq.tail,
+					qp->sq.max, nreq);
+			err = -ENOMEM;
+			*bad_wr = wr;	/* first work request that was not posted */
+			goto out;
+		}
+
+		wqe = get_send_wqe(qp, ind);
+		prev_wqe = qp->sq.last;
+		qp->sq.last = wqe;
+
+		((struct mthca_next_seg *) wqe)->flags =	/* translate IB send flags into next-segment flag bits */
+			((wr->send_flags & IB_SEND_SIGNALED) ?
+			 cpu_to_be32(MTHCA_NEXT_CQ_UPDATE) : 0) |
+			((wr->send_flags & IB_SEND_SOLICITED) ?
+			 cpu_to_be32(MTHCA_NEXT_SOLICIT) : 0)   |
+			((wr->send_flags & IB_SEND_IP_CSUM) ?
+			 cpu_to_be32(MTHCA_NEXT_IP_CSUM | MTHCA_NEXT_TCP_UDP_CSUM) : 0) |
+			cpu_to_be32(1);
+		if (wr->opcode == IB_WR_SEND_WITH_IMM ||
+		    wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM)
+			((struct mthca_next_seg *) wqe)->imm = wr->ex.imm_data;
+
+		wqe += sizeof (struct mthca_next_seg);
+		size = sizeof (struct mthca_next_seg) / 16;
+
+		switch (qp->transport) {	/* transport-specific segments that follow the next segment */
+		case RC:
+			switch (wr->opcode) {
+			case IB_WR_ATOMIC_CMP_AND_SWP:
+			case IB_WR_ATOMIC_FETCH_AND_ADD:
+				set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
+					      wr->wr.atomic.rkey);
+				wqe += sizeof (struct mthca_raddr_seg);
+
+				set_atomic_seg(wqe, wr);
+				wqe  += sizeof (struct mthca_atomic_seg);
+				size += (sizeof (struct mthca_raddr_seg) +
+					 sizeof (struct mthca_atomic_seg)) / 16;
+				break;
+
+			case IB_WR_RDMA_READ:
+			case IB_WR_RDMA_WRITE:
+			case IB_WR_RDMA_WRITE_WITH_IMM:
+				set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
+					      wr->wr.rdma.rkey);
+				wqe  += sizeof (struct mthca_raddr_seg);
+				size += sizeof (struct mthca_raddr_seg) / 16;
+				break;
+
+			default:
+				/* No extra segments required for sends */
+				break;
+			}
+
+			break;
+
+		case UC:
+			switch (wr->opcode) {
+			case IB_WR_RDMA_WRITE:
+			case IB_WR_RDMA_WRITE_WITH_IMM:
+				set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
+					      wr->wr.rdma.rkey);
+				wqe  += sizeof (struct mthca_raddr_seg);
+				size += sizeof (struct mthca_raddr_seg) / 16;
+				break;
+
+			default:
+				/* No extra segments required for sends */
+				break;
+			}
+
+			break;
+
+		case UD:
+			set_arbel_ud_seg(wqe, wr);
+			wqe  += sizeof (struct mthca_arbel_ud_seg);
+			size += sizeof (struct mthca_arbel_ud_seg) / 16;
+			break;
+
+		case MLX:
+			err = build_mlx_header(dev, to_msqp(qp), ind, wr,
+					       wqe - sizeof (struct mthca_next_seg),
+					       wqe);
+			if (err) {
+				*bad_wr = wr;
+				goto out;
+			}
+			wqe += sizeof (struct mthca_data_seg);
+			size += sizeof (struct mthca_data_seg) / 16;
+			break;
+		}
+
+		if (wr->num_sge > qp->sq.max_gs) {
+			mthca_err(dev, "too many gathers\n");
+			err = -EINVAL;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		for (i = 0; i < wr->num_sge; ++i) {
+			mthca_set_data_seg(wqe, wr->sg_list + i);
+			wqe  += sizeof (struct mthca_data_seg);
+			size += sizeof (struct mthca_data_seg) / 16;
+		}
+
+		/* Add one more inline data segment for ICRC */
+		if (qp->transport == MLX) {
+			((struct mthca_data_seg *) wqe)->byte_count =
+				cpu_to_be32((1 << 31) | 4);
+			((u32 *) wqe)[1] = 0;
+			wqe += sizeof (struct mthca_data_seg);
+			size += sizeof (struct mthca_data_seg) / 16;
+		}
+
+		qp->wrid[ind] = wr->wr_id;	/* send wr_ids occupy the first sq.max slots of wrid[] */
+
+		if (wr->opcode >= ARRAY_SIZE(mthca_opcode)) {	/* NOTE(review): range check runs only after the WQE body was written */
+			mthca_err(dev, "opcode invalid\n");
+			err = -EINVAL;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		((struct mthca_next_seg *) prev_wqe)->nda_op =
+			cpu_to_be32(((ind << qp->sq.wqe_shift) +
+				     qp->send_wqe_offset) |
+				    mthca_opcode[wr->opcode]);
+		wmb();	/* nda_op of the previous WQE is stored before its ee_nds */
+		((struct mthca_next_seg *) prev_wqe)->ee_nds =
+			cpu_to_be32(MTHCA_NEXT_DBD | size |
+				    ((wr->send_flags & IB_SEND_FENCE) ?
+				     MTHCA_NEXT_FENCE : 0));
+
+		if (!nreq) {	/* first WQE of a batch: remember what the doorbell needs */
+			size0 = size;
+			op0   = mthca_opcode[wr->opcode];
+			f0    = wr->send_flags & IB_SEND_FENCE ?
+				MTHCA_SEND_DOORBELL_FENCE : 0;
+		}
+
+		++ind;
+		if (unlikely(ind >= qp->sq.max))
+			ind -= qp->sq.max;	/* wrap around the ring */
+	}
+
+out:	/* reached on success and on error; WQEs already built are still rung */
+	if (likely(nreq)) {
+		dbhi = (nreq << 24) | ((qp->sq.head & 0xffff) << 8) | f0 | op0;
+
+		qp->sq.head += nreq;
+
+		/*
+		 * Make sure that descriptors are written before
+		 * doorbell record.
+		 */
+		wmb();
+		*qp->sq.db = cpu_to_be32(qp->sq.head & 0xffff);
+
+		/*
+		 * Make sure doorbell record is written before we
+		 * write MMIO send doorbell.
+		 */
+		wmb();
+
+		mthca_write64(dbhi, (qp->qpn << 8) | size0, dev->kar + MTHCA_SEND_DOORBELL,
+			      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+	}
+
+	/*
+	 * Make sure doorbells don't leak out of SQ spinlock and reach
+	 * the HCA out of order:
+	 */
+	mmiowb();
+
+	spin_unlock_irqrestore(&qp->sq.lock, flags);
+	return err;	/* 0, or a negative errno with *bad_wr set */
+}
+
+int mthca_arbel_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+			     struct ib_recv_wr **bad_wr)
+{
+	struct mthca_dev *dev = to_mdev(ibqp->device);
+	struct mthca_qp *qp = to_mqp(ibqp);
+	unsigned long flags;
+	int err = 0;
+	int nreq;	/* number of WQEs built in this call */
+	int ind;	/* ring index of the receive WQE being built */
+	int i;
+	void *wqe;
+
+	spin_lock_irqsave(&qp->rq.lock, flags);	/* whole walk of the wr chain runs under the RQ lock */
+
+	/* XXX check that state is OK to post receive */
+
+	ind = qp->rq.head & (qp->rq.max - 1);	/* NOTE(review): mask form presumably relies on rq.max being a power of two */
+
+	for (nreq = 0; wr; ++nreq, wr = wr->next) {
+		if (mthca_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) {
+			mthca_err(dev, "RQ %06x full (%u head, %u tail,"
+					" %d max, %d nreq)\n", qp->qpn,
+					qp->rq.head, qp->rq.tail,
+					qp->rq.max, nreq);
+			err = -ENOMEM;
+			*bad_wr = wr;	/* first work request that was not posted */
+			goto out;
+		}
+
+		wqe = get_recv_wqe(qp, ind);
+
+		((struct mthca_next_seg *) wqe)->flags = 0;
+
+		wqe += sizeof (struct mthca_next_seg);
+
+		if (unlikely(wr->num_sge > qp->rq.max_gs)) {
+			err = -EINVAL;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		for (i = 0; i < wr->num_sge; ++i) {
+			mthca_set_data_seg(wqe, wr->sg_list + i);
+			wqe += sizeof (struct mthca_data_seg);
+		}
+
+		if (i < qp->rq.max_gs)
+			mthca_set_data_seg_inval(wqe);	/* mark the first unused scatter entry */
+
+		qp->wrid[ind + qp->sq.max] = wr->wr_id;	/* receive wr_ids sit after the sq.max send slots */
+
+		++ind;
+		if (unlikely(ind >= qp->rq.max))
+			ind -= qp->rq.max;	/* wrap around the ring */
+	}
+out:	/* reached on success and on error; WQEs already built are still published */
+	if (likely(nreq)) {
+		qp->rq.head += nreq;
+
+		/*
+		 * Make sure that descriptors are written before
+		 * doorbell record.
+		 */
+		wmb();
+		*qp->rq.db = cpu_to_be32(qp->rq.head & 0xffff);	/* only the doorbell record is written; no MMIO write in this function */
+	}
+
+	spin_unlock_irqrestore(&qp->rq.lock, flags);
+	return err;	/* 0, or -ENOMEM / -EINVAL with *bad_wr set */
+}
+
+void mthca_free_err_wqe(struct mthca_dev *dev, struct mthca_qp *qp, int is_send,
+			int index, int *dbd, __be32 *new_wqe)
+{
+	struct mthca_next_seg *seg;
+	__be32 nds;
+
+	/*
+	 * Receive WQEs on an SRQ each generate a CQE, which always puts
+	 * us at the end of the doorbell chain.  *dbd is left untouched.
+	 */
+	if (!is_send && qp->ibqp.srq) {
+		*new_wqe = 0;
+		return;
+	}
+
+	seg = is_send ? get_send_wqe(qp, index) : get_recv_wqe(qp, index);
+	nds = seg->ee_nds & cpu_to_be32(0x3f);
+
+	*dbd = !!(seg->ee_nds & cpu_to_be32(MTHCA_NEXT_DBD));
+	/* Low six bits of ee_nds clear: report no following WQE. */
+	if (!nds)
+		*new_wqe = 0;
+	else
+		*new_wqe = (seg->nda_op & cpu_to_be32(~0x3f)) | nds;
+}
+
+int mthca_init_qp_table(struct mthca_dev *dev)
+{
+	u8 status;
+	int err;
+	int i;
+
+	spin_lock_init(&dev->qp_table.lock);
+
+	/*
+	 * Each port gets 2 extra QPs for its special QPs.  Port 1's
+	 * special QP must sit on an even QP number, hence the round-up.
+	 */
+	dev->qp_table.sqp_start = (dev->limits.reserved_qps + 1) & ~1UL;
+	err = mthca_alloc_init(&dev->qp_table.alloc, dev->limits.num_qps,
+			       (1 << 24) - 1,
+			       dev->qp_table.sqp_start + MTHCA_MAX_PORTS * 2);
+	if (err)
+		return err;
+
+	err = mthca_array_init(&dev->qp_table.qp, dev->limits.num_qps);
+	if (err)
+		goto err_free_alloc;
+
+	/* First pass configures the SMI special QP, second pass the GSI one. */
+	for (i = 0; i < 2; ++i) {
+		err = mthca_CONF_SPECIAL_QP(dev, i ? IB_QPT_GSI : IB_QPT_SMI,
+					    dev->qp_table.sqp_start + i * 2,
+					    &status);
+		if (!err && status) {
+			mthca_warn(dev, "CONF_SPECIAL_QP returned "
+				   "status %02x, aborting.\n",
+				   status);
+			err = -EINVAL;
+		}
+		if (err)
+			goto err_unconf;
+	}
+	return 0;
+
+	/*
+	 * Unwind: CONF_SPECIAL_QP is re-issued with QP number 0 for
+	 * both types regardless of how far the loop above got, then
+	 * the two tables are released in reverse order of creation.
+	 */
+ err_unconf:
+	for (i = 0; i < 2; ++i)
+		mthca_CONF_SPECIAL_QP(dev, i, 0, &status);
+	mthca_array_cleanup(&dev->qp_table.qp, dev->limits.num_qps);
+ err_free_alloc:
+	mthca_alloc_cleanup(&dev->qp_table.alloc);
+
+	return err;
+}
+
+void mthca_cleanup_qp_table(struct mthca_dev *dev)
+{
+	u8 status;
+	int type;
+
+	/* Re-issue CONF_SPECIAL_QP with QP number 0 for types 0 and 1; results are ignored. */
+	for (type = 0; type < 2; ++type)
+		mthca_CONF_SPECIAL_QP(dev, type, 0, &status);
+	mthca_array_cleanup(&dev->qp_table.qp, dev->limits.num_qps);
+	mthca_alloc_cleanup(&dev->qp_table.alloc);
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_reset.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_reset.c
new file mode 100644
index 0000000..3c12461
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_reset.c
@@ -0,0 +1,298 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+int mthca_reset(struct mthca_dev *mdev)
+{
+	int i;
+	int err = 0;
+	u32 *hca_header    = NULL;	/* saved copy of the HCA's config space (64 dwords) */
+	u32 *bridge_header = NULL;	/* same for the bridge; only allocated under __linux__ */
+	struct pci_dev *bridge = NULL;
+	int bridge_pcix_cap = 0;
+	int hca_pcie_cap = 0;
+	int hca_pcix_cap = 0;
+
+	u16 devctl;
+	u16 linkctl;
+
+#define MTHCA_RESET_OFFSET 0xf0010	/* added to the start of PCI resource 0 below */
+#define MTHCA_RESET_VALUE  swab32(1)
+
+	/*
+	 * Reset the chip.  This is somewhat ugly because we have to
+	 * save off the PCI header before reset and then restore it
+	 * after the chip reboots.  We skip config space offsets 22
+	 * and 23 since those have a special meaning.
+	 *
+	 * To make matters worse, for Tavor (PCI-X HCA) we have to
+	 * find the associated bridge device and save off its PCI
+	 * header as well.
+	 */
+
+	if (!(mdev->mthca_flags & MTHCA_FLAG_PCIE)) {
+		/* Look for the bridge -- its device ID will be 2 more
+		   than HCA's device ID. */
+#ifdef __linux__
+		while ((bridge = pci_get_device(mdev->pdev->vendor,
+						mdev->pdev->device + 2,
+						bridge)) != NULL) {
+			if (bridge->hdr_type    == PCI_HEADER_TYPE_BRIDGE &&
+			    bridge->subordinate == mdev->pdev->bus) {
+				mthca_dbg(mdev, "Found bridge: %s\n",
+					  pci_name(bridge));
+				break;
+			}
+		}
+
+		if (!bridge) {
+			/*
+			 * Didn't find a bridge for a Tavor device --
+			 * assume we're in no-bridge mode and hope for
+			 * the best.
+			 */
+			mthca_warn(mdev, "No bridge found for %s\n",
+				  pci_name(mdev->pdev));
+		}
+#else
+		mthca_warn(mdev, "Reset on PCI-X is not supported.\n");
+		goto out;	/* NOTE(review): err is still 0 here, so the skipped reset is reported as success */
+
+#endif
+	}
+
+	/* For Arbel do we need to save off the full 4K PCI Express header?? */
+	hca_header = kmalloc(256, GFP_KERNEL);	/* 256 bytes = the 64 dwords read below */
+	if (!hca_header) {
+		err = -ENOMEM;
+		mthca_err(mdev, "Couldn't allocate memory to save HCA "
+			  "PCI header, aborting.\n");
+		goto out;
+	}
+
+	for (i = 0; i < 64; ++i) {
+		if (i == 22 || i == 23)
+			continue;	/* these two slots of hca_header stay uninitialized */
+		if (pci_read_config_dword(mdev->pdev, i * 4, hca_header + i)) {
+			err = -ENODEV;
+			mthca_err(mdev, "Couldn't save HCA "
+				  "PCI header, aborting.\n");
+			goto out;
+		}
+	}
+
+	hca_pcix_cap = pci_find_capability(mdev->pdev, PCI_CAP_ID_PCIX);	/* tested against 0 before the restore below */
+	hca_pcie_cap = pci_find_capability(mdev->pdev, PCI_CAP_ID_EXP);
+
+#ifdef __linux__
+	if (bridge) {
+		bridge_header = kmalloc(256, GFP_KERNEL);
+		if (!bridge_header) {
+			err = -ENOMEM;
+			mthca_err(mdev, "Couldn't allocate memory to save HCA "
+				  "bridge PCI header, aborting.\n");
+			goto out;
+		}
+
+		for (i = 0; i < 64; ++i) {
+			if (i == 22 || i == 23)
+				continue;
+			if (pci_read_config_dword(bridge, i * 4, bridge_header + i)) {
+				err = -ENODEV;
+				mthca_err(mdev, "Couldn't save HCA bridge "
+					  "PCI header, aborting.\n");
+				goto out;
+			}
+		}
+		bridge_pcix_cap = pci_find_capability(bridge, PCI_CAP_ID_PCIX);
+		if (!bridge_pcix_cap) {
+				err = -ENODEV;
+				mthca_err(mdev, "Couldn't locate HCA bridge "
+					  "PCI-X capability, aborting.\n");
+				goto out;
+		}
+	}
+#endif
+
+	/* actually hit reset */
+	{
+		void __iomem *reset = ioremap(pci_resource_start(mdev->pdev, 0) +
+					      MTHCA_RESET_OFFSET, 4);
+
+		if (!reset) {
+			err = -ENOMEM;
+			mthca_err(mdev, "Couldn't map HCA reset register, "
+				  "aborting.\n");
+			goto out;
+		}
+
+		writel(MTHCA_RESET_VALUE, reset);	/* the mapping is only needed for this one write */
+		iounmap(reset);
+	}
+
+	/* Docs say to wait one second before accessing device */
+	msleep(1000);
+
+	/* Now wait for PCI device to start responding again */
+	{
+		u32 v;
+		int c = 0;
+
+		for (c = 0; c < 100; ++c) {	/* up to 100 polls, 100 ms apart */
+			if (pci_read_config_dword(bridge ? bridge : mdev->pdev, 0, &v)) {
+				err = -ENODEV;
+				mthca_err(mdev, "Couldn't access HCA after reset, "
+					  "aborting.\n");
+				goto out;
+			}
+
+			if (v != 0xffffffff)	/* any other value at offset 0 is taken as "device is back" */
+				goto good;
+
+			msleep(100);
+		}
+
+		err = -ENODEV;
+		mthca_err(mdev, "PCI device did not come back after reset, "
+			  "aborting.\n");
+		goto out;
+	}
+
+good:
+	/* Now restore the PCI headers */
+	if (bridge) {
+		if (pci_write_config_dword(bridge, bridge_pcix_cap + 0x8,
+				 bridge_header[(bridge_pcix_cap + 0x8) / 4])) {
+			err = -ENODEV;
+			mthca_err(mdev, "Couldn't restore HCA bridge Upstream "
+				  "split transaction control, aborting.\n");
+			goto out;
+		}
+		if (pci_write_config_dword(bridge, bridge_pcix_cap + 0xc,
+				 bridge_header[(bridge_pcix_cap + 0xc) / 4])) {
+			err = -ENODEV;
+			mthca_err(mdev, "Couldn't restore HCA bridge Downstream "
+				  "split transaction control, aborting.\n");
+			goto out;
+		}
+		/*
+		 * Bridge control register is at 0x3e, so we'll
+		 * naturally restore it last in this loop.
+		 */
+		for (i = 0; i < 16; ++i) {	/* only the first 16 dwords (64 bytes) are written back */
+			if (i * 4 == PCI_COMMAND)
+				continue;	/* COMMAND is written separately, after the loop */
+
+			if (pci_write_config_dword(bridge, i * 4, bridge_header[i])) {
+				err = -ENODEV;
+				mthca_err(mdev, "Couldn't restore HCA bridge reg %x, "
+					  "aborting.\n", i);
+				goto out;
+			}
+		}
+
+		if (pci_write_config_dword(bridge, PCI_COMMAND,
+					   bridge_header[PCI_COMMAND / 4])) {
+			err = -ENODEV;
+			mthca_err(mdev, "Couldn't restore HCA bridge COMMAND, "
+				  "aborting.\n");
+			goto out;
+		}
+	}
+
+	if (hca_pcix_cap) {
+		if (pci_write_config_dword(mdev->pdev, hca_pcix_cap,
+				 hca_header[hca_pcix_cap / 4])) {
+			err = -ENODEV;
+			mthca_err(mdev, "Couldn't restore HCA PCI-X "
+				  "command register, aborting.\n");
+			goto out;
+		}
+	}
+
+	if (hca_pcie_cap) {
+		devctl = hca_header[(hca_pcie_cap + PCI_EXP_DEVCTL) / 4];	/* u16 assignment keeps the low 16 bits of the saved dword */
+		if (pci_write_config_word(mdev->pdev, hca_pcie_cap + PCI_EXP_DEVCTL,
+					   devctl)) {
+			err = -ENODEV;
+			mthca_err(mdev, "Couldn't restore HCA PCI Express "
+				  "Device Control register, aborting.\n");
+			goto out;
+		}
+		linkctl = hca_header[(hca_pcie_cap + PCI_EXP_LNKCTL) / 4];	/* same truncation as devctl above */
+		if (pci_write_config_word(mdev->pdev, hca_pcie_cap + PCI_EXP_LNKCTL,
+					   linkctl)) {
+			err = -ENODEV;
+			mthca_err(mdev, "Couldn't restore HCA PCI Express "
+				  "Link control register, aborting.\n");
+			goto out;
+		}
+	}
+
+	for (i = 0; i < 16; ++i) {	/* only the first 16 dwords (64 bytes) are written back */
+		if (i * 4 == PCI_COMMAND)
+			continue;	/* COMMAND is written separately, after the loop */
+
+		if (pci_write_config_dword(mdev->pdev, i * 4, hca_header[i])) {
+			err = -ENODEV;
+			mthca_err(mdev, "Couldn't restore HCA reg %x, "
+				  "aborting.\n", i);
+			goto out;
+		}
+	}
+
+	if (pci_write_config_dword(mdev->pdev, PCI_COMMAND,
+				   hca_header[PCI_COMMAND / 4])) {
+		err = -ENODEV;
+		mthca_err(mdev, "Couldn't restore HCA COMMAND, "
+			  "aborting.\n");
+		goto out;
+	}
+
+out:	/* common exit for success and failure: drop the bridge reference and free both saved headers */
+#ifdef __linux__
+	if (bridge)
+		pci_dev_put(bridge);
+#endif
+	kfree(bridge_header);
+	kfree(hca_header);
+
+	return err;	/* 0, -ENOMEM or -ENODEV */
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_srq.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_srq.c
new file mode 100644
index 0000000..4fabe62
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_srq.c
@@ -0,0 +1,715 @@
+/*
+ * Copyright (c) 2005 Cisco Systems. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/sched.h>
+
+#include <asm/io.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+#include "mthca_memfree.h"
+#include "mthca_wqe.h"
+
+enum {
+	MTHCA_MAX_DIRECT_SRQ_SIZE = 4 * PAGE_SIZE	/* passed to mthca_buf_alloc() when the SRQ buffer is allocated */
+};
+
+struct mthca_tavor_srq_context {	/* filled by mthca_tavor_init_srq_context(); read back in mthca_query_srq() */
+	__be64 wqe_base_ds;	/* low 6 bits is descriptor size */
+	__be32 state_pd;	/* initialised to pd->pd_num */
+	__be32 lkey;		/* initialised to srq->mr.ibmr.lkey */
+	__be32 uar;		/* UAR index: the user context's, or the driver's */
+	__be16 limit_watermark;	/* reported as srq_limit by mthca_query_srq() */
+	__be16 wqe_cnt;
+	u32    reserved[2];
+};
+
+struct mthca_arbel_srq_context {	/* filled by mthca_arbel_init_srq_context(); read back in mthca_query_srq() */
+	__be32 state_logsize_srqn;	/* initialised to ilog2(srq->max) << 24 | srq->srqn */
+	__be32 lkey;			/* initialised to srq->mr.ibmr.lkey */
+	__be32 db_index;		/* initialised to srq->db_index */
+	__be32 logstride_usrpage;	/* (wqe_shift - 4) << 29, OR-ed with a UAR index */
+	__be64 wqe_base;
+	__be32 eq_pd;			/* initialised to MTHCA_EQ_ASYNC << 24 | pd->pd_num */
+	__be16 limit_watermark;		/* reported as srq_limit by mthca_query_srq() */
+	__be16 wqe_cnt;
+	u16    reserved1;
+	__be16 wqe_counter;
+	u32    reserved2[3];
+};
+
+static void *get_wqe(struct mthca_srq *srq, int n)
+{
+	int off = n << srq->wqe_shift;	/* byte offset of WQE n within the queue */
+	if (!srq->is_direct)	/* paged buffer: pick the page, then the offset inside it */
+		return srq->queue.page_list[off >> PAGE_SHIFT].buf +
+			(off & (PAGE_SIZE - 1));
+	return srq->queue.direct.buf + off;
+}
+
+/*
+ * Return a pointer to the location within a WQE that we're using as a
+ * link when the WQE is in the free list.  We use the imm field
+ * because in the Tavor case, posting a WQE may overwrite the next
+ * segment of the previous WQE, but a receive WQE will never touch the
+ * imm field.  This avoids corrupting our free list if the previous
+ * WQE has already completed and been put on the free list when we
+ * post the next WQE.
+ */
+static inline int *wqe_to_link(void *wqe)
+{
+	return (int *) &((struct mthca_next_seg *) wqe)->imm;	/* the imm field doubles as the free-list link */
+}
+
+static void mthca_tavor_init_srq_context(struct mthca_dev *dev,
+					 struct mthca_pd *pd,
+					 struct mthca_srq *srq,
+					 struct mthca_tavor_srq_context *context)
+{
+	/* Every field not assigned below is left zeroed. */
+	memset(context, 0, sizeof *context);
+
+	context->lkey        = cpu_to_be32(srq->mr.ibmr.lkey);
+	context->state_pd    = cpu_to_be32(pd->pd_num);
+	context->wqe_base_ds = cpu_to_be64(1 << (srq->wqe_shift - 4));
+
+	/* Use the pd's user context UAR when it has a uobject, else the driver's. */
+	context->uar = pd->ibpd.uobject ?
+		cpu_to_be32(to_mucontext(pd->ibpd.uobject->context)->uar.index) :
+		cpu_to_be32(dev->driver_uar.index);
+}
+
+static void mthca_arbel_init_srq_context(struct mthca_dev *dev,
+					 struct mthca_pd *pd,
+					 struct mthca_srq *srq,
+					 struct mthca_arbel_srq_context *context)
+{
+	int logsize, max;
+
+	memset(context, 0, sizeof *context);
+
+	/*
+	 * ilog2() gets a local copy of srq->max, not the field itself:
+	 * this works around a gcc bug that ilog2() triggers on sparc64.
+	 */
+	max = srq->max;
+	logsize = ilog2(max);
+	context->state_logsize_srqn = cpu_to_be32(logsize << 24 | srq->srqn);
+	context->lkey = cpu_to_be32(srq->mr.ibmr.lkey);
+	context->db_index = cpu_to_be32(srq->db_index);
+	context->eq_pd = cpu_to_be32(MTHCA_EQ_ASYNC << 24 | pd->pd_num);
+
+	/* (wqe_shift - 4) is placed at bit 29; a UAR index is OR-ed in below. */
+	context->logstride_usrpage = cpu_to_be32((srq->wqe_shift - 4) << 29);
+	context->logstride_usrpage |= pd->ibpd.uobject ?
+		cpu_to_be32(to_mucontext(pd->ibpd.uobject->context)->uar.index) :
+		cpu_to_be32(dev->driver_uar.index);
+}
+
+static void mthca_free_srq_buf(struct mthca_dev *dev, struct mthca_srq *srq)
+{
+	mthca_buf_free(dev, srq->max << srq->wqe_shift, &srq->queue,	/* same size expression as the mthca_buf_alloc() call in mthca_alloc_srq_buf() */
+		       srq->is_direct, &srq->mr);
+	kfree(srq->wrid);	/* wr_id array kmalloc'ed in mthca_alloc_srq_buf() */
+}
+
+static int mthca_alloc_srq_buf(struct mthca_dev *dev, struct mthca_pd *pd,
+			       struct mthca_srq *srq)
+{
+	struct mthca_data_seg *sg;
+	void *wqe;
+	int err;
+	int i;
+
+	/* A pd with a uobject gets no kernel-side buffer from here. */
+	if (pd->ibpd.uobject)
+		return 0;
+
+	srq->wrid = kmalloc(srq->max * sizeof (u64), GFP_KERNEL);
+	if (!srq->wrid)
+		return -ENOMEM;
+
+	err = mthca_buf_alloc(dev, srq->max << srq->wqe_shift,
+			      MTHCA_MAX_DIRECT_SRQ_SIZE,
+			      &srq->queue, &srq->is_direct, pd, 1, &srq->mr);
+	if (err) {
+		kfree(srq->wrid);
+		return err;
+	}
+
+	/*
+	 * Thread every WQE onto the free list (each one links to its
+	 * successor; the last one is terminated with -1) and stamp all
+	 * scatter entries with the MTHCA_INVAL_LKEY sentinel.
+	 */
+	for (i = 0; i < srq->max; ++i) {
+		struct mthca_next_seg *next;
+		void *wqe_end;
+		int link = i < srq->max - 1 ? i + 1 : -1;
+
+		next = wqe = get_wqe(srq, i);
+		wqe_end = wqe + (1 << srq->wqe_shift);
+		*wqe_to_link(wqe) = link;
+		if (link < 0)
+			next->nda_op = 0;
+		else
+			next->nda_op = htonl((link << srq->wqe_shift) | 1);
+
+		for (sg = wqe + sizeof (struct mthca_next_seg);
+		     (void *) sg < wqe_end; ++sg)
+			sg->lkey = cpu_to_be32(MTHCA_INVAL_LKEY);
+	}
+
+	srq->last = get_wqe(srq, srq->max - 1);
+
+	return 0;
+}
+
+/* Create an SRQ and pass it to hardware; returns 0 or a negative errno. */
+int mthca_alloc_srq(struct mthca_dev *dev, struct mthca_pd *pd,
+		    struct ib_srq_attr *attr, struct mthca_srq *srq)
+{
+	struct mthca_mailbox *mailbox;
+	u8 status;
+	int ds;
+	int err, hw_err;
+
+	/* Sanity check SRQ size before proceeding */
+	if (attr->max_wr  > dev->limits.max_srq_wqes ||
+	    attr->max_sge > dev->limits.max_srq_sge)
+		return -EINVAL;
+
+	srq->max      = attr->max_wr;
+	srq->max_gs   = attr->max_sge;
+	srq->counter  = 0;
+
+	if (mthca_is_memfree(dev))	/* one extra WQE; memfree also rounds up to a power of two */
+		srq->max = roundup_pow_of_two(srq->max + 1);
+	else
+		srq->max = srq->max + 1;
+
+	ds = max(64UL,
+		 roundup_pow_of_two(sizeof (struct mthca_next_seg) +
+				    srq->max_gs * sizeof (struct mthca_data_seg)));
+
+	if (!mthca_is_memfree(dev) && (ds > dev->limits.max_desc_sz))
+		return -EINVAL;
+
+	srq->wqe_shift = ilog2(ds);
+
+	srq->srqn = mthca_alloc(&dev->srq_table.alloc);
+	if (srq->srqn == -1)
+		return -ENOMEM;
+
+	if (mthca_is_memfree(dev)) {
+		err = mthca_table_get(dev, dev->srq_table.table, srq->srqn);
+		if (err)
+			goto err_out;
+
+		if (!pd->ibpd.uobject) {
+			srq->db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_SRQ,
+						       srq->srqn, &srq->db);
+			if (srq->db_index < 0) {
+				err = -ENOMEM;
+				goto err_out_icm;
+			}
+		}
+	}
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox)) {
+		err = PTR_ERR(mailbox);
+		goto err_out_db;
+	}
+
+	err = mthca_alloc_srq_buf(dev, pd, srq);
+	if (err)
+		goto err_out_mailbox;
+
+	spin_lock_init(&srq->lock);
+	srq->refcount = 1;	/* dropped again in mthca_free_srq() */
+	init_waitqueue_head(&srq->wait);
+	mutex_init(&srq->mutex);
+
+	if (mthca_is_memfree(dev))
+		mthca_arbel_init_srq_context(dev, pd, srq, mailbox->buf);
+	else
+		mthca_tavor_init_srq_context(dev, pd, srq, mailbox->buf);
+
+	err = mthca_SW2HW_SRQ(dev, mailbox, srq->srqn, &status);
+
+	if (err) {
+		mthca_warn(dev, "SW2HW_SRQ failed (%d)\n", err);
+		goto err_out_free_buf;
+	}
+	if (status) {
+		mthca_warn(dev, "SW2HW_SRQ returned status 0x%02x\n", status);
+		err = -EINVAL;
+		goto err_out_free_buf;
+	}
+
+	spin_lock_irq(&dev->srq_table.lock);
+	if (mthca_array_set(&dev->srq_table.srq,
+			    srq->srqn & (dev->limits.num_srqs - 1), srq)) {
+		spin_unlock_irq(&dev->srq_table.lock);
+		err = -ENOMEM;	/* err is 0 at this point; a failed insert must not look like success */
+		goto err_out_free_srq;
+	}
+	spin_unlock_irq(&dev->srq_table.lock);
+
+	mthca_free_mailbox(dev, mailbox);
+
+	srq->first_free = 0;
+	srq->last_free  = srq->max - 1;
+
+	attr->max_wr    = srq->max - 1;	/* report the sizes actually provided back to the caller */
+	attr->max_sge   = srq->max_gs;
+
+	return 0;
+
+err_out_free_srq:
+	hw_err = mthca_HW2SW_SRQ(dev, mailbox, srq->srqn, &status);	/* kept apart from err so cleanup cannot clobber it */
+	if (hw_err)
+		mthca_warn(dev, "HW2SW_SRQ failed (%d)\n", hw_err);
+	else if (status)
+		mthca_warn(dev, "HW2SW_SRQ returned status 0x%02x\n", status);
+
+err_out_free_buf:
+	if (!pd->ibpd.uobject)
+		mthca_free_srq_buf(dev, srq);
+
+err_out_mailbox:
+	mthca_free_mailbox(dev, mailbox);
+
+err_out_db:
+	if (!pd->ibpd.uobject && mthca_is_memfree(dev))
+		mthca_free_db(dev, MTHCA_DB_TYPE_SRQ, srq->db_index);
+
+err_out_icm:
+	mthca_table_put(dev, dev->srq_table.table, srq->srqn);
+
+err_out:
+	mthca_free(&dev->srq_table.alloc, srq->srqn);
+
+	return err;
+}
+
+static inline int get_srq_refcount(struct mthca_dev *dev, struct mthca_srq *srq)
+{
+	int refs;
+
+	/* Snapshot the count under the same lock that guards its updates. */
+	spin_lock_irq(&dev->srq_table.lock);
+	refs = srq->refcount;
+	spin_unlock_irq(&dev->srq_table.lock);
+	return refs;
+}
+
+void mthca_free_srq(struct mthca_dev *dev, struct mthca_srq *srq)
+{
+	struct mthca_mailbox *mailbox;
+	int err;
+	u8 status;
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox)) {
+		mthca_warn(dev, "No memory for mailbox to free SRQ.\n");
+		return;	/* NOTE(review): nothing below runs on this path, so no SRQ resource is released */
+	}
+
+	err = mthca_HW2SW_SRQ(dev, mailbox, srq->srqn, &status);	/* failures are only logged; teardown continues */
+	if (err)
+		mthca_warn(dev, "HW2SW_SRQ failed (%d)\n", err);
+	else if (status)
+		mthca_warn(dev, "HW2SW_SRQ returned status 0x%02x\n", status);
+
+	spin_lock_irq(&dev->srq_table.lock);
+	mthca_array_clear(&dev->srq_table.srq,	/* after this, mthca_srq_event() can no longer look the SRQ up */
+			  srq->srqn & (dev->limits.num_srqs - 1));
+	--srq->refcount;	/* drops the initial reference set in mthca_alloc_srq() */
+	spin_unlock_irq(&dev->srq_table.lock);
+
+	wait_event(srq->wait, !get_srq_refcount(dev, srq));	/* woken by mthca_srq_event() when the count reaches zero */
+
+	if (!srq->ibsrq.uobject) {	/* buffer and doorbell are freed here only when there is no uobject */
+		mthca_free_srq_buf(dev, srq);
+		if (mthca_is_memfree(dev))
+			mthca_free_db(dev, MTHCA_DB_TYPE_SRQ, srq->db_index);
+	}
+
+	mthca_table_put(dev, dev->srq_table.table, srq->srqn);
+	mthca_free(&dev->srq_table.alloc, srq->srqn);
+	mthca_free_mailbox(dev, mailbox);
+}
+
+int mthca_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+		     enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
+{
+	struct mthca_dev *dev = to_mdev(ibsrq->device);
+	struct mthca_srq *srq = to_msrq(ibsrq);
+	u32 limit_cap;
+	int ret;
+	u8 status;
+
+	/* Resizing an SRQ is not supported (yet?). */
+	if (attr_mask & IB_SRQ_MAX_WR)
+		return -EINVAL;
+
+	/* The limit is the only attribute acted on; without it there is nothing to do. */
+	if (!(attr_mask & IB_SRQ_LIMIT))
+		return 0;
+
+	limit_cap = mthca_is_memfree(dev) ? srq->max - 1 : srq->max;
+	if (attr->srq_limit > limit_cap)
+		return -EINVAL;
+
+	mutex_lock(&srq->mutex);
+	ret = mthca_ARM_SRQ(dev, srq->srqn, attr->srq_limit, &status);
+	mutex_unlock(&srq->mutex);
+
+	if (ret)
+		return ret;
+	return status ? -EINVAL : 0;
+}
+
+int mthca_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr)
+{
+	struct mthca_dev *dev = to_mdev(ibsrq->device);
+	struct mthca_srq *srq = to_msrq(ibsrq);
+	struct mthca_mailbox *mailbox;	/* receives the SRQ context returned by QUERY_SRQ */
+	struct mthca_arbel_srq_context *arbel_ctx;
+	struct mthca_tavor_srq_context *tavor_ctx;
+	u8 status;
+	int err;
+
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	err = mthca_QUERY_SRQ(dev, srq->srqn, mailbox, &status);
+	if (!err && status)
+		err = -EINVAL;	/* non-zero status: same mapping as mthca_modify_srq() */
+	if (err)
+		goto out;	/* srq_attr is left untouched on failure */
+
+	if (mthca_is_memfree(dev)) {	/* the context layout differs between the two HCA modes */
+		arbel_ctx = mailbox->buf;
+		srq_attr->srq_limit = be16_to_cpu(arbel_ctx->limit_watermark);
+	} else {
+		tavor_ctx = mailbox->buf;
+		srq_attr->srq_limit = be16_to_cpu(tavor_ctx->limit_watermark);
+	}
+	srq_attr->max_wr  = srq->max - 1;	/* same value mthca_alloc_srq() reported at creation */
+	srq_attr->max_sge = srq->max_gs;
+
+out:
+	mthca_free_mailbox(dev, mailbox);
+	return err;	/* 0, -EINVAL, or the error from mthca_QUERY_SRQ() */
+}
+
+void mthca_srq_event(struct mthca_dev *dev, u32 srqn,
+		     enum ib_event_type event_type)
+{
+	struct mthca_srq *srq;
+	struct ib_event event;
+
+	/* Look the SRQ up and pin it with a reference while the table lock is held. */
+	spin_lock(&dev->srq_table.lock);
+	srq = mthca_array_get(&dev->srq_table.srq, srqn & (dev->limits.num_srqs - 1));
+	if (srq)
+		++srq->refcount;
+	spin_unlock(&dev->srq_table.lock);
+
+	if (!srq) {
+		mthca_warn(dev, "Async event for bogus SRQ %08x\n", srqn);
+		return;
+	}
+
+	if (srq->ibsrq.event_handler) {
+		event.device      = &dev->ib_dev;
+		event.event       = event_type;
+		event.element.srq = &srq->ibsrq;
+		srq->ibsrq.event_handler(&event, srq->ibsrq.srq_context);
+	}
+
+	/* Drop our reference; wake the waiter on srq->wait if it was the last one. */
+	spin_lock(&dev->srq_table.lock);
+	if (!--srq->refcount)
+		wake_up(&srq->wait);
+	spin_unlock(&dev->srq_table.lock);
+}
+
+/*
+ * Append a WQE to the tail of the SRQ free list.
+ *
+ * wqe_addr is converted to a WQE index by shifting it right by
+ * srq->wqe_shift.  srq->lock is taken with plain spin_lock(), so this
+ * function must be called with IRQs disabled.
+ */
+void mthca_free_srq_wqe(struct mthca_srq *srq, u32 wqe_addr)
+{
+	struct mthca_next_seg *tail;
+	int freed = wqe_addr >> srq->wqe_shift;
+
+	spin_lock(&srq->lock);
+
+	/*
+	 * Chain the current tail to the freed entry, both in the
+	 * software link and in the descriptor's next-address word.
+	 */
+	tail = get_wqe(srq, srq->last_free);
+	*wqe_to_link(tail) = freed;
+	tail->nda_op = htonl((freed << srq->wqe_shift) | 1);
+
+	/* The freed entry becomes the new tail; -1 terminates the list. */
+	*wqe_to_link(get_wqe(srq, freed)) = -1;
+	srq->last_free = freed;
+
+	spin_unlock(&srq->lock);
+}
+
+/*
+ * Post a chain of receive work requests to an SRQ, notifying the HCA by
+ * writing the receive doorbell at dev->kar + MTHCA_RECEIVE_DOORBELL.
+ *
+ * Runs with srq->lock held via spin_lock_irqsave().  Each request
+ * consumes the WQE at srq->first_free.  A doorbell is rung after every
+ * MTHCA_TAVOR_MAX_WQES_PER_RECV_DB requests and once more for any
+ * remainder, so requests accepted before a failure are still handed to
+ * the hardware.
+ *
+ * Returns 0 on success.  On failure *bad_wr points at the first request
+ * that was not posted and the return value is -ENOMEM (free list
+ * exhausted) or -EINVAL (wr->num_sge exceeds srq->max_gs).
+ */
+int mthca_tavor_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+			      struct ib_recv_wr **bad_wr)
+{
+	struct mthca_dev *dev = to_mdev(ibsrq->device);
+	struct mthca_srq *srq = to_msrq(ibsrq);
+	unsigned long flags;
+	int err = 0;
+	int first_ind;
+	int ind;
+	int next_ind;
+	int nreq;
+	int i;
+	void *wqe;
+	void *prev_wqe;
+
+	spin_lock_irqsave(&srq->lock, flags);
+
+	/* Index of the first WQE covered by the next doorbell. */
+	first_ind = srq->first_free;
+
+	for (nreq = 0; wr; wr = wr->next) {
+		ind       = srq->first_free;
+		wqe       = get_wqe(srq, ind);
+		next_ind  = *wqe_to_link(wqe);
+
+		/*
+		 * A negative link means this is the last entry on the
+		 * free list; it is never handed out, so the queue is
+		 * reported as full.
+		 */
+		if (unlikely(next_ind < 0)) {
+			mthca_err(dev, "SRQ %06x full\n", srq->srqn);
+			err = -ENOMEM;
+			*bad_wr = wr;
+			break;
+		}
+
+		/*
+		 * Remember the previously posted WQE: it is flagged below
+		 * once this one is built, and srq->last is rolled back to
+		 * it if this request is rejected.
+		 */
+		prev_wqe  = srq->last;
+		srq->last = wqe;
+
+		((struct mthca_next_seg *) wqe)->ee_nds = 0;
+		/* flags field will always remain 0 */
+
+		wqe += sizeof (struct mthca_next_seg);
+
+		if (unlikely(wr->num_sge > srq->max_gs)) {
+			err = -EINVAL;
+			*bad_wr = wr;
+			srq->last = prev_wqe;
+			break;
+		}
+
+		for (i = 0; i < wr->num_sge; ++i) {
+			mthca_set_data_seg(wqe, wr->sg_list + i);
+			wqe += sizeof (struct mthca_data_seg);
+		}
+
+		/* Terminate a short scatter list with an invalid-lkey entry. */
+		if (i < srq->max_gs)
+			mthca_set_data_seg_inval(wqe);
+
+		/* Set the DBD bit in the previously posted WQE. */
+		((struct mthca_next_seg *) prev_wqe)->ee_nds =
+			cpu_to_be32(MTHCA_NEXT_DBD);
+
+		srq->wrid[ind]  = wr->wr_id;
+		srq->first_free = next_ind;
+
+		++nreq;
+		if (unlikely(nreq == MTHCA_TAVOR_MAX_WQES_PER_RECV_DB)) {
+			/*
+			 * Flush a full batch.  The count field of the
+			 * doorbell is left at 0 here.
+			 * NOTE(review): presumably the hardware reads a
+			 * zero count as a full batch -- confirm.
+			 */
+			nreq = 0;
+
+			/*
+			 * Make sure that descriptors are written
+			 * before doorbell is rung.
+			 */
+			wmb();
+
+			mthca_write64(first_ind << srq->wqe_shift, srq->srqn << 8,
+				      dev->kar + MTHCA_RECEIVE_DOORBELL,
+				      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+
+			first_ind = srq->first_free;
+		}
+	}
+
+	/* Ring the doorbell for whatever is left of the last batch. */
+	if (likely(nreq)) {
+		/*
+		 * Make sure that descriptors are written before
+		 * doorbell is rung.
+		 */
+		wmb();
+
+		mthca_write64(first_ind << srq->wqe_shift, (srq->srqn << 8) | nreq,
+			      dev->kar + MTHCA_RECEIVE_DOORBELL,
+			      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+	}
+
+	/*
+	 * Make sure doorbells don't leak out of SRQ spinlock and
+	 * reach the HCA out of order:
+	 */
+	mmiowb();
+
+	spin_unlock_irqrestore(&srq->lock, flags);
+	return err;
+}
+
+/*
+ * Post a chain of receive work requests to an SRQ, notifying the HCA by
+ * storing the running request count (srq->counter) into the doorbell
+ * record *srq->db.
+ *
+ * Runs with srq->lock held via spin_lock_irqsave().  Each request
+ * consumes the WQE at srq->first_free.  The doorbell record is updated
+ * once, after the loop, for all requests that were accepted -- including
+ * those accepted before a failing request.
+ *
+ * Returns 0 on success.  On failure *bad_wr points at the first request
+ * that was not posted and the return value is -ENOMEM (free list
+ * exhausted) or -EINVAL (wr->num_sge exceeds srq->max_gs).
+ */
+int mthca_arbel_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+			      struct ib_recv_wr **bad_wr)
+{
+	struct mthca_dev *dev = to_mdev(ibsrq->device);
+	struct mthca_srq *srq = to_msrq(ibsrq);
+	unsigned long flags;
+	int err = 0;
+	int ind;
+	int next_ind;
+	int nreq;
+	int i;
+	void *wqe;
+
+	spin_lock_irqsave(&srq->lock, flags);
+
+	/* nreq counts only fully built requests: a break skips the ++nreq. */
+	for (nreq = 0; wr; ++nreq, wr = wr->next) {
+		ind       = srq->first_free;
+		wqe       = get_wqe(srq, ind);
+		next_ind  = *wqe_to_link(wqe);
+
+		/*
+		 * A negative link means this is the last entry on the
+		 * free list; it is never handed out, so the queue is
+		 * reported as full.
+		 */
+		if (unlikely(next_ind < 0)) {
+			mthca_err(dev, "SRQ %06x full\n", srq->srqn);
+			err = -ENOMEM;
+			*bad_wr = wr;
+			break;
+		}
+
+		((struct mthca_next_seg *) wqe)->ee_nds = 0;
+		/* flags field will always remain 0 */
+
+		wqe += sizeof (struct mthca_next_seg);
+
+		if (unlikely(wr->num_sge > srq->max_gs)) {
+			err = -EINVAL;
+			*bad_wr = wr;
+			break;
+		}
+
+		for (i = 0; i < wr->num_sge; ++i) {
+			mthca_set_data_seg(wqe, wr->sg_list + i);
+			wqe += sizeof (struct mthca_data_seg);
+		}
+
+		/* Terminate a short scatter list with an invalid-lkey entry. */
+		if (i < srq->max_gs)
+			mthca_set_data_seg_inval(wqe);
+
+		/* Commit: record the caller's ID and advance the free list. */
+		srq->wrid[ind]  = wr->wr_id;
+		srq->first_free = next_ind;
+	}
+
+	if (likely(nreq)) {
+		srq->counter += nreq;
+
+		/*
+		 * Make sure that descriptors are written before
+		 * we write doorbell record.
+		 */
+		wmb();
+		*srq->db = cpu_to_be32(srq->counter);
+	}
+
+	spin_unlock_irqrestore(&srq->lock, flags);
+	return err;
+}
+
+/*
+ * Return the largest number of scatter/gather entries an SRQ WQE can
+ * carry on this device.
+ */
+int mthca_max_srq_sge(struct mthca_dev *dev)
+{
+	size_t pow2_desc_sz;
+	size_t sge_capacity;
+
+	if (mthca_is_memfree(dev))
+		return dev->limits.max_sg;
+
+	/*
+	 * Tavor SRQ descriptors are allocated in power-of-2 sizes, even
+	 * though the hardware only requires multiples of 16 bytes.  The
+	 * firmware's max_sg is derived from the multiple-of-16 limit, so
+	 * it can be too optimistic here.  Round the maximum descriptor
+	 * size down to a power of 2, subtract the "next" segment header
+	 * and count how many data segments fit in what remains.
+	 *
+	 * Should the Tavor SRQ code ever switch to multiple-of-16 sizing,
+	 * this calculation can go away and the firmware max_sg can be
+	 * returned directly.
+	 */
+	pow2_desc_sz = 1 << (fls(dev->limits.max_desc_sz) - 1);
+	sge_capacity = (pow2_desc_sz - sizeof (struct mthca_next_seg)) /
+		       sizeof (struct mthca_data_seg);
+
+	return min_t(int, dev->limits.max_sg, sge_capacity);
+}
+
+/*
+ * Set up the SRQ number allocator and the SRQ lookup array.
+ *
+ * Does nothing (and returns 0) when the device does not advertise
+ * MTHCA_FLAG_SRQ.  Otherwise returns 0 on success or the error from
+ * whichever initialisation step failed; the allocator is torn down
+ * again if the array cannot be created.
+ */
+int mthca_init_srq_table(struct mthca_dev *dev)
+{
+	int rc;
+
+	if (!(dev->mthca_flags & MTHCA_FLAG_SRQ))
+		return 0;
+
+	spin_lock_init(&dev->srq_table.lock);
+
+	rc = mthca_alloc_init(&dev->srq_table.alloc,
+			      dev->limits.num_srqs,
+			      dev->limits.num_srqs - 1,
+			      dev->limits.reserved_srqs);
+	if (rc)
+		return rc;
+
+	rc = mthca_array_init(&dev->srq_table.srq, dev->limits.num_srqs);
+	if (!rc)
+		return 0;
+
+	/* Undo the allocator set-up before reporting the failure. */
+	mthca_alloc_cleanup(&dev->srq_table.alloc);
+	return rc;
+}
+
+/*
+ * Release the SRQ lookup array and number allocator.  A no-op when the
+ * device does not advertise MTHCA_FLAG_SRQ, matching
+ * mthca_init_srq_table().
+ */
+void mthca_cleanup_srq_table(struct mthca_dev *dev)
+{
+	if (dev->mthca_flags & MTHCA_FLAG_SRQ) {
+		mthca_array_cleanup(&dev->srq_table.srq,
+				    dev->limits.num_srqs);
+		mthca_alloc_cleanup(&dev->srq_table.alloc);
+	}
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_uar.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_uar.c
new file mode 100644
index 0000000..ca5900c
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_uar.c
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <asm/page.h>		/* PAGE_SHIFT */
+
+#include "mthca_dev.h"
+#include "mthca_memfree.h"
+
+/*
+ * Allocate a UAR index and compute the page frame number that backs it.
+ *
+ * The PFN is the start of PCI resource 2 of the device, expressed in
+ * pages, plus the allocated index.  Returns 0 on success or -ENOMEM
+ * when the allocator has no free index (reported as -1).
+ */
+int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar)
+{
+	uar->index = mthca_alloc(&dev->uar_table.alloc);
+	if (uar->index != -1) {
+		uar->pfn = (pci_resource_start(dev->pdev, 2) >> PAGE_SHIFT) +
+			   uar->index;
+		return 0;
+	}
+
+	return -ENOMEM;
+}
+
+/*
+ * Return the index held by uar to the UAR allocator.  uar->pfn is left
+ * untouched.
+ */
+void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar)
+{
+	mthca_free(&dev->uar_table.alloc, uar->index);
+}
+
+/*
+ * Initialise the UAR index allocator and then the doorbell table.
+ *
+ * One index more than dev->limits.reserved_uars is passed to the
+ * allocator as reserved.  Returns 0 on success or the error from the
+ * failing step; the allocator is cleaned up again if the doorbell table
+ * cannot be initialised.
+ */
+int mthca_init_uar_table(struct mthca_dev *dev)
+{
+	int err = mthca_alloc_init(&dev->uar_table.alloc,
+				   dev->limits.num_uars,
+				   dev->limits.num_uars - 1,
+				   dev->limits.reserved_uars + 1);
+
+	if (!err) {
+		err = mthca_init_db_tab(dev);
+		if (err)
+			mthca_alloc_cleanup(&dev->uar_table.alloc);
+	}
+
+	return err;
+}
+
+/*
+ * Tear down what mthca_init_uar_table() set up, in reverse order: the
+ * doorbell table first, then the UAR index allocator.
+ */
+void mthca_cleanup_uar_table(struct mthca_dev *dev)
+{
+	mthca_cleanup_db_tab(dev);
+
+	/* XXX check if any UARs are still allocated? */
+	mthca_alloc_cleanup(&dev->uar_table.alloc);
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_user.h b/sys/ofed/drivers/infiniband/hw/mthca/mthca_user.h
new file mode 100644
index 0000000..5fe56e8
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_user.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MTHCA_USER_H
+#define MTHCA_USER_H
+
+#include <linux/types.h>
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define MTHCA_UVERBS_ABI_VERSION	1
+
+/*
+ * Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ * In particular do not use pointer types -- pass pointers in __u64
+ * instead.
+ */
+
+/*
+ * Two __u32 values: 8 bytes, no implicit padding.
+ * NOTE(review): going by the name, this is the driver-specific reply to
+ * a user context allocation -- confirm field meanings against the code
+ * that fills it in.
+ */
+struct mthca_alloc_ucontext_resp {
+	__u32 qp_tab_size;
+	__u32 uarc_size;
+};
+
+/*
+ * Two __u32 values: 8 bytes, no implicit padding.
+ * NOTE(review): presumably the reply to a protection domain allocation,
+ * with "reserved" padding the structure to 8 bytes -- confirm.
+ */
+struct mthca_alloc_pd_resp {
+	__u32 pdn;
+	__u32 reserved;
+};
+
+/*
+ * Two __u32 values: 8 bytes, no implicit padding.  MTHCA_MR_DMASYNC is
+ * defined next to mr_attrs.
+ * NOTE(review): presumably MTHCA_MR_DMASYNC is a flag bit for mr_attrs
+ * in a memory registration request -- confirm.
+ */
+struct mthca_reg_mr {
+/*
+ * Mark the memory region with a DMA attribute that causes
+ * in-flight DMA to be flushed when the region is written to:
+ */
+#define MTHCA_MR_DMASYNC	0x1
+	__u32 mr_attrs;
+	__u32 reserved;
+};
+
+/*
+ * 32 bytes: two __u32, two __u64, two __u32.  Every member is naturally
+ * aligned, so there is no implicit padding on 32-bit or 64-bit ABIs.
+ * NOTE(review): the *_db_page members are presumably user addresses
+ * carried in __u64 as the file-level comment requires, with *_db_index
+ * selecting a doorbell within the page -- confirm.
+ */
+struct mthca_create_cq {
+	__u32 lkey;
+	__u32 pdn;
+	__u64 arm_db_page;
+	__u64 set_db_page;
+	__u32 arm_db_index;
+	__u32 set_db_index;
+};
+
+/*
+ * Two __u32 values: 8 bytes, no implicit padding.
+ * NOTE(review): presumably the reply to a CQ creation, with "reserved"
+ * padding the structure to 8 bytes -- confirm.
+ */
+struct mthca_create_cq_resp {
+	__u32 cqn;
+	__u32 reserved;
+};
+
+/*
+ * Two __u32 values: 8 bytes, no implicit padding.
+ * NOTE(review): presumably the request payload for a CQ resize, with
+ * "reserved" padding the structure to 8 bytes -- confirm.
+ */
+struct mthca_resize_cq {
+	__u32 lkey;
+	__u32 reserved;
+};
+
+/*
+ * 16 bytes: two __u32 followed by one __u64, all naturally aligned, so
+ * there is no implicit padding.
+ * NOTE(review): db_page is presumably a user address carried in __u64
+ * and db_index a doorbell slot within it -- confirm.
+ */
+struct mthca_create_srq {
+	__u32 lkey;
+	__u32 db_index;
+	__u64 db_page;
+};
+
+/*
+ * Two __u32 values: 8 bytes, no implicit padding.
+ * NOTE(review): presumably the reply to an SRQ creation, with
+ * "reserved" padding the structure to 8 bytes -- confirm.
+ */
+struct mthca_create_srq_resp {
+	__u32 srqn;
+	__u32 reserved;
+};
+
+/*
+ * 32 bytes: two __u32, two __u64, two __u32.  Every member is naturally
+ * aligned, so there is no implicit padding on 32-bit or 64-bit ABIs.
+ * NOTE(review): sq_/rq_ presumably refer to the send and receive queue
+ * doorbells, laid out like the CQ doorbell fields in mthca_create_cq --
+ * confirm.
+ */
+struct mthca_create_qp {
+	__u32 lkey;
+	__u32 reserved;
+	__u64 sq_db_page;
+	__u64 rq_db_page;
+	__u32 sq_db_index;
+	__u32 rq_db_index;
+};
+
+#endif /* MTHCA_USER_H */
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_wqe.h b/sys/ofed/drivers/infiniband/hw/mthca/mthca_wqe.h
new file mode 100644
index 0000000..341a5ae
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_wqe.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2005 Cisco Systems. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MTHCA_WQE_H
+#define MTHCA_WQE_H
+
+#include <linux/types.h>
+
+/*
+ * Single-bit flag values, listed in ascending bit order.  All values
+ * are explicit, so the order of the enumerators carries no meaning.
+ */
+enum {
+	MTHCA_NEXT_SOLICIT	= 1 << 1,
+	MTHCA_NEXT_EVENT_GEN	= 1 << 2,
+	MTHCA_NEXT_CQ_UPDATE	= 1 << 3,
+	MTHCA_NEXT_IP_CSUM	= 1 << 4,
+	MTHCA_NEXT_TCP_UDP_CSUM = 1 << 5,
+	MTHCA_NEXT_FENCE	= 1 << 6,
+	MTHCA_NEXT_DBD		= 1 << 7,
+
+	MTHCA_MLX_SLR		= 1 << 16,
+	MTHCA_MLX_VL15		= 1 << 17
+};
+
+/*
+ * Miscellaneous WQE constants: the per-doorbell batch limits and the
+ * lkey value that mthca_set_data_seg_inval() stores in a data segment.
+ */
+enum {
+	MTHCA_ARBEL_MAX_WQES_PER_SEND_DB	= 255,
+	MTHCA_TAVOR_MAX_WQES_PER_RECV_DB	= 256,
+	MTHCA_INVAL_LKEY			= 0x100
+};
+
+/*
+ * Leading segment of a WQE: four big-endian 32-bit words (16 bytes).
+ * The bit ranges in the per-field comments describe the layout within
+ * each word.
+ */
+struct mthca_next_seg {
+	__be32 nda_op;		/* [31:6] next WQE [4:0] next opcode */
+	__be32 ee_nds;		/* [31:8] next EE  [7] DBD [6] F [5:0] next WQE size */
+	__be32 flags;		/* [3] CQ [2] Event [1] Solicit */
+	__be32 imm;		/* immediate data */
+};
+
+/*
+ * 48 bytes given the usual widths of the fixed-size types.  The address
+ * vector is referenced through lkey/av_addr instead of being embedded.
+ * NOTE(review): presumably the UD address segment of a Tavor-mode send
+ * WQE -- confirm against the send path.
+ */
+struct mthca_tavor_ud_seg {
+	u32    reserved1;
+	__be32 lkey;
+	__be64 av_addr;
+	u32    reserved2[4];
+	__be32 dqpn;
+	__be32 qkey;
+	u32    reserved3[2];
+};
+
+/*
+ * 48 bytes given the usual widths of the fixed-size types: eight words
+ * of av[], then dqpn, qkey and two reserved words.
+ * NOTE(review): presumably the UD address segment of an Arbel-mode send
+ * WQE with the address vector embedded inline -- confirm.
+ */
+struct mthca_arbel_ud_seg {
+	__be32 av[8];
+	__be32 dqpn;
+	__be32 qkey;
+	u32    reserved[2];
+};
+
+/*
+ * 32 bytes given the usual widths of the fixed-size types.
+ * NOTE(review): presumably the segment used for memory window bind
+ * operations -- confirm against the send path.
+ */
+struct mthca_bind_seg {
+	__be32 flags;		/* [31] Atomic [30] rem write [29] rem read */
+	u32    reserved;
+	__be32 new_rkey;
+	__be32 lkey;
+	__be64 addr;
+	__be64 length;
+};
+
+/*
+ * 16 bytes: a 64-bit big-endian address, a 32-bit big-endian rkey and
+ * one reserved word.
+ * NOTE(review): presumably the remote address segment for RDMA and
+ * atomic operations -- confirm.
+ */
+struct mthca_raddr_seg {
+	__be64 raddr;
+	__be32 rkey;
+	u32    reserved;
+};
+
+/*
+ * 16 bytes: two 64-bit big-endian operands.
+ * NOTE(review): presumably swap_add holds the swap value for
+ * compare-and-swap or the addend for fetch-and-add -- confirm.
+ */
+struct mthca_atomic_seg {
+	__be64 swap_add;
+	__be64 compare;
+};
+
+/*
+ * One scatter/gather entry of a WQE: byte count, lkey and a 64-bit
+ * address, all stored big-endian (16 bytes).  Filled in by
+ * mthca_set_data_seg() and mthca_set_data_seg_inval().
+ */
+struct mthca_data_seg {
+	__be32 byte_count;
+	__be32 lkey;
+	__be64 addr;
+};
+
+/*
+ * 16 bytes: three big-endian 32-bit words followed by two big-endian
+ * 16-bit fields.
+ * NOTE(review): presumably the leading segment of MLX transport WQEs;
+ * MTHCA_MLX_VL15 and MTHCA_MLX_SLR match bits 17 and 16 of "flags" --
+ * confirm against the send path.
+ */
+struct mthca_mlx_seg {
+	__be32 nda_op;
+	__be32 nds;
+	__be32 flags;		/* [17] VL15 [16] SLR [14:12] static rate
+				   [11:8] SL [3] C [2] E */
+	__be16 rlid;
+	__be16 vcrc;
+};
+
+/*
+ * Fill a WQE data segment from a verbs scatter/gather element,
+ * converting length, lkey and address to big-endian.
+ */
+static __always_inline void mthca_set_data_seg(struct mthca_data_seg *dseg,
+					       struct ib_sge *sg)
+{
+	dseg->byte_count = cpu_to_be32(sg->length);
+	dseg->lkey       = cpu_to_be32(sg->lkey);
+	dseg->addr       = cpu_to_be64(sg->addr);
+}
+
+/*
+ * Write a data segment with zero length, zero address and
+ * MTHCA_INVAL_LKEY as its lkey.  The SRQ post-receive paths use it to
+ * terminate a scatter list shorter than the queue's maximum.
+ */
+static __always_inline void mthca_set_data_seg_inval(struct mthca_data_seg *dseg)
+{
+	dseg->byte_count = 0;
+	dseg->lkey       = cpu_to_be32(MTHCA_INVAL_LKEY);
+	dseg->addr       = 0;
+}
+
+#endif /* MTHCA_WQE_H */
diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/Kconfig b/sys/ofed/drivers/infiniband/ulp/ipoib/Kconfig
new file mode 100644
index 0000000..9d9a9dc
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/ipoib/Kconfig
@@ -0,0 +1,50 @@
+config INFINIBAND_IPOIB
+	tristate "IP-over-InfiniBand"
+	depends on NETDEVICES && INET && (IPV6 || IPV6=n)
+	select INET_LRO
+	---help---
+	  Support for the IP-over-InfiniBand protocol (IPoIB). This
+	  transports IP packets over InfiniBand so you can use your IB
+	  device as a fancy NIC.
+
+	  See Documentation/infiniband/ipoib.txt for more information.
+
+config INFINIBAND_IPOIB_CM
+	bool "IP-over-InfiniBand Connected Mode support"
+	depends on INFINIBAND_IPOIB
+	default n
+	---help---
+	  This option enables support for IPoIB connected mode.  After
+	  enabling this option, you need to switch to connected mode
+	  through /sys/class/net/ibXXX/mode to actually create
+	  connections, and then increase the interface MTU with
+	  e.g. ifconfig ib0 mtu 65520.
+
+	  WARNING: Enabling connected mode will trigger some packet
+	  drops for multicast and UD mode traffic from this interface,
+	  unless you limit mtu for these destinations to 2044.
+
+config INFINIBAND_IPOIB_DEBUG
+	bool "IP-over-InfiniBand debugging" if EMBEDDED
+	depends on INFINIBAND_IPOIB
+	default y
+	---help---
+	  This option causes debugging code to be compiled into the
+	  IPoIB driver.  The output can be turned on via the
+	  debug_level and mcast_debug_level module parameters (which
+	  can also be set after the driver is loaded through sysfs).
+
+	  This option also creates a directory tree under ipoib/ in
+	  debugfs, which contains files that expose debugging
+	  information about IB multicast groups used by the IPoIB
+	  driver.
+
+config INFINIBAND_IPOIB_DEBUG_DATA
+	bool "IP-over-InfiniBand data path debugging"
+	depends on INFINIBAND_IPOIB_DEBUG
+	---help---
+	  This option compiles debugging code into the data path
+	  of the IPoIB driver.  The output can be turned on via the
+	  data_debug_level module parameter; however, even with output
+	  turned off, this debugging code will have some performance
+	  impact.
diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/Makefile b/sys/ofed/drivers/infiniband/ulp/ipoib/Makefile
new file mode 100644
index 0000000..3090100
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/ipoib/Makefile
@@ -0,0 +1,11 @@
+# Kbuild fragment for the IPoIB module: ib_ipoib.o is built when
+# CONFIG_INFINIBAND_IPOIB is set, from the objects listed below.
+# ipoib_cm.o and ipoib_fs.o are added only when
+# CONFIG_INFINIBAND_IPOIB_CM and CONFIG_INFINIBAND_IPOIB_DEBUG,
+# respectively, are enabled.
+obj-$(CONFIG_INFINIBAND_IPOIB)			+= ib_ipoib.o
+
+ib_ipoib-y					:= ipoib_main.o \
+						   ipoib_ib.o \
+						   ipoib_multicast.o \
+						   ipoib_verbs.o \
+						   ipoib_vlan.o \
+						   ipoib_ethtool.o
+ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_CM)		+= ipoib_cm.o
+ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_DEBUG)	+= ipoib_fs.o
+
diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h
new file mode 100644
index 0000000..1d6ae84
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -0,0 +1,757 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IPOIB_H
+#define _IPOIB_H
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_ofed.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/random.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/sockio.h>
+#include <sys/sysctl.h>
+
+#include <net/if.h>
+#include <net/if_arp.h>
+#include <net/netisr.h>
+#include <net/route.h>
+#include <net/if_llc.h>
+#include <net/if_dl.h>
+#include <net/if_types.h>
+#include <net/bpf.h>
+#include <net/if_llatbl.h>
+#include <net/vnet.h>
+
+#if defined(INET) || defined(INET6)
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/if_ether.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_fw.h>
+#include <netinet/ipfw/ip_fw_private.h>
+#endif
+#ifdef INET6
+#include <netinet6/nd6.h>
+#endif
+
+#include <security/mac/mac_framework.h>
+
+#include <linux/list.h>
+
+#include <linux/workqueue.h>
+#include <linux/kref.h>
+#include <linux/mutex.h>
+
+#include <asm/atomic.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_sa.h>
+
+/* constants */
+
+#define	INFINIBAND_ALEN		20	/* Octets in IPoIB HW addr */
+
+/*
+ * Translate the IPOIB_CM / IPOIB_DEBUG preprocessor options into the
+ * CONFIG_INFINIBAND_IPOIB_* names tested elsewhere in this header.
+ * IPOIB_DEBUG turns on both the general and the data-path debug symbols.
+ * NOTE(review): IPOIB_CM and IPOIB_DEBUG presumably come from the
+ * opt_*.h kernel option headers included above -- confirm.
+ */
+#ifdef IPOIB_CM
+#define	CONFIG_INFINIBAND_IPOIB_CM
+#endif
+
+#ifdef IPOIB_DEBUG
+#define	CONFIG_INFINIBAND_IPOIB_DEBUG
+#define CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
+#endif
+
+/*
+ * Flush severity, in increasing order.
+ * NOTE(review): presumably selects between the flush_light,
+ * flush_normal and flush_heavy work items -- confirm against the users.
+ */
+enum ipoib_flush_level {
+	IPOIB_FLUSH_LIGHT  = 0,
+	IPOIB_FLUSH_NORMAL = 1,
+	IPOIB_FLUSH_HEAVY  = 2
+};
+
+/*
+ * Driver-wide constants.
+ *
+ * IPOIB_HEADER_LEN is the 4-byte encapsulation header plus the
+ * INFINIBAND_ALEN-byte hardware address.  The *_SG values size the
+ * per-buffer mapping arrays and the ib_sge arrays declared in the
+ * structures of this header: RX counts are the maximum MTU divided by
+ * MJUMPAGESIZE, TX counts are the maximum MTU divided by MCLBYTES plus
+ * two.
+ *
+ * NOTE(review): IPOIB_FLAG_*, IPOIB_PKEY_*, IPOIB_MCAST_RUN and
+ * IPOIB_STOP_REAPER look like bit numbers for ipoib_dev_priv.flags, and
+ * IPOIB_MCAST_FLAG_* like bit numbers for ipoib_mcast.flags -- confirm
+ * against the users.
+ * NOTE(review): IPOIB_UD_RX_SG evaluates to 0 if MJUMPAGESIZE is larger
+ * than IPOIB_UD_MAX_MTU (4096), which would make ipoib_rx_buf.mapping a
+ * zero-length array -- confirm this cannot happen on supported
+ * platforms.
+ */
+enum {
+	IPOIB_ENCAP_LEN		  = 4,
+	IPOIB_HEADER_LEN	  = IPOIB_ENCAP_LEN + INFINIBAND_ALEN,
+	IPOIB_UD_MAX_MTU	  = 4 * 1024,
+	IPOIB_UD_RX_SG		  = (IPOIB_UD_MAX_MTU / MJUMPAGESIZE),
+	IPOIB_UD_TX_SG		  = (IPOIB_UD_MAX_MTU / MCLBYTES) + 2,
+	IPOIB_CM_MAX_MTU	  = (64 * 1024),
+	IPOIB_CM_TX_SG		  = (IPOIB_CM_MAX_MTU / MCLBYTES) + 2,
+	IPOIB_CM_RX_SG		  = (IPOIB_CM_MAX_MTU / MJUMPAGESIZE),
+	IPOIB_RX_RING_SIZE	  = 256,
+	IPOIB_TX_RING_SIZE	  = 128,
+	IPOIB_MAX_RX_SG		  = MAX(IPOIB_CM_RX_SG, IPOIB_UD_RX_SG),
+	IPOIB_MAX_TX_SG		  = MAX(IPOIB_CM_TX_SG, IPOIB_UD_TX_SG),
+	IPOIB_MAX_QUEUE_SIZE	  = 8192,
+	IPOIB_MIN_QUEUE_SIZE	  = 2,
+	IPOIB_CM_MAX_CONN_QP	  = 4096,
+
+	IPOIB_NUM_WC		  = 4,
+
+	IPOIB_MAX_PATH_REC_QUEUE  = 3,
+	IPOIB_MAX_MCAST_QUEUE	  = 3,
+
+	IPOIB_FLAG_OPER_UP	  = 0,
+	IPOIB_FLAG_INITIALIZED	  = 1,
+	IPOIB_FLAG_ADMIN_UP	  = 2,
+	IPOIB_PKEY_ASSIGNED	  = 3,
+	IPOIB_PKEY_STOP		  = 4,
+	IPOIB_FLAG_SUBINTERFACE	  = 5,
+	IPOIB_MCAST_RUN		  = 6,
+	IPOIB_STOP_REAPER	  = 7,
+	IPOIB_FLAG_UMCAST	  = 10,
+	IPOIB_FLAG_CSUM		  = 11,
+
+	IPOIB_MAX_BACKOFF_SECONDS = 16,
+
+	IPOIB_MCAST_FLAG_FOUND	  = 0,	/* used in set_multicast_list */
+	IPOIB_MCAST_FLAG_SENDONLY = 1,
+	IPOIB_MCAST_FLAG_BUSY	  = 2,	/* joining or already joined */
+	IPOIB_MCAST_FLAG_ATTACHED = 3,
+
+	IPOIB_MAX_LRO_DESCRIPTORS = 8,
+	IPOIB_LRO_MAX_AGGR 	  = 64,
+
+	MAX_SEND_CQE		  = 16,
+	IPOIB_CM_COPYBREAK	  = 256,
+};
+
+/*
+ * High-order bit flags: bit 31 for IPOIB_OP_RECV and bit 30 for
+ * IPOIB_OP_CM.  IPOIB_OP_CM collapses to 0 when connected mode is not
+ * configured, so a bitwise test against it is always false in that case.
+ * NOTE(review): presumably OR-ed into work request IDs to tell
+ * completions apart -- confirm against the completion handlers.
+ */
+#define	IPOIB_OP_RECV   (1ul << 31)
+#ifdef CONFIG_INFINIBAND_IPOIB_CM
+#define	IPOIB_OP_CM     (1ul << 30)
+#else
+#define	IPOIB_OP_CM     (0)
+#endif
+
+/* structs */
+
+/*
+ * INFINIBAND_ALEN (20) bytes of hardware address followed by a
+ * big-endian 16-bit protocol field and a 16-bit reserved field: 24
+ * bytes, the same value as IPOIB_HEADER_LEN.
+ */
+struct ipoib_header {
+	u8  hwaddr[INFINIBAND_ALEN];
+	__be16	proto;
+	u16	reserved;
+};
+
+/*
+ * Just the INFINIBAND_ALEN-byte hardware address, i.e. the leading part
+ * of struct ipoib_header without the proto/reserved fields.
+ */
+struct ipoib_pseudoheader {
+	u8  hwaddr[INFINIBAND_ALEN];
+};
+
+/*
+ * Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast).
+ *
+ * Each entry carries both an rb_node and a list_head, plus a back
+ * pointer to the owning device.
+ * NOTE(review): pkt_queue presumably holds packets waiting for the join
+ * to complete, and "flags" presumably uses the IPOIB_MCAST_FLAG_* bit
+ * numbers -- confirm against the multicast code.
+ */
+struct ipoib_mcast {
+	struct ib_sa_mcmember_rec mcmember;
+	struct ib_sa_multicast	 *mc;
+	struct ipoib_ah		 *ah;
+
+	struct rb_node    rb_node;
+	struct list_head  list;
+
+	unsigned long created;
+	unsigned long backoff;
+
+	unsigned long flags;
+	unsigned char logcount;
+
+	struct ifqueue pkt_queue;
+
+	struct ipoib_dev_priv *priv;
+};
+
+/*
+ * One connected-mode receive slot: an mbuf and IPOIB_CM_RX_SG 64-bit
+ * mapping values.
+ * NOTE(review): "mapping" presumably holds the DMA address of each
+ * segment of the mbuf chain -- confirm.
+ */
+struct ipoib_cm_rx_buf {
+	struct mbuf *mb;
+	u64		mapping[IPOIB_CM_RX_SG];
+};
+
+/*
+ * One connected-mode transmit slot: an mbuf and IPOIB_CM_TX_SG 64-bit
+ * mapping values.
+ * NOTE(review): "mapping" presumably holds the DMA address of each
+ * segment of the mbuf chain -- confirm.
+ */
+struct ipoib_cm_tx_buf {
+	struct mbuf *mb;
+	u64		mapping[IPOIB_CM_TX_SG];
+};
+
+/*
+ * One UD receive slot: an mbuf and IPOIB_UD_RX_SG 64-bit mapping
+ * values.
+ * NOTE(review): "mapping" presumably holds the DMA address of each
+ * segment of the mbuf chain -- confirm.
+ */
+struct ipoib_rx_buf {
+	struct mbuf *mb;
+	u64		mapping[IPOIB_UD_RX_SG];
+};
+
+/*
+ * One UD transmit slot: an mbuf and IPOIB_UD_TX_SG 64-bit mapping
+ * values.
+ * NOTE(review): "mapping" presumably holds the DMA address of each
+ * segment of the mbuf chain -- confirm.
+ */
+struct ipoib_tx_buf {
+	struct mbuf *mb;
+	u64		mapping[IPOIB_UD_TX_SG];
+};
+
+struct ib_cm_id;
+
+/*
+ * Two big-endian 32-bit words.
+ * NOTE(review): presumably the private data exchanged during connected
+ * mode connection setup -- confirm against the CM code.
+ */
+struct ipoib_cm_data {
+	__be32 qpn; /* High byte MUST be ignored on receive */
+	__be32 mtu;
+};
+
+/*
+ * Quoting 10.3.1 Queue Pair and EE Context States:
+ *
+ * Note, for QPs that are associated with an SRQ, the Consumer should take the
+ * QP through the Error State before invoking a Destroy QP or a Modify QP to the
+ * Reset State.  The Consumer may invoke the Destroy QP without first performing
+ * a Modify QP to the Error State and waiting for the Affiliated Asynchronous
+ * Last WQE Reached Event. However, if the Consumer does not wait for the
+ * Affiliated Asynchronous Last WQE Reached Event, then WQE and Data Segment
+ * leakage may occur. Therefore, it is good programming practice to tear down a
+ * QP that is associated with an SRQ by using the following process:
+ *
+ * - Put the QP in the Error State
+ * - Wait for the Affiliated Asynchronous Last WQE Reached Event;
+ * - either:
+ *       drain the CQ by invoking the Poll CQ verb and either wait for CQ
+ *       to be empty or the number of Poll CQ operations has exceeded
+ *       CQ capacity size;
+ * - or
+ *       post another WR that completes on the same CQ and wait for this
+ *       WR to return as a WC;
+ * - and then invoke a Destroy QP or Reset QP.
+ *
+ * We use the second option and wait for a completion on the
+ * same CQ before destroying QPs attached to our SRQ.
+ */
+
+/*
+ * Receive-side connection state, stored in ipoib_cm_rx.state.  The
+ * list annotations in struct ipoib_cm_dev_priv name the state of the
+ * entries on each list.
+ */
+enum ipoib_cm_state {
+	IPOIB_CM_RX_LIVE  = 0,
+	IPOIB_CM_RX_ERROR = 1,	/* Ignored by stale task */
+	IPOIB_CM_RX_FLUSH = 2	/* Last WQE Reached event observed */
+};
+
+/*
+ * Receive side of one connected-mode connection: its CM id, QP, receive
+ * ring, current ipoib_cm_state and a list linkage.
+ * NOTE(review): "jiffies" is presumably a last-activity timestamp used
+ * by the stale task, and recv_count the number of posted receives --
+ * confirm against the CM code.
+ */
+struct ipoib_cm_rx {
+	struct ib_cm_id	       *id;
+	struct ib_qp	       *qp;
+	struct ipoib_cm_rx_buf *rx_ring;
+	struct list_head	list;
+	struct ipoib_dev_priv	*priv;
+	unsigned long		jiffies;
+	enum ipoib_cm_state	state;
+	int			recv_count;
+};
+
+/*
+ * Transmit side of one connected-mode connection: its CM id, QP, the
+ * path it belongs to and a transmit ring with head/tail indices.
+ * NOTE(review): tx_head/tx_tail are presumably free-running producer
+ * and consumer counters into tx_ring -- confirm against the CM code.
+ */
+struct ipoib_cm_tx {
+	struct ib_cm_id	    *id;
+	struct ib_qp	    *qp;
+	struct list_head     list;
+	struct ipoib_dev_priv *priv;
+	struct ipoib_path   *path;
+	struct ipoib_cm_tx_buf *tx_ring;
+	unsigned	     tx_head;
+	unsigned	     tx_tail;
+	unsigned long	     flags;
+	u32		     mtu;	/* remote specified mtu, with grh. */
+};
+
+/*
+ * Per-device connected-mode state, embedded in struct ipoib_dev_priv
+ * when CONFIG_INFINIBAND_IPOIB_CM is defined.
+ *
+ * The five rx lists are annotated with the ipoib_cm_state of the
+ * entries they hold.  rx_sge is sized by IPOIB_CM_RX_SG.
+ * NOTE(review): the work items presumably drive the start, reap and
+ * stale-connection processing their names suggest -- confirm against
+ * the CM code.
+ */
+struct ipoib_cm_dev_priv {
+	struct ib_srq	       *srq;
+	struct ipoib_cm_rx_buf *srq_ring;
+	struct ib_cm_id	       *id;
+	struct list_head	passive_ids;   /* state: LIVE */
+	struct list_head	rx_error_list; /* state: ERROR */
+	struct list_head	rx_flush_list; /* state: FLUSH, drain not started */
+	struct list_head	rx_drain_list; /* state: FLUSH, drain started */
+	struct list_head	rx_reap_list;  /* state: FLUSH, drain done */
+	struct work_struct      start_task;
+	struct work_struct      reap_task;
+	struct work_struct      mb_task;
+	struct work_struct      rx_reap_task;
+	struct delayed_work     stale_task;
+	struct ifqueue     	mb_queue;
+	struct list_head	start_list;
+	struct list_head	reap_list;
+	struct ib_sge		rx_sge[IPOIB_CM_RX_SG];
+	struct ib_recv_wr	rx_wr;
+	int			nonsrq_conn_qp;
+	int			max_cm_mtu;	/* Actual buf size. */
+	int			num_frags;
+};
+
+/*
+ * Two 16-bit values, embedded in struct ipoib_dev_priv as "ethtool".
+ * NOTE(review): presumably the interrupt coalescing settings (time in
+ * microseconds and frame count) -- confirm against the users.
+ */
+struct ipoib_ethtool_st {
+	u16     coalesce_usecs;
+	u16     max_coalesced_frames;
+};
+
+/*
+ * Device private locking: network stack tx_lock protects members used
+ * in TX fast path, lock protects everything else.  lock nests inside
+ * of tx_lock (ie tx_lock must be acquired first if needed).
+ *
+ * Per-interface state: the ifnet, the IB resources (device, port, PD,
+ * MR, CQs, QP), the UD receive and transmit rings, the path and
+ * multicast trees/lists, and the deferred work items.  The "cm" member
+ * exists only with CONFIG_INFINIBAND_IPOIB_CM, and the debug list and
+ * dentries only with CONFIG_INFINIBAND_IPOIB_DEBUG.
+ * NOTE(review): "parent", "child_intfs" and "list" presumably link
+ * sub-interfaces to their parent interface -- confirm.
+ */
+struct ipoib_dev_priv {
+	spinlock_t lock;
+
+	struct ifnet *dev;
+
+	u8 broadcastaddr[INFINIBAND_ALEN];
+
+	unsigned long flags;
+
+	struct mutex vlan_mutex;
+
+	struct rb_root  path_tree;
+	struct list_head path_list;
+
+	struct ipoib_mcast *broadcast;
+	struct list_head multicast_list;
+	struct rb_root multicast_tree;
+
+	struct delayed_work pkey_poll_task;
+	struct delayed_work mcast_task;
+	struct work_struct carrier_on_task;
+	struct work_struct flush_light;
+	struct work_struct flush_normal;
+	struct work_struct flush_heavy;
+	struct work_struct restart_task;
+	struct delayed_work ah_reap_task;
+
+	struct ib_device *ca;
+	u8		  port;
+	u16		  pkey;
+	u16		  pkey_index;
+	struct ib_pd	 *pd;
+	struct ib_mr	 *mr;
+	struct ib_cq	 *recv_cq;
+	struct ib_cq	 *send_cq;
+	struct ib_qp	 *qp;
+	u32		  qkey;
+
+	union ib_gid local_gid;
+	u16	     local_lid;
+
+	unsigned int admin_mtu;		/* User selected MTU, no GRH. */
+	unsigned int mcast_mtu;		/* Minus GRH bytes, from mcast group. */
+	unsigned int max_ib_mtu;	/* Without header, actual buf size. */
+
+	struct ipoib_rx_buf *rx_ring;
+
+	struct ipoib_tx_buf *tx_ring;
+	unsigned	     tx_head;
+	unsigned	     tx_tail;
+	struct ib_sge	     tx_sge[IPOIB_MAX_TX_SG];
+	struct ib_send_wr    tx_wr;
+	unsigned	     tx_outstanding;
+	struct ib_wc	     send_wc[MAX_SEND_CQE];
+
+	struct ib_recv_wr    rx_wr;
+	struct ib_sge	     rx_sge[IPOIB_MAX_RX_SG];
+
+	struct ib_wc ibwc[IPOIB_NUM_WC];
+
+	struct list_head dead_ahs;
+
+	struct ib_event_handler event_handler;
+
+	struct ifnet *parent;
+	struct list_head child_intfs;
+	struct list_head list;
+
+#ifdef CONFIG_INFINIBAND_IPOIB_CM
+	struct ipoib_cm_dev_priv cm;
+#endif
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+	struct list_head fs_list;
+	struct dentry *mcg_dentry;
+	struct dentry *path_dentry;
+#endif
+	int	hca_caps;
+	struct ipoib_ethtool_st ethtool;
+	struct timer_list poll_timer;
+};
+
+/*
+ * Reference-counted wrapper around an ib_ah.  References are dropped
+ * with ipoib_put_ah(), which calls ipoib_free_ah() on the last put.
+ * NOTE(review): last_send is presumably compared with the TX ring
+ * position to decide when a dead AH can be reaped -- confirm.
+ */
+struct ipoib_ah {
+	struct ipoib_dev_priv *priv;
+	struct ib_ah	  *ah;
+	struct list_head   list;
+	struct kref	   ref;
+	unsigned	   last_send;
+};
+
+/*
+ * One path entry, linked into both an rb-tree and a list.  It holds the
+ * SA path record, the address handle for it, a packet queue and the
+ * state of an SA query (query_id, query, done).  The hwaddr and cm
+ * members exist only with CONFIG_INFINIBAND_IPOIB_CM.
+ * NOTE(review): "queue" presumably holds packets waiting for path
+ * resolution to finish -- confirm against the path code.
+ */
+struct ipoib_path {
+	struct ipoib_dev_priv *priv;
+	struct rb_node	      rb_node;
+	struct list_head      list;
+#ifdef CONFIG_INFINIBAND_IPOIB_CM
+	uint8_t		      hwaddr[INFINIBAND_ALEN];
+	struct ipoib_cm_tx   *cm;
+#endif
+	struct ipoib_ah      *ah;
+	struct ib_sa_path_rec pathrec;
+	struct ifqueue	      queue;
+
+	int		      query_id;
+	struct ib_sa_query   *query;
+	struct completion     done;
+
+	int  		      valid;
+};
+
+/*
+ * Convert an IB MTU into the interface MTU for UD and connected mode.
+ * UD Only transmits encap len but we want the two sizes to be
+ * symmetrical.
+ *
+ * The argument is parenthesized so that expressions with operators of
+ * lower precedence than '-' (e.g. a conditional or bitwise expression)
+ * expand correctly.
+ */
+#define IPOIB_UD_MTU(ib_mtu)		((ib_mtu) - IPOIB_ENCAP_LEN)
+#define	IPOIB_CM_MTU(ib_mtu)		((ib_mtu) - 0x10)
+
+/*
+ * True when byte 4 of the given address is 0xff.
+ * NOTE(review): presumably addr is an INFINIBAND_ALEN-byte hardware
+ * address whose GID starts at byte 4 -- confirm.
+ */
+#define	IPOIB_IS_MULTICAST(addr)	((addr)[4] == 0xff)
+
+extern struct workqueue_struct *ipoib_workqueue;
+
+/*
+ * Hand a packet to ipoib_mtap_proto() only when bpf_peers_present()
+ * reports a listener on the interface.  _ifp and _m are each expanded
+ * more than once, so the arguments must be free of side effects.
+ */
+#define IPOIB_MTAP_PROTO(_ifp, _m, _proto)			\
+do {								\
+	if (bpf_peers_present((_ifp)->if_bpf)) {		\
+		M_ASSERTVALID(_m);				\
+		ipoib_mtap_proto((_ifp), (_m), (_proto));	\
+	}							\
+} while (0)
+
+/* functions */
+void ipoib_mtap_proto(struct ifnet *ifp, struct mbuf *mb, uint16_t proto);
+void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr);
+void ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr);
+
+struct ipoib_ah *ipoib_create_ah(struct ipoib_dev_priv *,
+				 struct ib_pd *pd, struct ib_ah_attr *attr);
+void ipoib_free_ah(struct kref *kref);
+/*
+ * Drop one reference on an address handle; ipoib_free_ah() is the
+ * release callback handed to kref_put().
+ */
+static inline void ipoib_put_ah(struct ipoib_ah *ah)
+{
+	kref_put(&ah->ref, ipoib_free_ah);
+}
+
+int ipoib_open(struct ipoib_dev_priv *priv);
+int ipoib_add_pkey_attr(struct ipoib_dev_priv *priv);
+int ipoib_add_umcast_attr(struct ipoib_dev_priv *priv);
+
+void ipoib_demux(struct ifnet *ifp, struct mbuf *m, u_short proto);
+
+void ipoib_send(struct ipoib_dev_priv *priv, struct mbuf *mb,
+		struct ipoib_ah *address, u32 qpn);
+void ipoib_reap_ah(struct work_struct *work);
+
+void ipoib_mark_paths_invalid(struct ipoib_dev_priv *priv);
+void ipoib_flush_paths(struct ipoib_dev_priv *priv);
+struct ipoib_dev_priv *ipoib_intf_alloc(const char *format);
+
+int ipoib_ib_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca,
+    int port);
+void ipoib_ib_dev_flush_light(struct work_struct *work);
+void ipoib_ib_dev_flush_normal(struct work_struct *work);
+void ipoib_ib_dev_flush_heavy(struct work_struct *work);
+void ipoib_pkey_event(struct work_struct *work);
+void ipoib_ib_dev_cleanup(struct ipoib_dev_priv *priv);
+
+int ipoib_ib_dev_open(struct ipoib_dev_priv *priv);
+int ipoib_ib_dev_up(struct ipoib_dev_priv *priv);
+int ipoib_ib_dev_down(struct ipoib_dev_priv *priv, int flush);
+int ipoib_ib_dev_stop(struct ipoib_dev_priv *priv, int flush);
+
+int ipoib_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca, int port);
+void ipoib_dev_cleanup(struct ipoib_dev_priv *priv);
+
+void ipoib_mcast_join_task(struct work_struct *work);
+void ipoib_mcast_carrier_on_task(struct work_struct *work);
+void ipoib_mcast_send(struct ipoib_dev_priv *priv, void *mgid, struct mbuf *mb);
+
+void ipoib_mcast_restart_task(struct work_struct *work);
+void ipoib_mcast_restart(struct ipoib_dev_priv *);
+int ipoib_mcast_start_thread(struct ipoib_dev_priv *priv);
+int ipoib_mcast_stop_thread(struct ipoib_dev_priv *priv, int flush);
+
+void ipoib_mcast_dev_down(struct ipoib_dev_priv *priv);
+void ipoib_mcast_dev_flush(struct ipoib_dev_priv *priv);
+
+void ipoib_path_free(struct ipoib_dev_priv *priv, struct ipoib_path *path);
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct ipoib_dev_priv *priv);
+int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter);
+void ipoib_mcast_iter_read(struct ipoib_mcast_iter *iter,
+				  union ib_gid *gid,
+				  unsigned long *created,
+				  unsigned int *queuelen,
+				  unsigned int *complete,
+				  unsigned int *send_only);
+
+struct ipoib_path_iter *ipoib_path_iter_init(struct ipoib_dev_priv *priv);
+int ipoib_path_iter_next(struct ipoib_path_iter *iter);
+void ipoib_path_iter_read(struct ipoib_path_iter *iter,
+			  struct ipoib_path *path);
+#endif
+
+int ipoib_change_mtu(struct ipoib_dev_priv *priv, int new_mtu);
+
+int ipoib_mcast_attach(struct ipoib_dev_priv *priv, u16 mlid,
+		       union ib_gid *mgid, int set_qkey);
+
+int ipoib_init_qp(struct ipoib_dev_priv *priv);
+int ipoib_transport_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca);
+void ipoib_transport_dev_cleanup(struct ipoib_dev_priv *priv);
+
+void ipoib_event(struct ib_event_handler *handler,
+		 struct ib_event *record);
+
+void ipoib_pkey_poll(struct work_struct *work);
+int ipoib_pkey_dev_delay_open(struct ipoib_dev_priv *priv);
+void ipoib_drain_cq(struct ipoib_dev_priv *priv);
+
+int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req, int max);
+void ipoib_dma_unmap_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req);
+int ipoib_poll_tx(struct ipoib_dev_priv *priv);
+
+void ipoib_dma_unmap_rx(struct ipoib_dev_priv *priv, struct ipoib_rx_buf *rx_req);
+void ipoib_dma_mb(struct ipoib_dev_priv *priv, struct mbuf *mb, unsigned int length);
+struct mbuf *ipoib_alloc_map_mb(struct ipoib_dev_priv *priv, struct ipoib_rx_buf *rx_req, int size);
+
+
+void ipoib_set_ethtool_ops(struct ifnet *dev);
+int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca);
+
+#ifdef CONFIG_INFINIBAND_IPOIB_CM
+
+/* Flag bits tested in byte 0 of a link-layer address. */
+#define IPOIB_FLAGS_RC		0x80
+#define IPOIB_FLAGS_UC		0x40
+
+/*
+ * We don't support UC connections at the moment, so only the RC flag is
+ * tested.  The argument is parenthesized so that pointer expressions
+ * (e.g. "base + off") are indexed as a whole.
+ */
+#define IPOIB_CM_SUPPORTED(ha)   ((ha)[0] & (IPOIB_FLAGS_RC))
+
+extern int ipoib_max_conn_qp;
+
+/* Non-zero when the interface's own link-layer address has the RC flag set. */
+static inline int ipoib_cm_admin_enabled(struct ipoib_dev_priv *priv)
+{
+	return IPOIB_CM_SUPPORTED(IF_LLADDR(priv->dev));
+}
+
+/* Non-zero when the given hardware address has the RC flag set; priv is unused. */
+static inline int ipoib_cm_enabled(struct ipoib_dev_priv *priv, uint8_t *hwaddr)
+{
+	return IPOIB_CM_SUPPORTED(hwaddr);
+}
+
+/*
+ * Test IPOIB_FLAG_OPER_UP on the path's CM transmit context.
+ * path->cm is dereferenced unconditionally, so it must be non-NULL.
+ */
+static inline int ipoib_cm_up(struct ipoib_path *path)
+
+{
+	return test_bit(IPOIB_FLAG_OPER_UP, &path->cm->flags);
+}
+
+/* Return the CM transmit context attached to the path (may be NULL). */
+static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_path *path)
+{
+	return path->cm;
+}
+
+/* Attach a CM transmit context to the path. */
+static inline void ipoib_cm_set(struct ipoib_path *path, struct ipoib_cm_tx *tx)
+{
+	path->cm = tx;
+}
+
+/* 1 when priv->cm.srq is set, 0 otherwise. */
+static inline int ipoib_cm_has_srq(struct ipoib_dev_priv *priv)
+{
+	return !!priv->cm.srq;
+}
+
+/* Return the stored connected-mode maximum MTU (priv->cm.max_cm_mtu). */
+static inline unsigned int ipoib_cm_max_mtu(struct ipoib_dev_priv *priv)
+{
+	return priv->cm.max_cm_mtu;
+}
+
+void ipoib_cm_send(struct ipoib_dev_priv *priv, struct mbuf *mb, struct ipoib_cm_tx *tx);
+int ipoib_cm_dev_open(struct ipoib_dev_priv *priv);
+void ipoib_cm_dev_stop(struct ipoib_dev_priv *priv);
+int ipoib_cm_dev_init(struct ipoib_dev_priv *priv);
+int ipoib_cm_add_mode_attr(struct ipoib_dev_priv *priv);
+void ipoib_cm_dev_cleanup(struct ipoib_dev_priv *priv);
+struct ipoib_cm_tx *ipoib_cm_create_tx(struct ipoib_dev_priv *priv,
+    struct ipoib_path *path);
+void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx);
+void ipoib_cm_mb_too_long(struct ipoib_dev_priv *priv, struct mbuf *mb,
+			   unsigned int mtu);
+void ipoib_cm_handle_rx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc);
+void ipoib_cm_handle_tx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc);
+#else
+
+/*
+ * Connected mode compiled out (CONFIG_INFINIBAND_IPOIB_CM undefined):
+ * every CM entry point becomes an inline stub so callers need no #ifdefs.
+ * Predicates and getters report "not available" (0 / NULL); the remaining
+ * stubs do nothing except where noted below.
+ */
+struct ipoib_cm_tx;
+
+#define ipoib_max_conn_qp 0
+
+static inline int ipoib_cm_admin_enabled(struct ipoib_dev_priv *priv)
+{
+	return 0;
+}
+static inline int ipoib_cm_enabled(struct ipoib_dev_priv *priv, uint8_t *hwaddr)
+
+{
+	return 0;
+}
+
+static inline int ipoib_cm_up(struct ipoib_path *path)
+
+{
+	return 0;
+}
+
+static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_path *path)
+{
+	return NULL;
+}
+
+static inline void ipoib_cm_set(struct ipoib_path *path, struct ipoib_cm_tx *tx)
+{
+}
+
+static inline int ipoib_cm_has_srq(struct ipoib_dev_priv *priv)
+{
+	return 0;
+}
+
+static inline unsigned int ipoib_cm_max_mtu(struct ipoib_dev_priv *priv)
+{
+	return 0;
+}
+
+/* Note: this stub neither sends nor frees the mbuf. */
+static inline
+void ipoib_cm_send(struct ipoib_dev_priv *priv, struct mbuf *mb, struct ipoib_cm_tx *tx)
+{
+	return;
+}
+
+static inline
+int ipoib_cm_dev_open(struct ipoib_dev_priv *priv)
+{
+	return 0;
+}
+
+static inline
+void ipoib_cm_dev_stop(struct ipoib_dev_priv *priv)
+{
+	return;
+}
+
+/* Unlike the other stubs, initialisation reports failure (-ENOSYS). */
+static inline
+int ipoib_cm_dev_init(struct ipoib_dev_priv *priv)
+{
+	return -ENOSYS;
+}
+
+static inline
+void ipoib_cm_dev_cleanup(struct ipoib_dev_priv *priv)
+{
+	return;
+}
+
+static inline
+struct ipoib_cm_tx *ipoib_cm_create_tx(struct ipoib_dev_priv *priv, struct ipoib_path *path)
+{
+	return NULL;
+}
+
+static inline
+void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx)
+{
+	return;
+}
+
+static inline
+int ipoib_cm_add_mode_attr(struct ipoib_dev_priv *priv)
+{
+	return 0;
+}
+
+/* The oversized packet is simply dropped: the mbuf chain is freed. */
+static inline void ipoib_cm_mb_too_long(struct ipoib_dev_priv *priv, struct mbuf *mb,
+					 unsigned int mtu)
+{
+	m_freem(mb);
+}
+
+static inline void ipoib_cm_handle_rx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc)
+{
+}
+
+static inline void ipoib_cm_handle_tx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc)
+{
+}
+#endif
+
+/*
+ * Debug-file hooks.  With CONFIG_INFINIBAND_IPOIB_DEBUG they are ordinary
+ * functions defined elsewhere; otherwise they are empty inline stubs and
+ * ipoib_register_debugfs() reports success (0).
+ */
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+void ipoib_create_debug_files(struct ipoib_dev_priv *priv);
+void ipoib_delete_debug_files(struct ipoib_dev_priv *priv);
+int ipoib_register_debugfs(void);
+void ipoib_unregister_debugfs(void);
+#else
+static inline void ipoib_create_debug_files(struct ipoib_dev_priv *priv) { }
+static inline void ipoib_delete_debug_files(struct ipoib_dev_priv *priv) { }
+static inline int ipoib_register_debugfs(void) { return 0; }
+static inline void ipoib_unregister_debugfs(void) { }
+#endif
+
+/*
+ * printk() wrapper that prefixes the message with the interface name taken
+ * from priv->dev.  "level" and "format" must be string literals because they
+ * are concatenated at compile time.  priv is parenthesized before the cast
+ * so that expression arguments (e.g. "base + 1") are cast as a whole.
+ */
+#define ipoib_printk(level, priv, format, arg...)	\
+	printk(level "%s: " format, if_name(((struct ipoib_dev_priv *) (priv))->dev), ## arg)
+/* Warning-level shorthand for ipoib_printk(). */
+#define ipoib_warn(priv, format, arg...)		\
+	ipoib_printk(KERN_WARNING, priv, format , ## arg)
+
+extern int ipoib_sendq_size;
+extern int ipoib_recvq_size;
+
+extern struct ib_sa_client ipoib_sa_client;
+
+/*
+ * Debug tracing macros.  When the corresponding CONFIG option is defined
+ * each macro prints through ipoib_printk() at KERN_DEBUG if its level
+ * variable is > 0; otherwise it expands to a statement that only evaluates
+ * priv (the format and arguments are discarded).
+ *
+ * Only ipoib_debug_level is declared here.  ipoib_dbg_mcast and
+ * ipoib_dbg_data reference mcast_debug_level and data_debug_level, which
+ * this part of the header does not declare, so the file using them must
+ * have those names in scope.
+ */
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+extern int ipoib_debug_level;
+
+#define ipoib_dbg(priv, format, arg...)			\
+	do {						\
+		if (ipoib_debug_level > 0)			\
+			ipoib_printk(KERN_DEBUG, priv, format , ## arg); \
+	} while (0)
+#define ipoib_dbg_mcast(priv, format, arg...)		\
+	do {						\
+		if (mcast_debug_level > 0)		\
+			ipoib_printk(KERN_DEBUG, priv, format , ## arg); \
+	} while (0)
+#else /* CONFIG_INFINIBAND_IPOIB_DEBUG */
+#define ipoib_dbg(priv, format, arg...)			\
+	do { (void) (priv); } while (0)
+#define ipoib_dbg_mcast(priv, format, arg...)		\
+	do { (void) (priv); } while (0)
+#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
+#define ipoib_dbg_data(priv, format, arg...)		\
+	do {						\
+		if (data_debug_level > 0)		\
+			ipoib_printk(KERN_DEBUG, priv, format , ## arg); \
+	} while (0)
+#else /* CONFIG_INFINIBAND_IPOIB_DEBUG_DATA */
+#define ipoib_dbg_data(priv, format, arg...)		\
+	do { (void) (priv); } while (0)
+#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG_DATA */
+
+/*
+ * Read the first four bytes at ha as a big-endian 32-bit value and keep the
+ * low 24 bits.  ha is parenthesized so that pointer expressions are cast as
+ * a whole.
+ */
+#define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) (ha)) & 0xffffff)
+
+#endif /* _IPOIB_H */
diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_cm.c
new file mode 100644
index 0000000..2d0fd61
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -0,0 +1,1445 @@
+/*
+ * Copyright (c) 2006 Mellanox Technologies. All rights reserved
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ipoib.h"
+
+#ifdef CONFIG_INFINIBAND_IPOIB_CM
+
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h>
+#include <netinet/icmp6.h>
+
+#include <rdma/ib_cm.h>
+#include <rdma/ib_cache.h>
+#include <linux/delay.h>
+
+/*
+ * Upper bound on priv->cm.nonsrq_conn_qp; enforced in
+ * ipoib_cm_nonsrq_init_rx(), which rejects new connections once reached.
+ */
+int ipoib_max_conn_qp = 128;
+
+module_param_named(max_nonsrq_conn_qp, ipoib_max_conn_qp, int, 0444);
+MODULE_PARM_DESC(max_nonsrq_conn_qp,
+		 "Max number of connected-mode QPs per interface "
+		 "(applied only if shared receive queue is not available)");
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
+/* Level variable consulted by ipoib_dbg_data() in this file. */
+static int data_debug_level;
+
+module_param_named(cm_data_debug_level, data_debug_level, int, 0644);
+MODULE_PARM_DESC(cm_data_debug_level,
+		 "Enable data path debug tracing for connected mode if > 0");
+#endif
+
+/* OR'ed with the UD QP number to form the service ID passed to ib_cm_listen(). */
+#define IPOIB_CM_IETF_ID 0x1000000000000000ULL
+
+/*
+ * Timing constants, in ticks (multiples of HZ):
+ *  - RX_UPDATE_TIME: minimum age of p->jiffies before the receive path
+ *    refreshes it and moves the connection to the head of passive_ids.
+ *  - RX_TIMEOUT: not used in this part of the file; presumably the age at
+ *    which a passive connection is considered stale -- TODO confirm.
+ *  - RX_DELAY: delay used when queueing priv->cm.stale_task.
+ *  - RX_UPDATE_MASK: the refresh check only runs for receive completions
+ *    whose wr_id has these low bits clear.
+ */
+#define IPOIB_CM_RX_UPDATE_TIME (256 * HZ)
+#define IPOIB_CM_RX_TIMEOUT     (2 * 256 * HZ)
+#define IPOIB_CM_RX_DELAY       (3 * 256 * HZ)
+#define IPOIB_CM_RX_UPDATE_MASK (0x3)
+
+/* Attribute block passed to ib_modify_qp(..., IB_QP_STATE) to move a QP to error. */
+static struct ib_qp_attr ipoib_cm_err_attr = {
+	.qp_state = IB_QPS_ERR
+};
+
+/* wr_id of the drain work request; recognised in ipoib_cm_handle_rx_wc(). */
+#define IPOIB_CM_RX_DRAIN_WRID 0xffffffff
+
+/* Send work request posted by ipoib_cm_start_rx_drain(); it carries no SGEs. */
+static struct ib_send_wr ipoib_cm_rx_drain_wr = {
+	.wr_id = IPOIB_CM_RX_DRAIN_WRID,
+	.opcode = IB_WR_SEND,
+};
+
+static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
+			       struct ib_cm_event *event);
+
+/*
+ * Release the DMA mappings of a connected-mode receive buffer by handing it
+ * to the generic ipoib_dma_unmap_rx() viewed as a struct ipoib_rx_buf.
+ */
+static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, struct ipoib_cm_rx_buf *rx_req)
+{
+	struct ipoib_rx_buf *generic_req = (struct ipoib_rx_buf *)rx_req;
+
+	ipoib_dma_unmap_rx(priv, generic_req);
+}
+
+/*
+ * Post slot "id" of the SRQ ring as a receive on the shared receive queue.
+ * One scatter/gather entry is filled per mbuf in the slot's chain.  If the
+ * post fails, the slot is unmapped, its mbuf chain freed and the slot
+ * cleared.  Returns the ib_post_srq_recv() result.
+ */
+static int ipoib_cm_post_receive_srq(struct ipoib_dev_priv *priv, int id)
+{
+	struct ipoib_rx_buf *slot = (struct ipoib_rx_buf *)&priv->cm.srq_ring[id];
+	struct ib_recv_wr *failed_wr;
+	struct mbuf *seg;
+	int nsge = 0;
+	int err;
+
+	seg = slot->mb;
+	while (seg != NULL) {
+		priv->cm.rx_sge[nsge].addr = slot->mapping[nsge];
+		priv->cm.rx_sge[nsge].length = seg->m_len;
+		seg = seg->m_next;
+		nsge++;
+	}
+
+	priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;
+	priv->cm.rx_wr.num_sge = nsge;
+
+	err = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, &failed_wr);
+	if (unlikely(err)) {
+		/* The buffer never reached the hardware: give it back. */
+		ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, err);
+		ipoib_dma_unmap_rx(priv, slot);
+		m_freem(priv->cm.srq_ring[id].mb);
+		priv->cm.srq_ring[id].mb = NULL;
+	}
+
+	return err;
+}
+
+/*
+ * Post slot "id" of a connection's private receive ring on its own QP,
+ * using the caller-supplied work request and scatter/gather array.  One
+ * entry is filled per mbuf in the slot's chain.  If the post fails, the
+ * slot is unmapped, its mbuf chain freed and the slot cleared.  Returns the
+ * ib_post_recv() result.
+ */
+static int ipoib_cm_post_receive_nonsrq(struct ipoib_dev_priv *priv,
+					struct ipoib_cm_rx *rx,
+					struct ib_recv_wr *wr,
+					struct ib_sge *sge, int id)
+{
+	struct ipoib_rx_buf *slot = (struct ipoib_rx_buf *)&rx->rx_ring[id];
+	struct ib_recv_wr *failed_wr;
+	struct mbuf *seg;
+	int nsge = 0;
+	int err;
+
+	seg = slot->mb;
+	while (seg != NULL) {
+		sge[nsge].addr = slot->mapping[nsge];
+		sge[nsge].length = seg->m_len;
+		seg = seg->m_next;
+		nsge++;
+	}
+
+	wr->wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;
+	wr->num_sge = nsge;
+
+	err = ib_post_recv(rx->qp, wr, &failed_wr);
+	if (unlikely(err)) {
+		/* The buffer never reached the hardware: give it back. */
+		ipoib_warn(priv, "post recv failed for buf %d (%d)\n", id, err);
+		ipoib_dma_unmap_rx(priv, slot);
+		m_freem(rx->rx_ring[id].mb);
+		rx->rx_ring[id].mb = NULL;
+	}
+
+	return err;
+}
+
+/*
+ * Allocate and DMA-map a receive mbuf for a connected-mode ring slot via
+ * ipoib_alloc_map_mb(), sized by priv->cm.max_cm_mtu.  Returns whatever
+ * ipoib_alloc_map_mb() returns (callers treat NULL as failure).
+ */
+static struct mbuf *
+ipoib_cm_alloc_rx_mb(struct ipoib_dev_priv *priv, struct ipoib_cm_rx_buf *rx_req)
+{
+	struct ipoib_rx_buf *generic_req = (struct ipoib_rx_buf *)rx_req;
+	struct mbuf *mb;
+
+	mb = ipoib_alloc_map_mb(priv, generic_req, priv->cm.max_cm_mtu);
+
+	return mb;
+}
+
+/*
+ * Tear down a receive ring of ipoib_recvq_size slots: every slot that still
+ * holds an mbuf is unmapped and its chain freed, then the ring array itself
+ * is released with kfree().
+ */
+static void ipoib_cm_free_rx_ring(struct ipoib_dev_priv *priv,
+				  struct ipoib_cm_rx_buf *rx_ring)
+{
+	struct ipoib_cm_rx_buf *slot;
+	int idx;
+
+	for (idx = 0; idx < ipoib_recvq_size; idx++) {
+		slot = &rx_ring[idx];
+		if (slot->mb == NULL)
+			continue;
+		ipoib_cm_dma_unmap_rx(priv, slot);
+		m_freem(slot->mb);
+	}
+
+	kfree(rx_ring);
+}
+
+/*
+ * Kick off draining of the connections on rx_flush_list by posting the
+ * drain work request on the first of them, then move the whole flush list
+ * to rx_drain_list.  Does nothing when there is nothing to flush or a drain
+ * is already outstanding.
+ */
+static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv)
+{
+	struct ib_send_wr *failed_wr;
+	struct ipoib_cm_rx *first;
+
+	/* Nothing waiting to be flushed. */
+	if (list_empty(&priv->cm.rx_flush_list))
+		return;
+
+	/* We only reserved 1 extra slot in CQ for drain WRs, so
+	 * make sure we have at most 1 outstanding WR. */
+	if (!list_empty(&priv->cm.rx_drain_list))
+		return;
+
+	/*
+	 * QPs on flush list are error state.  This way, a "flush
+	 * error" WC will be immediately generated for each WR we post.
+	 */
+	first = list_entry(priv->cm.rx_flush_list.next, typeof(*first), list);
+	if (ib_post_send(first->qp, &ipoib_cm_rx_drain_wr, &failed_wr))
+		ipoib_warn(priv, "failed to post drain wr\n");
+
+	list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_drain_list);
+}
+
+/*
+ * QP event callback for passive (receive) QPs.  Only
+ * IB_EVENT_QP_LAST_WQE_REACHED is acted on: under priv->lock the connection
+ * is moved to rx_flush_list, marked IPOIB_CM_RX_FLUSH, and a drain is
+ * started.  All other events are ignored.
+ */
+static void ipoib_cm_rx_event_handler(struct ib_event *event, void *ctx)
+{
+	struct ipoib_cm_rx *rx = ctx;
+	struct ipoib_dev_priv *priv = rx->priv;
+	unsigned long irqflags;
+
+	if (event->event == IB_EVENT_QP_LAST_WQE_REACHED) {
+		spin_lock_irqsave(&priv->lock, irqflags);
+		list_move(&rx->list, &priv->cm.rx_flush_list);
+		rx->state = IPOIB_CM_RX_FLUSH;
+		ipoib_cm_start_rx_drain(priv);
+		spin_unlock_irqrestore(&priv->lock, irqflags);
+	}
+}
+
+/*
+ * Create the RC QP for a passive connection.  Both CQ slots point at
+ * priv->recv_cq; the send side exists only for the single drain work
+ * request.  Receive capacity is requested only when no SRQ is in use.
+ * Returns the ib_create_qp() result (callers check it with IS_ERR()).
+ */
+static struct ib_qp *ipoib_cm_create_rx_qp(struct ipoib_dev_priv *priv,
+					   struct ipoib_cm_rx *p)
+{
+	struct ib_qp_init_attr attr;
+
+	memset(&attr, 0, sizeof(attr));
+
+	attr.qp_context = p;
+	attr.qp_type = IB_QPT_RC;
+	attr.sq_sig_type = IB_SIGNAL_ALL_WR;
+	attr.event_handler = ipoib_cm_rx_event_handler;
+	attr.recv_cq = priv->recv_cq;
+	attr.send_cq = priv->recv_cq;	/* For drain WR */
+	attr.srq = priv->cm.srq;
+	attr.cap.max_send_wr = 1;	/* For drain WR */
+	attr.cap.max_send_sge = 1;
+
+	if (!ipoib_cm_has_srq(priv)) {
+		attr.cap.max_recv_sge = priv->cm.num_frags;
+		attr.cap.max_recv_wr  = ipoib_recvq_size;
+	}
+
+	return ib_create_qp(priv->pd, &attr);
+}
+
+/*
+ * Walk a freshly created passive QP through INIT -> RTR -> RTS.  For each
+ * step the attributes are obtained from ib_cm_init_qp_attr() and applied
+ * with ib_modify_qp(); the receive PSN is overridden with "psn" for the RTR
+ * step.
+ *
+ * Returns 0 on success or the failing call's error for the INIT/RTR steps.
+ * Failures in the RTS step are only logged and 0 is still returned (see the
+ * comment below).
+ */
+static int ipoib_cm_modify_rx_qp(struct ipoib_dev_priv *priv,
+				 struct ib_cm_id *cm_id, struct ib_qp *qp,
+				 unsigned psn)
+{
+	struct ib_qp_attr qp_attr;
+	int qp_attr_mask, ret;
+
+	qp_attr.qp_state = IB_QPS_INIT;
+	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
+	if (ret) {
+		ipoib_warn(priv, "failed to init QP attr for INIT: %d\n", ret);
+		return ret;
+	}
+	ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+	if (ret) {
+		ipoib_warn(priv, "failed to modify QP to INIT: %d\n", ret);
+		return ret;
+	}
+	qp_attr.qp_state = IB_QPS_RTR;
+	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
+	if (ret) {
+		ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret);
+		return ret;
+	}
+	/* Use the PSN chosen by the caller rather than the CM-provided one. */
+	qp_attr.rq_psn = psn;
+	ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+	if (ret) {
+		ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
+		return ret;
+	}
+
+	/*
+	 * Current Mellanox HCA firmware won't generate completions
+	 * with error for drain WRs unless the QP has been moved to
+	 * RTS first. This work-around leaves a window where a QP has
+	 * moved to error asynchronously, but this will eventually get
+	 * fixed in firmware, so let's not error out if modify QP
+	 * fails.
+	 */
+	qp_attr.qp_state = IB_QPS_RTS;
+	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
+	if (ret) {
+		ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret);
+		return 0;
+	}
+	ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+	if (ret) {
+		ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret);
+		return 0;
+	}
+
+	return 0;
+}
+
+/*
+ * Prepare a receive work request and its scatter/gather array for reuse:
+ * all IPOIB_CM_RX_SG entries get the lkey of priv->mr, and the work request
+ * is set up as a single, unchained request pointing at that array with one
+ * SGE.
+ */
+static void ipoib_cm_init_rx_wr(struct ipoib_dev_priv *priv,
+				struct ib_recv_wr *wr,
+				struct ib_sge *sge)
+{
+	struct ib_sge *cur;
+
+	for (cur = sge; cur < sge + IPOIB_CM_RX_SG; cur++)
+		cur->lkey = priv->mr->lkey;
+
+	wr->num_sge = 1;
+	wr->sg_list = sge;
+	wr->next    = NULL;
+}
+
+/*
+ * Set up the private receive ring of a passive connection when no SRQ is in
+ * use: allocate the ring, account the QP against ipoib_max_conn_qp, then
+ * allocate and post ipoib_recvq_size receive buffers on rx->qp.
+ *
+ * Returns 0 on success.  On failure returns -ENOMEM (allocation failure),
+ * -EINVAL (QP limit reached; a REJ with IB_CM_REJ_NO_QP has been sent) or
+ * -EIO (posting a receive failed).  In every failure case after the ring
+ * was allocated, the ring and its buffers are freed, rx->rx_ring is reset
+ * to NULL and the QP count is left as it was on entry.
+ */
+static int ipoib_cm_nonsrq_init_rx(struct ipoib_dev_priv *priv,
+    struct ib_cm_id *cm_id, struct ipoib_cm_rx *rx)
+{
+	/* Scratch work request + SGE array, heap-allocated for the duration. */
+	struct {
+		struct ib_recv_wr wr;
+		struct ib_sge sge[IPOIB_CM_RX_SG];
+	} *t;
+	int ret;
+	int i;
+
+	/* kzalloc() returns zeroed memory, so no further clearing is needed. */
+	rx->rx_ring = kzalloc(ipoib_recvq_size * sizeof *rx->rx_ring, GFP_KERNEL);
+	if (!rx->rx_ring) {
+		printk(KERN_WARNING "%s: failed to allocate CM non-SRQ ring (%d entries)\n",
+		       priv->ca->name, ipoib_recvq_size);
+		return -ENOMEM;
+	}
+
+	t = kmalloc(sizeof *t, GFP_KERNEL);
+	if (!t) {
+		ret = -ENOMEM;
+		goto err_free;
+	}
+
+	ipoib_cm_init_rx_wr(priv, &t->wr, t->sge);
+
+	/* Reserve one of the limited non-SRQ QP slots, or reject the peer. */
+	spin_lock_irq(&priv->lock);
+
+	if (priv->cm.nonsrq_conn_qp >= ipoib_max_conn_qp) {
+		spin_unlock_irq(&priv->lock);
+		ib_send_cm_rej(cm_id, IB_CM_REJ_NO_QP, NULL, 0, NULL, 0);
+		ret = -EINVAL;
+		goto err_free;
+	} else
+		++priv->cm.nonsrq_conn_qp;
+
+	spin_unlock_irq(&priv->lock);
+
+	for (i = 0; i < ipoib_recvq_size; ++i) {
+		if (!ipoib_cm_alloc_rx_mb(priv, &rx->rx_ring[i])) {
+			ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
+			ret = -ENOMEM;
+			goto err_count;
+		}
+		/* On failure the helper has already released slot i. */
+		ret = ipoib_cm_post_receive_nonsrq(priv, rx, &t->wr, t->sge, i);
+		if (ret) {
+			ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq "
+				   "failed for buf %d\n", i);
+			ret = -EIO;
+			goto err_count;
+		}
+	}
+
+	rx->recv_count = ipoib_recvq_size;
+
+	kfree(t);
+
+	return 0;
+
+err_count:
+	/* Give back the QP slot reserved above. */
+	spin_lock_irq(&priv->lock);
+	--priv->cm.nonsrq_conn_qp;
+	spin_unlock_irq(&priv->lock);
+
+err_free:
+	kfree(t);
+	ipoib_cm_free_rx_ring(priv, rx->rx_ring);
+	/* Do not leave a dangling pointer to the freed ring behind. */
+	rx->rx_ring = NULL;
+
+	return ret;
+}
+
+/*
+ * Answer a connection request with a REP.  The private data carries our UD
+ * QP number and maximum CM MTU in big-endian form; the RNR retry count is
+ * copied from the request.  Returns the ib_send_cm_rep() result.
+ */
+static int ipoib_cm_send_rep(struct ipoib_dev_priv *priv, struct ib_cm_id *cm_id,
+			     struct ib_qp *qp, struct ib_cm_req_event_param *req,
+			     unsigned psn)
+{
+	struct ipoib_cm_data payload = {
+		.qpn = cpu_to_be32(priv->qp->qp_num),
+		.mtu = cpu_to_be32(priv->cm.max_cm_mtu),
+	};
+	struct ib_cm_rep_param rep = {
+		.private_data = &payload,
+		.private_data_len = sizeof payload,
+		.flow_control = 0,
+		.rnr_retry_count = req->rnr_retry_count,
+		.srq = ipoib_cm_has_srq(priv),
+		.qp_num = qp->qp_num,
+		.starting_psn = psn,
+	};
+
+	return ib_send_cm_rep(cm_id, &rep);
+}
+
+/*
+ * Handle an incoming connection request (REQ) on the listening CM ID.
+ *
+ * Allocates the per-connection receive context, creates its RC QP, moves
+ * the QP through its state transitions, sets up a private receive ring when
+ * no SRQ is in use, puts the context on priv->cm.passive_ids and sends the
+ * REP.
+ *
+ * Returns 0 once the context has been set up -- even if sending the REP
+ * fails, in which case the QP is moved to the error state instead.  Returns
+ * a negative errno if setup fails; the QP (if created) and the context are
+ * freed in that case.  Note that cm_id->context has already been pointed at
+ * the context by then and is not restored on the error paths.
+ */
+static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
+{
+	struct ipoib_dev_priv *priv = cm_id->context;
+	struct ipoib_cm_rx *p;
+	unsigned psn;
+	int ret;
+
+	ipoib_dbg(priv, "REQ arrived\n");
+	p = kzalloc(sizeof *p, GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+	p->priv = priv;
+	p->id = cm_id;
+	/* From here on the CM ID's context is the connection, not the device. */
+	cm_id->context = p;
+	p->state = IPOIB_CM_RX_LIVE;
+	p->jiffies = jiffies;
+	INIT_LIST_HEAD(&p->list);
+
+	p->qp = ipoib_cm_create_rx_qp(priv, p);
+	if (IS_ERR(p->qp)) {
+		ret = PTR_ERR(p->qp);
+		goto err_qp;
+	}
+
+	/* Random 24-bit starting PSN, used for the QP and echoed in the REP. */
+	psn = random() & 0xffffff;
+	ret = ipoib_cm_modify_rx_qp(priv, cm_id, p->qp, psn);
+	if (ret)
+		goto err_modify;
+
+	if (!ipoib_cm_has_srq(priv)) {
+		ret = ipoib_cm_nonsrq_init_rx(priv, cm_id, p);
+		if (ret)
+			goto err_modify;
+	}
+
+	spin_lock_irq(&priv->lock);
+	queue_delayed_work(ipoib_workqueue,
+			   &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
+	/* Add this entry to passive ids list head, but do not re-add it
+	 * if IB_EVENT_QP_LAST_WQE_REACHED has moved it to flush list. */
+	p->jiffies = jiffies;
+	if (p->state == IPOIB_CM_RX_LIVE)
+		list_move(&p->list, &priv->cm.passive_ids);
+	spin_unlock_irq(&priv->lock);
+
+	ret = ipoib_cm_send_rep(priv, cm_id, p->qp, &event->param.req_rcvd, psn);
+	if (ret) {
+		/* The context is already on a list; error the QP instead of freeing here. */
+		ipoib_warn(priv, "failed to send REP: %d\n", ret);
+		if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE))
+			ipoib_warn(priv, "unable to move qp to error state\n");
+	}
+	return 0;
+
+err_modify:
+	ib_destroy_qp(p->qp);
+err_qp:
+	kfree(p);
+	return ret;
+}
+
+/*
+ * CM event callback for the passive side.
+ *
+ *  - REQ:  delegated to ipoib_cm_req_handler(), whose result is returned.
+ *  - DREQ: a DREP is sent, then the QP is moved to the error state.
+ *  - REJ:  the QP is moved to the error state.
+ *  - anything else is ignored.
+ *
+ * Every case other than REQ returns 0.
+ */
+static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id,
+			       struct ib_cm_event *event)
+{
+	struct ipoib_cm_rx *rx;
+	struct ipoib_dev_priv *priv;
+
+	if (event->event == IB_CM_REQ_RECEIVED)
+		return ipoib_cm_req_handler(cm_id, event);
+
+	if (event->event != IB_CM_DREQ_RECEIVED &&
+	    event->event != IB_CM_REJ_RECEIVED)
+		return 0;
+
+	rx = cm_id->context;
+
+	/* Only a disconnect request is acknowledged. */
+	if (event->event == IB_CM_DREQ_RECEIVED)
+		ib_send_cm_drep(cm_id, NULL, 0);
+
+	priv = rx->priv;
+	if (ib_modify_qp(rx->qp, &ipoib_cm_err_attr, IB_QP_STATE))
+		ipoib_warn(priv, "unable to move qp to error state\n");
+
+	return 0;
+}
+
+/*
+ * Process one receive completion of a connected-mode QP.
+ *
+ *  - wr_id outside the ring: either the drain work request completed (the
+ *    drained connections are moved to rx_reap_list, the next drain is
+ *    started and the reap task queued) or the completion is bogus and only
+ *    logged.
+ *  - Error completion: counted as an input error.  With an SRQ the buffer
+ *    is reposted; without one the connection's outstanding receive count
+ *    is decremented and the connection is queued for reaping when it
+ *    reaches zero.
+ *  - Successful completion: a replacement buffer is allocated for the ring
+ *    slot; if that fails the packet is dropped and the old buffer reused.
+ *    Otherwise the received mbuf is unmapped, trimmed to the received
+ *    length, accounted in the interface's *input* counters, stripped of the
+ *    encapsulation header and handed to ipoib_demux().  The slot is then
+ *    reposted.
+ */
+void ipoib_cm_handle_rx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc)
+{
+	struct ipoib_cm_rx_buf saverx;
+	struct ipoib_cm_rx_buf *rx_ring;
+	unsigned int wr_id = wc->wr_id & ~(IPOIB_OP_CM | IPOIB_OP_RECV);
+	struct ifnet *dev = priv->dev;
+	struct mbuf *mb, *newmb;
+	struct ipoib_cm_rx *p;
+	int has_srq;
+	u_short proto;
+
+	ipoib_dbg_data(priv, "cm recv completion: id %d, status: %d\n",
+		       wr_id, wc->status);
+
+	if (unlikely(wr_id >= ipoib_recvq_size)) {
+		if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~(IPOIB_OP_CM | IPOIB_OP_RECV))) {
+			/* Drain WR completed: everything being drained can be reaped. */
+			spin_lock(&priv->lock);
+			list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list);
+			ipoib_cm_start_rx_drain(priv);
+			if (priv->cm.id != NULL)
+				queue_work(ipoib_workqueue,
+				    &priv->cm.rx_reap_task);
+			spin_unlock(&priv->lock);
+		} else
+			ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
+				   wr_id, ipoib_recvq_size);
+		return;
+	}
+
+	p = wc->qp->qp_context;
+
+	has_srq = ipoib_cm_has_srq(priv);
+	rx_ring = has_srq ? priv->cm.srq_ring : p->rx_ring;
+
+	mb = rx_ring[wr_id].mb;
+
+	if (unlikely(wc->status != IB_WC_SUCCESS)) {
+		ipoib_dbg(priv, "cm recv error "
+			   "(status=%d, wrid=%d vend_err %x)\n",
+			   wc->status, wr_id, wc->vendor_err);
+		++dev->if_ierrors;
+		if (has_srq)
+			goto repost;
+		else {
+			if (!--p->recv_count) {
+				spin_lock(&priv->lock);
+				list_move(&p->list, &priv->cm.rx_reap_list);
+				queue_work(ipoib_workqueue, &priv->cm.rx_reap_task);
+				spin_unlock(&priv->lock);
+			}
+			return;
+		}
+	}
+
+	if (unlikely(!(wr_id & IPOIB_CM_RX_UPDATE_MASK))) {
+		if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) {
+			p->jiffies = jiffies;
+			/* Move this entry to list head, but do not re-add it
+			 * if it has been moved out of list. */
+			/*
+			 * NOTE(review): unlike the other list updates in this
+			 * function, this one is not done under priv->lock here
+			 * -- confirm the caller's locking.
+			 */
+			if (p->state == IPOIB_CM_RX_LIVE)
+				list_move(&p->list, &priv->cm.passive_ids);
+		}
+	}
+
+	/* Keep a copy of the slot so it can be restored if allocation fails. */
+	memcpy(&saverx, &rx_ring[wr_id], sizeof(saverx));
+	newmb = ipoib_cm_alloc_rx_mb(priv, &rx_ring[wr_id]);
+	if (unlikely(!newmb)) {
+		/*
+		 * If we can't allocate a new RX buffer, dump
+		 * this packet and reuse the old buffer.
+		 */
+		ipoib_dbg(priv, "failed to allocate receive buffer %d\n", wr_id);
+		++dev->if_ierrors;
+		memcpy(&rx_ring[wr_id], &saverx, sizeof(saverx));
+		goto repost;
+	}
+
+	ipoib_cm_dma_unmap_rx(priv, &saverx);
+
+	ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
+		       wc->byte_len, wc->slid);
+
+	ipoib_dma_mb(priv, mb, wc->byte_len);
+
+	/* This is the receive path: account the packet as input, not output. */
+	++dev->if_ipackets;
+	dev->if_ibytes += mb->m_pkthdr.len;
+
+	mb->m_pkthdr.rcvif = dev;
+	/* The protocol value is read in network byte order from the front. */
+	proto = *mtod(mb, uint16_t *);
+	m_adj(mb, IPOIB_ENCAP_LEN);
+
+	IPOIB_MTAP_PROTO(dev, mb, proto);
+	ipoib_demux(dev, mb, ntohs(proto));
+
+repost:
+	if (has_srq) {
+		if (unlikely(ipoib_cm_post_receive_srq(priv, wr_id)))
+			ipoib_warn(priv, "ipoib_cm_post_receive_srq failed "
+				   "for buf %d\n", wr_id);
+	} else {
+		if (unlikely(ipoib_cm_post_receive_nonsrq(priv, p,
+							  &priv->cm.rx_wr,
+							  priv->cm.rx_sge,
+							  wr_id))) {
+			--p->recv_count;
+			ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq failed "
+				   "for buf %d\n", wr_id);
+		}
+	}
+}
+
+/*
+ * Fill the device's shared transmit scatter/gather list from the mbuf chain
+ * of tx_req (one entry per mbuf) and post it as a single IB_WR_SEND on the
+ * connection's QP.  The wr_id is tagged with IPOIB_OP_CM.  Returns the
+ * ib_post_send() result.
+ */
+static inline int post_send(struct ipoib_dev_priv *priv,
+			    struct ipoib_cm_tx *tx,
+			    struct ipoib_cm_tx_buf *tx_req,
+			    unsigned int wr_id)
+{
+	struct ib_send_wr *failed_wr;
+	struct mbuf *seg = tx_req->mb;
+	u64 *dma_addr = tx_req->mapping;
+	int nsge = 0;
+
+	while (seg != NULL) {
+		priv->tx_sge[nsge].addr = dma_addr[nsge];
+		priv->tx_sge[nsge].length = seg->m_len;
+		seg = seg->m_next;
+		nsge++;
+	}
+
+	priv->tx_wr.opcode = IB_WR_SEND;
+	priv->tx_wr.wr_id = wr_id | IPOIB_OP_CM;
+	priv->tx_wr.num_sge = nsge;
+
+	return ib_post_send(tx->qp, &priv->tx_wr, &failed_wr);
+}
+
+/*
+ * Transmit one packet over a connected-mode connection.
+ *
+ * The pseudo-header is stripped from the front of the mbuf, the packet is
+ * checked against the connection MTU, DMA-mapped into the next slot of the
+ * connection's transmit ring and posted.  The mbuf is consumed on every
+ * path: it is either owned by the ring until completion, freed here on a
+ * mapping/post failure, or handed to ipoib_cm_mb_too_long() when oversized.
+ * Failures are reported only through if_oerrors and log messages.
+ */
+void ipoib_cm_send(struct ipoib_dev_priv *priv, struct mbuf *mb, struct ipoib_cm_tx *tx)
+{
+	struct ipoib_cm_tx_buf *tx_req;
+	struct ifnet *dev = priv->dev;
+
+	/* Reap send completions inline once too many sends are outstanding. */
+	if (unlikely(priv->tx_outstanding > MAX_SEND_CQE))
+		while (ipoib_poll_tx(priv)); /* nothing */
+
+	m_adj(mb, sizeof(struct ipoib_pseudoheader));
+	if (unlikely(mb->m_pkthdr.len > tx->mtu)) {
+		ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
+			   mb->m_pkthdr.len, tx->mtu);
+		++dev->if_oerrors;
+		ipoib_cm_mb_too_long(priv, mb, IPOIB_CM_MTU(tx->mtu));
+		return;
+	}
+
+	ipoib_dbg_data(priv, "sending packet: head 0x%x length %d connection 0x%x\n",
+		       tx->tx_head, mb->m_pkthdr.len, tx->qp->qp_num);
+
+
+	/*
+	 * We put the mb into the tx_ring _before_ we call post_send()
+	 * because it's entirely possible that the completion handler will
+	 * run before we execute anything after the post_send().  That
+	 * means we have to make sure everything is properly recorded and
+	 * our state is consistent before we call post_send().
+	 */
+	/* The ring index is tx_head masked by (ipoib_sendq_size - 1). */
+	tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)];
+	tx_req->mb = mb;
+	if (unlikely(ipoib_dma_map_tx(priv->ca, (struct ipoib_tx_buf *)tx_req,
+	    priv->cm.num_frags))) {
+		++dev->if_oerrors;
+		/* Free whatever chain the slot holds after the failed mapping. */
+		if (tx_req->mb)
+			m_freem(tx_req->mb);
+		return;
+	}
+
+	if (unlikely(post_send(priv, tx, tx_req, tx->tx_head & (ipoib_sendq_size - 1)))) {
+		ipoib_warn(priv, "post_send failed\n");
+		++dev->if_oerrors;
+		ipoib_dma_unmap_tx(priv->ca, (struct ipoib_tx_buf *)tx_req);
+		m_freem(mb);
+	} else {
+		++tx->tx_head;
+
+		/* Ring full: ask for a completion event and mark the interface busy. */
+		if (++priv->tx_outstanding == ipoib_sendq_size) {
+			ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n",
+				  tx->qp->qp_num);
+			if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP))
+				ipoib_warn(priv, "request notify on send CQ failed\n");
+			dev->if_drv_flags |= IFF_DRV_OACTIVE;
+		}
+	}
+
+}
+
+/*
+ * Process one send completion of a connected-mode QP.
+ *
+ * The completed ring slot is unmapped and its mbuf freed, the output
+ * counters are updated (regardless of completion status, see FIXME) and the
+ * interface's OACTIVE flag is cleared once the number of outstanding sends
+ * drops to half the send queue size.  For a failure other than a flush
+ * error, the connection is detached from its path, the path is removed from
+ * the path tree and list, and the connection is queued for reaping.
+ */
+void ipoib_cm_handle_tx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc)
+{
+	struct ipoib_cm_tx *tx = wc->qp->qp_context;
+	unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM;
+	struct ifnet *dev = priv->dev;
+	struct ipoib_cm_tx_buf *tx_req;
+
+	ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n",
+		       wr_id, wc->status);
+
+	if (unlikely(wr_id >= ipoib_sendq_size)) {
+		ipoib_warn(priv, "cm send completion event with wrid %d (> %d)\n",
+			   wr_id, ipoib_sendq_size);
+		return;
+	}
+
+	tx_req = &tx->tx_ring[wr_id];
+
+	ipoib_dma_unmap_tx(priv->ca, (struct ipoib_tx_buf *)tx_req);
+
+	/* FIXME: is this right? Shouldn't we only increment on success? */
+	++dev->if_opackets;
+	dev->if_obytes += tx_req->mb->m_pkthdr.len;
+
+	m_freem(tx_req->mb);
+
+	++tx->tx_tail;
+	/* Re-enable transmission once the backlog has dropped to half the ring. */
+	if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
+	    (dev->if_drv_flags & IFF_DRV_OACTIVE) != 0 &&
+	    test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
+		dev->if_drv_flags &= ~IFF_DRV_OACTIVE;
+
+	/* Flush errors are not treated as a connection failure here. */
+	if (wc->status != IB_WC_SUCCESS &&
+	    wc->status != IB_WC_WR_FLUSH_ERR) {
+		struct ipoib_path *path;
+
+		ipoib_dbg(priv, "failed cm send event "
+			   "(status=%d, wrid=%d vend_err %x)\n",
+			   wc->status, wr_id, wc->vendor_err);
+
+		path = tx->path;
+
+		if (path) {
+			path->cm = NULL;
+			rb_erase(&path->rb_node, &priv->path_tree);
+			list_del(&path->list);
+		}
+
+		/* Queue for reaping only once: the bit is cleared atomically. */
+		if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
+			list_move(&tx->list, &priv->cm.reap_list);
+			queue_work(ipoib_workqueue, &priv->cm.reap_task);
+		}
+
+		clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags);
+	}
+
+}
+
+/*
+ * Start listening for connected-mode connection requests.
+ *
+ * Does nothing (and returns 0) when the interface's link-layer address does
+ * not have the RC flag set.  Otherwise a CM ID is created and put into
+ * listen on the service ID built from IPOIB_CM_IETF_ID and the UD QP
+ * number.  On failure priv->cm.id is left NULL and the error is returned.
+ */
+int ipoib_cm_dev_open(struct ipoib_dev_priv *priv)
+{
+	int err;
+
+	if (!IPOIB_CM_SUPPORTED(IF_LLADDR(priv->dev)))
+		return 0;
+
+	priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, priv);
+	if (IS_ERR(priv->cm.id)) {
+		printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name);
+		err = PTR_ERR(priv->cm.id);
+		priv->cm.id = NULL;
+		return err;
+	}
+
+	err = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num),
+			   0, NULL);
+	if (!err)
+		return 0;
+
+	/* Listen failed: undo the CM ID creation. */
+	printk(KERN_WARNING "%s: failed to listen on ID 0x%llx\n", priv->ca->name,
+	       IPOIB_CM_IETF_ID | priv->qp->qp_num);
+	ib_destroy_cm_id(priv->cm.id);
+	priv->cm.id = NULL;
+
+	return err;
+}
+
+/*
+ * Destroy every passive connection currently on rx_reap_list.  The list is
+ * detached under priv->lock and then processed without it: for each entry
+ * the CM ID and QP are destroyed, and without an SRQ the private receive
+ * ring is freed and the non-SRQ QP count decremented (under the lock).
+ */
+static void ipoib_cm_free_rx_reap_list(struct ipoib_dev_priv *priv)
+{
+	struct ipoib_cm_rx *conn, *next;
+	LIST_HEAD(doomed);
+
+	/* Take private ownership of the whole reap list. */
+	spin_lock_irq(&priv->lock);
+	list_splice_init(&priv->cm.rx_reap_list, &doomed);
+	spin_unlock_irq(&priv->lock);
+
+	list_for_each_entry_safe(conn, next, &doomed, list) {
+		ib_destroy_cm_id(conn->id);
+		ib_destroy_qp(conn->qp);
+
+		if (!ipoib_cm_has_srq(priv)) {
+			ipoib_cm_free_rx_ring(priv, conn->rx_ring);
+
+			spin_lock_irq(&priv->lock);
+			--priv->cm.nonsrq_conn_qp;
+			spin_unlock_irq(&priv->lock);
+		}
+
+		kfree(conn);
+	}
+}
+
+/*
+ * Stop the passive side of connected mode.
+ *
+ * The listening CM ID is destroyed, every connection on passive_ids is
+ * moved to rx_error_list and its QP put into the error state, and the
+ * function then polls (sleeping 1 ms between polls) until the error, flush
+ * and drain lists are all empty.  If that takes longer than 5 seconds the
+ * remaining connections are moved to the reap list directly.  Finally the
+ * reap list is freed and the stale-connection task cancelled.
+ *
+ * Returns immediately if connected mode is not enabled on the interface or
+ * no listening CM ID exists.
+ */
+void ipoib_cm_dev_stop(struct ipoib_dev_priv *priv)
+{
+	struct ipoib_cm_rx *p;
+	unsigned long begin;
+	int ret;
+
+	if (!IPOIB_CM_SUPPORTED(IF_LLADDR(priv->dev)) || !priv->cm.id)
+		return;
+
+	ib_destroy_cm_id(priv->cm.id);
+	priv->cm.id = NULL;
+
+	cancel_work_sync(&priv->cm.rx_reap_task);
+
+	spin_lock_irq(&priv->lock);
+	while (!list_empty(&priv->cm.passive_ids)) {
+		p = list_entry(priv->cm.passive_ids.next, typeof(*p), list);
+		list_move(&p->list, &priv->cm.rx_error_list);
+		p->state = IPOIB_CM_RX_ERROR;
+		/* The lock is dropped around ib_modify_qp() and retaken afterwards. */
+		spin_unlock_irq(&priv->lock);
+		ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
+		if (ret)
+			ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
+		spin_lock_irq(&priv->lock);
+	}
+
+	/* Wait for all RX to be drained */
+	begin = jiffies;
+
+	while (!list_empty(&priv->cm.rx_error_list) ||
+	       !list_empty(&priv->cm.rx_flush_list) ||
+	       !list_empty(&priv->cm.rx_drain_list)) {
+		if (time_after(jiffies, begin + 5 * HZ)) {
+			ipoib_warn(priv, "RX drain timing out\n");
+
+			/*
+			 * assume the HW is wedged and just free up everything.
+			 */
+			list_splice_init(&priv->cm.rx_flush_list,
+					 &priv->cm.rx_reap_list);
+			list_splice_init(&priv->cm.rx_error_list,
+					 &priv->cm.rx_reap_list);
+			list_splice_init(&priv->cm.rx_drain_list,
+					 &priv->cm.rx_reap_list);
+			break;
+		}
+		/* Sleep and poll the CQ with the lock released. */
+		spin_unlock_irq(&priv->lock);
+		msleep(1);
+		ipoib_drain_cq(priv);
+		spin_lock_irq(&priv->lock);
+	}
+
+	spin_unlock_irq(&priv->lock);
+
+	ipoib_cm_free_rx_reap_list(priv);
+
+	cancel_delayed_work(&priv->cm.stale_task);
+}
+
+/*
+ * Handle the REP for a connection we initiated (active side).
+ *
+ * The peer's MTU is taken from the REP private data and rejected if it does
+ * not exceed IPOIB_ENCAP_LEN.  The QP is moved to RTR and then RTS, the
+ * connection is marked operational, packets queued on the path are pulled
+ * off under priv->lock and re-submitted through the interface's
+ * if_transmit, and finally an RTU is sent.
+ *
+ * Returns 0 on success, -EINVAL for an unusable MTU, or the error of the
+ * failing QP/CM call.
+ */
+static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
+{
+	struct ipoib_cm_tx *p = cm_id->context;
+	struct ipoib_dev_priv *priv = p->priv;
+	struct ipoib_cm_data *data = event->private_data;
+	struct ifqueue mbqueue;
+	struct ib_qp_attr qp_attr;
+	int qp_attr_mask, ret;
+	struct mbuf *mb;
+
+	ipoib_dbg(priv, "cm rep handler\n");
+	p->mtu = be32_to_cpu(data->mtu);
+
+	if (p->mtu <= IPOIB_ENCAP_LEN) {
+		ipoib_warn(priv, "Rejecting connection: mtu %d <= %d\n",
+			   p->mtu, IPOIB_ENCAP_LEN);
+		return -EINVAL;
+	}
+
+	qp_attr.qp_state = IB_QPS_RTR;
+	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
+	if (ret) {
+		ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret);
+		return ret;
+	}
+
+	qp_attr.rq_psn = 0 /* FIXME */;
+	ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask);
+	if (ret) {
+		ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
+		return ret;
+	}
+
+	qp_attr.qp_state = IB_QPS_RTS;
+	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
+	if (ret) {
+		ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret);
+		return ret;
+	}
+	ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask);
+	if (ret) {
+		ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret);
+		return ret;
+	}
+
+	/* Local, initially empty queue used to carry packets out of the lock. */
+	bzero(&mbqueue, sizeof(mbqueue));
+
+	spin_lock_irq(&priv->lock);
+	set_bit(IPOIB_FLAG_OPER_UP, &p->flags);
+	if (p->path)
+		for (;;) {
+			_IF_DEQUEUE(&p->path->queue, mb);
+			if (mb == NULL)
+				break;
+			_IF_ENQUEUE(&mbqueue, mb);
+		}
+	spin_unlock_irq(&priv->lock);
+
+	/* Re-submit the previously queued packets, now without the lock held. */
+	for (;;) {
+		struct ifnet *dev = p->priv->dev;
+		_IF_DEQUEUE(&mbqueue, mb);
+		if (mb == NULL)
+			break;
+		mb->m_pkthdr.rcvif = dev;
+		if (dev->if_transmit(dev, mb))
+			ipoib_warn(priv, "dev_queue_xmit failed "
+				   "to requeue packet\n");
+	}
+
+	ret = ib_send_cm_rtu(cm_id, NULL, 0);
+	if (ret) {
+		ipoib_warn(priv, "failed to send RTU: %d\n", ret);
+		return ret;
+	}
+	return 0;
+}
+/* Create the RC QP for TX connection tx; returns ib_create_qp()'s result, which the caller checks with IS_ERR(). */
+static struct ib_qp *ipoib_cm_create_tx_qp(struct ipoib_dev_priv *priv,
+    struct ipoib_cm_tx *tx)
+{
+	struct ib_qp_init_attr attr = {
+		.send_cq		= priv->send_cq,
+		.recv_cq		= priv->recv_cq,
+		.srq			= priv->cm.srq,
+		.cap.max_send_wr	= ipoib_sendq_size,
+		.cap.max_send_sge	= priv->cm.num_frags,
+		.sq_sig_type		= IB_SIGNAL_ALL_WR,
+		.qp_type		= IB_QPT_RC,
+		.qp_context		= tx	/* the QP context points back at the connection */
+	};
+
+	return ib_create_qp(priv->pd, &attr);
+}
+/* Build a CM REQ for qp over pathrec and send it on id; returns ib_send_cm_req()'s result. */
+static int ipoib_cm_send_req(struct ipoib_dev_priv *priv,
+			     struct ib_cm_id *id, struct ib_qp *qp,
+			     u32 qpn,
+			     struct ib_sa_path_rec *pathrec)
+{
+	struct ipoib_cm_data data = {};
+	struct ib_cm_req_param req = {};
+
+	ipoib_dbg(priv, "cm send req\n");
+	/* Private data carries priv->qp's number and priv->cm.max_cm_mtu, both converted to big-endian. */
+	data.qpn = cpu_to_be32(priv->qp->qp_num);
+	data.mtu = cpu_to_be32(priv->cm.max_cm_mtu);
+
+	req.primary_path		= pathrec;
+	req.alternate_path		= NULL;
+	req.service_id			= cpu_to_be64(IPOIB_CM_IETF_ID | qpn);	/* service id embeds the qpn argument */
+	req.qp_num			= qp->qp_num;
+	req.qp_type			= qp->qp_type;
+	req.private_data		= &data;
+	req.private_data_len		= sizeof data;
+	req.flow_control		= 0;
+
+	req.starting_psn		= 0; /* FIXME */
+
+	/*
+	 * Pick some arbitrary defaults here; we could make these
+	 * module parameters if anyone cared about setting them.
+	 */
+	req.responder_resources		= 4;
+	req.remote_cm_response_timeout	= 20;
+	req.local_cm_response_timeout	= 20;
+	req.retry_count			= 0; /* RFC draft warns against retries */
+	req.rnr_retry_count		= 0; /* RFC draft warns against retries */
+	req.max_cm_retries		= 15;
+	req.srq				= ipoib_cm_has_srq(priv);
+	return ib_send_cm_req(id, &req);
+}
+/* Move a new TX qp to INIT using the device's pkey index and port; cm_id is not referenced. Returns 0 or an error. */
+static int ipoib_cm_modify_tx_init(struct ipoib_dev_priv *priv,
+				  struct ib_cm_id *cm_id, struct ib_qp *qp)
+{
+	struct ib_qp_attr qp_attr;
+	int qp_attr_mask, ret;
+	ret = ib_find_pkey(priv->ca, priv->port, priv->pkey, &qp_attr.pkey_index);	/* fills in qp_attr.pkey_index */
+	if (ret) {
+		ipoib_warn(priv, "pkey 0x%x not found: %d\n", priv->pkey, ret);
+		return ret;
+	}
+	/* Only the four fields named in qp_attr_mask are set in qp_attr. */
+	qp_attr.qp_state = IB_QPS_INIT;
+	qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE;
+	qp_attr.port_num = priv->port;
+	qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT;
+
+	ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+	if (ret) {
+		ipoib_warn(priv, "failed to modify tx QP to INIT: %d\n", ret);
+		return ret;
+	}
+	return 0;
+}
+
+static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn,
+			    struct ib_sa_path_rec *pathrec)
+{
+	struct ipoib_dev_priv *priv = p->priv;
+	int ret;
+
+	p->tx_ring = kzalloc(ipoib_sendq_size * sizeof *p->tx_ring, GFP_KERNEL);
+	if (!p->tx_ring) {
+		ipoib_warn(priv, "failed to allocate tx ring\n");
+		ret = -ENOMEM;
+		goto err_tx;
+	}
+	memset(p->tx_ring, 0, ipoib_sendq_size * sizeof *p->tx_ring);
+
+	p->qp = ipoib_cm_create_tx_qp(p->priv, p);
+	if (IS_ERR(p->qp)) {
+		ret = PTR_ERR(p->qp);
+		ipoib_warn(priv, "failed to allocate tx qp: %d\n", ret);
+		goto err_qp;
+	}
+
+	p->id = ib_create_cm_id(priv->ca, ipoib_cm_tx_handler, p);
+	if (IS_ERR(p->id)) {
+		ret = PTR_ERR(p->id);
+		ipoib_warn(priv, "failed to create tx cm id: %d\n", ret);
+		goto err_id;
+	}
+
+	ret = ipoib_cm_modify_tx_init(p->priv, p->id,  p->qp);
+	if (ret) {
+		ipoib_warn(priv, "failed to modify tx qp to rtr: %d\n", ret);
+		goto err_modify;
+	}
+
+	ret = ipoib_cm_send_req(p->priv, p->id, p->qp, qpn, pathrec);
+	if (ret) {
+		ipoib_warn(priv, "failed to send cm req: %d\n", ret);
+		goto err_send_cm;
+	}
+
+	ipoib_dbg(priv, "Request connection 0x%x for gid %pI6 qpn 0x%x\n",
+		  p->qp->qp_num, pathrec->dgid.raw, qpn);
+
+	return 0;
+
+err_send_cm:
+err_modify:
+	ib_destroy_cm_id(p->id);
+err_id:
+	p->id = NULL;
+	ib_destroy_qp(p->qp);
+err_qp:
+	p->qp = NULL;
+	kfree(p->tx_ring);
+err_tx:
+	return ret;
+}
+/* Tear down TX connection p: free path and CM id, wait up to 5 * HZ for sends, then free buffers, QP, ring and p. */
+static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
+{
+	struct ipoib_dev_priv *priv = p->priv;
+	struct ifnet *dev = priv->dev;
+	struct ipoib_cm_tx_buf *tx_req;
+	unsigned long begin;
+
+	ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n",
+		  p->qp ? p->qp->qp_num : 0, p->tx_head, p->tx_tail);
+
+	if (p->path)
+		ipoib_path_free(priv, p->path);
+
+	if (p->id)
+		ib_destroy_cm_id(p->id);
+
+	if (p->tx_ring) {
+		/* Wait for all sends to complete */
+		begin = jiffies;
+		while ((int) p->tx_tail - (int) p->tx_head < 0) {	/* tail still behind head */
+			if (time_after(jiffies, begin + 5 * HZ)) {
+				ipoib_warn(priv, "timing out; %d sends not completed\n",
+					   p->tx_head - p->tx_tail);
+				goto timeout;
+			}
+
+			msleep(1);
+		}
+	}
+
+timeout:
+	/* Unmap and free every send still outstanding between tx_tail and tx_head. */
+	while ((int) p->tx_tail - (int) p->tx_head < 0) {
+		tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
+		ipoib_dma_unmap_tx(priv->ca, (struct ipoib_tx_buf *)tx_req);
+		m_freem(tx_req->mb);
+		++p->tx_tail;
+		if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&	/* dropped to half the send queue */
+		    (dev->if_drv_flags & IFF_DRV_OACTIVE) != 0 &&
+		    test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
+			dev->if_drv_flags &= ~IFF_DRV_OACTIVE;
+	}
+
+	if (p->qp)
+		ib_destroy_qp(p->qp);
+
+	kfree(p->tx_ring);
+	kfree(p);
+}
+/* CM event callback for TX connections (cm_id->context is the ipoib_cm_tx); always returns 0. */
+static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
+			       struct ib_cm_event *event)
+{
+	struct ipoib_cm_tx *tx = cm_id->context;
+	struct ipoib_dev_priv *priv = tx->priv;
+	struct ipoib_path *path;
+	unsigned long flags;
+	int ret;
+
+	switch (event->event) {
+	case IB_CM_DREQ_RECEIVED:
+		ipoib_dbg(priv, "DREQ received.\n");
+		ib_send_cm_drep(cm_id, NULL, 0);
+		break;
+	case IB_CM_REP_RECEIVED:
+		ipoib_dbg(priv, "REP received.\n");
+		ret = ipoib_cm_rep_handler(cm_id, event);
+		if (ret)	/* a failed REP is answered with a consumer-defined REJ */
+			ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED,
+				       NULL, 0, NULL, 0);
+		break;
+	case IB_CM_REQ_ERROR:
+	case IB_CM_REJ_RECEIVED:
+	case IB_CM_TIMEWAIT_EXIT:
+		ipoib_dbg(priv, "CM error %d.\n", event->event);
+		spin_lock_irqsave(&priv->lock, flags);
+		path = tx->path;
+		/* Unhook the path from this connection and from the device's path tree and list. */
+		if (path) {
+			path->cm = NULL;
+			tx->path = NULL;
+			rb_erase(&path->rb_node, &priv->path_tree);
+			list_del(&path->list);
+		}
+		/* Queue the connection for reaping unless IPOIB_FLAG_INITIALIZED was already cleared. */
+		if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
+			list_move(&tx->list, &priv->cm.reap_list);
+			queue_work(ipoib_workqueue, &priv->cm.reap_task);
+		}
+
+		spin_unlock_irqrestore(&priv->lock, flags);
+		if (path)	/* the path is freed only after the lock is released */
+			ipoib_path_free(tx->priv, path);
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+/* Allocate a TX connection for path, link both ways, and queue it on cm.start_list; NULL if allocation fails. */
+struct ipoib_cm_tx *ipoib_cm_create_tx(struct ipoib_dev_priv *priv,
+    struct ipoib_path *path)
+{
+	struct ipoib_cm_tx *tx;
+
+	tx = kzalloc(sizeof *tx, GFP_ATOMIC);
+	if (!tx)
+		return NULL;
+	/* NOTE(review): start_list is modified without taking priv->lock here; presumably the caller holds it - confirm. */
+	ipoib_dbg(priv, "Creating cm tx\n");
+	path->cm = tx;
+	tx->path = path;
+	tx->priv = priv;
+	list_add(&tx->list, &priv->cm.start_list);
+	set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags);
+	queue_work(ipoib_workqueue, &priv->cm.start_task);	/* actual setup happens in ipoib_cm_tx_start() */
+	return tx;
+}
+/* Hand tx to the reap task exactly once; tx->path is logged and cleared before tx becomes visible on reap_list. */
+void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx)
+{
+	struct ipoib_dev_priv *priv = tx->priv;
+	if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
+		ipoib_dbg(priv, "Reap connection for gid %pI6\n",
+			  tx->path->pathrec.dgid.raw);
+		spin_lock(&priv->lock);
+		tx->path = NULL;	/* must precede list_move(): the reaper may free tx->path and tx */
+		list_move(&tx->list, &priv->cm.reap_list);
+		spin_unlock(&priv->lock);
+		queue_work(ipoib_workqueue, &priv->cm.reap_task);
+	}
+}
+/* Work handler for cm.start_task: run ipoib_cm_tx_init() for every connection queued on cm.start_list. */
+static void ipoib_cm_tx_start(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
+						   cm.start_task);
+	struct ipoib_path *path;
+	struct ipoib_cm_tx *p;
+	unsigned long flags;
+	int ret;
+
+	struct ib_sa_path_rec pathrec;
+	u32 qpn;
+
+	ipoib_dbg(priv, "cm start task\n");
+	spin_lock_irqsave(&priv->lock, flags);
+
+	while (!list_empty(&priv->cm.start_list)) {
+		p = list_entry(priv->cm.start_list.next, typeof(*p), list);
+		list_del_init(&p->list);
+		path = p->path;
+		qpn = IPOIB_QPN(path->hwaddr);
+		memcpy(&pathrec, &p->path->pathrec, sizeof pathrec);
+		/* qpn and pathrec were copied above because the lock is dropped around the init call. */
+		spin_unlock_irqrestore(&priv->lock, flags);
+
+		ret = ipoib_cm_tx_init(p, qpn, &pathrec);
+
+		spin_lock_irqsave(&priv->lock, flags);
+		/* On failure, drop the path (if one is still attached) and free the connection. */
+		if (ret) {
+			path = p->path;	/* re-read: it may have changed while unlocked */
+			if (path) {
+				path->cm = NULL;
+				rb_erase(&path->rb_node, &priv->path_tree);
+				list_del(&path->list);
+				ipoib_path_free(priv, path);
+			}
+			list_del(&p->list);
+			kfree(p);
+		}
+	}
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+}
+/* Work handler for cm.reap_task: destroy every TX connection queued on cm.reap_list. */
+static void ipoib_cm_tx_reap(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
+						   cm.reap_task);
+	struct ipoib_cm_tx *p;
+	unsigned long flags;
+
+	spin_lock_irqsave(&priv->lock, flags);
+
+	while (!list_empty(&priv->cm.reap_list)) {
+		p = list_entry(priv->cm.reap_list.next, typeof(*p), list);
+		list_del(&p->list);
+		spin_unlock_irqrestore(&priv->lock, flags);	/* ipoib_cm_tx_destroy() sleeps (msleep), so unlock first */
+		ipoib_cm_tx_destroy(p);
+		spin_lock_irqsave(&priv->lock, flags);
+	}
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+}
+/* Work handler for cm.mb_task: answer each queued packet with ICMP need-frag / ICMPv6 too-big carrying mcast_mtu. */
+static void ipoib_cm_mb_reap(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
+						   cm.mb_task);
+	struct mbuf *mb;
+	unsigned long flags;
+	unsigned mtu = priv->mcast_mtu;
+	uint16_t proto;
+
+	spin_lock_irqsave(&priv->lock, flags);
+
+	for (;;) {
+		IF_DEQUEUE(&priv->cm.mb_queue, mb);
+		if (mb == NULL)
+			break;
+		spin_unlock_irqrestore(&priv->lock, flags);
+		/* The leading 16-bit protocol field is in network order, hence ntohs(); the header is then stripped. */
+		proto = ntohs(*mtod(mb, uint16_t *));
+		m_adj(mb, IPOIB_ENCAP_LEN);
+		if (proto == ETHERTYPE_IP)
+			icmp_error(mb, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0, mtu);
+#if defined(INET6)
+		else if (proto == ETHERTYPE_IPV6)
+			icmp6_error(mb, ICMP6_PACKET_TOO_BIG, 0, mtu);
+#endif
+		else
+			m_freem(mb);	/* any other protocol is simply dropped */
+
+		spin_lock_irqsave(&priv->lock, flags);
+	}
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+}
+/* Queue an oversized mbuf on cm.mb_queue and schedule cm.mb_task if the queue was empty; mtu is not used. */
+void
+ipoib_cm_mb_too_long(struct ipoib_dev_priv *priv, struct mbuf *mb, unsigned int mtu)
+{
+	int e = priv->cm.mb_queue.ifq_len; /* NOTE(review): sampled before IF_ENQUEUE with no lock taken here - confirm */
+
+	IF_ENQUEUE(&priv->cm.mb_queue, mb);
+	if (e == 0)
+		queue_work(ipoib_workqueue, &priv->cm.mb_task);
+}
+/* Work handler for cm.rx_reap_task: pass the owning device state to ipoib_cm_free_rx_reap_list(). */
+static void ipoib_cm_rx_reap(struct work_struct *work)
+{
+	struct ipoib_dev_priv *owner = container_of(work, struct ipoib_dev_priv, cm.rx_reap_task);
+	ipoib_cm_free_rx_reap_list(owner);
+}
+/* Delayed-work handler: move passive connections idle longer than IPOIB_CM_RX_TIMEOUT to the error state. */
+static void ipoib_cm_stale_task(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
+						   cm.stale_task.work);
+	struct ipoib_cm_rx *p;
+	int ret;
+
+	spin_lock_irq(&priv->lock);
+	while (!list_empty(&priv->cm.passive_ids)) {
+		/* List is sorted by LRU, start from tail,
+		 * stop when we see a recently used entry */
+		p = list_entry(priv->cm.passive_ids.prev, typeof(*p), list);
+		if (time_before_eq(jiffies, p->jiffies + IPOIB_CM_RX_TIMEOUT))
+			break;
+		list_move(&p->list, &priv->cm.rx_error_list);
+		p->state = IPOIB_CM_RX_ERROR;
+		spin_unlock_irq(&priv->lock);	/* ib_modify_qp() is called with the lock released */
+		ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
+		if (ret)
+			ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
+		spin_lock_irq(&priv->lock);
+	}
+	/* Re-arm the task for as long as passive connections remain. */
+	if (!list_empty(&priv->cm.passive_ids))
+		queue_delayed_work(ipoib_workqueue,
+				   &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
+	spin_unlock_irq(&priv->lock);
+}
+
+/* Try to create the CM SRQ and its ring; on any failure priv->cm.srq is left NULL and nothing is returned. */
+static void ipoib_cm_create_srq(struct ipoib_dev_priv *priv, int max_sge)
+{
+	struct ib_srq_init_attr srq_init_attr = {
+		.attr = {
+			.max_wr  = ipoib_recvq_size,
+			.max_sge = max_sge
+		}
+	};
+
+	/* Only errors other than -ENOSYS are logged when SRQ creation fails. */
+	priv->cm.srq = ib_create_srq(priv->pd, &srq_init_attr);
+	if (IS_ERR(priv->cm.srq)) {
+		if (PTR_ERR(priv->cm.srq) != -ENOSYS)
+			printk(KERN_WARNING "%s: failed to allocate SRQ, error %ld\n",
+			       priv->ca->name, PTR_ERR(priv->cm.srq));
+		priv->cm.srq = NULL;
+		return;
+	}
+
+	/* kzalloc() returns zeroed memory, so no further clearing of the ring is required. */
+	priv->cm.srq_ring = kzalloc(ipoib_recvq_size * sizeof *priv->cm.srq_ring, GFP_KERNEL);
+	if (!priv->cm.srq_ring) {
+		printk(KERN_WARNING "%s: failed to allocate CM SRQ ring (%d entries)\n",
+		       priv->ca->name, ipoib_recvq_size);
+		ib_destroy_srq(priv->cm.srq);
+		priv->cm.srq = NULL;
+		return;
+	}
+}
+
+int ipoib_cm_dev_init(struct ipoib_dev_priv *priv)
+{
+	struct ifnet *dev = priv->dev;
+	int i, ret;
+	struct ib_device_attr attr;
+
+	INIT_LIST_HEAD(&priv->cm.passive_ids);
+	INIT_LIST_HEAD(&priv->cm.reap_list);
+	INIT_LIST_HEAD(&priv->cm.start_list);
+	INIT_LIST_HEAD(&priv->cm.rx_error_list);
+	INIT_LIST_HEAD(&priv->cm.rx_flush_list);
+	INIT_LIST_HEAD(&priv->cm.rx_drain_list);
+	INIT_LIST_HEAD(&priv->cm.rx_reap_list);
+	INIT_WORK(&priv->cm.start_task, ipoib_cm_tx_start);
+	INIT_WORK(&priv->cm.reap_task, ipoib_cm_tx_reap);
+	INIT_WORK(&priv->cm.mb_task, ipoib_cm_mb_reap);
+	INIT_WORK(&priv->cm.rx_reap_task, ipoib_cm_rx_reap);
+	INIT_DELAYED_WORK(&priv->cm.stale_task, ipoib_cm_stale_task);
+
+	bzero(&priv->cm.mb_queue, sizeof(priv->cm.mb_queue));
+	mtx_init(&priv->cm.mb_queue.ifq_mtx,
+	    dev->if_xname, "if send queue", MTX_DEF);
+
+	ret = ib_query_device(priv->ca, &attr);
+	if (ret) {
+		printk(KERN_WARNING "ib_query_device() failed with %d\n", ret);
+		return ret;
+	}
+
+	ipoib_dbg(priv, "max_srq_sge=%d\n", attr.max_srq_sge);
+
+	attr.max_srq_sge = min_t(int, IPOIB_CM_RX_SG, attr.max_srq_sge);
+	ipoib_cm_create_srq(priv, attr.max_srq_sge);
+	if (ipoib_cm_has_srq(priv)) {
+		priv->cm.max_cm_mtu = attr.max_srq_sge * MJUMPAGESIZE;
+		priv->cm.num_frags  = attr.max_srq_sge;
+		ipoib_dbg(priv, "max_cm_mtu = 0x%x, num_frags=%d\n",
+			  priv->cm.max_cm_mtu, priv->cm.num_frags);
+	} else {
+		priv->cm.max_cm_mtu = IPOIB_CM_MAX_MTU;
+		priv->cm.num_frags  = IPOIB_CM_RX_SG;
+	}
+
+	ipoib_cm_init_rx_wr(priv, &priv->cm.rx_wr, priv->cm.rx_sge);
+
+	if (ipoib_cm_has_srq(priv)) {
+		for (i = 0; i < ipoib_recvq_size; ++i) {
+			if (!ipoib_cm_alloc_rx_mb(priv, &priv->cm.srq_ring[i])) {
+				ipoib_warn(priv, "failed to allocate "
+					   "receive buffer %d\n", i);
+				ipoib_cm_dev_cleanup(priv);
+				return -ENOMEM;
+			}
+
+			if (ipoib_cm_post_receive_srq(priv, i)) {
+				ipoib_warn(priv, "ipoib_cm_post_receive_srq "
+					   "failed for buf %d\n", i);
+				ipoib_cm_dev_cleanup(priv);
+				return -EIO;
+			}
+		}
+	}
+
+	IF_LLADDR(priv->dev)[0] = IPOIB_FLAGS_RC;
+	return 0;
+}
+/* Destroy the CM SRQ, free its receive ring and destroy the mb_queue mutex; does nothing when no SRQ exists. */
+void ipoib_cm_dev_cleanup(struct ipoib_dev_priv *priv)
+{
+	int ret;
+	/* NOTE(review): both early returns below skip mtx_destroy() on cm.mb_queue.ifq_mtx - confirm this is intended. */
+	if (!priv->cm.srq)
+		return;
+
+	ipoib_dbg(priv, "Cleanup ipoib connected mode.\n");
+
+	ret = ib_destroy_srq(priv->cm.srq);
+	if (ret)
+		ipoib_warn(priv, "ib_destroy_srq failed: %d\n", ret);
+
+	priv->cm.srq = NULL;	/* cleared even if ib_destroy_srq() reported an error */
+	if (!priv->cm.srq_ring)
+		return;
+
+	ipoib_cm_free_rx_ring(priv, priv->cm.srq_ring);
+	priv->cm.srq_ring = NULL;
+
+	mtx_destroy(&priv->cm.mb_queue.ifq_mtx);
+}
+
+#endif /* CONFIG_INFINIBAND_IPOIB_CM */
diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
new file mode 100644
index 0000000..ec52712
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/ethtool.h>
+#include <linux/netdevice.h>
+
+#include "ipoib.h"
+/* ethtool get_drvinfo: writes the driver name "ipoib"; no other drvinfo field is touched. */
+static void ipoib_get_drvinfo(struct ifnet *netdev,
+			      struct ethtool_drvinfo *drvinfo)
+{
+	strncpy(drvinfo->driver, "ipoib", sizeof(drvinfo->driver) - 1);	/* copy limit leaves the last byte untouched */
+}
+/* ethtool get_rx_csum: nonzero only when IPOIB_FLAG_CSUM is set and IPOIB_FLAG_ADMIN_CM is clear. */
+static u32 ipoib_get_rx_csum(struct ifnet *dev)
+{
+	struct ipoib_dev_priv *softc = dev->if_softc;
+	int csum_on = test_bit(IPOIB_FLAG_CSUM, &softc->flags);
+	return csum_on && !test_bit(IPOIB_FLAG_ADMIN_CM, &softc->flags);
+}
+/* ethtool get_coalesce: report the saved coalescing settings for both directions; always returns 0. */
+static int ipoib_get_coalesce(struct ifnet *dev,
+			      struct ethtool_coalesce *coal)
+{
+	struct ipoib_dev_priv *softc = dev->if_softc;
+	/* Frame limit: RX and TX both report the single saved value. */
+	coal->tx_max_coalesced_frames = softc->ethtool.max_coalesced_frames;
+	coal->rx_max_coalesced_frames = softc->ethtool.max_coalesced_frames;
+	/* Likewise for the coalescing delay. */
+	coal->tx_coalesce_usecs = softc->ethtool.coalesce_usecs;
+	coal->rx_coalesce_usecs = softc->ethtool.coalesce_usecs;
+	return 0;
+}
+/* ethtool set_coalesce: validate the RX values, apply them to recv_cq and save them. Returns 0 or an error. */
+static int ipoib_set_coalesce(struct ifnet *dev,
+			      struct ethtool_coalesce *coal)
+{
+	struct ipoib_dev_priv *priv = dev->if_softc;
+	int ret;
+
+	/*
+	 * Since IPoIB uses a single CQ for both rx and tx, we assume
+	 * that rx params dictate the configuration.  These values are
+	 * saved in the private data and returned when ipoib_get_coalesce()
+	 * is called.
+	 */
+	if (coal->rx_coalesce_usecs       > 0xffff ||
+	    coal->rx_max_coalesced_frames > 0xffff)
+		return -EINVAL;
+	/* If only one of the two limits is given, the missing one is set to its maximum (0xffff). */
+	if (coal->rx_max_coalesced_frames | coal->rx_coalesce_usecs) {
+		if (!coal->rx_max_coalesced_frames)
+			coal->rx_max_coalesced_frames = 0xffff;
+		else if (!coal->rx_coalesce_usecs)
+			coal->rx_coalesce_usecs = 0xffff;
+	}
+	/* -ENOSYS from ib_modify_cq() is tolerated; any other error aborts without saving. */
+	ret = ib_modify_cq(priv->recv_cq, coal->rx_max_coalesced_frames,
+			   coal->rx_coalesce_usecs);
+	if (ret && ret != -ENOSYS) {
+		ipoib_warn(priv, "failed modifying CQ (%d)\n", ret);
+		return ret;
+	}
+	/* Mirror the RX values into the caller's TX fields and remember them in priv->ethtool. */
+	coal->tx_coalesce_usecs       = coal->rx_coalesce_usecs;
+	coal->tx_max_coalesced_frames = coal->rx_max_coalesced_frames;
+	priv->ethtool.coalesce_usecs       = coal->rx_coalesce_usecs;
+	priv->ethtool.max_coalesced_frames = coal->rx_max_coalesced_frames;
+
+	return 0;
+}
+/* Statistic names, listed in the same order in which ipoib_get_ethtool_stats() fills its data array. */
+static const char ipoib_stats_keys[][ETH_GSTRING_LEN] = {
+	"LRO aggregated", "LRO flushed",
+	"LRO avg aggr", "LRO no desc"
+};
+/* ethtool get_strings: copy the whole ipoib_stats_keys table into data for ETH_SS_STATS. */
+static void ipoib_get_strings(struct ifnet *netdev, u32 stringset, u8 *data)
+{
+	/* Every other string set is silently ignored. */
+	if (stringset != ETH_SS_STATS)
+		return;
+
+	memcpy(data, ipoib_stats_keys[0], sizeof(ipoib_stats_keys));
+}
+/* ethtool get_sset_count: number of statistics for ETH_SS_STATS, -EOPNOTSUPP for any other set. */
+static int ipoib_get_sset_count(struct ifnet *dev, int sset)
+{
+	/* Only the statistics string set is implemented. */
+	if (sset != ETH_SS_STATS)
+		return -EOPNOTSUPP;
+
+	/* One counter per entry of ipoib_stats_keys. */
+	return ARRAY_SIZE(ipoib_stats_keys);
+}
+/* ethtool get_ethtool_stats: write the four LRO counters into data[0..3]; stats is not used. */
+static void ipoib_get_ethtool_stats(struct ifnet *dev,
+				struct ethtool_stats *stats, uint64_t *data)
+{
+	struct ipoib_dev_priv *softc = dev->if_softc;
+
+	/* Slots follow the order of the names in ipoib_stats_keys. */
+	data[0] = softc->lro.lro_mgr.stats.aggregated;
+	data[1] = softc->lro.lro_mgr.stats.flushed;
+
+	/* Average aggregation; reported as 0 while nothing has been flushed. */
+	data[2] = softc->lro.lro_mgr.stats.flushed ?
+	    softc->lro.lro_mgr.stats.aggregated /
+	    softc->lro.lro_mgr.stats.flushed : 0;
+
+	data[3] = softc->lro.lro_mgr.stats.no_desc;
+}
+/* ethtool operation table installed by ipoib_set_ethtool_ops(); the flag handlers are the generic ethtool_op_* ones. */
+static const struct ethtool_ops ipoib_ethtool_ops = {
+	.get_drvinfo		= ipoib_get_drvinfo,
+	.get_rx_csum		= ipoib_get_rx_csum,
+	.get_coalesce		= ipoib_get_coalesce,
+	.set_coalesce		= ipoib_set_coalesce,
+	.get_flags		= ethtool_op_get_flags,
+	.set_flags		= ethtool_op_set_flags,
+	.get_strings		= ipoib_get_strings,
+	.get_sset_count		= ipoib_get_sset_count,
+	.get_ethtool_stats	= ipoib_get_ethtool_stats,
+};
+/* Attach ipoib_ethtool_ops to dev through SET_ETHTOOL_OPS(). */
+void ipoib_set_ethtool_ops(struct ifnet *dev)
+{
+	SET_ETHTOOL_OPS(dev, &ipoib_ethtool_ops);
+}
diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_fs.c b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_fs.c
new file mode 100644
index 0000000..0f85f28
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_fs.c
@@ -0,0 +1,298 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/err.h>
+#include <linux/seq_file.h>
+
+struct file_operations;
+
+#include <linux/debugfs.h>
+
+#include "ipoib.h"
+
+static struct dentry *ipoib_root;
+/* Render gid into buf as eight ':'-separated hex words; buf must hold the longest form plus the NUL. */
+static void format_gid(union ib_gid *gid, char *buf)
+{
+	__be16 *words = (__be16 *) gid->raw;
+	int idx, off = 0;
+
+	for (idx = 0; idx < 8; ++idx) {
+		off += sprintf(buf + off, "%x", be16_to_cpu(words[idx]));
+		if (idx != 7)
+			buf[off++] = ':';
+	}
+}
+/* seq_file .start: create a multicast iterator and advance it *pos times; NULL if that is not possible. */
+static void *ipoib_mcg_seq_start(struct seq_file *file, loff_t *pos)
+{
+	struct ipoib_mcast_iter *cursor = ipoib_mcast_iter_init(file->private);
+	loff_t skip;
+
+	if (cursor == NULL)
+		return NULL;
+
+	/* Running off the end while skipping frees the iterator. */
+	for (skip = *pos; skip != 0; --skip) {
+		if (ipoib_mcast_iter_next(cursor) != 0) {
+			kfree(cursor);
+			return NULL;
+		}
+	}
+
+	return cursor;
+}
+/* seq_file .next: bump *pos and advance the iterator; at the end the iterator is freed and NULL is returned. */
+static void *ipoib_mcg_seq_next(struct seq_file *file, void *iter_ptr,
+				   loff_t *pos)
+{
+	struct ipoib_mcast_iter *iter = iter_ptr;
+
+	(*pos)++;
+
+	if (ipoib_mcast_iter_next(iter)) {	/* nonzero result is treated as end of iteration */
+		kfree(iter);
+		return NULL;
+	}
+
+	return iter;
+}
+/* seq_file .stop: no work; the iterator is freed only by .start/.next when they hit the end. */
+static void ipoib_mcg_seq_stop(struct seq_file *file, void *iter_ptr)
+{
+	/* nothing for now */
+}
+/* seq_file .show: print one multicast group (GID, created, queue length, flags); always returns 0. */
+static int ipoib_mcg_seq_show(struct seq_file *file, void *iter_ptr)
+{
+	struct ipoib_mcast_iter *iter = iter_ptr;
+	char gid_buf[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff"];	/* longest format_gid() output plus NUL */
+	union ib_gid mgid;
+	unsigned long created;
+	unsigned int queuelen, complete, send_only;
+
+	if (!iter)
+		return 0;
+	/* Read the iterator's current entry into the locals above. */
+	ipoib_mcast_iter_read(iter, &mgid, &created, &queuelen,
+			      &complete, &send_only);
+
+	format_gid(&mgid, gid_buf);
+
+	seq_printf(file,
+		   "GID: %s\n"
+		   "  created: %10ld\n"
+		   "  queuelen: %9d\n"
+		   "  complete: %9s\n"
+		   "  send_only: %8s\n"
+		   "\n",
+		   gid_buf, created, queuelen,
+		   complete ? "yes" : "no",
+		   send_only ? "yes" : "no");
+
+	return 0;
+}
+/* seq_file callbacks for the multicast-group debug file; passed to seq_open() in ipoib_mcg_open(). */
+static const struct seq_operations ipoib_mcg_seq_ops = {
+	.start = ipoib_mcg_seq_start,
+	.next  = ipoib_mcg_seq_next,
+	.stop  = ipoib_mcg_seq_stop,
+	.show  = ipoib_mcg_seq_show,
+};
+/* open() for the multicast debug file: start a seq_file and give it the inode's private pointer. */
+static int ipoib_mcg_open(struct inode *inode, struct file *file)
+{
+	int err = seq_open(file, &ipoib_mcg_seq_ops);
+
+	if (err == 0) {
+		/* After a successful seq_open(), file->private_data is used as the seq_file. */
+		struct seq_file *sf = file->private_data;
+
+		/* The iterators later read this pointer back through file->private. */
+		sf->private = inode->i_private;
+	}
+
+	return err;
+}
+/* File operations for the multicast debug file: custom open, generic seq_file read/llseek/release. */
+static const struct file_operations ipoib_mcg_fops = {
+	.owner   = THIS_MODULE,
+	.open    = ipoib_mcg_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release
+};
+/* seq_file .start: create a path iterator and advance it *pos times; NULL if that is not possible. */
+static void *ipoib_path_seq_start(struct seq_file *file, loff_t *pos)
+{
+	struct ipoib_path_iter *cursor = ipoib_path_iter_init(file->private);
+	loff_t skip;
+
+	if (cursor == NULL)
+		return NULL;
+
+	/* Running off the end while skipping frees the iterator. */
+	for (skip = *pos; skip != 0; --skip) {
+		if (ipoib_path_iter_next(cursor) != 0) {
+			kfree(cursor);
+			return NULL;
+		}
+	}
+
+	return cursor;
+}
+/* seq_file .next: bump *pos and advance the iterator; at the end the iterator is freed and NULL is returned. */
+static void *ipoib_path_seq_next(struct seq_file *file, void *iter_ptr,
+				   loff_t *pos)
+{
+	struct ipoib_path_iter *iter = iter_ptr;
+
+	(*pos)++;
+
+	if (ipoib_path_iter_next(iter)) {	/* nonzero result is treated as end of iteration */
+		kfree(iter);
+		return NULL;
+	}
+
+	return iter;
+}
+/* seq_file .stop: no work; the iterator is freed only by .start/.next when they hit the end. */
+static void ipoib_path_seq_stop(struct seq_file *file, void *iter_ptr)
+{
+	/* nothing for now */
+}
+/* seq_file .show: print one path record (GID, completeness and, if resolved, DLID/SL/rate); always returns 0. */
+static int ipoib_path_seq_show(struct seq_file *file, void *iter_ptr)
+{
+	struct ipoib_path_iter *iter = iter_ptr;
+	char gid_buf[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff"];	/* longest format_gid() output plus NUL */
+	struct ipoib_path path;
+	int rate;
+
+	if (!iter)
+		return 0;
+
+	ipoib_path_iter_read(iter, &path);	/* works on a local copy of the path */
+
+	format_gid(&path.pathrec.dgid, gid_buf);
+
+	seq_printf(file,
+		   "GID: %s\n"
+		   "  complete: %6s\n",
+		   gid_buf, path.pathrec.dlid ? "yes" : "no");
+	/* DLID, SL and rate are printed only when the path record has a nonzero DLID. */
+	if (path.pathrec.dlid) {
+		rate = ib_rate_to_mult(path.pathrec.rate) * 25;
+		/* rate is in tenths of Gb/sec; the field width shrinks by 2 to leave room for ".5". */
+		seq_printf(file,
+			   "  DLID:     0x%04x\n"
+			   "  SL: %12d\n"
+			   "  rate: %*d%s Gb/sec\n",
+			   be16_to_cpu(path.pathrec.dlid),
+			   path.pathrec.sl,
+			   10 - ((rate % 10) ? 2 : 0),
+			   rate / 10, rate % 10 ? ".5" : "");
+	}
+
+	seq_putc(file, '\n');
+
+	return 0;
+}
+/* seq_file callbacks for the path debug file; passed to seq_open() in ipoib_path_open(). */
+static const struct seq_operations ipoib_path_seq_ops = {
+	.start = ipoib_path_seq_start,
+	.next  = ipoib_path_seq_next,
+	.stop  = ipoib_path_seq_stop,
+	.show  = ipoib_path_seq_show,
+};
+/* open() for the path debug file: start a seq_file and give it the inode's private pointer. */
+static int ipoib_path_open(struct inode *inode, struct file *file)
+{
+	int err = seq_open(file, &ipoib_path_seq_ops);
+
+	if (err == 0) {
+		/* After a successful seq_open(), file->private_data is used as the seq_file. */
+		struct seq_file *sf = file->private_data;
+
+		/* The iterators later read this pointer back through file->private. */
+		sf->private = inode->i_private;
+	}
+
+	return err;
+}
+/* File operations for the path debug file: custom open, generic seq_file read/llseek/release. */
+static const struct file_operations ipoib_path_fops = {
+	.owner   = THIS_MODULE,
+	.open    = ipoib_path_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release
+};
+/* Create the "<ifname>_mcg" and "<ifname>_path" debugfs files under ipoib_root; failures are only logged. */
+void ipoib_create_debug_files(struct ifnet *dev)
+{
+	struct ipoib_dev_priv *priv = dev->if_softc;
+	char name[IFNAMSIZ + sizeof "_path"];	/* sized for the longer of the two suffixes */
+
+	snprintf(name, sizeof name, "%s_mcg", if_name(dev));
+	priv->mcg_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO,
+					       ipoib_root, dev, &ipoib_mcg_fops);	/* dev is the file's private data */
+	if (!priv->mcg_dentry)
+		ipoib_warn(priv, "failed to create mcg debug file\n");
+
+	snprintf(name, sizeof name, "%s_path", if_name(dev));
+	priv->path_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO,
+						ipoib_root, dev, &ipoib_path_fops);
+	if (!priv->path_dentry)
+		ipoib_warn(priv, "failed to create path debug file\n");
+}
+/* Remove whichever of the two per-interface debugfs files exist (non-NULL dentries). */
+void ipoib_delete_debug_files(struct ifnet *dev)
+{
+	struct ipoib_dev_priv *priv = dev->if_softc;
+
+	if (priv->mcg_dentry)
+		debugfs_remove(priv->mcg_dentry);
+	if (priv->path_dentry)
+		debugfs_remove(priv->path_dentry);	/* the stored dentry pointers are not cleared here */
+}
+/* Create the top-level "ipoib" debugfs directory; returns 0, or -ENOMEM when no directory was returned. */
+int ipoib_register_debugfs(void)
+{
+	ipoib_root = debugfs_create_dir("ipoib", NULL);
+	return ipoib_root ? 0 : -ENOMEM;
+}
+/* Remove the "ipoib" debugfs directory created by ipoib_register_debugfs(). */
+void ipoib_unregister_debugfs(void)
+{
+	debugfs_remove(ipoib_root);
+}
diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c
new file mode 100644
index 0000000..d3b68bc
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -0,0 +1,997 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004, 2005 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ipoib.h"
+
+#include <rdma/ib_cache.h>
+
+#include <security/mac/mac_framework.h>
+
+#include <linux/delay.h>
+#include <linux/dma-mapping.h>
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
+/* Data-path debug verbosity, exported as a module parameter (mode 0644). */
+static int data_debug_level;
+
+module_param(data_debug_level, int, 0644);
+MODULE_PARM_DESC(data_debug_level,
+		 "Enable data path debug tracing if > 0");
+#endif
+
+/*
+ * Serializes changes to IPOIB_PKEY_STOP with queueing/cancelling of
+ * pkey_poll_task, so a stop request cannot race with a re-arm of the poller.
+ */
+static DEFINE_MUTEX(pkey_mutex);
+
+/*
+ * Allocate an ipoib address-handle wrapper and create the underlying IB
+ * address handle on protection domain `pd` from `attr`.
+ *
+ * The wrapper starts with a freshly initialized reference count and
+ * last_send == 0.  Returns the wrapper, or NULL if either the allocation
+ * or ib_create_ah() failed (nothing is leaked in that case).
+ */
+struct ipoib_ah *ipoib_create_ah(struct ipoib_dev_priv *priv,
+				 struct ib_pd *pd, struct ib_ah_attr *attr)
+{
+	struct ipoib_ah *new_ah;
+
+	new_ah = kmalloc(sizeof(*new_ah), GFP_KERNEL);
+	if (new_ah == NULL)
+		return NULL;
+
+	kref_init(&new_ah->ref);
+	new_ah->last_send = 0;
+	new_ah->priv      = priv;
+
+	new_ah->ah = ib_create_ah(pd, attr);
+	if (IS_ERR(new_ah->ah)) {
+		kfree(new_ah);
+		return NULL;
+	}
+
+	ipoib_dbg(priv, "Created ah %p\n", new_ah->ah);
+	return new_ah;
+}
+
+/*
+ * kref release callback for an ipoib_ah.
+ *
+ * The handle is not destroyed here; it is appended to priv->dead_ahs
+ * (under priv->lock) and torn down later by __ipoib_reap_ah().
+ */
+void ipoib_free_ah(struct kref *kref)
+{
+	struct ipoib_ah *dead;
+	struct ipoib_dev_priv *priv;
+	unsigned long irqflags;
+
+	dead = container_of(kref, struct ipoib_ah, ref);
+	priv = dead->priv;
+
+	spin_lock_irqsave(&priv->lock, irqflags);
+	list_add_tail(&dead->list, &priv->dead_ahs);
+	spin_unlock_irqrestore(&priv->lock, irqflags);
+}
+
+/*
+ * Undo the DMA mappings of a receive buffer: mapping[k] is released for
+ * the k-th mbuf of rx_req->mb, using that mbuf's m_len as the length.
+ */
+void
+ipoib_dma_unmap_rx(struct ipoib_dev_priv *priv, struct ipoib_rx_buf *rx_req)
+{
+	struct mbuf *seg = rx_req->mb;
+	int idx = 0;
+
+	while (seg != NULL) {
+		ib_dma_unmap_single(priv->ca, rx_req->mapping[idx],
+		    seg->m_len, DMA_FROM_DEVICE);
+		seg = seg->m_next;
+		idx++;
+	}
+}
+
+/*
+ * Shrink a received chain from its posted size down to `length` bytes.
+ * `priv` is unused.
+ */
+void
+ipoib_dma_mb(struct ipoib_dev_priv *priv, struct mbuf *mb, unsigned int length)
+{
+
+	/*
+	 * A negative m_adj() count trims from the tail of the chain (per
+	 * mbuf(9)); the amount removed is the excess of the current packet
+	 * length over `length`.
+	 * NOTE(review): assumes length <= mb->m_pkthdr.len - confirm callers.
+	 */
+	m_adj(mb, -(mb->m_pkthdr.len - length));
+}
+
+/*
+ * Allocate an mbuf chain of at least `size` bytes for receive and DMA-map
+ * every segment, recording the addresses in rx_req->mapping[].
+ *
+ * Each segment's m_len is set to its full buffer capacity and the packet
+ * header length is the sum of those.  On success the chain is stored in
+ * rx_req->mb and returned.  On allocation or mapping failure all mappings
+ * made so far are released, the chain is freed, rx_req->mb is left NULL
+ * and NULL is returned.
+ */
+struct mbuf *
+ipoib_alloc_map_mb(struct ipoib_dev_priv *priv, struct ipoib_rx_buf *rx_req,
+    int size)
+{
+	struct mbuf *head, *seg;
+	int nmapped, k;
+
+	rx_req->mb = NULL;
+	head = m_getm2(NULL, size, M_NOWAIT, MT_DATA, M_PKTHDR);
+	if (head == NULL)
+		return (NULL);
+
+	nmapped = 0;
+	for (seg = head; seg != NULL; seg = seg->m_next) {
+		/* Use the segment's whole storage area. */
+		if (seg->m_flags & M_EXT)
+			seg->m_len = seg->m_ext.ext_size;
+		else if (seg->m_flags & M_PKTHDR)
+			seg->m_len = MHLEN;
+		else
+			seg->m_len = MLEN;
+		head->m_pkthdr.len += seg->m_len;
+
+		rx_req->mapping[nmapped] = ib_dma_map_single(priv->ca,
+		    mtod(seg, void *), seg->m_len, DMA_FROM_DEVICE);
+		if (unlikely(ib_dma_mapping_error(priv->ca,
+		    rx_req->mapping[nmapped]))) {
+			/* Roll back the segments mapped before this one. */
+			seg = head;
+			for (k = 0; k < nmapped; k++) {
+				ib_dma_unmap_single(priv->ca,
+				    rx_req->mapping[k], seg->m_len,
+				    DMA_FROM_DEVICE);
+				seg = seg->m_next;
+			}
+			m_freem(head);
+			return (NULL);
+		}
+		nmapped++;
+	}
+
+	rx_req->mb = head;
+	return (head);
+}
+
+/*
+ * Post receive ring slot `id` to the QP.
+ *
+ * Builds one scatter/gather entry per mbuf of the slot's chain in the
+ * shared priv->rx_sge[]/priv->rx_wr and tags the work request with
+ * id | IPOIB_OP_RECV.  If posting fails the slot's buffer is unmapped,
+ * freed and cleared.  Returns the ib_post_recv() result.
+ */
+static int ipoib_ib_post_receive(struct ipoib_dev_priv *priv, int id)
+{
+	struct ipoib_rx_buf *slot = &priv->rx_ring[id];
+	struct ib_recv_wr *bad_wr;
+	struct mbuf *seg;
+	int nsge = 0;
+	int error;
+
+	for (seg = slot->mb; seg != NULL; seg = seg->m_next) {
+		priv->rx_sge[nsge].addr = slot->mapping[nsge];
+		priv->rx_sge[nsge].length = seg->m_len;
+		nsge++;
+	}
+	priv->rx_wr.num_sge = nsge;
+	priv->rx_wr.wr_id = id | IPOIB_OP_RECV;
+
+	error = ib_post_recv(priv->qp, &priv->rx_wr, &bad_wr);
+	if (unlikely(error)) {
+		ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, error);
+		ipoib_dma_unmap_rx(priv, slot);
+		m_freem(slot->mb);
+		slot->mb = NULL;
+	}
+
+	return error;
+}
+
+/*
+ * Allocate and map a receive buffer for ring slot `id`, large enough for
+ * the maximum IB MTU plus the GRH.  Returns the chain or NULL on failure.
+ */
+static struct mbuf *
+ipoib_alloc_rx_mb(struct ipoib_dev_priv *priv, int id)
+{
+	struct ipoib_rx_buf *slot = &priv->rx_ring[id];
+
+	return ipoib_alloc_map_mb(priv, slot, priv->max_ib_mtu + IB_GRH_BYTES);
+}
+
+/*
+ * Fill the whole receive ring: allocate a buffer for each of the
+ * ipoib_recvq_size slots and post it.  Stops at the first failure and
+ * returns -ENOMEM (allocation) or -EIO (post); slots already posted are
+ * left in place for the caller to tear down.  Returns 0 on success.
+ */
+static int ipoib_ib_post_receives(struct ipoib_dev_priv *priv)
+{
+	int id;
+
+	for (id = 0; id < ipoib_recvq_size; id++) {
+		if (ipoib_alloc_rx_mb(priv, id) == NULL) {
+			ipoib_warn(priv, "failed to allocate receive buffer %d\n", id);
+			return -ENOMEM;
+		}
+
+		if (ipoib_ib_post_receive(priv, id) != 0) {
+			ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n", id);
+			return -EIO;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Process one receive completion that is not tagged IPOIB_OP_CM.
+ *
+ * - A flush error releases the slot's buffer and does not repost.
+ * - Any other failed completion, and any packet that this interface sent
+ *   itself, reposts the existing buffer unchanged.
+ * - For a good packet a replacement buffer is allocated first.  If that
+ *   fails the packet is dropped (if_iqdrops) and the old buffer is
+ *   reposted; otherwise the received chain is unmapped, trimmed to the
+ *   completed byte count, adjusted to start at the ipoib header and
+ *   handed to dev->if_input(), and the new buffer is posted.
+ */
+static void
+ipoib_ib_handle_rx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc)
+{
+	struct ipoib_rx_buf saverx;
+	/* Ring index: the work-request id with the RECV tag bit removed. */
+	unsigned int wr_id = wc->wr_id & ~IPOIB_OP_RECV;
+	struct ifnet *dev = priv->dev;
+	struct ipoib_header *eh;
+	struct mbuf *mb;
+
+	ipoib_dbg_data(priv, "recv completion: id %d, status: %d\n",
+		       wr_id, wc->status);
+
+	if (unlikely(wr_id >= ipoib_recvq_size)) {
+		ipoib_warn(priv, "recv completion event with wrid %d (> %d)\n",
+			   wr_id, ipoib_recvq_size);
+		return;
+	}
+
+	mb  = priv->rx_ring[wr_id].mb;
+
+	if (unlikely(wc->status != IB_WC_SUCCESS)) {
+		if (wc->status != IB_WC_WR_FLUSH_ERR) {
+			ipoib_warn(priv, "failed recv event "
+				   "(status=%d, wrid=%d vend_err %x)\n",
+				   wc->status, wr_id, wc->vendor_err);
+			goto repost;
+		}
+		/* Flushed: give the buffer back and leave the slot empty. */
+		if (mb) {
+			ipoib_dma_unmap_rx(priv, &priv->rx_ring[wr_id]);
+			m_freem(mb);
+			priv->rx_ring[wr_id].mb = NULL;
+		}
+		return;
+	}
+
+	/*
+	 * Drop packets that this interface sent, ie multicast packets
+	 * that the HCA has replicated.
+	 */
+	if (wc->slid == priv->local_lid && wc->src_qp == priv->qp->qp_num)
+		goto repost;
+
+	/*
+	 * Snapshot the slot: ipoib_alloc_rx_mb() overwrites its mb pointer
+	 * and mappings, and the old values are needed either to restore the
+	 * slot on failure or to unmap the received chain on success.
+	 */
+	memcpy(&saverx, &priv->rx_ring[wr_id], sizeof(saverx));
+	/*
+	 * If we can't allocate a new RX buffer, dump
+	 * this packet and reuse the old buffer.
+	 */
+	if (unlikely(!ipoib_alloc_rx_mb(priv, wr_id))) {
+		memcpy(&priv->rx_ring[wr_id], &saverx, sizeof(saverx));
+		dev->if_iqdrops++;
+		goto repost;
+	}
+
+	ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
+		       wc->byte_len, wc->slid);
+
+	/* Release the old mappings, then cut the chain to what arrived. */
+	ipoib_dma_unmap_rx(priv, &saverx);
+	ipoib_dma_mb(priv, mb, wc->byte_len);
+
+	++dev->if_ipackets;
+	dev->if_ibytes += mb->m_pkthdr.len;
+	mb->m_pkthdr.rcvif = dev;
+	/*
+	 * Drop the leading part of the GRH, keeping its last
+	 * INFINIBAND_ALEN bytes as the hwaddr field of the ipoib header.
+	 */
+	m_adj(mb, sizeof(struct ib_grh) - INFINIBAND_ALEN);
+	eh = mtod(mb, struct ipoib_header *);
+	bzero(eh->hwaddr, 4);	/* Zero the queue pair, only dgid is in grh */
+
+	if (test_bit(IPOIB_FLAG_CSUM, &priv->flags) && likely(wc->csum_ok))
+		mb->m_pkthdr.csum_flags = CSUM_IP_CHECKED | CSUM_IP_VALID;
+
+	dev->if_input(dev, mb);
+
+repost:
+	if (unlikely(ipoib_ib_post_receive(priv, wr_id)))
+		ipoib_warn(priv, "ipoib_ib_post_receive failed "
+			   "for buf %d\n", wr_id);
+}
+
+/*
+ * Prepare tx_req->mb for transmission and DMA-map each of its segments
+ * into tx_req->mapping[].
+ *
+ * Zero-length mbufs after the first are unlinked and freed (an empty
+ * first mbuf panics).  If the chain still has too many segments for
+ * `max`, it is compacted with m_defrag() and tx_req->mb is updated to the
+ * new chain.
+ *
+ * Returns 0 on success or -EIO on failure.  On failure nothing remains
+ * mapped and tx_req->mb always refers to a live chain that the caller
+ * still owns and must free: m_defrag() leaves the original chain intact
+ * when it fails, so the pointer is only replaced once it has succeeded.
+ * Callers must re-read tx_req->mb afterwards, since the chain they passed
+ * in may have been replaced.
+ *
+ * NOTE(review): the pre-defrag test accepts up to `max` segments while
+ * the post-defrag test rejects exactly `max`; confirm which bound matches
+ * the size of the mapping/sge arrays.
+ */
+int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req, int max)
+{
+	struct mbuf *mb = tx_req->mb;
+	u64 *mapping = tx_req->mapping;
+	struct mbuf *m, *p;
+	int error;
+	int i;
+
+	/* Count segments, unlinking empty mbufs as we go. */
+	for (m = mb, p = NULL, i = 0; m != NULL; p = m, m = m->m_next, i++) {
+		if (m->m_len != 0)
+			continue;
+		if (p == NULL)
+			panic("ipoib_dma_map_tx: First mbuf empty\n");
+		p->m_next = m_free(m);
+		m = p;
+		i--;
+	}
+	i--;
+	if (i >= max) {
+		/*
+		 * Keep tx_req->mb pointing at the original chain until the
+		 * defragmented copy exists; otherwise a failed m_defrag()
+		 * would drop the only reference and leak the packet.
+		 */
+		m = m_defrag(mb, M_DONTWAIT);
+		if (m == NULL)
+			return -EIO;
+		tx_req->mb = mb = m;
+		for (m = mb, i = 0; m != NULL; m = m->m_next, i++);
+		if (i >= max)
+			return -EIO;
+	}
+	error = 0;
+	for (m = mb, i = 0; m != NULL; m = m->m_next, i++) {
+		mapping[i] = ib_dma_map_single(ca, mtod(m, void *),
+					       m->m_len, DMA_TO_DEVICE);
+		if (unlikely(ib_dma_mapping_error(ca, mapping[i]))) {
+			error = -EIO;
+			break;
+		}
+	}
+	if (error) {
+		int end;
+
+		/* Unwind the segments mapped before the failing one. */
+		end = i;
+		for (m = mb, i = 0; i < end; m = m->m_next, i++)
+			ib_dma_unmap_single(ca, mapping[i], m->m_len,
+					    DMA_TO_DEVICE);
+	}
+	return error;
+}
+
+/*
+ * Release the DMA mappings of a transmit buffer: mapping[k] belongs to
+ * the k-th mbuf of tx_req->mb and is unmapped with that mbuf's m_len.
+ */
+void ipoib_dma_unmap_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req)
+{
+	struct mbuf *seg = tx_req->mb;
+	int idx = 0;
+
+	while (seg != NULL) {
+		ib_dma_unmap_single(ca, tx_req->mapping[idx], seg->m_len,
+		    DMA_TO_DEVICE);
+		seg = seg->m_next;
+		idx++;
+	}
+}
+
+/*
+ * Process one send completion that is not tagged IPOIB_OP_CM.
+ *
+ * Unmaps and frees the transmitted chain, updates the output counters
+ * and advances tx_tail.  When the number of outstanding sends drops to
+ * exactly half the send queue while the interface is marked OACTIVE and
+ * administratively up, OACTIVE is cleared.  Completions that failed for
+ * a reason other than a flush are logged.
+ */
+static void ipoib_ib_handle_tx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc)
+{
+	unsigned int wr_id = wc->wr_id;
+	struct ifnet *dev = priv->dev;
+	struct ipoib_tx_buf *slot;
+
+	ipoib_dbg_data(priv, "send completion: id %d, status: %d\n",
+		       wr_id, wc->status);
+
+	if (unlikely(wr_id >= ipoib_sendq_size)) {
+		ipoib_warn(priv, "send completion event with wrid %d (> %d)\n",
+			   wr_id, ipoib_sendq_size);
+		return;
+	}
+
+	slot = &priv->tx_ring[wr_id];
+	ipoib_dma_unmap_tx(priv->ca, slot);
+
+	/* Account for the packet before its chain is released. */
+	dev->if_opackets++;
+	dev->if_obytes += slot->mb->m_pkthdr.len;
+	m_freem(slot->mb);
+
+	priv->tx_tail++;
+	priv->tx_outstanding--;
+	if (unlikely(priv->tx_outstanding == (ipoib_sendq_size >> 1)) &&
+	    (dev->if_drv_flags & IFF_DRV_OACTIVE) &&
+	    test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
+		dev->if_drv_flags &= ~IFF_DRV_OACTIVE;
+
+	switch (wc->status) {
+	case IB_WC_SUCCESS:
+	case IB_WC_WR_FLUSH_ERR:
+		break;
+	default:
+		ipoib_warn(priv, "failed send event "
+			   "(status=%d, wrid=%d vend_err %x)\n",
+			   wc->status, wr_id, wc->vendor_err);
+		break;
+	}
+}
+
+/*
+ * Poll the send CQ once for up to MAX_SEND_CQE completions and dispatch
+ * each to the CM or datagram handler according to IPOIB_OP_CM in its
+ * wr_id.  Returns non-zero when a full batch was reaped, i.e. when more
+ * completions may be waiting.
+ */
+int
+ipoib_poll_tx(struct ipoib_dev_priv *priv)
+{
+	struct ib_wc *wc;
+	int polled, idx;
+
+	polled = ib_poll_cq(priv->send_cq, MAX_SEND_CQE, priv->send_wc);
+
+	idx = 0;
+	while (idx < polled) {
+		wc = &priv->send_wc[idx];
+		if (wc->wr_id & IPOIB_OP_CM)
+			ipoib_cm_handle_tx_wc(priv, wc);
+		else
+			ipoib_ib_handle_tx_wc(priv, wc);
+		++idx;
+	}
+
+	return (polled == MAX_SEND_CQE);
+}
+
+/*
+ * Drain the receive CQ.
+ *
+ * Completions are polled in batches of IPOIB_NUM_WC until a short batch
+ * is seen, then the CQ is re-armed.  If re-arming returns non-zero the
+ * whole drain is repeated.  Every completion must carry IPOIB_OP_RECV
+ * in its wr_id (anything else panics) and is routed to the CM or
+ * datagram receive handler by the IPOIB_OP_CM bit.
+ */
+static void
+ipoib_poll(struct ipoib_dev_priv *priv)
+{
+	struct ib_wc *wc;
+	int count, idx;
+
+	do {
+		do {
+			count = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC,
+			    priv->ibwc);
+
+			for (idx = 0; idx < count; idx++) {
+				wc = &priv->ibwc[idx];
+
+				if (!(wc->wr_id & IPOIB_OP_RECV))
+					panic("ipoib_poll: Bad wr_id 0x%jX\n",
+					    (intmax_t)wc->wr_id);
+				if (wc->wr_id & IPOIB_OP_CM)
+					ipoib_cm_handle_rx_wc(priv, wc);
+				else
+					ipoib_ib_handle_rx_wc(priv, wc);
+			}
+		} while (count == IPOIB_NUM_WC);
+	} while (ib_req_notify_cq(priv->recv_cq,
+	    IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS));
+}
+
+/*
+ * Receive CQ completion callback; `dev_ptr` is the ipoib_dev_priv and
+ * `cq` is unused.  Simply drains the receive CQ.
+ */
+void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr)
+{
+	ipoib_poll((struct ipoib_dev_priv *)dev_ptr);
+}
+
+/*
+ * Reap every pending send completion under priv->lock.  If the interface
+ * is still marked OACTIVE afterwards, re-arm the poll timer for the next
+ * tick so reaping is retried.
+ */
+static void drain_tx_cq(struct ipoib_dev_priv *priv)
+{
+	struct ifnet *dev = priv->dev;
+	int more;
+
+	spin_lock(&priv->lock);
+
+	do {
+		more = ipoib_poll_tx(priv);
+	} while (more);
+
+	if ((dev->if_drv_flags & IFF_DRV_OACTIVE) != 0)
+		mod_timer(&priv->poll_timer, jiffies + 1);
+
+	spin_unlock(&priv->lock);
+}
+
+/*
+ * Send CQ completion callback; `dev_ptr` is the ipoib_dev_priv and `cq`
+ * is unused.  No reaping is done here: the poll timer (whose handler
+ * drains the send CQ) is scheduled to expire immediately.
+ */
+void ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr)
+{
+	struct ipoib_dev_priv *priv;
+
+	priv = (struct ipoib_dev_priv *)dev_ptr;
+	mod_timer(&priv->poll_timer, jiffies);
+}
+
+/*
+ * Fill in the shared send work request (priv->tx_wr / priv->tx_sge) for
+ * tx_req and post it to the QP.
+ *
+ * One scatter/gather entry is built per mbuf of tx_req->mb from the
+ * already-established DMA mappings.  A non-NULL `head` selects an LSO
+ * work request carrying `head`/`hlen` as the header; otherwise a plain
+ * send is posted.  Returns the ib_post_send() result.
+ */
+static inline int
+post_send(struct ipoib_dev_priv *priv, unsigned int wr_id,
+    struct ib_ah *address, u32 qpn, struct ipoib_tx_buf *tx_req, void *head,
+    int hlen)
+{
+	struct ib_send_wr *bad_wr;
+	struct mbuf *seg;
+	int nsge = 0;
+
+	for (seg = tx_req->mb; seg != NULL; seg = seg->m_next) {
+		priv->tx_sge[nsge].addr = tx_req->mapping[nsge];
+		priv->tx_sge[nsge].length = seg->m_len;
+		nsge++;
+	}
+
+	priv->tx_wr.num_sge = nsge;
+	priv->tx_wr.wr_id = wr_id;
+	priv->tx_wr.wr.ud.remote_qpn = qpn;
+	priv->tx_wr.wr.ud.ah = address;
+
+	if (head == NULL) {
+		priv->tx_wr.opcode = IB_WR_SEND;
+	} else {
+		priv->tx_wr.wr.ud.mss = 0; /* XXX mb_shinfo(mb)->gso_size; */
+		priv->tx_wr.wr.ud.header = head;
+		priv->tx_wr.wr.ud.hlen = hlen;
+		priv->tx_wr.opcode = IB_WR_LSO;
+	}
+
+	return ib_post_send(priv->qp, &priv->tx_wr, &bad_wr);
+}
+
+/*
+ * Transmit `mb` to the destination described by `address`/`qpn`.
+ *
+ * The leading ipoib pseudo header is stripped, the packet is checked
+ * against the multicast MTU, placed in the next transmit ring slot,
+ * DMA-mapped and posted.  The mbuf is consumed on every path handled in
+ * this function; errors are reported only through if_oerrors and log
+ * messages.
+ * NOTE(review): the over-MTU path hands `mb` to ipoib_cm_mb_too_long()
+ * and does not free it here - presumably that helper takes ownership;
+ * verify.
+ *
+ * When the send queue becomes full the interface is marked OACTIVE and a
+ * send-CQ notification is requested.  If posting fails the slot is undone
+ * and OACTIVE is cleared again.
+ */
+void
+ipoib_send(struct ipoib_dev_priv *priv, struct mbuf *mb,
+    struct ipoib_ah *address, u32 qpn)
+{
+	struct ifnet *dev = priv->dev;
+	struct ipoib_tx_buf *tx_req;
+	int hlen;
+	void *phead;
+
+	/* Reap completed sends inline once enough of them are outstanding. */
+	if (unlikely(priv->tx_outstanding > MAX_SEND_CQE))
+		while (ipoib_poll_tx(priv))
+			; /* nothing */
+
+	m_adj(mb, sizeof (struct ipoib_pseudoheader));
+	if (0 /* XXX segment offload mb_is_gso(mb) */) {
+		/* XXX hlen = mb_transport_offset(mb) + tcp_hdrlen(mb); */
+		phead = mtod(mb, void *);
+		if (mb->m_len < hlen) {
+			ipoib_warn(priv, "linear data too small\n");
+			++dev->if_oerrors;
+			m_freem(mb);
+			return;
+		}
+		m_adj(mb, hlen);
+	} else {
+		if (unlikely(mb->m_pkthdr.len - IPOIB_ENCAP_LEN > priv->mcast_mtu)) {
+			ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
+				   mb->m_pkthdr.len, priv->mcast_mtu);
+			++dev->if_oerrors;
+			ipoib_cm_mb_too_long(priv, mb, priv->mcast_mtu);
+			return;
+		}
+		phead = NULL;
+		hlen  = 0;
+	}
+
+	ipoib_dbg_data(priv, "sending packet, length=%d address=%p qpn=0x%06x\n",
+		       mb->m_pkthdr.len, address, qpn);
+
+	/*
+	 * We put the mb into the tx_ring _before_ we call post_send()
+	 * because it's entirely possible that the completion handler will
+	 * run before we execute anything after the post_send().  That
+	 * means we have to make sure everything is properly recorded and
+	 * our state is consistent before we call post_send().
+	 */
+	tx_req = &priv->tx_ring[priv->tx_head & (ipoib_sendq_size - 1)];
+	tx_req->mb = mb;
+	if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req, IPOIB_UD_TX_SG))) {
+		++dev->if_oerrors;
+		if (tx_req->mb)
+			m_freem(tx_req->mb);
+		return;
+	}
+
+	/*
+	 * ipoib_dma_map_tx() may have replaced the chain through
+	 * m_defrag(), which frees the original on success.  Reload the
+	 * pointer so the checksum test and the error path below never touch
+	 * (or double free) the old chain.
+	 */
+	mb = tx_req->mb;
+
+	if (mb->m_pkthdr.csum_flags & (CSUM_IP|CSUM_TCP|CSUM_UDP))
+		priv->tx_wr.send_flags |= IB_SEND_IP_CSUM;
+	else
+		priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM;
+
+	if (++priv->tx_outstanding == ipoib_sendq_size) {
+		ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
+		if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP))
+			ipoib_warn(priv, "request notify on send CQ failed\n");
+		dev->if_drv_flags |= IFF_DRV_OACTIVE;
+	}
+
+	if (unlikely(post_send(priv,
+	    priv->tx_head & (ipoib_sendq_size - 1), address->ah, qpn,
+	    tx_req, phead, hlen))) {
+		ipoib_warn(priv, "post_send failed\n");
+		++dev->if_oerrors;
+		--priv->tx_outstanding;
+		ipoib_dma_unmap_tx(priv->ca, tx_req);
+		m_freem(mb);
+		if (dev->if_drv_flags & IFF_DRV_OACTIVE)
+			dev->if_drv_flags &= ~IFF_DRV_OACTIVE;
+	} else {
+		/* Remember which send last used this AH, for the reaper. */
+		address->last_send = priv->tx_head;
+		++priv->tx_head;
+	}
+}
+
+/*
+ * Destroy the address handles on priv->dead_ahs that are no longer in
+ * use by the send queue.
+ *
+ * A handle may go once tx_tail has caught up with the ring position
+ * recorded in its last_send; the signed difference keeps the comparison
+ * valid across counter wrap-around.  Runs under priv->lock.
+ */
+static void __ipoib_reap_ah(struct ipoib_dev_priv *priv)
+{
+	struct ipoib_ah *ah, *tah;
+	unsigned long flags;
+
+	spin_lock_irqsave(&priv->lock, flags);
+
+	list_for_each_entry_safe(ah, tah, &priv->dead_ahs, list)
+		if ((int) priv->tx_tail - (int) ah->last_send >= 0) {
+			list_del(&ah->list);
+			ib_destroy_ah(ah->ah);
+			kfree(ah);
+		}
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+}
+
+/*
+ * Delayed-work handler for the address-handle reaper.  Reaps once and,
+ * unless IPOIB_STOP_REAPER has been set, schedules itself again HZ ticks
+ * from now.
+ */
+void ipoib_reap_ah(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv;
+
+	priv = container_of(work, struct ipoib_dev_priv, ah_reap_task.work);
+	__ipoib_reap_ah(priv);
+
+	if (test_bit(IPOIB_STOP_REAPER, &priv->flags))
+		return;
+
+	queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ);
+}
+
+/*
+ * Reap dead address handles until none remain, sleeping 1ms between
+ * passes.  Gives up with a warning once more than HZ ticks have elapsed
+ * since entry, leaving whatever is still on the list unreclaimed.
+ */
+static void ipoib_ah_dev_cleanup(struct ipoib_dev_priv *priv)
+{
+	unsigned long start = jiffies;
+
+	for (;;) {
+		if (list_empty(&priv->dead_ahs))
+			break;
+
+		__ipoib_reap_ah(priv);
+
+		if (time_after(jiffies, start + HZ)) {
+			ipoib_warn(priv, "timing out; will leak address handles\n");
+			break;
+		}
+
+		msleep(1);
+	}
+}
+
+/*
+ * Poll-timer callback; `ctx` is the ipoib_dev_priv pointer registered
+ * with setup_timer().  Drains the send completion queue.
+ */
+static void ipoib_ib_tx_timer_func(unsigned long ctx)
+{
+	struct ipoib_dev_priv *priv = (struct ipoib_dev_priv *)ctx;
+
+	drain_tx_cq(priv);
+}
+
+/*
+ * Bring up the IB side of the interface.
+ *
+ * Looks up the P_Key index (tracking the result in IPOIB_PKEY_ASSIGNED),
+ * initializes the QP, posts the receive ring, opens the connected-mode
+ * side and finally starts the address-handle reaper.  Returns 0 on
+ * success and -1 on any failure; failures after the QP has been
+ * initialized stop the device again before returning.
+ */
+int ipoib_ib_dev_open(struct ipoib_dev_priv *priv)
+{
+	int ret;
+
+	if (ib_find_pkey(priv->ca, priv->port, priv->pkey,
+	    &priv->pkey_index) != 0) {
+		ipoib_warn(priv, "P_Key 0x%04x not found\n", priv->pkey);
+		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+		return -1;
+	}
+	set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+
+	ret = ipoib_init_qp(priv);
+	if (ret != 0) {
+		ipoib_warn(priv, "ipoib_init_qp returned %d\n", ret);
+		return -1;
+	}
+
+	ret = ipoib_ib_post_receives(priv);
+	if (ret != 0) {
+		ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret);
+		goto stop;
+	}
+
+	ret = ipoib_cm_dev_open(priv);
+	if (ret != 0) {
+		ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret);
+		goto stop;
+	}
+
+	clear_bit(IPOIB_STOP_REAPER, &priv->flags);
+	queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ);
+
+	return 0;
+
+stop:
+	/* Undo the QP setup and any receives that were already posted. */
+	ipoib_ib_dev_stop(priv, 1);
+	return -1;
+}
+
+/*
+ * Set or clear IPOIB_PKEY_ASSIGNED according to whether the interface's
+ * P_Key can currently be found on its port.  The index that is found is
+ * discarded.
+ */
+static void ipoib_pkey_dev_check_presence(struct ipoib_dev_priv *priv)
+{
+	u16 idx = 0;
+
+	if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &idx) == 0)
+		set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+	else
+		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+}
+
+/*
+ * Mark the interface operationally up and start the multicast thread,
+ * provided its P_Key is present.  Without a P_Key nothing is started and
+ * 0 is returned; otherwise the result of ipoib_mcast_start_thread() is
+ * returned.
+ */
+int ipoib_ib_dev_up(struct ipoib_dev_priv *priv)
+{
+
+	ipoib_pkey_dev_check_presence(priv);
+
+	if (test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
+		set_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
+		return ipoib_mcast_start_thread(priv);
+	}
+
+	ipoib_dbg(priv, "PKEY is not assigned.\n");
+	return 0;
+}
+
+/*
+ * Take the interface operationally down: clear IPOIB_FLAG_OPER_UP,
+ * report link down, stop P_Key polling if the P_Key was never assigned,
+ * stop the multicast thread and flush multicast groups and paths.
+ *
+ * `flush` is passed to ipoib_mcast_stop_thread() and also decides whether
+ * ipoib_workqueue is flushed after cancelling the P_Key poller.  Always
+ * returns 0.
+ */
+int ipoib_ib_dev_down(struct ipoib_dev_priv *priv, int flush)
+{
+
+	ipoib_dbg(priv, "downing ib_dev\n");
+
+	clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
+	if_link_state_change(priv->dev, LINK_STATE_DOWN);
+
+	/* Shutdown the P_Key thread if still active */
+	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
+		/* pkey_mutex keeps ipoib_pkey_poll() from re-arming meanwhile. */
+		mutex_lock(&pkey_mutex);
+		set_bit(IPOIB_PKEY_STOP, &priv->flags);
+		cancel_delayed_work(&priv->pkey_poll_task);
+		mutex_unlock(&pkey_mutex);
+		if (flush)
+			flush_workqueue(ipoib_workqueue);
+	}
+
+	ipoib_mcast_stop_thread(priv, flush);
+	ipoib_mcast_dev_flush(priv);
+
+	ipoib_flush_paths(priv);
+
+	return 0;
+}
+
+/*
+ * Count the receive ring slots that still hold a buffer, i.e. receives
+ * that have been posted but not yet completed or flushed.
+ */
+static int recvs_pending(struct ipoib_dev_priv *priv)
+{
+	int count = 0;
+	int slot;
+
+	for (slot = 0; slot < ipoib_recvq_size; slot++) {
+		if (priv->rx_ring[slot].mb != NULL)
+			count++;
+	}
+
+	return count;
+}
+
+/*
+ * Drain both completion queues while the device is being stopped.
+ *
+ * Receive completions are polled until a short batch is returned; each
+ * successful one is rewritten as a flush error before being dispatched so
+ * that its buffer is released instead of being passed up the stack.  The
+ * send CQ is then reaped under priv->lock.
+ */
+void ipoib_drain_cq(struct ipoib_dev_priv *priv)
+{
+	struct ib_wc *wc;
+	int idx, count;
+
+	do {
+		count = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC, priv->ibwc);
+
+		for (idx = 0; idx < count; ++idx) {
+			wc = &priv->ibwc[idx];
+
+			/*
+			 * Convert any successful completions to flush
+			 * errors to avoid passing packets up the
+			 * stack after bringing the device down.
+			 */
+			if (wc->status == IB_WC_SUCCESS)
+				wc->status = IB_WC_WR_FLUSH_ERR;
+
+			if (!(wc->wr_id & IPOIB_OP_RECV))
+				panic("ipoib_drain_cq:  Bad wrid 0x%jX\n",
+				    (intmax_t)wc->wr_id);
+
+			if (wc->wr_id & IPOIB_OP_CM)
+				ipoib_cm_handle_rx_wc(priv, wc);
+			else
+				ipoib_ib_handle_rx_wc(priv, wc);
+		}
+	} while (count == IPOIB_NUM_WC);
+
+	spin_lock(&priv->lock);
+	while (ipoib_poll_tx(priv))
+		continue;
+	spin_unlock(&priv->lock);
+}
+
+/*
+ * Quiesce the IB side of the interface.
+ *
+ * Stops connected mode, moves the QP to the error state and waits, for at
+ * most 5 seconds, until every posted send and receive has completed or
+ * been flushed.  If the wait times out, the remaining transmit and
+ * receive buffers are unmapped and freed by hand.  Afterwards the poll
+ * timer is stopped, the QP is reset, the address-handle reaper is
+ * stopped and dead handles are reaped, and the receive CQ is re-armed.
+ *
+ * `flush` selects whether ipoib_workqueue is flushed after the reaper is
+ * cancelled.  Always returns 0; failures are only logged.
+ */
+int ipoib_ib_dev_stop(struct ipoib_dev_priv *priv, int flush)
+{
+	struct ib_qp_attr qp_attr;
+	unsigned long begin;
+	struct ipoib_tx_buf *tx_req;
+	int i;
+
+	ipoib_cm_dev_stop(priv);
+
+	/*
+	 * Move our QP to the error state and then reinitialize it
+	 * when all work requests have completed or have been flushed.
+	 */
+	qp_attr.qp_state = IB_QPS_ERR;
+	if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
+		ipoib_warn(priv, "Failed to modify QP to ERROR state\n");
+
+	/* Wait for all sends and receives to complete */
+	begin = jiffies;
+
+	while (priv->tx_head != priv->tx_tail || recvs_pending(priv)) {
+		if (time_after(jiffies, begin + 5 * HZ)) {
+			ipoib_warn(priv, "timing out; %d sends %d receives not completed\n",
+				   priv->tx_head - priv->tx_tail, recvs_pending(priv));
+
+			/*
+			 * assume the HW is wedged and just free up
+			 * all our pending work requests.
+			 */
+			/* Signed difference stays correct across wrap-around. */
+			while ((int) priv->tx_tail - (int) priv->tx_head < 0) {
+				tx_req = &priv->tx_ring[priv->tx_tail &
+							(ipoib_sendq_size - 1)];
+				ipoib_dma_unmap_tx(priv->ca, tx_req);
+				m_freem(tx_req->mb);
+				++priv->tx_tail;
+				--priv->tx_outstanding;
+			}
+
+			for (i = 0; i < ipoib_recvq_size; ++i) {
+				struct ipoib_rx_buf *rx_req;
+
+				rx_req = &priv->rx_ring[i];
+				if (!rx_req->mb)
+					continue;
+				ipoib_dma_unmap_rx(priv, &priv->rx_ring[i]);
+				m_freem(rx_req->mb);
+				rx_req->mb = NULL;
+			}
+
+			goto timeout;
+		}
+
+		/* Reap whatever has completed so far, then back off briefly. */
+		ipoib_drain_cq(priv);
+
+		msleep(1);
+	}
+
+	ipoib_dbg(priv, "All sends and receives done.\n");
+
+timeout:
+	del_timer_sync(&priv->poll_timer);
+	qp_attr.qp_state = IB_QPS_RESET;
+	if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
+		ipoib_warn(priv, "Failed to modify QP to RESET state\n");
+
+	/* Wait for all AHs to be reaped */
+	set_bit(IPOIB_STOP_REAPER, &priv->flags);
+	cancel_delayed_work(&priv->ah_reap_task);
+	if (flush)
+		flush_workqueue(ipoib_workqueue);
+
+	ipoib_ah_dev_cleanup(priv);
+
+	ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP);
+
+	return 0;
+}
+
+/*
+ * Bind the interface to IB device `ca` / port `port`.
+ *
+ * Sets up the transport resources and the send-poll timer, and opens the
+ * device right away if the interface is already flagged IFF_UP.  Returns
+ * 0 on success or -ENODEV on failure; if the open fails the transport
+ * resources are released again.
+ */
+int ipoib_ib_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca, int port)
+{
+	struct ifnet *dev = priv->dev;
+
+	priv->qp = NULL;
+	priv->port = port;
+	priv->ca = ca;
+
+	if (ipoib_transport_dev_init(priv, ca) != 0) {
+		printk(KERN_WARNING "%s: ipoib_transport_dev_init failed\n", ca->name);
+		return -ENODEV;
+	}
+
+	setup_timer(&priv->poll_timer, ipoib_ib_tx_timer_func,
+		    (unsigned long) priv);
+
+	if ((dev->if_flags & IFF_UP) == 0)
+		return 0;
+
+	if (ipoib_ib_dev_open(priv) != 0) {
+		ipoib_transport_dev_cleanup(priv);
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+/*
+ * Flush the interface, and recursively all of its child interfaces, at
+ * the requested level:
+ *
+ *  - IPOIB_FLUSH_LIGHT:  mark paths invalid and flush multicast groups.
+ *  - IPOIB_FLUSH_NORMAL: additionally take the device down and back up.
+ *  - IPOIB_FLUSH_HEAVY:  additionally stop and reopen the device, but
+ *    only when the P_Key index changed (or the P_Key had been missing).
+ *
+ * Nothing is done for an interface that is not initialized or not
+ * administratively up.
+ */
+static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
+				enum ipoib_flush_level level)
+{
+	struct ipoib_dev_priv *cpriv;
+	u16 new_index;
+
+	mutex_lock(&priv->vlan_mutex);
+
+	/*
+	 * Flush any child interfaces too -- they might be up even if
+	 * the parent is down.
+	 */
+	list_for_each_entry(cpriv, &priv->child_intfs, list)
+		__ipoib_ib_dev_flush(cpriv, level);
+
+	mutex_unlock(&priv->vlan_mutex);
+
+	if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) {
+		ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n");
+		return;
+	}
+
+	if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
+		ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_ADMIN_UP not set.\n");
+		return;
+	}
+
+	if (level == IPOIB_FLUSH_HEAVY) {
+		/*
+		 * P_Key gone: stop everything and fall back to polling for
+		 * it.  A non-zero return from the delay-open helper means
+		 * the poller has been armed and will reopen the device.
+		 *
+		 * NOTE(review): if the lookup below fails but the P_Key
+		 * shows up again before ipoib_pkey_dev_delay_open() checks,
+		 * execution continues with new_index never written, and it
+		 * is then compared and stored below.  Looks like an
+		 * uninitialized read - confirm and consider repeating the
+		 * lookup on that path.
+		 */
+		if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) {
+			clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+			ipoib_ib_dev_down(priv, 0);
+			ipoib_ib_dev_stop(priv, 0);
+			if (ipoib_pkey_dev_delay_open(priv))
+				return;
+		}
+
+		/* restart QP only if P_Key index is changed */
+		if (test_and_set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) &&
+		    new_index == priv->pkey_index) {
+			ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n");
+			return;
+		}
+		priv->pkey_index = new_index;
+	}
+
+	if (level == IPOIB_FLUSH_LIGHT) {
+		ipoib_mark_paths_invalid(priv);
+		ipoib_mcast_dev_flush(priv);
+	}
+
+	if (level >= IPOIB_FLUSH_NORMAL)
+		ipoib_ib_dev_down(priv, 0);
+
+	if (level == IPOIB_FLUSH_HEAVY) {
+		ipoib_ib_dev_stop(priv, 0);
+		ipoib_ib_dev_open(priv);
+	}
+
+	/*
+	 * The device could have been brought down between the start and when
+	 * we get here, don't bring it back up if it's not configured up
+	 */
+	if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
+		if (level >= IPOIB_FLUSH_NORMAL)
+			ipoib_ib_dev_up(priv);
+		ipoib_mcast_restart_task(&priv->restart_task);
+	}
+}
+
+/* Work handler for priv->flush_light: run a light flush of the interface. */
+void ipoib_ib_dev_flush_light(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv;
+
+	priv = container_of(work, struct ipoib_dev_priv, flush_light);
+	__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_LIGHT);
+}
+
+/* Work handler for priv->flush_normal: run a normal flush of the interface. */
+void ipoib_ib_dev_flush_normal(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv;
+
+	priv = container_of(work, struct ipoib_dev_priv, flush_normal);
+	__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_NORMAL);
+}
+
+/* Work handler for priv->flush_heavy: run a heavy flush of the interface. */
+void ipoib_ib_dev_flush_heavy(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv;
+
+	priv = container_of(work, struct ipoib_dev_priv, flush_heavy);
+	__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_HEAVY);
+}
+
+/*
+ * Final IB teardown for the interface: stop the multicast thread
+ * (flushing), flush multicast groups, reap any remaining dead address
+ * handles and release the transport resources.
+ */
+void ipoib_ib_dev_cleanup(struct ipoib_dev_priv *priv)
+{
+
+	ipoib_dbg(priv, "cleaning up ib_dev\n");
+
+	ipoib_mcast_stop_thread(priv, 1);
+	ipoib_mcast_dev_flush(priv);
+
+	ipoib_ah_dev_cleanup(priv);
+	ipoib_transport_dev_cleanup(priv);
+}
+
+/*
+ * Delayed P_Key Assignment Interim Support
+ *
+ * The following is an initial implementation of the delayed P_Key
+ * assignment mechanism. It uses the same approach as the one implemented
+ * for the multicast group join. The single goal of this implementation is
+ * to quickly address Bug #2507. This implementation will probably be
+ * removed when the P_Key change async notification is available.
+ */
+
+/*
+ * Delayed-work handler that waits for the interface's P_Key to appear.
+ *
+ * Once the P_Key is present the interface is opened.  Otherwise the work
+ * re-queues itself HZ ticks from now, unless IPOIB_PKEY_STOP has been
+ * set; that check and the re-queue happen under pkey_mutex.
+ */
+void ipoib_pkey_poll(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv;
+
+	priv = container_of(work, struct ipoib_dev_priv, pkey_poll_task.work);
+
+	ipoib_pkey_dev_check_presence(priv);
+
+	if (test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
+		ipoib_open(priv);
+		return;
+	}
+
+	mutex_lock(&pkey_mutex);
+	if (!test_bit(IPOIB_PKEY_STOP, &priv->flags))
+		queue_delayed_work(ipoib_workqueue, &priv->pkey_poll_task, HZ);
+	mutex_unlock(&pkey_mutex);
+}
+
+/*
+ * Decide whether opening the interface has to wait for its P_Key.
+ *
+ * Returns 0 if the P_Key is present in the port's P_Key table, so the
+ * caller may proceed.  Returns 1 if it is missing, in which case
+ * IPOIB_PKEY_STOP is cleared and the P_Key poller is scheduled to run HZ
+ * ticks from now.
+ */
+int ipoib_pkey_dev_delay_open(struct ipoib_dev_priv *priv)
+{
+
+	/*
+	 * Look for the interface pkey value in the IB port P_Key table and
+	 * set the interface pkey assignment flag accordingly.
+	 */
+	ipoib_pkey_dev_check_presence(priv);
+
+	if (test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
+		return 0;
+
+	/* P_Key value not assigned yet - start polling */
+	mutex_lock(&pkey_mutex);
+	clear_bit(IPOIB_PKEY_STOP, &priv->flags);
+	queue_delayed_work(ipoib_workqueue, &priv->pkey_poll_task, HZ);
+	mutex_unlock(&pkey_mutex);
+
+	return 1;
+}
diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c
new file mode 100644
index 0000000..99c9cc2
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -0,0 +1,1537 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ipoib.h"
+
+static	int ipoib_resolvemulti(struct ifnet *, struct sockaddr **,
+		struct sockaddr *);
+
+
+#include <linux/module.h>
+
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/vmalloc.h>
+
+#include <linux/if_arp.h>	/* For ARPHRD_xxx */
+#include <linux/if_vlan.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
+MODULE_LICENSE("Dual BSD/GPL");
+
+int ipoib_sendq_size = IPOIB_TX_RING_SIZE;
+int ipoib_recvq_size = IPOIB_RX_RING_SIZE;
+
+module_param_named(send_queue_size, ipoib_sendq_size, int, 0444);
+MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");
+module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);
+MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue");
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+int ipoib_debug_level = 1;
+
+module_param_named(debug_level, ipoib_debug_level, int, 0644);
+MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
+#endif
+
+/*
+ * Cursor used by the debug path iteration helpers: the device being
+ * walked plus a by-value copy of the path most recently returned.
+ */
+struct ipoib_path_iter {
+	struct ipoib_dev_priv *priv;
+	struct ipoib_path  path;
+};
+
+/*
+ * Template link-level broadcast address.  It is copied into each
+ * interface's broadcastaddr when the private data is allocated; bytes 8
+ * and 9 are later overwritten with the interface P_Key.
+ */
+static const u8 ipv4_bcast_addr[] = {
+	0x00, 0xff, 0xff, 0xff,
+	0xff, 0x12, 0x40, 0x1b,	0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00,	0xff, 0xff, 0xff, 0xff
+};
+
+struct workqueue_struct *ipoib_workqueue;
+
+struct ib_sa_client ipoib_sa_client;
+
+static void ipoib_add_one(struct ib_device *device);
+static void ipoib_remove_one(struct ib_device *device);
+static void ipoib_start(struct ifnet *dev);
+static int ipoib_output(struct ifnet *ifp, struct mbuf *m,
+	    struct sockaddr *dst, struct route *ro);
+static int ipoib_ioctl(struct ifnet *ifp, u_long command, caddr_t data);
+static void ipoib_input(struct ifnet *ifp, struct mbuf *m);
+
+/*
+ * Hand an outgoing mbuf that starts with an ipoib_header to BPF, but only
+ * when bpf_peers_present() reports listeners on the interface.
+ */
+#define	IPOIB_MTAP(_ifp, _m)					\
+do {								\
+	if (bpf_peers_present((_ifp)->if_bpf)) {		\
+		M_ASSERTVALID(_m);				\
+		ipoib_mtap_mb((_ifp), (_m));			\
+	}							\
+} while (0)
+
+/*
+ * BPF tap for mbufs that begin with an ipoib_header.
+ *
+ * A fake Ethernet header is built from the IPoIB header (protocol and the
+ * first ETHER_ADDR_LEN bytes of the hardware address, zero source), the
+ * IPoIB header is temporarily hidden from the mbuf while it is tapped,
+ * and the mbuf is restored afterwards.
+ */
+static void
+ipoib_mtap_mb(struct ifnet *ifp, struct mbuf *mb)
+{
+	const size_t hlen = sizeof(struct ipoib_header);
+	struct ipoib_header *hdr = mtod(mb, struct ipoib_header *);
+	struct ether_header eh;
+
+	bzero(&eh.ether_shost, ETHER_ADDR_LEN);
+	bcopy(hdr->hwaddr, &eh.ether_dhost, ETHER_ADDR_LEN);
+	eh.ether_type = hdr->proto;
+
+	/* Skip over the IPoIB header for the duration of the tap. */
+	mb->m_data += hlen;
+	mb->m_len -= hlen;
+	bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb);
+	mb->m_len += hlen;
+	mb->m_data -= hlen;
+}
+
+/*
+ * BPF tap for mbufs that carry no IPoIB header: a fake Ethernet header
+ * with zeroed addresses and the given protocol is prepended for the tap.
+ */
+void
+ipoib_mtap_proto(struct ifnet *ifp, struct mbuf *mb, uint16_t proto)
+{
+	struct ether_header fake;
+
+	bzero(&fake.ether_dhost, ETHER_ADDR_LEN);
+	bzero(&fake.ether_shost, ETHER_ADDR_LEN);
+	fake.ether_type = proto;
+	bpf_mtap2(ifp->if_bpf, &fake, sizeof(fake), mb);
+}
+
+/*
+ * IB client descriptor registered by ipoib_init_module(); its callbacks
+ * are ipoib_add_one() and ipoib_remove_one().
+ */
+static struct ib_client ipoib_client = {
+	.name   = "ipoib",
+	.add    = ipoib_add_one,
+	.remove = ipoib_remove_one
+};
+
+/*
+ * Bring an interface up.
+ *
+ * Marks the interface administratively up, then either defers the open
+ * (when ipoib_pkey_dev_delay_open() reports that polling was started) or
+ * opens and brings up the IB device.  On a parent interface, child
+ * interfaces that are not yet running are opened too.
+ *
+ * Returns 0 on success or when the open was deferred, and -EINVAL if the
+ * IB device could not be opened or brought up.
+ */
+int
+ipoib_open(struct ipoib_dev_priv *priv)
+{
+	struct ifnet *dev = priv->dev;
+
+	ipoib_dbg(priv, "bringing up interface\n");
+
+	set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
+
+	/* Deferred: ipoib_pkey_poll() calls ipoib_open() again later. */
+	if (ipoib_pkey_dev_delay_open(priv))
+		return 0;
+
+	if (ipoib_ib_dev_open(priv))
+		goto err_disable;
+
+	if (ipoib_ib_dev_up(priv))
+		goto err_stop;
+
+	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
+		struct ipoib_dev_priv *cpriv;
+
+		/* Bring up any child interfaces too */
+		mutex_lock(&priv->vlan_mutex);
+		list_for_each_entry(cpriv, &priv->child_intfs, list)
+			if ((cpriv->dev->if_drv_flags & IFF_DRV_RUNNING) == 0)
+				ipoib_open(cpriv);
+		mutex_unlock(&priv->vlan_mutex);
+	}
+	dev->if_drv_flags |= IFF_DRV_RUNNING;
+	dev->if_drv_flags &= ~IFF_DRV_OACTIVE;
+
+	return 0;
+
+	/* Unwind in reverse order of the steps above. */
+err_stop:
+	ipoib_ib_dev_stop(priv, 1);
+
+err_disable:
+	clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
+
+	return -EINVAL;
+}
+
+/*
+ * if_init handler: open the interface if it is not already running and
+ * then queue a light flush on the IPoIB workqueue.
+ */
+static void
+ipoib_init(void *arg)
+{
+	struct ipoib_dev_priv *priv = arg;
+	struct ifnet *ifp = priv->dev;
+
+	if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))
+		ipoib_open(priv);
+
+	queue_work(ipoib_workqueue, &priv->flush_light);
+}
+
+
+/*
+ * Take an interface down.
+ *
+ * Clears the administrative-up flag and the RUNNING/OACTIVE driver flags,
+ * then brings the IB device down and stops it.  On a parent interface,
+ * child interfaces that are still running are stopped as well.
+ * Always returns 0.
+ */
+static int
+ipoib_stop(struct ipoib_dev_priv *priv)
+{
+	struct ifnet *dev = priv->dev;
+
+	ipoib_dbg(priv, "stopping interface\n");
+
+	clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
+
+	dev->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
+
+	ipoib_ib_dev_down(priv, 0);
+	ipoib_ib_dev_stop(priv, 0);
+
+	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
+		struct ipoib_dev_priv *cpriv;
+
+		/* Bring down any child interfaces too */
+		mutex_lock(&priv->vlan_mutex);
+		list_for_each_entry(cpriv, &priv->child_intfs, list)
+			if ((cpriv->dev->if_drv_flags & IFF_DRV_RUNNING) != 0)
+				ipoib_stop(cpriv);
+		mutex_unlock(&priv->vlan_mutex);
+	}
+
+	return 0;
+}
+
+/*
+ * Change the interface MTU.
+ *
+ * In datagram mode the requested value is bounded by the UD MTU of the
+ * port, recorded as the administrative MTU, and the effective MTU becomes
+ * the smaller of the multicast and administrative MTUs; a light flush is
+ * then queued.  In connected mode the value is bounded by the CM maximum
+ * and applied directly, with a warning when it exceeds the multicast MTU.
+ *
+ * Returns 0 on success or -EINVAL when new_mtu is too large.
+ */
+int
+ipoib_change_mtu(struct ipoib_dev_priv *priv, int new_mtu)
+{
+	struct ifnet *ifp = priv->dev;
+
+	if (!ipoib_cm_admin_enabled(priv)) {
+		/* Datagram mode. */
+		if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu))
+			return -EINVAL;
+
+		priv->admin_mtu = new_mtu;
+		ifp->if_mtu = min(priv->mcast_mtu, priv->admin_mtu);
+		queue_work(ipoib_workqueue, &priv->flush_light);
+
+		return 0;
+	}
+
+	/* dev->if_mtu > 2K ==> connected mode */
+	if (new_mtu > IPOIB_CM_MTU(ipoib_cm_max_mtu(priv)))
+		return -EINVAL;
+
+	if (new_mtu > priv->mcast_mtu)
+		ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n",
+			   priv->mcast_mtu);
+
+	ifp->if_mtu = new_mtu;
+
+	return 0;
+}
+
+/*
+ * Interface ioctl handler.
+ *
+ *  SIOCSIFFLAGS            open or stop the interface to match IFF_UP.
+ *  SIOCADDMULTI/DELMULTI   queue the multicast restart task when running.
+ *  SIOCSIFADDR             mark the interface up and run if_init (followed
+ *                          by arp_ifinit() for AF_INET).
+ *  SIOCGIFADDR             copy the link-level address into the request.
+ *  SIOCSIFMTU              change the MTU via ipoib_change_mtu().
+ *
+ * Returns 0 or a positive errno; the negative errno values returned by
+ * ipoib_open() and ipoib_change_mtu() are negated before being returned.
+ * Unknown commands yield EINVAL.
+ */
+static int
+ipoib_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
+{
+	struct ipoib_dev_priv *priv = ifp->if_softc;
+	struct ifaddr *ifa = (struct ifaddr *) data;
+	struct ifreq *ifr = (struct ifreq *) data;
+	int error = 0;
+
+	switch (command) {
+	case SIOCSIFFLAGS:
+		if (ifp->if_flags & IFF_UP) {
+			if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
+				error = -ipoib_open(priv);
+		} else
+			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
+				ipoib_stop(priv);
+		break;
+	case SIOCADDMULTI:
+	case SIOCDELMULTI:
+		if (ifp->if_drv_flags & IFF_DRV_RUNNING)
+			queue_work(ipoib_workqueue, &priv->restart_task);
+		break;
+	case SIOCSIFADDR:
+		ifp->if_flags |= IFF_UP;
+
+		switch (ifa->ifa_addr->sa_family) {
+#ifdef INET
+		case AF_INET:
+			ifp->if_init(ifp->if_softc);	/* before arpwhohas */
+			arp_ifinit(ifp, ifa);
+			break;
+#endif
+		default:
+			ifp->if_init(ifp->if_softc);
+			break;
+		}
+		break;
+
+	case SIOCGIFADDR:
+		{
+			struct sockaddr *sa;
+			size_t len;
+
+			sa = (struct sockaddr *) & ifr->ifr_data;
+			/*
+			 * An IPoIB link-level address is INFINIBAND_ALEN
+			 * bytes long, which is more than sa_data can hold.
+			 * Never copy more than fits, so that the write
+			 * stays inside the caller's request structure.
+			 */
+			len = INFINIBAND_ALEN;
+			if (len > sizeof(sa->sa_data))
+				len = sizeof(sa->sa_data);
+			bcopy(IF_LLADDR(ifp),
+			      (caddr_t) sa->sa_data, len);
+		}
+		break;
+
+	case SIOCSIFMTU:
+		/*
+		 * Set the interface MTU.
+		 */
+		error = -ipoib_change_mtu(priv, ifr->ifr_mtu);
+		break;
+	default:
+		error = EINVAL;
+		break;
+	}
+	return (error);
+}
+
+
+/*
+ * Look up a path by destination GID in the device's path red-black tree.
+ * 'gid' points at sizeof(union ib_gid) bytes.  Returns the matching path
+ * or NULL when there is none.
+ */
+static struct ipoib_path *
+__path_find(struct ipoib_dev_priv *priv, void *gid)
+{
+	struct rb_node *node = priv->path_tree.rb_node;
+
+	while (node != NULL) {
+		struct ipoib_path *cand;
+		int cmp;
+
+		cand = rb_entry(node, struct ipoib_path, rb_node);
+		cmp = memcmp(gid, cand->pathrec.dgid.raw,
+			     sizeof (union ib_gid));
+		if (cmp == 0)
+			return cand;
+
+		/* Smaller keys live to the left, larger ones to the right. */
+		node = (cmp < 0) ? node->rb_left : node->rb_right;
+	}
+
+	return NULL;
+}
+
+/*
+ * Insert a path into the device's path tree (keyed by destination GID)
+ * and append it to the path list.  Returns 0 on success, or -EEXIST
+ * without modifying anything when a path with the same GID is present.
+ */
+static int
+__path_add(struct ipoib_dev_priv *priv, struct ipoib_path *path)
+{
+	struct rb_node **link = &priv->path_tree.rb_node;
+	struct rb_node *parent = NULL;
+
+	while (*link != NULL) {
+		struct ipoib_path *cur;
+		int cmp;
+
+		parent = *link;
+		cur = rb_entry(parent, struct ipoib_path, rb_node);
+		cmp = memcmp(path->pathrec.dgid.raw, cur->pathrec.dgid.raw,
+			     sizeof (union ib_gid));
+		if (cmp == 0)
+			return -EEXIST;
+
+		link = (cmp < 0) ? &parent->rb_left : &parent->rb_right;
+	}
+
+	rb_link_node(&path->rb_node, parent, link);
+	rb_insert_color(&path->rb_node, &priv->path_tree);
+
+	list_add_tail(&path->list, &priv->path_list);
+
+	return 0;
+}
+
+/*
+ * Destroy a path: drain its queue of pending mbufs, drop its address
+ * handle reference and connected-mode TX context if present, and free
+ * the path itself.  The path is not unlinked from the tree or list here.
+ */
+void
+ipoib_path_free(struct ipoib_dev_priv *priv, struct ipoib_path *path)
+{
+
+	_IF_DRAIN(&path->queue);
+
+	if (path->ah)
+		ipoib_put_ah(path->ah);
+	if (ipoib_cm_get(path))
+		ipoib_cm_destroy_tx(ipoib_cm_get(path));
+
+	kfree(path);
+}
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+
+/*
+ * Allocate a path iterator positioned on the first path of 'priv'.
+ * Returns NULL when allocation fails or when ipoib_path_iter_next()
+ * finds no path to return.
+ */
+struct ipoib_path_iter *
+ipoib_path_iter_init(struct ipoib_dev_priv *priv)
+{
+	struct ipoib_path_iter *it = kmalloc(sizeof *it, GFP_KERNEL);
+
+	if (it == NULL)
+		return NULL;
+
+	/* Start from the all-zero GID so the first advance finds a path. */
+	memset(it->path.pathrec.dgid.raw, 0, 16);
+	it->priv = priv;
+
+	if (ipoib_path_iter_next(it) == 0)
+		return it;
+
+	kfree(it);
+	return NULL;
+}
+
+/*
+ * Advance the iterator to the first path in the tree whose destination
+ * GID compares greater than the one currently held, copying that path
+ * into the iterator.  Returns 0 when a path was found, 1 otherwise.
+ * The walk runs under priv->lock.
+ */
+int
+ipoib_path_iter_next(struct ipoib_path_iter *iter)
+{
+	struct ipoib_dev_priv *priv = iter->priv;
+	struct rb_node *node;
+	int ret = 1;
+
+	spin_lock_irq(&priv->lock);
+
+	for (node = rb_first(&priv->path_tree); node != NULL;
+	    node = rb_next(node)) {
+		struct ipoib_path *cur;
+
+		cur = rb_entry(node, struct ipoib_path, rb_node);
+		if (memcmp(iter->path.pathrec.dgid.raw, cur->pathrec.dgid.raw,
+			   sizeof (union ib_gid)) >= 0)
+			continue;
+
+		iter->path = *cur;
+		ret = 0;
+		break;
+	}
+
+	spin_unlock_irq(&priv->lock);
+
+	return ret;
+}
+
+/* Copy the path currently held by the iterator into '*path'. */
+void
+ipoib_path_iter_read(struct ipoib_path_iter *iter, struct ipoib_path *path)
+{
+	*path = iter->path;
+}
+
+#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
+
+/*
+ * Clear the 'valid' flag of every path on the device's path list, under
+ * priv->lock.  The paths themselves stay in the list and tree.
+ */
+void
+ipoib_mark_paths_invalid(struct ipoib_dev_priv *priv)
+{
+	struct ipoib_path *cur;
+
+	spin_lock_irq(&priv->lock);
+
+	/* Nothing is unlinked here, so the plain iterator is sufficient. */
+	list_for_each_entry(cur, &priv->path_list, list) {
+		ipoib_dbg(priv, "mark path LID 0x%04x GID %16D invalid\n",
+			be16_to_cpu(cur->pathrec.dlid),
+			cur->pathrec.dgid.raw, ":");
+		cur->valid = 0;
+	}
+
+	spin_unlock_irq(&priv->lock);
+}
+
+/*
+ * Remove and free every path of the device.
+ *
+ * All paths are moved to a private list and erased from the path tree
+ * under priv->lock.  For each path any outstanding SA query is cancelled;
+ * the lock is then dropped while waiting for the path's 'done' completion
+ * and freeing the path, and re-taken before the next one.
+ */
+void
+ipoib_flush_paths(struct ipoib_dev_priv *priv)
+{
+	struct ipoib_path *path, *tp;
+	LIST_HEAD(remove_list);
+	unsigned long flags;
+
+	spin_lock_irqsave(&priv->lock, flags);
+
+	/* Detach the whole list so nothing new can find these paths. */
+	list_splice_init(&priv->path_list, &remove_list);
+
+	list_for_each_entry(path, &remove_list, list)
+		rb_erase(&path->rb_node, &priv->path_tree);
+
+	list_for_each_entry_safe(path, tp, &remove_list, list) {
+		if (path->query)
+			ib_sa_cancel_query(path->query_id, path->query);
+		/* The wait below is done without holding the lock. */
+		spin_unlock_irqrestore(&priv->lock, flags);
+		wait_for_completion(&path->done);
+		ipoib_path_free(priv, path);
+		spin_lock_irqsave(&priv->lock, flags);
+	}
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+}
+
+/*
+ * Completion callback passed to ib_sa_path_rec_get() by path_rec_start().
+ *
+ * On success (status == 0) an address handle is created from the returned
+ * path record.  Under priv->lock the path is then updated with the new
+ * record and handle, its queued mbufs are moved to a local queue, the
+ * path is marked valid, the query pointer is cleared and 'done' is
+ * completed.  After the lock is dropped the previous address handle
+ * reference is released and the collected mbufs are handed back to
+ * if_transmit.
+ *
+ * On failure only the query pointer is cleared and 'done' completed;
+ * queued mbufs stay on the path.
+ */
+static void
+path_rec_completion(int status, struct ib_sa_path_rec *pathrec, void *path_ptr)
+{
+	struct ipoib_path *path = path_ptr;
+	struct ipoib_dev_priv *priv = path->priv;
+	struct ifnet *dev = priv->dev;
+	struct ipoib_ah *ah = NULL;
+	struct ipoib_ah *old_ah = NULL;
+	struct ifqueue mbqueue;
+	struct mbuf *mb;
+	unsigned long flags;
+
+	if (!status)
+		ipoib_dbg(priv, "PathRec LID 0x%04x for GID %16D\n",
+			  be16_to_cpu(pathrec->dlid), pathrec->dgid.raw, ":");
+	else
+		ipoib_dbg(priv, "PathRec status %d for GID %16D\n",
+			  status, path->pathrec.dgid.raw, ":");
+
+	/* Local staging queue for packets to retransmit after unlocking. */
+	bzero(&mbqueue, sizeof(mbqueue));
+
+	if (!status) {
+		struct ib_ah_attr av;
+
+		if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av))
+			ah = ipoib_create_ah(priv, priv->pd, &av);
+	}
+
+	spin_lock_irqsave(&priv->lock, flags);
+
+	if (ah) {
+		path->pathrec = *pathrec;
+
+		/* Keep the old handle so it can be released outside the lock. */
+		old_ah   = path->ah;
+		path->ah = ah;
+
+		ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n",
+			  ah, be16_to_cpu(pathrec->dlid), pathrec->sl);
+
+		for (;;) {
+			_IF_DEQUEUE(&path->queue, mb);
+			if (mb == NULL)
+				break;
+			_IF_ENQUEUE(&mbqueue, mb);
+		}
+
+#ifdef CONFIG_INFINIBAND_IPOIB_CM
+		if (ipoib_cm_enabled(priv, path->hwaddr) && !ipoib_cm_get(path))
+			ipoib_cm_set(path, ipoib_cm_create_tx(priv, path));
+#endif
+
+		path->valid = 1;
+	}
+
+	/* The query is finished either way; wake anyone waiting on it. */
+	path->query = NULL;
+	complete(&path->done);
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+
+	if (old_ah)
+		ipoib_put_ah(old_ah);
+
+	/* Resubmit the packets that were waiting for this path. */
+	for (;;) {
+		_IF_DEQUEUE(&mbqueue, mb);
+		if (mb == NULL)
+			break;
+		mb->m_pkthdr.rcvif = dev;
+		if (dev->if_transmit(dev, mb))
+			ipoib_warn(priv, "dev_queue_xmit failed "
+				   "to requeue packet\n");
+	}
+}
+
+/*
+ * Allocate and pre-fill a path for the given hardware address.
+ *
+ * The destination GID is taken from hwaddr[4..], the source GID and
+ * P_Key from the device, and the traffic class from the broadcast group.
+ * Returns NULL when the device has no broadcast group yet or when the
+ * (atomic) allocation fails.
+ */
+static struct ipoib_path *
+path_rec_create(struct ipoib_dev_priv *priv, uint8_t *hwaddr)
+{
+	struct ipoib_path *np;
+
+	if (priv->broadcast == NULL)
+		return NULL;
+
+	np = kzalloc(sizeof *np, GFP_ATOMIC);
+	if (np == NULL)
+		return NULL;
+
+	bzero(&np->queue, sizeof(np->queue));
+	np->priv = priv;
+
+#ifdef CONFIG_INFINIBAND_IPOIB_CM
+	memcpy(&np->hwaddr, hwaddr, INFINIBAND_ALEN);
+#endif
+	/* Fields of the path record used to query the SA. */
+	np->pathrec.numb_path     = 1;
+	np->pathrec.pkey	  = cpu_to_be16(priv->pkey);
+	np->pathrec.sgid	  = priv->local_gid;
+	np->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class;
+	memcpy(np->pathrec.dgid.raw, &hwaddr[4], sizeof (union ib_gid));
+
+	return np;
+}
+
+/*
+ * Start an asynchronous SA path record query for 'path'.
+ *
+ * The MTU constraint of the query is derived from the interface MTU plus
+ * the encapsulation length, rounded up to a power of two; sizes outside
+ * the handled cases leave the MTU unconstrained.  path_rec_completion()
+ * is invoked when the query finishes.
+ *
+ * Returns 0 when the query was issued.  On failure the negative value
+ * from ib_sa_path_rec_get() is returned, path->query is cleared and
+ * path->done is completed.
+ */
+static int
+path_rec_start(struct ipoib_dev_priv *priv, struct ipoib_path *path)
+{
+	struct ifnet *dev = priv->dev;
+
+	ib_sa_comp_mask comp_mask = IB_SA_PATH_REC_MTU_SELECTOR | IB_SA_PATH_REC_MTU;
+	struct ib_sa_path_rec p_rec;
+
+	/* Query with a copy so the stored record is not modified. */
+	p_rec = path->pathrec;
+	p_rec.mtu_selector = IB_SA_GT;
+
+	switch (roundup_pow_of_two(dev->if_mtu + IPOIB_ENCAP_LEN)) {
+	case 512:
+		p_rec.mtu = IB_MTU_256;
+		break;
+	case 1024:
+		p_rec.mtu = IB_MTU_512;
+		break;
+	case 2048:
+		p_rec.mtu = IB_MTU_1024;
+		break;
+	case 4096:
+		p_rec.mtu = IB_MTU_2048;
+		break;
+	default:
+		/* Wildcard everything */
+		comp_mask = 0;
+		p_rec.mtu = 0;
+		p_rec.mtu_selector = 0;
+	}
+
+	ipoib_dbg(priv, "Start path record lookup for %16D MTU > %d\n",
+		  p_rec.dgid.raw, ":",
+		  comp_mask ? ib_mtu_enum_to_int(p_rec.mtu) : 0);
+
+	/* Must be (re)initialized before the completion callback can run. */
+	init_completion(&path->done);
+
+	path->query_id =
+		ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port,
+				   &p_rec, comp_mask		|
+				   IB_SA_PATH_REC_DGID		|
+				   IB_SA_PATH_REC_SGID		|
+				   IB_SA_PATH_REC_NUMB_PATH	|
+				   IB_SA_PATH_REC_TRAFFIC_CLASS |
+				   IB_SA_PATH_REC_PKEY,
+				   1000, GFP_ATOMIC,
+				   path_rec_completion,
+				   path, &path->query);
+	if (path->query_id < 0) {
+		ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id);
+		path->query = NULL;
+		/* No callback will run, so release waiters here. */
+		complete(&path->done);
+		return path->query_id;
+	}
+
+	return 0;
+}
+
+/*
+ * Transmit one unicast packet.
+ *
+ * Looks up the path for the destination GID (eh->hwaddr + 4).  If there
+ * is no valid path, one is created when needed, the packet is parked on
+ * the path's queue and a path record query is started; the packet is
+ * sent later from path_rec_completion().  With a valid path the packet
+ * goes out over the connected-mode TX context if it is up, otherwise via
+ * the path's address handle, otherwise it is queued behind a query.
+ *
+ * The caller (_ipoib_start()) holds priv->lock across this call and
+ * releases it itself, so the lock must never be dropped here.  The mbuf
+ * is always consumed: sent, queued or freed.  At most
+ * IPOIB_MAX_PATH_REC_QUEUE packets are parked per path; further ones are
+ * dropped and counted as output errors.
+ */
+static void
+ipoib_unicast_send(struct mbuf *mb, struct ipoib_dev_priv *priv, struct ipoib_header *eh)
+{
+	struct ipoib_path *path;
+
+	path = __path_find(priv, eh->hwaddr + 4);
+	if (!path || !path->valid) {
+		int new_path = 0;
+
+		if (!path) {
+			path = path_rec_create(priv, eh->hwaddr);
+			new_path = 1;
+		}
+		if (!path) {
+			++priv->dev->if_oerrors;
+			m_freem(mb);
+			return;
+		}
+
+		/* Park the packet, bounding the backlog per path. */
+		if (path->queue.ifq_len < IPOIB_MAX_PATH_REC_QUEUE)
+			_IF_ENQUEUE(&path->queue, mb);
+		else {
+			++priv->dev->if_oerrors;
+			m_freem(mb);
+		}
+
+		if (!path->query && path_rec_start(priv, path)) {
+			/*
+			 * The query could not be started.  A path created
+			 * above was never linked in, so free it (this also
+			 * drains the packet just queued).  priv->lock stays
+			 * held for the caller.
+			 */
+			if (new_path)
+				ipoib_path_free(priv, path);
+			return;
+		}
+
+		/* A path found by __path_find() is already in the tree. */
+		if (new_path)
+			__path_add(priv, path);
+
+		return;
+	}
+
+	if (ipoib_cm_get(path) && ipoib_cm_up(path)) {
+		ipoib_cm_send(priv, mb, ipoib_cm_get(path));
+	} else if (path->ah) {
+		ipoib_send(priv, mb, path->ah, IPOIB_QPN(eh->hwaddr));
+	} else if ((path->query || !path_rec_start(priv, path)) &&
+		    path->queue.ifq_len < IPOIB_MAX_PATH_REC_QUEUE) {
+		_IF_ENQUEUE(&path->queue, mb);
+	} else {
+		++priv->dev->if_oerrors;
+		m_freem(mb);
+	}
+}
+
+/*
+ * Dispatch one packet that starts with an ipoib_header to the unicast or
+ * multicast send path.  For multicast the interface P_Key is written
+ * into bytes 8 and 9 of the destination address first.  Always returns 0.
+ */
+static int
+ipoib_send_one(struct ipoib_dev_priv *priv, struct mbuf *mb)
+{
+	struct ipoib_header *hdr = mtod(mb, struct ipoib_header *);
+
+	if (!IPOIB_IS_MULTICAST(hdr->hwaddr)) {
+		ipoib_unicast_send(mb, priv, hdr);
+		return 0;
+	}
+
+	/* Add in the P_Key for multicast */
+	hdr->hwaddr[8] = (priv->pkey >> 8) & 0xff;
+	hdr->hwaddr[9] = priv->pkey & 0xff;
+	ipoib_mcast_send(priv, hdr->hwaddr + 4, mb);
+
+	return 0;
+}
+
+
+/*
+ * Drain the interface send queue of 'dev' using the state in 'priv'.
+ *
+ * Nothing is done unless the interface is RUNNING and not OACTIVE.
+ * Packets are dequeued, tapped to BPF and passed to ipoib_send_one()
+ * with priv->lock held, until the queue is empty or OACTIVE is set.
+ */
+static void
+_ipoib_start(struct ifnet *dev, struct ipoib_dev_priv *priv)
+{
+	struct mbuf *mb;
+
+	if ((dev->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
+	    IFF_DRV_RUNNING)
+		return;
+
+	spin_lock(&priv->lock);
+	while (!IFQ_DRV_IS_EMPTY(&dev->if_snd) &&
+	    (dev->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
+		IFQ_DRV_DEQUEUE(&dev->if_snd, mb);
+		if (mb == NULL)
+			break;
+		IPOIB_MTAP(dev, mb);
+		ipoib_send_one(priv, mb);
+	}
+	spin_unlock(&priv->lock);
+}
+
+/* if_start handler for a parent interface; its softc is the priv. */
+static void
+ipoib_start(struct ifnet *dev)
+{
+	_ipoib_start(dev, dev->if_softc);
+}
+
+/*
+ * if_start handler installed on child (vlan) interfaces.
+ *
+ * The IPoIB private data is stored as the vlan cookie.  When it is
+ * present the normal start path is used; when it has been cleared every
+ * queued packet is dropped and counted as an output error.
+ */
+static void
+ipoib_vlan_start(struct ifnet *dev)
+{
+	struct ipoib_dev_priv *priv;
+	struct mbuf *mb;
+
+	priv = VLAN_COOKIE(dev);
+	if (priv != NULL) {
+		/* A void function must not "return <expression>;". */
+		_ipoib_start(dev, priv);
+		return;
+	}
+	while (!IFQ_DRV_IS_EMPTY(&dev->if_snd)) {
+		IFQ_DRV_DEQUEUE(&dev->if_snd, mb);
+		if (mb == NULL)
+			break;
+		m_freem(mb);
+		dev->if_oerrors++;
+	}
+}
+
+/*
+ * Allocate the RX and TX rings of an interface and initialize its IB
+ * resources on the given device and port.
+ *
+ * Returns 0 on success and -ENOMEM on any failure.  On failure both ring
+ * pointers are left NULL, so a later ipoib_dev_cleanup() cannot free them
+ * a second time.
+ */
+int
+ipoib_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca, int port)
+{
+
+	/* Allocate RX/TX "rings" to hold queued mbs */
+	priv->rx_ring =	kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring,
+				GFP_KERNEL);
+	if (!priv->rx_ring) {
+		printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n",
+		       ca->name, ipoib_recvq_size);
+		goto out;
+	}
+
+	/* kzalloc() returns zeroed memory; no separate memset is needed. */
+	priv->tx_ring = kzalloc(ipoib_sendq_size * sizeof *priv->tx_ring, GFP_KERNEL);
+	if (!priv->tx_ring) {
+		printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n",
+		       ca->name, ipoib_sendq_size);
+		goto out_rx_ring_cleanup;
+	}
+
+	/* priv->tx_head, tx_tail & tx_outstanding are already 0 */
+
+	if (ipoib_ib_dev_init(priv, ca, port))
+		goto out_tx_ring_cleanup;
+
+	return 0;
+
+out_tx_ring_cleanup:
+	kfree(priv->tx_ring);
+	priv->tx_ring = NULL;
+
+out_rx_ring_cleanup:
+	kfree(priv->rx_ring);
+	priv->rx_ring = NULL;
+
+out:
+	return -ENOMEM;
+}
+
+/*
+ * Final teardown of an interface's private data.
+ *
+ * For a parent interface the ifnet is detached from BPF and the network
+ * stack and freed.  For a subinterface only the vlan cookie is cleared.
+ * In both cases 'priv' is freed and must not be used afterwards.
+ */
+static void
+ipoib_detach(struct ipoib_dev_priv *priv)
+{
+	struct ifnet *dev;
+
+	dev = priv->dev;
+	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
+		bpfdetach(dev);
+		if_detach(dev);
+		if_free(dev);
+	} else
+		VLAN_SETCOOKIE(priv->dev, NULL);
+
+	free(priv, M_TEMP);
+}
+
+/*
+ * Release the resources set up by ipoib_dev_init().
+ *
+ * Child interfaces are cleaned up and detached (freed) first, then the
+ * IB resources of this interface are released and both rings are freed
+ * and their pointers cleared.  'priv' itself is not freed here.
+ *
+ * NOTE(review): children are freed without being unlinked from
+ * child_intfs and without taking vlan_mutex - presumably callers never
+ * walk the list again afterwards; confirm.
+ */
+void
+ipoib_dev_cleanup(struct ipoib_dev_priv *priv)
+{
+	struct ipoib_dev_priv *cpriv, *tcpriv;
+
+	/* Delete any child interfaces first */
+	list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) {
+		ipoib_dev_cleanup(cpriv);
+		ipoib_detach(cpriv);
+	}
+
+	ipoib_ib_dev_cleanup(priv);
+
+	kfree(priv->rx_ring);
+	kfree(priv->tx_ring);
+
+	priv->rx_ring = NULL;
+	priv->tx_ring = NULL;
+}
+
+/* Next interface unit number; advanced atomically in ipoib_intf_alloc(). */
+static volatile int ipoib_unit;
+
+/*
+ * Allocate and initialize the software state shared by parent and child
+ * interfaces: lock, vlan mutex, list heads, work items and the default
+ * broadcast address.  The memory is zeroed, and the allocation uses
+ * M_WAITOK; the result is used without a NULL check.
+ */
+static struct ipoib_dev_priv *
+ipoib_priv_alloc(void)
+{
+	struct ipoib_dev_priv *priv;
+
+	priv = malloc(sizeof(struct ipoib_dev_priv), M_TEMP, M_ZERO|M_WAITOK);
+	spin_lock_init(&priv->lock);
+	mutex_init(&priv->vlan_mutex);
+	INIT_LIST_HEAD(&priv->path_list);
+	INIT_LIST_HEAD(&priv->child_intfs);
+	INIT_LIST_HEAD(&priv->dead_ahs);
+	INIT_LIST_HEAD(&priv->multicast_list);
+	INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll);
+	INIT_DELAYED_WORK(&priv->mcast_task,   ipoib_mcast_join_task);
+	INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task);
+	INIT_WORK(&priv->flush_light,   ipoib_ib_dev_flush_light);
+	INIT_WORK(&priv->flush_normal,   ipoib_ib_dev_flush_normal);
+	INIT_WORK(&priv->flush_heavy,   ipoib_ib_dev_flush_heavy);
+	INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
+	INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
+	/* P_Key bytes of this address are filled in by the caller later. */
+	memcpy(priv->broadcastaddr, ipv4_bcast_addr, INFINIBAND_ALEN);
+
+	return (priv);
+}
+
+/*
+ * Allocate a parent IPoIB interface.
+ *
+ * Creates the private data and an IFT_INFINIBAND ifnet named
+ * '<name><unit>', attaches it to the network stack, installs the driver
+ * entry points, reports the link as down and attaches BPF.
+ *
+ * Returns the private data, or NULL when the ifnet cannot be allocated.
+ */
+struct ipoib_dev_priv *
+ipoib_intf_alloc(const char *name)
+{
+	struct ipoib_dev_priv *priv;
+	struct sockaddr_dl *sdl;
+	struct ifnet *dev;
+
+	priv = ipoib_priv_alloc();
+	dev = priv->dev = if_alloc(IFT_INFINIBAND);
+	if (!dev) {
+		free(priv, M_TEMP);
+		return NULL;
+	}
+	dev->if_softc = priv;
+	/* Each interface takes the next unit number. */
+	if_initname(dev, name, atomic_fetchadd_int(&ipoib_unit, 1));
+	dev->if_flags = IFF_BROADCAST | IFF_MULTICAST;
+	dev->if_addrlen = INFINIBAND_ALEN;
+	dev->if_hdrlen = IPOIB_HEADER_LEN;
+	if_attach(dev);
+	dev->if_init = ipoib_init;
+	dev->if_ioctl = ipoib_ioctl;
+	dev->if_start = ipoib_start;
+	dev->if_output = ipoib_output;
+	dev->if_input = ipoib_input;
+	dev->if_resolvemulti = ipoib_resolvemulti;
+	dev->if_baudrate = IF_Gbps(10LL);
+	dev->if_broadcastaddr = priv->broadcastaddr;
+	dev->if_snd.ifq_maxlen = ipoib_sendq_size * 2;
+	/* Describe the link-level address as an InfiniBand address. */
+	sdl = (struct sockaddr_dl *)dev->if_addr->ifa_addr;
+	sdl->sdl_type = IFT_INFINIBAND;
+	sdl->sdl_alen = dev->if_addrlen;
+	priv->dev = dev;
+	if_link_state_change(dev, LINK_STATE_DOWN);
+	bpfattach(dev, DLT_EN10MB, ETHER_HDR_LEN);
+
+	return dev->if_softc;
+}
+
+/*
+ * Query the HCA capabilities and derive the interface capability flags.
+ *
+ * The device capability flags are cached in priv->hca_caps.  Without
+ * connected-mode support compiled in, UD checksum offload is advertised
+ * when the HCA provides it.  VLAN tagging, VLAN MTU and link-state
+ * capabilities are always set, and everything advertised is enabled.
+ *
+ * Returns 0 on success, -ENOMEM when the attribute buffer cannot be
+ * allocated, or the error returned by ib_query_device().
+ */
+int
+ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca)
+{
+	struct ib_device_attr *attr;
+	int rc;
+
+	attr = kmalloc(sizeof *attr, GFP_KERNEL);
+	if (attr == NULL) {
+		printk(KERN_WARNING "%s: allocation of %zu bytes failed\n",
+		       hca->name, sizeof *attr);
+		return -ENOMEM;
+	}
+
+	rc = ib_query_device(hca, attr);
+	if (rc == 0)
+		priv->hca_caps = attr->device_cap_flags;
+	kfree(attr);
+	if (rc != 0) {
+		printk(KERN_WARNING "%s: ib_query_device failed (ret = %d)\n",
+		       hca->name, rc);
+		return rc;
+	}
+
+	priv->dev->if_capabilities = 0;
+	priv->dev->if_hwassist = 0;
+
+#ifndef CONFIG_INFINIBAND_IPOIB_CM
+	if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) {
+		set_bit(IPOIB_FLAG_CSUM, &priv->flags);
+		priv->dev->if_hwassist = CSUM_IP | CSUM_TCP | CSUM_UDP;
+		priv->dev->if_capabilities = IFCAP_HWCSUM | IFCAP_VLAN_HWCSUM;
+	}
+
+#if 0
+	if (priv->dev->features & NETIF_F_SG && priv->hca_caps & IB_DEVICE_UD_TSO)
+		priv->dev->if_capabilities |= IFCAP_TSO4 | CSUM_TSO;
+#endif
+#endif
+	priv->dev->if_capabilities |=
+	    IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_LINKSTATE;
+	priv->dev->if_capenable = priv->dev->if_capabilities;
+
+	return 0;
+}
+
+
+/*
+ * Create and initialize the IPoIB interface for one HCA port.
+ *
+ * Allocates the interface, derives its MTU from the port attributes,
+ * reads P_Key index 0 and GID index 0, sets the interface features,
+ * initializes the rings and IB resources, and registers the IB event
+ * handler.
+ *
+ * Returns the new ifnet on success.  On failure everything set up so far
+ * is undone and an ERR_PTR() carrying a negative errno is returned; every
+ * failing step stores its error in 'result' first, so the return value
+ * always satisfies IS_ERR() and is never NULL.
+ */
+static struct ifnet *
+ipoib_add_port(const char *format, struct ib_device *hca, u8 port)
+{
+	struct ipoib_dev_priv *priv;
+	struct ib_port_attr attr;
+	int result = -ENOMEM;
+
+	priv = ipoib_intf_alloc(format);
+	if (!priv)
+		goto alloc_mem_failed;
+
+	result = ib_query_port(hca, port, &attr);
+	if (result) {
+		printk(KERN_WARNING "%s: ib_query_port %d failed\n",
+		       hca->name, port);
+		goto device_init_failed;
+	}
+	priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);
+
+	/* MTU will be reset when mcast join happens */
+	priv->dev->if_mtu = IPOIB_UD_MTU(priv->max_ib_mtu);
+	priv->mcast_mtu = priv->admin_mtu = priv->dev->if_mtu;
+
+	result = ib_query_pkey(hca, port, 0, &priv->pkey);
+	if (result) {
+		printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
+		       hca->name, port, result);
+		goto device_init_failed;
+	}
+
+	/*
+	 * Keep the error code: leaving 'result' at 0 here would make the
+	 * function return ERR_PTR(0), i.e. NULL, which callers treat as a
+	 * valid interface.
+	 */
+	result = ipoib_set_dev_features(priv, hca);
+	if (result)
+		goto device_init_failed;
+
+	/*
+	 * Set the full membership bit, so that we join the right
+	 * broadcast group, etc.
+	 */
+	priv->pkey |= 0x8000;
+
+	priv->broadcastaddr[8] = priv->pkey >> 8;
+	priv->broadcastaddr[9] = priv->pkey & 0xff;
+
+	result = ib_query_gid(hca, port, 0, &priv->local_gid);
+	if (result) {
+		printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n",
+		       hca->name, port, result);
+		goto device_init_failed;
+	}
+	/* The port GID forms bytes 4.. of the link-level address. */
+	memcpy(IF_LLADDR(priv->dev) + 4, priv->local_gid.raw, sizeof (union ib_gid));
+
+	result = ipoib_dev_init(priv, hca, port);
+	if (result < 0) {
+		printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n",
+		       hca->name, port, result);
+		goto device_init_failed;
+	}
+	if (ipoib_cm_admin_enabled(priv))
+		priv->dev->if_mtu = IPOIB_CM_MTU(ipoib_cm_max_mtu(priv));
+
+	INIT_IB_EVENT_HANDLER(&priv->event_handler,
+			      priv->ca, ipoib_event);
+	result = ib_register_event_handler(&priv->event_handler);
+	if (result < 0) {
+		printk(KERN_WARNING "%s: ib_register_event_handler failed for "
+		       "port %d (ret = %d)\n",
+		       hca->name, port, result);
+		goto event_failed;
+	}
+	if_printf(priv->dev, "Attached to %s port %d\n", hca->name, port);
+
+	return priv->dev;
+
+event_failed:
+	ipoib_dev_cleanup(priv);
+
+device_init_failed:
+	ipoib_detach(priv);
+
+alloc_mem_failed:
+	return ERR_PTR(result);
+}
+
+/*
+ * IB client 'add' callback.
+ *
+ * For an IB-transport device, creates an IPoIB interface on every port
+ * whose link layer is InfiniBand and records the resulting interfaces in
+ * a list stored as this client's data on the device.  Ports whose
+ * interface could not be created are skipped.
+ */
+static void
+ipoib_add_one(struct ib_device *device)
+{
+	struct list_head *ifaces;
+	struct ipoib_dev_priv *priv;
+	struct ifnet *ifp;
+	int first, last, port;
+
+	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+		return;
+
+	ifaces = kmalloc(sizeof *ifaces, GFP_KERNEL);
+	if (ifaces == NULL)
+		return;
+
+	INIT_LIST_HEAD(ifaces);
+
+	/* A switch exposes only port 0; other nodes use 1..phys_port_cnt. */
+	if (device->node_type == RDMA_NODE_IB_SWITCH) {
+		first = 0;
+		last = 0;
+	} else {
+		first = 1;
+		last = device->phys_port_cnt;
+	}
+
+	for (port = first; port <= last; ++port) {
+		if (rdma_port_get_link_layer(device, port) != IB_LINK_LAYER_INFINIBAND)
+			continue;
+		ifp = ipoib_add_port("ib", device, port);
+		if (IS_ERR(ifp))
+			continue;
+		priv = ifp->if_softc;
+		list_add_tail(&priv->list, ifaces);
+	}
+
+	ib_set_client_data(device, &ipoib_client, ifaces);
+}
+
+/*
+ * IB client 'remove' callback.
+ *
+ * Tears down every interface that ipoib_add_one() created for the
+ * device: the event handler is unregistered, the IPoIB workqueue is
+ * flushed, and the interface is cleaned up and detached (freed).  The
+ * list head allocated by ipoib_add_one() is freed last.
+ */
+static void
+ipoib_remove_one(struct ib_device *device)
+{
+	struct ipoib_dev_priv *priv, *tmp;
+	struct list_head *dev_list;
+
+	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+		return;
+
+	dev_list = ib_get_client_data(device, &ipoib_client);
+	/*
+	 * ipoib_add_one() stores no client data when it fails to allocate
+	 * the list head, so there may be nothing to remove.
+	 */
+	if (dev_list == NULL)
+		return;
+
+	list_for_each_entry_safe(priv, tmp, dev_list, list) {
+		if (rdma_port_get_link_layer(device, priv->port) != IB_LINK_LAYER_INFINIBAND)
+			continue;
+
+		ib_unregister_event_handler(&priv->event_handler);
+
+		/* dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP); */
+
+		/* Let queued work finish before its state is destroyed. */
+		flush_workqueue(ipoib_workqueue);
+
+		ipoib_dev_cleanup(priv);
+		ipoib_detach(priv);
+	}
+
+	kfree(dev_list);
+}
+
+/*
+ * vlan_config event handler: create an IPoIB child interface.
+ *
+ * The vlan tag is used as a 15-bit P_Key (the full-membership bit is
+ * added here).  Nothing is done for non-InfiniBand parents, tags with
+ * bit 15 set, the parent's own P_Key, or when no vlan device exists for
+ * the tag.  Otherwise child state is allocated, initialized from the
+ * parent, linked onto the parent's child list and stored as the vlan
+ * cookie; the child is opened immediately if the parent is running.
+ *
+ * Failures (duplicate P_Key, feature query or device init errors) are
+ * reported with a warning and the partially built child is freed.
+ */
+static void
+ipoib_config_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag)
+{
+	struct ipoib_dev_priv *parent;
+	struct ipoib_dev_priv *priv;
+	struct ifnet *dev;
+	uint16_t pkey;
+	int error;
+
+	if (ifp->if_type != IFT_INFINIBAND)
+		return;
+	dev = VLAN_DEVAT(ifp, vtag);
+	if (dev == NULL)
+		return;
+	priv = NULL;
+	error = 0;
+	parent = ifp->if_softc;
+	/* We only support 15 bits of pkey. */
+	if (vtag & 0x8000)
+		return;
+	pkey = vtag | 0x8000;	/* Set full membership bit. */
+	if (pkey == parent->pkey)
+		return;
+	/* Check for dups */
+	mutex_lock(&parent->vlan_mutex);
+	list_for_each_entry(priv, &parent->child_intfs, list) {
+		if (priv->pkey == pkey) {
+			/* Not ours: make sure the error path frees nothing. */
+			priv = NULL;
+			error = EBUSY;
+			goto out;
+		}
+	}
+	priv = ipoib_priv_alloc();
+	priv->dev = dev;
+	priv->max_ib_mtu = parent->max_ib_mtu;
+	priv->mcast_mtu = priv->admin_mtu = parent->dev->if_mtu;
+	set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags);
+	error = ipoib_set_dev_features(priv, parent->ca);
+	if (error)
+		goto out;
+	priv->pkey = pkey;
+	priv->broadcastaddr[8] = pkey >> 8;
+	priv->broadcastaddr[9] = pkey & 0xff;
+	dev->if_broadcastaddr = priv->broadcastaddr;
+	error = ipoib_dev_init(priv, parent->ca, parent->port);
+	if (error)
+		goto out;
+	priv->parent = parent->dev;
+	list_add_tail(&priv->list, &parent->child_intfs);
+	VLAN_SETCOOKIE(dev, priv);
+	dev->if_start = ipoib_vlan_start;
+	dev->if_drv_flags &= ~IFF_DRV_RUNNING;
+	dev->if_hdrlen = IPOIB_HEADER_LEN;
+	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
+		ipoib_open(priv);
+	mutex_unlock(&parent->vlan_mutex);
+	return;
+out:
+	mutex_unlock(&parent->vlan_mutex);
+	if (priv)
+		free(priv, M_TEMP);
+	/* Terminate the line like every other ipoib_warn() message. */
+	if (error)
+		ipoib_warn(parent,
+		    "failed to initialize subinterface: device %s, port %d vtag 0x%X\n",
+		    parent->ca->name, parent->port, vtag);
+	return;
+}
+
+/*
+ * vlan_unconfig event handler: remove the child interface for a tag.
+ *
+ * Clears the vlan cookie of the vlan device (if one exists), then finds
+ * the child with the matching P_Key on the parent's list, releases its
+ * device resources and unlinks it, all under the parent's vlan_mutex.
+ *
+ * NOTE(review): the child's private data is unlinked but not freed in
+ * this function - confirm whether it is released elsewhere or leaked.
+ */
+static void
+ipoib_unconfig_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag)
+{
+	struct ipoib_dev_priv *parent;
+	struct ipoib_dev_priv *priv;
+	struct ifnet *dev;
+	uint16_t pkey;
+
+	if (ifp->if_type != IFT_INFINIBAND)
+		return;
+
+	dev = VLAN_DEVAT(ifp, vtag);
+	if (dev)
+		VLAN_SETCOOKIE(dev, NULL);
+	/* Same tag-to-P_Key mapping as ipoib_config_vlan(). */
+	pkey = vtag | 0x8000;
+	parent = ifp->if_softc;
+	mutex_lock(&parent->vlan_mutex);
+	list_for_each_entry(priv, &parent->child_intfs, list) {
+		if (priv->pkey == pkey) {
+			ipoib_dev_cleanup(priv);
+			list_del(&priv->list);
+			break;
+		}
+	}
+	mutex_unlock(&parent->vlan_mutex);
+}
+
+eventhandler_tag ipoib_vlan_attach;
+eventhandler_tag ipoib_vlan_detach;
+
+/*
+ * Module initialization.
+ *
+ * Normalizes the queue size tunables (power of two, clamped to the
+ * supported range), registers the vlan config/unconfig event handlers,
+ * creates the IPoIB workqueue and registers the SA and IB clients.
+ *
+ * Returns 0 on success or a negative errno.  On failure everything that
+ * was registered up to that point, including the vlan event handlers, is
+ * undone so no callback can run after a failed load.
+ */
+static int __init
+ipoib_init_module(void)
+{
+	int ret;
+
+	ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size);
+	ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE);
+	ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE);
+
+	ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size);
+	ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE);
+	ipoib_sendq_size = max(ipoib_sendq_size, max(2 * MAX_SEND_CQE,
+						     IPOIB_MIN_QUEUE_SIZE));
+#ifdef CONFIG_INFINIBAND_IPOIB_CM
+	ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP);
+#endif
+
+	ipoib_vlan_attach = EVENTHANDLER_REGISTER(vlan_config,
+		ipoib_config_vlan, NULL, EVENTHANDLER_PRI_FIRST);
+	ipoib_vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig,
+		ipoib_unconfig_vlan, NULL, EVENTHANDLER_PRI_FIRST);
+
+	/*
+	 * We create our own workqueue mainly because we want to be
+	 * able to flush it when devices are being removed.  We can't
+	 * use schedule_work()/flush_scheduled_work() because both
+	 * unregister_netdev() and linkwatch_event take the rtnl lock,
+	 * so flush_scheduled_work() can deadlock during device
+	 * removal.
+	 */
+	ipoib_workqueue = create_singlethread_workqueue("ipoib");
+	if (!ipoib_workqueue) {
+		ret = -ENOMEM;
+		goto err_fs;
+	}
+
+	ib_sa_register_client(&ipoib_sa_client);
+
+	ret = ib_register_client(&ipoib_client);
+	if (ret)
+		goto err_sa;
+
+	return 0;
+
+err_sa:
+	ib_sa_unregister_client(&ipoib_sa_client);
+	destroy_workqueue(ipoib_workqueue);
+
+err_fs:
+	/* The vlan handlers were registered first; remove them last. */
+	EVENTHANDLER_DEREGISTER(vlan_config, ipoib_vlan_attach);
+	EVENTHANDLER_DEREGISTER(vlan_unconfig, ipoib_vlan_detach);
+	return ret;
+}
+
+/*
+ * Module unload: deregister the vlan event handlers, unregister the IB
+ * and SA clients, and destroy the IPoIB workqueue.
+ */
+static void __exit
+ipoib_cleanup_module(void)
+{
+
+	EVENTHANDLER_DEREGISTER(vlan_config, ipoib_vlan_attach);
+	EVENTHANDLER_DEREGISTER(vlan_unconfig, ipoib_vlan_detach);
+	ib_unregister_client(&ipoib_client);
+	ib_sa_unregister_client(&ipoib_sa_client);
+	destroy_workqueue(ipoib_workqueue);
+}
+
+/*
+ * Infiniband output routine: resolve dst, prepend the header, if_transmit.
+ */
+static int
+ipoib_output(struct ifnet *ifp, struct mbuf *m,
+	struct sockaddr *dst, struct route *ro)
+{
+	u_char edst[INFINIBAND_ALEN];
+	struct llentry *lle = NULL;
+	struct rtentry *rt0 = NULL;
+	struct ipoib_header *eh;
+	int error = 0;
+	short type;
+	/* A cached link-layer entry is only taken for unicast packets. */
+	if (ro != NULL) {
+		if (!(m->m_flags & (M_BCAST | M_MCAST)))
+			lle = ro->ro_lle;
+		rt0 = ro->ro_rt;
+	}
+#ifdef MAC
+	error = mac_ifnet_check_transmit(ifp, m);
+	if (error)
+		goto bad;
+#endif
+
+	M_PROFILE(m);
+	if (ifp->if_flags & IFF_MONITOR) {
+		error = ENETDOWN;
+		goto bad;
+	}
+	if (!((ifp->if_flags & IFF_UP) &&
+	    (ifp->if_drv_flags & IFF_DRV_RUNNING))) {
+		error = ENETDOWN;
+		goto bad;
+	}
+	/* Pick edst and the ethertype according to the address family. */
+	switch (dst->sa_family) {
+#ifdef INET
+	case AF_INET:
+		if (lle != NULL && (lle->la_flags & LLE_VALID))
+			memcpy(edst, &lle->ll_addr.mac8, sizeof(edst));
+		else if (m->m_flags & M_MCAST)
+			ip_ib_mc_map(((struct sockaddr_in *)dst)->sin_addr.s_addr, ifp->if_broadcastaddr, edst);
+		else
+			error = arpresolve(ifp, rt0, m, dst, edst, &lle);
+		if (error)
+			return (error == EWOULDBLOCK ? 0 : error);
+		type = htons(ETHERTYPE_IP);
+		break;
+	case AF_ARP:
+	{
+		struct arphdr *ah;
+		ah = mtod(m, struct arphdr *);
+		ah->ar_hrd = htons(ARPHRD_INFINIBAND);
+
+		switch(ntohs(ah->ar_op)) {
+		case ARPOP_REVREQUEST:
+		case ARPOP_REVREPLY:
+			type = htons(ETHERTYPE_REVARP);
+			break;
+		case ARPOP_REQUEST:
+		case ARPOP_REPLY:
+		default:
+			type = htons(ETHERTYPE_ARP);
+			break;
+		}
+		/* Destination: if_broadcastaddr for M_BCAST, else ar_tha(ah). */
+		if (m->m_flags & M_BCAST)
+			bcopy(ifp->if_broadcastaddr, edst, INFINIBAND_ALEN);
+		else
+			bcopy(ar_tha(ah), edst, INFINIBAND_ALEN);
+
+	}
+	break;
+#endif
+#ifdef INET6
+	case AF_INET6:
+		if (lle != NULL && (lle->la_flags & LLE_VALID))
+			memcpy(edst, &lle->ll_addr.mac8, sizeof(edst));
+		else if (m->m_flags & M_MCAST)
+			ipv6_ib_mc_map(&((struct sockaddr_in6 *)dst)->sin6_addr, ifp->if_broadcastaddr, edst);
+		else
+			error = nd6_storelladdr(ifp, m, dst, (u_char *)edst, &lle);
+		if (error)
+			return error;
+		type = htons(ETHERTYPE_IPV6);
+		break;
+#endif
+
+	default:
+		if_printf(ifp, "can't handle af%d\n", dst->sa_family);
+		error = EAFNOSUPPORT;
+		goto bad;
+	}
+
+	/*
+	 * Add local net header.  If no space in first mbuf,
+	 * allocate another.
+	 */
+	M_PREPEND(m, IPOIB_HEADER_LEN, M_DONTWAIT);
+	if (m == NULL) {
+		error = ENOBUFS;
+		goto bad;
+	}
+	eh = mtod(m, struct ipoib_header *);
+	(void)memcpy(&eh->proto, &type, sizeof(eh->proto));
+	(void)memcpy(&eh->hwaddr, edst, sizeof (edst));
+
+	/*
+	 * Hand the framed packet to the interface's if_transmit method
+	 * and return its result directly to the caller.
+	 */
+	return ((ifp->if_transmit)(ifp, m));
+bad:
+	if (m != NULL)
+		m_freem(m);
+	return (error);
+}
+
+/*
+ * Hand a received Infiniband packet to the netisr matching its ethertype.
+ */
+void
+ipoib_demux(struct ifnet *ifp, struct mbuf *m, u_short proto)
+{
+	int isr;
+
+#ifdef MAC
+	/*
+	 * Tag the mbuf with an appropriate MAC label before any other
+	 * consumers can get to it.
+	 */
+	mac_ifnet_create_mbuf(ifp, m);
+#endif
+	/* In monitor mode the frame is logged and freed, never dispatched. */
+	if (ifp->if_flags & IFF_MONITOR) {
+		if_printf(ifp, "discard frame at IFF_MONITOR\n");
+		m_freem(m);
+		return;
+	}
+	/*
+	 * Dispatch frame to upper layer.
+	 */
+	switch (proto) {
+#ifdef INET
+	case ETHERTYPE_IP:
+		isr = NETISR_IP;
+		break;
+
+	case ETHERTYPE_ARP:
+		if (ifp->if_flags & IFF_NOARP) {
+			/* Discard packet if ARP is disabled on interface */
+			m_freem(m);
+			return;
+		}
+		isr = NETISR_ARP;
+		break;
+#endif
+#ifdef INET6
+	case ETHERTYPE_IPV6:
+		isr = NETISR_IPV6;
+		break;
+#endif
+	default:
+		goto discard;
+	}
+	netisr_dispatch(isr, m);
+	return;
+
+discard:
+	m_freem(m);
+}
+
+/*
+ * Process a received Infiniband packet: tap BPF, strip the header, demux.
+ */
+static void
+ipoib_input(struct ifnet *ifp, struct mbuf *m)
+{
+	struct ipoib_header *eh;
+	/* Packets arriving while the interface is not IFF_UP are freed. */
+	if ((ifp->if_flags & IFF_UP) == 0) {
+		m_freem(m);
+		return;
+	}
+	CURVNET_SET_QUIET(ifp->if_vnet);
+
+	/* Let BPF have it before we strip the header. */
+	IPOIB_MTAP(ifp, m);
+	eh = mtod(m, struct ipoib_header *);
+	/*
+	 * Reset layer specific mbuf flags to avoid confusing upper layers.
+	 * Strip off Infiniband header.
+	 */
+	m->m_flags &= ~M_VLANTAG;
+	m->m_flags &= ~(M_PROTOFLAGS);
+	m_adj(m, IPOIB_HEADER_LEN);
+	/* eh was captured before m_adj(); it is still read below. */
+	if (IPOIB_IS_MULTICAST(eh->hwaddr)) {
+		if (memcmp(eh->hwaddr, ifp->if_broadcastaddr,
+		    ifp->if_addrlen) == 0)
+			m->m_flags |= M_BCAST;
+		else
+			m->m_flags |= M_MCAST;
+		ifp->if_imcasts++;
+	}
+
+	ipoib_demux(ifp, m, ntohs(eh->proto));
+	CURVNET_RESTORE();
+}
+
+static int
+ipoib_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa,
+	struct sockaddr *sa)
+{
+	struct sockaddr_dl *sdl;
+#ifdef INET
+	struct sockaddr_in *sin;
+#endif
+#ifdef INET6
+	struct sockaddr_in6 *sin6;
+#endif
+	u_char *e_addr;
+	/* Map multicast address sa to a link-layer sockaddr returned in *llsa. */
+	switch(sa->sa_family) {
+	case AF_LINK:
+		/*
+		 * No mapping needed. Just check that it's a valid MC address.
+		 */
+		sdl = (struct sockaddr_dl *)sa;
+		e_addr = LLADDR(sdl);
+		if (!IPOIB_IS_MULTICAST(e_addr))
+			return EADDRNOTAVAIL;
+		*llsa = 0;
+		return 0;
+
+#ifdef INET
+	case AF_INET:
+		sin = (struct sockaddr_in *)sa;
+		if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
+			return EADDRNOTAVAIL;
+		sdl = malloc(sizeof *sdl, M_IFMADDR,
+		       M_NOWAIT|M_ZERO);
+		if (sdl == NULL)
+			return ENOMEM;
+		sdl->sdl_len = sizeof *sdl;
+		sdl->sdl_family = AF_LINK;
+		sdl->sdl_index = ifp->if_index;
+		sdl->sdl_type = IFT_INFINIBAND;
+		sdl->sdl_alen = INFINIBAND_ALEN;
+		e_addr = LLADDR(sdl);
+		ip_ib_mc_map(sin->sin_addr.s_addr, ifp->if_broadcastaddr,
+		    e_addr);
+		*llsa = (struct sockaddr *)sdl;
+		return 0;
+#endif
+#ifdef INET6
+	case AF_INET6:
+		sin6 = (struct sockaddr_in6 *)sa;
+		/*
+		 * An IP6 address of 0 means listen to all
+		 * of the multicast address used for IP6.
+		 * This has no meaning in ipoib.
+		 */
+		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
+			return EADDRNOTAVAIL;
+		if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
+			return EADDRNOTAVAIL;
+		sdl = malloc(sizeof *sdl, M_IFMADDR,
+		       M_NOWAIT|M_ZERO);
+		if (sdl == NULL)
+			return (ENOMEM);
+		sdl->sdl_len = sizeof *sdl;
+		sdl->sdl_family = AF_LINK;
+		sdl->sdl_index = ifp->if_index;
+		sdl->sdl_type = IFT_INFINIBAND;
+		sdl->sdl_alen = INFINIBAND_ALEN;
+		e_addr = LLADDR(sdl);
+		ipv6_ib_mc_map(&sin6->sin6_addr, ifp->if_broadcastaddr, e_addr);
+		*llsa = (struct sockaddr *)sdl;
+		return 0;
+#endif
+
+	default:
+		return EAFNOSUPPORT;
+	}
+}
+
+module_init(ipoib_init_module);		/* module load entry point */
+module_exit(ipoib_cleanup_module);	/* module unload entry point */
diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
new file mode 100644
index 0000000..a5746e4
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -0,0 +1,907 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ipoib.h"
+
+#include <linux/delay.h>
+#include <linux/completion.h>
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+static int mcast_debug_level = 1;
+/* Runtime-tunable; presumably gates ipoib_dbg_mcast() output - confirm. */
+module_param(mcast_debug_level, int, 0644);
+MODULE_PARM_DESC(mcast_debug_level,
+		 "Enable multicast debug tracing if > 0");
+#endif
+/* Taken when testing IPOIB_MCAST_RUN to queue or cancel mcast_task. */
+static DEFINE_MUTEX(mcast_mutex);
+
+struct ipoib_mcast_iter {
+	struct ipoib_dev_priv *priv;	/* device whose tree is walked */
+	union ib_gid       mgid;	/* MGID of the entry last returned */
+	unsigned long      created;	/* copy of mcast->created */
+	unsigned int       queuelen;	/* copy of pkt_queue.ifq_len */
+	unsigned int       complete;	/* nonzero if the entry had an AH */
+	unsigned int       send_only;	/* nonzero if SENDONLY flag was set */
+};
+
+static void ipoib_mcast_free(struct ipoib_mcast *mcast)
+{
+	struct ipoib_dev_priv *priv = mcast->priv;
+	struct ifnet *ifp = priv->dev;
+	int ndropped;
+
+	ipoib_dbg_mcast(priv, "deleting multicast group %16D\n",
+			mcast->mcmember.mgid.raw, ":");
+	if (mcast->ah != NULL)
+		ipoib_put_ah(mcast->ah);
+
+	/* Packets still queued on the group are drained and counted. */
+	ndropped = mcast->pkt_queue.ifq_len;
+	_IF_DRAIN(&mcast->pkt_queue);	/* XXX Locking. */
+	ifp->if_oerrors += ndropped;
+
+	kfree(mcast);
+}
+
+static struct ipoib_mcast *ipoib_mcast_alloc(struct ipoib_dev_priv *priv,
+					     int can_sleep)
+{
+	/* New group entry bound to priv; NULL if the allocation fails. */
+	struct ipoib_mcast *entry = kzalloc(sizeof(struct ipoib_mcast),
+	    can_sleep ? GFP_KERNEL : GFP_ATOMIC);
+
+	if (entry == NULL)
+		return NULL;
+
+	entry->priv = priv;
+	entry->created = jiffies;
+	entry->backoff = 1;	/* retry delay; multiplied by HZ when used */
+	INIT_LIST_HEAD(&entry->list);
+	bzero(&entry->pkt_queue, sizeof(entry->pkt_queue));
+
+	return entry;
+}
+
+static struct ipoib_mcast *__ipoib_mcast_find(struct ipoib_dev_priv *priv,
+    void *mgid)
+{
+	struct rb_node *node;
+	struct ipoib_mcast *cur;
+	int cmp;
+
+	/*
+	 * Binary search of priv->multicast_tree, which is keyed by a
+	 * memcmp() of the raw MGID bytes.  No lock is taken here.
+	 */
+	for (node = priv->multicast_tree.rb_node; node != NULL;
+	    node = (cmp < 0) ? node->rb_left : node->rb_right) {
+		cur = rb_entry(node, struct ipoib_mcast, rb_node);
+		cmp = memcmp(mgid, cur->mcmember.mgid.raw,
+		    sizeof (union ib_gid));
+		if (cmp == 0)
+			return cur;
+	}
+
+	/* No group with this MGID is in the tree. */
+	return NULL;
+}
+
+static int __ipoib_mcast_add(struct ipoib_dev_priv *priv,
+    struct ipoib_mcast *mcast)
+{
+	struct rb_node **link = &priv->multicast_tree.rb_node;
+	struct rb_node *parent = NULL;
+
+	/*
+	 * Descend to the empty link where mcast belongs, comparing raw
+	 * MGID bytes; an equal key means the group is already present.
+	 */
+	while (*link != NULL) {
+		struct ipoib_mcast *other;
+		int cmp;
+
+		parent = *link;
+		other = rb_entry(parent, struct ipoib_mcast, rb_node);
+		cmp = memcmp(mcast->mcmember.mgid.raw,
+		    other->mcmember.mgid.raw, sizeof (union ib_gid));
+		if (cmp == 0)
+			return -EEXIST;
+		link = (cmp < 0) ? &parent->rb_left : &parent->rb_right;
+	}
+
+	rb_link_node(&mcast->rb_node, parent, link);
+	rb_insert_color(&mcast->rb_node, &priv->multicast_tree);
+	return 0;
+}
+
+static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,
+				   struct ib_sa_mcmember_rec *mcmember)
+{
+	struct ipoib_dev_priv *priv = mcast->priv;
+	struct ifnet *dev = priv->dev;
+	struct ipoib_ah *ah;
+	int ret;
+	int set_qkey = 0;
+	/* Save the member record, attach the QP unless send-only, make an AH. */
+	mcast->mcmember = *mcmember;
+
+	/* Set the cached Q_Key before we attach if it's the broadcast group */
+	if (!memcmp(mcast->mcmember.mgid.raw, dev->if_broadcastaddr + 4,
+		    sizeof (union ib_gid))) {
+		spin_lock_irq(&priv->lock);
+		if (!priv->broadcast) {
+			spin_unlock_irq(&priv->lock);
+			return -EAGAIN;
+		}
+		priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey);
+		spin_unlock_irq(&priv->lock);
+		priv->tx_wr.wr.ud.remote_qkey = priv->qkey;
+		set_qkey = 1;
+	}
+
+	if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
+		if (test_and_set_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
+			ipoib_warn(priv, "multicast group %16D already attached\n",
+				   mcast->mcmember.mgid.raw, ":");
+
+			return 0;
+		}
+
+		ret = ipoib_mcast_attach(priv, be16_to_cpu(mcast->mcmember.mlid),
+					 &mcast->mcmember.mgid, set_qkey);
+		if (ret < 0) {
+			ipoib_warn(priv, "couldn't attach QP to multicast group %16D\n",
+				   mcast->mcmember.mgid.raw, ":");
+
+			clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags);
+			return ret;
+		}
+	}
+	/* Build the address handle later used to send to this group. */
+	{
+		struct ib_ah_attr av = {
+			.dlid	       = be16_to_cpu(mcast->mcmember.mlid),
+			.port_num      = priv->port,
+			.sl	       = mcast->mcmember.sl,
+			.ah_flags      = IB_AH_GRH,
+			.static_rate   = mcast->mcmember.rate,
+			.grh	       = {
+				.flow_label    = be32_to_cpu(mcast->mcmember.flow_label),
+				.hop_limit     = mcast->mcmember.hop_limit,
+				.sgid_index    = 0,
+				.traffic_class = mcast->mcmember.traffic_class
+			}
+		};
+		av.grh.dgid = mcast->mcmember.mgid;
+
+		ah = ipoib_create_ah(priv, priv->pd, &av);
+		if (!ah) {
+			ipoib_warn(priv, "ib_address_create failed\n");
+		} else {
+			spin_lock_irq(&priv->lock);
+			mcast->ah = ah;
+			spin_unlock_irq(&priv->lock);
+
+			ipoib_dbg_mcast(priv, "MGID %16D AV %p, LID 0x%04x, SL %d\n",
+					mcast->mcmember.mgid.raw, ":",
+					mcast->ah->ah,
+					be16_to_cpu(mcast->mcmember.mlid),
+					mcast->mcmember.sl);
+		}
+	}
+
+	/* actually send any queued packets */
+	while (mcast->pkt_queue.ifq_len) {
+		struct mbuf *mb;
+		_IF_DEQUEUE(&mcast->pkt_queue, mb);
+		mb->m_pkthdr.rcvif = dev;
+
+		if (dev->if_transmit(dev, mb))
+			ipoib_warn(priv, "dev_queue_xmit failed to requeue packet\n");
+	}
+
+	return 0;
+}
+
+static int
+ipoib_mcast_sendonly_join_complete(int status,
+				   struct ib_sa_multicast *multicast)
+{
+	struct ipoib_mcast *mcast = multicast->context;
+	struct ipoib_dev_priv *priv = mcast->priv;
+	/* Completion callback passed by ipoib_mcast_sendonly_join(). */
+	/* We trap for port events ourselves. */
+	if (status == -ENETRESET)
+		return 0;
+
+	if (!status)
+		status = ipoib_mcast_join_finish(mcast, &multicast->rec);
+
+	if (status) {
+		if (mcast->logcount++ < 20)
+			ipoib_dbg_mcast(priv, "multicast join failed for %16D, status %d\n",
+					mcast->mcmember.mgid.raw, ":", status);
+
+		/* Flush out any queued packets */
+		priv->dev->if_oerrors += mcast->pkt_queue.ifq_len;
+		_IF_DRAIN(&mcast->pkt_queue);
+
+		/* Clear the busy flag so we try again; old value is returned */
+		status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY,
+					    &mcast->flags);
+	}
+	return status;
+}
+
+static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast)
+{
+	struct ipoib_dev_priv *priv = mcast->priv;
+	struct ib_sa_mcmember_rec rec = {
+#if 0				/* Some SMs don't support send-only yet */
+		.join_state = 4
+#else
+		.join_state = 1
+#endif
+	};
+	int ret = 0;
+	/* Start an SA join for a send-only group; refused unless OPER_UP. */
+	if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
+		ipoib_dbg_mcast(priv, "device shutting down, no multicast joins\n");
+		return -ENODEV;
+	}
+
+	if (test_and_set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) {
+		ipoib_dbg_mcast(priv, "multicast entry busy, skipping\n");
+		return -EBUSY;
+	}
+
+	rec.mgid     = mcast->mcmember.mgid;
+	rec.port_gid = priv->local_gid;
+	rec.pkey     = cpu_to_be16(priv->pkey);
+
+	mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca,
+					 priv->port, &rec,
+					 IB_SA_MCMEMBER_REC_MGID	|
+					 IB_SA_MCMEMBER_REC_PORT_GID	|
+					 IB_SA_MCMEMBER_REC_PKEY	|
+					 IB_SA_MCMEMBER_REC_JOIN_STATE,
+					 GFP_ATOMIC,
+					 ipoib_mcast_sendonly_join_complete,
+					 mcast);
+	if (IS_ERR(mcast->mc)) {
+		ret = PTR_ERR(mcast->mc);
+		clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
+		ipoib_warn(priv, "ib_sa_join_multicast failed (ret = %d)\n",
+			   ret);
+	} else {
+		ipoib_dbg_mcast(priv, "no multicast record for %16D, starting join\n",
+				mcast->mcmember.mgid.raw, ":");
+	}
+
+	return ret;
+}
+
+void ipoib_mcast_carrier_on_task(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
+						   carrier_on_task);
+	struct ib_port_attr attr;
+
+	/*
+	 * Raise the link only once the IB port reports active.
+	 * NOTE(review): the comment inherited from Linux spoke of taking
+	 * rtnl_lock against ipoib_stop(); no lock is taken in this port.
+	 */
+	if (ib_query_port(priv->ca, priv->port, &attr) ||
+	    attr.state != IB_PORT_ACTIVE) {
+		ipoib_dbg(priv, "Keeping carrier off until IB port is active\n");
+		return;
+	}
+	if_link_state_change(priv->dev, LINK_STATE_UP);
+}
+
+static int ipoib_mcast_join_complete(int status,
+				     struct ib_sa_multicast *multicast)
+{
+	struct ipoib_mcast *mcast = multicast->context;
+	struct ipoib_dev_priv *priv = mcast->priv;
+	/* Completion callback for joins started by ipoib_mcast_join(). */
+	ipoib_dbg_mcast(priv, "join completion for %16D (status %d)\n",
+			mcast->mcmember.mgid.raw, ":", status);
+
+	/* We trap for port events ourselves. */
+	if (status == -ENETRESET)
+		return 0;
+
+	if (!status)
+		status = ipoib_mcast_join_finish(mcast, &multicast->rec);
+
+	if (!status) {
+		mcast->backoff = 1;
+		mutex_lock(&mcast_mutex);
+		if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
+			queue_delayed_work(ipoib_workqueue,
+					   &priv->mcast_task, 0);
+		mutex_unlock(&mcast_mutex);
+
+		/*
+		 * Defer carrier on work to ipoib_workqueue to avoid a
+		 * deadlock on rtnl_lock here.
+		 */
+		if (mcast == priv->broadcast)
+			queue_work(ipoib_workqueue, &priv->carrier_on_task);
+
+		return 0;
+	}
+
+	if (mcast->logcount++ < 20) {
+		if (status == -ETIMEDOUT || status == -EAGAIN) {
+			ipoib_dbg_mcast(priv, "multicast join failed for %16D, status %d\n",
+					mcast->mcmember.mgid.raw, ":", status);
+		} else {
+			ipoib_warn(priv, "multicast join failed for %16D, status %d\n",
+				   mcast->mcmember.mgid.raw, ":", status);
+		}
+	}
+	/* Double the retry delay, capped at IPOIB_MAX_BACKOFF_SECONDS. */
+	mcast->backoff *= 2;
+	if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
+		mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;
+
+	/* Clear the busy flag so we try again; old value is returned */
+	status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
+
+	mutex_lock(&mcast_mutex);
+	spin_lock_irq(&priv->lock);
+	if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
+		queue_delayed_work(ipoib_workqueue, &priv->mcast_task,
+				   mcast->backoff * HZ);
+	spin_unlock_irq(&priv->lock);
+	mutex_unlock(&mcast_mutex);
+
+	return status;
+}
+
+static void ipoib_mcast_join(struct ipoib_dev_priv *priv,
+    struct ipoib_mcast *mcast, int create)
+{
+	struct ib_sa_mcmember_rec rec = {
+		.join_state = 1
+	};
+	ib_sa_comp_mask comp_mask;
+	int ret = 0;
+	/* Start an SA join for mcast; create != 0 also supplies group params. */
+	ipoib_dbg_mcast(priv, "joining MGID %16D\n",
+	    mcast->mcmember.mgid.raw, ":");
+
+	rec.mgid     = mcast->mcmember.mgid;
+	rec.port_gid = priv->local_gid;
+	rec.pkey     = cpu_to_be16(priv->pkey);
+
+	comp_mask =
+		IB_SA_MCMEMBER_REC_MGID		|
+		IB_SA_MCMEMBER_REC_PORT_GID	|
+		IB_SA_MCMEMBER_REC_PKEY		|
+		IB_SA_MCMEMBER_REC_JOIN_STATE;
+	/* Creation parameters are copied from the broadcast group's record. */
+	if (create) {
+		comp_mask |=
+			IB_SA_MCMEMBER_REC_QKEY			|
+			IB_SA_MCMEMBER_REC_MTU_SELECTOR		|
+			IB_SA_MCMEMBER_REC_MTU			|
+			IB_SA_MCMEMBER_REC_TRAFFIC_CLASS	|
+			IB_SA_MCMEMBER_REC_RATE_SELECTOR	|
+			IB_SA_MCMEMBER_REC_RATE			|
+			IB_SA_MCMEMBER_REC_SL			|
+			IB_SA_MCMEMBER_REC_FLOW_LABEL		|
+			IB_SA_MCMEMBER_REC_HOP_LIMIT;
+
+		rec.qkey	  = priv->broadcast->mcmember.qkey;
+		rec.mtu_selector  = IB_SA_EQ;
+		rec.mtu		  = priv->broadcast->mcmember.mtu;
+		rec.traffic_class = priv->broadcast->mcmember.traffic_class;
+		rec.rate_selector = IB_SA_EQ;
+		rec.rate	  = priv->broadcast->mcmember.rate;
+		rec.sl		  = priv->broadcast->mcmember.sl;
+		rec.flow_label	  = priv->broadcast->mcmember.flow_label;
+		rec.hop_limit	  = priv->broadcast->mcmember.hop_limit;
+	}
+
+	set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
+	mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port,
+					 &rec, comp_mask, GFP_KERNEL,
+					 ipoib_mcast_join_complete, mcast);
+	if (IS_ERR(mcast->mc)) {
+		clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
+		ret = PTR_ERR(mcast->mc);
+		ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret);
+		/* Double the retry delay (capped) and requeue mcast_task. */
+		mcast->backoff *= 2;
+		if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
+			mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;
+
+		mutex_lock(&mcast_mutex);
+		if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
+			queue_delayed_work(ipoib_workqueue,
+					   &priv->mcast_task,
+					   mcast->backoff * HZ);
+		mutex_unlock(&mcast_mutex);
+	}
+}
+
+void ipoib_mcast_join_task(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv =
+		container_of(work, struct ipoib_dev_priv, mcast_task.work);
+	struct ifnet *dev = priv->dev;
+	/* Work item: join the broadcast group first, then one unjoined group. */
+	ipoib_dbg_mcast(priv, "Running join task. flags 0x%lX\n", priv->flags);
+
+	if (!test_bit(IPOIB_MCAST_RUN, &priv->flags))
+		return;
+	/* Refresh the local GID (also copied into the lladdr) and the LID. */
+	if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid))
+		ipoib_warn(priv, "ib_query_gid() failed\n");
+	else
+		memcpy(IF_LLADDR(dev) + 4, priv->local_gid.raw, sizeof (union ib_gid));
+
+	{
+		struct ib_port_attr attr;
+
+		if (!ib_query_port(priv->ca, priv->port, &attr))
+			priv->local_lid = attr.lid;
+		else
+			ipoib_warn(priv, "ib_query_port failed\n");
+	}
+
+	if (!priv->broadcast) {
+		struct ipoib_mcast *broadcast;
+
+		if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
+			return;
+
+		broadcast = ipoib_mcast_alloc(priv, 1);
+		if (!broadcast) {
+			ipoib_warn(priv, "failed to allocate broadcast group\n");
+			mutex_lock(&mcast_mutex);
+			if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
+				queue_delayed_work(ipoib_workqueue,
+						   &priv->mcast_task, HZ);
+			mutex_unlock(&mcast_mutex);
+			return;
+		}
+
+		spin_lock_irq(&priv->lock);
+		memcpy(broadcast->mcmember.mgid.raw, dev->if_broadcastaddr + 4,
+		       sizeof (union ib_gid));
+		priv->broadcast = broadcast;
+
+		__ipoib_mcast_add(priv, priv->broadcast);
+		spin_unlock_irq(&priv->lock);
+	}
+	/* Nothing else is joined until the broadcast group is attached. */
+	if (priv->broadcast &&
+	    !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) {
+		if (priv->broadcast &&
+		    !test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags))
+			ipoib_mcast_join(priv, priv->broadcast, 0);
+		return;
+	}
+	/* At most one join is started per run; the function then returns. */
+	while (1) {
+		struct ipoib_mcast *mcast = NULL;
+
+		spin_lock_irq(&priv->lock);
+		list_for_each_entry(mcast, &priv->multicast_list, list) {
+			if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)
+			    && !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)
+			    && !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
+				/* Found the next unjoined group */
+				break;
+			}
+		}
+		spin_unlock_irq(&priv->lock);
+
+		if (&mcast->list == &priv->multicast_list) {
+			/* All done */
+			break;
+		}
+
+		ipoib_mcast_join(priv, mcast, 1);
+		return;
+	}
+	/* No unjoined group is left: derive the multicast MTU and stop. */
+	spin_lock_irq(&priv->lock);
+	if (priv->broadcast)
+		priv->mcast_mtu = IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu));
+	else
+		priv->mcast_mtu = priv->admin_mtu;
+	spin_unlock_irq(&priv->lock);
+
+	if (!ipoib_cm_admin_enabled(priv))
+		ipoib_change_mtu(priv, min(priv->mcast_mtu, priv->admin_mtu));
+
+	ipoib_dbg_mcast(priv, "successfully joined all multicast groups\n");
+
+	clear_bit(IPOIB_MCAST_RUN, &priv->flags);
+}
+
+int ipoib_mcast_start_thread(struct ipoib_dev_priv *priv)
+{
+	ipoib_dbg_mcast(priv, "starting multicast thread flags 0x%lX\n",
+	    priv->flags);
+	/* Queue the join task only if IPOIB_MCAST_RUN was not already set. */
+	mutex_lock(&mcast_mutex);
+	if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags))
+		queue_delayed_work(ipoib_workqueue, &priv->mcast_task, 0);
+	mutex_unlock(&mcast_mutex);
+
+	return 0;
+}
+
+int ipoib_mcast_stop_thread(struct ipoib_dev_priv *priv, int flush)
+{
+	/* Clear IPOIB_MCAST_RUN and cancel the pending join task. */
+	ipoib_dbg_mcast(priv, "stopping multicast thread\n");
+
+	mutex_lock(&mcast_mutex);
+	clear_bit(IPOIB_MCAST_RUN, &priv->flags);
+	cancel_delayed_work(&priv->mcast_task);
+	mutex_unlock(&mcast_mutex);
+	/* With flush set, ipoib_workqueue is flushed as well. */
+	if (flush)
+		flush_workqueue(ipoib_workqueue);
+
+	return 0;
+}
+
+static int ipoib_mcast_leave(struct ipoib_dev_priv *priv, struct ipoib_mcast *mcast)
+{
+	int ret = 0;
+	/* Free the SA multicast object if a join is marked busy. */
+	if (test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
+		ib_sa_free_multicast(mcast->mc);
+
+	if (test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
+		ipoib_dbg_mcast(priv, "leaving MGID %16D\n",
+				mcast->mcmember.mgid.raw, ":");
+
+		/* Remove ourselves from the multicast group */
+		ret = ib_detach_mcast(priv->qp, &mcast->mcmember.mgid,
+				      be16_to_cpu(mcast->mcmember.mlid));
+		if (ret)
+			ipoib_warn(priv, "ib_detach_mcast failed (result = %d)\n", ret);
+	}
+	/* A detach failure is only logged; the return value is always 0. */
+	return 0;
+}
+
+void
+ipoib_mcast_send(struct ipoib_dev_priv *priv, void *mgid, struct mbuf *mb)
+{
+	struct ifnet *dev = priv->dev;
+	struct ipoib_mcast *mcast;
+	/* Drop unless the port is up and the broadcast group is attached. */
+	if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)		||
+	    !priv->broadcast					||
+	    !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) {
+		++dev->if_oerrors;
+		m_freem(mb);
+		return;
+	}
+
+	mcast = __ipoib_mcast_find(priv, mgid);
+	if (!mcast) {
+		/* Let's create a new send only group now */
+		ipoib_dbg_mcast(priv, "setting up send only multicast group for %16D\n",
+				mgid, ":");
+
+		mcast = ipoib_mcast_alloc(priv, 0);
+		if (!mcast) {
+			ipoib_warn(priv, "unable to allocate memory for "
+				   "multicast structure\n");
+			++dev->if_oerrors;
+			m_freem(mb);
+			goto out;
+		}
+		/* New entries are send-only and go into both tree and list. */
+		set_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags);
+		memcpy(mcast->mcmember.mgid.raw, mgid, sizeof (union ib_gid));
+		__ipoib_mcast_add(priv, mcast);
+		list_add_tail(&mcast->list, &priv->multicast_list);
+	}
+	/* No AH yet: queue (bounded) and start a send-only join if idle. */
+	if (!mcast->ah) {
+		if (mcast->pkt_queue.ifq_len < IPOIB_MAX_MCAST_QUEUE) {
+			_IF_ENQUEUE(&mcast->pkt_queue, mb);
+		} else {
+			++dev->if_oerrors;
+			m_freem(mb);
+		}
+
+		if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
+			ipoib_dbg_mcast(priv, "no address vector, "
+					"but multicast join already started\n");
+		else if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
+			ipoib_mcast_sendonly_join(mcast);
+
+		/*
+		 * If lookup completes between here and out:, don't
+		 * want to send packet twice.
+		 */
+		mcast = NULL;
+	}
+
+out:
+	if (mcast && mcast->ah)
+		ipoib_send(priv, mb, mcast->ah, IB_MULTICAST_QPN);
+}
+
+void ipoib_mcast_dev_flush(struct ipoib_dev_priv *priv)
+{
+	LIST_HEAD(remove_list);
+	struct ipoib_mcast *mcast, *tmcast;
+	unsigned long flags;
+	/* Unlink every group, broadcast included, then leave and free them. */
+	ipoib_dbg_mcast(priv, "flushing multicast list\n");
+
+	spin_lock_irqsave(&priv->lock, flags);
+
+	list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) {
+		list_del(&mcast->list);
+		rb_erase(&mcast->rb_node, &priv->multicast_tree);
+		list_add_tail(&mcast->list, &remove_list);
+	}
+
+	if (priv->broadcast) {
+		rb_erase(&priv->broadcast->rb_node, &priv->multicast_tree);
+		list_add_tail(&priv->broadcast->list, &remove_list);
+		priv->broadcast = NULL;
+	}
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+	/* Leaving and freeing happen after priv->lock has been dropped. */
+	list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
+		ipoib_mcast_leave(priv, mcast);
+		ipoib_mcast_free(mcast);
+	}
+}
+
+static int ipoib_mcast_addr_is_valid(const u8 *addr, unsigned int addrlen,
+				     const u8 *broadcast)
+{
+	/*
+	 * Valid means: addrlen is INFINIBAND_ALEN, and bytes 0-5
+	 * (reserved QPN, prefix, scope) and bytes 7-9 (signature lower,
+	 * pkey) equal those of the broadcast address.  Byte 6 is not
+	 * compared.  Returns 1 if valid, 0 otherwise.
+	 */
+	return (addrlen == INFINIBAND_ALEN &&
+	    memcmp(addr, broadcast, 6) == 0 &&
+	    memcmp(addr + 7, broadcast + 7, 3) == 0);
+}
+
+void ipoib_mcast_restart_task(struct work_struct *work)
+{
+	/* Work-item wrapper: recover the owning priv and restart multicast. */
+	ipoib_mcast_restart(container_of(work, struct ipoib_dev_priv,
+	    restart_task));
+}
+
+void ipoib_mcast_restart(struct ipoib_dev_priv *priv)
+{
+	struct ifnet *dev = priv->dev;
+	struct ifmultiaddr *ifma;
+	struct ipoib_mcast *mcast, *tmcast;
+	LIST_HEAD(remove_list);
+	struct ib_sa_mcmember_rec rec;
+	int addrlen;
+	/* Resync priv->multicast_list with the interface's AF_LINK groups. */
+	ipoib_dbg_mcast(priv, "restarting multicast task flags 0x%lX\n",
+	    priv->flags);
+
+	ipoib_mcast_stop_thread(priv, 0);
+
+	if_maddr_rlock(dev);
+	spin_lock(&priv->lock);
+
+	/*
+	 * Unfortunately, the networking core only gives us a list of all of
+	 * the multicast hardware addresses. We need to figure out which ones
+	 * are new and which ones have been removed
+	 */
+
+	/* Clear out the found flag */
+	list_for_each_entry(mcast, &priv->multicast_list, list)
+		clear_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags);
+
+	/* Mark all of the entries that are found or don't exist */
+
+	/* Addresses failing ipoib_mcast_addr_is_valid() are skipped. */
+	TAILQ_FOREACH(ifma, &dev->if_multiaddrs, ifma_link) {
+		union ib_gid mgid;
+		uint8_t *addr;
+
+		if (ifma->ifma_addr->sa_family != AF_LINK)
+			continue;
+		addr = LLADDR((struct sockaddr_dl *)ifma->ifma_addr);
+		addrlen = ((struct sockaddr_dl *)ifma->ifma_addr)->sdl_alen;
+		if (!ipoib_mcast_addr_is_valid(addr, addrlen,
+					       dev->if_broadcastaddr))
+			continue;
+
+		memcpy(mgid.raw, addr + 4, sizeof mgid);
+
+		mcast = __ipoib_mcast_find(priv, &mgid);
+		if (!mcast || test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
+			struct ipoib_mcast *nmcast;
+
+			/* ignore group which is directly joined by userspace */
+			if (test_bit(IPOIB_FLAG_UMCAST, &priv->flags) &&
+			    !ib_sa_get_mcmember_rec(priv->ca, priv->port, &mgid, &rec)) {
+				ipoib_dbg_mcast(priv, "ignoring multicast entry for mgid %16D\n",
+						mgid.raw, ":");
+				continue;
+			}
+
+			/* Not found or send-only group, let's add a new entry */
+			ipoib_dbg_mcast(priv, "adding multicast entry for mgid %16D\n",
+					mgid.raw, ":");
+
+			nmcast = ipoib_mcast_alloc(priv, 0);
+			if (!nmcast) {
+				ipoib_warn(priv, "unable to allocate memory for multicast structure\n");
+				continue;
+			}
+
+			set_bit(IPOIB_MCAST_FLAG_FOUND, &nmcast->flags);
+
+			nmcast->mcmember.mgid = mgid;
+
+			if (mcast) {
+				/* Destroy the send only entry */
+				list_move_tail(&mcast->list, &remove_list);
+
+				rb_replace_node(&mcast->rb_node,
+						&nmcast->rb_node,
+						&priv->multicast_tree);
+			} else
+				__ipoib_mcast_add(priv, nmcast);
+
+			list_add_tail(&nmcast->list, &priv->multicast_list);
+		}
+
+		if (mcast)
+			set_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags);
+	}
+
+	/* Remove all of the entries that don't exist anymore */
+	list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) {
+		if (!test_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags) &&
+		    !test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
+			ipoib_dbg_mcast(priv, "deleting multicast group %16D\n",
+					mcast->mcmember.mgid.raw, ":");
+
+			rb_erase(&mcast->rb_node, &priv->multicast_tree);
+
+			/* Move to the remove list */
+			list_move_tail(&mcast->list, &remove_list);
+		}
+	}
+
+	spin_unlock(&priv->lock);
+	if_maddr_runlock(dev);
+
+	/* We have to cancel outside of the spinlock */
+	list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
+		ipoib_mcast_leave(mcast->priv, mcast);
+		ipoib_mcast_free(mcast);
+	}
+
+	if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
+		ipoib_mcast_start_thread(priv);
+}
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+
+struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct ipoib_dev_priv *priv)
+{
+	struct ipoib_mcast_iter *iter;
+
+	iter = kmalloc(sizeof *iter, GFP_KERNEL);
+	if (!iter)
+		return NULL;
+	/* Start from the all-zero MGID and step to the first real group. */
+	iter->priv = priv;
+	memset(iter->mgid.raw, 0, sizeof (union ib_gid));
+
+	if (ipoib_mcast_iter_next(iter)) {
+		kfree(iter);
+		return NULL;
+	}
+
+	return iter;
+}
+
+int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter)
+{
+	struct ipoib_dev_priv *priv = iter->priv;
+	struct rb_node *n;
+	struct ipoib_mcast *mcast;
+	int ret = 1;
+	/* Advance iter to the first group whose MGID sorts after iter->mgid. */
+	spin_lock_irq(&priv->lock);
+	/* Returns 0 after advancing, 1 when no later group exists. */
+	n = rb_first(&priv->multicast_tree);
+
+	while (n) {
+		mcast = rb_entry(n, struct ipoib_mcast, rb_node);
+
+		if (memcmp(iter->mgid.raw, mcast->mcmember.mgid.raw,
+			   sizeof (union ib_gid)) < 0) {
+			iter->mgid      = mcast->mcmember.mgid;
+			iter->created   = mcast->created;
+			iter->queuelen  = mcast->pkt_queue.ifq_len;
+			iter->complete  = !!mcast->ah;
+			iter->send_only = !!(mcast->flags & (1 << IPOIB_MCAST_FLAG_SENDONLY));
+
+			ret = 0;
+
+			break;
+		}
+
+		n = rb_next(n);
+	}
+
+	spin_unlock_irq(&priv->lock);
+
+	return ret;
+}
+
+void ipoib_mcast_iter_read(struct ipoib_mcast_iter *iter, union ib_gid *mgid,
+			   unsigned long *created, unsigned int *queuelen,
+			   unsigned int *complete, unsigned int *send_only)
+{
+	const struct ipoib_mcast_iter *snap = iter;
+
+	/* Copy out what ipoib_mcast_iter_next() stored; no NULL checks. */
+	*mgid = snap->mgid;
+	*created = snap->created;
+	*queuelen = snap->queuelen;
+	*complete = snap->complete;
+	*send_only = snap->send_only;
+}
+
+#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
new file mode 100644
index 0000000..fb9a27a
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -0,0 +1,294 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ipoib.h"
+#include <linux/ethtool.h>
+
+/*
+ * Attach priv->qp to the multicast group (mgid, mlid).
+ *
+ * The interface P_Key is looked up first; IPOIB_PKEY_ASSIGNED is set or
+ * cleared according to the outcome. When set_qkey is non-zero the QP's QKey
+ * is updated to priv->qkey before attaching.
+ *
+ * Returns 0 on success, -ENXIO if the P_Key is not found, -ENOMEM if the
+ * temporary attribute buffer cannot be allocated, or the error returned by
+ * ib_modify_qp() / ib_attach_mcast().
+ */
+int ipoib_mcast_attach(struct ipoib_dev_priv *priv, u16 mlid, union ib_gid *mgid, int set_qkey)
+{
+	struct ib_qp_attr *attr = NULL;
+	u16 pkey_index;
+	int rc;
+
+	if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index) != 0) {
+		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+		return -ENXIO;
+	}
+	set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+
+	if (set_qkey) {
+		attr = kmalloc(sizeof *attr, GFP_KERNEL);
+		if (attr == NULL)
+			return -ENOMEM;
+
+		/* set correct QKey for QP */
+		attr->qkey = priv->qkey;
+		rc = ib_modify_qp(priv->qp, attr, IB_QP_QKEY);
+		if (rc != 0) {
+			ipoib_warn(priv, "failed to modify QP, ret = %d\n", rc);
+			goto done;
+		}
+	}
+
+	/* attach QP to multicast group */
+	rc = ib_attach_mcast(priv->qp, mgid, mlid);
+	if (rc != 0)
+		ipoib_warn(priv, "failed to attach to multicast group, ret = %d\n", rc);
+
+done:
+	kfree(attr);
+	return rc;
+}
+
+/*
+ * Move priv->qp through the INIT, RTR and RTS states.
+ *
+ * Returns 0 on success, -1 if IPOIB_PKEY_ASSIGNED is not set in priv->flags,
+ * or the error from the ib_modify_qp() call that failed. On a failed
+ * transition the QP is asked to go back to RESET before returning.
+ */
+int ipoib_init_qp(struct ipoib_dev_priv *priv)
+{
+	int ret;
+	struct ib_qp_attr qp_attr;	/* not zeroed; only fields named in attr_mask are set */
+	int attr_mask;
+
+	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
+		return -1;
+
+	/* RESET -> INIT: state, QKey, port and P_Key index. */
+	qp_attr.qp_state = IB_QPS_INIT;
+	qp_attr.qkey = 0;
+	qp_attr.port_num = priv->port;
+	qp_attr.pkey_index = priv->pkey_index;
+	attr_mask =
+	    IB_QP_QKEY |
+	    IB_QP_PORT |
+	    IB_QP_PKEY_INDEX |
+	    IB_QP_STATE;
+	ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask);
+	if (ret) {
+		ipoib_warn(priv, "failed to modify QP to init, ret = %d\n", ret);
+		goto out_fail;
+	}
+
+	/* INIT -> RTR: same attributes minus the port. */
+	qp_attr.qp_state = IB_QPS_RTR;
+	/* Can't set this in a INIT->RTR transition */
+	attr_mask &= ~IB_QP_PORT;
+	ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask);
+	if (ret) {
+		ipoib_warn(priv, "failed to modify QP to RTR, ret = %d\n", ret);
+		goto out_fail;
+	}
+
+	/* RTR -> RTS: add the send PSN, drop the P_Key index. */
+	qp_attr.qp_state = IB_QPS_RTS;
+	qp_attr.sq_psn = 0;
+	attr_mask |= IB_QP_SQ_PSN;
+	attr_mask &= ~IB_QP_PKEY_INDEX;
+	ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask);
+	if (ret) {
+		ipoib_warn(priv, "failed to modify QP to RTS, ret = %d\n", ret);
+		goto out_fail;
+	}
+
+	return 0;
+
+out_fail:
+	/* Best effort: a failure here is only logged, ret keeps the first error. */
+	qp_attr.qp_state = IB_QPS_RESET;
+	if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
+		ipoib_warn(priv, "Failed to modify QP to RESET state\n");
+
+	return ret;
+}
+
+/*
+ * Allocate the verbs resources of an IPoIB interface: protection domain,
+ * DMA memory region, receive and send completion queues and the UD queue
+ * pair, then pre-fill the constant parts of the TX/RX work requests.
+ *
+ * ca is used only for its name in diagnostics; the resources themselves are
+ * created on priv->ca.
+ *
+ * Returns 0 on success. Every failure path releases what was set up so far
+ * and returns -ENODEV; the underlying error code is not propagated.
+ */
+int ipoib_transport_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca)
+{
+	struct ib_qp_init_attr init_attr = {
+		.cap = {
+			.max_send_wr  = ipoib_sendq_size,
+			.max_recv_wr  = ipoib_recvq_size,
+			.max_send_sge = 1,	/* overwritten with IPOIB_UD_TX_SG below */
+			.max_recv_sge = IPOIB_UD_RX_SG
+		},
+		.sq_sig_type = IB_SIGNAL_ALL_WR,
+		.qp_type     = IB_QPT_UD
+	};
+
+	int ret, size;
+	int i;
+	/* XXX struct ethtool_coalesce *coal; */
+
+	priv->pd = ib_alloc_pd(priv->ca);
+	if (IS_ERR(priv->pd)) {
+		printk(KERN_WARNING "%s: failed to allocate PD\n", ca->name);
+		return -ENODEV;
+	}
+
+	priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE);
+	if (IS_ERR(priv->mr)) {
+		printk(KERN_WARNING "%s: ib_get_dma_mr failed\n", ca->name);
+		goto out_free_pd;
+	}
+
+	/*
+	 * Size the receive CQ: the UD receive queue plus one, and, when
+	 * ipoib_cm_dev_init() succeeded (ret == 0), room for the extra
+	 * entries added below.
+	 */
+	size = ipoib_recvq_size + 1;
+	ret = ipoib_cm_dev_init(priv);
+	if (!ret) {
+		size += ipoib_sendq_size;
+		if (ipoib_cm_has_srq(priv))
+			size += ipoib_recvq_size + 1; /* 1 extra for rx_drain_qp */
+		else
+			size += ipoib_recvq_size * ipoib_max_conn_qp;
+	}
+
+	priv->recv_cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, priv, size, 0);
+	if (IS_ERR(priv->recv_cq)) {
+		printk(KERN_WARNING "%s: failed to create receive CQ\n", ca->name);
+		goto out_free_mr;
+	}
+
+	priv->send_cq = ib_create_cq(priv->ca, ipoib_send_comp_handler, NULL,
+				     priv, ipoib_sendq_size, 0);
+	if (IS_ERR(priv->send_cq)) {
+		printk(KERN_WARNING "%s: failed to create send CQ\n", ca->name);
+		goto out_free_recv_cq;
+	}
+
+	/* Only the receive CQ is armed here. */
+	if (ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP))
+		goto out_free_send_cq;
+
+#if 0
+	/* XXX */
+	coal = kzalloc(sizeof *coal, GFP_KERNEL);
+	if (coal) {
+		coal->rx_coalesce_usecs = 10;
+		coal->tx_coalesce_usecs = 10;
+		coal->rx_max_coalesced_frames = 16;
+		coal->tx_max_coalesced_frames = 16;
+		dev->ethtool_ops->set_coalesce(dev, coal);
+		kfree(coal);
+	}
+#endif
+
+	init_attr.send_cq = priv->send_cq;
+	init_attr.recv_cq = priv->recv_cq;
+
+	/* Request optional QP features only when the HCA advertises them. */
+	if (priv->hca_caps & IB_DEVICE_UD_TSO)
+		init_attr.create_flags |= IB_QP_CREATE_IPOIB_UD_LSO;
+
+	if (priv->hca_caps & IB_DEVICE_BLOCK_MULTICAST_LOOPBACK)
+		init_attr.create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK;
+
+	init_attr.cap.max_send_sge = IPOIB_UD_TX_SG;
+
+	priv->qp = ib_create_qp(priv->pd, &init_attr);
+	if (IS_ERR(priv->qp)) {
+		printk(KERN_WARNING "%s: failed to create QP\n", ca->name);
+		goto out_free_send_cq;
+	}
+
+	/* Bytes 1..3 of the link-layer address carry the 24-bit QP number. */
+	IF_LLADDR(priv->dev)[1] = (priv->qp->qp_num >> 16) & 0xff;
+	IF_LLADDR(priv->dev)[2] = (priv->qp->qp_num >>  8) & 0xff;
+	IF_LLADDR(priv->dev)[3] = (priv->qp->qp_num      ) & 0xff;
+
+	/* Every scatter/gather entry uses the DMA MR's local key. */
+	for (i = 0; i < IPOIB_MAX_TX_SG; ++i)
+		priv->tx_sge[i].lkey = priv->mr->lkey;
+
+	priv->tx_wr.opcode	= IB_WR_SEND;
+	priv->tx_wr.sg_list	= priv->tx_sge;
+	priv->tx_wr.send_flags	= IB_SEND_SIGNALED;
+
+	for (i = 0; i < IPOIB_UD_RX_SG; ++i)
+		priv->rx_sge[i].lkey = priv->mr->lkey;
+	priv->rx_wr.next = NULL;
+	priv->rx_wr.sg_list = priv->rx_sge;
+
+	return 0;
+
+	/* Unwind in reverse order of allocation; labels fall through. */
+out_free_send_cq:
+	ib_destroy_cq(priv->send_cq);
+
+out_free_recv_cq:
+	ib_destroy_cq(priv->recv_cq);
+
+out_free_mr:
+	ib_dereg_mr(priv->mr);
+	/*
+	 * NOTE(review): reached regardless of whether ipoib_cm_dev_init()
+	 * succeeded above - confirm ipoib_cm_dev_cleanup() tolerates that.
+	 */
+	ipoib_cm_dev_cleanup(priv);
+
+out_free_pd:
+	ib_dealloc_pd(priv->pd);
+	return -ENODEV;
+}
+
+/*
+ * Release the verbs resources held by priv: QP (if any), send and receive
+ * CQs, the ipoib_cm state, the DMA MR and the PD, in that order.
+ *
+ * Only priv->qp is checked for NULL; the CQs, MR and PD are destroyed
+ * unconditionally. Failures are logged and otherwise ignored.
+ */
+void ipoib_transport_dev_cleanup(struct ipoib_dev_priv *priv)
+{
+
+	if (priv->qp) {
+		if (ib_destroy_qp(priv->qp))
+			ipoib_warn(priv, "ib_qp_destroy failed\n");
+
+		priv->qp = NULL;
+		/* With the QP gone the P_Key is no longer considered assigned. */
+		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+	}
+
+	if (ib_destroy_cq(priv->send_cq))
+		ipoib_warn(priv, "ib_cq_destroy (send) failed\n");
+
+	if (ib_destroy_cq(priv->recv_cq))
+		ipoib_warn(priv, "ib_cq_destroy (recv) failed\n");
+
+	ipoib_cm_dev_cleanup(priv);
+
+	if (ib_dereg_mr(priv->mr))
+		ipoib_warn(priv, "ib_dereg_mr failed\n");
+
+	if (ib_dealloc_pd(priv->pd))
+		ipoib_warn(priv, "ib_dealloc_pd failed\n");
+}
+
+/*
+ * Asynchronous IB event callback.
+ *
+ * Events for ports other than priv->port are ignored. For the interface's
+ * own port, the event type selects which flush work item is queued on
+ * ipoib_workqueue:
+ *   SM change / client reregister        -> flush_light
+ *   port error / port active / LID change -> flush_normal
+ *   P_Key change                          -> flush_heavy
+ * Any other event is only logged.
+ */
+void ipoib_event(struct ib_event_handler *handler,
+		 struct ib_event *record)
+{
+	struct ipoib_dev_priv *priv =
+		container_of(handler, struct ipoib_dev_priv, event_handler);
+
+	if (record->element.port_num != priv->port)
+		return;
+
+	ipoib_dbg(priv, "Event %d on device %s port %d\n", record->event,
+		  record->device->name, record->element.port_num);
+
+	switch (record->event) {
+	case IB_EVENT_SM_CHANGE:
+	case IB_EVENT_CLIENT_REREGISTER:
+		queue_work(ipoib_workqueue, &priv->flush_light);
+		break;
+	case IB_EVENT_PORT_ERR:
+	case IB_EVENT_PORT_ACTIVE:
+	case IB_EVENT_LID_CHANGE:
+		queue_work(ipoib_workqueue, &priv->flush_normal);
+		break;
+	case IB_EVENT_PKEY_CHANGE:
+		queue_work(ipoib_workqueue, &priv->flush_heavy);
+		break;
+	default:
+		break;
+	}
+}
diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
new file mode 100644
index 0000000..18c761f
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+
+#include <asm/uaccess.h>
+
+#include "ipoib.h"
+
+/*
+ * "parent" device attribute: writes the name of the child interface's
+ * parent device, followed by a newline, into buf and returns the number of
+ * characters written (the sprintf() result).
+ */
+static ssize_t show_parent(struct device *d, struct device_attribute *attr,
+			   char *buf)
+{
+	struct ifnet *dev = to_net_dev(d);
+	struct ipoib_dev_priv *priv = dev->if_softc;
+
+	return sprintf(buf, "%s\n", priv->parent->name);
+}
+/* Read-only attribute (S_IRUGO, no store handler). */
+static DEVICE_ATTR(parent, S_IRUGO, show_parent, NULL);
+
+/*
+ * Create a child (P_Key) interface of pdev.
+ *
+ * The child is named "<parent>.<pkey as 4 hex digits>", inherits the
+ * parent's max IB MTU and link-layer address, and is linked onto the
+ * parent's child_intfs list once it is fully registered.
+ *
+ * Returns 0 on success, or a negative errno:
+ *   -EPERM     caller lacks CAP_NET_ADMIN
+ *   -ENOTUNIQ  pkey is already used by the parent or one of its children
+ *   -ENOMEM    interface allocation or attribute creation failed
+ *   other      error from ipoib_set_dev_features(), ipoib_dev_init() or
+ *              register_netdevice()
+ *
+ * On every failure path the partially constructed child is torn down and
+ * freed before returning.
+ */
+int ipoib_vlan_add(struct ifnet *pdev, unsigned short pkey)
+{
+	struct ipoib_dev_priv *ppriv, *priv;
+	char intf_name[IFNAMSIZ];
+	int result;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	ppriv = pdev->if_softc;
+
+	rtnl_lock();
+	mutex_lock(&ppriv->vlan_mutex);
+
+	/*
+	 * First ensure this isn't a duplicate. We check the parent device and
+	 * then all of the child interfaces to make sure the Pkey doesn't match.
+	 */
+	if (ppriv->pkey == pkey) {
+		result = -ENOTUNIQ;
+		priv = NULL;
+		goto err;
+	}
+
+	list_for_each_entry(priv, &ppriv->child_intfs, list) {
+		if (priv->pkey == pkey) {
+			result = -ENOTUNIQ;
+			/* priv points at an existing child: don't free it below. */
+			priv = NULL;
+			goto err;
+		}
+	}
+
+	snprintf(intf_name, sizeof intf_name, "%s.%04x",
+		 ppriv->dev->name, pkey);
+	priv = ipoib_intf_alloc(intf_name);
+	if (!priv) {
+		result = -ENOMEM;
+		goto err;
+	}
+
+	priv->max_ib_mtu = ppriv->max_ib_mtu;
+	/* MTU will be reset when mcast join happens */
+	priv->dev->mtu   = IPOIB_UD_MTU(priv->max_ib_mtu);
+	priv->mcast_mtu  = priv->admin_mtu = priv->dev->mtu;
+	set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags);
+
+	result = ipoib_set_dev_features(priv, ppriv->ca);
+	if (result)
+		goto err;
+
+	priv->pkey = pkey;
+
+	/* Start from the parent's address; bytes 8..9 of the broadcast
+	 * address carry the child's P_Key. */
+	memcpy(IF_LLADDR(priv->dev), ppriv->dev->dev_addr, INFINIBAND_ALEN);
+	priv->broadcastaddr[8] = pkey >> 8;
+	priv->broadcastaddr[9] = pkey & 0xff;
+
+	result = ipoib_dev_init(priv->dev, ppriv->ca, ppriv->port);
+	if (result < 0) {
+		ipoib_warn(ppriv, "failed to initialize subinterface: "
+			   "device %s, port %d\n",
+			   ppriv->ca->name, ppriv->port);
+		goto err;
+	}
+
+	result = register_netdevice(priv->dev);
+	if (result) {
+		ipoib_warn(priv, "failed to initialize; error %i\n", result);
+		goto register_failed;
+	}
+
+	priv->parent = ppriv->dev;
+
+	ipoib_create_debug_files(priv->dev);
+
+	/*
+	 * register_netdevice() left result at 0. The attribute helpers below
+	 * only report failure as non-zero, so set the error code up front;
+	 * otherwise a failure here would tear the child down and still
+	 * return success to the caller.
+	 */
+	result = -ENOMEM;
+	if (ipoib_cm_add_mode_attr(priv->dev))
+		goto sysfs_failed;
+	if (ipoib_add_pkey_attr(priv->dev))
+		goto sysfs_failed;
+	if (ipoib_add_umcast_attr(priv->dev))
+		goto sysfs_failed;
+
+	if (device_create_file(&priv->dev->dev, &dev_attr_parent))
+		goto sysfs_failed;
+
+	list_add_tail(&priv->list, &ppriv->child_intfs);
+
+	mutex_unlock(&ppriv->vlan_mutex);
+	rtnl_unlock();
+
+	return 0;
+
+	/* Unwind labels fall through, undoing steps in reverse order. */
+sysfs_failed:
+	ipoib_delete_debug_files(priv->dev);
+	unregister_netdevice(priv->dev);
+
+register_failed:
+	ipoib_dev_cleanup(priv->dev);
+
+err:
+	mutex_unlock(&ppriv->vlan_mutex);
+	rtnl_unlock();
+	if (priv)
+		free_netdev(priv->dev);
+
+	return result;
+}
+
+/*
+ * Remove the child interface of pdev that uses the given P_Key.
+ *
+ * The matching child is unregistered, cleaned up and unlinked while the
+ * rtnl lock and the parent's vlan_mutex are held; its net device is freed
+ * after both have been dropped.
+ *
+ * Returns 0 if a child was removed, -EPERM without CAP_NET_ADMIN, or
+ * -ENODEV if no child uses pkey.
+ */
+int ipoib_vlan_delete(struct ifnet *pdev, unsigned short pkey)
+{
+	struct ipoib_dev_priv *ppriv, *child, *next;
+	struct ifnet *victim = NULL;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	ppriv = pdev->if_softc;
+
+	rtnl_lock();
+	mutex_lock(&ppriv->vlan_mutex);
+	list_for_each_entry_safe(child, next, &ppriv->child_intfs, list) {
+		if (child->pkey != pkey)
+			continue;
+
+		unregister_netdevice(child->dev);
+		ipoib_dev_cleanup(child->dev);
+		list_del(&child->list);
+		victim = child->dev;
+		break;
+	}
+	mutex_unlock(&ppriv->vlan_mutex);
+	rtnl_unlock();
+
+	if (victim == NULL)
+		return -ENODEV;
+
+	free_netdev(victim);
+	return 0;
+}
diff --git a/sys/ofed/drivers/infiniband/ulp/sdp/Kconfig b/sys/ofed/drivers/infiniband/ulp/sdp/Kconfig
new file mode 100644
index 0000000..b5fadf4
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/sdp/Kconfig
@@ -0,0 +1,28 @@
+config INFINIBAND_SDP
+	tristate "Sockets Direct Protocol"
+	depends on INFINIBAND && INFINIBAND_IPOIB
+	---help---
+	  Support for Sockets Direct Protocol (SDP).  This provides
+          sockets semantics over InfiniBand via address family
+          AF_INET_SDP (address family 27).  You can also LD_PRELOAD the
+          libsdp library from <http://openib.org> to have standard
+          sockets applications use SDP.
+
+config INFINIBAND_SDP_DEBUG
+	bool "Sockets Direct Protocol debugging"
+	depends on INFINIBAND_SDP
+	---help---
+	  This option causes debugging code to be compiled into the
+	  SDP driver.  The output can be turned on via the debug_level
+	  module parameter  (which can also be set through sysfs after the
+	  driver is loaded).
+
+config INFINIBAND_SDP_DEBUG_DATA
+        bool "Sockets Direct Protocol data path debugging"
+        depends on INFINIBAND_SDP_DEBUG
+        ---help---
+          This option compiles debugging code into the data path
+          of the SDP driver.  The output can be turned on via the
+          data_debug_level module parameter; however, even with output
+          turned off, this debugging code will have some performance
+          impact.
diff --git a/sys/ofed/drivers/infiniband/ulp/sdp/Makefile b/sys/ofed/drivers/infiniband/ulp/sdp/Makefile
new file mode 100644
index 0000000..5c250e9
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/sdp/Makefile
@@ -0,0 +1,6 @@
+# kbuild fragment for the SDP module: ib_sdp.o is built when
+# CONFIG_INFINIBAND_SDP is set.
+EXTRA_CFLAGS += -Idrivers/infiniband/include
+EXTRA_CFLAGS += -ggdb
+
+obj-$(CONFIG_INFINIBAND_SDP) += ib_sdp.o
+
+# Objects that make up ib_sdp.o.
+ib_sdp-objs := sdp_main.o sdp_cma.o sdp_bcopy.o sdp_proc.o sdp_tx.o sdp_rx.o sdp_zcopy.o
diff --git a/sys/ofed/drivers/infiniband/ulp/sdp/sdp.h b/sys/ofed/drivers/infiniband/ulp/sdp/sdp.h
new file mode 100644
index 0000000..a1ccdff
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/sdp/sdp.h
@@ -0,0 +1,721 @@
+#ifndef _SDP_H_
+#define _SDP_H_
+
+#include "opt_ddb.h"
+#include "opt_inet.h"
+#include "opt_ofed.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+#include <sys/mbuf.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/protosw.h>
+#include <sys/proc.h>
+#include <sys/jail.h>
+#include <sys/domain.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+#include <net/if.h>
+#include <net/route.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/in_var.h>
+#include <netinet/in_pcb.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/workqueue.h>
+#include <linux/wait.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/pci.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include <rdma/ib_cm.h>
+#include <rdma/sdp_socket.h>
+#include <rdma/ib_fmr_pool.h>
+
+#ifdef SDP_DEBUG
+#define	CONFIG_INFINIBAND_SDP_DEBUG
+#endif
+
+#include "sdp_dbg.h"
+
+/*
+ * Drop whatever LIST_HEAD definition is in effect at this point and
+ * re-establish the two-argument form below (a struct holding a pointer to
+ * the first element).
+ * NOTE(review): presumably one of the linux/ headers included above
+ * redefines LIST_HEAD - confirm.
+ */
+#undef LIST_HEAD
+/* From sys/queue.h */
+#define LIST_HEAD(name, type)                                           \
+struct name {                                                           \
+        struct type *lh_first;  /* first element */                     \
+}
+
+/*
+ * SDP tunables and protocol constants.
+ *
+ * Values expressed in terms of HZ are tick counts. Every replacement list
+ * that is an expression is fully parenthesized so the macro can be used
+ * inside a larger expression without precedence surprises.
+ */
+
+/* Interval between successive polls in the Tx routine when polling is used
+   instead of interrupts (in per-core Tx rings) - should be power of 2 */
+#define SDP_TX_POLL_MODER	16
+#define SDP_TX_POLL_TIMEOUT	(HZ / 20)
+#define SDP_NAGLE_TIMEOUT (HZ / 10)
+
+#define SDP_SRCAVAIL_CANCEL_TIMEOUT (HZ * 5)
+#define SDP_SRCAVAIL_ADV_TIMEOUT (1 * HZ)
+#define SDP_SRCAVAIL_PAYLOAD_LEN 1
+
+/* NOTE(review): units of the two timeouts below are not visible here
+ * (they are not HZ-based) - confirm against their users. */
+#define SDP_RESOLVE_TIMEOUT 1000
+#define SDP_ROUTE_TIMEOUT 1000
+#define SDP_RETRY_COUNT 5
+#define SDP_KEEPALIVE_TIME (120 * 60 * HZ)
+#define SDP_FIN_WAIT_TIMEOUT (60 * HZ) /* like TCP_FIN_TIMEOUT */
+
+/* Tx / Rx ring sizes, in entries. */
+#define SDP_TX_SIZE 0x40
+#define SDP_RX_SIZE 0x40
+
+/* Number of u64 entries that fit in min(4 KiB, PAGE_SIZE). */
+#define SDP_FMR_SIZE (MIN(0x1000, PAGE_SIZE) / sizeof(u64))
+#define SDP_FMR_POOL_SIZE	1024
+#define SDP_FMR_DIRTY_SIZE	( SDP_FMR_POOL_SIZE / 4 )
+
+#define SDP_MAX_RDMA_READ_LEN (PAGE_SIZE * (SDP_FMR_SIZE - 2))
+
+/* mb inlined data len - rest will be rx'ed into frags */
+#define SDP_HEAD_SIZE (sizeof(struct sdp_bsdh))
+
+/* limit tx payload len, if the sink supports bigger buffers than the source
+ * can handle.
+ * or rx fragment size (limited by sge->length size) */
+#define	SDP_MAX_PACKET	(1 << 16)
+#define SDP_MAX_PAYLOAD (SDP_MAX_PACKET - SDP_HEAD_SIZE)
+
+/* One SGE per MCLBYTES of a maximum-size packet; sends get two extra. */
+#define SDP_MAX_RECV_SGES (SDP_MAX_PACKET / MCLBYTES)
+#define SDP_MAX_SEND_SGES ((SDP_MAX_PACKET / MCLBYTES) + 2)
+
+#define SDP_NUM_WC 4
+
+/* Zero-copy threshold: default and permitted range, in bytes. */
+#define SDP_DEF_ZCOPY_THRESH (64 * 1024)
+#define SDP_MIN_ZCOPY_THRESH PAGE_SIZE
+#define SDP_MAX_ZCOPY_THRESH 1048576
+
+/* Operation tag bits, all above bit 31. */
+#define SDP_OP_RECV 0x800000000LL
+#define SDP_OP_SEND 0x400000000LL
+#define SDP_OP_RDMA 0x200000000LL
+#define SDP_OP_NOP  0x100000000LL
+
+/* how long (in jiffies) to block sender till tx completion*/
+#define SDP_BZCOPY_POLL_TIMEOUT (HZ / 10)
+
+#define SDP_AUTO_CONF	0xffff
+#define AUTO_MOD_DELAY (HZ / 4)
+
+/*
+ * Per-mbuf SDP control block, reached through SDP_SKB_CB() below: a
+ * sequence number plus pointers to the zero-copy state objects associated
+ * with the buffer.
+ */
+struct sdp_mb_cb {
+	__u32		seq;		/* Starting sequence number	*/
+	struct bzcopy_state      *bz;
+	struct rx_srcavail_state *rx_sa;
+	struct tx_srcavail_state *tx_sa;
+};
+
+/* SDP meanings for the protocol-specific mbuf flags. */
+#define	M_PUSH	M_PROTO1	/* Do a 'push'. */
+#define	M_URG	M_PROTO2	/* Mark as urgent (oob). */
+
+/* Accessors for the control block; the block is overlaid on (__mb)->cb. */
+#define SDP_SKB_CB(__mb)      ((struct sdp_mb_cb *)&((__mb)->cb[0]))
+#define BZCOPY_STATE(mb)      (SDP_SKB_CB(mb)->bz)
+#define RX_SRCAVAIL_STATE(mb) (SDP_SKB_CB(mb)->rx_sa)
+#define TX_SRCAVAIL_STATE(mb) (SDP_SKB_CB(mb)->tx_sa)
+
+/*
+ * Fallback MIN() for builds where no header has provided one.
+ * Both arguments are parenthesized so that operators of lower precedence
+ * than '<' (e.g. '&', '|', '?:') inside an argument do not regroup.
+ * Each argument may still be evaluated twice; avoid side effects.
+ */
+#ifndef MIN
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#endif
+
+/*
+ * Ring accounting helpers. head and tail are atomic counters; the number of
+ * posted entries is their difference.
+ */
+#define ring_head(ring)   (atomic_read(&(ring).head))
+#define ring_tail(ring)   (atomic_read(&(ring).tail))
+#define ring_posted(ring) (ring_head(ring) - ring_tail(ring))
+
+#define rx_ring_posted(ssk) ring_posted(ssk->rx_ring)
+/* With SDP_ZCOPY, an in-flight RDMA contributes its 'busy' count as well. */
+#ifdef SDP_ZCOPY
+#define tx_ring_posted(ssk) (ring_posted(ssk->tx_ring) + \
+	(ssk->tx_ring.rdma_inflight ? ssk->tx_ring.rdma_inflight->busy : 0))
+#else
+#define tx_ring_posted(ssk) ring_posted(ssk->tx_ring)
+#endif
+
+/* Globals defined elsewhere in the SDP module. */
+extern int sdp_zcopy_thresh;
+extern int rcvbuf_initial_size;
+extern struct workqueue_struct *rx_comp_wq;
+extern struct ib_client sdp_client;
+
+/* SDP message identifiers, stored in the 'mid' byte of struct sdp_bsdh. */
+enum sdp_mid {
+	SDP_MID_HELLO = 0x0,
+	SDP_MID_HELLO_ACK = 0x1,
+	SDP_MID_DISCONN = 0x2,
+	SDP_MID_ABORT = 0x3,
+	SDP_MID_SENDSM = 0x4,
+	SDP_MID_RDMARDCOMPL = 0x6,
+	SDP_MID_SRCAVAIL_CANCEL = 0x8,
+	SDP_MID_CHRCVBUF = 0xB,
+	SDP_MID_CHRCVBUF_ACK = 0xC,
+	SDP_MID_SINKAVAIL = 0xFD,
+	SDP_MID_SRCAVAIL = 0xFE,
+	SDP_MID_DATA = 0xFF,
+};
+
+/* Out-of-band flag bits. NOTE(review): presumably carried in
+ * sdp_bsdh.flags - confirm against the senders. */
+enum sdp_flags {
+        SDP_OOB_PRES = 1 << 0,
+        SDP_OOB_PEND = 1 << 1,
+};
+
+/* Credits held back by tx_slots_free() before it reports any free slot. */
+enum {
+	SDP_MIN_TX_CREDITS = 2
+};
+
+/* Negative internal status codes. */
+enum {
+	SDP_ERR_ERROR   = -4,
+	SDP_ERR_FAULT   = -3,
+	SDP_NEW_SEG     = -2,
+	SDP_DO_WAIT_MEM = -1
+};
+
+/*
+ * Header that starts every SDP message (16 bytes, packed). The allocation
+ * helpers in this file size a fresh mbuf to exactly this header and set
+ * 'mid'; they leave the remaining fields untouched.
+ */
+struct sdp_bsdh {
+	u8 mid;
+	u8 flags;
+	__u16 bufs;
+	__u32 len;
+	__u32 mseq;
+	__u32 mseq_ack;
+} __attribute__((__packed__));
+
+/* IPv6 address, or an IPv4 address stored in the last 32 bits. */
+union cma_ip_addr {
+	struct in6_addr ip6;
+	struct {
+		__u32 pad[3];
+		__u32 addr;
+	} ip4;
+} __attribute__((__packed__));
+
+/* TODO: too much? Can I avoid having the src/dst and port here? */
+/*
+ * Hello message. rsvd3 pads the structure to
+ * IB_CM_REQ_PRIVATE_DATA_SIZE: 48 is the byte count of the members between
+ * bsdh and rsvd3.
+ */
+struct sdp_hh {
+	struct sdp_bsdh bsdh;
+	u8 majv_minv;
+	u8 ipv_cap;
+	u8 rsvd1;
+	u8 max_adverts;
+	__u32 desremrcvsz;
+	__u32 localrcvsz;
+	__u16 port;
+	__u16 rsvd2;
+	union cma_ip_addr src_addr;
+	union cma_ip_addr dst_addr;
+	u8 rsvd3[IB_CM_REQ_PRIVATE_DATA_SIZE - sizeof(struct sdp_bsdh) - 48];
+} __attribute__((__packed__));
+
+/*
+ * Hello-ack message. rsvd2 pads the structure to
+ * IB_CM_REP_PRIVATE_DATA_SIZE: 8 is the byte count of the members between
+ * bsdh and rsvd2.
+ */
+struct sdp_hah {
+	struct sdp_bsdh bsdh;
+	u8 majv_minv;
+	u8 ipv_cap;
+	u8 rsvd1;
+	u8 ext_max_adverts;
+	__u32 actrcvsz;
+	u8 rsvd2[IB_CM_REP_PRIVATE_DATA_SIZE - sizeof(struct sdp_bsdh) - 8];
+} __attribute__((__packed__));
+
+/* Payload appended to an SDP_MID_RDMARDCOMPL message (see
+ * sdp_alloc_mb_rdmardcompl()); len is stored with htonl(). */
+struct sdp_rrch {
+	__u32 len;
+} __attribute__((__packed__));
+
+/* Payload appended to an SDP_MID_SRCAVAIL message (see
+ * sdp_alloc_mb_srcavail()); fields are stored in network byte order. */
+struct sdp_srcah {
+	__u32 len;
+	__u32 rkey;
+	__u64 vaddr;
+} __attribute__((__packed__));
+
+/* One ring slot: an mbuf chain and one DMA mapping per mbuf in the chain
+ * (sdp_cleanup_sdp_buf() unmaps mapping[i] for the i-th mbuf). */
+struct sdp_buf {
+        struct mbuf *mb;
+        u64             mapping[SDP_MAX_SEND_SGES];
+} __attribute__((__packed__));
+
+/* Payload appended to an SDP_MID_CHRCVBUF_ACK message (see
+ * sdp_alloc_mb_chrcvbuf_ack()); size is stored with htonl(). */
+struct sdp_chrecvbuf {
+	u32 size;
+} __attribute__((__packed__));
+
+/* Context used for synchronous zero copy bcopy (BZCOPY) */
+struct bzcopy_state {
+	unsigned char __user  *u_base;	/* user buffer */
+	int                    u_len;
+	int                    left;
+	int                    page_cnt;
+	int                    cur_page;
+	int                    cur_offset;
+	int                    busy;
+	struct sdp_sock      *ssk;
+	struct page         **pages;
+};
+
+/* Flag values for rx_srcavail_state.flags. */
+enum rx_sa_flag {
+	RX_SA_ABORTED    = 2,
+};
+
+/* Bit flags for tx_srcavail_state.abort_flags. */
+enum tx_sa_flag {
+	TX_SA_SENDSM     = 0x01,
+	TX_SA_CROSS_SEND = 0x02,
+	TX_SA_INTRRUPTED = 0x04,
+	TX_SA_TIMEDOUT   = 0x08,
+	TX_SA_ERROR      = 0x10,
+};
+
+/* Receive-side state for a buffer advertised by the peer (SrcAvail). */
+struct rx_srcavail_state {
+	/* Advertised buffer stuff */
+	u32 mseq;
+	u32 used;
+	u32 reported;
+	u32 len;
+	u32 rkey;
+	u64 vaddr;
+
+	/* Dest buff info */
+	struct ib_umem *umem;
+	struct ib_pool_fmr *fmr;
+
+	/* Utility */
+	u8  busy;
+	enum rx_sa_flag  flags;
+};
+
+/*
+ * Transmit-side SrcAvail state. tx_sa_reset() zeroes from 'busy' to the end
+ * of the structure; 'busy' is the first member, so members added before it
+ * would survive a reset.
+ */
+struct tx_srcavail_state {
+	/* Data below 'busy' will be reset */
+	u8		busy;
+
+	struct ib_umem *umem;
+	struct ib_pool_fmr *fmr;
+
+	u32		bytes_sent;
+	u32		bytes_acked;
+
+	enum tx_sa_flag	abort_flags;
+	u8		posted;
+
+	u32		mseq;
+};
+
+/*
+ * Transmit ring: slot array, atomic head/tail counters (see ring_posted()),
+ * the completion queue and the send-credit counter.
+ */
+struct sdp_tx_ring {
+#ifdef SDP_ZCOPY
+	struct rx_srcavail_state *rdma_inflight;
+#endif
+	struct sdp_buf   	*buffer;
+	atomic_t          	head;
+	atomic_t          	tail;
+	struct ib_cq 	 	*cq;
+
+	atomic_t 	  	credits;
+/* Current send-credit count of ssk. */
+#define tx_credits(ssk) (atomic_read(&ssk->tx_ring.credits))
+
+	struct callout		timer;
+	u16 		  	poll_cnt;
+};
+
+/*
+ * Receive ring. 'destroyed' is set under the write side of destroyed_lock by
+ * rx_ring_destroy_lock() and tested under the read side by
+ * rx_ring_trylock().
+ */
+struct sdp_rx_ring {
+	struct sdp_buf   *buffer;
+	atomic_t          head;
+	atomic_t          tail;
+	struct ib_cq 	 *cq;
+
+	int		 destroyed;
+	struct rwlock	 destroyed_lock;
+};
+
+/* Verbs objects grouped in one place: PD, MR and FMR pool.
+ * NOTE(review): presumably one instance per IB device - confirm. */
+struct sdp_device {
+	struct ib_pd 		*pd;
+	struct ib_mr 		*mr;
+	struct ib_fmr_pool 	*fmr_pool;
+};
+
+/* Interrupt-moderation bookkeeping and tunables (embedded in struct
+ * sdp_sock as auto_mod). */
+struct sdp_moderation {
+	unsigned long last_moder_packets;
+	unsigned long last_moder_tx_packets;
+	unsigned long last_moder_bytes;
+	unsigned long last_moder_jiffies;
+	int last_moder_time;
+	u16 rx_usecs;
+	u16 rx_frames;
+	u16 tx_usecs;
+	u32 pkt_rate_low;
+	u16 rx_usecs_low;
+	u32 pkt_rate_high;
+	u16 rx_usecs_high;
+	u16 sample_interval;
+	u16 adaptive_rx_coal;
+	u32 msg_enable;
+
+	int moder_cnt;
+	int moder_time;
+};
+
+/* Bit values for the 'flags' member of struct sdp_sock. */
+#define	SDP_TIMEWAIT	0x0001		/* In ssk timewait state. */
+#define	SDP_DROPPED	0x0002		/* Socket has been dropped. */
+#define	SDP_SOCKREF	0x0004		/* Holding a sockref for close. */
+#define	SDP_NODELAY	0x0008		/* Disable nagle. */
+#define	SDP_NEEDFIN	0x0010		/* Send a fin on the next tx. */
+#define	SDP_DREQWAIT	0x0020		/* Waiting on DREQ. */
+#define	SDP_DESTROY	0x0040		/* Being destroyed. */
+#define	SDP_DISCON	0x0080		/* rdma_disconnect is owed. */
+
+/* Bit values for the 'oobflags' member of struct sdp_sock. */
+#define	SDP_HADOOB	0x0001		/* Had OOB data. */
+#define	SDP_HAVEOOB	0x0002		/* Have OOB data. */
+
+/*
+ * Per-socket SDP protocol control block, reached from a socket through
+ * sdp_sk(). Protected by 'lock' (see the SDP_*LOCK macros) unless a member
+ * says otherwise.
+ */
+struct sdp_sock {
+	LIST_ENTRY(sdp_sock) list;
+	struct socket *socket;		/* owning socket */
+	struct rdma_cm_id *id;
+	struct ib_device *ib_device;
+	struct sdp_device *sdp_dev;
+	struct ib_qp *qp;
+	struct ucred *cred;
+	struct callout keep2msl;	/* 2msl and keepalive timer. */
+	struct callout nagle_timer;	/* timeout waiting for ack */
+	struct ib_ucontext context;
+	in_port_t lport;
+	in_addr_t laddr;
+	in_port_t fport;
+	in_addr_t faddr;
+	int flags;		/* SDP_TIMEWAIT ... SDP_DISCON bits */
+	int oobflags;		/* protected by rx lock. */
+	int state;
+	int softerror;
+	int recv_bytes;		/* Bytes per recv. buf including header */
+	int xmit_size_goal;
+	char iobc;
+
+	struct sdp_rx_ring rx_ring;
+	struct sdp_tx_ring tx_ring;
+	struct rwlock	lock;
+	struct mbuf *rx_ctl_q;
+	struct mbuf *rx_ctl_tail;
+
+	int qp_active;	/* XXX Flag. */
+	int max_sge;
+	struct work_struct rx_comp_work;
+#define rcv_nxt(ssk) atomic_read(&(ssk->rcv_nxt))
+	atomic_t rcv_nxt;
+
+	/* SDP specific */
+	atomic_t mseq_ack;
+#define mseq_ack(ssk) (atomic_read(&ssk->mseq_ack))
+	unsigned max_bufs;	/* Initial buffers offered by other side */
+	unsigned min_bufs;	/* Low water mark to wake senders */
+
+	unsigned long nagle_last_unacked; /* mseq of latest unacked packet */
+
+	atomic_t               remote_credits;
+#define remote_credits(ssk) (atomic_read(&ssk->remote_credits))
+	int 		  poll_cq;
+
+	/* SDP slow start */
+	int recv_request_head; 	/* mark the rx_head when the resize request
+				   was received */
+	int recv_request; 	/* XXX flag if request to resize was received */
+
+	/* Traffic counters and moderation state. */
+	unsigned long tx_packets;
+	unsigned long rx_packets;
+	unsigned long tx_bytes;
+	unsigned long rx_bytes;
+	struct sdp_moderation auto_mod;
+	struct task shutdown_task;
+#ifdef SDP_ZCOPY
+	struct tx_srcavail_state *tx_sa;
+	struct rx_srcavail_state *rx_sa;
+	spinlock_t tx_sa_lock;
+	struct delayed_work srcavail_cancel_work;
+	int srcavail_cancel_mseq;
+	/* ZCOPY data: -1:use global; 0:disable zcopy; >0: zcopy threshold */
+	int zcopy_thresh;
+#endif
+};
+
+/* Socket -> SDP control block (stored in so_pcb). */
+#define	sdp_sk(so)	((struct sdp_sock *)(so->so_pcb))
+
+/* Read/write locking and lock assertions for the per-socket rwlock. */
+#define	SDP_RLOCK(ssk)		rw_rlock(&(ssk)->lock)
+#define	SDP_WLOCK(ssk)		rw_wlock(&(ssk)->lock)
+#define	SDP_RUNLOCK(ssk)	rw_runlock(&(ssk)->lock)
+#define	SDP_WUNLOCK(ssk)	rw_wunlock(&(ssk)->lock)
+#define	SDP_WLOCK_ASSERT(ssk)	rw_assert(&(ssk)->lock, RA_WLOCKED)
+#define	SDP_RLOCK_ASSERT(ssk)	rw_assert(&(ssk)->lock, RA_RLOCKED)
+#define	SDP_LOCK_ASSERT(ssk)	rw_assert(&(ssk)->lock, RA_LOCKED)
+
+/*
+ * Zero tx_sa from its 'busy' member through the end of the structure.
+ * Anything laid out before 'busy' is left untouched.
+ */
+static inline void tx_sa_reset(struct tx_srcavail_state *tx_sa)
+{
+	size_t kept = offsetof(typeof(*tx_sa), busy);
+
+	memset((void *)&tx_sa->busy, 0, sizeof(*tx_sa) - kept);
+}
+
+/* Drop the read hold taken by a successful rx_ring_trylock(). */
+static inline void rx_ring_unlock(struct sdp_rx_ring *rx_ring)
+{
+	rw_runlock(&rx_ring->destroyed_lock);
+}
+
+/*
+ * Take a read hold on the ring unless it has been marked destroyed.
+ *
+ * Returns 1 with the read lock held when the ring is still usable; returns
+ * 0 with the lock released when 'destroyed' is set. Despite the name the
+ * lock acquisition itself is a plain rw_rlock().
+ */
+static inline int rx_ring_trylock(struct sdp_rx_ring *rx_ring)
+{
+	rw_rlock(&rx_ring->destroyed_lock);
+	if (!rx_ring->destroyed)
+		return 1;
+
+	rw_runlock(&rx_ring->destroyed_lock);
+	return 0;
+}
+
+/*
+ * Mark the ring destroyed. The flag is set under the write lock, so this
+ * returns only once no reader holds the ring any more.
+ */
+static inline void rx_ring_destroy_lock(struct sdp_rx_ring *rx_ring)
+{
+	struct rwlock *guard = &rx_ring->destroyed_lock;
+
+	rw_wlock(guard);
+	rx_ring->destroyed = 1;
+	rw_wunlock(guard);
+}
+
+/* Request a notification for the next completion on the receive CQ.
+ * The return value of ib_req_notify_cq() is not checked. */
+static inline void sdp_arm_rx_cq(struct sdp_sock *ssk)
+{
+	sdp_prf(ssk->socket, NULL, "Arming RX cq");
+	sdp_dbg_data(ssk->socket, "Arming RX cq\n");
+
+	ib_req_notify_cq(ssk->rx_ring.cq, IB_CQ_NEXT_COMP);
+}
+
+/* Request a notification for the next completion on the transmit CQ,
+ * logging the current credit and posted counts first.
+ * The return value of ib_req_notify_cq() is not checked. */
+static inline void sdp_arm_tx_cq(struct sdp_sock *ssk)
+{
+	sdp_prf(ssk->socket, NULL, "Arming TX cq");
+	sdp_dbg_data(ssk->socket, "Arming TX cq. credits: %d, posted: %d\n",
+		tx_credits(ssk), tx_ring_posted(ssk));
+
+	ib_req_notify_cq(ssk->tx_ring.cq, IB_CQ_NEXT_COMP);
+}
+
+/*
+ * Number of sends that may be posted right now.
+ *
+ * Takes the smaller of
+ *  - the available tx credits, and
+ *  - the free slots in the tx ring,
+ * and holds back SDP_MIN_TX_CREDITS of them. Returns 0 when fewer than
+ * SDP_MIN_TX_CREDITS are available.
+ *
+ * Both counters are read exactly once into locals: MIN() expands each
+ * argument twice, and the underlying atomics may change between reads.
+ */
+static inline int tx_slots_free(struct sdp_sock *ssk)
+{
+	int credits = tx_credits(ssk);
+	int slots = SDP_TX_SIZE - tx_ring_posted(ssk);
+	int min_free;
+
+	min_free = MIN(credits, slots);
+	if (min_free < SDP_MIN_TX_CREDITS)
+		return 0;
+
+	return min_free - SDP_MIN_TX_CREDITS;
+}
+
+/* utilities */
+/*
+ * Map an SDP message id to the name of its enum constant.
+ *
+ * The table is indexed by the enum value through designated initializers,
+ * so slots for ids that are not listed hold NULL. Returns NULL for such ids
+ * and for ids at or beyond the end of the table; callers must cope with a
+ * NULL result.
+ */
+static inline char *mid2str(int mid)
+{
+/* Table entry: index = enum value, value = its spelling. */
+#define ENUM2STR(e) [e] = #e
+	static char *mid2str[] = {
+		ENUM2STR(SDP_MID_HELLO),
+		ENUM2STR(SDP_MID_HELLO_ACK),
+		ENUM2STR(SDP_MID_ABORT),
+		ENUM2STR(SDP_MID_DISCONN),
+		ENUM2STR(SDP_MID_SENDSM),
+		ENUM2STR(SDP_MID_RDMARDCOMPL),
+		ENUM2STR(SDP_MID_SRCAVAIL_CANCEL),
+		ENUM2STR(SDP_MID_CHRCVBUF),
+		ENUM2STR(SDP_MID_CHRCVBUF_ACK),
+		ENUM2STR(SDP_MID_DATA),
+		ENUM2STR(SDP_MID_SRCAVAIL),
+		ENUM2STR(SDP_MID_SINKAVAIL),
+	};
+
+	/* NOTE(review): relies on ARRAY_SIZE() yielding an unsigned type so
+	 * that a negative mid is rejected here too - confirm. */
+	if (mid >= ARRAY_SIZE(mid2str))
+		return NULL;
+
+	return mid2str[mid];
+}
+
+/*
+ * Allocate a packet-header mbuf holding just an SDP base header.
+ *
+ * m_len and m_pkthdr.len are set to sizeof(struct sdp_bsdh) and the header's
+ * 'mid' is set; the other header fields are not initialized here. 'wait' is
+ * passed to MGETHDR(). The sk and size arguments are currently unused.
+ *
+ * Returns the mbuf, or NULL if MGETHDR() fails.
+ */
+static inline struct mbuf *
+sdp_alloc_mb(struct socket *sk, u8 mid, int size, int wait)
+{
+	struct sdp_bsdh *h;
+	struct mbuf *mb;
+
+	MGETHDR(mb, wait, MT_DATA);
+	if (mb == NULL)
+		return (NULL);
+	mb->m_pkthdr.len = mb->m_len = sizeof(struct sdp_bsdh);
+	h = mtod(mb, struct sdp_bsdh *);
+	h->mid = mid;
+
+	return mb;
+}
+/* Allocate a header-only SDP_MID_DATA message. */
+static inline struct mbuf *
+sdp_alloc_mb_data(struct socket *sk, int wait)
+{
+	return sdp_alloc_mb(sk, SDP_MID_DATA, 0, wait);
+}
+
+/* Allocate a header-only SDP_MID_DISCONN message. */
+static inline struct mbuf *
+sdp_alloc_mb_disconnect(struct socket *sk, int wait)
+{
+	return sdp_alloc_mb(sk, SDP_MID_DISCONN, 0, wait);
+}
+
+/*
+ * Extend the data in mb by len bytes and return a pointer to the start of
+ * the newly added region (the old end of data). Only m_len is adjusted; no
+ * check is made that the mbuf has room for len more bytes.
+ */
+static inline void *
+mb_put(struct mbuf *mb, int len)
+{
+	uint8_t *tail = (uint8_t *)mb->m_data + mb->m_len;
+
+	mb->m_len += len;
+	return (void *)tail;
+}
+
+/*
+ * Allocate an SDP_MID_CHRCVBUF_ACK message: base header followed by a
+ * struct sdp_chrecvbuf carrying 'size' in network byte order.
+ * Returns NULL if the mbuf cannot be allocated.
+ */
+static inline struct mbuf *
+sdp_alloc_mb_chrcvbuf_ack(struct socket *sk, int size, int wait)
+{
+	struct mbuf *mb;
+	struct sdp_chrecvbuf *resp_size;
+
+	mb = sdp_alloc_mb(sk, SDP_MID_CHRCVBUF_ACK, sizeof(*resp_size), wait);
+	if (mb == NULL)
+		return (NULL);
+	resp_size = (struct sdp_chrecvbuf *)mb_put(mb, sizeof *resp_size);
+	resp_size->size = htonl(size);
+
+	return mb;
+}
+
+/*
+ * Allocate an SDP_MID_SRCAVAIL message: base header followed by a
+ * struct sdp_srcah describing the advertised buffer (length, rkey and
+ * virtual address, all converted to network byte order).
+ * Returns NULL if the mbuf cannot be allocated.
+ */
+static inline struct mbuf *
+sdp_alloc_mb_srcavail(struct socket *sk, u32 len, u32 rkey, u64 vaddr, int wait)
+{
+	struct mbuf *mb;
+	struct sdp_srcah *srcah;
+
+	mb = sdp_alloc_mb(sk, SDP_MID_SRCAVAIL, sizeof(*srcah), wait);
+	if (mb == NULL)
+		return (NULL);
+	srcah = (struct sdp_srcah *)mb_put(mb, sizeof(*srcah));
+	srcah->len = htonl(len);
+	srcah->rkey = htonl(rkey);
+	srcah->vaddr = cpu_to_be64(vaddr);
+
+	return mb;
+}
+
+/* Allocate a header-only SDP_MID_SRCAVAIL_CANCEL message. */
+static inline struct mbuf *
+sdp_alloc_mb_srcavail_cancel(struct socket *sk, int wait)
+{
+	return sdp_alloc_mb(sk, SDP_MID_SRCAVAIL_CANCEL, 0, wait);
+}
+
+/*
+ * Allocate an SDP_MID_RDMARDCOMPL message: base header followed by a
+ * struct sdp_rrch carrying 'len' in network byte order.
+ * Returns NULL if the mbuf cannot be allocated.
+ */
+static inline struct mbuf *
+sdp_alloc_mb_rdmardcompl(struct socket *sk, u32 len, int wait)
+{
+	struct mbuf *mb;
+	struct sdp_rrch *rrch;
+
+	mb = sdp_alloc_mb(sk, SDP_MID_RDMARDCOMPL, sizeof(*rrch), wait);
+	if (mb == NULL)
+		return (NULL);
+	rrch = (struct sdp_rrch *)mb_put(mb, sizeof(*rrch));
+	rrch->len = htonl(len);
+
+	return mb;
+}
+
+/* Allocate a header-only SDP_MID_SENDSM message. */
+static inline struct mbuf *
+sdp_alloc_mb_sendsm(struct socket *sk, int wait)
+{
+	return sdp_alloc_mb(sk, SDP_MID_SENDSM, 0, wait);
+}
+/* Free entries in the tx ring: SDP_TX_SIZE minus what is currently posted. */
+static inline int sdp_tx_ring_slots_left(struct sdp_sock *ssk)
+{
+	return SDP_TX_SIZE - tx_ring_posted(ssk);
+}
+
+/*
+ * Decide whether a credit update should be sent to the peer.
+ *
+ * True when the number of posted receive buffers exceeds the remote credit
+ * count (scaled by 1.5 once it is above SDP_MIN_TX_CREDITS) and there is at
+ * least one tx credit and one free tx ring slot to send the update with.
+ */
+static inline int credit_update_needed(struct sdp_sock *ssk)
+{
+	int c;
+
+	c = remote_credits(ssk);
+	/* Add 50% slack so small differences do not trigger an update. */
+	if (likely(c > SDP_MIN_TX_CREDITS))
+		c += c/2;
+	return unlikely(c < rx_ring_posted(ssk)) &&
+	    likely(tx_credits(ssk) > 0) &&
+	    likely(sdp_tx_ring_slots_left(ssk));
+}
+
+
+/* Statistics hooks: defined empty here, so every use expands to nothing. */
+#define SDPSTATS_COUNTER_INC(stat)
+#define SDPSTATS_COUNTER_ADD(stat, val)
+#define SDPSTATS_COUNTER_MID_INC(stat, mid)
+#define SDPSTATS_HIST_LINEAR(stat, size)
+#define SDPSTATS_HIST(stat, size)
+
+/*
+ * Undo the DMA mappings of one ring slot.
+ *
+ * Walks the mbuf chain in sbuf->mb and unmaps sbuf->mapping[i] for the i-th
+ * mbuf, using that mbuf's m_len as the mapping length. The chain itself is
+ * not freed. The index is not bounded against the size of mapping[], so the
+ * chain must not be longer than the array.
+ */
+static inline void
+sdp_cleanup_sdp_buf(struct sdp_sock *ssk, struct sdp_buf *sbuf,
+    enum dma_data_direction dir)
+{
+	struct ib_device *ibdev = ssk->ib_device;
+	struct mbuf *seg = sbuf->mb;
+	int idx = 0;
+
+	while (seg != NULL) {
+		ib_dma_unmap_single(ibdev, sbuf->mapping[idx], seg->m_len, dir);
+		seg = seg->m_next;
+		idx++;
+	}
+}
+
+/* sdp_main.c */
+void sdp_set_default_moderation(struct sdp_sock *ssk);
+void sdp_start_keepalive_timer(struct socket *sk);
+void sdp_urg(struct sdp_sock *ssk, struct mbuf *mb);
+void sdp_cancel_dreq_wait_timeout(struct sdp_sock *ssk);
+void sdp_abort(struct socket *sk);
+struct sdp_sock *sdp_notify(struct sdp_sock *ssk, int error);
+
+
+/* sdp_cma.c */
+int sdp_cma_handler(struct rdma_cm_id *, struct rdma_cm_event *);
+
+/* sdp_tx.c */
+int sdp_tx_ring_create(struct sdp_sock *ssk, struct ib_device *device);
+void sdp_tx_ring_destroy(struct sdp_sock *ssk);
+int sdp_xmit_poll(struct sdp_sock *ssk, int force);
+void sdp_post_send(struct sdp_sock *ssk, struct mbuf *mb);
+void sdp_post_sends(struct sdp_sock *ssk, int wait);
+void sdp_post_keepalive(struct sdp_sock *ssk);
+
+/* sdp_rx.c */
+void sdp_rx_ring_init(struct sdp_sock *ssk);
+int sdp_rx_ring_create(struct sdp_sock *ssk, struct ib_device *device);
+void sdp_rx_ring_destroy(struct sdp_sock *ssk);
+int sdp_resize_buffers(struct sdp_sock *ssk, u32 new_size);
+int sdp_init_buffers(struct sdp_sock *ssk, u32 new_size);
+void sdp_do_posts(struct sdp_sock *ssk);
+void sdp_rx_comp_full(struct sdp_sock *ssk);
+
+/* sdp_zcopy.c */
+int sdp_sendmsg_zcopy(struct kiocb *iocb, struct socket *sk, struct iovec *iov);
+int sdp_handle_srcavail(struct sdp_sock *ssk, struct sdp_srcah *srcah);
+void sdp_handle_sendsm(struct sdp_sock *ssk, u32 mseq_ack);
+void sdp_handle_rdma_read_compl(struct sdp_sock *ssk, u32 mseq_ack,
+		u32 bytes_completed);
+int sdp_handle_rdma_read_cqe(struct sdp_sock *ssk);
+int sdp_rdma_to_iovec(struct socket *sk, struct iovec *iov, struct mbuf *mb,
+		unsigned long *used);
+int sdp_post_rdma_rd_compl(struct sdp_sock *ssk,
+		struct rx_srcavail_state *rx_sa);
+int sdp_post_sendsm(struct socket *sk);
+void srcavail_cancel_timeout(struct work_struct *work);
+void sdp_abort_srcavail(struct socket *sk);
+void sdp_abort_rdma_read(struct socket *sk);
+int sdp_process_rx(struct sdp_sock *ssk);
+
+#endif
diff --git a/sys/ofed/drivers/infiniband/ulp/sdp/sdp_bcopy.c b/sys/ofed/drivers/infiniband/ulp/sdp/sdp_bcopy.c
new file mode 100644
index 0000000..d068852
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/sdp/sdp_bcopy.c
@@ -0,0 +1,258 @@
+/*
+ * Copyright (c) 2006 Mellanox Technologies Ltd.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id$
+ */
+#include "sdp.h"
+
+static void sdp_nagle_timeout(void *data);
+
+#ifdef CONFIG_INFINIBAND_SDP_DEBUG_DATA
+/*
+ * Build a one-line textual summary of an SDP message header and print it
+ * through _sdp_printk() at KERN_WARNING level.
+ *
+ *   func, line - call site, passed straight to _sdp_printk().
+ *   sk         - socket handed to _sdp_printk() for the log prefix.
+ *   str        - caller-supplied tag; it is printed both inside the summary
+ *                and in front of it.
+ *   mb         - mbuf associated with the message; only its address is
+ *                printed.
+ *   h          - base SDP header.  Depending on h->mid it is reinterpreted
+ *                as a Hello/HelloAck header, or the bytes following it are
+ *                decoded as the message-specific extension header.
+ *
+ * The summary is assembled in a fixed-size stack buffer.  snprintf() returns
+ * the length the output would have had, so the running length is clamped
+ * after the first write; without the clamp a long first part would move
+ * 'buf + len' and the remaining-size argument past the end of the buffer.
+ */
+void _dump_packet(const char *func, int line, struct socket *sk, char *str,
+		struct mbuf *mb, const struct sdp_bsdh *h)
+{
+	const struct sdp_hh *hh;
+	const struct sdp_hah *hah;
+	const struct sdp_chrecvbuf *req_size;
+	const struct sdp_rrch *rrch;
+	const struct sdp_srcah *srcah;
+	int len;
+	char buf[256];
+
+	/* Common part: fields of the base header. */
+	len = snprintf(buf, sizeof(buf), "%s mb: %p mid: %2x:%-20s flags: 0x%x "
+			"bufs: 0x%x len: 0x%x mseq: 0x%x mseq_ack: 0x%x | ",
+			str, mb, h->mid, mid2str(h->mid), h->flags,
+			ntohs(h->bufs), ntohl(h->len), ntohl(h->mseq),
+			ntohl(h->mseq_ack));
+
+	/* Keep 'len' inside the buffer even if the output was truncated. */
+	if (len < 0) {
+		len = 0;
+		buf[0] = '\0';
+	} else if (len >= (int)sizeof(buf)) {
+		len = sizeof(buf) - 1;
+	}
+
+	/*
+	 * Message-specific part.  Each case appends at most once, and
+	 * snprintf() always NUL-terminates within the size it is given, so
+	 * no further bookkeeping of 'len' is needed.
+	 */
+	switch (h->mid) {
+	case SDP_MID_HELLO:
+		hh = (const struct sdp_hh *)h;
+		snprintf(buf + len, sizeof(buf) - len,
+				"max_adverts: %d  majv_minv: 0x%x "
+				"localrcvsz: 0x%x desremrcvsz: 0x%x |",
+				hh->max_adverts, hh->majv_minv,
+				ntohl(hh->localrcvsz),
+				ntohl(hh->desremrcvsz));
+		break;
+	case SDP_MID_HELLO_ACK:
+		hah = (const struct sdp_hah *)h;
+		snprintf(buf + len, sizeof(buf) - len, "actrcvz: 0x%x |",
+				ntohl(hah->actrcvsz));
+		break;
+	case SDP_MID_CHRCVBUF:
+	case SDP_MID_CHRCVBUF_ACK:
+		req_size = (const struct sdp_chrecvbuf *)(h+1);
+		snprintf(buf + len, sizeof(buf) - len, "req_size: 0x%x |",
+				ntohl(req_size->size));
+		break;
+	case SDP_MID_DATA:
+		/* The difference has type size_t; cast to match %lx. */
+		snprintf(buf + len, sizeof(buf) - len, "data_len: 0x%lx |",
+			(unsigned long)(ntohl(h->len) -
+			sizeof(struct sdp_bsdh)));
+		break;
+	case SDP_MID_RDMARDCOMPL:
+		rrch = (const struct sdp_rrch *)(h+1);
+
+		snprintf(buf + len, sizeof(buf) - len, " | len: 0x%x |",
+				ntohl(rrch->len));
+		break;
+	case SDP_MID_SRCAVAIL:
+		srcah = (const struct sdp_srcah *)(h+1);
+
+		snprintf(buf + len, sizeof(buf) - len, " | payload: 0x%lx, "
+				"len: 0x%x, rkey: 0x%x, vaddr: 0x%jx |",
+				(unsigned long)(ntohl(h->len) -
+				sizeof(struct sdp_bsdh) -
+				sizeof(struct sdp_srcah)),
+				ntohl(srcah->len), ntohl(srcah->rkey),
+				(uintmax_t)be64_to_cpu(srcah->vaddr));
+		break;
+	default:
+		break;
+	}
+	_sdp_printk(func, line, KERN_WARNING, sk, "%s: %s\n", str, buf);
+}
+#endif
+
+/*
+ * Nagle decision for the packet 'mb' queued on 'ssk'.
+ *
+ * Returns non-zero when the packet should be posted immediately: it is a
+ * zero-copy buffer (only with SDP_ZCOPY), it is not an SDP_MID_DATA message,
+ * SDP_NODELAY is set, no earlier small send is outstanding
+ * (nagle_last_unacked == 0), its length is at least a quarter of
+ * xmit_size_goal, or it carries M_PUSH.
+ *
+ * Side effects: on "send now" the current tx ring head is recorded in
+ * nagle_last_unacked; otherwise the nagle timer is armed unless it is
+ * already pending.
+ */
+static inline int
+sdp_nagle_off(struct sdp_sock *ssk, struct mbuf *mb)
+{
+	int send_now;
+
+#ifdef SDP_ZCOPY
+	if (BZCOPY_STATE(mb)) {
+		send_now = 1;
+	} else
+#endif
+	{
+		struct sdp_bsdh *bsdh = mtod(mb, struct sdp_bsdh *);
+
+		send_now = unlikely(bsdh->mid != SDP_MID_DATA) ||
+		    (ssk->flags & SDP_NODELAY) ||
+		    !ssk->nagle_last_unacked ||
+		    mb->m_pkthdr.len >= ssk->xmit_size_goal / 4 ||
+		    (mb->m_flags & M_PUSH);
+	}
+
+	if (!send_now) {
+		/* Hold the packet back; make sure the timer will flush it. */
+		if (!callout_pending(&ssk->nagle_timer)) {
+			callout_reset(&ssk->nagle_timer, SDP_NAGLE_TIMEOUT,
+			    sdp_nagle_timeout, ssk);
+			sdp_dbg_data(ssk->socket, "Starting nagle timer\n");
+		}
+	} else {
+		unsigned long head_seq = ring_head(ssk->tx_ring);
+
+		ssk->nagle_last_unacked = head_seq;
+	}
+	sdp_dbg_data(ssk->socket, "send_now = %d last_unacked = %ld\n",
+		send_now, ssk->nagle_last_unacked);
+
+	return send_now;
+}
+
+/*
+ * Callout handler for ssk->nagle_timer; 'data' is the struct sdp_sock the
+ * timer was armed with.
+ *
+ * When the callout is still active it is deactivated.  If a held-back send
+ * is outstanding (nagle_last_unacked != 0) and the socket is not closed,
+ * nagle_last_unacked is cleared, queued data is pushed out with
+ * sdp_post_sends() and writers are woken.  The timer is re-armed as long as
+ * the send buffer still has data at sb_sndptr.
+ */
+static void
+sdp_nagle_timeout(void *data)
+{
+	struct sdp_sock *ssk = (struct sdp_sock *)data;
+	struct socket *sk = ssk->socket;
+
+	sdp_dbg_data(sk, "last_unacked = %ld\n", ssk->nagle_last_unacked);
+
+	/* The callout was stopped or rescheduled after it fired: do nothing. */
+	if (!callout_active(&ssk->nagle_timer))
+		return;
+	callout_deactivate(&ssk->nagle_timer);
+
+	/* Nothing is being held back; only decide whether to re-arm. */
+	if (!ssk->nagle_last_unacked)
+		goto out;
+	/* Closed socket: return without sending and without re-arming. */
+	if (ssk->state == TCPS_CLOSED)
+		return;
+	/* A zero value makes sdp_nagle_off() answer "send now". */
+	ssk->nagle_last_unacked = 0;
+	sdp_post_sends(ssk, M_DONTWAIT);
+
+	sowwakeup(ssk->socket);
+out:
+	if (sk->so_snd.sb_sndptr)
+		callout_reset(&ssk->nagle_timer, SDP_NAGLE_TIMEOUT,
+		    sdp_nagle_timeout, ssk);
+}
+
+/*
+ * Post as many pending messages as tx credits and tx ring slots allow.
+ *
+ *   ssk  - connection to service.
+ *   wait - allocation flag handed to the sdp_alloc_mb_*() helpers.
+ *
+ * In order, the function may post: a ChRcvBuf ack for an outstanding resize
+ * request, queued data packets from the socket send buffer (subject to
+ * sdp_nagle_off()), a data message without payload when a credit update is
+ * needed, and finally a disconnect message when SDP_NEEDFIN is set and the
+ * send buffer is drained.  If anything was posted or the ring is nearly
+ * full, completions are polled and the whole sequence is repeated while
+ * sdp_xmit_poll() reports progress.
+ *
+ * On mbuf allocation failure the nagle timer is armed with a timeout of 1
+ * so that sdp_nagle_timeout() retries the send.
+ */
+void
+sdp_post_sends(struct sdp_sock *ssk, int wait)
+{
+	struct mbuf *mb;
+	int post_count = 0;
+	struct socket *sk;
+	int low;
+
+	sk = ssk->socket;
+	/* No cm id: nothing can be sent; report a reset if data is queued. */
+	if (unlikely(!ssk->id)) {
+		if (sk->so_snd.sb_sndptr) {
+			sdp_dbg(ssk->socket,
+				"Send on socket without cmid ECONNRESET.\n");
+			sdp_notify(ssk, ECONNRESET);
+		}
+		return;
+	}
+again:
+	/* Reclaim completed sends once less than half the ring is free. */
+	if (sdp_tx_ring_slots_left(ssk) < SDP_TX_SIZE / 2)
+		sdp_xmit_poll(ssk,  1);
+
+	/*
+	 * Acknowledge a pending receive-buffer resize request once the rx
+	 * ring tail has reached recv_request_head.
+	 */
+	if (ssk->recv_request &&
+	    ring_tail(ssk->rx_ring) >= ssk->recv_request_head &&
+	    tx_credits(ssk) >= SDP_MIN_TX_CREDITS &&
+	    sdp_tx_ring_slots_left(ssk)) {
+		mb = sdp_alloc_mb_chrcvbuf_ack(sk,
+		    ssk->recv_bytes - SDP_HEAD_SIZE, wait);
+		if (mb == NULL)
+			goto allocfail;
+		ssk->recv_request = 0;
+		sdp_post_send(ssk, mb);
+		post_count++;
+	}
+
+	/*
+	 * Statistics only: data is ready and would be sent, but there are
+	 * not enough credits.  Note that sdp_nagle_off() is still evaluated
+	 * here, including its side effects.
+	 */
+	if (tx_credits(ssk) <= SDP_MIN_TX_CREDITS &&
+	    sdp_tx_ring_slots_left(ssk) && sk->so_snd.sb_sndptr &&
+	    sdp_nagle_off(ssk, sk->so_snd.sb_sndptr)) {
+		SDPSTATS_COUNTER_INC(send_miss_no_credits);
+	}
+
+	/* Drain queued packets while credits, ring slots and Nagle allow. */
+	while (tx_credits(ssk) > SDP_MIN_TX_CREDITS &&
+	    sdp_tx_ring_slots_left(ssk) && (mb = sk->so_snd.sb_sndptr) &&
+	    sdp_nagle_off(ssk, mb)) {
+		struct mbuf *n;
+
+		/*
+		 * Unlink the packet from the head of the send buffer and
+		 * release its sockbuf accounting mbuf by mbuf.
+		 */
+		SOCKBUF_LOCK(&sk->so_snd);
+		sk->so_snd.sb_sndptr = mb->m_nextpkt;
+		sk->so_snd.sb_mb = mb->m_nextpkt;
+		mb->m_nextpkt = NULL;
+		SB_EMPTY_FIXUP(&sk->so_snd);
+		for (n = mb; n != NULL; n = n->m_next)
+			sbfree(&sk->so_snd, n);
+		SOCKBUF_UNLOCK(&sk->so_snd);
+		sdp_post_send(ssk, mb);
+		post_count++;
+	}
+
+	/* Send a data message built by sdp_alloc_mb_data() to update credits. */
+	if (credit_update_needed(ssk) && ssk->state >= TCPS_ESTABLISHED &&
+	    ssk->state < TCPS_FIN_WAIT_2) {
+		mb = sdp_alloc_mb_data(ssk->socket, wait);
+		if (mb == NULL)
+			goto allocfail;
+		sdp_post_send(ssk, mb);
+
+		SDPSTATS_COUNTER_INC(post_send_credits);
+		post_count++;
+	}
+
+	/* send DisConn if needed
+	 * Do not send DisConn if there is only 1 credit. Compliance with CA4-82
+	 * If one credit is available, an implementation shall only send SDP
+	 * messages that provide additional credits and also do not contain ULP
+	 * payload. */
+	if ((ssk->flags & SDP_NEEDFIN) && !sk->so_snd.sb_sndptr &&
+	    tx_credits(ssk) > 1) {
+		mb = sdp_alloc_mb_disconnect(sk, wait);
+		if (mb == NULL)
+			goto allocfail;
+		ssk->flags &= ~SDP_NEEDFIN;
+		sdp_post_send(ssk, mb);
+		post_count++;
+	}
+	/* Ring nearly full: arm the tx CQ and force a poll. */
+	low = (sdp_tx_ring_slots_left(ssk) <= SDP_MIN_TX_CREDITS);
+	if (post_count || low) {
+		if (low)
+			sdp_arm_tx_cq(ssk);
+		if (sdp_xmit_poll(ssk, low))
+			goto again;
+	}
+	return;
+
+allocfail:
+	/*
+	 * A non-zero nagle_last_unacked makes sdp_nagle_timeout() call
+	 * sdp_post_sends() again when the timer fires.
+	 */
+	ssk->nagle_last_unacked = -1;
+	callout_reset(&ssk->nagle_timer, 1, sdp_nagle_timeout, ssk);
+	return;
+}
diff --git a/sys/ofed/drivers/infiniband/ulp/sdp/sdp_cma.c b/sys/ofed/drivers/infiniband/ulp/sdp/sdp_cma.c
new file mode 100644
index 0000000..9350609
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/sdp/sdp_cma.c
@@ -0,0 +1,456 @@
+/*
+ * Copyright (c) 2006 Mellanox Technologies Ltd.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id$
+ */
+#include "sdp.h"
+
+/* Value placed in the majv_minv field of outgoing Hello and HelloAck. */
+#define SDP_MAJV_MINV 0x22
+
+/*
+ * Module parameter (default 1).  When non-zero, address resolution in
+ * sdp_cma_handler() fails with -ENETUNREACH for IB-transport devices whose
+ * port link layer is not Infiniband.
+ */
+SDP_MODPARAM_SINT(sdp_link_layer_ib_only, 1, "Support only link layer of "
+		"type Infiniband");
+
+/*
+ * NOTE(review): presumably the on-wire sizes of the Hello and HelloAck
+ * headers; neither constant is referenced in the code visible here.
+ */
+enum {
+	SDP_HH_SIZE = 76,
+	SDP_HAH_SIZE = 180,
+};
+
+/*
+ * QP event callback installed through qp_init_attr.event_handler in
+ * sdp_init_qp().  The body is empty: every event is ignored.
+ */
+static void
+sdp_qp_event_handler(struct ib_event *event, void *data)
+{
+}
+
+/*
+ * Return the max_sge attribute reported by ib_query_device() for 'dev'.
+ *
+ * The first positive value obtained is cached in a function-local static
+ * and returned by every later call, whichever device is passed.
+ * NOTE(review): the cache is shared by all devices; confirm this is
+ * intended on hosts with HCAs that report different limits.
+ *
+ * If the query fails, 0 is returned and nothing is cached, so the attribute
+ * structure is never read uninitialized and the next call retries.
+ */
+static int
+sdp_get_max_dev_sge(struct ib_device *dev)
+{
+	struct ib_device_attr attr;
+	static int max_sges = -1;
+	int rc;
+
+	if (max_sges > 0)
+		return max_sges;
+
+	rc = ib_query_device(dev, &attr);
+	if (rc) {
+		printk(KERN_WARNING "SDP: ib_query_device failed: %d\n", rc);
+		return 0;
+	}
+
+	max_sges = attr.max_sge;
+
+	return max_sges;
+}
+
+/*
+ * Set up the verbs resources of 'sk' on the device behind 'id': the rx
+ * ring, the tx ring and a reliable-connected QP bound to both rings' CQs.
+ *
+ * Returns 0 on success.  Returns -ENODEV when the device carries no SDP
+ * client data, otherwise the error reported by ring creation or by
+ * rdma_create_qp().  Rings created before a failure are destroyed again.
+ */
+static int
+sdp_init_qp(struct socket *sk, struct rdma_cm_id *id)
+{
+	struct ib_qp_init_attr qp_init_attr;
+	struct ib_device *device = id->device;
+	struct sdp_sock *ssk = sdp_sk(sk);
+	int rc;
+
+	memset(&qp_init_attr, 0, sizeof(qp_init_attr));
+	qp_init_attr.event_handler = sdp_qp_event_handler;
+	qp_init_attr.cap.max_send_wr = SDP_TX_SIZE;
+	qp_init_attr.cap.max_recv_wr = SDP_RX_SIZE;
+	qp_init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+	qp_init_attr.qp_type = IB_QPT_RC;
+
+	sdp_dbg(sk, "%s\n", __func__);
+
+	/* Never ask for more SGEs than the device offers. */
+	ssk->max_sge = sdp_get_max_dev_sge(device);
+	sdp_dbg(sk, "Max sges: %d\n", ssk->max_sge);
+
+	qp_init_attr.cap.max_send_sge = MIN(ssk->max_sge, SDP_MAX_SEND_SGES);
+	sdp_dbg(sk, "Setting max send sge to: %d\n",
+	    qp_init_attr.cap.max_send_sge);
+
+	qp_init_attr.cap.max_recv_sge = MIN(ssk->max_sge, SDP_MAX_RECV_SGES);
+	sdp_dbg(sk, "Setting max recv sge to: %d\n",
+	    qp_init_attr.cap.max_recv_sge);
+
+	ssk->sdp_dev = ib_get_client_data(device, &sdp_client);
+	if (!ssk->sdp_dev) {
+		sdp_warn(sk, "SDP not available on device %s\n", device->name);
+		return -ENODEV;
+	}
+
+	rc = sdp_rx_ring_create(ssk, device);
+	if (rc)
+		return rc;
+
+	rc = sdp_tx_ring_create(ssk, device);
+	if (rc) {
+		sdp_rx_ring_destroy(ssk);
+		return rc;
+	}
+
+	qp_init_attr.recv_cq = ssk->rx_ring.cq;
+	qp_init_attr.send_cq = ssk->tx_ring.cq;
+
+	rc = rdma_create_qp(id, ssk->sdp_dev->pd, &qp_init_attr);
+	if (rc) {
+		sdp_warn(sk, "Unable to create QP: %d.\n", rc);
+		/* Unwind in reverse order of creation. */
+		sdp_tx_ring_destroy(ssk);
+		sdp_rx_ring_destroy(ssk);
+		return rc;
+	}
+
+	ssk->qp = id->qp;
+	ssk->ib_device = device;
+	ssk->qp_active = 1;
+	ssk->context.device = device;
+
+	sdp_dbg(sk, "%s done\n", __func__);
+	return 0;
+}
+
+/*
+ * Handle an incoming connection request on the listening socket 'sk'.
+ *
+ * The Hello header is taken from the event's private data.  A child socket
+ * is created with sonewconn(), its QP and rings are set up on 'id', and the
+ * child's sdp_sock is filled in from the Hello header and the cm id's
+ * addresses before it is moved to TCPS_SYN_RECEIVED.  On success
+ * id->context points at the child's sdp_sock.
+ *
+ * Returns 0 on success, -EINVAL when the Hello advertises max_adverts == 0,
+ * -ENOMEM when no child socket could be created, or the error from
+ * sdp_init_qp().
+ */
+static int
+sdp_connect_handler(struct socket *sk, struct rdma_cm_id *id,
+    struct rdma_cm_event *event)
+{
+	struct sockaddr_in *src_addr;
+	struct sockaddr_in *dst_addr;
+	struct socket *child;
+	const struct sdp_hh *h;
+	struct sdp_sock *ssk;
+	int rc;
+
+	sdp_dbg(sk, "%s %p -> %p\n", __func__, sdp_sk(sk)->id, id);
+
+	/*
+	 * NOTE(review): the private data length is not compared against
+	 * sizeof(struct sdp_hh) before the header is read - confirm the CM
+	 * layer guarantees it.
+	 */
+	h = event->param.conn.private_data;
+	SDP_DUMP_PACKET(sk, "RX", NULL, &h->bsdh);
+
+	if (!h->max_adverts)
+		return -EINVAL;
+
+	child = sonewconn(sk, SS_ISCONNECTED);
+	if (!child)
+		return -ENOMEM;
+
+	ssk = sdp_sk(child);
+	rc = sdp_init_qp(child, id);
+	/*
+	 * NOTE(review): on failure the freshly created child socket is not
+	 * torn down in this function - verify that it is released elsewhere.
+	 */
+	if (rc)
+		return rc;
+	SDP_WLOCK(ssk);
+	id->context = ssk;
+	ssk->id = id;
+	ssk->socket = child;
+	ssk->cred = crhold(child->so_cred);
+	dst_addr = (struct sockaddr_in *)&id->route.addr.dst_addr;
+	src_addr = (struct sockaddr_in *)&id->route.addr.src_addr;
+	ssk->fport = dst_addr->sin_port;
+	ssk->faddr = dst_addr->sin_addr.s_addr;
+	ssk->lport = src_addr->sin_port;
+	/* Initial tx credits are the buffer count advertised by the peer. */
+	ssk->max_bufs = ntohs(h->bsdh.bufs);
+	atomic_set(&ssk->tx_ring.credits, ssk->max_bufs);
+	ssk->min_bufs = tx_credits(ssk) / 4;
+	ssk->xmit_size_goal = ntohl(h->localrcvsz) - sizeof(struct sdp_bsdh);
+	/* NOTE(review): the return value of sdp_init_buffers() is ignored. */
+	sdp_init_buffers(ssk, rcvbuf_initial_size);
+	ssk->state = TCPS_SYN_RECEIVED;
+	SDP_WUNLOCK(ssk);
+
+	return 0;
+}
+
+/*
+ * Handle the peer's HelloAck on the connecting socket 'sk'.
+ *
+ * The socket is moved to TCPS_ESTABLISHED and default moderation is
+ * applied.  Unless the connection has already been dropped, the keepalive
+ * timer is started when SO_KEEPALIVE is set, credit and size limits are
+ * taken from the HelloAck in the event's private data, the foreign address
+ * is recorded from 'id', and the socket is marked connected.
+ *
+ * Always returns 0.
+ */
+static int
+sdp_response_handler(struct socket *sk, struct rdma_cm_id *id,
+    struct rdma_cm_event *event)
+{
+	struct sdp_sock *ssk;
+
+	sdp_dbg(sk, "%s\n", __func__);
+
+	ssk = sdp_sk(sk);
+	SDP_WLOCK(ssk);
+	ssk->state = TCPS_ESTABLISHED;
+	sdp_set_default_moderation(ssk);
+	if (!(ssk->flags & SDP_DROPPED)) {
+		const struct sdp_hah *hah;
+		struct sockaddr_in *peer;
+
+		if (sk->so_options & SO_KEEPALIVE)
+			sdp_start_keepalive_timer(sk);
+
+		hah = event->param.conn.private_data;
+		SDP_DUMP_PACKET(sk, "RX", NULL, &hah->bsdh);
+
+		/* Credits start at the buffer count the peer advertised. */
+		ssk->max_bufs = ntohs(hah->bsdh.bufs);
+		atomic_set(&ssk->tx_ring.credits, ssk->max_bufs);
+		ssk->min_bufs = tx_credits(ssk) / 4;
+		ssk->xmit_size_goal =
+		    ntohl(hah->actrcvsz) - sizeof(struct sdp_bsdh);
+		ssk->poll_cq = 1;
+
+		peer = (struct sockaddr_in *)&id->route.addr.dst_addr;
+		ssk->fport = peer->sin_port;
+		ssk->faddr = peer->sin_addr.s_addr;
+		soisconnected(sk);
+	}
+	SDP_WUNLOCK(ssk);
+
+	return 0;
+}
+
+/*
+ * Mark the connection behind 'sk' as established.
+ *
+ * Under the socket's write lock the state becomes TCPS_ESTABLISHED, default
+ * moderation is applied, the keepalive timer is started when SO_KEEPALIVE
+ * is set, and the socket is reported connected unless it has been dropped.
+ * 'event' is not used.  Always returns 0.
+ */
+static int
+sdp_connected_handler(struct socket *sk, struct rdma_cm_event *event)
+{
+	struct sdp_sock *ssk = sdp_sk(sk);
+
+	sdp_dbg(sk, "%s\n", __func__);
+
+	SDP_WLOCK(ssk);
+
+	ssk->state = TCPS_ESTABLISHED;
+	sdp_set_default_moderation(ssk);
+	if (sk->so_options & SO_KEEPALIVE)
+		sdp_start_keepalive_timer(sk);
+	if (!(ssk->flags & SDP_DROPPED))
+		soisconnected(sk);
+
+	SDP_WUNLOCK(ssk);
+
+	return 0;
+}
+
+/*
+ * React to the teardown of the connection behind 'sk'.  The caller must
+ * hold the socket's write lock.
+ *
+ * A socket still in TCPS_SYN_RECEIVED is first run through
+ * sdp_connected_handler(); if rcv_nxt() is then non-zero the function
+ * returns 0.  In every other case it returns -ECONNRESET.
+ *
+ * NOTE(review): sdp_connected_handler() takes and releases SDP_WLOCK on the
+ * same sdp_sock that this function asserts is already write-locked.
+ * Confirm the lock allows recursion and that callers cope with the unlock.
+ */
+static int
+sdp_disconnected_handler(struct socket *sk)
+{
+	struct sdp_sock *ssk = sdp_sk(sk);
+
+	sdp_dbg(sk, "%s\n", __func__);
+
+	SDP_WLOCK_ASSERT(ssk);
+
+	if (ssk->state != TCPS_SYN_RECEIVED)
+		return -ECONNRESET;
+
+	sdp_connected_handler(sk, NULL);
+
+	return rcv_nxt(ssk) ? 0 : -ECONNRESET;
+}
+
+/*
+ * RDMA connection manager event callback for SDP sockets.
+ *
+ *   id    - cm id the event belongs to; id->context holds the sdp_sock.
+ *   event - the event being delivered.
+ *
+ * Each event either advances connection setup (address -> route -> QP and
+ * Hello -> HelloAck -> established), handles teardown, or is mapped to a
+ * negative errno.  When the resulting status is non-zero and the sdp_sock
+ * still refers to 'id', the two are detached from each other and the error
+ * is reported through sdp_notify().
+ *
+ * Returns the status computed for the event.  When the cm id is already
+ * being torn down it returns -EINVAL for a connect request and 0 otherwise.
+ */
+int
+sdp_cma_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
+{
+	struct rdma_conn_param conn_param;
+	struct socket *sk;
+	struct sdp_sock *ssk;
+	struct sdp_hah hah;
+	struct sdp_hh hh;
+
+	int rc = 0;
+
+	ssk = id->context;
+	sk = NULL;
+	if (ssk)
+		sk = ssk->socket;
+	/* No usable socket behind this id any more: ignore the event. */
+	if (!ssk || !sk || !ssk->id) {
+		sdp_dbg(sk,
+		    "cm_id is being torn down, event %d, ssk %p, sk %p, id %p\n",
+		       	event->event, ssk, sk, id);
+		return event->event == RDMA_CM_EVENT_CONNECT_REQUEST ?
+			-EINVAL : 0;
+	}
+
+	sdp_dbg(sk, "%s event %d id %p\n", __func__, event->event, id);
+	switch (event->event) {
+	case RDMA_CM_EVENT_ADDR_RESOLVED:
+		sdp_dbg(sk, "RDMA_CM_EVENT_ADDR_RESOLVED\n");
+
+		/*
+		 * With sdp_link_layer_ib_only set, refuse IB-transport
+		 * devices whose port link layer is not Infiniband.
+		 */
+		if (sdp_link_layer_ib_only &&
+			rdma_node_get_transport(id->device->node_type) == 
+				RDMA_TRANSPORT_IB &&
+			rdma_port_get_link_layer(id->device, id->port_num) !=
+				IB_LINK_LAYER_INFINIBAND) {
+			sdp_dbg(sk, "Link layer is: %d. Only IB link layer "
+				"is allowed\n",
+				rdma_port_get_link_layer(id->device, id->port_num));
+			rc = -ENETUNREACH;
+			break;
+		}
+
+		rc = rdma_resolve_route(id, SDP_ROUTE_TIMEOUT);
+		break;
+	case RDMA_CM_EVENT_ADDR_ERROR:
+		sdp_dbg(sk, "RDMA_CM_EVENT_ADDR_ERROR\n");
+		rc = -ENETUNREACH;
+		break;
+	case RDMA_CM_EVENT_ROUTE_RESOLVED:
+		/* Active side: create the QP, then send Hello via rdma_connect(). */
+		sdp_dbg(sk, "RDMA_CM_EVENT_ROUTE_RESOLVED : %p\n", id);
+		rc = sdp_init_qp(sk, id);
+		if (rc)
+			break;
+		/*
+		 * NOTE(review): remote_credits is sampled here, before
+		 * sdp_init_buffers() runs below, while hh.bsdh.bufs is
+		 * sampled after it - confirm the ordering is intended.
+		 */
+		atomic_set(&sdp_sk(sk)->remote_credits,
+				rx_ring_posted(sdp_sk(sk)));
+		memset(&hh, 0, sizeof hh);
+		hh.bsdh.mid = SDP_MID_HELLO;
+		hh.bsdh.len = htonl(sizeof(struct sdp_hh));
+		hh.max_adverts = 1;
+		hh.ipv_cap = 0x40;
+		hh.majv_minv = SDP_MAJV_MINV;
+		sdp_init_buffers(sdp_sk(sk), rcvbuf_initial_size);
+		hh.bsdh.bufs = htons(rx_ring_posted(sdp_sk(sk)));
+		hh.localrcvsz = hh.desremrcvsz = htonl(sdp_sk(sk)->recv_bytes);
+		/* Same value as already assigned to max_adverts above. */
+		hh.max_adverts = 0x1;
+		sdp_sk(sk)->laddr = 
+			((struct sockaddr_in *)&id->route.addr.src_addr)->sin_addr.s_addr;
+		memset(&conn_param, 0, sizeof conn_param);
+		conn_param.private_data_len = sizeof hh;
+		conn_param.private_data = &hh;
+		conn_param.responder_resources = 4 /* TODO */;
+		conn_param.initiator_depth = 4 /* TODO */;
+		conn_param.retry_count = SDP_RETRY_COUNT;
+		SDP_DUMP_PACKET(NULL, "TX", NULL, &hh.bsdh);
+		rc = rdma_connect(id, &conn_param);
+		break;
+	case RDMA_CM_EVENT_ROUTE_ERROR:
+		sdp_dbg(sk, "RDMA_CM_EVENT_ROUTE_ERROR : %p\n", id);
+		rc = -ETIMEDOUT;
+		break;
+	case RDMA_CM_EVENT_CONNECT_REQUEST:
+		/* Passive side: create the child, then answer with HelloAck. */
+		sdp_dbg(sk, "RDMA_CM_EVENT_CONNECT_REQUEST\n");
+		rc = sdp_connect_handler(sk, id, event);
+		if (rc) {
+			sdp_dbg(sk, "Destroying qp\n");
+			rdma_reject(id, NULL, 0);
+			break;
+		}
+		/* From here on ssk is the child set up by sdp_connect_handler(). */
+		ssk = id->context;
+		atomic_set(&ssk->remote_credits, rx_ring_posted(ssk));
+		memset(&hah, 0, sizeof hah);
+		hah.bsdh.mid = SDP_MID_HELLO_ACK;
+		hah.bsdh.bufs = htons(rx_ring_posted(ssk));
+		hah.bsdh.len = htonl(sizeof(struct sdp_hah));
+		hah.majv_minv = SDP_MAJV_MINV;
+		hah.ext_max_adverts = 1; /* Doesn't seem to be mandated by spec,
+					    but just in case */
+		hah.actrcvsz = htonl(ssk->recv_bytes);
+		memset(&conn_param, 0, sizeof conn_param);
+		conn_param.private_data_len = sizeof hah;
+		conn_param.private_data = &hah;
+		conn_param.responder_resources = 4 /* TODO */;
+		conn_param.initiator_depth = 4 /* TODO */;
+		conn_param.retry_count = SDP_RETRY_COUNT;
+		SDP_DUMP_PACKET(sk, "TX", NULL, &hah.bsdh);
+		rc = rdma_accept(id, &conn_param);
+		/* Accept failed: detach the child from the cm id. */
+		if (rc) {
+			ssk->id = NULL;
+			id->qp = NULL;
+			id->context = NULL;
+		}
+		break;
+	case RDMA_CM_EVENT_CONNECT_RESPONSE:
+		sdp_dbg(sk, "RDMA_CM_EVENT_CONNECT_RESPONSE\n");
+		rc = sdp_response_handler(sk, id, event);
+		if (rc) {
+			sdp_dbg(sk, "Destroying qp\n");
+			rdma_reject(id, NULL, 0);
+		} else
+			rc = rdma_accept(id, NULL);
+		break;
+	case RDMA_CM_EVENT_CONNECT_ERROR:
+		sdp_dbg(sk, "RDMA_CM_EVENT_CONNECT_ERROR\n");
+		rc = -ETIMEDOUT;
+		break;
+	case RDMA_CM_EVENT_UNREACHABLE:
+		sdp_dbg(sk, "RDMA_CM_EVENT_UNREACHABLE\n");
+		rc = -ENETUNREACH;
+		break;
+	case RDMA_CM_EVENT_REJECTED:
+		sdp_dbg(sk, "RDMA_CM_EVENT_REJECTED\n");
+		rc = -ECONNREFUSED;
+		break;
+	case RDMA_CM_EVENT_ESTABLISHED:
+		sdp_dbg(sk, "RDMA_CM_EVENT_ESTABLISHED\n");
+		sdp_sk(sk)->laddr = 
+			((struct sockaddr_in *)&id->route.addr.src_addr)->sin_addr.s_addr;
+		rc = sdp_connected_handler(sk, event);
+		break;
+	case RDMA_CM_EVENT_DISCONNECTED: /* This means DREQ/DREP received */
+		sdp_dbg(sk, "RDMA_CM_EVENT_DISCONNECTED\n");
+
+		SDP_WLOCK(ssk);
+		if (ssk->state == TCPS_LAST_ACK) {
+			sdp_cancel_dreq_wait_timeout(ssk);
+
+			sdp_dbg(sk, "%s: waiting for Infiniband tear down\n",
+				__func__);
+		}
+		ssk->qp_active = 0;
+		/* The socket lock is dropped around rdma_disconnect(). */
+		SDP_WUNLOCK(ssk);
+		rdma_disconnect(id);
+		SDP_WLOCK(ssk);
+		if (ssk->state != TCPS_TIME_WAIT) {
+			if (ssk->state == TCPS_CLOSE_WAIT) {
+				sdp_dbg(sk, "IB teardown while in "
+					"TCPS_CLOSE_WAIT taking reference to "
+					"let close() finish the work\n");
+			}
+			/* Any failure here is reported as -EPIPE. */
+			rc = sdp_disconnected_handler(sk);
+			if (rc)
+				rc = -EPIPE;
+		}
+		SDP_WUNLOCK(ssk);
+		break;
+	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
+		sdp_dbg(sk, "RDMA_CM_EVENT_TIMEWAIT_EXIT\n");
+		SDP_WLOCK(ssk);
+		rc = sdp_disconnected_handler(sk);
+		SDP_WUNLOCK(ssk);
+		break;
+	case RDMA_CM_EVENT_DEVICE_REMOVAL:
+		sdp_dbg(sk, "RDMA_CM_EVENT_DEVICE_REMOVAL\n");
+		rc = -ENETRESET;
+		break;
+	default:
+		printk(KERN_ERR "SDP: Unexpected CMA event: %d\n",
+		       event->event);
+		rc = -ECONNABORTED;
+		break;
+	}
+
+	sdp_dbg(sk, "event %d done. status %d\n", event->event, rc);
+
+	/*
+	 * Error path: if the sdp_sock still points at this cm id, detach the
+	 * two and report the (positive) errno through sdp_notify().  The
+	 * lock is released here only when sdp_notify() returns non-NULL.
+	 * NOTE(review): presumably a NULL return means sdp_notify() has
+	 * already disposed of the lock or the sdp_sock - confirm.
+	 */
+	if (rc) {
+		SDP_WLOCK(ssk);
+		if (ssk->id == id) {
+			ssk->id = NULL;
+			id->qp = NULL;
+			id->context = NULL;
+			if (sdp_notify(ssk, -rc))
+				SDP_WUNLOCK(ssk);
+		} else
+			SDP_WUNLOCK(ssk);
+	}
+
+	return rc;
+}
diff --git a/sys/ofed/drivers/infiniband/ulp/sdp/sdp_dbg.h b/sys/ofed/drivers/infiniband/ulp/sdp/sdp_dbg.h
new file mode 100644
index 0000000..188b58b
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/sdp/sdp_dbg.h
@@ -0,0 +1,167 @@
+#ifndef _SDP_DBG_H_
+#define _SDP_DBG_H_
+
+#define SDPSTATS_ON
+
+//#define GETNSTIMEODAY_SUPPORTED
+
+/*
+ * Core logging macro.  Prints 'format' at printk level 'level', prefixed
+ * with the given function name and line, the sdp_sock pointer of 'sk', the
+ * current pid and cpu id, and the socket's local and foreign ports in host
+ * byte order (-1 when there is no socket).
+ *
+ * A "\n" is appended to 'format' here, so callers whose format already ends
+ * in a newline emit an extra empty line.
+ */
+#define _sdp_printk(func, line, level, sk, format, arg...)	\
+do {								\
+	printk(level "%s:%d %p sdp_sock(%d:%d %d:%d): " format "\n",	\
+	       func, line, sk ? sdp_sk(sk) : NULL,		\
+	       curproc->p_pid, PCPU_GET(cpuid),			\
+	       (sk) && sdp_sk(sk) ? ntohs(sdp_sk(sk)->lport) : -1,	\
+	       (sk) && sdp_sk(sk) ? ntohs(sdp_sk(sk)->fport) : -1, ## arg);	\
+} while (0)
+/* _sdp_printk() with the caller's own function name and line number. */
+#define sdp_printk(level, sk, format, arg...)                \
+	_sdp_printk(__func__, __LINE__, level, sk, format, ## arg)
+/* Unconditional message at KERN_WARNING level. */
+#define sdp_warn(sk, format, arg...)                         \
+	sdp_printk(KERN_WARNING, sk, format , ## arg)
+
+/*
+ * Define a file-local (static) int module parameter 'var' with default
+ * 'def_val', mode 0644, and a description that ends with the default value
+ * in brackets.  The last line of the macro ends in a backslash, so the line
+ * that follows it is still part of the definition.
+ */
+#define SDP_MODPARAM_SINT(var, def_val, msg) \
+	static int var = def_val; \
+	module_param_named(var, var, int, 0644); \
+	MODULE_PARM_DESC(var, msg " [" #def_val "]"); \
+
+/* Same as SDP_MODPARAM_SINT, but the variable has external linkage. */
+#define SDP_MODPARAM_INT(var, def_val, msg) \
+	int var = def_val; \
+	module_param_named(var, var, int, 0644); \
+	MODULE_PARM_DESC(var, msg " [" #def_val "]"); \
+
+/*
+ * Optional profiling log, compiled only when SDP_PROFILING is defined.
+ * Without it sdp_prf1() and sdp_prf() expand to nothing.
+ */
+#ifdef SDP_PROFILING
+struct mbuf;
+/* One profiling record, filled in by sdp_prf1(). */
+struct sdpprf_log {
+	int 		idx;
+	int 		pid;
+	int 		cpu;
+	int 		sk_num;
+	int 		sk_dport;
+	struct mbuf 	*mb;
+	char		msg[256];
+
+	unsigned long long time;
+
+	const char 	*func;
+	int 		line;
+};
+
+#define SDPPRF_LOG_SIZE 0x20000 /* must be a power of 2 */
+
+/* Records are stored circularly: the running count is masked to an index. */
+extern struct sdpprf_log sdpprf_log[SDPPRF_LOG_SIZE];
+extern int sdpprf_log_count;
+
+#ifdef GETNSTIMEODAY_SUPPORTED
+/* Current time of day in nanoseconds. */
+static inline unsigned long long current_nsec(void)
+{
+	struct timespec tv;
+	getnstimeofday(&tv);
+	return tv.tv_sec * NSEC_PER_SEC + tv.tv_nsec;
+}
+#else
+/*
+ * NOTE(review): despite the name, this fallback goes through
+ * jiffies_to_usecs(), which by its name yields microseconds - confirm.
+ */
+#define current_nsec() jiffies_to_usecs(jiffies)
+#endif
+
+/*
+ * Append one record to the profiling log and evaluate to 1.  The slot is
+ * chosen by post-incrementing sdpprf_log_count, which happens before
+ * preempt_disable() is called.
+ * NOTE(review): the body relies on current->pid, inet_sk() and
+ * smp_processor_id(), none of which are defined in this header - confirm
+ * they are available before enabling SDP_PROFILING.
+ */
+#define sdp_prf1(sk, s, format, arg...) ({ \
+	struct sdpprf_log *l = \
+		&sdpprf_log[sdpprf_log_count++ & (SDPPRF_LOG_SIZE - 1)]; \
+	preempt_disable(); \
+	l->idx = sdpprf_log_count - 1; \
+	l->pid = current->pid; \
+	l->sk_num = (sk) ? inet_sk(sk)->num : -1;                 \
+	l->sk_dport = (sk) ? ntohs(inet_sk(sk)->dport) : -1; \
+	l->cpu = smp_processor_id(); \
+	l->mb = s; \
+	snprintf(l->msg, sizeof(l->msg) - 1, format, ## arg); \
+	l->time = current_nsec(); \
+	l->func = __func__; \
+	l->line = __LINE__; \
+	preempt_enable(); \
+	1; \
+})
+//#define sdp_prf(sk, s, format, arg...)
+#define sdp_prf(sk, s, format, arg...) sdp_prf1(sk, s, format, ## arg)
+
+#else
+#define sdp_prf1(sk, s, format, arg...)
+#define sdp_prf(sk, s, format, arg...)
+#endif
+
+/*
+ * Control-path debug messages.  With CONFIG_INFINIBAND_SDP_DEBUG they are
+ * printed at KERN_WARNING level whenever sdp_debug_level is positive;
+ * without it sdp_dbg() only evaluates its first argument.
+ *
+ * Note that sock_ref() is defined only in the non-debug branch below.
+ */
+#ifdef CONFIG_INFINIBAND_SDP_DEBUG
+extern int sdp_debug_level;
+
+#define sdp_dbg(sk, format, arg...)                          \
+	do {                                                 \
+		if (sdp_debug_level > 0)                     \
+		sdp_printk(KERN_WARNING, sk, format , ## arg); \
+	} while (0)
+
+#else /* CONFIG_INFINIBAND_SDP_DEBUG */
+#define sdp_dbg(priv, format, arg...)                        \
+	do { (void) (priv); } while (0)
+/* Apply sock_op to sk; the msg tag is discarded. */
+#define sock_ref(sk, msg, sock_op) sock_op(sk)
+#endif /* CONFIG_INFINIBAND_SDP_DEBUG */
+
+/*
+ * Data-path debugging, selected by bits of sdp_data_debug_level:
+ *   0x2 - sdp_dbg_data() messages are printed;
+ *   0x1 - SDP_DUMP_PACKET() dumps the given header through dump_packet().
+ * Without CONFIG_INFINIBAND_SDP_DEBUG_DATA both macros expand to nothing.
+ */
+#ifdef CONFIG_INFINIBAND_SDP_DEBUG_DATA
+
+extern int sdp_data_debug_level;
+#define sdp_dbg_data(sk, format, arg...)                     		\
+	do {                                                 		\
+		if (sdp_data_debug_level & 0x2)                		\
+			sdp_printk(KERN_WARNING, sk, format , ## arg); 	\
+	} while (0)
+#define SDP_DUMP_PACKET(sk, str, mb, h)                     		\
+	do {                                                 		\
+		if (sdp_data_debug_level & 0x1)                		\
+			dump_packet(sk, str, mb, h); 			\
+	} while (0)
+#else
+#define sdp_dbg_data(priv, format, arg...)
+#define SDP_DUMP_PACKET(sk, str, mb, h)
+#endif
+
+/* Tags naming the reason a socket reference is taken or dropped. */
+#define SOCK_REF_RESET "RESET"
+#define SOCK_REF_ALIVE "ALIVE" /* sock_alloc -> destruct_sock */
+#define SOCK_REF_CLONE "CLONE"
+#define SOCK_REF_CMA "CMA" /* sdp_cma_handler() is expected to be invoked */
+#define SOCK_REF_SEQ "SEQ" /* during proc read */
+#define SOCK_REF_DREQ_TO "DREQ_TO" /* dreq timeout is pending */
+#define SOCK_REF_ZCOPY "ZCOPY" /* zcopy send in process */
+#define SOCK_REF_RDMA_RD "RDMA_RD" /* RDMA read in process */
+
+/*
+ * Reference helpers that take a reason tag and route through sock_ref().
+ * They shadow the underlying functions of the same name: inside the
+ * expansion the name is not expanded again, so it is passed to sock_ref()
+ * as the operation to apply.
+ */
+#define sock_hold(sk, msg)  sock_ref(sk, msg, sock_hold)
+#define sock_put(sk, msg)  sock_ref(sk, msg, sock_put)
+#define __sock_put(sk, msg)  sock_ref(sk, msg, __sock_put)
+
+/* Designated array initializer: element [e] holds the spelling of e. */
+#define ENUM2STR(e) [e] = #e
+
+/*
+ * Map a TCPS_* connection state to its symbolic name.
+ * Returns "unknown" for values outside the table.
+ */
+static inline char *sdp_state_str(int state)
+{
+	static char *names[] = {
+		[TCPS_ESTABLISHED]	= "TCPS_ESTABLISHED",
+		[TCPS_SYN_SENT]		= "TCPS_SYN_SENT",
+		[TCPS_SYN_RECEIVED]	= "TCPS_SYN_RECEIVED",
+		[TCPS_FIN_WAIT_1]	= "TCPS_FIN_WAIT_1",
+		[TCPS_FIN_WAIT_2]	= "TCPS_FIN_WAIT_2",
+		[TCPS_TIME_WAIT]	= "TCPS_TIME_WAIT",
+		[TCPS_CLOSED]		= "TCPS_CLOSED",
+		[TCPS_CLOSE_WAIT]	= "TCPS_CLOSE_WAIT",
+		[TCPS_LAST_ACK]		= "TCPS_LAST_ACK",
+		[TCPS_LISTEN]		= "TCPS_LISTEN",
+		[TCPS_CLOSING]		= "TCPS_CLOSING",
+	};
+
+	if (state >= 0 && state < ARRAY_SIZE(names))
+		return names[state];
+
+	return "unknown";
+}
+
+struct sdp_bsdh;
+#ifdef CONFIG_INFINIBAND_SDP_DEBUG_DATA
+/*
+ * Print a one-line summary of the SDP header 'h'.  Available only with
+ * CONFIG_INFINIBAND_SDP_DEBUG_DATA.
+ */
+void _dump_packet(const char *func, int line, struct socket *sk, char *str,
+		struct mbuf *mb, const struct sdp_bsdh *h);
+/* _dump_packet() with the caller's function name and line number. */
+#define dump_packet(sk, str, mb, h) \
+	_dump_packet(__func__, __LINE__, sk, str, mb, h)
+#endif
+
+#endif
diff --git a/sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c b/sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c
new file mode 100644
index 0000000..fe747af
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c
@@ -0,0 +1,1962 @@
+
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
+ *      The Regents of the University of California.  All rights reserved.
+ * Copyright (c) 2004 The FreeBSD Foundation.  All rights reserved.
+ * Copyright (c) 2004-2008 Robert N. M. Watson.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Excerpts taken from tcp_subr.c, tcp_usrreq.c, uipc_socket.c
+ */
+
+/*
+ *
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "sdp.h"
+
+#include <net/if.h>
+#include <net/route.h>
+#include <net/vnet.h>
+
+uma_zone_t	sdp_zone;	/* zone sdp_attach() allocates sdp_socks from */
+struct rwlock	sdp_lock;	/* held around sdp_list/sdp_count updates */
+LIST_HEAD(, sdp_sock) sdp_list;	/* pcbs: added by attach, removed by pcbfree */
+
+struct workqueue_struct *rx_comp_wq;
+
+RW_SYSINIT(sdplockinit, &sdp_lock, "SDP lock");
+#define	SDP_LIST_WLOCK()	rw_wlock(&sdp_lock)
+#define	SDP_LIST_RLOCK()	rw_rlock(&sdp_lock)
+#define	SDP_LIST_WUNLOCK()	rw_wunlock(&sdp_lock)
+#define	SDP_LIST_RUNLOCK()	rw_runlock(&sdp_lock)
+#define	SDP_LIST_WLOCK_ASSERT()	rw_assert(&sdp_lock, RW_WLOCKED)
+#define	SDP_LIST_RLOCK_ASSERT()	rw_assert(&sdp_lock, RW_RLOCKED)
+#define	SDP_LIST_LOCK_ASSERT()	rw_assert(&sdp_lock, RW_LOCKED)
+
+MALLOC_DEFINE(M_SDP, "sdp", "Socket Direct Protocol");
+
+static void sdp_stop_keepalive_timer(struct socket *so);
+
+/*
+ * SDP protocol interface to socket abstraction.
+ */
+/*
+ * sdp_sendspace and sdp_recvspace are the default send and receive window
+ * sizes, respectively.
+ */
+u_long	sdp_sendspace = 1024*32;
+u_long	sdp_recvspace = 1024*64;
+
+static int sdp_count;
+
+/*
+ * Destroy a dying socket's CM id so it gets no more async. CMA events.
+ */
+static void
+sdp_destroy_cma(struct sdp_sock *ssk)
+{
+
+	if (ssk->id != NULL) {
+		rdma_destroy_id(ssk->id);
+		ssk->id = NULL;
+	}
+}
+
+static int
+sdp_pcbbind(struct sdp_sock *ssk, struct sockaddr *nam, struct ucred *cred)
+{
+	struct sockaddr_in *sin;
+	struct sockaddr_in null;	/* wildcard used when nam == NULL */
+	int error;
+
+	SDP_WLOCK_ASSERT(ssk);
+	/* cred is not used here.  A pcb that is already bound is rejected. */
+	if (ssk->lport != 0 || ssk->laddr != INADDR_ANY)
+		return (EINVAL);
+	/* Unlocked for the CM calls; rdma_bind_addr handles bind races.  */
+	SDP_WUNLOCK(ssk);
+	if (ssk->id == NULL)
+		ssk->id = rdma_create_id(sdp_cma_handler, ssk, RDMA_PS_SDP);
+	if (ssk->id == NULL) {
+		SDP_WLOCK(ssk);	/* every return path hands the lock back */
+		return (ENOMEM);
+	}
+	if (nam == NULL) {
+		null.sin_family = AF_INET;
+		null.sin_len = sizeof(null);
+		null.sin_addr.s_addr = INADDR_ANY;
+		null.sin_port = 0;
+		bzero(&null.sin_zero, sizeof(null.sin_zero));
+		nam = (struct sockaddr *)&null;
+	}
+	error = -rdma_bind_addr(ssk->id, nam);
+	SDP_WLOCK(ssk);
+	if (error == 0) {
+		sin = (struct sockaddr_in *)&ssk->id->route.addr.src_addr;
+		ssk->laddr = sin->sin_addr.s_addr;
+		ssk->lport = sin->sin_port;
+	} else	/* bind failed: the CM id is discarded again */
+		sdp_destroy_cma(ssk);
+	return (error);
+}
+
+static void
+sdp_pcbfree(struct sdp_sock *ssk)
+{
+	KASSERT(ssk->socket == NULL, ("ssk %p socket still attached", ssk));
+	/* Entered with the pcb write-locked; unlocks it and frees the pcb. */
+	sdp_dbg(ssk->socket, "Freeing pcb");
+	SDP_WLOCK_ASSERT(ssk);
+	ssk->flags |= SDP_DESTROY;
+	SDP_WUNLOCK(ssk);
+	SDP_LIST_WLOCK();
+	sdp_count--;
+	LIST_REMOVE(ssk, list);
+	SDP_LIST_WUNLOCK();
+	crfree(ssk->cred);
+	sdp_destroy_cma(ssk);
+	ssk->qp_active = 0;
+	if (ssk->qp) {
+		ib_destroy_qp(ssk->qp);
+		ssk->qp = NULL;
+	}
+	sdp_tx_ring_destroy(ssk);
+	sdp_rx_ring_destroy(ssk);
+	rw_destroy(&ssk->rx_ring.destroyed_lock);
+	rw_destroy(&ssk->lock);		/* must precede the free below */
+	uma_zfree(sdp_zone, ssk);
+}
+
+/*
+ * Build a malloc'ed (M_SONAME) AF_INET sockaddr from a port and address.
+ */
+static struct sockaddr *
+sdp_sockaddr(in_port_t port, struct in_addr *addr_p)
+{
+	struct sockaddr_in *sa;
+
+	/* M_ZERO leaves every field that is not set below cleared. */
+	sa = malloc(sizeof(*sa), M_SONAME, M_WAITOK | M_ZERO);
+	sa->sin_len = sizeof(*sa);
+	sa->sin_family = AF_INET;
+	sa->sin_port = port;
+	sa->sin_addr = *addr_p;
+
+	return ((struct sockaddr *)sa);
+}
+
+static int
+sdp_getsockaddr(struct socket *so, struct sockaddr **nam)
+{
+	struct sdp_sock *pcb = sdp_sk(so);
+	struct in_addr laddr;
+	in_port_t lport;
+
+	/* Snapshot the local endpoint under the read lock... */
+	SDP_RLOCK(pcb);
+	laddr.s_addr = pcb->laddr;
+	lport = pcb->lport;
+	SDP_RUNLOCK(pcb);
+	/* ...and build the result only after the lock is released. */
+	*nam = sdp_sockaddr(lport, &laddr);
+	return (0);
+}
+
+static int
+sdp_getpeeraddr(struct socket *so, struct sockaddr **nam)
+{
+	struct sdp_sock *pcb = sdp_sk(so);
+	struct in_addr faddr;
+	in_port_t fport;
+
+	/* Snapshot the foreign endpoint under the read lock... */
+	SDP_RLOCK(pcb);
+	faddr.s_addr = pcb->faddr;
+	fport = pcb->fport;
+	SDP_RUNLOCK(pcb);
+	/* ...and build the result only after the lock is released. */
+	*nam = sdp_sockaddr(fport, &faddr);
+	return (0);
+}
+
+static void
+sdp_pcbnotifyall(struct in_addr faddr, int errno,
+    struct sdp_sock *(*notify)(struct sdp_sock *, int))
+{
+	struct sdp_sock *ssk, *ssk_temp;
+
+	SDP_LIST_WLOCK();
+	LIST_FOREACH_SAFE(ssk, &sdp_list, list, ssk_temp) {
+		SDP_WLOCK(ssk);
+		if (ssk->faddr != faddr.s_addr || ssk->socket == NULL ||
+		    (ssk->flags & SDP_DESTROY) != 0) {
+			SDP_WUNLOCK(ssk);	/* other peer, detached or dying */
+			continue;
+		}
+		if ((*notify)(ssk, errno))	/* NULL: notify already unlocked */
+			SDP_WUNLOCK(ssk);
+	}
+	SDP_LIST_WUNLOCK();
+}
+
+#if 0	/* Unused: runs func(ssk, arg) on each pcb of sdp_list, write-locked. */
+static void
+sdp_apply_all(void (*func)(struct sdp_sock *, void *), void *arg)
+{
+	struct sdp_sock *ssk;
+
+	SDP_LIST_RLOCK();
+	LIST_FOREACH(ssk, &sdp_list, list) {
+		SDP_WLOCK(ssk);
+		func(ssk, arg);
+		SDP_WUNLOCK(ssk);
+	}
+	SDP_LIST_RUNLOCK();
+}
+#endif
+
+static void
+sdp_output_reset(struct sdp_sock *ssk)
+{
+	struct rdma_cm_id *cm_id;
+
+	SDP_WLOCK_ASSERT(ssk);
+	cm_id = ssk->id;
+	if (cm_id != NULL) {
+		ssk->qp_active = 0;
+		SDP_WUNLOCK(ssk);	/* the CM call is made unlocked */
+		rdma_disconnect(cm_id);
+		SDP_WLOCK(ssk);
+	}
+	ssk->state = TCPS_CLOSED;	/* with or without a CM id */
+}
+
+/*
+ * Mark the pcb dropped and its socket disconnected.  If the pcb holds the
+ * protocol's socket reference, give it up via sofree(), unlock, return NULL.
+ */
+static struct sdp_sock *
+sdp_closed(struct sdp_sock *ssk)
+{
+	struct socket *so;
+
+	SDP_WLOCK_ASSERT(ssk);
+
+	ssk->flags |= SDP_DROPPED;
+	so = ssk->socket;	/* NOTE(review): assumed non-NULL -- confirm */
+	soisdisconnected(so);
+	if (ssk->flags & SDP_SOCKREF) {
+		KASSERT(so->so_state & SS_PROTOREF,
+		    ("sdp_closed: !SS_PROTOREF"));
+		ssk->flags &= ~SDP_SOCKREF;
+		SDP_WUNLOCK(ssk);
+		ACCEPT_LOCK();
+		SOCK_LOCK(so);
+		so->so_state &= ~SS_PROTOREF;
+		sofree(so);
+		return (NULL);		/* pcb lock already released above */
+	}
+	return (ssk);			/* pcb is still write-locked */
+}
+
+/*
+ * Task handler for timer based shutdowns, which can not operate in
+ * callout context.  data is the sdp_sock; pending is not used.
+ */
+static void
+sdp_shutdown_task(void *data, int pending)
+{
+	struct sdp_sock *ssk;
+
+	ssk = data;
+	SDP_WLOCK(ssk);
+	/*
+	 * I don't think this can race with another call to pcbfree()
+	 * because SDP_TIMEWAIT protects it.  SDP_DESTROY may be redundant.
+	 */
+	if (ssk->flags & SDP_DESTROY)
+		panic("sdp_shutdown_task: Racing with pcbfree for ssk %p",
+		    ssk);
+	if (ssk->flags & SDP_DISCON)
+		sdp_output_reset(ssk);
+	/* We have to clear this so sdp_detach() will call pcbfree(). */
+	ssk->flags &= ~(SDP_TIMEWAIT | SDP_DREQWAIT);
+	if ((ssk->flags & SDP_DROPPED) == 0 &&
+	    sdp_closed(ssk) == NULL)
+		return;			/* sdp_closed() unlocked the pcb */
+	if (ssk->socket == NULL) {
+		sdp_pcbfree(ssk);	/* unlocks and frees the pcb */
+		return;
+	}
+	SDP_WUNLOCK(ssk);
+}
+
+/*
+ * keep2msl callout handler for the 2msl wait: hand off to shutdown_task.
+ */
+static void
+sdp_2msl_timeout(void *data)
+{
+	struct sdp_sock *ssk = data;
+
+	/*
+	 * A callout that is no longer active was canceled.  SDP_TIMEWAIT
+	 * should always be set here; testing it is purely defensive.
+	 */
+	if (callout_active(&ssk->keep2msl)) {
+		callout_deactivate(&ssk->keep2msl);
+		if ((ssk->flags & SDP_TIMEWAIT) != 0)
+			taskqueue_enqueue(taskqueue_thread,
+			    &ssk->shutdown_task);
+	}
+	/* keep2msl is CALLOUT_RETURNUNLOCKED, so unlock before returning. */
+	SDP_WUNLOCK(ssk);
+}
+
+/*
+ * Enter TIME_WAIT and (re)arm keep2msl as the 2msl wait timer.
+ */
+static void
+sdp_2msl_wait(struct sdp_sock *ssk)
+{
+	/* NOTE(review): ssk->socket is used unchecked; assumed attached. */
+	SDP_WLOCK_ASSERT(ssk);
+	ssk->flags |= SDP_TIMEWAIT;
+	ssk->state = TCPS_TIME_WAIT;
+	soisdisconnected(ssk->socket);
+	callout_reset(&ssk->keep2msl, TCPTV_MSL, sdp_2msl_timeout, ssk);
+}
+
+/*
+ * Timed out waiting for the final fin/ack; switch over to the 2msl wait.
+ */
+static void
+sdp_dreq_timeout(void *data)
+{
+	struct sdp_sock *ssk;
+
+	ssk = data;
+	/* Callout canceled. */
+        if (!callout_active(&ssk->keep2msl))
+		goto out;
+	/* Callout rescheduled, probably as a different timer. */
+	if (callout_pending(&ssk->keep2msl))
+		goto out;
+        callout_deactivate(&ssk->keep2msl);
+	if (ssk->state != TCPS_FIN_WAIT_1 && ssk->state != TCPS_LAST_ACK)
+		goto out;
+	if ((ssk->flags & SDP_DREQWAIT) == 0)
+		goto out;
+	ssk->flags &= ~SDP_DREQWAIT;
+	ssk->flags |= SDP_DISCON;	/* sdp_shutdown_task() will reset */
+	sdp_2msl_wait(ssk);
+	ssk->qp_active = 0;
+out:
+	SDP_WUNLOCK(ssk);		/* keep2msl is CALLOUT_RETURNUNLOCKED */
+}
+
+/*
+ * Received the final fin/ack.  Stop the dreq wait and start the 2msl wait.
+ */
+void
+sdp_cancel_dreq_wait_timeout(struct sdp_sock *ssk)
+{
+	sdp_dbg(ssk->socket, "cancelling dreq wait timeout\n");
+	ssk->flags &= ~SDP_DREQWAIT;
+	sdp_2msl_wait(ssk);
+}
+
+static int
+sdp_init_sock(struct socket *sk)
+{
+	struct sdp_sock *ssk = sdp_sk(sk);
+
+	sdp_dbg(sk, "%s\n", __func__);
+	/* keep2msl handlers are entered with ssk->lock and must drop it. */
+	callout_init_rw(&ssk->keep2msl, &ssk->lock, CALLOUT_RETURNUNLOCKED);
+	TASK_INIT(&ssk->shutdown_task, 0, sdp_shutdown_task, ssk);
+#ifdef SDP_ZCOPY
+	INIT_DELAYED_WORK(&ssk->srcavail_cancel_work, srcavail_cancel_timeout);
+	ssk->zcopy_thresh = -1; /* use global sdp_zcopy_thresh */
+	ssk->tx_ring.rdma_inflight = NULL;
+#endif
+	atomic_set(&ssk->mseq_ack, 0);
+	sdp_rx_ring_init(ssk);
+	ssk->tx_ring.buffer = NULL;
+
+	return 0;	/* always succeeds */
+}
+
+/*
+ * Allocate an sdp_sock for the socket and reserve socket buffer space.
+ */
+static int
+sdp_attach(struct socket *so, int proto, struct thread *td)
+{
+	struct sdp_sock *ssk;
+	int error;
+
+	ssk = sdp_sk(so);
+	KASSERT(ssk == NULL, ("sdp_attach: ssk already set on so %p", so));
+	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
+		error = soreserve(so, sdp_sendspace, sdp_recvspace);
+		if (error)
+			return (error);
+	}
+	so->so_rcv.sb_flags |= SB_AUTOSIZE;
+	so->so_snd.sb_flags |= SB_AUTOSIZE;
+	ssk = uma_zalloc(sdp_zone, M_NOWAIT | M_ZERO);
+	if (ssk == NULL)
+		return (ENOBUFS);	/* the zone allocation is M_NOWAIT */
+	rw_init(&ssk->lock, "sdpsock");
+	ssk->socket = so;
+	ssk->cred = crhold(so->so_cred);	/* dropped in sdp_pcbfree() */
+	so->so_pcb = (caddr_t)ssk;
+	sdp_init_sock(so);	/* presumably finds ssk through so_pcb */
+	ssk->flags = 0;
+	ssk->qp_active = 0;
+	ssk->state = TCPS_CLOSED;
+	SDP_LIST_WLOCK();
+	LIST_INSERT_HEAD(&sdp_list, ssk, list);
+	sdp_count++;
+	SDP_LIST_WUNLOCK();
+	if ((so->so_options & SO_LINGER) && so->so_linger == 0)
+		so->so_linger = TCP_LINGERTIME;
+
+	return (0);
+}
+
+/*
+ * Detach SDP from the socket, potentially leaving it around for the
+ * timewait to expire.
+ */
+static void
+sdp_detach(struct socket *so)
+{
+	struct sdp_sock *ssk;
+
+	ssk = sdp_sk(so);
+	SDP_WLOCK(ssk);
+	KASSERT(ssk->socket != NULL, ("sdp_detach: socket is NULL"));
+	ssk->socket->so_pcb = NULL;
+	ssk->socket = NULL;
+	if (ssk->flags & (SDP_TIMEWAIT | SDP_DREQWAIT))
+		SDP_WUNLOCK(ssk);	/* sdp_shutdown_task() frees it later */
+	else if (ssk->flags & SDP_DROPPED || ssk->state < TCPS_SYN_SENT)
+		sdp_pcbfree(ssk);	/* also releases the pcb lock */
+	else
+		panic("sdp_detach: Unexpected state, ssk %p.\n", ssk);
+}
+
+/*
+ * Bind the socket to the local IPv4 address in nam.  The address is
+ * checked here; the binding itself is done by sdp_pcbbind().
+ */
+static int
+sdp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+	struct sockaddr_in *sin = (struct sockaddr_in *)nam;
+	struct sdp_sock *ssk;
+	int error;
+
+	/* Only a full-sized AF_INET address is acceptable. */
+	if (nam->sa_len != sizeof(*sin) ||
+	    sin->sin_family != AF_INET)
+		return (EINVAL);
+	/* Multicast addresses cannot be bound. */
+	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
+		return (EAFNOSUPPORT);
+
+	ssk = sdp_sk(so);
+	SDP_WLOCK(ssk);
+	/* A connection that was dropped or is in timewait cannot bind. */
+	if ((ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) != 0)
+		error = EINVAL;
+	else
+		error = sdp_pcbbind(ssk, nam, td->td_ucred);
+	SDP_WUNLOCK(ssk);
+
+	return (error);
+}
+
+/*
+ * Prepare to accept connections, binding to a wildcard first if unbound.
+ */
+static int
+sdp_listen(struct socket *so, int backlog, struct thread *td)
+{
+	int error = 0;
+	struct sdp_sock *ssk;
+
+	ssk = sdp_sk(so);
+	SDP_WLOCK(ssk);
+	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
+		error = EINVAL;
+		goto out;
+	}
+	if (error == 0 && ssk->lport == 0)
+		error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred);
+	SOCK_LOCK(so);
+	if (error == 0)
+		error = solisten_proto_check(so);
+	if (error == 0) {
+		solisten_proto(so, backlog);
+		ssk->state = TCPS_LISTEN;
+	}
+	SOCK_UNLOCK(so);
+	/* rdma_listen() below is called after the pcb lock is released. */
+out:
+	SDP_WUNLOCK(ssk);
+	if (error == 0)
+		error = -rdma_listen(ssk->id, backlog);
+	return (error);
+}
+
+/*
+ * Begin an SDP connect to nam.  Returns 0 or an errno, pcb still locked.
+ */
+static int
+sdp_start_connect(struct sdp_sock *ssk, struct sockaddr *nam, struct thread *td)
+{
+	struct sockaddr_in src;
+	struct socket *so;
+	int error;
+
+	so = ssk->socket;
+
+	SDP_WLOCK_ASSERT(ssk);
+	if (ssk->lport == 0) {
+		error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred);
+		if (error)
+			return error;
+	}
+	src.sin_family = AF_INET;
+	src.sin_len = sizeof(src);
+	bzero(&src.sin_zero, sizeof(src.sin_zero));
+	src.sin_port = ssk->lport;
+	src.sin_addr.s_addr = ssk->laddr;
+	soisconnecting(so);
+	SDP_WUNLOCK(ssk);
+	error = -rdma_resolve_addr(ssk->id, (struct sockaddr *)&src, nam,
+	    SDP_RESOLVE_TIMEOUT);
+	SDP_WLOCK(ssk);
+	if (error == 0)
+		ssk->state = TCPS_SYN_SENT;
+	/* A failed resolve is reported, not swallowed, so callers can react. */
+	return (error);
+}
+
+/*
+ * Validate the destination address and start connecting the socket to it.
+ */
+static int
+sdp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+	struct sockaddr_in *dst = (struct sockaddr_in *)nam;
+	struct sdp_sock *ssk;
+	int error;
+
+	/* The destination has to be a full-sized, non-multicast AF_INET one. */
+	if (nam->sa_len != sizeof(*dst) || dst->sin_family != AF_INET)
+		return (EINVAL);
+	if (IN_MULTICAST(ntohl(dst->sin_addr.s_addr)))
+		return (EAFNOSUPPORT);
+	/* Let prison_remote_ip4() vet the address for td's credential. */
+	error = prison_remote_ip4(td->td_ucred, &dst->sin_addr);
+	if (error != 0)
+		return (error);
+	ssk = sdp_sk(so);
+	SDP_WLOCK(ssk);
+	if ((ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) == 0)
+		error = sdp_start_connect(ssk, nam, td);
+	else
+		error = EINVAL;
+	SDP_WUNLOCK(ssk);
+	return (error);
+}
+
+/*
+ * Drop a SDP socket, reporting the specified error through so_error.
+ * If a SYN has been received the connection is reset first, and a
+ * pending soft error replaces ETIMEDOUT.  Returns what sdp_closed() does.
+ */
+static struct sdp_sock *
+sdp_drop(struct sdp_sock *ssk, int errno)
+{
+	struct socket *so;
+
+	SDP_WLOCK_ASSERT(ssk);
+	so = ssk->socket;
+	if (TCPS_HAVERCVDSYN(ssk->state))
+		sdp_output_reset(ssk);	/* drops and re-takes the pcb lock */
+	if (errno == ETIMEDOUT && ssk->softerror)
+		errno = ssk->softerror;
+	so->so_error = errno;
+	return (sdp_closed(ssk));
+}
+
+/*
+ * User issued close, and wish to trail through shutdown states:
+ * if never received SYN, just forget it.  If got a SYN from peer,
+ * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
+ * If already got a FIN from peer, then almost done; go to LAST_ACK
+ * state.  In all other cases, have already sent FIN to peer (e.g.
+ * after PRU_SHUTDOWN), and just have to play tedious game waiting
+ * for peer to send FIN or not respond to keep-alives, etc.
+ * We can let the user exit from the close as soon as the FIN is acked.
+ */
+static void
+sdp_usrclosed(struct sdp_sock *ssk)
+{
+
+	SDP_WLOCK_ASSERT(ssk);
+
+	switch (ssk->state) {
+	case TCPS_LISTEN:
+		ssk->state = TCPS_CLOSED;
+		SDP_WUNLOCK(ssk);	/* the CM id is destroyed unlocked */
+		sdp_destroy_cma(ssk);
+		SDP_WLOCK(ssk);
+		/* FALLTHROUGH */
+	case TCPS_CLOSED:
+		ssk = sdp_closed(ssk);
+		/*
+		 * sdp_closed() should never return NULL here as the socket is
+		 * still open.
+		 */
+		KASSERT(ssk != NULL,
+		    ("sdp_usrclosed: sdp_closed() returned NULL"));
+		break;
+
+	case TCPS_SYN_SENT:
+		/* FALLTHROUGH */
+	case TCPS_SYN_RECEIVED:
+		ssk->flags |= SDP_NEEDFIN;
+		break;
+
+	case TCPS_ESTABLISHED:
+		ssk->flags |= SDP_NEEDFIN;
+		ssk->state = TCPS_FIN_WAIT_1;
+		break;
+
+	case TCPS_CLOSE_WAIT:
+		ssk->state = TCPS_LAST_ACK;
+		break;
+	}	/* any other state is left as it is */
+	if (ssk->state >= TCPS_FIN_WAIT_2) {
+		/* Prevent the connection hanging in FIN_WAIT_2 forever. */
+		if (ssk->state == TCPS_FIN_WAIT_2)
+			sdp_2msl_wait(ssk);
+		else
+			soisdisconnected(ssk->socket);
+	}
+}
+
+static void
+sdp_output_disconnect(struct sdp_sock *ssk)
+{
+	/* Arm the dreq timeout, flag that a FIN is needed, then post sends. */
+	SDP_WLOCK_ASSERT(ssk);
+	callout_reset(&ssk->keep2msl, SDP_FIN_WAIT_TIMEOUT,
+	    sdp_dreq_timeout, ssk);
+	ssk->flags |= SDP_NEEDFIN | SDP_DREQWAIT;
+	sdp_post_sends(ssk, M_NOWAIT);
+}
+
+/*
+ * Initiate or continue a disconnect.
+ * If embryonic state, just send reset (once).
+ * If in ``let data drain'' option and linger null, just drop.
+ * Otherwise (hard), mark socket disconnecting and drop
+ * current input data; switch states based on user close, and
+ * send segment to peer (with FIN).
+ */
+static void
+sdp_start_disconnect(struct sdp_sock *ssk)
+{
+	struct socket *so;
+	int unread;
+
+	so = ssk->socket;
+	SDP_WLOCK_ASSERT(ssk);
+	sdp_stop_keepalive_timer(so);
+	/*
+	 * Neither sdp_closed() nor sdp_drop() should return NULL, as the
+	 * socket is still open.
+	 */
+	if (ssk->state < TCPS_ESTABLISHED) {
+		ssk = sdp_closed(ssk);
+		KASSERT(ssk != NULL,
+		    ("sdp_start_disconnect: sdp_close() returned NULL"));
+	} else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
+		ssk = sdp_drop(ssk, 0);
+		KASSERT(ssk != NULL,
+		    ("sdp_start_disconnect: sdp_drop() returned NULL"));
+	} else {
+		soisdisconnecting(so);
+		unread = so->so_rcv.sb_cc;	/* sampled before the flush */
+		sbflush(&so->so_rcv);
+		sdp_usrclosed(ssk);
+		if (!(ssk->flags & SDP_DROPPED)) {
+			if (unread)	/* unread data was thrown away: reset */
+				sdp_output_reset(ssk);
+			else
+				sdp_output_disconnect(ssk);
+		}
+	}
+}
+
+/*
+ * User initiated disconnect.
+ */
+static int
+sdp_disconnect(struct socket *so)
+{
+	struct sdp_sock *ssk;
+	int error = 0;
+
+	ssk = sdp_sk(so);
+	SDP_WLOCK(ssk);
+	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
+		error = ECONNRESET;
+		goto out;
+	}
+	sdp_start_disconnect(ssk);
+out:
+	SDP_WUNLOCK(ssk);
+	return (error);
+}
+
+/*
+ * Accept a connection.  Essentially all the work is done at higher levels;
+ * just return the address of the peer, storing through addr.
+ *
+ *
+ * XXX This is broken XXX
+ * 
+ * The rationale for acquiring the sdp lock here is somewhat complicated,
+ * and is described in detail in the commit log entry for r175612.  Acquiring
+ * it delays an accept(2) racing with sonewconn(), which inserts the socket
+ * before the address/port fields are initialized.  A better fix would
+ * prevent the socket from being placed in the listen queue until all fields
+ * are fully initialized.
+ */
+static int
+sdp_accept(struct socket *so, struct sockaddr **nam)
+{
+	struct sdp_sock *ssk;
+	struct in_addr faddr;
+	in_port_t fport;
+	int dead;
+
+	/* A socket that is already disconnected has no peer to report. */
+	if ((so->so_state & SS_ISDISCONNECTED) != 0)
+		return (ECONNABORTED);
+
+	faddr.s_addr = 0;
+	fport = 0;
+	ssk = sdp_sk(so);
+	SDP_WLOCK(ssk);
+	dead = (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) != 0;
+	if (!dead) {
+		/* Copy the peer's endpoint while the pcb is locked. */
+		faddr.s_addr = ssk->faddr;
+		fport = ssk->fport;
+	}
+	SDP_WUNLOCK(ssk);
+	if (dead)
+		return (ECONNABORTED);
+	*nam = sdp_sockaddr(fport, &faddr);
+	return (0);
+}
+
+/*
+ * Shut down the send side: no further output is possible afterwards.
+ */
+static int
+sdp_shutdown(struct socket *so)
+{
+	struct sdp_sock *ssk = sdp_sk(so);
+
+	SDP_WLOCK(ssk);
+	if ((ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) != 0) {
+		/* Nothing left to shut down. */
+		SDP_WUNLOCK(ssk);
+		return (ECONNRESET);
+	}
+	socantsendmore(so);
+	sdp_usrclosed(ssk);
+	/*
+	 * sdp_usrclosed() may have dropped the connection outright; only
+	 * a connection that survived it needs the disconnect sent.
+	 */
+	if ((ssk->flags & SDP_DROPPED) == 0)
+		sdp_output_disconnect(ssk);
+	SDP_WUNLOCK(ssk);
+	return (0);
+}
+
+/*
+ * Queue the packet mb (cnt m_next links) on sb, merging it into the last
+ * record when the combined packet still fits.  sb must be locked.
+ */
+static void
+sdp_append(struct sdp_sock *ssk, struct sockbuf *sb, struct mbuf *mb, int cnt)
+{
+	struct mbuf *n;
+	int ncnt;
+
+	SOCKBUF_LOCK_ASSERT(sb);
+	SBLASTRECORDCHK(sb);
+	KASSERT(mb->m_flags & M_PKTHDR,
+		("sdp_append: %p Missing packet header.\n", mb));
+	n = sb->sb_lastrecord;
+	/* If the queue is empty just set all pointers and proceed. */
+	if (n == NULL) {
+		sb->sb_lastrecord = sb->sb_mb = sb->sb_sndptr = mb;
+		for (; mb; mb = mb->m_next) {
+			sb->sb_mbtail = mb;
+			sballoc(sb, mb);
+		}
+		return;
+	}
+	/* Count the m_next links in the current tail, as cnt does for mb. */
+	for (ncnt = 0; n->m_next; n = n->m_next)
+		ncnt++;
+	n = sb->sb_lastrecord;
+	/*
+	 * If the two chains can fit in a single sdp packet and
+	 * the last record has not been sent yet (WRITABLE) coalesce
+	 * them.  The lastrecord remains the same but we must strip the
+	 * packet header and then let sbcompress do the hard part.
+	 */
+	if (M_WRITABLE(n) && ncnt + cnt < SDP_MAX_SEND_SGES &&
+	    n->m_pkthdr.len + mb->m_pkthdr.len - SDP_HEAD_SIZE <
+	    ssk->xmit_size_goal) {
+		m_adj(mb, SDP_HEAD_SIZE);
+		n->m_pkthdr.len += mb->m_pkthdr.len;
+		n->m_flags |= mb->m_flags & (M_PUSH | M_URG);
+		m_demote(mb, 1);
+		sbcompress(sb, mb, sb->sb_mbtail);
+		return;
+	}
+	/*
+	 * Not compressible, just append to the end and adjust counters.
+	 */
+	sb->sb_lastrecord->m_flags |= M_PUSH;
+	sb->sb_lastrecord->m_nextpkt = mb;
+	sb->sb_lastrecord = mb;
+	if (sb->sb_sndptr == NULL)
+		sb->sb_sndptr = mb;
+	for (; mb; mb = mb->m_next) {
+		sb->sb_mbtail = mb;
+		sballoc(sb, mb);
+	}
+}
+
+/*
+ * Do a send by putting data in output queue and updating urgent
+ * marker if URG set.  Possibly send more data.  Unlike the other
+ * pru_*() routines, the mbuf chains are our responsibility.  We
+ * must either enqueue them or free them.  The other pru_* routines
+ * generally are caller-frees.
+ *
+ * This comes from sendfile, normal sends will come from sdp_sosend().
+ */
+static int
+sdp_send(struct socket *so, int flags, struct mbuf *m,
+    struct sockaddr *nam, struct mbuf *control, struct thread *td)
+{
+	struct sdp_sock *ssk;
+	struct mbuf *n;
+	int error;
+	int cnt;
+
+	error = 0;
+	ssk = sdp_sk(so);
+	KASSERT(m->m_flags & M_PKTHDR,
+	    ("sdp_send: %p no packet header", m));
+	M_PREPEND(m, SDP_HEAD_SIZE, M_WAIT);
+	mtod(m, struct sdp_bsdh *)->mid = SDP_MID_DATA;
+	for (n = m, cnt = 0; n->m_next; n = n->m_next)
+		cnt++;
+	if (cnt > SDP_MAX_SEND_SGES) {
+		n = m_collapse(m, M_WAIT, SDP_MAX_SEND_SGES);
+		if (n == NULL) {
+			if (control)	/* ours to free on every path */
+				m_freem(control);
+			m_freem(m);
+			return (EMSGSIZE);
+		}
+		m = n;
+		for (cnt = 0; n->m_next; n = n->m_next)
+			cnt++;
+	}
+	SDP_WLOCK(ssk);
+	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
+		if (control)
+			m_freem(control);
+		m_freem(m);	/* m was dereferenced above: never NULL */
+		error = ECONNRESET;
+		goto out;
+	}
+	if (control) {
+		/* SDP doesn't support control messages. */
+		if (control->m_len) {
+			m_freem(control);
+			m_freem(m);
+			error = EINVAL;
+			goto out;
+		}
+		m_freem(control);	/* empty control, just free it */
+	}
+	if (!(flags & PRUS_OOB)) {
+		SOCKBUF_LOCK(&so->so_snd);
+		sdp_append(ssk, &so->so_snd, m, cnt);
+		SOCKBUF_UNLOCK(&so->so_snd);
+		if (nam && ssk->state < TCPS_SYN_SENT) {
+			/*
+			 * Do implied connect if not yet connected.
+			 */
+			error = sdp_start_connect(ssk, nam, td);
+			if (error)
+				goto out;
+		}
+		if (flags & PRUS_EOF) {
+			/*
+			 * Close the send side of the connection after
+			 * the data is sent.
+			 */
+			socantsendmore(so);
+			sdp_usrclosed(ssk);
+			if (!(ssk->flags & SDP_DROPPED))
+				sdp_output_disconnect(ssk);
+		} else if (!(ssk->flags & SDP_DROPPED) &&
+		    !(flags & PRUS_MORETOCOME))
+			sdp_post_sends(ssk, M_NOWAIT);
+		SDP_WUNLOCK(ssk);
+		return (0);
+	} else {
+		SOCKBUF_LOCK(&so->so_snd);
+		if (sbspace(&so->so_snd) < -512) {
+			SOCKBUF_UNLOCK(&so->so_snd);
+			m_freem(m);
+			error = ENOBUFS;
+			goto out;
+		}
+		/*
+		 * According to RFC961 (Assigned Protocols),
+		 * the urgent pointer points to the last octet
+		 * of urgent data.  We continue, however,
+		 * to consider it to indicate the first octet
+		 * of data past the urgent section.
+		 * Otherwise, snd_up should be one lower.
+		 */
+		m->m_flags |= M_URG | M_PUSH;
+		sdp_append(ssk, &so->so_snd, m, cnt);
+		SOCKBUF_UNLOCK(&so->so_snd);
+		if (nam && ssk->state < TCPS_SYN_SENT) {
+			/*
+			 * Do implied connect if not yet connected.
+			 */
+			error = sdp_start_connect(ssk, nam, td);
+			if (error)
+				goto out;
+		}
+		sdp_post_sends(ssk, M_NOWAIT);
+		SDP_WUNLOCK(ssk);
+		return (0);
+	}
+out:
+	SDP_WUNLOCK(ssk);
+	return (error);
+}
+
+#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
+
+/*
+ * Send on a socket.  If send must go all at once and message is larger than
+ * send buffering, then hard error.  Lock against other senders.  If must go
+ * all at once and not enough room now, then inform user that this would
+ * block and do nothing.  Otherwise, if nonblocking, send as much as
+ * possible.  The data to be sent is described by "uio" if nonzero, otherwise
+ * by the mbuf chain "top" (which must be null if uio is not).  Data provided
+ * in mbuf chain must be small enough to send all at once.
+ *
+ * Returns nonzero on error, timeout or signal; callers must check for short
+ * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
+ * on return.
+ */
+static int
+sdp_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
+    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
+{
+	struct sdp_sock *ssk;
+	long space, resid;
+	int atomic;
+	int error;
+	int copy;
+
+	if (uio != NULL)
+		resid = uio->uio_resid;
+	else
+		resid = top->m_pkthdr.len;
+	atomic = top != NULL;	/* a caller-built chain goes out in one piece */
+	if (control != NULL) {
+		if (control->m_len) {
+			m_freem(control);
+			if (top)
+				m_freem(top);
+			return (EINVAL);	/* control data is refused */
+		}
+		m_freem(control);
+		control = NULL;
+	}
+	/*
+	 * In theory resid should be unsigned.  However, space must be
+	 * signed, as it might be less than 0 if we over-committed, and we
+	 * must use a signed comparison of space and resid.  On the other
+	 * hand, a negative resid causes us to loop sending 0-length
+	 * segments to the protocol.
+	 *
+	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
+	 * type sockets since that's an error.
+	 */
+	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
+		error = EINVAL;
+		goto out;
+	}
+	if (td != NULL)
+		td->td_ru.ru_msgsnd++;
+
+	ssk = sdp_sk(so);
+	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
+	if (error)
+		goto out;
+
+restart:
+	do {
+		SOCKBUF_LOCK(&so->so_snd);
+		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
+			SOCKBUF_UNLOCK(&so->so_snd);
+			error = EPIPE;
+			goto release;
+		}
+		if (so->so_error) {
+			error = so->so_error;
+			so->so_error = 0;
+			SOCKBUF_UNLOCK(&so->so_snd);
+			goto release;
+		}
+		if ((so->so_state & SS_ISCONNECTED) == 0 && addr == NULL) {
+			SOCKBUF_UNLOCK(&so->so_snd);
+			error = ENOTCONN;
+			goto release;
+		}
+		space = sbspace(&so->so_snd);
+		if (flags & MSG_OOB)
+			space += 1024;
+		if (atomic && resid > ssk->xmit_size_goal - SDP_HEAD_SIZE) {
+			SOCKBUF_UNLOCK(&so->so_snd);
+			error = EMSGSIZE;
+			goto release;
+		}
+		if (space < resid &&
+		    (atomic || space < so->so_snd.sb_lowat)) {
+			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
+				SOCKBUF_UNLOCK(&so->so_snd);
+				error = EWOULDBLOCK;
+				goto release;
+			}
+			error = sbwait(&so->so_snd);
+			SOCKBUF_UNLOCK(&so->so_snd);
+			if (error)
+				goto release;
+			goto restart;
+		}
+		SOCKBUF_UNLOCK(&so->so_snd);
+		do {
+			if (uio == NULL) {
+				resid = 0;
+				if (flags & MSG_EOR)
+					top->m_flags |= M_EOR;
+			} else {
+				/*
+				 * Copy the data from userland into a mbuf
+				 * chain.  If no data is to be copied in,
+				 * a single empty mbuf is returned.
+				 */
+				copy = min(space,
+				    ssk->xmit_size_goal - SDP_HEAD_SIZE);
+				top = m_uiotombuf(uio, M_WAITOK, copy,
+				    0, M_PKTHDR |
+				    ((flags & MSG_EOR) ? M_EOR : 0));
+				if (top == NULL) {
+					/* only possible error */
+					error = EFAULT;
+					goto release;
+				}
+				space -= resid - uio->uio_resid;
+				resid = uio->uio_resid;
+			}
+			/*
+			 * XXX all the SBS_CANTSENDMORE checks previously
+			 * done could be out of date after dropping the
+			 * socket lock.
+			 */
+			error = sdp_send(so, (flags & MSG_OOB) ? PRUS_OOB :
+			/*
+			 * Set EOF on the last send if the user specified
+			 * MSG_EOF.
+			 */
+			    ((flags & MSG_EOF) && (resid <= 0)) ? PRUS_EOF :
+			/* If there is more to send set PRUS_MORETOCOME. */
+			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
+			    top, addr, NULL, td);
+			top = NULL;	/* sdp_send() owns the chain now */
+			if (error)
+				goto release;
+		} while (resid && space > 0);
+	} while (resid);
+
+release:
+	sbunlock(&so->so_snd);
+out:
+	if (top != NULL)
+		m_freem(top);
+	return (error);
+}
+
+/*
+ * The part of soreceive() that implements reading non-inline out-of-band
+ * data from a socket.  For more complete comments, see soreceive(), from
+ * which this code originated.
+ *
+ * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
+ * unable to return an mbuf chain to the caller.
+ */
+static int
+soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
+{
+	struct mbuf *oob;
+	int error, len;
+
+	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
+	/* Have the protocol place its out-of-band data in a fresh mbuf. */
+	oob = m_get(M_WAIT, MT_DATA);
+	error = (*so->so_proto->pr_usrreqs->pru_rcvoob)(so, oob,
+	    flags & MSG_PEEK);
+	/* Copy out mbuf by mbuf until the chain or the uio is used up. */
+	while (error == 0) {
+		len = (int) min(uio->uio_resid, oob->m_len);
+		error = uiomove(mtod(oob, void *), len, uio);
+		oob = m_free(oob);
+		if (uio->uio_resid == 0 || oob == NULL)
+			break;
+	}
+	if (oob != NULL)
+		m_freem(oob);
+	return (error);
+}
+
+/*
+ * Optimized version of soreceive() for stream (TCP) sockets.
+ *
+ * Only SOCK_STREAM sockets are accepted, control data is not supported
+ * (controlp must be NULL) and no source address is returned (*psa is set
+ * to NULL).  MSG_OOB requests are handed to soreceive_rcvoob().  When mp0
+ * is non-NULL the data is returned as an mbuf chain in *mp0 and the uio is
+ * only used for its residual count; otherwise the data is copied out
+ * through the uio.
+ *
+ * Returns 0 on success or an errno value.
+ */
+static int
+sdp_sorecv(struct socket *so, struct sockaddr **psa, struct uio *uio,
+    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
+{
+	int len = 0, error = 0, flags, oresid;
+	struct sockbuf *sb;
+	struct mbuf *m, *n = NULL;
+	struct sdp_sock *ssk;
+
+	/* We only do stream sockets. */
+	if (so->so_type != SOCK_STREAM)
+		return (EINVAL);
+	if (psa != NULL)
+		*psa = NULL;
+	if (controlp != NULL)
+		return (EINVAL);
+	if (flagsp != NULL)
+		flags = *flagsp &~ MSG_EOR;
+	else
+		flags = 0;
+	if (flags & MSG_OOB)
+		return (soreceive_rcvoob(so, uio, flags));
+	if (mp0 != NULL)
+		*mp0 = NULL;
+
+	sb = &so->so_rcv;
+	ssk = sdp_sk(so);
+
+	/*
+	 * Prevent other readers from entering the socket.  If sblock() fails
+	 * neither the I/O lock nor the sockbuf mutex is held, so the common
+	 * unlock path at "out" must not be used.
+	 */
+	error = sblock(sb, SBLOCKWAIT(flags));
+	if (error)
+		return (error);
+	SOCKBUF_LOCK(sb);
+
+	/* Easy one, no space to copyout anything. */
+	if (uio->uio_resid == 0) {
+		error = EINVAL;
+		goto out;
+	}
+	oresid = uio->uio_resid;
+
+	/* We will never ever get anything unless we are connected. */
+	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
+		/* When disconnecting there may be still some data left. */
+		if (sb->sb_cc > 0)
+			goto deliver;
+		if (!(so->so_state & SS_ISDISCONNECTED))
+			error = ENOTCONN;
+		goto out;
+	}
+
+	/*
+	 * Socket buffer is empty and we shall not block.
+	 * SS_NBIO is a socket state bit, so test it in so_state rather than
+	 * in the sockbuf flags.
+	 */
+	if (sb->sb_cc == 0 &&
+	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
+		error = EAGAIN;
+		goto out;
+	}
+
+restart:
+	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+
+	/* Abort if socket has reported problems. */
+	if (so->so_error) {
+		if (sb->sb_cc > 0)
+			goto deliver;
+		/* Something was already delivered; report that first. */
+		if (oresid > uio->uio_resid)
+			goto out;
+		error = so->so_error;
+		if (!(flags & MSG_PEEK))
+			so->so_error = 0;
+		goto out;
+	}
+
+	/* Door is closed.  Deliver what is left, if any. */
+	if (sb->sb_state & SBS_CANTRCVMORE) {
+		if (sb->sb_cc > 0)
+			goto deliver;
+		else
+			goto out;
+	}
+
+	/* Socket buffer got some data that we shall deliver now. */
+	if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) &&
+	    ((so->so_state & SS_NBIO) ||
+	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
+	     sb->sb_cc >= sb->sb_lowat ||
+	     sb->sb_cc >= uio->uio_resid ||
+	     sb->sb_cc >= sb->sb_hiwat) ) {
+		goto deliver;
+	}
+
+	/*
+	 * On MSG_WAITALL we must wait until all data or error arrives.
+	 * Deliver early only when the request is satisfied or the buffer
+	 * has reached its high-water mark; using the low-water mark here
+	 * would make MSG_WAITALL return on the first byte.
+	 */
+	if ((flags & MSG_WAITALL) &&
+	    (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_hiwat))
+		goto deliver;
+
+	/*
+	 * Wait and block until (more) data comes in.
+	 * NB: Drops the sockbuf lock during wait.
+	 */
+	error = sbwait(sb);
+	if (error)
+		goto out;
+	goto restart;
+
+deliver:
+	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+	KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__));
+	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
+
+	/* Statistics. */
+	if (uio->uio_td)
+		uio->uio_td->td_ru.ru_msgrcv++;
+
+	/* Fill uio until full or current end of socket buffer is reached. */
+	len = min(uio->uio_resid, sb->sb_cc);
+	if (mp0 != NULL) {
+		/* Dequeue as many mbufs as possible. */
+		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
+			for (*mp0 = m = sb->sb_mb;
+			     m != NULL && m->m_len <= len;
+			     m = m->m_next) {
+				len -= m->m_len;
+				uio->uio_resid -= m->m_len;
+				sbfree(sb, m);
+				n = m;
+			}
+			sb->sb_mb = m;
+			if (sb->sb_mb == NULL)
+				SB_EMPTY_FIXUP(sb);
+			/* Terminate the chain handed to the caller. */
+			n->m_next = NULL;
+		}
+		/* Copy the remainder. */
+		if (len > 0) {
+			KASSERT(sb->sb_mb != NULL,
+			    ("%s: len > 0 && sb->sb_mb empty", __func__));
+
+			m = m_copym(sb->sb_mb, 0, len, M_DONTWAIT);
+			if (m == NULL)
+				len = 0;	/* Don't flush data from sockbuf. */
+			else
+				/*
+				 * The copy may span several mbufs; account
+				 * for all len bytes, not just the first mbuf.
+				 */
+				uio->uio_resid -= len;
+			if (*mp0 != NULL)
+				n->m_next = m;
+			else
+				*mp0 = m;
+			if (*mp0 == NULL) {
+				error = ENOBUFS;
+				goto out;
+			}
+		}
+	} else {
+		/* NB: Must unlock socket buffer as uiomove may sleep. */
+		SOCKBUF_UNLOCK(sb);
+		error = m_mbuftouio(uio, sb->sb_mb, len);
+		SOCKBUF_LOCK(sb);
+		if (error)
+			goto out;
+	}
+	SBLASTRECORDCHK(sb);
+	SBLASTMBUFCHK(sb);
+
+	/*
+	 * Remove the delivered data from the socket buffer unless we
+	 * were only peeking.
+	 */
+	if (!(flags & MSG_PEEK)) {
+		if (len > 0)
+			sbdrop_locked(sb, len);
+
+		/* Notify protocol that we drained some data. */
+		SOCKBUF_UNLOCK(sb);
+		SDP_WLOCK(ssk);
+		sdp_do_posts(ssk);
+		SDP_WUNLOCK(ssk);
+		SOCKBUF_LOCK(sb);
+	}
+
+	/*
+	 * For MSG_WAITALL we may have to loop again and wait for
+	 * more data to come in.
+	 */
+	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
+		goto restart;
+out:
+	SOCKBUF_LOCK_ASSERT(sb);
+	SBLASTRECORDCHK(sb);
+	SBLASTMBUFCHK(sb);
+	SOCKBUF_UNLOCK(sb);
+	sbunlock(sb);
+	return (error);
+}
+
+/*
+ * Tear down a connection, typically one that is still sitting in the
+ * accept queue.  The connection is dropped with ECONNABORTED unless it is
+ * already in SDP_TIMEWAIT or SDP_DROPPED.
+ */
+void
+sdp_abort(struct socket *so)
+{
+	struct sdp_sock *ssk = sdp_sk(so);
+
+	SDP_WLOCK(ssk);
+	/* Drop now if nobody has done so yet. */
+	if ((ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) == 0)
+		sdp_drop(ssk, ECONNABORTED);
+	KASSERT(ssk->flags & SDP_DROPPED, ("sdp_abort: %p not dropped 0x%X",
+	    ssk, ssk->flags));
+	SDP_WUNLOCK(ssk);
+}
+
+/*
+ * Close a SDP socket and initiate a friendly disconnect.
+ *
+ * If the connection is still not dropped after the disconnect has been
+ * started, SS_PROTOREF / SDP_SOCKREF are set so the socket layer knows the
+ * protocol keeps a reference to the socket and pcb.
+ */
+static void
+sdp_close(struct socket *so)
+{
+	struct sdp_sock *ssk = sdp_sk(so);
+
+	SDP_WLOCK(ssk);
+	/* Start the disconnect unless the connection is already going away. */
+	if ((ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) == 0)
+		sdp_start_disconnect(ssk);
+
+	/* Still alive: tell the socket layer we are holding on to it. */
+	if ((ssk->flags & SDP_DROPPED) == 0) {
+		SOCK_LOCK(so);
+		so->so_state |= SS_PROTOREF;
+		SOCK_UNLOCK(so);
+		ssk->flags |= SDP_SOCKREF;
+	}
+	SDP_WUNLOCK(ssk);
+}
+
+/*
+ * User requests out-of-band data.
+ *
+ * Stores the saved out-of-band byte (ssk->iobc) in the caller supplied
+ * mbuf and sets its length to 1.  Unless MSG_PEEK is given, SDP_HAVEOOB
+ * and SDP_HADOOB are toggled so the byte is not handed out twice.
+ *
+ * Returns 0 on success, ECONNRESET if the rx ring lock cannot be taken or
+ * the connection is in SDP_TIMEWAIT/SDP_DROPPED, EINVAL if there is no
+ * OOB mark, SO_OOBINLINE is set or SDP_HADOOB is set, and EWOULDBLOCK if
+ * SDP_HAVEOOB is not set.
+ */
+static int
+sdp_rcvoob(struct socket *so, struct mbuf *m, int flags)
+{
+	int error = 0;
+	struct sdp_sock *ssk;
+
+	ssk = sdp_sk(so);
+	SDP_WLOCK(ssk);
+	/* Only a trylock is attempted; failure is reported as a reset. */
+	if (!rx_ring_trylock(&ssk->rx_ring)) {
+		SDP_WUNLOCK(ssk);
+		return (ECONNRESET);
+	}
+	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
+		error = ECONNRESET;
+		goto out;
+	}
+	if ((so->so_oobmark == 0 &&
+	     (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) ||
+	    so->so_options & SO_OOBINLINE ||
+	    ssk->oobflags & SDP_HADOOB) {
+		error = EINVAL;
+		goto out;
+	}
+	if ((ssk->oobflags & SDP_HAVEOOB) == 0) {
+		error = EWOULDBLOCK;
+		goto out;
+	}
+	m->m_len = 1;
+	*mtod(m, caddr_t) = ssk->iobc;
+	/* Both bits flip together: HAVEOOB is cleared, HADOOB is set. */
+	if ((flags & MSG_PEEK) == 0)
+		ssk->oobflags ^= (SDP_HAVEOOB | SDP_HADOOB);
+out:
+	/* Release in the reverse order of acquisition. */
+	rx_ring_unlock(&ssk->rx_ring);
+	SDP_WUNLOCK(ssk);
+	return (error);
+}
+
+/*
+ * Record the arrival of urgent data carried in the mbuf chain mb.
+ *
+ * The OOB mark is placed at the last byte of mb relative to the current
+ * receive buffer contents.  Unless SO_OOBINLINE is set, that last byte is
+ * pulled out of the chain and saved in ssk->iobc.
+ */
+void
+sdp_urg(struct sdp_sock *ssk, struct mbuf *mb)
+{
+	struct socket *so = ssk->socket;
+	struct mbuf *last;
+
+	if (so == NULL)
+		return;
+
+	so->so_oobmark = so->so_rcv.sb_cc + mb->m_pkthdr.len - 1;
+	sohasoutofband(so);
+	ssk->oobflags &= ~(SDP_HAVEOOB | SDP_HADOOB);
+
+	/* Inline delivery: the byte stays in the data stream. */
+	if (so->so_options & SO_OOBINLINE)
+		return;
+
+	/* Find the final mbuf and strip its trailing byte. */
+	last = mb;
+	while (last->m_next != NULL)
+		last = last->m_next;
+	ssk->iobc = mtod(last, char *)[last->m_len - 1];
+	ssk->oobflags |= SDP_HAVEOOB;
+	last->m_len--;
+	mb->m_pkthdr.len--;
+}
+
+/*
+ * Notify a sdp socket of an asynchronous error.
+ *
+ * The user is not woken up, since there currently is no mechanism for
+ * reporting soft errors (yet - a kqueue filter may be added).
+ *
+ * Must be called with the SDP write lock held.  Returns ssk when the
+ * error is ignored, otherwise whatever sdp_drop() returns.
+ */
+struct sdp_sock *
+sdp_notify(struct sdp_sock *ssk, int error)
+{
+
+	SDP_WLOCK_ASSERT(ssk);
+
+	/* Nothing to do for connections that are already going away. */
+	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED))
+		return (ssk);
+
+	/* Ignore some errors if we are hooked up. */
+	if (ssk->state == TCPS_ESTABLISHED) {
+		switch (error) {
+		case EHOSTUNREACH:
+		case ENETUNREACH:
+		case EHOSTDOWN:
+			return (ssk);
+		default:
+			break;
+		}
+	}
+
+	ssk->softerror = error;
+	return (sdp_drop(ssk, error));
+}
+
+/*
+ * Protocol control input: translate a PRC_* command concerning the
+ * foreign address in sa into an errno via inetctlerrmap[] and pass it to
+ * every matching SDP connection through sdp_notify().
+ *
+ * Only AF_INET addresses other than INADDR_ANY are handled.  Commands
+ * outside the table, or commands that map to no error, are ignored.
+ */
+static void
+sdp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
+{
+	struct in_addr faddr;
+
+	/* Check the family before interpreting sa as a sockaddr_in. */
+	if (sa->sa_family != AF_INET)
+		return;
+	faddr = ((struct sockaddr_in *)sa)->sin_addr;
+	if (faddr.s_addr == INADDR_ANY)
+		return;
+
+	/*
+	 * Never index inetctlerrmap[] out of bounds, and do not drop
+	 * connections for commands that carry no error.
+	 */
+	if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
+		return;
+
+	sdp_pcbnotifyall(faddr, inetctlerrmap[cmd], sdp_notify);
+}
+
+/*
+ * pru_control handler: no control requests are supported, so every call
+ * returns EOPNOTSUPP regardless of its arguments.
+ */
+static int
+sdp_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp,
+    struct thread *td)
+{
+	return (EOPNOTSUPP);
+}
+
+/*
+ * Keepalive callout handler.  data is the struct sdp_sock the callout
+ * belongs to.  Posts a keepalive and re-arms the callout as long as the
+ * connection has not been dropped and SO_KEEPALIVE is still set.
+ *
+ * NOTE(review): every exit path releases the SDP write lock, so this
+ * assumes the callout is set up to enter the handler with that lock held
+ * and to let the handler release it -- confirm at the callout init site.
+ */
+static void
+sdp_keepalive_timeout(void *data)
+{
+	struct sdp_sock *ssk;
+
+	ssk = data;
+	/*
+	 * Callout canceled.  Leave through the common exit so the lock is
+	 * released here as on every other path.
+	 */
+	if (!callout_active(&ssk->keep2msl))
+		goto out;
+	/* Callout rescheduled as a different kind of timer. */
+	if (callout_pending(&ssk->keep2msl))
+		goto out;
+	callout_deactivate(&ssk->keep2msl);
+	if (ssk->flags & SDP_DROPPED ||
+	    (ssk->socket->so_options & SO_KEEPALIVE) == 0)
+		goto out;
+	sdp_post_keepalive(ssk);
+	callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
+	    sdp_keepalive_timeout, ssk);
+out:
+	SDP_WUNLOCK(ssk);
+}
+
+
+/*
+ * Arm the keepalive callout for the socket unless it is already pending.
+ */
+void
+sdp_start_keepalive_timer(struct socket *so)
+{
+	struct sdp_sock *ssk = sdp_sk(so);
+
+	if (callout_pending(&ssk->keep2msl))
+		return;
+	callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
+	    sdp_keepalive_timeout, ssk);
+}
+
+/*
+ * Stop the keep2msl callout of the socket.
+ */
+static void
+sdp_stop_keepalive_timer(struct socket *so)
+{
+	struct sdp_sock *ssk = sdp_sk(so);
+
+	callout_stop(&ssk->keep2msl);
+}
+
+/*
+ * sdp_ctloutput() must drop the pcb lock before performing copyin on
+ * socket option arguments.  When it re-acquires the lock after the copy, it
+ * has to revalidate that the connection is still valid for the socket
+ * option.
+ *
+ * Note that the macro body operates on the local variable "ssk" of the
+ * enclosing function, not on its "inp" argument, and that it returns
+ * ECONNRESET from the enclosing function when the connection is gone.
+ */
+#define SDP_WLOCK_RECHECK(inp) do {					\
+	SDP_WLOCK(ssk);							\
+	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {		\
+		SDP_WUNLOCK(ssk);					\
+		return (ECONNRESET);					\
+	}								\
+} while(0)
+
+/*
+ * Socket option handler.
+ *
+ * SOL_SOCKET/SO_KEEPALIVE starts or stops the keepalive timer according
+ * to the current so_options.  At level IPPROTO_TCP only TCP_NODELAY is
+ * supported (get and set); any other name yields ENOPROTOOPT.  Options at
+ * other levels return 0 without further action.  ECONNRESET is returned
+ * when the connection is in SDP_TIMEWAIT or SDP_DROPPED.
+ */
+static int
+sdp_ctloutput(struct socket *so, struct sockopt *sopt)
+{
+	int	error, opt, optval;
+	struct sdp_sock *ssk;
+
+	error = 0;
+	ssk = sdp_sk(so);
+	if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_KEEPALIVE) {
+		SDP_WLOCK(ssk);
+		if (so->so_options & SO_KEEPALIVE)
+			sdp_start_keepalive_timer(so);
+		else
+			sdp_stop_keepalive_timer(so);
+		SDP_WUNLOCK(ssk);
+	}
+	if (sopt->sopt_level != IPPROTO_TCP)
+		return (error);
+
+	SDP_WLOCK(ssk);
+	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
+		SDP_WUNLOCK(ssk);
+		return (ECONNRESET);
+	}
+
+	/* Every case below is responsible for releasing the lock itself. */
+	switch (sopt->sopt_dir) {
+	case SOPT_SET:
+		switch (sopt->sopt_name) {
+		case TCP_NODELAY:
+			/* Unlock across the copyin, then revalidate. */
+			SDP_WUNLOCK(ssk);
+			error = sooptcopyin(sopt, &optval, sizeof optval,
+			    sizeof optval);
+			if (error)
+				return (error);
+
+			SDP_WLOCK_RECHECK(ssk);
+			opt = SDP_NODELAY;
+			if (optval)
+				ssk->flags |= opt;
+			else
+				ssk->flags &= ~opt;
+			sdp_do_posts(ssk);
+			SDP_WUNLOCK(ssk);
+			break;
+
+		default:
+			SDP_WUNLOCK(ssk);
+			error = ENOPROTOOPT;
+			break;
+		}
+		break;
+
+	case SOPT_GET:
+		switch (sopt->sopt_name) {
+		case TCP_NODELAY:
+			/* Reports the raw flag bit, not a normalized 0/1. */
+			optval = ssk->flags & SDP_NODELAY;
+			SDP_WUNLOCK(ssk);
+			error = sooptcopyout(sopt, &optval, sizeof optval);
+			break;
+		default:
+			SDP_WUNLOCK(ssk);
+			error = ENOPROTOOPT;
+			break;
+		}
+		break;
+	}
+	return (error);
+}
+#undef SDP_WLOCK_RECHECK
+
+/* Default receive CQ moderation parameters; moderation is off while <= 0. */
+int sdp_mod_count = 0;
+int sdp_mod_usec = 0;
+
+/*
+ * Apply the default moderation settings to the receive completion queue
+ * of ssk.  Nothing is done unless both parameters are positive.
+ */
+void
+sdp_set_default_moderation(struct sdp_sock *ssk)
+{
+	if (sdp_mod_count > 0 && sdp_mod_usec > 0)
+		ib_modify_cq(ssk->rx_ring.cq, sdp_mod_count, sdp_mod_usec);
+}
+
+
+/*
+ * ib_client "add" callback: set up the per-device SDP state.
+ *
+ * Allocates a struct sdp_device and creates, in order, a protection
+ * domain, a DMA memory region and an FMR pool for it, then registers it
+ * as this client's data on the device.  On any failure the resources
+ * created so far are released in reverse order and no client data is
+ * set.
+ */
+static void
+sdp_dev_add(struct ib_device *device)
+{
+	struct ib_fmr_pool_param param;
+	struct sdp_device *sdp_dev;
+
+	sdp_dev = malloc(sizeof(*sdp_dev), M_SDP, M_WAITOK | M_ZERO);
+	sdp_dev->pd = ib_alloc_pd(device);
+	if (IS_ERR(sdp_dev->pd))
+		goto out_pd;
+        sdp_dev->mr = ib_get_dma_mr(sdp_dev->pd, IB_ACCESS_LOCAL_WRITE);
+        if (IS_ERR(sdp_dev->mr))
+		goto out_mr;
+	memset(&param, 0, sizeof param);
+	param.max_pages_per_fmr = SDP_FMR_SIZE;
+	param.page_shift = PAGE_SHIFT;
+	param.access = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ);
+	param.pool_size = SDP_FMR_POOL_SIZE;
+	param.dirty_watermark = SDP_FMR_DIRTY_SIZE;
+	param.cache = 1;
+	sdp_dev->fmr_pool = ib_create_fmr_pool(sdp_dev->pd, &param);
+	if (IS_ERR(sdp_dev->fmr_pool))
+		goto out_fmr;
+	ib_set_client_data(device, &sdp_client, sdp_dev);
+	return;
+
+	/* Error unwinding: each label undoes the step before the failure. */
+out_fmr:
+	ib_dereg_mr(sdp_dev->mr);
+out_mr:
+	ib_dealloc_pd(sdp_dev->pd);
+out_pd:
+	free(sdp_dev, M_SDP);
+}
+
+/*
+ * ib_client "remove" callback: the device is going away.
+ *
+ * Every SDP connection bound to the device that is not already marked
+ * SDP_DESTROY is notified with ECONNRESET, then the per-device resources
+ * created by sdp_dev_add() are released.
+ */
+static void
+sdp_dev_rem(struct ib_device *device)
+{
+	struct sdp_device *sdp_dev;
+	struct sdp_sock *ssk, *nssk, *locked;
+
+	SDP_LIST_WLOCK();
+	/*
+	 * sdp_notify() may return NULL instead of the pcb it was given, so
+	 * the successor is fetched up front and the loop cursor is never
+	 * overwritten with the return value.
+	 */
+	LIST_FOREACH_SAFE(ssk, &sdp_list, list, nssk) {
+		if (ssk->ib_device != device)
+			continue;
+		SDP_WLOCK(ssk);
+		locked = ssk;
+		if ((ssk->flags & SDP_DESTROY) == 0)
+			locked = sdp_notify(ssk, ECONNRESET);
+		if (locked != NULL)
+			SDP_WUNLOCK(locked);
+	}
+	SDP_LIST_WUNLOCK();
+	/*
+	 * XXX Do I need to wait between these two?
+	 */
+	sdp_dev = ib_get_client_data(device, &sdp_client);
+	if (!sdp_dev)
+		return;
+	ib_flush_fmr_pool(sdp_dev->fmr_pool);
+	ib_destroy_fmr_pool(sdp_dev->fmr_pool);
+	ib_dereg_mr(sdp_dev->mr);
+	ib_dealloc_pd(sdp_dev->pd);
+	free(sdp_dev, M_SDP);
+}
+
+/* IB client descriptor: per-device add/remove callbacks for SDP. */
+struct ib_client sdp_client =
+    { .name = "sdp", .add = sdp_dev_add, .remove = sdp_dev_rem };
+
+
+/*
+ * Sysctl handler exporting the list of SDP connections.
+ *
+ * The output is a struct xinpgen header, one struct xtcpcb per visible
+ * connection, and a trailing struct xinpgen.  A request without an output
+ * buffer only reports a size estimate; a request that tries to write is
+ * rejected with EPERM.  Connections the caller is not allowed to see are
+ * silently skipped.
+ */
+static int
+sdp_pcblist(SYSCTL_HANDLER_ARGS)
+{
+	int error, n, i;
+	struct sdp_sock *ssk;
+	struct xinpgen xig;
+
+	/*
+	 * The process of preparing the TCB list is too time-consuming and
+	 * resource-intensive to repeat twice on every request.
+	 */
+	if (req->oldptr == NULL) {
+		/* Estimate with some slack for connections created later. */
+		n = sdp_count;
+		n += imax(n / 8, 10);
+		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb);
+		return (0);
+	}
+
+	if (req->newptr != NULL)
+		return (EPERM);
+
+	/*
+	 * OK, now we're committed to doing something.
+	 */
+	SDP_LIST_RLOCK();
+	n = sdp_count;
+	SDP_LIST_RUNLOCK();
+
+	error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
+		+ n * sizeof(struct xtcpcb));
+	if (error != 0)
+		return (error);
+
+	/*
+	 * Clear the whole structure first so that no uninitialized kernel
+	 * stack bytes (padding or fields not set below) reach userland.
+	 */
+	bzero(&xig, sizeof(xig));
+	xig.xig_len = sizeof xig;
+	xig.xig_count = n;
+	xig.xig_gen = 0;
+	xig.xig_sogen = so_gencnt;
+	error = SYSCTL_OUT(req, &xig, sizeof xig);
+	if (error)
+		return (error);
+
+	SDP_LIST_RLOCK();
+	/* i counts exported entries only; skipped ones do not advance it. */
+	for (ssk = LIST_FIRST(&sdp_list), i = 0;
+	    ssk != NULL && i < n; ssk = LIST_NEXT(ssk, list)) {
+		struct xtcpcb xt;
+
+		SDP_RLOCK(ssk);
+		if (ssk->flags & SDP_TIMEWAIT) {
+			if (ssk->cred != NULL)
+				error = cr_cansee(req->td->td_ucred,
+				    ssk->cred);
+			else
+				error = EINVAL;	/* Skip this inp. */
+		} else if (ssk->socket)
+			error = cr_canseesocket(req->td->td_ucred,
+			    ssk->socket);
+		else
+			error = EINVAL;
+		if (error) {
+			/* Not visible to the caller: skip without failing. */
+			error = 0;
+			goto next;
+		}
+
+		bzero(&xt, sizeof(xt));
+		xt.xt_len = sizeof xt;
+		xt.xt_inp.inp_gencnt = 0;
+		xt.xt_inp.inp_vflag = INP_IPV4;
+		memcpy(&xt.xt_inp.inp_laddr, &ssk->laddr, sizeof(ssk->laddr));
+		xt.xt_inp.inp_lport = ssk->lport;
+		memcpy(&xt.xt_inp.inp_faddr, &ssk->faddr, sizeof(ssk->faddr));
+		xt.xt_inp.inp_fport = ssk->fport;
+		xt.xt_tp.t_state = ssk->state;
+		if (ssk->socket != NULL)
+			sotoxsocket(ssk->socket, &xt.xt_socket);
+		else
+			bzero(&xt.xt_socket, sizeof xt.xt_socket);
+		xt.xt_socket.xso_protocol = IPPROTO_TCP;
+		SDP_RUNLOCK(ssk);
+		error = SYSCTL_OUT(req, &xt, sizeof xt);
+		if (error)
+			break;
+		i++;
+		continue;
+next:
+		SDP_RUNLOCK(ssk);
+	}
+	if (!error) {
+		/*
+		 * Give the user an updated idea of our state.
+		 * If the generation differs from what we told
+		 * her before, she knows that something happened
+		 * while we were processing this request, and it
+		 * might be necessary to retry.
+		 */
+		xig.xig_gen = 0;
+		xig.xig_sogen = so_gencnt;
+		xig.xig_count = sdp_count;
+		error = SYSCTL_OUT(req, &xig, sizeof xig);
+	}
+	SDP_LIST_RUNLOCK();
+	return (error);
+}
+
+/* net.inet.sdp sysctl node and its read-only "pcblist" entry. */
+SYSCTL_NODE(_net_inet, -1,  sdp,    CTLFLAG_RW, 0,  "SDP");
+
+SYSCTL_PROC(_net_inet_sdp, TCPCTL_PCBLIST, pcblist,
+    CTLFLAG_RD | CTLTYPE_STRUCT, 0, 0, sdp_pcblist, "S,xtcpcb",
+    "List of active SDP connections");
+
+/*
+ * maxsockets_change event handler (registered in sdp_init()): re-apply
+ * the current maxsockets value as the limit of the sdp_sock zone.
+ */
+static void
+sdp_zone_change(void *tag)
+{
+
+	uma_zone_set_max(sdp_zone, maxsockets);
+}
+
+/*
+ * Domain initialization (sdpdomain.dom_init): set up the connection list,
+ * the sdp_sock UMA zone limited to maxsockets, the maxsockets_change
+ * handler, the rx completion workqueue and the IB client registration.
+ */
+static void
+sdp_init(void)
+{
+
+	LIST_INIT(&sdp_list);
+	sdp_zone = uma_zcreate("sdp_sock", sizeof(struct sdp_sock),
+	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+	uma_zone_set_max(sdp_zone, maxsockets);
+	EVENTHANDLER_REGISTER(maxsockets_change, sdp_zone_change, NULL,
+		EVENTHANDLER_PRI_ANY);
+	rx_comp_wq = create_singlethread_workqueue("rx_comp_wq");
+	/* Registered last, after the state set up above is in place. */
+	ib_register_client(&sdp_client);
+}
+
+/* Defined further down; referenced by the protosw entries. */
+extern struct domain sdpdomain;
+
+/*
+ * User request vector for SDP sockets.  Send and receive use the SDP
+ * specific sdp_sosend()/sdp_sorecv() implementations.
+ */
+struct pr_usrreqs sdp_usrreqs = {
+	.pru_abort =		sdp_abort,
+	.pru_accept =		sdp_accept,
+	.pru_attach =		sdp_attach,
+	.pru_bind =		sdp_bind,
+	.pru_connect =		sdp_connect,
+	.pru_control =		sdp_control,
+	.pru_detach =		sdp_detach,
+	.pru_disconnect =	sdp_disconnect,
+	.pru_listen =		sdp_listen,
+	.pru_peeraddr =		sdp_getpeeraddr,
+	.pru_rcvoob =		sdp_rcvoob,
+	.pru_send =		sdp_send,
+	.pru_sosend =		sdp_sosend,
+	.pru_soreceive =	sdp_sorecv,
+	.pru_shutdown =		sdp_shutdown,
+	.pru_sockaddr =		sdp_getsockaddr,
+	.pru_close =		sdp_close,
+};
+
+/*
+ * Protocol switch for the SDP domain.  The two entries are identical
+ * except for pr_protocol: the same stream implementation is offered for
+ * both IPPROTO_IP and IPPROTO_TCP.
+ */
+struct protosw sdpsw[] = {
+{
+	.pr_type =		SOCK_STREAM,
+	.pr_domain =		&sdpdomain,
+	.pr_protocol =		IPPROTO_IP,
+	.pr_flags =		PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,
+	.pr_ctlinput =		sdp_ctlinput,
+	.pr_ctloutput =		sdp_ctloutput,
+	.pr_usrreqs =		&sdp_usrreqs
+},
+{
+	.pr_type =		SOCK_STREAM,
+	.pr_domain =		&sdpdomain,
+	.pr_protocol =		IPPROTO_TCP,
+	.pr_flags =		PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,
+	.pr_ctlinput =		sdp_ctlinput,
+	.pr_ctloutput =		sdp_ctloutput,
+	.pr_usrreqs =		&sdp_usrreqs
+},
+};
+
+/*
+ * The AF_INET_SDP domain.  dom_protoswNPROTOSW points one element past
+ * the end of sdpsw[], computed from the array size.
+ */
+struct domain sdpdomain = {
+	.dom_family =		AF_INET_SDP,
+	.dom_name =		"SDP",
+	.dom_init =		sdp_init,
+	.dom_protosw =		sdpsw,
+	.dom_protoswNPROTOSW =	&sdpsw[sizeof(sdpsw)/sizeof(sdpsw[0])],
+};
+
+/* Register sdpdomain with the kernel's domain list. */
+DOMAIN_SET(sdp);
+
+/* Global debug verbosity knobs; data-path debugging is off by default. */
+int sdp_debug_level = 1;
+int sdp_data_debug_level = 0;
diff --git a/sys/ofed/drivers/infiniband/ulp/sdp/sdp_proc.c b/sys/ofed/drivers/infiniband/ulp/sdp/sdp_proc.c
new file mode 100644
index 0000000..74bc04a
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/sdp/sdp_proc.c
@@ -0,0 +1,533 @@
+/*
+ * Copyright (c) 2008 Mellanox Technologies Ltd.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/proc_fs.h>
+#include <rdma/sdp_socket.h>
+#include "sdp.h"
+
+#ifdef CONFIG_PROC_FS
+
+#define PROC_SDP_STATS "sdpstats"
+#define PROC_SDP_PERF "sdpprf"
+
+/*
+ * Description of one seq_file based proc entry, just like TCP fs:
+ * entry name, address family, the show callback installed into the
+ * iterator by sdp_seq_open(), and the file operations of the entry.
+ */
+struct sdp_seq_afinfo {
+	struct module           *owner;
+	char                    *name;
+	sa_family_t             family;
+	int                     (*seq_show) (struct seq_file *m, void *v);
+	struct file_operations  *seq_fops;
+};
+
+/*
+ * Per-open iterator state stored in seq->private: the address family, a
+ * running entry number (reset in sdp_seq_start(), incremented in
+ * sdp_seq_next(), printed by sdp_seq_show()) and the seq_operations
+ * filled in by sdp_seq_open().
+ */
+struct sdp_iter_state {
+	sa_family_t             family;
+	int                     num;
+	struct seq_operations   seq_ops;
+};
+
+/*
+ * Return the entry at zero-based position pos in sock_list, or NULL when
+ * the list holds fewer than pos + 1 entries.
+ */
+static void *sdp_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct sdp_sock *ssk;
+	loff_t remaining = pos;
+
+	/* Walking an empty list simply visits nothing. */
+	list_for_each_entry(ssk, &sock_list, sock_list) {
+		if (remaining == 0)
+			return ssk;
+		remaining--;
+	}
+
+	return NULL;
+}
+
+/*
+ * seq_file start callback.  Position 0 yields SEQ_START_TOKEN (the header
+ * line); position p > 0 yields list entry p - 1 with a SOCK_REF_SEQ
+ * reference held, or NULL past the end of the list.
+ */
+static void *sdp_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct sdp_iter_state *st = seq->private;
+	void *entry;
+
+	st->num = 0;
+
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
+
+	spin_lock_irq(&sock_list_lock);
+	entry = sdp_get_idx(seq, *pos - 1);
+	if (entry != NULL)
+		sock_hold((struct socket *)entry, SOCK_REF_SEQ);
+	spin_unlock_irq(&sock_list_lock);
+
+	return entry;
+}
+
+/*
+ * seq_file next callback.  Looks up the entry following v (the first list
+ * entry when v is SEQ_START_TOKEN), takes a SOCK_REF_SEQ reference on it,
+ * then advances *pos and the running entry number.
+ */
+static void *sdp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct sdp_iter_state *st = seq->private;
+	void *entry;
+
+	spin_lock_irq(&sock_list_lock);
+	entry = sdp_get_idx(seq, (v == SEQ_START_TOKEN) ? 0 : *pos);
+	if (entry != NULL)
+		sock_hold((struct socket *)entry, SOCK_REF_SEQ);
+	spin_unlock_irq(&sock_list_lock);
+
+	++*pos;
+	st->num++;
+
+	return entry;
+}
+
+/*
+ * seq_file stop callback.  Empty: the SOCK_REF_SEQ reference taken in
+ * sdp_seq_start()/sdp_seq_next() is released by sdp_seq_show(), not here.
+ */
+static void sdp_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+/* Width of one formatted output line, excluding the terminating NUL. */
+#define TMPSZ 150
+
+/*
+ * seq_file show callback.  Prints the column header for SEQ_START_TOKEN;
+ * otherwise prints one line describing socket v and drops the
+ * SOCK_REF_SEQ reference that start/next took on it.  Always returns 0.
+ */
+static int sdp_seq_show(struct seq_file *seq, void *v)
+{
+	struct sdp_iter_state *st;
+	struct socket *sk = v;
+	char tmpbuf[TMPSZ + 1];
+	unsigned int dest;
+	unsigned int src;
+	int uid;
+	unsigned long inode;
+	__u16 destp;
+	__u16 srcp;
+	__u32 rx_queue, tx_queue;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_printf(seq, "%-*s\n", TMPSZ - 1,
+				"  sl  local_address rem_address        "
+				"uid inode   rx_queue tx_queue state");
+		goto out;
+	}
+
+	st = seq->private;
+
+	dest = inet_sk(sk)->daddr;
+	src = inet_sk(sk)->rcv_saddr;
+	destp = ntohs(inet_sk(sk)->dport);
+	srcp = ntohs(inet_sk(sk)->sport);
+	uid = sock_i_uid(sk);
+	inode = sock_i_ino(sk);
+	rx_queue = rcv_nxt(sdp_sk(sk)) - sdp_sk(sk)->copied_seq;
+	tx_queue = sdp_sk(sk)->write_seq - sdp_sk(sk)->tx_ring.una_seq;
+
+	/* Bounded formatting: never write past the end of tmpbuf. */
+	snprintf(tmpbuf, sizeof(tmpbuf),
+		"%4d: %08X:%04X %08X:%04X %5d %lu	%08X:%08X %X",
+		st->num, src, srcp, dest, destp, uid, inode,
+		rx_queue, tx_queue, sk->sk_state);
+
+	seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
+
+	sock_put(sk, SOCK_REF_SEQ);
+out:
+	return 0;
+}
+
+/*
+ * open() handler of the "sdp" proc entry.  Allocates a zeroed
+ * sdp_iter_state, fills in its seq_operations from the sdp_seq_afinfo
+ * stored in the proc entry, opens the seq_file with them and stores the
+ * state in seq->private.
+ *
+ * Returns 0 on success, -EINVAL if the proc entry carries no afinfo,
+ * -ENOMEM if the allocation fails, or the error from seq_open().
+ */
+static int sdp_seq_open(struct inode *inode, struct file *file)
+{
+	struct sdp_seq_afinfo *afinfo = PDE(inode)->data;
+	struct seq_file *seq;
+	struct sdp_iter_state *s;
+	int rc;
+
+	if (unlikely(afinfo == NULL))
+		return -EINVAL;
+
+/*
+ * Workaround bogus warning by memtrack.  The kzalloc macro is removed for
+ * the call below and afterwards redefined in terms of _kzalloc, which
+ * itself expands to kzalloc.
+ */
+#define _kzalloc(size,flags) kzalloc(size,flags)
+#undef kzalloc
+	s = kzalloc(sizeof(*s), GFP_KERNEL);
+#define kzalloc(s,f) _kzalloc(s,f)	
+	if (!s)
+		return -ENOMEM;
+	s->family               = afinfo->family;
+	s->seq_ops.start        = sdp_seq_start;
+	s->seq_ops.next         = sdp_seq_next;
+	s->seq_ops.show         = afinfo->seq_show;
+	s->seq_ops.stop         = sdp_seq_stop;
+
+	rc = seq_open(file, &s->seq_ops);
+	if (rc)
+		goto out_kfree;
+	seq          = file->private_data;
+	seq->private = s;
+out:
+	return rc;
+out_kfree:
+	/* seq_open() failed: the iterator state is still ours to free. */
+	kfree(s);
+	goto out;
+}
+
+
+/* File operations of the "sdp" proc entry; filled in by sdp_proc_init(). */
+static struct file_operations sdp_seq_fops;
+/* Descriptor of the "sdp" proc entry, attached to it as its data. */
+static struct sdp_seq_afinfo sdp_seq_afinfo = {
+	.owner          = THIS_MODULE,
+	.name           = "sdp",
+	.family         = AF_INET_SDP,
+	.seq_show       = sdp_seq_show,
+	.seq_fops       = &sdp_seq_fops,
+};
+
+#ifdef SDPSTATS_ON
+DEFINE_PER_CPU(struct sdpstats, sdpstats);
+
+/*
+ * Print the n-bucket histogram h as rows of '*' bars under the heading
+ * str.  Bars are scaled so that the largest bucket is 50 characters wide.
+ * When is_log is set the row label is 1 << i (0 for the last bucket),
+ * otherwise it is the bucket index.
+ */
+static void sdpstats_seq_hist(struct seq_file *seq, char *str, u32 *h, int n,
+		int is_log)
+{
+	int i;
+	u32 max = 0;
+
+	seq_printf(seq, "%s:\n", str);
+
+	for (i = 0; i < n; i++) {
+		if (h[i] > max)
+			max = h[i];
+	}
+
+	if (max == 0) {
+		seq_printf(seq, " - all values are 0\n");
+		return;
+	}
+
+	for (i = 0; i < n; i++) {
+		char s[51];
+		/*
+		 * Scale in 64 bits: 50 * h[i] wraps a u32 for large counters,
+		 * which would draw a bar of the wrong length.
+		 */
+		u64 scaled = (u64)50 * h[i];
+		int val = is_log ? (i == n-1 ? 0 : 1<<i) : i;
+		int j;
+
+		do_div(scaled, max);
+		j = (int)scaled;	/* <= 50 because h[i] <= max */
+		memset(s, '*', j);
+		s[j] = '\0';
+
+		/* h[i] is unsigned; print it as such. */
+		seq_printf(seq, "%10d | %-50s - %u\n", val, s, h[i]);
+	}
+}
+
+/* Sum the per-CPU counter "var" over all possible CPUs; yields a u32. */
+#define SDPSTATS_COUNTER_GET(var) ({ \
+	u32 __val = 0;						\
+	unsigned int __i;                                       \
+	for_each_possible_cpu(__i)                              \
+		__val += per_cpu(sdpstats, __i).var;		\
+	__val;							\
+})	
+
+/*
+ * Add the per-CPU histogram "hist" (hist_len buckets) of every possible
+ * CPU into the array "sum".  The caller must have zeroed "sum".
+ */
+#define SDPSTATS_HIST_GET(hist, hist_len, sum) ({ \
+	unsigned int __i;                                       \
+	for_each_possible_cpu(__i) {                            \
+		unsigned int __j;				\
+		u32 *h = per_cpu(sdpstats, __i).hist;		\
+		for (__j = 0; __j < hist_len; __j++) { 		\
+			sum[__j] += h[__j];			\
+		} \
+	} 							\
+})
+
+/*
+ * Accumulate histogram "hist" over all CPUs into a temporary array and
+ * print it with sdpstats_seq_hist().  The temporary holds
+ * SDPSTATS_MAX_HIST_SIZE buckets.
+ */
+#define __sdpstats_seq_hist(seq, msg, hist, is_log) ({		\
+	u32 tmp_hist[SDPSTATS_MAX_HIST_SIZE];			\
+	int hist_len = ARRAY_SIZE(__get_cpu_var(sdpstats).hist);\
+	memset(tmp_hist, 0, sizeof(tmp_hist));			\
+	SDPSTATS_HIST_GET(hist, hist_len, tmp_hist);	\
+	sdpstats_seq_hist(seq, msg, tmp_hist, hist_len, is_log);\
+})
+
+/*
+ * Show callback of the "sdpstats" proc entry: prints three histograms
+ * followed by the summed per-CPU counters.  Always returns 0.
+ */
+static int sdpstats_seq_show(struct seq_file *seq, void *v)
+{
+	int i;
+
+	seq_printf(seq, "SDP statistics:\n");
+
+	/* The last argument selects log-scale (1) or linear (0) labels. */
+	__sdpstats_seq_hist(seq, "sendmsg_seglen", sendmsg_seglen, 1);
+	__sdpstats_seq_hist(seq, "send_size", send_size, 1);
+	__sdpstats_seq_hist(seq, "credits_before_update",
+		credits_before_update, 0);
+
+	seq_printf(seq, "sdp_sendmsg() calls\t\t: %d\n",
+		SDPSTATS_COUNTER_GET(sendmsg));
+	seq_printf(seq, "bcopy segments     \t\t: %d\n",
+		SDPSTATS_COUNTER_GET(sendmsg_bcopy_segment));
+	seq_printf(seq, "bzcopy segments    \t\t: %d\n",
+		SDPSTATS_COUNTER_GET(sendmsg_bzcopy_segment));
+	seq_printf(seq, "zcopy segments    \t\t: %d\n",
+		SDPSTATS_COUNTER_GET(sendmsg_zcopy_segment));
+	seq_printf(seq, "post_send_credits  \t\t: %d\n",
+		SDPSTATS_COUNTER_GET(post_send_credits));
+	seq_printf(seq, "memcpy_count       \t\t: %u\n",
+		SDPSTATS_COUNTER_GET(memcpy_count));
+
+	/* One line per message id for which mid2str() yields a name. */
+        for (i = 0; i < ARRAY_SIZE(__get_cpu_var(sdpstats).post_send); i++) {
+                if (mid2str(i)) {
+                        seq_printf(seq, "post_send %-20s\t: %d\n",
+                                        mid2str(i),
+					SDPSTATS_COUNTER_GET(post_send[i]));
+                }
+        }
+
+	seq_printf(seq, "\n");
+	seq_printf(seq, "post_recv         \t\t: %d\n",
+		SDPSTATS_COUNTER_GET(post_recv));
+	seq_printf(seq, "BZCopy poll miss  \t\t: %d\n",
+		SDPSTATS_COUNTER_GET(bzcopy_poll_miss));
+	seq_printf(seq, "send_wait_for_mem \t\t: %d\n",
+		SDPSTATS_COUNTER_GET(send_wait_for_mem));
+	seq_printf(seq, "send_miss_no_credits\t\t: %d\n",
+		SDPSTATS_COUNTER_GET(send_miss_no_credits));
+
+	seq_printf(seq, "rx_poll_miss      \t\t: %d\n", SDPSTATS_COUNTER_GET(rx_poll_miss));
+	seq_printf(seq, "tx_poll_miss      \t\t: %d\n", SDPSTATS_COUNTER_GET(tx_poll_miss));
+	seq_printf(seq, "tx_poll_busy      \t\t: %d\n", SDPSTATS_COUNTER_GET(tx_poll_busy));
+	seq_printf(seq, "tx_poll_hit       \t\t: %d\n", SDPSTATS_COUNTER_GET(tx_poll_hit));
+
+	seq_printf(seq, "CQ stats:\n");
+	seq_printf(seq, "- RX interrupts\t\t: %d\n", SDPSTATS_COUNTER_GET(rx_int_count));
+	seq_printf(seq, "- TX interrupts\t\t: %d\n", SDPSTATS_COUNTER_GET(tx_int_count));
+
+	seq_printf(seq, "ZCopy stats:\n");
+	seq_printf(seq, "- TX timeout\t\t: %d\n", SDPSTATS_COUNTER_GET(zcopy_tx_timeout));
+	seq_printf(seq, "- TX cross send\t\t: %d\n", SDPSTATS_COUNTER_GET(zcopy_cross_send));
+	seq_printf(seq, "- TX aborted by peer\t: %d\n", SDPSTATS_COUNTER_GET(zcopy_tx_aborted));
+	seq_printf(seq, "- TX error\t\t: %d\n", SDPSTATS_COUNTER_GET(zcopy_tx_error));
+	return 0;
+}
+
+/*
+ * write() handler of the "sdpstats" proc entry.  The written data is not
+ * examined: any write zeroes the statistics of every possible CPU and
+ * reports the whole buffer as consumed.
+ */
+static ssize_t sdpstats_write(struct file *file, const char __user *buf,
+			    size_t count, loff_t *offs)
+{
+	int i;
+
+	for_each_possible_cpu(i)
+		memset(&per_cpu(sdpstats, i), 0, sizeof(struct sdpstats));
+	printk(KERN_WARNING "Cleared sdp statistics\n");
+
+	return count;
+}
+
+/* open() handler of "sdpstats": single-shot seq_file without private data. */
+static int sdpstats_seq_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, sdpstats_seq_show, NULL);
+}
+
+/* File operations of the "sdpstats" proc entry (read shows, write clears). */
+static struct file_operations sdpstats_fops = {
+	.owner		= THIS_MODULE,
+	.open		= sdpstats_seq_open,
+	.read		= seq_read,
+	.write		= sdpstats_write,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+#endif
+
+#ifdef SDP_PROFILING
+/* Profiling log storage and the number of entries logged so far. */
+struct sdpprf_log sdpprf_log[SDPPRF_LOG_SIZE];
+int sdpprf_log_count;
+
+/* Timestamp that printed times are relative to; set once in sdpprf_start(). */
+static unsigned long long start_t;
+
+/*
+ * Show callback of the "sdpprf" proc entry: prints one profiling log
+ * record, with its timestamp relative to start_t as seconds.microseconds.
+ * Prints a placeholder line when the log is empty.  Always returns 0.
+ */
+static int sdpprf_show(struct seq_file *m, void *v)
+{
+	struct sdpprf_log *l = v;
+	unsigned long nsec_rem;
+	/* do_div() needs a 64-bit dividend, also on 32-bit platforms. */
+	unsigned long long t;
+
+	if (!sdpprf_log_count) {
+		seq_printf(m, "No performance logs\n");
+		goto out;
+	}
+
+	t = l->time - start_t;
+	/* Splits t into whole seconds (left in t) and a nanosecond rest. */
+	nsec_rem = do_div(t, 1000000000);
+
+	seq_printf(m, "%-6d: [%5lu.%06lu] %-50s - [%d{%d} %d:%d] "
+			"mb: %p %s:%d\n",
+			l->idx, (unsigned long)t, nsec_rem/1000,
+			l->msg, l->pid, l->cpu, l->sk_num, l->sk_dport,
+			l->mb, l->func, l->line);
+out:
+	return 0;
+}
+
+/*
+ * seq_file start callback for the profiling log.  Returns
+ * SEQ_START_TOKEN at position 0 of an empty log, NULL once *pos reaches
+ * MIN(sdpprf_log_count, SDPPRF_LOG_SIZE - 1), and otherwise a pointer to
+ * the log record for *pos.
+ */
+static void *sdpprf_start(struct seq_file *p, loff_t *pos)
+{
+	int idx = *pos;
+
+	if (!*pos) {
+		if (!sdpprf_log_count)
+			return SEQ_START_TOKEN;
+	}
+
+	if (*pos >= MIN(sdpprf_log_count, SDPPRF_LOG_SIZE - 1))
+		return NULL;
+
+	/*
+	 * Once the count has reached SDPPRF_LOG_SIZE - 1 the position is
+	 * offset by the count and masked.
+	 * NOTE(review): the masking presumably relies on SDPPRF_LOG_SIZE
+	 * being a power of two -- confirm.
+	 */
+	if (sdpprf_log_count >= SDPPRF_LOG_SIZE - 1) {
+		int off = sdpprf_log_count & (SDPPRF_LOG_SIZE - 1);
+		idx = (idx + off) & (SDPPRF_LOG_SIZE - 1);
+
+	}
+
+	/* The first record ever shown becomes the time reference. */
+	if (!start_t)
+		start_t = sdpprf_log[idx].time;
+	return &sdpprf_log[idx];
+}
+
+/*
+ * seq_file next callback for the profiling log.  Advances *pos and
+ * returns the record after v, wrapping to the first record, or NULL once
+ * *pos reaches MIN(sdpprf_log_count, SDPPRF_LOG_SIZE - 1).
+ */
+static void *sdpprf_next(struct seq_file *p, void *v, loff_t *pos)
+{
+	struct sdpprf_log *l = v;
+
+	if (++*pos >= MIN(sdpprf_log_count, SDPPRF_LOG_SIZE - 1))
+		return NULL;
+
+	++l;
+	/*
+	 * NOTE(review): this wraps at index SDPPRF_LOG_SIZE - 1, while
+	 * sdpprf_start() can produce that index through its mask -- confirm
+	 * which bound is intended.
+	 */
+	if (l - &sdpprf_log[0] >= SDPPRF_LOG_SIZE - 1)
+		return &sdpprf_log[0];
+
+	return l;
+}
+
+/* seq_file stop callback for the profiling log; nothing to do. */
+static void sdpprf_stop(struct seq_file *p, void *v)
+{
+}
+
+/* seq_file iterator over the profiling log. */
+static struct seq_operations sdpprf_ops = {
+	.start = sdpprf_start,
+	.stop = sdpprf_stop,
+	.next = sdpprf_next,
+	.show = sdpprf_show,
+};
+
+/* open() handler of "sdpprf": attach the profiling log iterator. */
+static int sdpprf_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &sdpprf_ops);
+}
+
+/*
+ * write() handler of the "sdpprf" proc entry.  The written data is not
+ * examined: any write resets the log entry count to zero and reports the
+ * whole buffer as consumed.
+ */
+static ssize_t sdpprf_write(struct file *file, const char __user *buf,
+			    size_t count, loff_t *offs)
+{
+	sdpprf_log_count = 0;
+	printk(KERN_INFO "Cleared sdpprf statistics\n");
+
+	return count;
+}
+
+/* File operations of the "sdpprf" proc entry (read shows, write clears). */
+static struct file_operations sdpprf_fops = {
+	.open           = sdpprf_open,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release        = seq_release,
+	.write		= sdpprf_write,
+};
+#endif /* SDP_PROFILING */
+
+/*
+ * Create the SDP proc entries: "sdp" always, "sdpstats" when SDPSTATS_ON
+ * is defined and "sdpprf" when SDP_PROFILING is defined.
+ *
+ * Returns 0 on success.  If any entry cannot be created, the entries
+ * created before it are removed again and -ENOMEM is returned.
+ */
+int __init sdp_proc_init(void)
+{
+	struct proc_dir_entry *p = NULL;
+#ifdef SDPSTATS_ON
+	struct proc_dir_entry *stats = NULL;
+#endif
+#ifdef SDP_PROFILING
+	struct proc_dir_entry *prof = NULL;
+#endif
+
+	/* sdp_seq_fops is defined empty; fill it in before registering. */
+	sdp_seq_afinfo.seq_fops->owner         = sdp_seq_afinfo.owner;
+	sdp_seq_afinfo.seq_fops->open          = sdp_seq_open;
+	sdp_seq_afinfo.seq_fops->read          = seq_read;
+	sdp_seq_afinfo.seq_fops->llseek        = seq_lseek;
+	sdp_seq_afinfo.seq_fops->release       = seq_release_private;
+
+	p = proc_net_fops_create(&init_net, sdp_seq_afinfo.name, S_IRUGO,
+				 sdp_seq_afinfo.seq_fops);
+	if (p)
+		p->data = &sdp_seq_afinfo;
+	else
+		goto no_mem;
+
+#ifdef SDPSTATS_ON
+
+	stats = proc_net_fops_create(&init_net, PROC_SDP_STATS,
+			S_IRUGO | S_IWUGO, &sdpstats_fops);
+	if (!stats)
+		goto no_mem_stats;
+
+#endif
+
+#ifdef SDP_PROFILING
+	prof = proc_net_fops_create(&init_net, PROC_SDP_PERF,
+			S_IRUGO | S_IWUGO, &sdpprf_fops);
+	if (!prof)
+		goto no_mem_prof;
+#endif
+
+	return 0;
+
+	/* Error unwinding: falls through, removing entries in reverse order. */
+#ifdef SDP_PROFILING
+no_mem_prof:
+#endif
+
+#ifdef SDPSTATS_ON
+	proc_net_remove(&init_net, PROC_SDP_STATS);
+
+no_mem_stats:
+#endif
+	proc_net_remove(&init_net, sdp_seq_afinfo.name);
+
+no_mem:	
+	return -ENOMEM;
+}
+
+/*
+ * Remove the proc entries created by sdp_proc_init() and clear the file
+ * operations that sdp_proc_init() filled in.
+ */
+void sdp_proc_unregister(void)
+{
+	proc_net_remove(&init_net, sdp_seq_afinfo.name);
+	memset(sdp_seq_afinfo.seq_fops, 0, sizeof(*sdp_seq_afinfo.seq_fops));
+
+#ifdef SDPSTATS_ON
+	proc_net_remove(&init_net, PROC_SDP_STATS);
+#endif
+#ifdef SDP_PROFILING
+	proc_net_remove(&init_net, PROC_SDP_PERF);
+#endif
+}
+
+#else /* CONFIG_PROC_FS */
+
+/* Without CONFIG_PROC_FS there is nothing to create; always succeeds. */
+int __init sdp_proc_init(void)
+{
+	return 0;
+}
+
+/* Without CONFIG_PROC_FS there is nothing to remove. */
+void sdp_proc_unregister(void)
+{
+
+}
+#endif /* CONFIG_PROC_FS */
diff --git a/sys/ofed/drivers/infiniband/ulp/sdp/sdp_rx.c b/sys/ofed/drivers/infiniband/ulp/sdp/sdp_rx.c
new file mode 100644
index 0000000..de4b80b
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/sdp/sdp_rx.c
@@ -0,0 +1,783 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies Ltd.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "sdp.h"
+
+SDP_MODPARAM_INT(rcvbuf_initial_size, 32 * 1024,
+		"Receive buffer initial size in bytes.");
+SDP_MODPARAM_SINT(rcvbuf_scale, 0x8,
+		"Receive buffer size scale factor.");
+
+/* Like tcp_fin - called when SDP_MID_DISCONNECT is received */
+static void
+sdp_handle_disconn(struct sdp_sock *ssk)
+{
+	/* Needs the ssk write lock; it is dropped and retaken around rdma_disconnect(). */
+	sdp_dbg(ssk->socket, "%s\n", __func__);
+
+	SDP_WLOCK_ASSERT(ssk);
+	if (TCPS_HAVERCVDFIN(ssk->state) == 0)
+		socantrcvmore(ssk->socket);
+	/* Advance the TCP-style state according to the state the FIN arrived in. */
+	switch (ssk->state) {
+	case TCPS_SYN_RECEIVED:
+	case TCPS_ESTABLISHED:
+		ssk->state = TCPS_CLOSE_WAIT;
+		break;
+
+	case TCPS_FIN_WAIT_1:
+		/* Received a reply FIN - start Infiniband tear down */
+		sdp_dbg(ssk->socket,
+		    "%s: Starting Infiniband tear down sending DREQ\n",
+		    __func__);
+
+		sdp_cancel_dreq_wait_timeout(ssk);
+		ssk->qp_active = 0;
+		if (ssk->id) {
+			struct rdma_cm_id *id;
+
+			id = ssk->id;
+			SDP_WUNLOCK(ssk);
+			rdma_disconnect(id);
+			SDP_WLOCK(ssk);
+		} else {
+			sdp_warn(ssk->socket,
+			    "%s: ssk->id is NULL\n", __func__);
+			return;
+		}
+		break;
+	case TCPS_TIME_WAIT:
+		/* This is a mutual close situation and we've got the DREQ from
+		   the peer before the SDP_MID_DISCONNECT */
+		break;
+	case TCPS_CLOSED:
+		/* FIN arrived after IB teardown started - do nothing */
+		sdp_dbg(ssk->socket, "%s: fin in state %s\n",
+		    __func__, sdp_state_str(ssk->state));
+		return;
+	default:
+		sdp_warn(ssk->socket,
+		    "%s: FIN in unexpected state. state=%d\n",
+		    __func__, ssk->state);
+		break;
+	}
+}
+
+/* Allocate an mbuf chain of recv_bytes, DMA-map each mbuf and post it at the
+ * RX ring head.  Every mbuf is sized to its full capacity.  Returns 0 on
+ * success, -1 if the allocation or ib_post_recv() fails. */
+static int
+sdp_post_recv(struct sdp_sock *ssk)
+{
+	struct sdp_buf *rx_req;
+	int i, rc;
+	u64 addr;
+	struct ib_device *dev;
+	struct ib_recv_wr rx_wr = { NULL };
+	struct ib_sge ibsge[SDP_MAX_RECV_SGES];
+	struct ib_sge *sge = ibsge;
+	struct ib_recv_wr *bad_wr;
+	struct mbuf *mb, *m;
+	int id = ring_head(ssk->rx_ring);
+
+	/* Now, allocate and repost recv */
+	mb = m_getm2(NULL, ssk->recv_bytes, M_NOWAIT, MT_DATA, M_PKTHDR);
+	if (mb == NULL) {
+		/* Retry so we can't stall out with no memory. */
+		if (!rx_ring_posted(ssk))
+			queue_work(rx_comp_wq, &ssk->rx_comp_work);
+		return -1;
+	}
+	sdp_prf(ssk->socket, mb, "Posting mb");
+	for (m = mb; m != NULL; m = m->m_next) {
+		m->m_len = (m->m_flags & M_EXT) ? m->m_ext.ext_size :
+		    ((m->m_flags & M_PKTHDR) ? MHLEN : MLEN);
+		mb->m_pkthdr.len += m->m_len;
+	}
+	rx_req = ssk->rx_ring.buffer + (id & (SDP_RX_SIZE - 1));
+	rx_req->mb = mb;
+	dev = ssk->ib_device;
+	for (i = 0;  mb != NULL; i++, mb = mb->m_next, sge++) {
+		addr = ib_dma_map_single(dev, mb->m_data, mb->m_len,
+		    DMA_FROM_DEVICE);
+		/* TODO: proper error handling */
+		BUG_ON(ib_dma_mapping_error(dev, addr));
+		BUG_ON(i >= SDP_MAX_RECV_SGES);
+		rx_req->mapping[i] = addr;
+		sge->addr = addr;
+		sge->length = mb->m_len;
+		sge->lkey = ssk->sdp_dev->mr->lkey;
+	}
+
+	rx_wr.next = NULL;
+	rx_wr.wr_id = id | SDP_OP_RECV;
+	rx_wr.sg_list = ibsge;
+	rx_wr.num_sge = i;
+	rc = ib_post_recv(ssk->qp, &rx_wr, &bad_wr);
+	if (unlikely(rc)) {
+		sdp_warn(ssk->socket, "ib_post_recv failed. status %d\n", rc);
+		sdp_cleanup_sdp_buf(ssk, rx_req, DMA_FROM_DEVICE);
+		/* mb is NULL after the mapping loop; the chain head is in rx_req. */
+		m_freem(rx_req->mb);
+
+		sdp_notify(ssk, ECONNRESET);
+		return -1;
+	}
+
+	atomic_inc(&ssk->rx_ring.head);
+	SDPSTATS_COUNTER_INC(post_recv);
+
+	return 0;
+}
+
+static inline int
+sdp_post_recvs_needed(struct sdp_sock *ssk)
+{
+	unsigned long bytes_in_process;
+	unsigned long max_bytes;
+	int buffer_size;
+	int posted;
+	/* Returns non-zero when another receive buffer should be posted. */
+	if (!ssk->qp_active || !ssk->socket)
+		return 0;
+	/* Never exceed the ring size; always keep SDP_MIN_TX_CREDITS buffers posted. */
+	posted = rx_ring_posted(ssk);
+	if (posted >= SDP_RX_SIZE)
+		return 0;
+	if (posted < SDP_MIN_TX_CREDITS)
+		return 1;
+	/* NOTE(review): the cap uses so_snd.sb_hiwat although this sizes the receive side - confirm. */
+	buffer_size = ssk->recv_bytes;
+	max_bytes = max(ssk->socket->so_snd.sb_hiwat,
+	    (1 + SDP_MIN_TX_CREDITS) * buffer_size);
+	max_bytes *= rcvbuf_scale;
+	/*
+	 * Compute bytes in the receive queue and socket buffer.
+	 */
+	bytes_in_process = (posted - SDP_MIN_TX_CREDITS) * buffer_size;
+	bytes_in_process += ssk->socket->so_rcv.sb_cc;
+
+	return bytes_in_process < max_bytes;
+}
+
+static inline void
+sdp_post_recvs(struct sdp_sock *ssk)
+{
+
+	/* Post receives until no more are needed or a post fails. */
+	while (sdp_post_recvs_needed(ssk) && sdp_post_recv(ssk) == 0)
+		continue;
+}
+
+static inline struct mbuf *
+sdp_sock_queue_rcv_mb(struct socket *sk, struct mbuf *mb)
+{
+	struct sdp_sock *ssk = sdp_sk(sk);
+	struct sdp_bsdh *h;
+	/* Account mb in rcv_nxt, strip the SDP header and append it to so_rcv. */
+	h = mtod(mb, struct sdp_bsdh *);
+
+#ifdef SDP_ZCOPY
+	SDP_SKB_CB(mb)->seq = rcv_nxt(ssk);
+	if (h->mid == SDP_MID_SRCAVAIL) {
+		struct sdp_srcah *srcah = (struct sdp_srcah *)(h+1);
+		struct rx_srcavail_state *rx_sa;
+		
+		ssk->srcavail_cancel_mseq = 0;
+
+		ssk->rx_sa = rx_sa = RX_SRCAVAIL_STATE(mb) = kzalloc(
+				sizeof(struct rx_srcavail_state), M_NOWAIT);
+
+		rx_sa->mseq = ntohl(h->mseq);
+		rx_sa->used = 0;
+		rx_sa->len = mb_len = ntohl(srcah->len);
+		rx_sa->rkey = ntohl(srcah->rkey);
+		rx_sa->vaddr = be64_to_cpu(srcah->vaddr);
+		rx_sa->flags = 0;
+
+		if (ssk->tx_sa) {
+			sdp_dbg_data(ssk->socket, "got RX SrcAvail while waiting "
+					"for TX SrcAvail. waking up TX SrcAvail"
+					"to be aborted\n");
+			wake_up(sk->sk_sleep);
+		}
+
+		atomic_add(mb->len, &ssk->rcv_nxt);
+		sdp_dbg_data(sk, "queueing SrcAvail. mb_len = %d vaddr = %lld\n",
+			mb_len, rx_sa->vaddr);
+	} else
+#endif
+	{
+		atomic_add(mb->m_pkthdr.len, &ssk->rcv_nxt);
+	}
+	/* NOTE(review): h was taken before m_adj() strips the header and is read after it - confirm safe. */
+	m_adj(mb, SDP_HEAD_SIZE);
+	SOCKBUF_LOCK(&sk->so_rcv);	/* presumably released by sorwakeup_locked() - confirm */
+	if (unlikely(h->flags & SDP_OOB_PRES))
+		sdp_urg(ssk, mb);
+	sbappend_locked(&sk->so_rcv, mb);
+	sorwakeup_locked(sk);
+	return mb;
+}
+
+static int
+sdp_get_recv_bytes(struct sdp_sock *ssk, u32 new_size)
+{
+	/* Clamp the requested size to SDP_MAX_PACKET. */
+	return ((new_size) < (SDP_MAX_PACKET)) ? (new_size) : (SDP_MAX_PACKET);
+}
+
+int
+sdp_init_buffers(struct sdp_sock *ssk, u32 new_size)
+{
+	/* Set the (clamped) receive buffer size and post the initial receives; always returns 0. */
+	ssk->recv_bytes = sdp_get_recv_bytes(ssk, new_size);
+	sdp_post_recvs(ssk);
+
+	return 0;
+}
+
+int
+sdp_resize_buffers(struct sdp_sock *ssk, u32 new_size)
+{
+	u32 cur = ssk->recv_bytes;
+	u32 limit = SDP_MAX_PACKET;
+
+	/* Only growing is accepted, and never beyond SDP_MAX_PACKET. */
+	if (new_size <= cur || new_size > limit)
+		return -1;
+	ssk->recv_bytes = sdp_get_recv_bytes(ssk, new_size);
+	return 0;
+}
+
+static void
+sdp_handle_resize_request(struct sdp_sock *ssk, struct sdp_chrecvbuf *buf)
+{
+	/* Remember ring head + 1 when the resize was accepted, the ring tail
+	 * when it was refused, and flag that a request is pending. */
+	ssk->recv_request_head = sdp_resize_buffers(ssk, ntohl(buf->size)) ?
+	    ring_tail(ssk->rx_ring) : ring_head(ssk->rx_ring) + 1;
+	ssk->recv_request = 1;
+}
+
+static void
+sdp_handle_resize_ack(struct sdp_sock *ssk, struct sdp_chrecvbuf *buf)
+{
+	u32 new_size = ntohl(buf->size);
+	/* The transmit size goal only grows; a smaller or equal ack is ignored. */
+	if (new_size > ssk->xmit_size_goal)
+		ssk->xmit_size_goal = new_size;
+}
+
+static struct mbuf *
+sdp_recv_completion(struct sdp_sock *ssk, int id)
+{
+	struct sdp_buf *rx_req;
+	struct ib_device *dev;
+	struct mbuf *mb;
+	/* Retire the RX ring tail slot and return its mbuf; NULL if id is not the tail. */
+	if (unlikely(id != ring_tail(ssk->rx_ring))) {
+		printk(KERN_WARNING "Bogus recv completion id %d tail %d\n",
+			id, ring_tail(ssk->rx_ring));
+		return NULL;
+	}
+
+	dev = ssk->ib_device;
+	rx_req = &ssk->rx_ring.buffer[id & (SDP_RX_SIZE - 1)];
+	mb = rx_req->mb;
+	sdp_cleanup_sdp_buf(ssk, rx_req, DMA_FROM_DEVICE);
+	/* Advance the tail and decrement the remote_credits counter. */
+	atomic_inc(&ssk->rx_ring.tail);
+	atomic_dec(&ssk->remote_credits);
+	return mb;
+}
+
+/* Handle one queued control mbuf and free it; socket lock should be taken before calling this */
+static int
+sdp_process_rx_ctl_mb(struct sdp_sock *ssk, struct mbuf *mb)
+{
+	struct sdp_bsdh *h;
+	struct socket *sk;
+
+	SDP_WLOCK_ASSERT(ssk);
+	sk = ssk->socket;
+	h = mtod(mb, struct sdp_bsdh *);
+	switch (h->mid) {
+	case SDP_MID_DATA:
+	case SDP_MID_SRCAVAIL:
+		sdp_dbg(sk, "DATA after socket rcv was shutdown\n");
+		/* got data in RCV_SHUTDOWN */
+		if (ssk->state == TCPS_FIN_WAIT_1) {
+			sdp_dbg(sk, "RX data when state = FIN_WAIT1\n");
+			sdp_notify(ssk, ECONNRESET);
+		}
+		m_freem(mb);
+		break;
+#ifdef SDP_ZCOPY
+	case SDP_MID_RDMARDCOMPL:
+		m_freem(mb);
+		break;
+	case SDP_MID_SENDSM:
+		sdp_handle_sendsm(ssk, ntohl(h->mseq_ack));
+		m_freem(mb);
+		break;
+	case SDP_MID_SRCAVAIL_CANCEL:
+		sdp_dbg_data(sk, "Handling SrcAvailCancel\n");
+		sdp_prf(sk, NULL, "Handling SrcAvailCancel");
+		if (ssk->rx_sa) {
+			ssk->srcavail_cancel_mseq = ntohl(h->mseq);
+			ssk->rx_sa->flags |= RX_SA_ABORTED;
+			ssk->rx_sa = NULL; /* TODO: change it into SDP_MID_DATA and get 
+			                      the dirty logic from recvmsg */
+		} else {
+			sdp_dbg(sk, "Got SrcAvailCancel - "
+					"but no SrcAvail in process\n");
+		}
+		m_freem(mb);
+		break;
+	case SDP_MID_SINKAVAIL:
+		sdp_dbg_data(sk, "Got SinkAvail - not supported: ignored\n");
+		sdp_prf(sk, NULL, "Got SinkAvail - not supported: ignored");
+		/* FALLTHROUGH */
+#endif
+	case SDP_MID_ABORT:
+		sdp_dbg_data(sk, "Handling ABORT\n");
+		sdp_prf(sk, NULL, "Handling ABORT");
+		sdp_notify(ssk, ECONNRESET);
+		m_freem(mb);
+		break;
+	case SDP_MID_DISCONN:
+		sdp_dbg_data(sk, "Handling DISCONN\n");
+		sdp_prf(sk, NULL, "Handling DISCONN");
+		sdp_handle_disconn(ssk);
+		/* The handler does not take mb, so free it here like every other case. */
+		m_freem(mb);
+		break;
+	case SDP_MID_CHRCVBUF:
+		sdp_dbg_data(sk, "Handling RX CHRCVBUF\n");
+		sdp_handle_resize_request(ssk, (struct sdp_chrecvbuf *)(h+1));
+		m_freem(mb);
+		break;
+	case SDP_MID_CHRCVBUF_ACK:
+		sdp_dbg_data(sk, "Handling RX CHRCVBUF_ACK\n");
+		sdp_handle_resize_ack(ssk, (struct sdp_chrecvbuf *)(h+1));
+		m_freem(mb);
+		break;
+	default:
+		/* TODO: Handle other messages */
+		sdp_warn(sk, "SDP: FIXME MID %d\n", h->mid);
+		m_freem(mb);
+	}
+
+	return 0;
+}
+
+static int
+sdp_process_rx_mb(struct sdp_sock *ssk, struct mbuf *mb)
+{
+	struct socket *sk;
+	struct sdp_bsdh *h;
+	unsigned long mseq_ack;
+	int credits_before;
+	/* Apply the header's credit update, then route mb to so_rcv or the control queue. */
+	h = mtod(mb, struct sdp_bsdh *);
+	sk = ssk->socket;
+	/*
+	 * If another thread is in so_pcbfree this may be partially torn
+	 * down but no further synchronization is required as the destroying
+	 * thread will wait for receive to shutdown before discarding the
+	 * socket.
+	 */
+	if (sk == NULL) {
+		m_freem(mb);
+		return 0;
+	}
+
+	SDPSTATS_HIST_LINEAR(credits_before_update, tx_credits(ssk));
+	/* TX credits = acked sequence - TX ring head + 1 + buffers advertised by the peer. */
+	mseq_ack = ntohl(h->mseq_ack);
+	credits_before = tx_credits(ssk);
+	atomic_set(&ssk->tx_ring.credits, mseq_ack - ring_head(ssk->tx_ring) +
+			1 + ntohs(h->bufs));
+	if (mseq_ack >= ssk->nagle_last_unacked)
+		ssk->nagle_last_unacked = 0;
+
+	sdp_prf1(ssk->socket, mb, "RX %s +%d c:%d->%d mseq:%d ack:%d\n",
+		mid2str(h->mid), ntohs(h->bufs), credits_before,
+		tx_credits(ssk), ntohl(h->mseq), ntohl(h->mseq_ack));
+
+	if (unlikely(h->mid == SDP_MID_DATA &&
+	    mb->m_pkthdr.len == SDP_HEAD_SIZE)) {
+		/* Credit update is valid even after RCV_SHUTDOWN */
+		m_freem(mb);
+		return 0;
+	}
+	/* Non-data messages, and anything once TCPS_HAVERCVDFIN is true, are queued for sdp_do_posts(). */
+	if ((h->mid != SDP_MID_DATA && h->mid != SDP_MID_SRCAVAIL) ||
+	    TCPS_HAVERCVDFIN(ssk->state)) {
+		sdp_prf(sk, NULL, "Control mb - queing to control queue");
+#ifdef SDP_ZCOPY
+		if (h->mid == SDP_MID_SRCAVAIL_CANCEL) {
+			sdp_dbg_data(sk, "Got SrcAvailCancel. "
+					"seq: 0x%d seq_ack: 0x%d\n",
+					ntohl(h->mseq), ntohl(h->mseq_ack));
+			ssk->srcavail_cancel_mseq = ntohl(h->mseq);
+		}
+
+
+		if (h->mid == SDP_MID_RDMARDCOMPL) {
+			struct sdp_rrch *rrch = (struct sdp_rrch *)(h+1);
+			sdp_dbg_data(sk, "RdmaRdCompl message arrived\n");
+			sdp_handle_rdma_read_compl(ssk, ntohl(h->mseq_ack),
+					ntohl(rrch->len));
+		}
+#endif
+		mb->m_nextpkt = NULL;
+		if (ssk->rx_ctl_tail)
+			ssk->rx_ctl_tail->m_nextpkt = mb;
+		else
+			ssk->rx_ctl_q = mb;
+		ssk->rx_ctl_tail = mb;
+
+		return 0;
+	}
+	/* Data message: hand it to the socket receive buffer. */
+	sdp_prf1(sk, NULL, "queueing %s mb\n", mid2str(h->mid));
+	mb = sdp_sock_queue_rcv_mb(sk, mb);
+
+
+	return 0;
+}
+
+/* called only from irq */
+static struct mbuf *
+sdp_process_rx_wc(struct sdp_sock *ssk, struct ib_wc *wc)
+{
+	struct mbuf *mb;
+	struct sdp_bsdh *h;
+	struct socket *sk = ssk->socket;
+	int mseq;
+	/* Retire one RX completion; returns the trimmed mbuf, or NULL if it was dropped. */
+	mb = sdp_recv_completion(ssk, wc->wr_id);
+	if (unlikely(!mb))
+		return NULL;
+	/* A failed completion aborts the connection (once) and discards the buffer. */
+	if (unlikely(wc->status)) {
+		if (ssk->qp_active && sk) {
+			sdp_dbg(sk, "Recv completion with error. "
+					"Status %d, vendor: %d\n",
+				wc->status, wc->vendor_err);
+			sdp_abort(sk);
+			ssk->qp_active = 0;
+		}
+		m_freem(mb);
+		return NULL;
+	}
+
+	sdp_dbg_data(sk, "Recv completion. ID %d Length %d\n",
+			(int)wc->wr_id, wc->byte_len);
+	if (unlikely(wc->byte_len < sizeof(struct sdp_bsdh))) {
+		sdp_warn(sk, "SDP BUG! byte_len %d < %zd\n",
+				wc->byte_len, sizeof(struct sdp_bsdh));
+		m_freem(mb);
+		return NULL;
+	}
+	/* Use m_adj to trim the tail of data we didn't use. */
+	m_adj(mb, -(mb->m_pkthdr.len - wc->byte_len));
+	h = mtod(mb, struct sdp_bsdh *);
+
+	SDP_DUMP_PACKET(ssk->socket, "RX", mb, h);
+
+	ssk->rx_packets++;
+	ssk->rx_bytes += mb->m_pkthdr.len;
+	/* Record the peer's sequence number in mseq_ack (sdp_post_send() puts it in outgoing headers). */
+	mseq = ntohl(h->mseq);
+	atomic_set(&ssk->mseq_ack, mseq);
+	if (mseq != (int)wc->wr_id)
+		sdp_warn(sk, "SDP BUG! mseq %d != wrid %d\n",
+				mseq, (int)wc->wr_id);
+
+	return mb;
+}
+
+/* Wake any writers on the socket once enough TX credits are available. */
+static void
+sdp_bzcopy_write_space(struct sdp_sock *ssk)
+{
+	struct socket *so = ssk->socket;
+	if (tx_credits(ssk) < ssk->min_bufs || so == NULL)
+		return;
+	sowwakeup(so);
+}
+
+/* only from interrupt. */
+static int
+sdp_poll_rx_cq(struct sdp_sock *ssk)
+{
+	struct ib_cq *cq = ssk->rx_ring.cq;
+	struct ib_wc ibwc[SDP_NUM_WC];
+	int n, i;
+	int wc_processed = 0;
+	struct mbuf *mb;
+	/* Drain the RX CQ in batches of SDP_NUM_WC; returns how many completions yielded an mbuf. */
+	do {
+		n = ib_poll_cq(cq, SDP_NUM_WC, ibwc);
+		for (i = 0; i < n; ++i) {
+			struct ib_wc *wc = &ibwc[i];
+
+			BUG_ON(!(wc->wr_id & SDP_OP_RECV));
+			mb = sdp_process_rx_wc(ssk, wc);
+			if (!mb)
+				continue;
+
+			sdp_process_rx_mb(ssk, mb);
+			wc_processed++;
+		}
+	} while (n == SDP_NUM_WC);
+	/* Processing may have updated TX credits, so give writers a chance to run. */
+	if (wc_processed)
+		sdp_bzcopy_write_space(ssk);
+
+	return wc_processed;
+}
+
+static void
+sdp_rx_comp_work(struct work_struct *work)
+{
+	struct sdp_sock *ssk = container_of(work, struct sdp_sock,
+			rx_comp_work);
+	/* Work handler queued on rx_comp_wq: runs sdp_do_posts() under the ssk write lock. */
+	sdp_prf(ssk->socket, NULL, "%s", __func__);
+
+	SDP_WLOCK(ssk);
+	if (unlikely(!ssk->qp)) {
+		sdp_prf(ssk->socket, NULL, "qp was destroyed");
+		goto out;
+	}
+	if (unlikely(!ssk->rx_ring.cq)) {
+		sdp_prf(ssk->socket, NULL, "rx_ring.cq is NULL");
+		goto out;
+	}
+	/* While poll_cq is unset, only notify the CM of ESTABLISHED and skip the posts. */
+	if (unlikely(!ssk->poll_cq)) {
+		struct rdma_cm_id *id = ssk->id;
+		if (id && id->qp)
+			rdma_notify(id, RDMA_CM_EVENT_ESTABLISHED);
+		goto out;
+	}
+
+	sdp_do_posts(ssk);
+out:
+	SDP_WUNLOCK(ssk);
+}
+
+/* Drain queued control messages, then repost receives and flush pending sends
+ * and credit updates.  The caller must hold the ssk write lock. */
+void
+sdp_do_posts(struct sdp_sock *ssk)
+{
+	struct socket *sk = ssk->socket;
+	int xmit_poll_force;
+	struct mbuf *mb;
+
+	SDP_WLOCK_ASSERT(ssk);
+	if (!ssk->qp_active) {
+		sdp_dbg(sk, "QP is deactivated\n");
+		return;
+	}
+
+	while ((mb = ssk->rx_ctl_q)) {
+		ssk->rx_ctl_q = mb->m_nextpkt;
+		/* Queue drained: clear the tail so it never points at a freed mbuf. */
+		if (ssk->rx_ctl_q == NULL)
+			ssk->rx_ctl_tail = NULL;
+		mb->m_nextpkt = NULL;
+		sdp_process_rx_ctl_mb(ssk, mb);
+	}
+
+	if (ssk->state == TCPS_TIME_WAIT)
+		return;
+
+	if (!ssk->rx_ring.cq || !ssk->tx_ring.cq)
+		return;
+
+	sdp_post_recvs(ssk);
+	if (tx_ring_posted(ssk))
+		sdp_xmit_poll(ssk, 1);
+	sdp_post_sends(ssk, M_NOWAIT);
+	xmit_poll_force = tx_credits(ssk) < SDP_MIN_TX_CREDITS;
+	if (credit_update_needed(ssk) || xmit_poll_force) {
+		/* if has pending tx because run out of tx_credits - xmit it */
+		sdp_prf(sk, NULL, "Processing to free pending sends");
+		sdp_xmit_poll(ssk,  xmit_poll_force);
+		sdp_prf(sk, NULL, "Sending credit update");
+		sdp_post_sends(ssk, M_NOWAIT);
+	}
+}
+
+int
+sdp_process_rx(struct sdp_sock *ssk)
+{
+	int wc_processed = 0;
+	int credits_before;
+	/* Poll the RX CQ under the ring trylock; returns the number of completions processed. */
+	if (!rx_ring_trylock(&ssk->rx_ring)) {
+		sdp_dbg(ssk->socket, "ring destroyed. not polling it\n");
+		return 0;
+	}
+
+	credits_before = tx_credits(ssk);
+
+	wc_processed = sdp_poll_rx_cq(ssk);
+	sdp_prf(ssk->socket, NULL, "processed %d", wc_processed);
+	/* Queue sdp_rx_comp_work() to do the follow-up posting. */
+	if (wc_processed) {
+		sdp_prf(ssk->socket, NULL, "credits:  %d -> %d",
+				credits_before, tx_credits(ssk));
+		queue_work(rx_comp_wq, &ssk->rx_comp_work);
+	}
+	sdp_arm_rx_cq(ssk);
+
+	rx_ring_unlock(&ssk->rx_ring);
+
+	return (wc_processed);
+}
+
+static void
+sdp_rx_irq(struct ib_cq *cq, void *cq_context)
+{
+	struct socket *sk = cq_context;
+	struct sdp_sock *ssk = sdp_sk(sk);
+	/* RX CQ completion callback; cq_context is the socket given to ib_create_cq(). */
+	if (cq != ssk->rx_ring.cq) {
+		sdp_dbg(sk, "cq = %p, ssk->cq = %p\n", cq, ssk->rx_ring.cq);
+		return;
+	}
+
+	SDPSTATS_COUNTER_INC(rx_int_count);
+
+	sdp_prf(sk, NULL, "rx irq");
+
+	sdp_process_rx(ssk);
+}
+
+static
+void sdp_rx_ring_purge(struct sdp_sock *ssk)
+{
+	struct mbuf *mb;
+
+	/* Complete and free every posted receive buffer; stop early if a
+	 * completion yields no mbuf. */
+	while (rx_ring_posted(ssk) > 0 &&
+	    (mb = sdp_recv_completion(ssk, ring_tail(ssk->rx_ring))) != NULL)
+		m_freem(mb);
+}
+
+void
+sdp_rx_ring_init(struct sdp_sock *ssk)
+{
+	ssk->rx_ring.buffer = NULL;	/* allocated by sdp_rx_ring_create() */
+	ssk->rx_ring.destroyed = 0;
+	rw_init(&ssk->rx_ring.destroyed_lock, "sdp rx lock");
+}
+
+static void
+sdp_rx_cq_event_handler(struct ib_event *event, void *data)
+{	/* No-op: asynchronous events on the RX CQ are ignored. */
+}
+
+int
+sdp_rx_ring_create(struct sdp_sock *ssk, struct ib_device *device)
+{
+	struct ib_cq *rx_cq;
+	int rc = 0;
+	/* Allocate the RX ring and its CQ; returns 0, -ENOMEM, or the ib_create_cq() error. */
+
+	sdp_dbg(ssk->socket, "rx ring created");
+	INIT_WORK(&ssk->rx_comp_work, sdp_rx_comp_work);
+	atomic_set(&ssk->rx_ring.head, 1);
+	atomic_set(&ssk->rx_ring.tail, 1);
+
+	ssk->rx_ring.buffer = kmalloc(
+			sizeof *ssk->rx_ring.buffer * SDP_RX_SIZE, GFP_KERNEL);
+	if (!ssk->rx_ring.buffer) {
+		sdp_warn(ssk->socket,
+			"Unable to allocate RX Ring size %zd.\n",
+			 sizeof(*ssk->rx_ring.buffer) * SDP_RX_SIZE);
+
+		return -ENOMEM;
+	}
+	/* The CQ context is the socket; sdp_rx_irq() converts it back with sdp_sk(). */
+	rx_cq = ib_create_cq(device, sdp_rx_irq, sdp_rx_cq_event_handler,
+			  ssk->socket, SDP_RX_SIZE, IB_CQ_VECTOR_LEAST_ATTACHED);
+
+	if (IS_ERR(rx_cq)) {
+		rc = PTR_ERR(rx_cq);
+		sdp_warn(ssk->socket, "Unable to allocate RX CQ: %d.\n", rc);
+		goto err_cq;
+	}
+
+	sdp_sk(ssk->socket)->rx_ring.cq = rx_cq;
+	sdp_arm_rx_cq(ssk);
+
+	return 0;
+
+err_cq:
+	kfree(ssk->rx_ring.buffer);
+	ssk->rx_ring.buffer = NULL;
+	return rc;
+}
+
+void
+sdp_rx_ring_destroy(struct sdp_sock *ssk)
+{
+	/* Cancel pending RX work, take the ring's destroy lock, then free buffers and the CQ. */
+	cancel_work_sync(&ssk->rx_comp_work);
+	rx_ring_destroy_lock(&ssk->rx_ring);
+
+	if (ssk->rx_ring.buffer) {
+		sdp_rx_ring_purge(ssk);
+
+		kfree(ssk->rx_ring.buffer);
+		ssk->rx_ring.buffer = NULL;
+	}
+	/* The CQ pointer is kept if ib_destroy_cq() fails. */
+	if (ssk->rx_ring.cq) {
+		if (ib_destroy_cq(ssk->rx_ring.cq)) {
+			sdp_warn(ssk->socket, "destroy cq(%p) failed\n",
+				ssk->rx_ring.cq);
+		} else {
+			ssk->rx_ring.cq = NULL;
+		}
+	}
+
+	WARN_ON(ring_head(ssk->rx_ring) != ring_tail(ssk->rx_ring));
+}
diff --git a/sys/ofed/drivers/infiniband/ulp/sdp/sdp_tx.c b/sys/ofed/drivers/infiniband/ulp/sdp/sdp_tx.c
new file mode 100644
index 0000000..b0c37e5
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/sdp/sdp_tx.c
@@ -0,0 +1,490 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies Ltd.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "sdp.h"
+
+#define sdp_cnt(var) do { (var)++; } while (0)
+
+SDP_MODPARAM_SINT(sdp_keepalive_probes_sent, 0,
+		"Total number of keepalive probes sent.");
+
+static int sdp_process_tx_cq(struct sdp_sock *ssk);
+static void sdp_poll_tx_timeout(void *data);
+
+int
+sdp_xmit_poll(struct sdp_sock *ssk, int force)
+{
+	int wc_processed = 0;
+	/* Poll the TX CQ (moderated unless force is set); returns the completions processed. */
+	SDP_WLOCK_ASSERT(ssk);
+	sdp_prf(ssk->socket, NULL, "%s", __func__);
+
+	/* If we don't have a pending timer, set one up to catch our recent
+	   post in case the interface becomes idle */
+	if (!callout_pending(&ssk->tx_ring.timer))
+		callout_reset(&ssk->tx_ring.timer, SDP_TX_POLL_TIMEOUT,
+		    sdp_poll_tx_timeout, ssk);
+
+	/* Poll the CQ every SDP_TX_POLL_MODER packets */
+	if (force || (++ssk->tx_ring.poll_cnt & (SDP_TX_POLL_MODER - 1)) == 0)
+		wc_processed = sdp_process_tx_cq(ssk);
+
+	return wc_processed;
+}
+
+void
+sdp_post_send(struct sdp_sock *ssk, struct mbuf *mb)
+{
+	struct sdp_buf *tx_req;
+	struct sdp_bsdh *h;
+	unsigned long mseq;
+	struct ib_device *dev;
+	struct ib_send_wr *bad_wr;
+	struct ib_sge ibsge[SDP_MAX_SEND_SGES];
+	struct ib_sge *sge;
+	struct ib_send_wr tx_wr = { NULL };
+	int i, rc;
+	u64 addr;
+	/* Fill in the BSDH header, DMA-map mb and post it on the send queue; mb is
+	 * always consumed.  h is set first because the stats macros read h->mid. */
+	h = mtod(mb, struct sdp_bsdh *);
+	SDPSTATS_COUNTER_MID_INC(post_send, h->mid);
+	SDPSTATS_HIST(send_size, mb->m_pkthdr.len);
+
+	if (!ssk->qp_active) {
+		m_freem(mb);
+		return;
+	}
+
+	mseq = ring_head(ssk->tx_ring);
+	ssk->tx_packets++;
+	ssk->tx_bytes += mb->m_pkthdr.len;
+
+#ifdef SDP_ZCOPY
+	if (unlikely(h->mid == SDP_MID_SRCAVAIL)) {
+		struct tx_srcavail_state *tx_sa = TX_SRCAVAIL_STATE(mb);
+		if (ssk->tx_sa != tx_sa) {
+			sdp_dbg_data(ssk->socket, "SrcAvail cancelled "
+					"before being sent!\n");
+			WARN_ON(1);
+			m_freem(mb);
+			return;
+		}
+		TX_SRCAVAIL_STATE(mb)->mseq = mseq;
+	}
+#endif
+
+	if (unlikely(mb->m_flags & M_URG))
+		h->flags = SDP_OOB_PRES | SDP_OOB_PEND;
+	else
+		h->flags = 0;
+
+	mb->m_flags |= M_RDONLY; /* Don't allow compression once sent. */
+	h->bufs = htons(rx_ring_posted(ssk));
+	h->len = htonl(mb->m_pkthdr.len);
+	h->mseq = htonl(mseq);
+	h->mseq_ack = htonl(mseq_ack(ssk));
+
+	sdp_prf1(ssk->socket, mb, "TX: %s bufs: %d mseq:%ld ack:%d",
+			mid2str(h->mid), rx_ring_posted(ssk), mseq,
+			ntohl(h->mseq_ack));
+	SDP_DUMP_PACKET(ssk->socket, "TX", mb, h);
+
+	tx_req = &ssk->tx_ring.buffer[mseq & (SDP_TX_SIZE - 1)];
+	tx_req->mb = mb;
+	dev = ssk->ib_device;
+	sge = &ibsge[0];
+	for (i = 0;  mb != NULL; i++, mb = mb->m_next, sge++) {
+		addr = ib_dma_map_single(dev, mb->m_data, mb->m_len,
+		    DMA_TO_DEVICE);
+		/* TODO: proper error handling */
+		BUG_ON(ib_dma_mapping_error(dev, addr));
+		BUG_ON(i >= SDP_MAX_SEND_SGES);
+		tx_req->mapping[i] = addr;
+		sge->addr = addr;
+		sge->length = mb->m_len;
+		sge->lkey = ssk->sdp_dev->mr->lkey;
+	}
+	tx_wr.next = NULL;
+	tx_wr.wr_id = mseq | SDP_OP_SEND;
+	tx_wr.sg_list = ibsge;
+	tx_wr.num_sge = i;
+	tx_wr.opcode = IB_WR_SEND;
+	tx_wr.send_flags = IB_SEND_SIGNALED;
+	if (unlikely(tx_req->mb->m_flags & M_URG))
+		tx_wr.send_flags |= IB_SEND_SOLICITED;
+
+	rc = ib_post_send(ssk->qp, &tx_wr, &bad_wr);
+	if (unlikely(rc)) {
+		sdp_dbg(ssk->socket,
+				"ib_post_send failed with status %d.\n", rc);
+
+		sdp_cleanup_sdp_buf(ssk, tx_req, DMA_TO_DEVICE);
+
+		sdp_notify(ssk, ECONNRESET);
+		m_freem(tx_req->mb);
+		return;
+	}
+
+	atomic_inc(&ssk->tx_ring.head);
+	atomic_dec(&ssk->tx_ring.credits);
+	atomic_set(&ssk->remote_credits, rx_ring_posted(ssk));
+
+	return;
+}
+
+static struct mbuf *
+sdp_send_completion(struct sdp_sock *ssk, int mseq)
+{
+	struct ib_device *dev;
+	struct sdp_buf *tx_req;
+	struct mbuf *mb = NULL;
+	struct sdp_tx_ring *tx_ring = &ssk->tx_ring;
+	/* Retire the TX ring tail slot and return its mbuf; NULL if mseq is not the tail. */
+	if (unlikely(mseq != ring_tail(*tx_ring))) {
+		printk(KERN_WARNING "Bogus send completion id %d tail %d\n",
+			mseq, ring_tail(*tx_ring));
+		goto out;
+	}
+
+	dev = ssk->ib_device;
+	tx_req = &tx_ring->buffer[mseq & (SDP_TX_SIZE - 1)];
+	mb = tx_req->mb;
+	sdp_cleanup_sdp_buf(ssk, tx_req, DMA_TO_DEVICE);
+
+#ifdef SDP_ZCOPY
+	/* TODO: AIO and real zcopy code; add their context support here */
+	if (BZCOPY_STATE(mb))
+		BZCOPY_STATE(mb)->busy--;
+#endif
+
+	atomic_inc(&tx_ring->tail);
+
+out:
+	return mb;
+}
+
+static int
+sdp_handle_send_comp(struct sdp_sock *ssk, struct ib_wc *wc)
+{
+	struct mbuf *mb = NULL;
+	struct sdp_bsdh *h;
+	/* Handle one send completion: report real errors, then free the sent mbuf.  Returns 0 or -1. */
+	if (unlikely(wc->status)) {
+		if (wc->status != IB_WC_WR_FLUSH_ERR) {
+			sdp_prf(ssk->socket, mb, "Send completion with error. "
+				"Status %d", wc->status);
+			sdp_dbg_data(ssk->socket, "Send completion with error. "
+				"Status %d\n", wc->status);
+			sdp_notify(ssk, ECONNRESET);
+		}
+	}
+	/* The ring slot is retired even when the completion carried an error. */
+	mb = sdp_send_completion(ssk, wc->wr_id);
+	if (unlikely(!mb))
+		return -1;
+
+	h = mtod(mb, struct sdp_bsdh *);
+	sdp_prf1(ssk->socket, mb, "tx completion. mseq:%d", ntohl(h->mseq));
+	sdp_dbg(ssk->socket, "tx completion. %p %d mseq:%d",
+	    mb, mb->m_pkthdr.len, ntohl(h->mseq));
+	m_freem(mb);
+
+	return 0;
+}
+
+static inline void
+sdp_process_tx_wc(struct sdp_sock *ssk, struct ib_wc *wc)
+{
+	/* Dispatch one TX completion by the operation bits carried in wr_id. */
+	if (likely(wc->wr_id & SDP_OP_SEND)) {
+		sdp_handle_send_comp(ssk, wc);
+		return;
+	}
+
+#ifdef SDP_ZCOPY
+	if (wc->wr_id & SDP_OP_RDMA) {
+		/* TODO: handle failed RDMA read cqe */
+
+		sdp_dbg_data(ssk->socket,
+	 	    "TX comp: RDMA read. status: %d\n", wc->status);
+		sdp_prf1(sk, NULL, "TX comp: RDMA read");
+
+		if (!ssk->tx_ring.rdma_inflight) {
+			sdp_warn(ssk->socket, "ERROR: unexpected RDMA read\n");
+			return;
+		}
+
+		if (!ssk->tx_ring.rdma_inflight->busy) {
+			sdp_warn(ssk->socket,
+			    "ERROR: too many RDMA read completions\n");
+			return;
+		}
+
+		/* Only last RDMA read WR is signalled. Order is guaranteed -
+		 * therefore if Last RDMA read WR is completed - all other
+		 * have, too */
+		ssk->tx_ring.rdma_inflight->busy = 0;
+		sowwakeup(ssk->socket);
+		sdp_dbg_data(ssk->socket, "woke up sleepers\n");
+		return;
+	}
+#endif
+	/* Anything else is a keepalive probe (sdp_post_keepalive() posts with wr_id 0). */
+	/* Keepalive probe sent cleanup */
+	sdp_cnt(sdp_keepalive_probes_sent);
+
+	if (likely(!wc->status))
+		return;
+
+	sdp_dbg(ssk->socket, " %s consumes KEEPALIVE status %d\n",
+			__func__, wc->status);
+
+	if (wc->status == IB_WC_WR_FLUSH_ERR)
+		return;
+
+	sdp_notify(ssk, ECONNRESET);
+}
+
+static int
+sdp_process_tx_cq(struct sdp_sock *ssk)
+{
+	struct ib_wc ibwc[SDP_NUM_WC];
+	int n, i;
+	int wc_processed = 0;
+	/* Drain the TX CQ under the ssk write lock; returns the completions processed. */
+	SDP_WLOCK_ASSERT(ssk);
+
+	if (!ssk->tx_ring.cq) {
+		sdp_dbg(ssk->socket, "tx irq on destroyed tx_cq\n");
+		return 0;
+	}
+
+	do {
+		n = ib_poll_cq(ssk->tx_ring.cq, SDP_NUM_WC, ibwc);
+		for (i = 0; i < n; ++i) {
+			sdp_process_tx_wc(ssk, ibwc + i);
+			wc_processed++;
+		}
+	} while (n == SDP_NUM_WC);
+	/* Ring slots were freed: push queued sends and wake blocked writers. */
+	if (wc_processed) {
+		sdp_post_sends(ssk, M_DONTWAIT);
+		sdp_prf1(ssk->socket, NULL, "Waking sendmsg. inflight=%d",
+				(u32) tx_ring_posted(ssk));
+		sowwakeup(ssk->socket);
+	}
+
+	return wc_processed;
+}
+
+static void
+sdp_poll_tx(struct sdp_sock *ssk)
+{
+	struct socket *sk = ssk->socket;
+	u32 inflight, wc_processed;
+	/* Poll the TX CQ and re-arm the poll timer while sends remain in flight. */
+	sdp_prf1(ssk->socket, NULL, "TX timeout: inflight=%d, head=%d tail=%d", 
+		(u32) tx_ring_posted(ssk),
+		ring_head(ssk->tx_ring), ring_tail(ssk->tx_ring));
+
+	if (unlikely(ssk->state == TCPS_CLOSED)) {
+		sdp_warn(sk, "Socket is closed\n");
+		goto out;
+	}
+
+	wc_processed = sdp_process_tx_cq(ssk);
+	if (!wc_processed)
+		SDPSTATS_COUNTER_INC(tx_poll_miss);
+	else
+		SDPSTATS_COUNTER_INC(tx_poll_hit);
+
+	inflight = (u32) tx_ring_posted(ssk);
+	sdp_prf1(ssk->socket, NULL, "finished tx proccessing. inflight = %d",
+	    inflight);
+
+	/* If there are still packets in flight and the timer has not already
+	 * been scheduled by the Tx routine then schedule it here to guarantee
+	 * completion processing of these packets */
+	if (inflight)
+		callout_reset(&ssk->tx_ring.timer, SDP_TX_POLL_TIMEOUT,
+		    sdp_poll_tx_timeout, ssk);
+out:
+#ifdef SDP_ZCOPY
+	if (ssk->tx_ring.rdma_inflight && ssk->tx_ring.rdma_inflight->busy) {
+		sdp_prf1(sk, NULL, "RDMA is inflight - arming irq");
+		sdp_arm_tx_cq(ssk);
+	}
+#endif
+	return;
+}
+
+static void
+sdp_poll_tx_timeout(void *data)
+{
+	struct sdp_sock *ssk = (struct sdp_sock *)data;
+	/* Callout handler for tx_ring.timer: do nothing if the callout is no longer active. */
+	if (!callout_active(&ssk->tx_ring.timer))
+		return;
+	callout_deactivate(&ssk->tx_ring.timer);
+	sdp_poll_tx(ssk);
+}
+
+static void
+sdp_tx_irq(struct ib_cq *cq, void *cq_context)
+{
+	struct sdp_sock *ssk;
+	/* TX CQ completion callback; cq_context is the ssk given to ib_create_cq(). */
+	ssk = cq_context;
+	sdp_prf1(ssk->socket, NULL, "tx irq");
+	sdp_dbg_data(ssk->socket, "Got tx comp interrupt\n");
+	SDPSTATS_COUNTER_INC(tx_int_count);
+	SDP_WLOCK(ssk);
+	sdp_poll_tx(ssk);
+	SDP_WUNLOCK(ssk);
+}
+
+static
+void sdp_tx_ring_purge(struct sdp_sock *ssk)
+{
+	struct mbuf *mb;
+
+	/* Complete and free every posted send buffer; stop early if a
+	 * completion yields no mbuf. */
+	while (tx_ring_posted(ssk) &&
+	    (mb = sdp_send_completion(ssk, ring_tail(ssk->tx_ring))) != NULL)
+		m_freem(mb);
+}
+
+void
+sdp_post_keepalive(struct sdp_sock *ssk)
+{
+	int rc;
+	struct ib_send_wr wr, *bad_wr;
+	/* Post a keepalive probe: an RDMA write with no SGEs and wr_id 0. */
+	sdp_dbg(ssk->socket, "%s\n", __func__);
+
+	memset(&wr, 0, sizeof(wr));
+
+	wr.next    = NULL;
+	wr.wr_id   = 0;
+	wr.sg_list = NULL;
+	wr.num_sge = 0;
+	wr.opcode  = IB_WR_RDMA_WRITE;
+
+	rc = ib_post_send(ssk->qp, &wr, &bad_wr);
+	if (rc) {
+		sdp_dbg(ssk->socket,
+			"ib_post_keepalive failed with status %d.\n", rc);
+		sdp_notify(ssk, ECONNRESET);
+	}
+	/* The counter is bumped whether or not the post succeeded. */
+	sdp_cnt(sdp_keepalive_probes_sent);
+}
+
+static void
+sdp_tx_cq_event_handler(struct ib_event *event, void *data)
+{	/* No-op: asynchronous events on the TX CQ are ignored. */
+}
+
+int
+sdp_tx_ring_create(struct sdp_sock *ssk, struct ib_device *device)
+{
+	struct ib_cq *tx_cq;
+	int rc = 0;
+	/* Allocate the TX ring, its timers and CQ; returns 0, -ENOMEM, or the ib_create_cq() error. */
+	sdp_dbg(ssk->socket, "tx ring create\n");
+	callout_init_rw(&ssk->tx_ring.timer, &ssk->lock, 0);
+	callout_init_rw(&ssk->nagle_timer, &ssk->lock, 0);
+	atomic_set(&ssk->tx_ring.head, 1);
+	atomic_set(&ssk->tx_ring.tail, 1);
+
+	ssk->tx_ring.buffer = kzalloc(
+			sizeof *ssk->tx_ring.buffer * SDP_TX_SIZE, GFP_KERNEL);
+	if (!ssk->tx_ring.buffer) {
+		rc = -ENOMEM;
+		sdp_warn(ssk->socket, "Can't allocate TX Ring size %zd.\n",
+			 sizeof(*ssk->tx_ring.buffer) * SDP_TX_SIZE);
+
+		goto out;
+	}
+	/* The CQ context is the ssk itself; sdp_tx_irq() uses it directly. */
+	tx_cq = ib_create_cq(device, sdp_tx_irq, sdp_tx_cq_event_handler,
+			  ssk, SDP_TX_SIZE, IB_CQ_VECTOR_LEAST_ATTACHED);
+
+	if (IS_ERR(tx_cq)) {
+		rc = PTR_ERR(tx_cq);
+		sdp_warn(ssk->socket, "Unable to allocate TX CQ: %d.\n", rc);
+		goto err_cq;
+	}
+	ssk->tx_ring.cq = tx_cq;
+	ssk->tx_ring.poll_cnt = 0;
+	sdp_arm_tx_cq(ssk);
+
+	return 0;
+
+err_cq:
+	kfree(ssk->tx_ring.buffer);
+	ssk->tx_ring.buffer = NULL;
+out:
+	return rc;
+}
+
+void
+sdp_tx_ring_destroy(struct sdp_sock *ssk)
+{
+	/* Stop the timers under the lock, drain them without it, then free buffers and the CQ. */
+	sdp_dbg(ssk->socket, "tx ring destroy\n");
+	SDP_WLOCK(ssk);
+	callout_stop(&ssk->tx_ring.timer);
+	callout_stop(&ssk->nagle_timer);
+	SDP_WUNLOCK(ssk);
+	callout_drain(&ssk->tx_ring.timer);
+	callout_drain(&ssk->nagle_timer);
+
+	if (ssk->tx_ring.buffer) {
+		sdp_tx_ring_purge(ssk);
+
+		kfree(ssk->tx_ring.buffer);
+		ssk->tx_ring.buffer = NULL;
+	}
+	/* The CQ pointer is kept if ib_destroy_cq() fails. */
+	if (ssk->tx_ring.cq) {
+		if (ib_destroy_cq(ssk->tx_ring.cq)) {
+			sdp_warn(ssk->socket, "destroy cq(%p) failed\n",
+					ssk->tx_ring.cq);
+		} else {
+			ssk->tx_ring.cq = NULL;
+		}
+	}
+
+	WARN_ON(ring_head(ssk->tx_ring) != ring_tail(ssk->tx_ring));
+}
diff --git a/sys/ofed/drivers/infiniband/ulp/sdp/sdp_zcopy.c b/sys/ofed/drivers/infiniband/ulp/sdp/sdp_zcopy.c
new file mode 100644
index 0000000..0425f8e
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/sdp/sdp_zcopy.c
@@ -0,0 +1,804 @@
+/*
+ * Copyright (c) 2006 Mellanox Technologies Ltd.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/tcp.h>
+#include <asm/ioctls.h>
+#include <linux/workqueue.h>
+#include <linux/net.h>
+#include <linux/socket.h>
+#include <net/protocol.h>
+#include <net/inet_common.h>
+#include <rdma/rdma_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_fmr_pool.h>
+#include <rdma/ib_umem.h> 
+#include <net/tcp.h> /* for memcpy_toiovec */
+#include <asm/io.h>
+#include <asm/uaccess.h>
+#include <linux/delay.h>
+#include "sdp.h"
+
+/*
+ * Build a SrcAvail message advertising the registered user buffer described
+ * by tx_sa and queue it on the socket with mb_entail().
+ *
+ * The start of the buffer is attached to the message as inline payload; its
+ * length is limited to the remainder of the first page and to
+ * xmit_size_goal minus the SrcAvail header size.
+ *
+ * Returns 0 on success (ssk->tx_sa then points at tx_sa), or -ENOMEM when
+ * the message mbuf cannot be allocated.
+ */
+static int sdp_post_srcavail(struct socket *sk, struct tx_srcavail_state *tx_sa)
+{
+	struct sdp_sock *ssk = sdp_sk(sk);
+	struct mbuf *mb;
+	int payload_len;
+	struct page *payload_pg;
+	int off, len;
+	struct ib_umem_chunk *chunk;
+
+	/* Only one SrcAvail may be outstanding per socket. */
+	WARN_ON(ssk->tx_sa);
+
+	/* The caller must hand in a fully registered buffer. */
+	BUG_ON(!tx_sa);
+	BUG_ON(!tx_sa->fmr || !tx_sa->fmr->fmr->lkey);
+	BUG_ON(!tx_sa->umem);
+	BUG_ON(!tx_sa->umem->chunk_list.next);
+
+	chunk = list_entry(tx_sa->umem->chunk_list.next, struct ib_umem_chunk, list);
+	BUG_ON(!chunk->nmap);
+
+	off = tx_sa->umem->offset;
+	len = tx_sa->umem->length;
+
+	tx_sa->bytes_sent = tx_sa->bytes_acked = 0;
+
+	mb = sdp_alloc_mb_srcavail(sk, len, tx_sa->fmr->fmr->lkey, off, 0);
+	if (!mb) {
+		return -ENOMEM;
+	}
+	sdp_dbg_data(sk, "sending SrcAvail\n");
+		
+	TX_SRCAVAIL_STATE(mb) = tx_sa; /* tx_sa hangs off the mb but
+					 * continues to live after mb is freed */
+	ssk->tx_sa = tx_sa;
+
+	/* must have payload inlined in SrcAvail packet in combined mode */
+	payload_len = MIN(tx_sa->umem->page_size - off, len);
+	payload_len = MIN(payload_len, ssk->xmit_size_goal - sizeof(struct sdp_srcah));
+	payload_pg  = sg_page(&chunk->page_list[0]);
+	/* Take a page reference for the fragment attached below. */
+	get_page(payload_pg);
+
+	sdp_dbg_data(sk, "payload: off: 0x%x, pg: %p, len: 0x%x\n",
+		off, payload_pg, payload_len);
+
+	mb_fill_page_desc(mb, mb_shinfo(mb)->nr_frags,
+			payload_pg, off, payload_len);
+
+	/* Account for the inline payload in the mbuf's length fields. */
+	mb->len             += payload_len;
+	mb->data_len         = payload_len;
+	mb->truesize        += payload_len;
+//	sk->sk_wmem_queued   += payload_len;
+//	sk->sk_forward_alloc -= payload_len;
+
+	mb_entail(sk, ssk, mb);
+	
+	ssk->write_seq += payload_len;
+	SDP_SKB_CB(mb)->end_seq += payload_len;
+
+	/*
+	 * The whole buffer is advertised as sent; the inline payload is
+	 * counted as acknowledged right away.
+	 */
+	tx_sa->bytes_sent = tx_sa->umem->length;
+	tx_sa->bytes_acked = payload_len;
+
+	/* TODO: pushing the mb into the tx_queue should be enough */
+
+	return 0;
+}
+
+/*
+ * Queue and post a SrcAvailCancel message for the outstanding SrcAvail and
+ * arm the delayed work that fires if the peer does not answer within
+ * SDP_SRCAVAIL_CANCEL_TIMEOUT.
+ *
+ * Returns 0 on success, or -ENOMEM when the message mbuf cannot be
+ * allocated; in that case nothing is queued and the timeout work is not
+ * scheduled.
+ */
+static int sdp_post_srcavail_cancel(struct socket *sk)
+{
+	struct sdp_sock *ssk = sdp_sk(sk);
+	struct mbuf *mb;
+
+	sdp_dbg_data(ssk->socket, "Posting srcavail cancel\n");
+
+	mb = sdp_alloc_mb_srcavail_cancel(sk, 0);
+	if (!mb) {
+		/* Same handling as sdp_post_srcavail(): never queue a NULL mb. */
+		sdp_warn(sk, "Error allocating SrcAvailCancel\n");
+		return -ENOMEM;
+	}
+	mb_entail(sk, ssk, mb);
+
+	sdp_post_sends(ssk, 0);
+
+	schedule_delayed_work(&ssk->srcavail_cancel_work,
+			SDP_SRCAVAIL_CANCEL_TIMEOUT);
+
+	return 0;
+}
+
+/*
+ * Delayed-work handler that runs when neither the SrcAvail nor the
+ * SrcAvailCancel was answered in time: marks the socket with -ECONNRESET
+ * and wakes waiters on ssk->wq, all under the socket lock.
+ */
+void srcavail_cancel_timeout(struct work_struct *work)
+{
+	struct sdp_sock *ssk;
+	struct socket *sk;
+
+	ssk = container_of(work, struct sdp_sock, srcavail_cancel_work.work);
+	sk = ssk->socket;
+
+	lock_sock(sk);
+	sdp_dbg_data(sk, "both SrcAvail and SrcAvailCancel timedout."
+			" closing connection\n");
+	sdp_set_error(sk, -ECONNRESET);
+	wake_up(&ssk->wq);
+	release_sock(sk);
+}
+
+/*
+ * Sleep until the outstanding SrcAvail (ssk->tx_sa) is fully acknowledged,
+ * i.e. bytes_acked == bytes_sent, or until one of the abort conditions
+ * below is met.  The reason for an abort is recorded in tx_sa->abort_flags.
+ *
+ * timeo_p:        remaining timeout, updated on every wake-up.
+ * ignore_signals: when non-zero, pending signals and a crossing SrcAvail
+ *                 from the peer do not interrupt the wait.
+ *
+ * Returns 0 when fully acknowledged, otherwise a negative errno:
+ *   -ETIME   timeout expired, or a crossing SrcAvail was detected
+ *   -EINVAL  more bytes acked than sent, or the QP is no longer active
+ *   -EAGAIN  the peer answered with SendSM
+ *   -EINTR   a signal is pending
+ */
+static int sdp_wait_rdmardcompl(struct sdp_sock *ssk, long *timeo_p,
+		int ignore_signals)
+{
+	struct socket *sk = ssk->socket;
+	int err = 0;
+	long vm_wait = 0;
+	long current_timeo = *timeo_p;
+	struct tx_srcavail_state *tx_sa = ssk->tx_sa;
+	DEFINE_WAIT(wait);
+
+	sdp_dbg_data(sk, "sleep till RdmaRdCompl. timeo = %ld.\n", *timeo_p);
+	sdp_prf1(sk, NULL, "Going to sleep");
+	while (ssk->qp_active) {
+		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+
+		if (unlikely(!*timeo_p)) {
+			err = -ETIME;
+			tx_sa->abort_flags |= TX_SA_TIMEDOUT;
+			sdp_prf1(sk, NULL, "timeout");
+			SDPSTATS_COUNTER_INC(zcopy_tx_timeout);
+			break;
+		}
+
+		else if (tx_sa->bytes_acked > tx_sa->bytes_sent) {
+			err = -EINVAL;
+			sdp_dbg_data(sk, "acked bytes > sent bytes\n");
+			tx_sa->abort_flags |= TX_SA_ERROR;
+			break;
+		}
+
+		/* Set by sdp_handle_sendsm() when the peer sends SendSM. */
+		if (tx_sa->abort_flags & TX_SA_SENDSM) {
+			sdp_prf1(sk, NULL, "Aborting SrcAvail sending");
+			SDPSTATS_COUNTER_INC(zcopy_tx_aborted);
+			err = -EAGAIN;
+			break ;
+		}
+
+		if (!ignore_signals) {
+			if (signal_pending(current)) {
+				err = -EINTR;
+				sdp_prf1(sk, NULL, "signalled");
+				tx_sa->abort_flags |= TX_SA_INTRRUPTED;
+				break;
+			}
+
+			/* Peer advertised its own SrcAvail while ours is pending. */
+			if (ssk->rx_sa && (tx_sa->bytes_acked < tx_sa->bytes_sent)) {
+				sdp_dbg_data(sk, "Crossing SrcAvail - aborting this\n");
+				tx_sa->abort_flags |= TX_SA_CROSS_SEND;
+				SDPSTATS_COUNTER_INC(zcopy_cross_send);
+				err = -ETIME;
+				break ;
+			}
+		}
+
+		/* Drop the posts-handler reference for the duration of the sleep. */
+		posts_handler_put(ssk);
+
+		/*
+		 * NOTE(review): vm_wait is initialised to 0 and never set to a
+		 * non-zero value in this function, so this wake-up condition
+		 * always evaluates to false and the "if (vm_wait)" block below
+		 * never runs -- confirm whether that is intended.
+		 */
+		sk_wait_event(sk, &current_timeo,
+				tx_sa->abort_flags &&
+				ssk->rx_sa &&
+				(tx_sa->bytes_acked < tx_sa->bytes_sent) && 
+				vm_wait);
+		sdp_dbg_data(ssk->socket, "woke up sleepers\n");
+
+		posts_handler_get(ssk);
+
+		/* Everything acknowledged: done, err stays 0. */
+		if (tx_sa->bytes_acked == tx_sa->bytes_sent)
+			break;
+
+		if (vm_wait) {
+			vm_wait -= current_timeo;
+			current_timeo = *timeo_p;
+			if (current_timeo != MAX_SCHEDULE_TIMEOUT &&
+			    (current_timeo -= vm_wait) < 0)
+				current_timeo = 0;
+			vm_wait = 0;
+		}
+		/* Publish the remaining time back to the caller. */
+		*timeo_p = current_timeo;
+	}
+
+	finish_wait(sk->sk_sleep, &wait);
+
+	sdp_dbg_data(sk, "Finished waiting - RdmaRdCompl: %d/%d bytes, flags: 0x%x\n",
+			tx_sa->bytes_acked, tx_sa->bytes_sent, tx_sa->abort_flags);
+
+	/* An inactive QP overrides whatever result the loop produced. */
+	if (!ssk->qp_active) {
+		sdp_dbg(sk, "QP destroyed while waiting\n");
+		return -EINVAL;
+	}
+	return err;
+}
+
+/*
+ * Sleep uninterruptibly until the in-flight RDMA read tracked by
+ * ssk->tx_ring.rdma_inflight is no longer marked busy, the QP becomes
+ * inactive, or a 5 second timeout expires (which triggers a warning).
+ * The function returns nothing; callers must inspect ssk->qp_active
+ * themselves to tell the outcomes apart.
+ */
+static void sdp_wait_rdma_wr_finished(struct sdp_sock *ssk)
+{
+	struct socket *sk = ssk->socket;
+	long timeo = HZ * 5; /* Timeout for RDMA read */
+	DEFINE_WAIT(wait);
+
+	sdp_dbg_data(sk, "Sleep till RDMA wr finished.\n");
+	while (1) {
+		prepare_to_wait(sk->sk_sleep, &wait, TASK_UNINTERRUPTIBLE);
+
+		if (!ssk->tx_ring.rdma_inflight->busy) {
+			sdp_dbg_data(sk, "got rdma cqe\n");
+			break;
+		}
+
+		if (!ssk->qp_active) {
+			sdp_dbg_data(sk, "QP destroyed\n");
+			break;
+		}
+
+		if (!timeo) {
+			sdp_warn(sk, "Panic: Timed out waiting for RDMA read\n");
+			WARN_ON(1);
+			break;
+		}
+
+		/* Drop the posts-handler reference for the duration of the sleep. */
+		posts_handler_put(ssk);
+
+		sdp_prf1(sk, NULL, "Going to sleep");
+		sk_wait_event(sk, &timeo, 
+			!ssk->tx_ring.rdma_inflight->busy);
+		sdp_prf1(sk, NULL, "Woke up");
+		sdp_dbg_data(ssk->socket, "woke up sleepers\n");
+
+		posts_handler_get(ssk);
+	}
+
+	finish_wait(sk->sk_sleep, &wait);
+
+	sdp_dbg_data(sk, "Finished waiting\n");
+}
+
+/*
+ * Report to the peer how many bytes of its SrcAvail have been consumed
+ * since the last report (rx_sa->used - rx_sa->reported) by posting an
+ * RdmaRdCompl message.
+ *
+ * Returns 0 when there is nothing new to report or the message was posted,
+ * or -ENOMEM when the message mbuf cannot be allocated.  On allocation
+ * failure rx_sa->reported is left untouched so the bytes can be reported
+ * by a later call.
+ */
+int sdp_post_rdma_rd_compl(struct sdp_sock *ssk,
+		struct rx_srcavail_state *rx_sa)
+{
+	struct mbuf *mb;
+	int copied;
+
+	if (rx_sa->used <= rx_sa->reported)
+		return 0;
+
+	copied = rx_sa->used - rx_sa->reported;
+
+	mb = sdp_alloc_mb_rdmardcompl(ssk->socket, copied, 0);
+	if (!mb)
+		return -ENOMEM;
+
+	rx_sa->reported += copied;
+
+	/* TODO: What if no tx_credits available? */
+	sdp_post_send(ssk, mb);
+
+	return 0;
+}
+
+/*
+ * Post a SendSM message on the socket.
+ *
+ * Returns 0 on success, or -ENOMEM when the message mbuf cannot be
+ * allocated (nothing is posted in that case).
+ */
+int sdp_post_sendsm(struct socket *sk)
+{
+	struct mbuf *mb = sdp_alloc_mb_sendsm(sk, 0);
+
+	if (!mb)
+		return -ENOMEM;
+
+	sdp_post_send(sdp_sk(sk), mb);
+
+	return 0;
+}
+
+/*
+ * Advance an iovec array past len consumed bytes: each entry's base is
+ * moved forward and its length reduced, spilling over into the following
+ * entries until len bytes have been accounted for.  Always returns 0.
+ *
+ * The array length is not known here; the walk stops only once len bytes
+ * have been consumed.
+ */
+static int sdp_update_iov_used(struct socket *sk, struct iovec *iov, int len)
+{
+	struct iovec *cur;
+	int remaining = len;
+
+	sdp_dbg_data(sk, "updating consumed 0x%x bytes from iov\n", len);
+
+	for (cur = iov; remaining > 0; cur++) {
+		int chunk;
+
+		/* Empty entries are skipped. */
+		if (!cur->iov_len)
+			continue;
+
+		chunk = min_t(unsigned int, cur->iov_len, remaining);
+		remaining -= chunk;
+		cur->iov_len -= chunk;
+		cur->iov_base += chunk;
+	}
+
+	return 0;
+}
+
+/* Sum the length fields of the first sge_cnt entries of an SGE array. */
+static inline int sge_bytes(struct ib_sge *sge, int sge_cnt)
+{
+	int total = 0;
+	int i;
+
+	for (i = 0; i < sge_cnt; i++)
+		total += sge[i].length;
+
+	return total;
+}
+/*
+ * Handle an incoming SendSM.  If it refers to the SrcAvail currently
+ * outstanding (tx_sa present and its mseq not newer than mseq_ack), flag
+ * the SrcAvail with TX_SA_SENDSM, cancel the pending SrcAvailCancel
+ * timeout and wake the sleeping sender.  Otherwise the message is only
+ * logged.  Runs entirely under tx_sa_lock.
+ */
+void sdp_handle_sendsm(struct sdp_sock *ssk, u32 mseq_ack)
+{
+	struct socket *sk = ssk->socket;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ssk->tx_sa_lock, flags);
+
+	if (!ssk->tx_sa) {
+		sdp_prf1(sk, NULL, "SendSM for cancelled/finished SrcAvail");
+	} else if (ssk->tx_sa->mseq > mseq_ack) {
+		sdp_dbg_data(sk, "SendSM arrived for old SrcAvail. "
+			"SendSM mseq_ack: 0x%x, SrcAvail mseq: 0x%x\n",
+			mseq_ack, ssk->tx_sa->mseq);
+	} else {
+		sdp_dbg_data(sk, "Got SendSM - aborting SrcAvail\n");
+
+		ssk->tx_sa->abort_flags |= TX_SA_SENDSM;
+		cancel_delayed_work(&ssk->srcavail_cancel_work);
+
+		wake_up(sk->sk_sleep);
+		sdp_dbg_data(sk, "woke up sleepers\n");
+	}
+
+	spin_unlock_irqrestore(&ssk->tx_sa_lock, flags);
+}
+
+/*
+ * Handle an incoming RdmaRdCompl: credit bytes_completed to the SrcAvail
+ * currently outstanding and wake the sleeping sender.  The message is
+ * ignored (and only logged) when no SrcAvail is outstanding or when the
+ * outstanding one is newer than mseq_ack.  The tx_sa update runs under
+ * tx_sa_lock.
+ */
+void sdp_handle_rdma_read_compl(struct sdp_sock *ssk, u32 mseq_ack,
+		u32 bytes_completed)
+{
+	struct socket *sk;
+	unsigned long flags;
+
+	/* Must precede the first dereference of ssk to be of any use. */
+	BUG_ON(!ssk);
+
+	sk = ssk->socket;
+
+	sdp_prf1(sk, NULL, "RdmaRdCompl ssk=%p tx_sa=%p", ssk, ssk->tx_sa);
+	sdp_dbg_data(sk, "RdmaRdCompl ssk=%p tx_sa=%p\n", ssk, ssk->tx_sa);
+
+	spin_lock_irqsave(&ssk->tx_sa_lock, flags);
+
+	if (!ssk->tx_sa) {
+		sdp_dbg_data(sk, "Got RdmaRdCompl for aborted SrcAvail\n");
+		goto out;
+	}
+
+	if (ssk->tx_sa->mseq > mseq_ack) {
+		sdp_dbg_data(sk, "RdmaRdCompl arrived for old SrcAvail. "
+			"SendSM mseq_ack: 0x%x, SrcAvail mseq: 0x%x\n",
+			mseq_ack, ssk->tx_sa->mseq);
+		goto out;
+	}
+
+	ssk->tx_sa->bytes_acked += bytes_completed;
+
+	wake_up(sk->sk_sleep);
+	sdp_dbg_data(sk, "woke up sleepers\n");
+
+out:
+	spin_unlock_irqrestore(&ssk->tx_sa_lock, flags);
+	return;
+}
+
+/*
+ * Return how many more bytes the current process may lock, given that the
+ * buffer starts offset bytes into its first page.
+ *
+ * Returns ULONG_MAX for processes with CAP_IPC_LOCK.  Otherwise the result
+ * is RLIMIT_MEMLOCK minus the memory already locked minus offset, clamped
+ * at 0 so that an exhausted limit never wraps around to a huge value.
+ */
+static unsigned long sdp_get_max_memlockable_bytes(unsigned long offset)
+{
+	unsigned long avail;
+	unsigned long lock_limit;
+	unsigned long locked;
+
+	if (capable(CAP_IPC_LOCK))
+		return ULONG_MAX;
+
+	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+	locked = current->mm->locked_vm << PAGE_SHIFT;
+
+	/* All arithmetic is unsigned: check before subtracting. */
+	if (locked >= lock_limit)
+		return 0;
+	avail = lock_limit - locked;
+
+	if (offset >= avail)
+		return 0;
+
+	return avail - offset;
+}
+
+/*
+ * Pin the user buffer [uaddr, uaddr + len) and map it through the device's
+ * FMR pool.
+ *
+ * len is first clamped to SDP_MAX_RDMA_READ_LEN and to what the process
+ * may still lock, so fewer bytes than requested may end up registered;
+ * the caller finds the actual size in (*_umem)->length.
+ *
+ * On success returns 0 and stores the handles in *_fmr and *_umem.  On
+ * failure returns a negative errno, releases everything acquired here and
+ * leaves *_fmr and *_umem untouched.
+ */
+static int sdp_alloc_fmr(struct socket *sk, void *uaddr, size_t len,
+	struct ib_pool_fmr **_fmr, struct ib_umem **_umem)
+{
+	struct ib_pool_fmr *fmr;
+	struct ib_umem *umem;
+	struct ib_device *dev;
+	u64 *pages;
+	struct ib_umem_chunk *chunk;
+	int n, j, k;
+	int npages;
+	/* The DMA address list lives in exactly one page. */
+	const int max_pages = PAGE_SIZE / sizeof(*pages);
+	int rc = 0;
+	unsigned long max_lockable_bytes;
+
+	if (unlikely(len > SDP_MAX_RDMA_READ_LEN)) {
+		sdp_dbg_data(sk, "len:0x%lx > FMR_SIZE: 0x%lx\n",
+			len, SDP_MAX_RDMA_READ_LEN);
+		len = SDP_MAX_RDMA_READ_LEN;
+	}
+
+	max_lockable_bytes = sdp_get_max_memlockable_bytes((unsigned long)uaddr & ~PAGE_MASK);
+	if (unlikely(len > max_lockable_bytes)) {
+		sdp_dbg_data(sk, "len:0x%lx > RLIMIT_MEMLOCK available: 0x%lx\n",
+			len, max_lockable_bytes);
+		len = max_lockable_bytes;
+	}
+
+	sdp_dbg_data(sk, "user buf: %p, len:0x%lx max_lockable_bytes: 0x%lx\n",
+			uaddr, len, max_lockable_bytes);
+
+	umem = ib_umem_get(&sdp_sk(sk)->context, (unsigned long)uaddr, len,
+		IB_ACCESS_REMOTE_WRITE, 0);
+
+	if (IS_ERR(umem)) {
+		rc = PTR_ERR(umem);
+		sdp_warn(sk, "Error doing umem_get 0x%lx bytes: %d\n", len, rc);
+		sdp_warn(sk, "RLIMIT_MEMLOCK: 0x%lx[cur] 0x%lx[max] CAP_IPC_LOCK: %d\n",
+				current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur,
+				current->signal->rlim[RLIMIT_MEMLOCK].rlim_max,
+				capable(CAP_IPC_LOCK));
+		goto err_umem_get;
+	}
+
+	sdp_dbg_data(sk, "umem->offset = 0x%x, length = 0x%lx\n",
+		umem->offset, umem->length);
+
+	pages = (u64 *) __get_free_page(GFP_KERNEL);
+	if (!pages) {
+		/* Without this the function would report success. */
+		rc = -ENOMEM;
+		goto err_pages_alloc;
+	}
+
+	n = 0;
+
+	/* Flatten the umem's scatterlist into a list of page DMA addresses. */
+	dev = sdp_sk(sk)->ib_device;
+	list_for_each_entry(chunk, &umem->chunk_list, list) {
+		for (j = 0; j < chunk->nmap; ++j) {
+			npages = ib_sg_dma_len(dev,
+					&chunk->page_list[j]) >> PAGE_SHIFT;
+
+			/* Never write past the single page backing pages[]. */
+			if (npages > max_pages - n) {
+				sdp_warn(sk, "umem spans more than %d pages\n",
+						max_pages);
+				rc = -EINVAL;
+				goto err_fmr_alloc;
+			}
+
+			for (k = 0; k < npages; ++k) {
+				pages[n++] = ib_sg_dma_address(dev,
+						&chunk->page_list[j]) +
+					umem->page_size * k;
+
+			}
+		}
+	}
+
+	fmr = ib_fmr_pool_map_phys(sdp_sk(sk)->sdp_dev->fmr_pool, pages, n, 0);
+	if (IS_ERR(fmr)) {
+		rc = PTR_ERR(fmr);
+		sdp_warn(sk, "Error allocating fmr: %ld\n", PTR_ERR(fmr));
+		goto err_fmr_alloc;
+	}
+
+	/* The address list is only needed while mapping. */
+	free_page((unsigned long) pages);
+
+	*_umem = umem;
+	*_fmr = fmr;
+
+	return 0;
+
+err_fmr_alloc:	
+	free_page((unsigned long) pages);
+
+err_pages_alloc:
+	ib_umem_release(umem);
+
+err_umem_get:
+
+	return rc;
+}
+
+/*
+ * Release the FMR mapping and pinned user memory obtained from
+ * sdp_alloc_fmr() and clear the caller's handles.  A handle that is
+ * already NULL is skipped, so the function is safe to call again after a
+ * previous release.
+ *
+ * Nothing is released while the QP is inactive.
+ * NOTE(review): whether the resources are reclaimed elsewhere in that
+ * case cannot be told from this function -- confirm there is no leak.
+ */
+void sdp_free_fmr(struct socket *sk, struct ib_pool_fmr **_fmr, struct ib_umem **_umem)
+{
+	if (!sdp_sk(sk)->qp_active)
+		return;
+
+	if (*_fmr) {
+		ib_fmr_pool_unmap(*_fmr);
+		*_fmr = NULL;
+	}
+
+	if (*_umem) {
+		ib_umem_release(*_umem);
+		*_umem = NULL;
+	}
+}
+
+/*
+ * Post a signaled RDMA READ that pulls data from the peer's advertised
+ * buffer (rx_sa->vaddr + rx_sa->used, rx_sa->rkey) into the local region
+ * registered in rx_sa->fmr / rx_sa->umem.
+ *
+ * rx_sa is recorded as the in-flight RDMA and marked busy before posting.
+ * If ib_post_send() fails, both are rolled back so that no waiter sleeps
+ * on a request that was never queued.
+ *
+ * Returns the result of ib_post_send().
+ */
+static int sdp_post_rdma_read(struct socket *sk, struct rx_srcavail_state *rx_sa)
+{
+	struct sdp_sock *ssk = sdp_sk(sk);
+	struct ib_send_wr *bad_wr;
+	struct ib_send_wr wr = { NULL };
+	struct ib_sge sge;
+	int rc;
+
+	sge.addr = rx_sa->umem->offset;
+	sge.length = rx_sa->umem->length;
+	sge.lkey = rx_sa->fmr->fmr->lkey;
+
+	wr.opcode = IB_WR_RDMA_READ;
+	wr.next = NULL;
+	wr.wr_id = SDP_OP_RDMA;
+	wr.wr.rdma.rkey = rx_sa->rkey;
+	wr.wr.rdma.remote_addr = rx_sa->vaddr + rx_sa->used;
+	wr.num_sge = 1;
+	wr.sg_list = &sge;
+	wr.send_flags = IB_SEND_SIGNALED;
+
+	/* Publish the in-flight state before the request can complete. */
+	ssk->tx_ring.rdma_inflight = rx_sa;
+	rx_sa->busy++;
+
+	rc = ib_post_send(ssk->qp, &wr, &bad_wr);
+	if (rc) {
+		/* Nothing was queued, so no completion will clear these. */
+		rx_sa->busy--;
+		ssk->tx_ring.rdma_inflight = NULL;
+	}
+
+	return rc;
+}
+
+/*
+ * Receive data advertised by the peer's SrcAvail (state attached to mb)
+ * directly into the user buffer described by iov using an RDMA READ.
+ *
+ * used: in  - number of bytes wanted (capped at rx_sa->len)
+ *       out - number of bytes actually read; only written on success
+ *
+ * The user buffer is registered, the read is posted and the function then
+ * sleeps until it completes.  On success the iovec, rx_sa->used and
+ * ssk->rcv_nxt are advanced by the amount read.
+ *
+ * Returns 0 on success or a negative errno.  On failure with the QP still
+ * active, rx_sa is flagged RX_SA_ABORTED.
+ */
+int sdp_rdma_to_iovec(struct socket *sk, struct iovec *iov, struct mbuf *mb,
+		unsigned long *used)
+{
+	struct sdp_sock *ssk = sdp_sk(sk);
+	struct rx_srcavail_state *rx_sa = RX_SRCAVAIL_STATE(mb);
+	int got_srcavail_cancel;
+	int rc = 0;
+	int len = *used;
+	int copied;
+
+	sdp_dbg_data(ssk->socket, "preparing RDMA read."
+		" len: 0x%x. buffer len: 0x%lx\n", len, iov->iov_len);
+
+	/* Hold a socket reference for the duration of the RDMA read. */
+	sock_hold(sk, SOCK_REF_RDMA_RD);
+
+	if (len > rx_sa->len) {
+		sdp_warn(sk, "len:0x%x > rx_sa->len: 0x%x\n", len, rx_sa->len);
+		WARN_ON(1);
+		len = rx_sa->len;
+	}
+
+	rc = sdp_alloc_fmr(sk, iov->iov_base, len, &rx_sa->fmr, &rx_sa->umem);
+	if (rc) {
+		sdp_warn(sk, "Error allocating fmr: %d\n", rc);
+		goto err_alloc_fmr;
+	}
+
+	rc = sdp_post_rdma_read(sk, rx_sa);
+	if (unlikely(rc)) {
+		sdp_warn(sk, "ib_post_send failed with status %d.\n", rc);
+		sdp_set_error(ssk->socket, -ECONNRESET);
+		wake_up(&ssk->wq);
+		goto err_post_send;
+	}
+
+	sdp_prf(sk, mb, "Finished posting(rc=%d), now to wait", rc);
+
+	/* NOTE(review): assigned here but never read in this function. */
+	got_srcavail_cancel = ssk->srcavail_cancel_mseq > rx_sa->mseq;
+
+	/*
+	 * NOTE(review): this passes the socket, whereas sdp_tx_ring_create()
+	 * passes the sdp_sock -- confirm the expected argument type.
+	 */
+	sdp_arm_tx_cq(sk);
+
+	sdp_wait_rdma_wr_finished(ssk);
+
+	sdp_prf(sk, mb, "Finished waiting(rc=%d)", rc);
+	if (!ssk->qp_active) {
+		sdp_dbg_data(sk, "QP destroyed during RDMA read\n");
+		rc = -EPIPE;
+		goto err_post_send;
+	}
+
+	/* The amount read is the size of the region that was registered. */
+	copied = rx_sa->umem->length;
+
+	sdp_update_iov_used(sk, iov, copied);
+	rx_sa->used += copied;
+	atomic_add(copied, &ssk->rcv_nxt);
+	*used = copied;
+
+	ssk->tx_ring.rdma_inflight = NULL;
+
+	/* The success path falls through to release the registration too. */
+err_post_send:
+	sdp_free_fmr(sk, &rx_sa->fmr, &rx_sa->umem);
+
+err_alloc_fmr:
+	if (rc && ssk->qp_active) {
+		sdp_warn(sk, "Couldn't do RDMA - post sendsm\n");
+		rx_sa->flags |= RX_SA_ABORTED;
+	}
+
+	sock_put(sk, SOCK_REF_RDMA_RD);
+
+	return rc;
+}
+
+/*
+ * Block until one transmit credit is available: flag the socket as out of
+ * space, flush pending posts, poll the TX side once and then wait in
+ * sdp_tx_wait_memory().  Returns that function's result.
+ */
+static inline int wait_for_sndbuf(struct socket *sk, long *timeo_p)
+{
+	struct sdp_sock *ssk = sdp_sk(sk);
+	int credits_needed = 1;
+
+	sdp_dbg_data(sk, "Wait for mem\n");
+
+	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+	SDPSTATS_COUNTER_INC(send_wait_for_mem);
+
+	sdp_do_posts(ssk);
+	sdp_xmit_poll(ssk, 1);
+
+	return sdp_tx_wait_memory(ssk, timeo_p, &credits_needed);
+}
+
+/*
+ * Perform one zero-copy send round for the buffer described by iov:
+ * register the buffer, wait for a free TX slot if needed, post a SrcAvail
+ * and sleep until the peer has read the data or the transfer is aborted.
+ *
+ * On return iov has been advanced by the number of bytes acknowledged
+ * (tx_sa->bytes_acked) and the registration has been released.  The reason
+ * for an abort, if any, is left in tx_sa->abort_flags.
+ *
+ * Returns 0 on success or a negative errno from the failing step.
+ */
+static int do_sdp_sendmsg_zcopy(struct socket *sk, struct tx_srcavail_state *tx_sa,
+		struct iovec *iov, long *timeo)
+{
+	struct sdp_sock *ssk = sdp_sk(sk);
+	int rc = 0;
+	unsigned long lock_flags;
+
+	rc = sdp_alloc_fmr(sk, iov->iov_base, iov->iov_len,
+			&tx_sa->fmr, &tx_sa->umem);
+	if (rc) {
+		sdp_warn(sk, "Error allocating fmr: %d\n", rc);
+		goto err_alloc_fmr;
+	}
+
+	if (tx_slots_free(ssk) == 0) {
+		rc = wait_for_sndbuf(sk, timeo);
+		if (rc) {
+			sdp_warn(sk, "Couldn't get send buffer\n");
+			goto err_no_tx_slots;
+		}
+	}
+
+	rc = sdp_post_srcavail(sk, tx_sa);
+	if (rc) {
+		sdp_dbg(sk, "Error posting SrcAvail\n");
+		goto err_abort_send;
+	}
+
+	rc = sdp_wait_rdmardcompl(ssk, timeo, 0);
+	if (unlikely(rc)) {
+		enum tx_sa_flag f = tx_sa->abort_flags;
+
+		if (f & TX_SA_SENDSM) {
+			sdp_dbg_data(sk, "Got SendSM. use SEND verb.\n");
+		} else if (f & TX_SA_ERROR) {
+			sdp_dbg_data(sk, "SrcAvail error completion\n");
+			sdp_reset(sk);
+			SDPSTATS_COUNTER_INC(zcopy_tx_error);
+		} else if (ssk->qp_active) {
+			/* Timeout, signal or crossing SrcAvail: cancel ours. */
+			sdp_post_srcavail_cancel(sk);
+
+			/* Wait for RdmaRdCompl/SendSM to
+			 * finish the transaction */
+			*timeo = 2 * HZ;
+			sdp_dbg_data(sk, "Waiting for SendSM\n");
+			sdp_wait_rdmardcompl(ssk, timeo, 1);
+			sdp_dbg_data(sk, "finished waiting\n");
+
+			cancel_delayed_work(&ssk->srcavail_cancel_work);
+		} else {
+			sdp_dbg_data(sk, "QP was destroyed while waiting\n");
+		}
+	} else {
+		sdp_dbg_data(sk, "got RdmaRdCompl\n");
+	}
+
+	/* Detach tx_sa so late RdmaRdCompl/SendSM handlers ignore it. */
+	spin_lock_irqsave(&ssk->tx_sa_lock, lock_flags);
+	ssk->tx_sa = NULL;
+	spin_unlock_irqrestore(&ssk->tx_sa_lock, lock_flags);
+
+err_abort_send:
+	/* Consume from the iovec whatever the peer acknowledged. */
+	sdp_update_iov_used(sk, iov, tx_sa->bytes_acked);
+
+err_no_tx_slots:
+	sdp_free_fmr(sk, &tx_sa->fmr, &tx_sa->umem);
+
+err_alloc_fmr:
+	return rc;	
+}
+
+/*
+ * Send the buffer described by iov with zero-copy (SrcAvail) rounds until
+ * it is exhausted, an error or abort occurs, or the remainder drops below
+ * sdp_zcopy_thresh.  iov is advanced past the bytes that were sent.
+ *
+ * Returns the number of bytes sent, which may be 0 or less than the buffer
+ * size; the caller is expected to send the remainder by other means.  A
+ * negative errno is returned for errors other than -EAGAIN and -ETIME,
+ * which are treated as "fall back" conditions.
+ */
+int sdp_sendmsg_zcopy(struct kiocb *iocb, struct socket *sk, struct iovec *iov)
+{
+	struct sdp_sock *ssk = sdp_sk(sk);
+	int rc = 0;
+	long timeo;
+	struct tx_srcavail_state *tx_sa;
+	size_t bytes_to_copy;
+	int copied;
+
+	sdp_dbg_data(sk, "Sending iov: %p, iov_len: 0x%lx\n",
+			iov->iov_base, iov->iov_len);
+	sdp_prf1(sk, NULL, "sdp_sendmsg_zcopy start");
+	if (ssk->rx_sa) {
+		sdp_dbg_data(sk, "Deadlock prevent: crossing SrcAvail\n");
+		return 0;
+	}
+
+	sock_hold(ssk->socket, SOCK_REF_ZCOPY);
+
+	SDPSTATS_COUNTER_INC(sendmsg_zcopy_segment);
+
+	timeo = SDP_SRCAVAIL_ADV_TIMEOUT ;
+
+	/*
+	 * Record the starting size before anything can fail, so that the
+	 * byte count computed at the exit label is valid on every path.
+	 */
+	bytes_to_copy = iov->iov_len;
+
+	/* Ok commence sending. */
+	tx_sa = kmalloc(sizeof(*tx_sa), GFP_KERNEL);
+	if (!tx_sa) {
+		sdp_warn(sk, "Error allocating zcopy context\n");
+		rc = -EAGAIN; /* No zcopy context - fallback to bcopy */
+		goto err_alloc_tx_sa;
+	}
+
+	do {
+		tx_sa_reset(tx_sa);
+
+		rc = do_sdp_sendmsg_zcopy(sk, tx_sa, iov, &timeo);
+
+		if (iov->iov_len && iov->iov_len < sdp_zcopy_thresh) {
+			sdp_dbg_data(sk, "0x%lx bytes left, switching to bcopy\n",
+				iov->iov_len);
+			break;
+		}
+	} while (!rc && iov->iov_len > 0 && !tx_sa->abort_flags);
+
+	kfree(tx_sa);
+err_alloc_tx_sa:
+	copied = bytes_to_copy - iov->iov_len;
+
+	sdp_prf1(sk, NULL, "sdp_sendmsg_zcopy end rc: %d copied: %d", rc, copied);
+
+	sock_put(ssk->socket, SOCK_REF_ZCOPY);
+
+	if (rc < 0 && rc != -EAGAIN && rc != -ETIME)
+		return rc;
+
+	return copied;
+}
+
+/*
+ * Abort the SrcAvail currently outstanding on the socket, if any: cancel
+ * the SrcAvailCancel timeout work, flush scheduled work, release the
+ * buffer registration and detach tx_sa from the socket.
+ */
+void sdp_abort_srcavail(struct socket *sk)
+{
+	struct sdp_sock *ssk = sdp_sk(sk);
+	/*
+	 * NOTE(review): tx_sa is sampled here, before tx_sa_lock is taken
+	 * below -- confirm nothing can clear ssk->tx_sa in between.
+	 */
+	struct tx_srcavail_state *tx_sa = ssk->tx_sa;
+	unsigned long flags;
+
+	if (!tx_sa)
+		return;
+
+	cancel_delayed_work(&ssk->srcavail_cancel_work);
+	flush_scheduled_work();
+
+	spin_lock_irqsave(&ssk->tx_sa_lock, flags);
+
+	/*
+	 * NOTE(review): the release runs with the spinlock held and
+	 * interrupts disabled -- confirm the callees are safe there.
+	 */
+	sdp_free_fmr(sk, &tx_sa->fmr, &tx_sa->umem);
+
+	ssk->tx_sa = NULL;
+
+	spin_unlock_irqrestore(&ssk->tx_sa_lock, flags);
+}
+
+/*
+ * Abort the peer SrcAvail currently being received, if any: release its
+ * buffer registration and detach rx_sa from the socket.
+ */
+void sdp_abort_rdma_read(struct socket *sk)
+{
+	struct sdp_sock *ssk = sdp_sk(sk);
+	struct rx_srcavail_state *pending = ssk->rx_sa;
+
+	if (pending) {
+		sdp_free_fmr(sk, &pending->fmr, &pending->umem);
+		ssk->rx_sa = NULL;
+	}
+}
diff --git a/sys/ofed/drivers/infiniband/util/Kconfig b/sys/ofed/drivers/infiniband/util/Kconfig
new file mode 100644
index 0000000..5e98eaa
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/util/Kconfig
@@ -0,0 +1,6 @@
+config INFINIBAND_MADEYE
+	tristate "MAD debug viewer for InfiniBand"
+	depends on INFINIBAND
+	---help---
+	  Prints sent and received MADs on QP 0/1 for debugging.
+
diff --git a/sys/ofed/drivers/infiniband/util/Makefile b/sys/ofed/drivers/infiniband/util/Makefile
new file mode 100644
index 0000000..caf9471
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/util/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_INFINIBAND_MADEYE)	+= ib_madeye.o
+
+ib_madeye-y := madeye.o
diff --git a/sys/ofed/drivers/infiniband/util/madeye.c b/sys/ofed/drivers/infiniband/util/madeye.c
new file mode 100644
index 0000000..2c650a3
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/util/madeye.c
@@ -0,0 +1,593 @@
+/*
+ * Copyright (c) 2004, 2005 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2005, 2006 Voltaire Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id$
+ */
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/err.h>
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_sa.h>
+
+/* Module metadata. */
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("InfiniBand MAD viewer");
+MODULE_LICENSE("Dual BSD/GPL");
+
+/* Per-device callbacks, defined later in this file. */
+static void madeye_remove_one(struct ib_device *device);
+static void madeye_add_one(struct ib_device *device);
+
+/* IB client descriptor wiring the add/remove callbacks under "madeye". */
+static struct ib_client madeye_client = {
+	.name   = "madeye",
+	.add    = madeye_add_one,
+	.remove = madeye_remove_one
+};
+
+/* Pair of MAD agents kept per port: one for SMI and one for GSI traffic. */
+struct madeye_port {
+	struct ib_mad_agent *smi_agent;
+	struct ib_mad_agent *gsi_agent;
+};
+
+/*
+ * Display filters, settable as read-only (0444) module parameters.
+ * A value of 0 for mgmt_class, attr_id or data leaves that filter/option
+ * off, as stated in the parameter descriptions below.
+ */
+static int smp = 1;
+static int gmp = 1;
+static int mgmt_class = 0;
+static int attr_id = 0;
+static int data = 0;
+
+module_param(smp, int, 0444);
+module_param(gmp, int, 0444);
+module_param(mgmt_class, int, 0444);
+module_param(attr_id, int, 0444);
+module_param(data, int, 0444);
+
+MODULE_PARM_DESC(smp, "Display all SMPs (default=1)");
+MODULE_PARM_DESC(gmp, "Display all GMPs (default=1)");
+MODULE_PARM_DESC(mgmt_class, "Display all MADs of specified class (default=0)");
+MODULE_PARM_DESC(attr_id, "Display all MADs of specified attribute ID (default=0)");
+MODULE_PARM_DESC(data, "Display data area of MADs (default=0)");
+
+/*
+ * Map a MAD management class code to a printable name.  Codes without a
+ * dedicated entry yield "Unknown vendor/application".
+ */
+static char * get_class_name(u8 mgmt_class)
+{
+	char *name;
+
+	switch (mgmt_class) {
+	case IB_MGMT_CLASS_SUBN_LID_ROUTED:
+		name = "LID routed SMP";
+		break;
+	case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
+		name = "Directed route SMP";
+		break;
+	case IB_MGMT_CLASS_SUBN_ADM:
+		name = "Subnet admin.";
+		break;
+	case IB_MGMT_CLASS_PERF_MGMT:
+		name = "Perf. mgmt.";
+		break;
+	case IB_MGMT_CLASS_BM:
+		name = "Baseboard mgmt.";
+		break;
+	case IB_MGMT_CLASS_DEVICE_MGMT:
+		name = "Device mgmt.";
+		break;
+	case IB_MGMT_CLASS_CM:
+		name = "Comm. mgmt.";
+		break;
+	case IB_MGMT_CLASS_SNMP:
+		name = "SNMP";
+		break;
+	default:
+		name = "Unknown vendor/application";
+		break;
+	}
+
+	return name;
+}
+
+/*
+ * Map a MAD method code to a printable name.  Methods common to all
+ * classes are tried first; if none matches, methods specific to the given
+ * management class (currently only subnet administration) are tried.
+ * Returns "Unknown" when nothing matches.
+ */
+static char * get_method_name(u8 mgmt_class, u8 method)
+{
+	/* Methods shared by every management class. */
+	switch(method) {
+	case IB_MGMT_METHOD_GET:
+		return "Get";
+	case IB_MGMT_METHOD_SET:
+		return "Set";
+	case IB_MGMT_METHOD_GET_RESP:
+		return "Get response";
+	case IB_MGMT_METHOD_SEND:
+		return "Send";
+	case IB_MGMT_METHOD_SEND | IB_MGMT_METHOD_RESP:
+		return "Send response";
+	case IB_MGMT_METHOD_TRAP:
+		return "Trap";
+	case IB_MGMT_METHOD_REPORT:
+		return "Report";
+	case IB_MGMT_METHOD_REPORT_RESP:
+		return "Report response";
+	case IB_MGMT_METHOD_TRAP_REPRESS:
+		return "Trap repress";
+	default:
+		break;
+	}
+
+	/* Class-specific methods. */
+	switch (mgmt_class) {
+	case IB_MGMT_CLASS_SUBN_ADM:
+		switch (method) {
+		case IB_SA_METHOD_GET_TABLE:
+			return "Get table";
+		case IB_SA_METHOD_GET_TABLE_RESP:
+			return "Get table response";
+		case IB_SA_METHOD_DELETE:
+			return "Delete";
+		case IB_SA_METHOD_DELETE_RESP:
+			return "Delete response";
+		case IB_SA_METHOD_GET_MULTI:
+			return "Get Multi";
+		case IB_SA_METHOD_GET_MULTI_RESP:
+			return "Get Multi response";
+		case IB_SA_METHOD_GET_TRACE_TBL:
+			/* This is the request method, not a response. */
+			return "Get Trace Table";
+		default:
+			break;
+		}
+		break;
+	default:
+		break;
+	}
+
+	return "Unknown";
+}
+
+/*
+ * Print a human-readable breakdown of a MAD status word (host byte
+ * order): bit 0 (busy), bit 1 (redirection required) and the 3-bit code
+ * held in bits 2-4.  Codes without a message are silently ignored.
+ */
+static void print_status_details(u16 status)
+{
+	unsigned int code = (status & 0x001C) >> 2;
+
+	if (status & 0x0001)
+		printk("               busy\n");
+	if (status & 0x0002)
+		printk("               redirection required\n");
+
+	if (code == 1)
+		printk("               bad version\n");
+	else if (code == 2)
+		printk("               method not supported\n");
+	else if (code == 3)
+		printk("               method/attribute combo not supported\n");
+	else if (code == 7)
+		printk("               invalid attribute/modifier value\n");
+}
+
+/*
+ * Map a subnet administration attribute ID to a printable name.  Returns
+ * an empty string for IDs without a dedicated entry.
+ */
+static char * get_sa_attr(__be16 attr)
+{
+	char *name;
+
+	switch (attr) {
+	case IB_SA_ATTR_CLASS_PORTINFO:
+		name = "Class Port Info";
+		break;
+	case IB_SA_ATTR_NOTICE:
+		name = "Notice";
+		break;
+	case IB_SA_ATTR_INFORM_INFO:
+		name = "Inform Info";
+		break;
+	case IB_SA_ATTR_NODE_REC:
+		name = "Node Record";
+		break;
+	case IB_SA_ATTR_PORT_INFO_REC:
+		name = "PortInfo Record";
+		break;
+	case IB_SA_ATTR_SL2VL_REC:
+		name = "SL to VL Record";
+		break;
+	case IB_SA_ATTR_SWITCH_REC:
+		name = "Switch Record";
+		break;
+	case IB_SA_ATTR_LINEAR_FDB_REC:
+		name = "Linear FDB Record";
+		break;
+	case IB_SA_ATTR_RANDOM_FDB_REC:
+		name = "Random FDB Record";
+		break;
+	case IB_SA_ATTR_MCAST_FDB_REC:
+		name = "Multicast FDB Record";
+		break;
+	case IB_SA_ATTR_SM_INFO_REC:
+		name = "SM Info Record";
+		break;
+	case IB_SA_ATTR_LINK_REC:
+		name = "Link Record";
+		break;
+	case IB_SA_ATTR_GUID_INFO_REC:
+		name = "Guid Info Record";
+		break;
+	case IB_SA_ATTR_SERVICE_REC:
+		name = "Service Record";
+		break;
+	case IB_SA_ATTR_PARTITION_REC:
+		name = "Partition Record";
+		break;
+	case IB_SA_ATTR_PATH_REC:
+		name = "Path Record";
+		break;
+	case IB_SA_ATTR_VL_ARB_REC:
+		name = "VL Arb Record";
+		break;
+	case IB_SA_ATTR_MC_MEMBER_REC:
+		name = "MC Member Record";
+		break;
+	case IB_SA_ATTR_TRACE_REC:
+		name = "Trace Record";
+		break;
+	case IB_SA_ATTR_MULTI_PATH_REC:
+		name = "Multi Path Record";
+		break;
+	case IB_SA_ATTR_SERVICE_ASSOC_REC:
+		name = "Service Assoc Record";
+		break;
+	case IB_SA_ATTR_INFORM_INFO_REC:
+		name = "Inform Info Record";
+		break;
+	default:
+		name = "";
+		break;
+	}
+
+	return name;
+}
+
+/*
+ * Print the fields of a common MAD header, one per line.  Multi-byte
+ * fields are converted from big-endian before printing.  For subnet
+ * administration MADs the attribute name is printed next to its ID.
+ */
+static void print_mad_hdr(struct ib_mad_hdr *mad_hdr)
+{
+	printk("MAD version....0x%01x\n", mad_hdr->base_version);
+	printk("Class..........0x%01x (%s)\n", mad_hdr->mgmt_class,
+	       get_class_name(mad_hdr->mgmt_class));
+	printk("Class version..0x%01x\n", mad_hdr->class_version);
+	printk("Method.........0x%01x (%s)\n", mad_hdr->method,
+	       get_method_name(mad_hdr->mgmt_class, mad_hdr->method));
+	printk("Status.........0x%02x\n", be16_to_cpu(mad_hdr->status));
+	if (mad_hdr->status)
+		print_status_details(be16_to_cpu(mad_hdr->status));
+	printk("Class specific.0x%02x\n", be16_to_cpu(mad_hdr->class_specific));
+	printk("Trans ID.......0x%llx\n", 
+		(unsigned long long)be64_to_cpu(mad_hdr->tid));
+	if (mad_hdr->mgmt_class == IB_MGMT_CLASS_SUBN_ADM)
+		/*
+		 * get_sa_attr() takes a __be16, so it gets the raw field;
+		 * only the printed number is converted to host order.
+		 */
+		printk("Attr ID........0x%02x (%s)\n",
+		       be16_to_cpu(mad_hdr->attr_id),
+		       get_sa_attr(mad_hdr->attr_id));
+	else
+		printk("Attr ID........0x%02x\n",
+		       be16_to_cpu(mad_hdr->attr_id));
+	printk("Attr modifier..0x%04x\n", be32_to_cpu(mad_hdr->attr_mod));
+}
+
+/* Map an RMPP type code to a printable name ("Unknown" if unmatched). */
+static char * get_rmpp_type(u8 rmpp_type)
+{
+	char *name;
+
+	switch (rmpp_type) {
+	case IB_MGMT_RMPP_TYPE_DATA:
+		name = "Data";
+		break;
+	case IB_MGMT_RMPP_TYPE_ACK:
+		name = "Ack";
+		break;
+	case IB_MGMT_RMPP_TYPE_STOP:
+		name = "Stop";
+		break;
+	case IB_MGMT_RMPP_TYPE_ABORT:
+		name = "Abort";
+		break;
+	default:
+		name = "Unknown";
+		break;
+	}
+
+	return name;
+}
+
+/*
+ * Describe the RMPP flag bits as text: "Inactive" when the ACTIVE bit is
+ * clear, otherwise "Active" qualified by the FIRST and/or LAST bits.
+ */
+static char * get_rmpp_flags(u8 rmpp_flags)
+{
+	int is_first = (rmpp_flags & IB_MGMT_RMPP_FLAG_FIRST) != 0;
+	int is_last = (rmpp_flags & IB_MGMT_RMPP_FLAG_LAST) != 0;
+
+	if (!(rmpp_flags & IB_MGMT_RMPP_FLAG_ACTIVE))
+		return "Inactive";
+
+	if (is_first)
+		return is_last ? "Active - First & Last" : "Active - First";
+
+	return is_last ? "Active - Last" : "Active";
+}
+
+/*
+ * Print the fields of an RMPP header, one per line.  The last line shows
+ * the paylen_newwin field under a label that depends on the RMPP type:
+ * payload length for DATA, new window for ACK, "Data 2" otherwise.
+ */
+static void print_rmpp_hdr(struct ib_rmpp_hdr *rmpp_hdr)
+{
+	printk("RMPP version...0x%01x\n", rmpp_hdr->rmpp_version);
+	printk("RMPP type......0x%01x (%s)\n", rmpp_hdr->rmpp_type,
+	       get_rmpp_type(rmpp_hdr->rmpp_type));
+	printk("RMPP RRespTime.0x%01x\n", ib_get_rmpp_resptime(rmpp_hdr));
+	printk("RMPP flags.....0x%01x (%s)\n", ib_get_rmpp_flags(rmpp_hdr),
+	       get_rmpp_flags(ib_get_rmpp_flags(rmpp_hdr)));
+	printk("RMPP status....0x%01x\n", rmpp_hdr->rmpp_status);
+	printk("Seg number.....0x%04x\n", be32_to_cpu(rmpp_hdr->seg_num));
+
+	if (rmpp_hdr->rmpp_type == IB_MGMT_RMPP_TYPE_DATA) {
+		printk("Payload len....0x%04x\n",
+		       be32_to_cpu(rmpp_hdr->paylen_newwin));
+	} else if (rmpp_hdr->rmpp_type == IB_MGMT_RMPP_TYPE_ACK) {
+		printk("New window.....0x%04x\n",
+		       be32_to_cpu(rmpp_hdr->paylen_newwin));
+	} else {
+		printk("Data 2.........0x%04x\n",
+		       be32_to_cpu(rmpp_hdr->paylen_newwin));
+	}
+}
+
+/*
+ * Map a subnet management (SMP) attribute ID to a printable name.
+ * Returns an empty string for IDs without a dedicated entry.
+ */
+static char * get_smp_attr(__be16 attr)
+{
+	char *name;
+
+	switch (attr) {
+	case IB_SMP_ATTR_NOTICE:
+		name = "notice";
+		break;
+	case IB_SMP_ATTR_NODE_DESC:
+		name = "node description";
+		break;
+	case IB_SMP_ATTR_NODE_INFO:
+		name = "node info";
+		break;
+	case IB_SMP_ATTR_SWITCH_INFO:
+		name = "switch info";
+		break;
+	case IB_SMP_ATTR_GUID_INFO:
+		name = "GUID info";
+		break;
+	case IB_SMP_ATTR_PORT_INFO:
+		name = "port info";
+		break;
+	case IB_SMP_ATTR_PKEY_TABLE:
+		name = "pkey table";
+		break;
+	case IB_SMP_ATTR_SL_TO_VL_TABLE:
+		name = "SL to VL table";
+		break;
+	case IB_SMP_ATTR_VL_ARB_TABLE:
+		name = "VL arbitration table";
+		break;
+	case IB_SMP_ATTR_LINEAR_FORWARD_TABLE:
+		name = "linear forwarding table";
+		break;
+	case IB_SMP_ATTR_RANDOM_FORWARD_TABLE:
+		name = "random forward table";
+		break;
+	case IB_SMP_ATTR_MCAST_FORWARD_TABLE:
+		name = "multicast forward table";
+		break;
+	case IB_SMP_ATTR_SM_INFO:
+		name = "SM info";
+		break;
+	case IB_SMP_ATTR_VENDOR_DIAG:
+		name = "vendor diags";
+		break;
+	case IB_SMP_ATTR_LED_INFO:
+		name = "LED info";
+		break;
+	default:
+		name = "";
+		break;
+	}
+
+	return name;
+}
+
+/*
+ * Print the header fields of an SMP, one per line.  When the "data"
+ * module parameter is non-zero, also dump the data area and the initial
+ * and return paths as hex bytes, 16 per row.
+ */
+static void print_smp(struct ib_smp *smp)
+{
+	int idx;
+
+	printk("MAD version....0x%01x\n", smp->base_version);
+	printk("Class..........0x%01x (%s)\n", smp->mgmt_class,
+	       get_class_name(smp->mgmt_class));
+	printk("Class version..0x%01x\n", smp->class_version);
+	printk("Method.........0x%01x (%s)\n", smp->method,
+	       get_method_name(smp->mgmt_class, smp->method));
+	printk("Status.........0x%02x\n", be16_to_cpu(smp->status));
+	if (smp->status)
+		print_status_details(be16_to_cpu(smp->status));
+	printk("Hop pointer....0x%01x\n", smp->hop_ptr);
+	printk("Hop counter....0x%01x\n", smp->hop_cnt);
+	printk("Trans ID.......0x%llx\n", 
+		(unsigned long long)be64_to_cpu(smp->tid));
+	printk("Attr ID........0x%02x (%s)\n", be16_to_cpu(smp->attr_id),
+		get_smp_attr(smp->attr_id));
+	printk("Attr modifier..0x%04x\n", be32_to_cpu(smp->attr_mod));
+
+	printk("Mkey...........0x%llx\n",
+		(unsigned long long)be64_to_cpu(smp->mkey));
+	printk("DR SLID........0x%02x\n", be16_to_cpu(smp->dr_slid));
+	/* No newline here: the dump rows or the final printk supply it. */
+	printk("DR DLID........0x%02x", be16_to_cpu(smp->dr_dlid));
+
+	if (!data) {
+		printk("\n");
+		return;
+	}
+
+	for (idx = 0; idx < IB_SMP_DATA_SIZE; idx++) {
+		if (idx % 16 == 0)
+			printk("\nSMP Data.......");
+		printk("%01x ", smp->data[idx]);
+	}
+	for (idx = 0; idx < IB_SMP_MAX_PATH_HOPS; idx++) {
+		if (idx % 16 == 0)
+			printk("\nInitial path...");
+		printk("%01x ", smp->initial_path[idx]);
+	}
+	for (idx = 0; idx < IB_SMP_MAX_PATH_HOPS; idx++) {
+		if (idx % 16 == 0)
+			printk("\nReturn path....");
+		printk("%01x ", smp->return_path[idx]);
+	}
+	printk("\n");
+}
+
+/*
+ * Send-side snoop callback for the SMI QP.  Prints the SMP that was sent
+ * unless it is rejected by the file-level filters: the class filter is
+ * applied only while "smp" is zero, and the attribute filter only while
+ * "attr_id" is non-zero.
+ */
+static void snoop_smi_handler(struct ib_mad_agent *mad_agent,
+			      struct ib_mad_send_buf *send_buf,
+			      struct ib_mad_send_wc *mad_send_wc)
+{
+	struct ib_mad_hdr *mad_hdr = send_buf->mad;
+
+	if ((!smp && mad_hdr->mgmt_class != mgmt_class) ||
+	    (attr_id && be16_to_cpu(mad_hdr->attr_id) != attr_id))
+		return;
+
+	printk("Madeye:sent SMP\n");
+	print_smp(send_buf->mad);
+}
+
+/*
+ * Receive-side snoop callback for the SMI QP.  Applies the same class and
+ * attribute filters as the send side, then prints the received SMP.
+ */
+static void recv_smi_handler(struct ib_mad_agent *mad_agent,
+			     struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_mad_hdr *mad_hdr = &mad_recv_wc->recv_buf.mad->mad_hdr;
+
+	if ((!smp && mad_hdr->mgmt_class != mgmt_class) ||
+	    (attr_id && be16_to_cpu(mad_hdr->attr_id) != attr_id))
+		return;
+
+	printk("Madeye:recv SMP\n");
+	/* The header is the first member of the MAD, so this views the whole SMP. */
+	print_smp((struct ib_smp *)mad_hdr);
+}
+
+/*
+ * Decide whether a MAD is treated as carrying an RMPP header.
+ *
+ * Returns 1 for SA MADs using one of the GetTable/GetTableResp/
+ * GetMultiResp methods and for every MAD whose class lies in vendor
+ * range 2; returns 0 otherwise.
+ */
+static int is_rmpp_mad(struct ib_mad_hdr *mad_hdr)
+{
+	if (mad_hdr->mgmt_class != IB_MGMT_CLASS_SUBN_ADM)
+		return (mad_hdr->mgmt_class >= IB_MGMT_CLASS_VENDOR_RANGE2_START) &&
+		       (mad_hdr->mgmt_class <= IB_MGMT_CLASS_VENDOR_RANGE2_END);
+
+	/* Subnet administration: only the multi-packet methods qualify. */
+	return mad_hdr->method == IB_SA_METHOD_GET_TABLE ||
+	       mad_hdr->method == IB_SA_METHOD_GET_TABLE_RESP ||
+	       mad_hdr->method == IB_SA_METHOD_GET_MULTI_RESP;
+}
+
+/*
+ * Send-side snoop callback for the GSI QP.  After the class filter
+ * (active while "gmp" is zero) and the attribute filter (active while
+ * "attr_id" is non-zero) it prints the MAD header and, for RMPP MADs,
+ * the RMPP header as well.
+ */
+static void snoop_gsi_handler(struct ib_mad_agent *mad_agent,
+			      struct ib_mad_send_buf *send_buf,
+			      struct ib_mad_send_wc *mad_send_wc)
+{
+	struct ib_mad_hdr *mad_hdr = send_buf->mad;
+	struct ib_rmpp_mad *rmpp_mad;
+
+	if ((!gmp && mad_hdr->mgmt_class != mgmt_class) ||
+	    (attr_id && be16_to_cpu(mad_hdr->attr_id) != attr_id))
+		return;
+
+	printk("Madeye:sent GMP\n");
+	print_mad_hdr(mad_hdr);
+
+	if (!is_rmpp_mad(mad_hdr))
+		return;
+
+	rmpp_mad = (struct ib_rmpp_mad *) mad_hdr;
+	print_rmpp_hdr(&rmpp_mad->rmpp_hdr);
+}
+
+/*
+ * Receive-side snoop callback for the GSI QP.
+ *
+ * MADs are filtered by class (only while the file-level "gmp" flag is
+ * zero) and by attribute ID (only while "attr_id" is non-zero).  For a
+ * MAD that passes, the common header is printed, then the RMPP header
+ * if is_rmpp_mad() says there is one.  When the "data" flag is set the
+ * payload is hex-dumped, 16 bytes per line:
+ *   - SA class: IB_MGMT_SA_DATA bytes of the SA MAD data (skipped for
+ *     RMPP MADs that are not of type DATA);
+ *   - other RMPP MADs: the vendor OUI, then IB_MGMT_VENDOR_DATA bytes;
+ *   - everything else: IB_MGMT_MAD_DATA bytes of the plain MAD data.
+ */
+static void recv_gsi_handler(struct ib_mad_agent *mad_agent,
+			     struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_mad_hdr *hdr = &mad_recv_wc->recv_buf.mad->mad_hdr;
+	struct ib_rmpp_mad *mad = NULL;
+	struct ib_sa_mad *sa_mad;
+	struct ib_vendor_mad *vendor_mad;
+	u8 *mad_data;
+	int i, j;
+
+	if (!gmp && hdr->mgmt_class != mgmt_class)
+		return;
+	if (attr_id && be16_to_cpu(hdr->attr_id) != attr_id)
+		return;
+
+	printk("Madeye:recv GMP\n");
+	print_mad_hdr(hdr);
+
+	if (is_rmpp_mad(hdr)) {
+		mad = (struct ib_rmpp_mad *) hdr;
+		print_rmpp_hdr(&mad->rmpp_hdr);
+	}
+
+	if (data) {
+		if (hdr->mgmt_class == IB_MGMT_CLASS_SUBN_ADM) {
+			j = IB_MGMT_SA_DATA;
+			/* Display SA header */
+			if (is_rmpp_mad(hdr) &&
+			    mad->rmpp_hdr.rmpp_type != IB_MGMT_RMPP_TYPE_DATA)
+				return;
+			/*
+			 * recv_buf.mad is already a pointer to the MAD; cast
+			 * the pointer itself, not the address of the field
+			 * that holds it.
+			 */
+			sa_mad = (struct ib_sa_mad *)
+				 mad_recv_wc->recv_buf.mad;
+			mad_data = sa_mad->data;
+		} else {
+			if (is_rmpp_mad(hdr)) {
+				j = IB_MGMT_VENDOR_DATA;
+				/* Display OUI */
+				vendor_mad = (struct ib_vendor_mad *)
+					     mad_recv_wc->recv_buf.mad;
+				printk("Vendor OUI......%01x %01x %01x\n",
+					vendor_mad->oui[0],
+					vendor_mad->oui[1],
+					vendor_mad->oui[2]);
+				mad_data = vendor_mad->data;
+			} else {
+				j = IB_MGMT_MAD_DATA;
+				mad_data = mad_recv_wc->recv_buf.mad->data;
+			}
+		}
+		for (i = 0; i < j; i++) {
+			/* Start a new labelled line every 16 bytes. */
+			if (i % 16 == 0)
+				printk("\nData...........");
+			printk("%01x ", mad_data[i]);
+		}
+		printk("\n");
+	}
+}
+
+/*
+ * Set up snooping on every port of a newly seen IB device (presumably the
+ * "add" callback of madeye_client; the client definition is outside this
+ * view).
+ *
+ * Switches are snooped on port 0 only; other node types on ports
+ * 1..phys_port_cnt.  One madeye_port entry is allocated per snooped port
+ * and an SMI and a GSI snoop agent are registered for each.  The results
+ * of ib_register_mad_snoop() are stored without being checked here;
+ * madeye_remove_one() tests them with IS_ERR() before unregistering.
+ *
+ * The array is stored as the device's client data.  If the allocation
+ * failed, NULL is stored instead.
+ */
+static void madeye_add_one(struct ib_device *device)
+{
+	struct madeye_port *port;
+	int reg_flags;
+	u8 i, s, e;
+
+	/* s/e: first and last port number to snoop. */
+	if (device->node_type == RDMA_NODE_IB_SWITCH) {
+		s = 0;
+		e = 0;
+	} else {
+		s = 1;
+		e = device->phys_port_cnt;
+	}
+
+	port = kmalloc(sizeof *port * (e - s + 1), GFP_KERNEL);
+	if (!port)
+		goto out;
+
+	reg_flags = IB_MAD_SNOOP_SEND_COMPLETIONS | IB_MAD_SNOOP_RECVS;
+	/* port[i] describes port number i + s. */
+	for (i = 0; i <= e - s; i++) {
+		port[i].smi_agent = ib_register_mad_snoop(device, i + s,
+							  IB_QPT_SMI,
+							  reg_flags,
+							  snoop_smi_handler,
+							  recv_smi_handler,
+							  &port[i]);
+		port[i].gsi_agent = ib_register_mad_snoop(device, i + s,
+							  IB_QPT_GSI,
+							  reg_flags,
+							  snoop_gsi_handler,
+							  recv_gsi_handler,
+							  &port[i]);
+	}
+
+out:
+	/* port is NULL here when the allocation above failed. */
+	ib_set_client_data(device, &madeye_client, port);
+}
+
+/*
+ * Undo madeye_add_one(): unregister every snoop agent that was
+ * registered successfully for the device and free the per-port array.
+ * Does nothing when no client data was stored for the device.
+ */
+static void madeye_remove_one(struct ib_device *device)
+{
+	struct madeye_port *ports = ib_get_client_data(device, &madeye_client);
+	int idx, last;
+
+	if (ports == NULL)
+		return;
+
+	/*
+	 * Index of the last array slot: a switch has the single port 0,
+	 * everything else has ports 1..phys_port_cnt.
+	 */
+	if (device->node_type == RDMA_NODE_IB_SWITCH)
+		last = 0;
+	else
+		last = device->phys_port_cnt - 1;
+
+	for (idx = 0; idx <= last; idx++) {
+		struct madeye_port *p = &ports[idx];
+
+		if (!IS_ERR(p->smi_agent))
+			ib_unregister_mad_agent(p->smi_agent);
+		if (!IS_ERR(p->gsi_agent))
+			ib_unregister_mad_agent(p->gsi_agent);
+	}
+
+	kfree(ports);
+}
+
+/*
+ * Module init hook (see module_init() below): registers madeye_client
+ * and returns the result of ib_register_client().
+ */
+static int __init ib_madeye_init(void)
+{
+	return ib_register_client(&madeye_client);
+}
+
+/* Module exit hook (see module_exit() below): unregisters madeye_client. */
+static void __exit ib_madeye_cleanup(void)
+{
+	ib_unregister_client(&madeye_client);
+}
+
+module_init(ib_madeye_init);
+module_exit(ib_madeye_cleanup);
diff --git a/sys/ofed/drivers/net/mlx4/Makefile b/sys/ofed/drivers/net/mlx4/Makefile
new file mode 100644
index 0000000..b9d2e7e
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/Makefile
@@ -0,0 +1,9 @@
+# mlx4_core.o is built when CONFIG_MLX4_CORE is set.
+obj-$(CONFIG_MLX4_CORE)		+= mlx4_core.o
+
+# Objects linked into mlx4_core.o.
+mlx4_core-y :=	alloc.o catas.o cmd.o cq.o eq.o fw.o icm.o intf.o main.o mcg.o \
+		mr.o pd.o port.o profile.o qp.o reset.o sense.o srq.o xrcd.o
+
+# mlx4_en.o is built when CONFIG_MLX4_EN is set.
+obj-$(CONFIG_MLX4_EN)               += mlx4_en.o
+
+# Objects linked into mlx4_en.o.
+mlx4_en-y := 	en_main.o en_tx.o en_rx.o en_ethtool.o en_port.o en_cq.o \
+		en_resources.o en_netdev.o en_frag.o en_selftest.o
diff --git a/sys/ofed/drivers/net/mlx4/alloc.c b/sys/ofed/drivers/net/mlx4/alloc.c
new file mode 100644
index 0000000..c22791a
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/alloc.c
@@ -0,0 +1,453 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/bitmap.h>
+#include <linux/dma-mapping.h>
+#include <linux/vmalloc.h>
+
+#include "mlx4.h"
+
+/*
+ * Allocate a single object number from the bitmap.
+ *
+ * The search for a clear bit starts at bitmap->last.  If nothing is found
+ * from there, bitmap->top is advanced and the search restarts at bit 0.
+ * On success the bit is set, bitmap->last moves to the following bit
+ * (wrapping to 0 at bitmap->max) and the available count is decremented.
+ *
+ * Returns the bit index OR'ed with bitmap->top, or -1 (all ones in the
+ * u32 return type) when no bit is free.  bitmap->lock is held for the
+ * whole operation.
+ */
+u32 mlx4_bitmap_alloc(struct mlx4_bitmap *bitmap)
+{
+	u32 obj;
+
+	spin_lock(&bitmap->lock);
+
+	obj = find_next_zero_bit(bitmap->table, bitmap->max, bitmap->last);
+	if (obj >= bitmap->max) {
+		/* Nothing free at or after "last": bump top and rescan from 0. */
+		bitmap->top = (bitmap->top + bitmap->max + bitmap->reserved_top)
+				& bitmap->mask;
+		obj = find_first_zero_bit(bitmap->table, bitmap->max);
+	}
+
+	if (obj < bitmap->max) {
+		set_bit(obj, bitmap->table);
+		bitmap->last = (obj + 1);
+		if (bitmap->last == bitmap->max)
+			bitmap->last = 0;
+		obj |= bitmap->top;
+	} else
+		obj = -1;
+
+	/* obj is unsigned, so this compares against 0xffffffff. */
+	if (obj != -1)
+		--bitmap->avail;
+
+	spin_unlock(&bitmap->lock);
+
+	return obj;
+}
+
+/* Release a single object: mlx4_bitmap_free_range() with a count of 1. */
+void mlx4_bitmap_free(struct mlx4_bitmap *bitmap, u32 obj)
+{
+	mlx4_bitmap_free_range(bitmap, obj, 1);
+}
+
+/*
+ * Look for "len" consecutive clear bits in "bitmap" (which holds "nbits"
+ * bits), beginning the search at "start" after rounding it with
+ * ALIGN(start, align).
+ *
+ * Returns the index of the first bit of the run, or -1 (as unsigned long)
+ * when no candidate below nbits is left or the run would extend past
+ * nbits.
+ */
+static unsigned long find_aligned_range(unsigned long *bitmap,
+					u32 start, u32 nbits,
+					int len, int align)
+{
+	unsigned long limit, pos;
+	int collided;
+
+	for (;;) {
+		start = ALIGN(start, align);
+
+		/* Step over candidates whose first bit is already taken. */
+		while ((start < nbits) && test_bit(start, bitmap))
+			start += align;
+
+		if (start >= nbits)
+			return -1;
+
+		limit = start+len;
+		if (limit > nbits)
+			return -1;
+
+		/* Check the remaining bits of the candidate run. */
+		collided = 0;
+		for (pos = start + 1; pos < limit; pos++) {
+			if (test_bit(pos, bitmap)) {
+				/* Resume the search just past the busy bit. */
+				start = pos + 1;
+				collided = 1;
+				break;
+			}
+		}
+
+		if (!collided)
+			return start;
+	}
+}
+
+/*
+ * Allocate "cnt" consecutive object numbers, with the placement of the
+ * first one governed by "align" (see find_aligned_range()).
+ *
+ * The common cnt == 1, align == 1 case is handed to mlx4_bitmap_alloc().
+ * Otherwise the search starts at bitmap->last; if that fails,
+ * bitmap->top is advanced and the search is retried from bit 0.
+ *
+ * Returns the first index OR'ed with bitmap->top, or -1 (all ones in
+ * the u32 return type) on failure.
+ */
+u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt, int align)
+{
+	u32 obj, i;
+
+	if (likely(cnt == 1 && align == 1))
+		return mlx4_bitmap_alloc(bitmap);
+
+	spin_lock(&bitmap->lock);
+
+	/* A -1 result truncates to 0xffffffff, which is >= bitmap->max. */
+	obj = find_aligned_range(bitmap->table, bitmap->last,
+				 bitmap->max, cnt, align);
+	if (obj >= bitmap->max) {
+		bitmap->top = (bitmap->top + bitmap->max + bitmap->reserved_top)
+				& bitmap->mask;
+		obj = find_aligned_range(bitmap->table, 0, bitmap->max,
+					 cnt, align);
+	}
+
+	if (obj < bitmap->max) {
+		for (i = 0; i < cnt; i++)
+			set_bit(obj + i, bitmap->table);
+		/* Only move the search hint when the run began exactly at it. */
+		if (obj == bitmap->last) {
+			bitmap->last = (obj + cnt);
+			if (bitmap->last >= bitmap->max)
+				bitmap->last = 0;
+		}
+		obj |= bitmap->top;
+	} else
+		obj = -1;
+
+	if (obj != -1)
+		bitmap->avail -= cnt;
+
+	spin_unlock(&bitmap->lock);
+
+	return obj;
+}
+
+/*
+ * Return the bitmap's count of available objects.  The counter is read
+ * without taking bitmap->lock.
+ */
+u32 mlx4_bitmap_avail(struct mlx4_bitmap *bitmap)
+{
+	return bitmap->avail;
+}
+
+/*
+ * Return "cnt" consecutive objects starting at "obj" to the bitmap.
+ *
+ * "obj" is a value handed out by one of the allocators, so it may carry
+ * bitmap->top bits; these are masked off before the table is touched.
+ * Besides clearing the bits, the search hint is pulled back to "obj" if
+ * it lay beyond it, bitmap->top is advanced and the available count is
+ * increased by cnt.
+ */
+void mlx4_bitmap_free_range(struct mlx4_bitmap *bitmap, u32 obj, int cnt)
+{
+	u32 i;
+
+	/* max + reserved_top equals the "num" given to mlx4_bitmap_init(). */
+	obj &= bitmap->max + bitmap->reserved_top - 1;
+
+	spin_lock(&bitmap->lock);
+	for (i = 0; i < cnt; i++)
+		clear_bit(obj + i, bitmap->table);
+	bitmap->last = min(bitmap->last, obj);
+	bitmap->top = (bitmap->top + bitmap->max + bitmap->reserved_top)
+			& bitmap->mask;
+	bitmap->avail += cnt;
+	spin_unlock(&bitmap->lock);
+}
+
+/*
+ * Initialise an object-number bitmap.
+ *
+ * @num:          total number of objects; must be a power of two
+ * @mask:         mask applied to bitmap->top whenever it is advanced
+ * @reserved_bot: number of low indices marked as permanently in use
+ * @reserved_top: number of high indices excluded from the table
+ *
+ * Returns 0 on success, -EINVAL if num is not a power of two, or -ENOMEM
+ * if the bit table cannot be allocated.
+ */
+int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, u32 num, u32 mask,
+		     u32 reserved_bot, u32 reserved_top)
+{
+	int i;
+
+	/* num must be a power of 2 */
+	if (num != roundup_pow_of_two(num))
+		return -EINVAL;
+
+	bitmap->last = 0;
+	bitmap->top  = 0;
+	bitmap->max  = num - reserved_top;
+	bitmap->mask = mask;
+	bitmap->reserved_top = reserved_top;
+	bitmap->avail = num - reserved_top - reserved_bot;
+	spin_lock_init(&bitmap->lock);
+	bitmap->table = kzalloc(BITS_TO_LONGS(bitmap->max) *
+				sizeof (long), GFP_KERNEL);
+	if (!bitmap->table)
+		return -ENOMEM;
+
+	/* Pre-set the reserved low bits so they are never handed out. */
+	for (i = 0; i < reserved_bot; ++i)
+		set_bit(i, bitmap->table);
+
+	return 0;
+}
+
+/* Free the bit table allocated by mlx4_bitmap_init(). */
+void mlx4_bitmap_cleanup(struct mlx4_bitmap *bitmap)
+{
+	kfree(bitmap->table);
+}
+
+/*
+ * Handling for queue buffers -- we allocate a bunch of memory and
+ * register it in a memory region at HCA virtual address 0.  If the
+ * requested size is > max_direct, we split the allocation into
+ * multiple pages, so we don't require too much contiguous memory.
+ */
+
+/*
+ * Allocate a zeroed, DMA-coherent queue buffer of "size" bytes.
+ *
+ * size <= max_direct: a single coherent allocation is made and described
+ * by buf->direct.  page_shift starts at the allocation order and is then
+ * lowered (doubling npages each step) until the DMA address is aligned to
+ * the resulting page size.
+ *
+ * Otherwise: the buffer is built from PAGE_SIZE coherent chunks recorded
+ * in buf->page_list.  On 64-bit builds the chunks are additionally
+ * mapped into one contiguous virtual range with vmap(), stored in
+ * buf->direct.buf.
+ *
+ * Returns 0 on success or -ENOMEM.  Failures after the page list has
+ * been allocated are unwound through mlx4_buf_free().
+ */
+int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct,
+		   struct mlx4_buf *buf)
+{
+	dma_addr_t t;
+
+	buf->direct.buf = NULL;
+	if (size <= max_direct) {
+		buf->nbufs        = 1;
+		buf->npages       = 1;
+		buf->page_shift   = get_order(size) + PAGE_SHIFT;
+		buf->direct.buf   = dma_alloc_coherent(&dev->pdev->dev,
+						       size, &t, GFP_KERNEL);
+		if (!buf->direct.buf)
+			return -ENOMEM;
+
+		buf->direct.map = t;
+
+		/* Shrink the page size until the DMA address is aligned to it. */
+		while (t & ((1 << buf->page_shift) - 1)) {
+			--buf->page_shift;
+			buf->npages *= 2;
+		}
+
+		memset(buf->direct.buf, 0, size);
+	} else {
+		int i;
+
+		buf->direct.buf  = NULL;
+		buf->direct.map  = 0;
+		buf->nbufs       = (size + PAGE_SIZE - 1) / PAGE_SIZE;
+		buf->npages      = buf->nbufs;
+		buf->page_shift  = PAGE_SHIFT;
+		/* Zeroed, so mlx4_buf_free() can skip chunks never allocated. */
+		buf->page_list   = kzalloc(buf->nbufs * sizeof *buf->page_list,
+					   GFP_KERNEL);
+		if (!buf->page_list)
+			return -ENOMEM;
+
+		for (i = 0; i < buf->nbufs; ++i) {
+			buf->page_list[i].buf =
+				dma_alloc_coherent(&dev->pdev->dev, PAGE_SIZE,
+						   &t, GFP_KERNEL);
+			if (!buf->page_list[i].buf)
+				goto err_free;
+
+			buf->page_list[i].map = t;
+
+			memset(buf->page_list[i].buf, 0, PAGE_SIZE);
+		}
+
+		if (BITS_PER_LONG == 64) {
+			/* Temporary page array, needed only for the vmap() call. */
+			struct page **pages;
+			pages = kmalloc(sizeof *pages * buf->nbufs, GFP_KERNEL);
+			if (!pages)
+				goto err_free;
+			for (i = 0; i < buf->nbufs; ++i)
+				pages[i] = virt_to_page(buf->page_list[i].buf);
+			buf->direct.buf = vmap(pages, buf->nbufs, VM_MAP, PAGE_KERNEL);
+			kfree(pages);
+			if (!buf->direct.buf)
+				goto err_free;
+		}
+	}
+
+	return 0;
+
+err_free:
+	mlx4_buf_free(dev, size, buf);
+
+	return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(mlx4_buf_alloc);
+
+/*
+ * Free a buffer set up by mlx4_buf_alloc().  "size" must be the size that
+ * was passed at allocation time; it is used to free the single-chunk
+ * (nbufs == 1) form.
+ *
+ * For the multi-chunk form the vmap() mapping (64-bit builds only) is
+ * torn down first, then every chunk with a non-NULL address is freed,
+ * then the page list itself.  buf->direct.buf is reset to NULL at the
+ * end.
+ */
+void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf)
+{
+	int i;
+
+	if (buf->nbufs == 1)
+		dma_free_coherent(&dev->pdev->dev, size, buf->direct.buf,
+				  buf->direct.map);
+	else {
+		if (BITS_PER_LONG == 64 && buf->direct.buf)
+			vunmap(buf->direct.buf);
+
+		/* NULL entries are chunks a failed allocation never reached. */
+		for (i = 0; i < buf->nbufs; ++i)
+			if (buf->page_list[i].buf)
+				dma_free_coherent(&dev->pdev->dev, PAGE_SIZE,
+						  buf->page_list[i].buf,
+						  buf->page_list[i].map);
+		kfree(buf->page_list);
+	}
+	buf->direct.buf = NULL;
+}
+EXPORT_SYMBOL_GPL(mlx4_buf_free);
+
+/*
+ * Allocate a doorbell page directory together with its DMA-coherent
+ * doorbell page.
+ *
+ * In the order0/order1 bitmaps a set bit means "free" (the allocator
+ * searches with find_first_bit() and clears the bit it takes).  A new
+ * directory starts with every order-1 slot free and, because the
+ * structure is zero-allocated, no order-0 slot free.
+ *
+ * Returns the new directory, or NULL if either allocation fails.
+ */
+static struct mlx4_db_pgdir *mlx4_alloc_db_pgdir(struct device *dma_device)
+{
+	struct mlx4_db_pgdir *pgdir;
+
+	pgdir = kzalloc(sizeof *pgdir, GFP_KERNEL);
+	if (!pgdir)
+		return NULL;
+
+	bitmap_fill(pgdir->order1, MLX4_DB_PER_PAGE / 2);
+	/* bits[o] gives access to the free map for order o. */
+	pgdir->bits[0] = pgdir->order0;
+	pgdir->bits[1] = pgdir->order1;
+	pgdir->db_page = dma_alloc_coherent(dma_device, PAGE_SIZE,
+					    &pgdir->db_dma, GFP_KERNEL);
+	if (!pgdir->db_page) {
+		kfree(pgdir);
+		return NULL;
+	}
+
+	return pgdir;
+}
+
+/*
+ * Try to carve a doorbell of the given order (0 or 1) out of one page
+ * directory.
+ *
+ * The free maps are searched from the requested order upwards.  If an
+ * order-0 request has to be served from an order-1 slot, the unused half
+ * of that slot is marked free in the order-0 map.
+ *
+ * On success the fields of "db" are filled in and 0 is returned; if the
+ * directory has no suitable free slot, -ENOMEM is returned.
+ */
+static int mlx4_alloc_db_from_pgdir(struct mlx4_db_pgdir *pgdir,
+				    struct mlx4_db *db, int order)
+{
+	int o;
+	int i;
+
+	for (o = order; o <= 1; ++o) {
+		i = find_first_bit(pgdir->bits[o], MLX4_DB_PER_PAGE >> o);
+		if (i < MLX4_DB_PER_PAGE >> o)
+			goto found;
+	}
+
+	return -ENOMEM;
+
+found:
+	clear_bit(i, pgdir->bits[o]);
+
+	/* Convert the slot number at order o into an order-0 index. */
+	i <<= o;
+
+	/* Split: hand the other half of the larger slot to the lower order. */
+	if (o > order)
+		set_bit(i ^ 1, pgdir->bits[order]);
+
+	db->u.pgdir = pgdir;
+	db->index   = i;
+	db->db      = pgdir->db_page + db->index;
+	/* index * 4: byte offset (presumably 4-byte doorbell entries). */
+	db->dma     = pgdir->db_dma  + db->index * 4;
+	db->order   = order;
+
+	return 0;
+}
+
+/*
+ * Allocate a doorbell record of the given order for "dev".
+ *
+ * Every existing page directory on priv->pgdir_list is tried first; if
+ * none has room, a new directory is allocated, added to the list and
+ * used.  The list is protected by priv->pgdir_mutex.
+ *
+ * Returns 0 on success or -ENOMEM if a new directory cannot be allocated.
+ */
+int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, int order)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_db_pgdir *pgdir;
+	int ret = 0;
+
+	mutex_lock(&priv->pgdir_mutex);
+
+	list_for_each_entry(pgdir, &priv->pgdir_list, list)
+		if (!mlx4_alloc_db_from_pgdir(pgdir, db, order))
+			goto out;
+
+	pgdir = mlx4_alloc_db_pgdir(&(dev->pdev->dev));
+	if (!pgdir) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	list_add(&pgdir->list, &priv->pgdir_list);
+
+	/* This should never fail -- we just allocated an empty page: */
+	WARN_ON(mlx4_alloc_db_from_pgdir(pgdir, db, order));
+
+out:
+	mutex_unlock(&priv->pgdir_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mlx4_db_alloc);
+
+/*
+ * Release a doorbell record obtained from mlx4_db_alloc().
+ *
+ * An order-0 doorbell whose neighbour (index ^ 1) is also free is merged
+ * with it and returned as one order-1 slot.  Once every order-1 slot of
+ * the page directory is free again, the doorbell page and the directory
+ * itself are released.  Runs under priv->pgdir_mutex.
+ */
+void mlx4_db_free(struct mlx4_dev *dev, struct mlx4_db *db)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int o;
+	int i;
+
+	mutex_lock(&priv->pgdir_mutex);
+
+	o = db->order;
+	i = db->index;
+
+	/* Coalesce with the free buddy: take it out of order0, go up a level. */
+	if (db->order == 0 && test_bit(i ^ 1, db->u.pgdir->order0)) {
+		clear_bit(i ^ 1, db->u.pgdir->order0);
+		++o;
+	}
+	i >>= o;
+	set_bit(i, db->u.pgdir->bits[o]);
+
+	/* All order-1 slots free means the whole page is unused. */
+	if (bitmap_full(db->u.pgdir->order1, MLX4_DB_PER_PAGE / 2)) {
+		dma_free_coherent(&(dev->pdev->dev), PAGE_SIZE,
+				  db->u.pgdir->db_page, db->u.pgdir->db_dma);
+		list_del(&db->u.pgdir->list);
+		kfree(db->u.pgdir);
+	}
+
+	mutex_unlock(&priv->pgdir_mutex);
+}
+EXPORT_SYMBOL_GPL(mlx4_db_free);
+
+/*
+ * Allocate the resources backing a hardware work queue, in this order:
+ * an order-1 doorbell record (cleared to 0), the queue buffer, an MTT
+ * sized for the buffer, and finally the MTT entries for the buffer's
+ * pages.
+ *
+ * Returns 0 on success.  On failure the error of the failing step is
+ * returned and everything acquired so far is released in reverse order.
+ */
+int mlx4_alloc_hwq_res(struct mlx4_dev *dev, struct mlx4_hwq_resources *wqres,
+		       int size, int max_direct)
+{
+	int err;
+
+	err = mlx4_db_alloc(dev, &wqres->db, 1);
+	if (err)
+		return err;
+
+	*wqres->db.db = 0;
+
+	err = mlx4_buf_alloc(dev, size, max_direct, &wqres->buf);
+	if (err)
+		goto err_db;
+
+	err = mlx4_mtt_init(dev, wqres->buf.npages, wqres->buf.page_shift,
+			    &wqres->mtt);
+	if (err)
+		goto err_buf;
+
+	err = mlx4_buf_write_mtt(dev, &wqres->mtt, &wqres->buf);
+	if (err)
+		goto err_mtt;
+
+	return 0;
+
+	/* Unwind labels: each one falls through to the earlier steps. */
+err_mtt:
+	mlx4_mtt_cleanup(dev, &wqres->mtt);
+err_buf:
+	mlx4_buf_free(dev, size, &wqres->buf);
+err_db:
+	mlx4_db_free(dev, &wqres->db);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_alloc_hwq_res);
+
+/*
+ * Release what mlx4_alloc_hwq_res() set up: the MTT, the buffer and the
+ * doorbell record, in that order.  "size" is passed on to
+ * mlx4_buf_free() and so must match the size used at allocation.
+ */
+void mlx4_free_hwq_res(struct mlx4_dev *dev, struct mlx4_hwq_resources *wqres,
+		       int size)
+{
+	mlx4_mtt_cleanup(dev, &wqres->mtt);
+	mlx4_buf_free(dev, size, &wqres->buf);
+	mlx4_db_free(dev, &wqres->db);
+}
+EXPORT_SYMBOL_GPL(mlx4_free_hwq_res);
diff --git a/sys/ofed/drivers/net/mlx4/catas.c b/sys/ofed/drivers/net/mlx4/catas.c
new file mode 100644
index 0000000..334aad9
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/catas.c
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/workqueue.h>
+
+#include "mlx4.h"
+
+#define	MLX4_CATAS_POLL_INTERVAL	(5 * HZ)
+
+static DEFINE_SPINLOCK(catas_lock);
+
+static LIST_HEAD(catas_list);
+static struct work_struct catas_work;
+
+static int internal_err_reset = 1;
+module_param(internal_err_reset, int, 0644);
+MODULE_PARM_DESC(internal_err_reset,
+		 "Reset device on internal errors if non-zero (default 1)");
+
+/*
+ * Log the contents of the mapped internal-error buffer: fw.catas_size
+ * entries, each read with readl() and byte-swapped with swab32() before
+ * printing.
+ */
+static void dump_err_buf(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	int i;
+
+	mlx4_err(dev, "Internal error detected:\n");
+	for (i = 0; i < priv->fw.catas_size; ++i)
+		mlx4_err(dev, "  buf[%02x]: %08x\n",
+			 i, swab32(readl(priv->catas_err.map + i)));
+}
+
+/*
+ * Timer callback installed by mlx4_start_catas_poll(); "dev_ptr" is the
+ * struct mlx4_dev pointer stored in timer.data.
+ *
+ * A non-zero first entry in the error buffer is treated as an internal
+ * error: the buffer is dumped, MLX4_DEV_EVENT_CATASTROPHIC_ERROR is
+ * dispatched and, if the internal_err_reset module parameter is set, the
+ * device is queued on catas_list and the reset work is scheduled.  The
+ * timer is not re-armed in that case.  Otherwise the timer is re-armed
+ * for the next poll interval.
+ */
+static void poll_catas(unsigned long dev_ptr)
+{
+	struct mlx4_dev *dev = (struct mlx4_dev *) dev_ptr;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	if (readl(priv->catas_err.map)) {
+		dump_err_buf(dev);
+
+		mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0);
+
+		if (internal_err_reset) {
+			spin_lock(&catas_lock);
+			list_add(&priv->catas_err.list, &catas_list);
+			spin_unlock(&catas_lock);
+
+			queue_work(mlx4_wq, &catas_work);
+		}
+	} else
+		mod_timer(&priv->catas_err.timer,
+			  round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL));
+}
+
+/*
+ * Work handler for catas_work (see mlx4_catas_init()): restarts every
+ * device that poll_catas() queued on catas_list.
+ *
+ * If drv_mutex cannot be taken immediately the handler returns without
+ * doing anything.  Otherwise the pending list is moved to a private list
+ * under catas_lock and each device is restarted with mlx4_restart_one().
+ */
+static void catas_reset(struct work_struct *work)
+{
+	struct mlx4_priv *priv, *tmppriv;
+	struct mlx4_dev *dev;
+
+	LIST_HEAD(tlist);
+	int ret;
+
+	if (!mutex_trylock(&drv_mutex))
+		return;
+
+	/* Detach the pending devices so they can be processed unlocked. */
+	spin_lock_irq(&catas_lock);
+	list_splice_init(&catas_list, &tlist);
+	spin_unlock_irq(&catas_lock);
+
+	list_for_each_entry_safe(priv, tmppriv, &tlist, catas_err.list) {
+		/* Saved up front: priv must not be used after the restart. */
+		struct pci_dev *pdev = priv->dev.pdev;
+
+		ret = mlx4_restart_one(priv->dev.pdev);
+		/* 'priv' now is not valid */
+		if (ret)
+			printk(KERN_ERR "mlx4 %s: Reset failed (%d)\n",
+				pci_name(pdev), ret);
+		else {
+			/* Fetch the device now attached to pdev for logging. */
+			dev  = pci_get_drvdata(pdev);
+			mlx4_dbg(dev, "Reset succeeded\n");
+		}
+	}
+	mutex_unlock(&drv_mutex);
+}
+
+/*
+ * Start periodic polling of the device's internal-error buffer.
+ *
+ * The buffer (fw.catas_size entries of 4 bytes at fw.catas_offset within
+ * BAR fw.catas_bar) is mapped with ioremap() and a timer running
+ * poll_catas() is armed for the first poll interval.  If the mapping
+ * fails a warning is logged and no timer is started.
+ */
+void mlx4_start_catas_poll(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	unsigned long addr;
+
+	/* Initialised unconditionally so mlx4_stop_catas_poll() is always safe. */
+	INIT_LIST_HEAD(&priv->catas_err.list);
+	init_timer(&priv->catas_err.timer);
+	priv->catas_err.map = NULL;
+
+	addr = pci_resource_start(dev->pdev, priv->fw.catas_bar) +
+		priv->fw.catas_offset;
+
+	priv->catas_err.map = ioremap(addr, priv->fw.catas_size * 4);
+	if (!priv->catas_err.map) {
+		mlx4_warn(dev, "Failed to map internal error buffer at 0x%lx\n",
+			  addr);
+		return;
+	}
+
+	priv->catas_err.timer.data     = (unsigned long) dev;
+	priv->catas_err.timer.function = poll_catas;
+	priv->catas_err.timer.expires  =
+		round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL);
+	add_timer(&priv->catas_err.timer);
+}
+
+/*
+ * Stop internal-error polling: wait for and delete the poll timer, unmap
+ * the error buffer if it was mapped, and take the device off the pending
+ * reset list under catas_lock.
+ */
+void mlx4_stop_catas_poll(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	del_timer_sync(&priv->catas_err.timer);
+
+	if (priv->catas_err.map)
+		iounmap(priv->catas_err.map);
+
+	spin_lock_irq(&catas_lock);
+	list_del(&priv->catas_err.list);
+	spin_unlock_irq(&catas_lock);
+}
+
+/* One-time setup: bind the shared catas_work item to catas_reset(). */
+void  __init mlx4_catas_init(void)
+{
+	INIT_WORK(&catas_work, catas_reset);
+}
diff --git a/sys/ofed/drivers/net/mlx4/cmd.c b/sys/ofed/drivers/net/mlx4/cmd.c
new file mode 100644
index 0000000..bc4a618
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/cmd.c
@@ -0,0 +1,453 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/sched.h>
+#include <linux/pci.h>
+#include <linux/errno.h>
+
+#include <linux/mlx4/cmd.h>
+
+#include <asm/io.h>
+
+#include "mlx4.h"
+
+#define CMD_POLL_TOKEN 0xffff
+
+enum {
+	/* command completed successfully: */
+	CMD_STAT_OK		= 0x00,
+	/* Internal error (such as a bus error) occurred while processing command: */
+	CMD_STAT_INTERNAL_ERR	= 0x01,
+	/* Operation/command not supported or opcode modifier not supported: */
+	CMD_STAT_BAD_OP		= 0x02,
+	/* Parameter not supported or parameter out of range: */
+	CMD_STAT_BAD_PARAM	= 0x03,
+	/* System not enabled or bad system state: */
+	CMD_STAT_BAD_SYS_STATE	= 0x04,
+	/* Attempt to access reserved or unallocated resource: */
+	CMD_STAT_BAD_RESOURCE	= 0x05,
+	/* Requested resource is currently executing a command, or is otherwise busy: */
+	CMD_STAT_RESOURCE_BUSY	= 0x06,
+	/* Required capability exceeds device limits: */
+	CMD_STAT_EXCEED_LIM	= 0x08,
+	/* Resource is not in the appropriate state or ownership: */
+	CMD_STAT_BAD_RES_STATE	= 0x09,
+	/* Index out of range: */
+	CMD_STAT_BAD_INDEX	= 0x0a,
+	/* FW image corrupted: */
+	CMD_STAT_BAD_NVMEM	= 0x0b,
+	/* Error in ICM mapping (e.g. not enough auxiliary ICM pages to execute command): */
+	CMD_STAT_ICM_ERROR	= 0x0c,
+	/* Attempt to modify a QP/EE which is not in the presumed state: */
+	CMD_STAT_BAD_QP_STATE   = 0x10,
+	/* Bad segment parameters (Address/Size): */
+	CMD_STAT_BAD_SEG_PARAM	= 0x20,
+	/* Memory Region has Memory Windows bound to: */
+	CMD_STAT_REG_BOUND	= 0x21,
+	/* HCA local attached memory not present: */
+	CMD_STAT_LAM_NOT_PRE	= 0x22,
+	/* Bad management packet (silently discarded): */
+	CMD_STAT_BAD_PKT	= 0x30,
+	/* More outstanding CQEs in CQ than new CQ size: */
+	CMD_STAT_BAD_SIZE	= 0x40,
+	/* Multi Function device support required: */
+	CMD_STAT_MULTI_FUNC_REQ	= 0x50,
+};
+
+/*
+ * Layout of the HCR (the command register mapped in mlx4_cmd_init()).
+ * The *_OFFSET values are byte offsets; mlx4_cmd_post() writes the same
+ * locations as 32-bit words 0..6.  The shift/bit values describe the
+ * last word, which also carries the opcode in its low bits.
+ */
+enum {
+	HCR_IN_PARAM_OFFSET	= 0x00,
+	HCR_IN_MODIFIER_OFFSET	= 0x08,
+	HCR_OUT_PARAM_OFFSET	= 0x0c,
+	HCR_TOKEN_OFFSET	= 0x14,
+	HCR_STATUS_OFFSET	= 0x18,
+
+	HCR_OPMOD_SHIFT		= 12,	/* position of the opcode modifier */
+	HCR_T_BIT		= 21,	/* toggle bit, flipped for every command */
+	HCR_E_BIT		= 22,	/* set when posting in event mode */
+	HCR_GO_BIT		= 23	/* set to start a command; tested by cmd_pending() */
+};
+
+/* How long mlx4_cmd_post() waits for a pending command in event mode. */
+enum {
+	GO_BIT_TIMEOUT_MSECS	= 10000
+};
+
+/*
+ * Per-command bookkeeping for event-mode commands.  Contexts live in the
+ * cmd.context array and unused ones are chained into a free list through
+ * "next".
+ */
+struct mlx4_cmd_context {
+	struct completion	done;		/* completed by mlx4_cmd_event() */
+	int			result;		/* errno derived from the FW status */
+	int			next;		/* index of next free context, -1 ends the list */
+	u64			out_param;	/* output parameter reported with the event */
+	u16			token;		/* matched against the token of the event */
+	u8			fw_status;	/* raw FW status, kept for error messages */
+};
+
+/*
+ * Translate a FW command status into a negative errno value.
+ *
+ * CMD_STAT_OK yields 0.  A status that lies beyond the table, or that
+ * has no entry in it, yields -EIO.
+ */
+static int mlx4_status_to_errno(u8 status)
+{
+	static const int trans_table[] = {
+		[CMD_STAT_INTERNAL_ERR]	  = -EIO,
+		[CMD_STAT_BAD_OP]	  = -EPERM,
+		[CMD_STAT_BAD_PARAM]	  = -EINVAL,
+		[CMD_STAT_BAD_SYS_STATE]  = -ENXIO,
+		[CMD_STAT_BAD_RESOURCE]	  = -EBADF,
+		[CMD_STAT_RESOURCE_BUSY]  = -EBUSY,
+		[CMD_STAT_EXCEED_LIM]	  = -ENOMEM,
+		[CMD_STAT_BAD_RES_STATE]  = -EBADF,
+		[CMD_STAT_BAD_INDEX]	  = -EBADF,
+		[CMD_STAT_BAD_NVMEM]	  = -EFAULT,
+		[CMD_STAT_ICM_ERROR]	  = -ENFILE,
+		[CMD_STAT_BAD_QP_STATE]   = -EINVAL,
+		[CMD_STAT_BAD_SEG_PARAM]  = -EFAULT,
+		[CMD_STAT_REG_BOUND]	  = -EBUSY,
+		[CMD_STAT_LAM_NOT_PRE]	  = -EAGAIN,
+		[CMD_STAT_BAD_PKT]	  = -EINVAL,
+		[CMD_STAT_BAD_SIZE]	  = -ENOMEM,
+		[CMD_STAT_MULTI_FUNC_REQ] = -EACCES,
+	};
+
+	/* Gaps in the table are zero; only CMD_STAT_OK may map to 0. */
+	if (status >= ARRAY_SIZE(trans_table) ||
+	    (status != CMD_STAT_OK && trans_table[status] == 0))
+		return -EIO;
+
+	return trans_table[status];
+}
+
+/*
+ * Report whether the HCR is still busy with a command.
+ *
+ * Returns non-zero while the GO bit is set in the status word, or while
+ * the toggle bit read back from the register equals the driver's current
+ * cmd.toggle value.
+ */
+static int cmd_pending(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	u32 status = readl(priv->cmd.hcr + HCR_STATUS_OFFSET);
+	int hw_toggle;
+
+	if (status & swab32(1 << HCR_GO_BIT))
+		return 1;
+
+	hw_toggle = !!(status & swab32(1 << HCR_T_BIT));
+
+	return priv->cmd.toggle == hw_toggle;
+}
+
+/*
+ * Write one command into the HCR and set the GO bit.
+ *
+ * Serialised by cmd->hcr_mutex.  Before writing, the function waits for
+ * any previous command to leave the HCR: up to GO_BIT_TIMEOUT_MSECS when
+ * "event" is non-zero, otherwise only as long as a single check takes.
+ * If the HCR is still busy at the deadline, -EAGAIN is returned and
+ * nothing is written.
+ *
+ * The six parameter words are written first; after a write barrier the
+ * final word (GO bit, toggle, event flag, opcode modifier and opcode) is
+ * written, and the driver's toggle value is flipped for the next command.
+ *
+ * Returns 0 once the command has been posted.
+ */
+static int mlx4_cmd_post(struct mlx4_dev *dev, u64 in_param, u64 out_param,
+			 u32 in_modifier, u8 op_modifier, u16 op, u16 token,
+			 int event)
+{
+	struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd;
+	u32 __iomem *hcr = cmd->hcr;
+	int ret = -EAGAIN;
+	unsigned long end;
+
+	mutex_lock(&cmd->hcr_mutex);
+
+	/* In polling mode the deadline is "now": no waiting beyond one check. */
+	end = jiffies;
+	if (event)
+		end += msecs_to_jiffies(GO_BIT_TIMEOUT_MSECS);
+
+	while (cmd_pending(dev)) {
+		if (time_after_eq(jiffies, end))
+			goto out;
+		cond_resched();
+	}
+
+	/*
+	 * We use writel (instead of something like memcpy_toio)
+	 * because writes of less than 32 bits to the HCR don't work
+	 * (and some architectures such as ia64 implement memcpy_toio
+	 * in terms of writeb).
+	 */
+	__raw_writel((__force u32) cpu_to_be32(in_param >> 32),		  hcr + 0);
+	__raw_writel((__force u32) cpu_to_be32(in_param & 0xfffffffful),  hcr + 1);
+	__raw_writel((__force u32) cpu_to_be32(in_modifier),		  hcr + 2);
+	__raw_writel((__force u32) cpu_to_be32(out_param >> 32),	  hcr + 3);
+	__raw_writel((__force u32) cpu_to_be32(out_param & 0xfffffffful), hcr + 4);
+	__raw_writel((__force u32) cpu_to_be32(token << 16),		  hcr + 5);
+
+	/* __raw_writel may not order writes. */
+	wmb();
+
+	__raw_writel((__force u32) cpu_to_be32((1 << HCR_GO_BIT)		|
+					       (cmd->toggle << HCR_T_BIT)	|
+					       (event ? (1 << HCR_E_BIT) : 0)	|
+					       (op_modifier << HCR_OPMOD_SHIFT) |
+					       op),			  hcr + 6);
+
+	/*
+	 * Make sure that our HCR writes don't get mixed in with
+	 * writes from another CPU starting a FW command.
+	 */
+	mmiowb();
+
+	cmd->toggle = cmd->toggle ^ 1;
+
+	ret = 0;
+
+out:
+	mutex_unlock(&cmd->hcr_mutex);
+	return ret;
+}
+
+/*
+ * Execute a FW command in polling mode.
+ *
+ * The command is posted with CMD_POLL_TOKEN and the HCR is then polled
+ * (yielding with cond_resched()) until the command finishes or "timeout"
+ * milliseconds have passed.  Polled commands are serialised by
+ * cmd.poll_sem.
+ *
+ * When out_is_imm is set, the 64-bit immediate result is read back from
+ * the HCR into *out_param, so out_param must not be NULL in that case.
+ *
+ * Returns 0 on success, the error from mlx4_cmd_post(), -ETIMEDOUT if
+ * the command is still pending at the deadline, or the errno translation
+ * of the FW status.  Failures are logged, except CMD_STAT_BAD_OP in
+ * reply to MLX4_CMD_SET_NODE.
+ */
+static int mlx4_cmd_poll(struct mlx4_dev *dev, u64 in_param, u64 *out_param,
+			 int out_is_imm, u32 in_modifier, u8 op_modifier,
+			 u16 op, unsigned long timeout)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	/* void pointer here, so the HCR_*_OFFSET values below are in bytes. */
+	void __iomem *hcr = priv->cmd.hcr;
+	int err = 0;
+	unsigned long end;
+	u32 stat;
+
+	down(&priv->cmd.poll_sem);
+
+	err = mlx4_cmd_post(dev, in_param, out_param ? *out_param : 0,
+			    in_modifier, op_modifier, op, CMD_POLL_TOKEN, 0);
+	if (err)
+		goto out;
+
+	end = msecs_to_jiffies(timeout) + jiffies;
+	while (cmd_pending(dev) && time_before(jiffies, end))
+		cond_resched();
+
+	if (cmd_pending(dev)) {
+		err = -ETIMEDOUT;
+		goto out;
+	}
+
+	if (out_is_imm)
+		*out_param =
+			(u64) be32_to_cpu((__force __be32)
+					  __raw_readl(hcr + HCR_OUT_PARAM_OFFSET)) << 32 |
+			(u64) be32_to_cpu((__force __be32)
+					  __raw_readl(hcr + HCR_OUT_PARAM_OFFSET + 4));
+	/* The FW status is the top byte of the status word. */
+	stat = be32_to_cpu((__force __be32) __raw_readl(hcr + HCR_STATUS_OFFSET)) >> 24;
+	err = mlx4_status_to_errno(stat);
+	if (err) {
+		if (op != MLX4_CMD_SET_NODE || stat != CMD_STAT_BAD_OP)
+			mlx4_err(dev, "command 0x%x failed: fw status = 0x%x\n",
+				 op, stat);
+	}
+
+out:
+	up(&priv->cmd.poll_sem);
+	return err;
+}
+
+/*
+ * Deliver a command-completion event to the waiter in mlx4_cmd_wait().
+ *
+ * The context is selected by the low bits of "token" (cmd.token_mask).
+ * If the full token differs from the one stored in the context, the
+ * event belongs to an earlier command that already timed out and is
+ * dropped.  Otherwise the status, its errno translation and the output
+ * parameter are recorded and the waiter is woken.
+ */
+void mlx4_cmd_event(struct mlx4_dev *dev, u16 token, u8 status, u64 out_param)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_cmd_context *context =
+		&priv->cmd.context[token & priv->cmd.token_mask];
+
+	/* previously timed out command completing at long last */
+	if (token != context->token)
+		return;
+
+	context->fw_status = status;
+	context->result    = mlx4_status_to_errno(status);
+	context->out_param = out_param;
+
+	complete(&context->done);
+}
+
+/*
+ * Execute a FW command in event mode: post it and sleep until
+ * mlx4_cmd_event() reports its completion.
+ *
+ * cmd.event_sem limits the number of commands in flight.  A context is
+ * taken from the free list and its token is advanced by token_mask + 1,
+ * so the low bits still select the same context while a late event for
+ * an earlier, timed-out command no longer matches.
+ *
+ * When out_is_imm is set and the command succeeds, the immediate result
+ * is stored in *out_param.
+ *
+ * Returns 0 on success, the error from mlx4_cmd_post() if the command
+ * could not be written to the HCR, -EBUSY if no completion arrives
+ * within "timeout" milliseconds, or the errno translation of the FW
+ * status.  FW failures are logged, except CMD_STAT_BAD_OP in reply to
+ * MLX4_CMD_SET_NODE.
+ */
+static int mlx4_cmd_wait(struct mlx4_dev *dev, u64 in_param, u64 *out_param,
+			 int out_is_imm, u32 in_modifier, u8 op_modifier,
+			 u16 op, unsigned long timeout)
+{
+	struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd;
+	struct mlx4_cmd_context *context;
+	int err = 0;
+
+	down(&cmd->event_sem);
+
+	spin_lock(&cmd->context_lock);
+	BUG_ON(cmd->free_head < 0);
+	context = &cmd->context[cmd->free_head];
+	context->token += cmd->token_mask + 1;
+	cmd->free_head = context->next;
+	spin_unlock(&cmd->context_lock);
+
+	init_completion(&context->done);
+
+	/*
+	 * If the command never reached the HCR no completion event will
+	 * ever arrive, so fail right away instead of sleeping for the whole
+	 * timeout.
+	 */
+	err = mlx4_cmd_post(dev, in_param, out_param ? *out_param : 0,
+			    in_modifier, op_modifier, op, context->token, 1);
+	if (err)
+		goto out;
+
+	if (!wait_for_completion_timeout(&context->done, msecs_to_jiffies(timeout))) {
+		err = -EBUSY;
+		goto out;
+	}
+
+	err = context->result;
+	if (err) {
+		if (op != MLX4_CMD_SET_NODE || context->fw_status != CMD_STAT_BAD_OP)
+			mlx4_err(dev, "command 0x%x failed: fw status = 0x%x\n",
+				 op, context->fw_status);
+		goto out;
+	}
+
+	if (out_is_imm)
+		*out_param = context->out_param;
+
+out:
+	/* Put the context back at the head of the free list. */
+	spin_lock(&cmd->context_lock);
+	context->next = cmd->free_head;
+	cmd->free_head = context - cmd->context;
+	spin_unlock(&cmd->context_lock);
+
+	up(&cmd->event_sem);
+	return err;
+}
+
+/*
+ * Common entry point for FW commands.  The event-driven path is used
+ * only when cmd.use_events is set and "cold" is zero; in every other
+ * case the command is executed by polling.  All arguments are passed
+ * through unchanged and the chosen path's result is returned.
+ */
+int __mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param,
+	       int out_is_imm, u32 in_modifier, u8 op_modifier,
+	       u16 op, unsigned long timeout)
+{
+	const int event_mode = mlx4_priv(dev)->cmd.use_events && !cold;
+
+	if (!event_mode)
+		return mlx4_cmd_poll(dev, in_param, out_param, out_is_imm,
+				     in_modifier, op_modifier, op, timeout);
+
+	return mlx4_cmd_wait(dev, in_param, out_param, out_is_imm,
+			     in_modifier, op_modifier, op, timeout);
+}
+EXPORT_SYMBOL_GPL(__mlx4_cmd);
+
+/*
+ * Prepare the command interface of a device for polling-mode use.
+ *
+ * Initialises the HCR mutex and the polling semaphore, maps the HCR
+ * (MLX4_HCR_SIZE bytes at MLX4_HCR_BASE within BAR 0) and creates the
+ * PCI pool that mailboxes are allocated from.
+ *
+ * Returns 0 on success or -ENOMEM if the mapping or the pool creation
+ * fails; the HCR mapping is undone in the latter case.
+ */
+int mlx4_cmd_init(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	mutex_init(&priv->cmd.hcr_mutex);
+	sema_init(&priv->cmd.poll_sem, 1);
+	priv->cmd.use_events = 0;
+	priv->cmd.toggle     = 1;
+
+	priv->cmd.hcr = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_HCR_BASE,
+				MLX4_HCR_SIZE);
+	if (!priv->cmd.hcr) {
+		/* Terminate the line like every other mlx4_err() message. */
+		mlx4_err(dev, "Couldn't map command register.\n");
+		return -ENOMEM;
+	}
+
+	priv->cmd.pool = pci_pool_create("mlx4_cmd", dev->pdev,
+					 MLX4_MAILBOX_SIZE,
+					 MLX4_MAILBOX_SIZE, 0);
+	if (!priv->cmd.pool) {
+		iounmap(priv->cmd.hcr);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/* Undo mlx4_cmd_init(): destroy the mailbox pool and unmap the HCR. */
+void mlx4_cmd_cleanup(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	pci_pool_destroy(priv->cmd.pool);
+	iounmap(priv->cmd.hcr);
+}
+
+/*
+ * Switch to using events to issue FW commands (can only be called
+ * after event queue for command events has been initialized).
+ */
+/*
+ * Allocates cmd.max_cmds command contexts, links them into a free list
+ * and sizes cmd.event_sem to match.  token_mask becomes one less than
+ * the smallest power of two that is >= max_cmds.  cmd.poll_sem is then
+ * taken and kept until mlx4_cmd_use_polling() releases it.
+ *
+ * Returns 0 on success or -ENOMEM if the context array cannot be
+ * allocated.
+ */
+int mlx4_cmd_use_events(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int i;
+
+	priv->cmd.context = kmalloc(priv->cmd.max_cmds *
+				   sizeof (struct mlx4_cmd_context),
+				   GFP_KERNEL);
+	if (!priv->cmd.context)
+		return -ENOMEM;
+
+	/* Initial tokens equal the context index; chain all contexts. */
+	for (i = 0; i < priv->cmd.max_cmds; ++i) {
+		priv->cmd.context[i].token = i;
+		priv->cmd.context[i].next  = i + 1;
+	}
+
+	priv->cmd.context[priv->cmd.max_cmds - 1].next = -1;
+	priv->cmd.free_head = 0;
+
+	sema_init(&priv->cmd.event_sem, priv->cmd.max_cmds);
+	spin_lock_init(&priv->cmd.context_lock);
+
+	for (priv->cmd.token_mask = 1;
+	     priv->cmd.token_mask < priv->cmd.max_cmds;
+	     priv->cmd.token_mask <<= 1)
+		; /* nothing */
+	--priv->cmd.token_mask;
+
+	priv->cmd.use_events = 1;
+
+	down(&priv->cmd.poll_sem);
+
+	return 0;
+}
+
+/*
+ * Switch back to polling (used when shutting down the device)
+ */
+/*
+ * After clearing use_events, every one of the max_cmds event_sem slots
+ * is acquired, which waits until no event-mode command still holds one.
+ * The context array is then freed and cmd.poll_sem, held since
+ * mlx4_cmd_use_events(), is released.
+ */
+void mlx4_cmd_use_polling(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int i;
+
+	priv->cmd.use_events = 0;
+
+	for (i = 0; i < priv->cmd.max_cmds; ++i)
+		down(&priv->cmd.event_sem);
+
+	kfree(priv->cmd.context);
+
+	up(&priv->cmd.poll_sem);
+}
+
+/*
+ * Allocate a command mailbox: the descriptor comes from kmalloc() and
+ * its buffer, with the matching DMA address, from the device's mailbox
+ * pool.
+ *
+ * Returns the mailbox, or ERR_PTR(-ENOMEM) if either allocation fails.
+ * The result is never NULL.
+ */
+struct mlx4_cmd_mailbox *mlx4_alloc_cmd_mailbox(struct mlx4_dev *dev)
+{
+	struct mlx4_cmd_mailbox *mbox = kmalloc(sizeof *mbox, GFP_KERNEL);
+
+	if (mbox == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	mbox->buf = pci_pool_alloc(mlx4_priv(dev)->cmd.pool, GFP_KERNEL,
+				   &mbox->dma);
+	if (mbox->buf != NULL)
+		return mbox;
+
+	/* No buffer available: drop the descriptor again. */
+	kfree(mbox);
+	return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL_GPL(mlx4_alloc_cmd_mailbox);
+
+/*
+ * Free a mailbox from mlx4_alloc_cmd_mailbox(): return its buffer to the
+ * mailbox pool and free the descriptor.  A NULL mailbox is ignored.
+ */
+void mlx4_free_cmd_mailbox(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox)
+{
+	if (!mailbox)
+		return;
+
+	pci_pool_free(mlx4_priv(dev)->cmd.pool, mailbox->buf, mailbox->dma);
+	kfree(mailbox);
+}
+EXPORT_SYMBOL_GPL(mlx4_free_cmd_mailbox);
diff --git a/sys/ofed/drivers/net/mlx4/cq.c b/sys/ofed/drivers/net/mlx4/cq.c
new file mode 100644
index 0000000..076c602
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/cq.c
@@ -0,0 +1,338 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+#include <linux/hardirq.h>
+
+#include <linux/mlx4/cmd.h>
+#include <linux/mlx4/cq.h>
+
+#include "mlx4.h"
+#include "icm.h"
+
+/*
+ * CQ context image written into a command mailbox by the SW2HW_CQ and
+ * MODIFY_CQ helpers in this file.  Multi-byte fields are big-endian (__be*).
+ * NOTE(review): the field order and reserved padding presumably mirror a
+ * device-defined layout, so do not reorder or resize members -- confirm
+ * against the hardware documentation.
+ */
+struct mlx4_cq_context {
+	__be32			flags;
+	u16			reserved1[3];
+	__be16			page_offset;
+	__be32			logsize_usrpage;	/* (log2(entries) << 24) | UAR index */
+	__be16			cq_period;
+	__be16			cq_max_count;
+	u8			reserved2[3];
+	u8			comp_eqn;		/* EQ number receiving completions */
+	u8			log_page_size;
+	u8			reserved3[2];
+	u8			mtt_base_addr_h;	/* MTT address bits 32 and up */
+	__be32			mtt_base_addr_l;	/* MTT address bits 31..0 */
+	__be32			last_notified_index;
+	__be32			solicit_producer_index;
+	__be32			consumer_index;
+	__be32			producer_index;
+	u32			reserved4[2];
+	__be64			db_rec_addr;		/* doorbell record address */
+};
+
+/*
+ * Bit-field constants; presumably values of mlx4_cq_context.flags -- TODO
+ * confirm.  MLX4_CQ_FLAG_CC has the same value as the bit mlx4_cq_alloc()
+ * sets for a "collapsed" CQ.
+ * NOTE(review): MLX4_EQ_STATE_FIRED sits among CQ constants; the EQ prefix
+ * may be a typo for CQ -- confirm before renaming.
+ */
+#define MLX4_CQ_STATUS_OK		( 0 << 28)
+#define MLX4_CQ_STATUS_OVERFLOW		( 9 << 28)
+#define MLX4_CQ_STATUS_WRITE_FAIL	(10 << 28)
+#define MLX4_CQ_FLAG_CC			( 1 << 18)
+#define MLX4_CQ_FLAG_OI			( 1 << 17)
+#define MLX4_CQ_STATE_ARMED		( 9 <<  8)
+#define MLX4_CQ_STATE_ARMED_SOL		( 6 <<  8)
+#define MLX4_EQ_STATE_FIRED		(10 <<  8)
+
+/*
+ * Deliver a completion event to the CQ identified by cqn: bump the CQ's
+ * arm sequence number and invoke its comp callback.  An unknown CQN is
+ * only logged.
+ */
+void mlx4_cq_completion(struct mlx4_dev *dev, u32 cqn)
+{
+	struct mlx4_cq_table *cq_table = &mlx4_priv(dev)->cq_table;
+	struct mlx4_cq *cq;
+
+	/* The tree is keyed by the CQN masked to the table size. */
+	cq = radix_tree_lookup(&cq_table->tree, cqn & (dev->caps.num_cqs - 1));
+	if (cq != NULL) {
+		++cq->arm_sn;
+		cq->comp(cq);
+		return;
+	}
+
+	mlx4_dbg(dev, "Completion event for bogus CQ %08x\n", cqn);
+}
+
+/*
+ * Dispatch an asynchronous event to a CQ.  The CQ is looked up and its
+ * refcount raised under the table lock; the reference is dropped after
+ * the event callback returns, signalling cq->free if it was the last one.
+ */
+void mlx4_cq_event(struct mlx4_dev *dev, u32 cqn, int event_type)
+{
+	struct mlx4_cq_table *cq_table = &mlx4_priv(dev)->cq_table;
+	struct mlx4_cq *cq;
+
+	spin_lock(&cq_table->lock);
+	cq = radix_tree_lookup(&cq_table->tree, cqn & (dev->caps.num_cqs - 1));
+	if (cq != NULL)
+		atomic_inc(&cq->refcount);
+	spin_unlock(&cq_table->lock);
+
+	if (cq == NULL) {
+		mlx4_warn(dev, "Async event for bogus CQ %08x\n", cqn);
+		return;
+	}
+
+	cq->event(cq, event_type);
+
+	/* mlx4_cq_free() waits on cq->free before releasing the CQ. */
+	if (atomic_dec_and_test(&cq->refcount))
+		complete(&cq->free);
+}
+
+/* Issue SW2HW_CQ for cq_num, passing the mailbox's DMA address as input. */
+static int mlx4_SW2HW_CQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			 int cq_num)
+{
+	int ret;
+
+	ret = mlx4_cmd(dev, mailbox->dma, cq_num, 0, MLX4_CMD_SW2HW_CQ,
+		       MLX4_CMD_TIME_CLASS_A);
+	return ret;
+}
+
+/*
+ * Issue MODIFY_CQ for cq_num with the given op modifier, passing the
+ * mailbox's DMA address as input.
+ */
+static int mlx4_MODIFY_CQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			 int cq_num, u32 opmod)
+{
+	int ret;
+
+	ret = mlx4_cmd(dev, mailbox->dma, cq_num, opmod, MLX4_CMD_MODIFY_CQ,
+		       MLX4_CMD_TIME_CLASS_A);
+	return ret;
+}
+
+/*
+ * Issue HW2SW_CQ for cq_num.  With a mailbox, its DMA address is passed
+ * and the op modifier is 0; with mailbox == NULL the address is 0 and the
+ * op modifier is 1.
+ */
+static int mlx4_HW2SW_CQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			 int cq_num)
+{
+	if (mailbox)
+		return mlx4_cmd_box(dev, 0, mailbox->dma, cq_num, 0,
+				    MLX4_CMD_HW2SW_CQ, MLX4_CMD_TIME_CLASS_A);
+
+	return mlx4_cmd_box(dev, 0, 0, cq_num, 1, MLX4_CMD_HW2SW_CQ,
+			    MLX4_CMD_TIME_CLASS_A);
+}
+
+/*
+ * Update a CQ's moderation settings: `count` and `period` are written
+ * big-endian into an otherwise zeroed context and sent with MODIFY_CQ,
+ * op modifier 1.  Returns 0 or a negative errno.
+ */
+int mlx4_cq_modify(struct mlx4_dev *dev, struct mlx4_cq *cq,
+		   u16 count, u16 period)
+{
+	struct mlx4_cmd_mailbox *mailbox = mlx4_alloc_cmd_mailbox(dev);
+	struct mlx4_cq_context *ctx;
+	int ret;
+
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	ctx = mailbox->buf;
+	memset(ctx, 0, sizeof *ctx);
+	ctx->cq_max_count = cpu_to_be16(count);
+	ctx->cq_period    = cpu_to_be16(period);
+
+	ret = mlx4_MODIFY_CQ(dev, mailbox, cq->cqn, 1);
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mlx4_cq_modify);
+
+/*
+ * Point an existing CQ at a new buffer: programs log2(entries), the page
+ * size and the MTT base address of `mtt` via MODIFY_CQ, op modifier 0.
+ * Returns 0 or a negative errno.
+ */
+int mlx4_cq_resize(struct mlx4_dev *dev, struct mlx4_cq *cq,
+		   int entries, struct mlx4_mtt *mtt)
+{
+	struct mlx4_cmd_mailbox *mailbox = mlx4_alloc_cmd_mailbox(dev);
+	struct mlx4_cq_context *ctx;
+	u64 mtt_base;
+	int ret;
+
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	ctx = mailbox->buf;
+	memset(ctx, 0, sizeof *ctx);
+
+	ctx->logsize_usrpage = cpu_to_be32(ilog2(entries) << 24);
+	ctx->log_page_size   = mtt->page_shift - 12;
+
+	/* The MTT address is split into a high byte and a low 32-bit word. */
+	mtt_base = mlx4_mtt_addr(dev, mtt);
+	ctx->mtt_base_addr_h = mtt_base >> 32;
+	ctx->mtt_base_addr_l = cpu_to_be32(mtt_base & 0xffffffff);
+
+	ret = mlx4_MODIFY_CQ(dev, mailbox, cq->cqn, 0);
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mlx4_cq_resize);
+
+/*
+ * Return the index of the completion vector whose EQ has the smallest
+ * load counter; ties go to the lowest index.
+ */
+static int mlx4_find_least_loaded_vector(struct mlx4_priv *priv)
+{
+	int lowest = priv->eq_table.eq[0].load;
+	int best = 0;
+	int vec;
+
+	for (vec = 1; vec < priv->dev.caps.num_comp_vectors; vec++) {
+		if (priv->eq_table.eq[vec].load >= lowest)
+			continue;
+		best = vec;
+		lowest = priv->eq_table.eq[vec].load;
+	}
+
+	return best;
+}
+
+/*
+ * Allocate a CQ number, take references on its two ICM tables, publish the
+ * CQ in the lookup tree and hand it to hardware with SW2HW_CQ.
+ *
+ *   nent      - number of entries; ilog2(nent) is what gets programmed
+ *   mtt       - MTT describing the CQ buffer
+ *   uar       - UAR whose index is placed in the context
+ *   db_rec    - doorbell record address written to the context
+ *   cq        - CQ to initialise (cqn, vector and bookkeeping are filled in)
+ *   vector    - completion vector, or MLX4_LEAST_ATTACHED_VECTOR to pick
+ *               the EQ with the lowest load counter
+ *   collapsed - non-zero sets bit 18 of the context flags
+ *
+ * Returns 0, -EINVAL for an out-of-range vector, -ENOMEM when no CQN is
+ * available, or the error of the failing step.  On failure every resource
+ * acquired so far is released in reverse order.
+ */
+int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt,
+		  struct mlx4_uar *uar, u64 db_rec, struct mlx4_cq *cq,
+		  unsigned vector, int collapsed)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_cq_table *cq_table = &priv->cq_table;
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_cq_context *cq_context;
+	u64 mtt_addr;
+	int err;
+
+	cq->vector = (vector == MLX4_LEAST_ATTACHED_VECTOR) ?
+		mlx4_find_least_loaded_vector(priv) : vector;
+
+	if (cq->vector >= dev->caps.num_comp_vectors)
+		return -EINVAL;
+
+	cq->cqn = mlx4_bitmap_alloc(&cq_table->bitmap);
+	if (cq->cqn == -1)
+		return -ENOMEM;
+
+	/* Reference both ICM tables for this CQN. */
+	err = mlx4_table_get(dev, &cq_table->table, cq->cqn);
+	if (err)
+		goto err_out;
+
+	err = mlx4_table_get(dev, &cq_table->cmpt_table, cq->cqn);
+	if (err)
+		goto err_put;
+
+	/* Make the CQ visible to mlx4_cq_completion()/mlx4_cq_event(). */
+	spin_lock_irq(&cq_table->lock);
+	err = radix_tree_insert(&cq_table->tree, cq->cqn, cq);
+	spin_unlock_irq(&cq_table->lock);
+	if (err)
+		goto err_cmpt_put;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox)) {
+		err = PTR_ERR(mailbox);
+		goto err_radix;
+	}
+
+	cq_context = mailbox->buf;
+	memset(cq_context, 0, sizeof *cq_context);
+
+	cq_context->flags	    = cpu_to_be32(!!collapsed << 18);
+	cq_context->logsize_usrpage = cpu_to_be32((ilog2(nent) << 24) | uar->index);
+	cq_context->comp_eqn	    = priv->eq_table.eq[cq->vector].eqn;
+	cq_context->log_page_size   = mtt->page_shift - MLX4_ICM_PAGE_SHIFT;
+
+	mtt_addr = mlx4_mtt_addr(dev, mtt);
+	cq_context->mtt_base_addr_h = mtt_addr >> 32;
+	cq_context->mtt_base_addr_l = cpu_to_be32(mtt_addr & 0xffffffff);
+	cq_context->db_rec_addr     = cpu_to_be64(db_rec);
+
+	/* The mailbox is only needed for the command itself. */
+	err = mlx4_SW2HW_CQ(dev, mailbox, cq->cqn);
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	if (err)
+		goto err_radix;
+
+	/* Account the CQ against its EQ and initialise software state. */
+	priv->eq_table.eq[cq->vector].load++;
+	cq->cons_index = 0;
+	cq->arm_sn     = 1;
+	cq->uar        = uar;
+	atomic_set(&cq->refcount, 1);
+	init_completion(&cq->free);
+
+	return 0;
+
+err_radix:
+	spin_lock_irq(&cq_table->lock);
+	radix_tree_delete(&cq_table->tree, cq->cqn);
+	spin_unlock_irq(&cq_table->lock);
+
+err_cmpt_put:
+	mlx4_table_put(dev, &cq_table->cmpt_table, cq->cqn);
+
+err_put:
+	mlx4_table_put(dev, &cq_table->table, cq->cqn);
+
+err_out:
+	mlx4_bitmap_free(&cq_table->bitmap, cq->cqn);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_cq_alloc);
+
+/*
+ * Tear down a CQ set up by mlx4_cq_alloc().
+ *
+ * Takes the CQ back from hardware (a failure is only logged), waits for
+ * in-flight interrupt handlers on the CQ's EQ, removes the CQ from the
+ * lookup tree, then drops the initial reference and waits until every
+ * mlx4_cq_event() caller has dropped theirs.  Finally the ICM table
+ * references and the CQ number are released.
+ */
+void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_cq_table *cq_table = &priv->cq_table;
+	int err;
+
+	err = mlx4_HW2SW_CQ(dev, NULL, cq->cqn);
+	if (err)
+		mlx4_warn(dev, "HW2SW_CQ failed (%d) for CQN %06x\n", err, cq->cqn);
+
+	synchronize_irq(priv->eq_table.eq[cq->vector].irq);
+	priv->eq_table.eq[cq->vector].load--;
+
+	spin_lock_irq(&cq_table->lock);
+	radix_tree_delete(&cq_table->tree, cq->cqn);
+	spin_unlock_irq(&cq_table->lock);
+
+	if (atomic_dec_and_test(&cq->refcount))
+		complete(&cq->free);
+	wait_for_completion(&cq->free);
+
+	/*
+	 * mlx4_cq_alloc() took a reference on both the cmpt table and the
+	 * main table; drop both, in the same order as its error path, so
+	 * the cmpt reference is not leaked.
+	 */
+	mlx4_table_put(dev, &cq_table->cmpt_table, cq->cqn);
+	mlx4_table_put(dev, &cq_table->table, cq->cqn);
+	mlx4_bitmap_free(&cq_table->bitmap, cq->cqn);
+}
+EXPORT_SYMBOL_GPL(mlx4_cq_free);
+
+/*
+ * Initialise the CQ table: its lock, the CQN lookup tree and the CQN
+ * bitmap (num_cqs entries, the first reserved_cqs of them reserved).
+ * Returns the result of mlx4_bitmap_init().
+ */
+int mlx4_init_cq_table(struct mlx4_dev *dev)
+{
+	struct mlx4_cq_table *cq_table = &mlx4_priv(dev)->cq_table;
+
+	spin_lock_init(&cq_table->lock);
+	INIT_RADIX_TREE(&cq_table->tree, GFP_ATOMIC);
+
+	return mlx4_bitmap_init(&cq_table->bitmap, dev->caps.num_cqs,
+				dev->caps.num_cqs - 1, dev->caps.reserved_cqs, 0);
+}
+
+/* Release the CQN bitmap set up by mlx4_init_cq_table(). */
+void mlx4_cleanup_cq_table(struct mlx4_dev *dev)
+{
+	struct mlx4_cq_table *cq_table = &mlx4_priv(dev)->cq_table;
+
+	/* Nothing to do to clean up radix_tree */
+	mlx4_bitmap_cleanup(&cq_table->bitmap);
+}
diff --git a/sys/ofed/drivers/net/mlx4/en_cq.c b/sys/ofed/drivers/net/mlx4/en_cq.c
new file mode 100644
index 0000000..9f475ff
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/en_cq.c
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include "mlx4_en.h"
+
+#include <linux/mlx4/cq.h>
+#include <linux/mlx4/qp.h>
+#include <linux/mlx4/cmd.h>
+
+/* No-op handler installed as the CQ's asynchronous event callback. */
+static void mlx4_en_cq_event(struct mlx4_cq *cq, enum mlx4_event event)
+{
+}
+
+
+/*
+ * Set up the software side of an Ethernet CQ: the deferred-work task and
+ * its taskqueue, the CQ lock, and the mapped CQE buffer.
+ *
+ * RX CQs get one CQE per entry and a completion vector derived from the
+ * ring and port; other CQs get a single CQE and the least loaded vector.
+ *
+ * Returns 0 on success or a negative errno.  On failure everything
+ * created here is released again, so the caller has nothing to undo.
+ */
+int mlx4_en_create_cq(struct mlx4_en_priv *priv,
+		      struct mlx4_en_cq *cq,
+		      int entries, int ring, enum cq_type mode)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err;
+
+	cq->size = entries;
+	if (mode == RX) {
+		cq->buf_size = cq->size * sizeof(struct mlx4_cqe);
+		cq->vector   = (ring + priv->port) %
+				mdev->dev->caps.num_comp_vectors;
+		TASK_INIT(&cq->cq_task, 0, mlx4_en_rx_que, cq);
+	} else {
+		cq->buf_size = sizeof(struct mlx4_cqe);
+		cq->vector   = MLX4_LEAST_ATTACHED_VECTOR;
+		TASK_INIT(&cq->cq_task, 0, mlx4_en_tx_que, cq);
+	}
+
+	/* An M_NOWAIT allocation may fail; never start threads on NULL. */
+	cq->tq = taskqueue_create_fast("mlx4_en_que", M_NOWAIT,
+	    taskqueue_thread_enqueue, &cq->tq);
+	if (cq->tq == NULL)
+		return -ENOMEM;
+	taskqueue_start_threads(&cq->tq, 1, PI_NET, "%s cq",
+	    if_name(priv->dev));
+	cq->ring = ring;
+	cq->is_tx = mode;
+	mtx_init(&cq->lock.m, "mlx4 cq", NULL, MTX_DEF);
+
+	err = mlx4_alloc_hwq_res(mdev->dev, &cq->wqres,
+				cq->buf_size, 2 * PAGE_SIZE);
+	if (err)
+		goto err_tq;
+
+	err = mlx4_en_map_buffer(&cq->wqres.buf);
+	if (err)
+		goto err_res;
+
+	cq->buf = (struct mlx4_cqe *) cq->wqres.buf.direct.buf;
+
+	return 0;
+
+err_res:
+	mlx4_free_hwq_res(mdev->dev, &cq->wqres, cq->buf_size);
+
+err_tq:
+	/* Undo the taskqueue and mutex so a failed create leaks nothing. */
+	taskqueue_free(cq->tq);
+	cq->tq = NULL;
+	mtx_destroy(&cq->lock.m);
+
+	return err;
+}
+
+/*
+ * Reset the doorbell records and CQE buffer, create the hardware CQ and
+ * install the completion/event callbacks.  Returns 0, or the error from
+ * mlx4_cq_alloc().
+ */
+int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err;
+
+	cq->dev = mdev->pndev[priv->port];
+	/* Two consecutive doorbell words: set_ci first, arm second. */
+	cq->mcq.set_ci_db  = cq->wqres.db.db;
+	cq->mcq.arm_db     = cq->wqres.db.db + 1;
+	*cq->mcq.set_ci_db = 0;
+	*cq->mcq.arm_db    = 0;
+	memset(cq->buf, 0, cq->buf_size);
+
+	/* RX CQs are sized to what the RX ring actually holds. */
+	if (!cq->is_tx)
+		cq->size = priv->rx_ring[cq->ring].actual_size;
+
+	/* is_tx doubles as the "collapsed" argument of mlx4_cq_alloc(). */
+	err = mlx4_cq_alloc(mdev->dev, cq->size, &cq->wqres.mtt, &mdev->priv_uar,
+			    cq->wqres.db.dma, &cq->mcq, cq->vector, cq->is_tx);
+	if (err)
+		return err;
+
+	cq->mcq.comp  = cq->is_tx ? mlx4_en_tx_irq : mlx4_en_rx_irq;
+	cq->mcq.event = mlx4_en_cq_event;
+
+	/* TX CQs also get a timer; it is only initialised here, not armed. */
+	if (cq->is_tx) {
+		init_timer(&cq->timer);
+		cq->timer.function = mlx4_en_poll_tx_cq;
+		cq->timer.data = (unsigned long) cq;
+	}
+
+	return 0;
+}
+
+/*
+ * Release what mlx4_en_create_cq() set up: drain and free the taskqueue,
+ * unmap and free the CQE buffer, and destroy the CQ lock.
+ */
+void mlx4_en_destroy_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq)
+{
+	/* Let any queued CQ task finish before its queue goes away. */
+	taskqueue_drain(cq->tq, &cq->cq_task);
+	taskqueue_free(cq->tq);
+
+	mlx4_en_unmap_buffer(&cq->wqres.buf);
+	mlx4_free_hwq_res(priv->mdev->dev, &cq->wqres, cq->buf_size);
+	cq->buf = NULL;
+	cq->buf_size = 0;
+
+	mtx_destroy(&cq->lock.m);
+}
+
+/*
+ * Undo mlx4_en_activate_cq(): drain the CQ task, stop the TX timer (TX
+ * CQs only) and free the hardware CQ.
+ * NOTE(review): the task is drained before the timer is deleted; whether
+ * the timer handler can re-queue the task in between is not visible
+ * here -- confirm.
+ */
+void mlx4_en_deactivate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq)
+{
+	taskqueue_drain(cq->tq, &cq->cq_task);
+
+	if (cq->is_tx)
+		del_timer(&cq->timer);
+
+	mlx4_cq_free(priv->mdev->dev, &cq->mcq);
+}
+
+/*
+ * Set rx cq moderation parameters: push the CQ's moder_cnt and moder_time
+ * to hardware.  Returns the result of mlx4_cq_modify().
+ */
+int mlx4_en_set_cq_moder(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+
+	return mlx4_cq_modify(mdev->dev, &cq->mcq, cq->moder_cnt,
+			      cq->moder_time);
+}
+
+/*
+ * Arm the CQ with MLX4_CQ_DB_REQ_NOT through the device's UAR mapping,
+ * passing uar_lock to mlx4_cq_arm().  Always returns 0.
+ */
+int mlx4_en_arm_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq)
+{
+	mlx4_cq_arm(&cq->mcq, MLX4_CQ_DB_REQ_NOT, priv->mdev->uar_map,
+		    &priv->mdev->uar_lock);
+
+	return 0;
+}
+
+
diff --git a/sys/ofed/drivers/net/mlx4/en_ethtool.c b/sys/ofed/drivers/net/mlx4/en_ethtool.c
new file mode 100644
index 0000000..9587fb3
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/en_ethtool.c
@@ -0,0 +1,512 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/ethtool.h>
+#include <linux/netdevice.h>
+#include <linux/if_vlan.h>
+
+#include "mlx4_en.h"
+#include "en_port.h"
+
+
+/* Recompute the port-wide LRO counters as the sum over all RX rings. */
+static void mlx4_en_update_lro_stats(struct mlx4_en_priv *priv)
+{
+	int ring;
+
+	priv->port_stats.lro_no_desc = 0;
+	priv->port_stats.lro_flushed = 0;
+	priv->port_stats.lro_aggregated = 0;
+
+	for (ring = 0; ring < priv->rx_ring_num; ring++) {
+		priv->port_stats.lro_aggregated +=
+			priv->rx_ring[ring].lro.stats.aggregated;
+		priv->port_stats.lro_flushed +=
+			priv->rx_ring[ring].lro.stats.flushed;
+		priv->port_stats.lro_no_desc +=
+			priv->rx_ring[ring].lro.stats.no_desc;
+	}
+}
+
+/*
+ * ethtool get_drvinfo: report driver name with board id, driver version,
+ * firmware version (major.minor.subminor from caps.fw_ver) and the PCI
+ * bus name.
+ *
+ * Every string is written with snprintf() bounded by the destination
+ * field, so an unexpectedly long board id or version string is truncated
+ * and NUL-terminated instead of overrunning or leaving the field
+ * unterminated.
+ * NOTE(review): assumes the ethtool_drvinfo string members are arrays so
+ * that sizeof() yields their capacity -- confirm against ethtool.h.
+ */
+static void
+mlx4_en_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+
+	snprintf(drvinfo->driver, sizeof(drvinfo->driver),
+		 DRV_NAME " (%s)", mdev->dev->board_id);
+	snprintf(drvinfo->version, sizeof(drvinfo->version), "%s",
+		 DRV_VERSION " (" DRV_RELDATE ")");
+	snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), "%d.%d.%d",
+		(u16) (mdev->dev->caps.fw_ver >> 32),
+		(u16) ((mdev->dev->caps.fw_ver >> 16) & 0xffff),
+		(u16) (mdev->dev->caps.fw_ver & 0xffff));
+	snprintf(drvinfo->bus_info, sizeof(drvinfo->bus_info), "%s",
+		 pci_name(mdev->dev->pdev));
+	drvinfo->n_stats = 0;
+	drvinfo->regdump_len = 0;
+	drvinfo->eedump_len = 0;
+}
+
+/* ethtool get_tso: 1 when NETIF_F_TSO is set in dev->features, else 0. */
+static u32 mlx4_en_get_tso(struct net_device *dev)
+{
+	if (dev->features & NETIF_F_TSO)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * ethtool set_tso: turn TSO (IPv4 and IPv6) on or off for the device and
+ * for its VLAN devices.
+ *
+ * Enabling fails with -EPERM when the device reports no LSO support.
+ * With HAVE_NETDEV_VLAN_FEATURES the VLAN side is handled through
+ * dev->vlan_features; otherwise every device in the VLAN group is
+ * updated individually.  Returns 0 on success.
+ */
+static int mlx4_en_set_tso(struct net_device *dev, u32 data)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	if (data) {
+		if (!priv->mdev->LSO_support)
+			return -EPERM;
+		dev->features |= (NETIF_F_TSO | NETIF_F_TSO6);
+#ifdef HAVE_NETDEV_VLAN_FEATURES
+		dev->vlan_features |= (NETIF_F_TSO | NETIF_F_TSO6);
+#else
+		if (priv->vlgrp) {
+			int i;
+			struct net_device *vdev;
+			/* Propagate the flags to each existing VLAN device. */
+			for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) {
+				vdev = vlan_group_get_device(priv->vlgrp, i);
+				if (vdev) {
+					vdev->features |= (NETIF_F_TSO | NETIF_F_TSO6);
+					vlan_group_set_device(priv->vlgrp, i, vdev);
+				}
+			}
+		}
+#endif
+	} else {
+		dev->features &= ~(NETIF_F_TSO | NETIF_F_TSO6);
+#ifdef HAVE_NETDEV_VLAN_FEATURES
+		dev->vlan_features &= ~(NETIF_F_TSO | NETIF_F_TSO6);
+#else
+		if (priv->vlgrp) {
+			int i;
+			struct net_device *vdev;
+			/* Clear the flags on each existing VLAN device. */
+			for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) {
+				vdev = vlan_group_get_device(priv->vlgrp, i);
+				if (vdev) {
+					vdev->features &= ~(NETIF_F_TSO | NETIF_F_TSO6);
+					vlan_group_set_device(priv->vlgrp, i, vdev);
+				}
+			}
+		}
+#endif
+	}
+	return 0;
+}
+
+/* ethtool get_rx_csum: report the stored RX checksum setting. */
+static u32 mlx4_en_get_rx_csum(struct net_device *dev)
+{
+	return ((struct mlx4_en_priv *) netdev_priv(dev))->rx_csum;
+}
+
+/* ethtool set_rx_csum: store `data` normalised to 0/1.  Always returns 0. */
+static int mlx4_en_set_rx_csum(struct net_device *dev, u32 data)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	priv->rx_csum = !!data;
+
+	return 0;
+}
+
+/*
+ * Names of the fixed ethtool statistics, in three consecutive groups:
+ * main counters, port statistics, packet statistics.
+ * mlx4_en_get_strings() indexes the groups by NUM_MAIN_STATS and
+ * NUM_PORT_STATS, and mlx4_en_get_ethtool_stats() copies the values in
+ * the same group order, so names, counts and the backing structures must
+ * be kept in step.
+ */
+static const char main_strings[][ETH_GSTRING_LEN] = {
+	/* main counters (NUM_MAIN_STATS entries) */
+	"rx_packets", "tx_packets", "rx_bytes", "tx_bytes", "rx_errors",
+	"tx_errors", "rx_dropped", "tx_dropped", "multicast", "collisions",
+	"rx_length_errors", "rx_over_errors", "rx_crc_errors",
+	"rx_frame_errors", "rx_fifo_errors", "rx_missed_errors",
+	"tx_aborted_errors", "tx_carrier_errors", "tx_fifo_errors",
+	"tx_heartbeat_errors", "tx_window_errors",
+
+	/* port statistics */
+	"lro_aggregated", "lro_flushed", "lro_no_desc", "tso_packets",
+	"queue_stopped", "wake_queue", "tx_timeout", "rx_alloc_failed",
+	"rx_csum_good", "rx_csum_none", "tx_chksum_offload",
+
+	/* packet statistics */
+	"broadcast", "rx_prio_0", "rx_prio_1", "rx_prio_2", "rx_prio_3",
+	"rx_prio_4", "rx_prio_5", "rx_prio_6", "rx_prio_7", "tx_prio_0",
+	"tx_prio_1", "tx_prio_2", "tx_prio_3", "tx_prio_4", "tx_prio_5",
+	"tx_prio_6", "tx_prio_7",
+};
+/* Number of strings in the first ("main counters") group above. */
+#define NUM_MAIN_STATS	21
+/* Fixed statistics reported for ETH_SS_STATS; per-ring counters are added on top. */
+#define NUM_ALL_STATS	(NUM_MAIN_STATS + NUM_PORT_STATS + NUM_PKT_STATS + NUM_PERF_STATS)
+
+/*
+ * Names of the self tests reported for ETH_SS_TEST.
+ * mlx4_en_get_strings() copies the last two entries only when the device
+ * reports loopback support.
+ */
+static const char mlx4_en_test_names[][ETH_GSTRING_LEN]= {
+	"Interrupt Test",
+	"Link Test",
+	"Speed Test",
+	"Register Test",
+	"Loopback Test",
+};
+
+/* ethtool get_msglevel: return the driver's msg_enable mask. */
+static u32 mlx4_en_get_msglevel(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	return priv->msg_enable;
+}
+
+/* ethtool set_msglevel: store `val` as the driver's msg_enable mask. */
+static void mlx4_en_set_msglevel(struct net_device *dev, u32 val)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	priv->msg_enable = val;
+}
+
+/* ethtool get_wol: report no supported and no enabled wake-on-LAN options. */
+static void mlx4_en_get_wol(struct net_device *netdev,
+			    struct ethtool_wolinfo *wol)
+{
+	wol->wolopts = 0;
+	wol->supported = 0;
+}
+
+/*
+ * ethtool get_sset_count: number of entries in a string set.
+ *
+ * ETH_SS_STATS: the fixed statistics plus two counters per TX and per RX
+ * ring.  ETH_SS_TEST: all self tests, minus two when the device has no
+ * loopback support.  Any other set yields -EOPNOTSUPP.
+ */
+static int mlx4_en_get_sset_count(struct net_device *dev, int sset)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	if (sset == ETH_SS_STATS)
+		return NUM_ALL_STATS +
+			(priv->tx_ring_num + priv->rx_ring_num) * 2;
+
+	if (sset == ETH_SS_TEST) {
+		if (priv->mdev->dev->caps.loopback_support)
+			return MLX4_EN_NUM_SELF_TEST;
+		return MLX4_EN_NUM_SELF_TEST - 2;
+	}
+
+	return -EOPNOTSUPP;
+}
+
+/*
+ * ethtool get_ethtool_stats: fill `data` under stats_lock, in the same
+ * order mlx4_en_get_strings() emits the names: main counters, port
+ * statistics, per-TX-ring packets/bytes, per-RX-ring packets/bytes,
+ * packet statistics.  The statistics structures are read as arrays of
+ * unsigned long.
+ */
+static void mlx4_en_get_ethtool_stats(struct net_device *dev,
+		struct ethtool_stats *stats, uint64_t *data)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	uint64_t *out = data;
+	int i;
+
+	spin_lock_bh(&priv->stats_lock);
+
+	mlx4_en_update_lro_stats(priv);
+
+	for (i = 0; i < NUM_MAIN_STATS; i++)
+		*out++ = ((unsigned long *) &priv->stats)[i];
+
+	for (i = 0; i < NUM_PORT_STATS; i++)
+		*out++ = ((unsigned long *) &priv->port_stats)[i];
+
+	for (i = 0; i < priv->tx_ring_num; i++) {
+		*out++ = priv->tx_ring[i].packets;
+		*out++ = priv->tx_ring[i].bytes;
+	}
+
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		*out++ = priv->rx_ring[i].packets;
+		*out++ = priv->rx_ring[i].bytes;
+	}
+
+	for (i = 0; i < NUM_PKT_STATS; i++)
+		*out++ = ((unsigned long *) &priv->pkstats)[i];
+
+	spin_unlock_bh(&priv->stats_lock);
+}
+
+/* ethtool self_test: forward to mlx4_en_ex_selftest() with the test flags and result buffer. */
+static void mlx4_en_self_test(struct net_device *dev,
+			      struct ethtool_test *etest, u64 *buf)
+{
+	mlx4_en_ex_selftest(dev, &etest->flags, buf);
+}
+
+/*
+ * ethtool get_strings: write the names of a string set into `data`, one
+ * name per ETH_GSTRING_LEN-sized slot.
+ *
+ * ETH_SS_TEST:  self-test names; the last two only with loopback support.
+ * ETH_SS_STATS: main counters, port statistics, "tx<N>_packets/bytes" per
+ *               TX ring, "rx<N>_packets/bytes" per RX ring, then packet
+ *               statistics -- the order mlx4_en_get_ethtool_stats() uses.
+ * Other string sets are ignored.
+ */
+static void mlx4_en_get_strings(struct net_device *dev,
+				uint32_t stringset, uint8_t *data)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int index = 0;
+	int i;
+
+	switch (stringset) {
+	case ETH_SS_TEST:
+		for (i = 0; i < MLX4_EN_NUM_SELF_TEST - 2; i++)
+			strcpy(data + i * ETH_GSTRING_LEN, mlx4_en_test_names[i]);
+		/* i carries on from the first loop for the remaining names. */
+		if (priv->mdev->dev->caps.loopback_support)
+			for (; i < MLX4_EN_NUM_SELF_TEST; i++)
+				strcpy(data + i * ETH_GSTRING_LEN, mlx4_en_test_names[i]);
+		break;
+
+	case ETH_SS_STATS:
+		/* Add main counters */
+		for (i = 0; i < NUM_MAIN_STATS; i++)
+			strcpy(data + (index++) * ETH_GSTRING_LEN, main_strings[i]);
+		/* Port statistics follow the main counters in main_strings. */
+		for (i = 0; i< NUM_PORT_STATS; i++)
+			strcpy(data + (index++) * ETH_GSTRING_LEN,
+			main_strings[i + NUM_MAIN_STATS]);
+		for (i = 0; i < priv->tx_ring_num; i++) {
+			sprintf(data + (index++) * ETH_GSTRING_LEN,
+				"tx%d_packets", i);
+			sprintf(data + (index++) * ETH_GSTRING_LEN,
+				"tx%d_bytes", i);
+		}
+		for (i = 0; i < priv->rx_ring_num; i++) {
+			sprintf(data + (index++) * ETH_GSTRING_LEN,
+				"rx%d_packets", i);
+			sprintf(data + (index++) * ETH_GSTRING_LEN,
+				"rx%d_bytes", i);
+		}
+		/* Packet statistics are the last group in main_strings. */
+		for (i = 0; i< NUM_PKT_STATS; i++)
+			strcpy(data + (index++) * ETH_GSTRING_LEN,
+			main_strings[i + NUM_MAIN_STATS + NUM_PORT_STATS]);
+		break;
+	}
+}
+
+/*
+ * ethtool get_settings: report link settings.
+ *
+ * Autonegotiation is always reported as disabled and 10000baseT/Full as
+ * the supported and advertised mode.  Speed and duplex come from the
+ * queried port state when the carrier is up and are -1 otherwise.  The
+ * port/transceiver type is derived from the reported transceiver code.
+ * Returns 0, or -ENOMEM when the port query fails.
+ */
+static int mlx4_en_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int trans_type;
+
+	cmd->autoneg = AUTONEG_DISABLE;
+	cmd->supported = SUPPORTED_10000baseT_Full;
+	cmd->advertising = ADVERTISED_10000baseT_Full;
+
+	/* Refresh priv->port_state before reading it below. */
+	if (mlx4_en_QUERY_PORT(priv->mdev, priv->port))
+		return -ENOMEM;
+
+	trans_type = priv->port_state.transciver;
+	if (netif_carrier_ok(dev)) {
+		cmd->speed = priv->port_state.link_speed;
+		cmd->duplex = DUPLEX_FULL;
+	} else {
+		cmd->speed = -1;
+		cmd->duplex = -1;
+	}
+
+	/* Codes 1..0xC: fibre; 0 and 0x80: twisted pair; anything else: unknown. */
+	if (trans_type > 0 && trans_type <= 0xC) {
+		cmd->port = PORT_FIBRE;
+		cmd->transceiver = XCVR_EXTERNAL;
+		cmd->supported |= SUPPORTED_FIBRE;
+		cmd->advertising |= ADVERTISED_FIBRE;
+	} else if (trans_type == 0x80 || trans_type == 0) {
+		cmd->port = PORT_TP;
+		cmd->transceiver = XCVR_INTERNAL;
+		cmd->supported |= SUPPORTED_TP;
+		cmd->advertising |= ADVERTISED_TP;
+	} else  {
+		cmd->port = -1;
+		cmd->transceiver = -1;
+	}
+	return 0;
+}
+
+/*
+ * ethtool set_settings: only the fixed configuration (no autonegotiation,
+ * 10000 Mb/s, full duplex) is accepted, and accepting it changes nothing.
+ * Anything else yields -EINVAL.
+ */
+static int mlx4_en_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+	if (cmd->autoneg != AUTONEG_ENABLE &&
+	    cmd->speed == SPEED_10000 && cmd->duplex == DUPLEX_FULL)
+		return 0;	/* Nothing to change */
+
+	return -EINVAL;
+}
+
+/*
+ * ethtool get_coalesce: report the stored RX coalescing and adaptive
+ * moderation parameters; the TX coalescing values are reported as 0.
+ * Always returns 0.
+ */
+static int mlx4_en_get_coalesce(struct net_device *dev,
+			      struct ethtool_coalesce *coal)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	/* Static coalescing */
+	coal->rx_coalesce_usecs = priv->rx_usecs;
+	coal->rx_max_coalesced_frames = priv->rx_frames;
+	coal->tx_coalesce_usecs = 0;
+	coal->tx_max_coalesced_frames = 0;
+
+	/* Adaptive coalescing */
+	coal->use_adaptive_rx_coalesce = priv->adaptive_rx_coal;
+	coal->rate_sample_interval = priv->sample_interval;
+	coal->pkt_rate_low = priv->pkt_rate_low;
+	coal->pkt_rate_high = priv->pkt_rate_high;
+	coal->rx_coalesce_usecs_low = priv->rx_usecs_low;
+	coal->rx_coalesce_usecs_high = priv->rx_usecs_high;
+
+	return 0;
+}
+
+/*
+ * ethtool set_coalesce: store the RX coalescing parameters and, unless
+ * adaptive moderation is enabled, program them into every RX CQ.
+ *
+ * MLX4_EN_AUTO_CONF as frame count or time selects the driver default
+ * (MLX4_EN_RX_COAL_TARGET / mtu + 1 frames, MLX4_EN_RX_COAL_TIME usecs).
+ * Returns 0, or the first error from mlx4_en_set_cq_moder(); CQs handled
+ * before the failing one keep the new values.
+ */
+static int mlx4_en_set_coalesce(struct net_device *dev,
+			      struct ethtool_coalesce *coal)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int err, i;
+
+	priv->rx_frames = (coal->rx_max_coalesced_frames ==
+			   MLX4_EN_AUTO_CONF) ?
+				MLX4_EN_RX_COAL_TARGET /
+				priv->dev->mtu + 1 :
+				coal->rx_max_coalesced_frames;
+	priv->rx_usecs = (coal->rx_coalesce_usecs ==
+			  MLX4_EN_AUTO_CONF) ?
+				MLX4_EN_RX_COAL_TIME :
+				coal->rx_coalesce_usecs;
+
+	/* Set adaptive coalescing params */
+	priv->pkt_rate_low = coal->pkt_rate_low;
+	priv->rx_usecs_low = coal->rx_coalesce_usecs_low;
+	priv->pkt_rate_high = coal->pkt_rate_high;
+	priv->rx_usecs_high = coal->rx_coalesce_usecs_high;
+	priv->sample_interval = coal->rate_sample_interval;
+	priv->adaptive_rx_coal = coal->use_adaptive_rx_coalesce;
+	priv->last_moder_time = MLX4_EN_AUTO_CONF;
+	/* With adaptive moderation on, the CQs are not programmed here. */
+	if (priv->adaptive_rx_coal)
+		return 0;
+
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		priv->rx_cq[i].moder_cnt = priv->rx_frames;
+		priv->rx_cq[i].moder_time = priv->rx_usecs;
+		err = mlx4_en_set_cq_moder(priv, &priv->rx_cq[i]);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+/*
+ * ethtool set_pauseparam: record the requested TX/RX pause settings
+ * (normalised to 0/1) in the port profile and apply them with
+ * mlx4_SET_PORT_general().  Returns that call's result; a failure is
+ * also logged.
+ */
+static int mlx4_en_set_pauseparam(struct net_device *dev,
+				struct ethtool_pauseparam *pause)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int ret;
+
+	priv->prof->tx_pause = !!pause->tx_pause;
+	priv->prof->rx_pause = !!pause->rx_pause;
+
+	ret = mlx4_SET_PORT_general(mdev->dev, priv->port,
+				    priv->rx_mb_size + ETH_FCS_LEN,
+				    priv->prof->tx_pause,
+				    priv->prof->tx_ppp,
+				    priv->prof->rx_pause,
+				    priv->prof->rx_ppp);
+	if (ret != 0)
+		en_err(priv, "Failed setting pause params\n");
+
+	return ret;
+}
+
+/* ethtool get_pauseparam: report the TX/RX pause settings from the port profile. */
+static void mlx4_en_get_pauseparam(struct net_device *dev,
+				 struct ethtool_pauseparam *pause)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	pause->rx_pause = priv->prof->rx_pause;
+	pause->tx_pause = priv->prof->tx_pause;
+}
+
+/*
+ * ethtool set_ringparam: resize the RX and TX rings.
+ *
+ * Requested sizes are rounded up to a power of two and clamped to the
+ * driver's MIN/MAX limits.  Jumbo and mini RX rings are not supported
+ * (-EINVAL).  If the resulting sizes match the current ones nothing is
+ * done.  Otherwise, under state_lock, the port is stopped if it was up,
+ * all resources are freed and reallocated with the new sizes, and the
+ * port is restarted if it had been up.
+ *
+ * Returns 0 on success or the error of the failing reallocation/restart.
+ */
+static int mlx4_en_set_ringparam(struct net_device *dev,
+				 struct ethtool_ringparam *param)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	u32 rx_size, tx_size;
+	int port_up = 0;
+	int err = 0;
+
+	if (param->rx_jumbo_pending || param->rx_mini_pending)
+		return -EINVAL;
+
+	/*
+	 * roundup_pow_of_two() is undefined for 0, so lift a zero request
+	 * to 1 first; the MIN clamp then treats it like any other
+	 * too-small request.  Non-zero requests are unaffected.
+	 */
+	rx_size = roundup_pow_of_two(max_t(u32, param->rx_pending, 1));
+	rx_size = max_t(u32, rx_size, MLX4_EN_MIN_RX_SIZE);
+	rx_size = min_t(u32, rx_size, MLX4_EN_MAX_RX_SIZE);
+	tx_size = roundup_pow_of_two(max_t(u32, param->tx_pending, 1));
+	tx_size = max_t(u32, tx_size, MLX4_EN_MIN_TX_SIZE);
+	tx_size = min_t(u32, tx_size, MLX4_EN_MAX_TX_SIZE);
+
+	/* While the port is up the RX ring's actual_size is the reference. */
+	if (rx_size == (priv->port_up ? priv->rx_ring[0].actual_size :
+					priv->rx_ring[0].size) &&
+	    tx_size == priv->tx_ring[0].size)
+		return 0;
+
+	mutex_lock(&mdev->state_lock);
+	if (priv->port_up) {
+		port_up = 1;
+		mlx4_en_stop_port(dev);
+	}
+
+	mlx4_en_free_resources(priv);
+
+	priv->prof->tx_ring_size = tx_size;
+	priv->prof->rx_ring_size = rx_size;
+
+	err = mlx4_en_alloc_resources(priv);
+	if (err) {
+		en_err(priv, "Failed reallocating port resources\n");
+		goto out;
+	}
+	if (port_up) {
+		err = mlx4_en_start_port(dev);
+		if (err)
+			en_err(priv, "Failed starting port\n");
+	}
+
+out:
+	mutex_unlock(&mdev->state_lock);
+	return err;
+}
+
+/*
+ * ethtool get_ringparam: report the maximum and current ring sizes, taken
+ * from ring 0.  While the port is up the RX ring's actual_size is
+ * reported instead of its configured size.
+ */
+static void mlx4_en_get_ringparam(struct net_device *dev,
+				  struct ethtool_ringparam *param)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	memset(param, 0, sizeof(*param));
+
+	param->tx_max_pending = MLX4_EN_MAX_TX_SIZE;
+	param->rx_max_pending = MLX4_EN_MAX_RX_SIZE;
+	param->tx_pending = priv->tx_ring[0].size;
+	if (priv->port_up)
+		param->rx_pending = priv->rx_ring[0].actual_size;
+	else
+		param->rx_pending = priv->rx_ring[0].size;
+}
+
+/*
+ * ethtool operations of the mlx4 Ethernet driver.  Driver-specific
+ * handlers are the mlx4_en_* functions defined in this file; the
+ * remaining entries use the generic ethtool_op_* helpers.  The TSO
+ * handlers are only hooked up when NETIF_F_TSO is defined.
+ */
+const struct ethtool_ops mlx4_en_ethtool_ops = {
+	.get_drvinfo = mlx4_en_get_drvinfo,
+	.get_settings = mlx4_en_get_settings,
+	.set_settings = mlx4_en_set_settings,
+#ifdef NETIF_F_TSO
+	.get_tso = mlx4_en_get_tso,
+	.set_tso = mlx4_en_set_tso,
+#endif
+	.get_sg = ethtool_op_get_sg,
+	.set_sg = ethtool_op_set_sg,
+	.get_link = ethtool_op_get_link,
+	.get_rx_csum = mlx4_en_get_rx_csum,
+	.set_rx_csum = mlx4_en_set_rx_csum,
+	.get_tx_csum = ethtool_op_get_tx_csum,
+	.set_tx_csum = ethtool_op_set_tx_ipv6_csum,
+	.get_strings = mlx4_en_get_strings,
+	.get_sset_count = mlx4_en_get_sset_count,
+	.get_ethtool_stats = mlx4_en_get_ethtool_stats,
+	.self_test = mlx4_en_self_test,
+	.get_wol = mlx4_en_get_wol,
+	.get_msglevel = mlx4_en_get_msglevel,
+	.set_msglevel = mlx4_en_set_msglevel,
+	.get_coalesce = mlx4_en_get_coalesce,
+	.set_coalesce = mlx4_en_set_coalesce,
+	.get_pauseparam = mlx4_en_get_pauseparam,
+	.set_pauseparam = mlx4_en_set_pauseparam,
+	.get_ringparam = mlx4_en_get_ringparam,
+	.set_ringparam = mlx4_en_set_ringparam,
+	.get_flags = ethtool_op_get_flags,
+	.set_flags = ethtool_op_set_flags,
+};
+
+
+
+
+
diff --git a/sys/ofed/drivers/net/mlx4/en_frag.c b/sys/ofed/drivers/net/mlx4/en_frag.c
new file mode 100644
index 0000000..6c6bac4
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/en_frag.c
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include "mlx4_en.h"
+
+#include <net/ethernet.h>
+#include <netinet/ip.h>
+#include <machine/in_cksum.h>
+
+static struct mlx4_en_ipfrag *find_session(struct mlx4_en_rx_ring *ring,
+					   struct ip *iph)
+{
+	struct mlx4_en_ipfrag *cur = ring->ipfrag;
+	struct mlx4_en_ipfrag *end = cur + MLX4_EN_NUM_IPFRAG_SESSIONS;
+
+	/* Return the in-use slot whose keys match this IP header, if any */
+	for (; cur < end; cur++) {
+		if (cur->fragments == NULL)	/* free slot */
+			continue;
+		if (cur->id != iph->ip_id || cur->protocol != iph->ip_p)
+			continue;
+		if (cur->saddr != iph->ip_src.s_addr ||
+		    cur->daddr != iph->ip_dst.s_addr)
+			continue;
+		return cur;
+	}
+	return NULL;
+}
+
+static struct mlx4_en_ipfrag *start_session(struct mlx4_en_rx_ring *ring,
+					    struct ip *iph)
+{
+	struct mlx4_en_ipfrag *slot = NULL;
+	int i = 0;
+
+	/*
+	 * Pick the lowest-numbered free slot, i.e. one with no mbuf chain
+	 * attached.  The slot is returned untouched: filling in its keys
+	 * is left to the caller, and iph is not consulted here.
+	 */
+	while (slot == NULL && i < MLX4_EN_NUM_IPFRAG_SESSIONS) {
+		if (ring->ipfrag[i].fragments == NULL)
+			slot = &ring->ipfrag[i];
+		i++;
+	}
+
+	/* NULL when every slot is busy */
+	return slot;
+}
+
+/* Rewrite the head fragment's IP header and pass the chain to if_input. */
+static void flush_session(struct mlx4_en_priv *priv,
+			  struct mlx4_en_ipfrag *session,
+			  u16 more)
+{
+	struct mbuf *mb = session->fragments;
+	struct ip *iph = mb->m_pkthdr.header;
+	struct net_device *dev = mb->m_pkthdr.rcvif;
+	int skip = (char *)iph - mb->m_data;	/* bytes ahead of the IP header */
+
+	/* Update IP length and checksum; in_cksum_skip()'s len includes skip */
+	iph->ip_len = htons(session->total_len);
+	iph->ip_off = htons(more | (session->offset >> 3));
+	iph->ip_sum = 0;
+	iph->ip_sum = in_cksum_skip(mb, skip + iph->ip_hl * 4, skip);
+
+	dev->if_input(dev, mb);
+	session->fragments = NULL;	/* marks the slot as free again */
+	session->last = NULL;
+}
+
+/* Chain mb, trimmed to its trailing data_len bytes, onto an open session. */
+static inline void frag_append(struct mlx4_en_priv *priv,
+			       struct mlx4_en_ipfrag *session,
+			       struct mbuf *mb,
+			       unsigned int data_len)
+{
+	struct mbuf *tail = mb;
+
+	/* Grow the byte counts of the head mbuf and of the session */
+	session->fragments->m_pkthdr.len += data_len;
+	session->total_len += data_len;
+	m_adj(mb, mb->m_pkthdr.len - data_len);	/* drop the leading bytes */
+	/* Link mb in, then walk to its final mbuf to find the new tail */
+	session->last->m_next = mb;
+	while (tail->m_next != NULL)
+		tail = tail->m_next;
+	session->last = tail;
+}
+
+int mlx4_en_rx_frags(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring,
+		     struct mbuf *mb, struct mlx4_cqe *cqe)
+{
+	struct mlx4_en_ipfrag *session;
+	struct ip *iph;
+	u16 ip_len;
+	u16 ip_hlen;
+	int data_len;
+	u16 offset;
+	/* Merge one received IP fragment (mb) into a session on this ring */
+	iph = (struct ip *)(mtod(mb, char *) + ETHER_HDR_LEN);
+	mb->m_pkthdr.header = iph;
+	ip_len = ntohs(iph->ip_len);
+	ip_hlen = iph->ip_hl * 4;
+	data_len = ip_len - ip_hlen;
+	offset = ntohs(iph->ip_off);
+	offset &= IP_OFFMASK;
+	offset <<= 3;	/* the offset field counts 8-byte units */
+	/* in_cksum_skip() takes len counted from the start of the mbuf data */
+	session = find_session(ring, iph);
+	if (unlikely(in_cksum_skip(mb, ETHER_HDR_LEN + ip_hlen, ETHER_HDR_LEN))) {
+		if (session)
+			flush_session(priv, session, IP_MF);
+		return -EINVAL;	/* bad header checksum; mb was not queued */
+	}
+	if (session) {
+		if (unlikely(session->offset + session->total_len !=
+		    offset + ip_hlen ||
+		    session->total_len + mb->m_pkthdr.len > 65536)) {
+			flush_session(priv, session, IP_MF);
+			goto new_session;	/* mb then opens a fresh session */
+		}
+		frag_append(priv, session, mb, data_len);
+	} else {
+new_session:
+		session = start_session(ring, iph);
+		if (unlikely(!session))
+			return -ENOSPC;	/* no free slot; mb was not queued */
+
+		session->fragments = mb;
+		session->daddr = iph->ip_dst.s_addr;
+		session->saddr = iph->ip_src.s_addr;
+		session->id = iph->ip_id;
+		session->protocol = iph->ip_p;
+		session->total_len = ip_len;
+		session->offset = offset;
+		for (; mb->m_next != NULL; mb = mb->m_next);
+		session->last = mb;
+	}
+	if (!(ntohs(iph->ip_off) & IP_MF))	/* final fragment: deliver now */
+		flush_session(priv, session, 0);
+
+	return 0;
+}
+
+/* Flush every open session of ring, passing IP_MF to flush_session(). */
+void mlx4_en_flush_frags(struct mlx4_en_priv *priv,
+			 struct mlx4_en_rx_ring *ring)
+{
+	int slot;
+
+	/* Slots are visited in ascending index order */
+	for (slot = 0; slot < MLX4_EN_NUM_IPFRAG_SESSIONS; slot++) {
+		if (ring->ipfrag[slot].fragments == NULL)
+			continue;	/* nothing queued in this slot */
+		flush_session(priv, &ring->ipfrag[slot], IP_MF);
+	}
+}
diff --git a/sys/ofed/drivers/net/mlx4/en_main.c b/sys/ofed/drivers/net/mlx4/en_main.c
new file mode 100644
index 0000000..4d75a10
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/en_main.c
@@ -0,0 +1,384 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/netdevice.h>
+
+#include <linux/mlx4/driver.h>
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4_en.h"
+
+MODULE_AUTHOR("Liran Liss, Yevgeny Petrilin");
+MODULE_DESCRIPTION("Mellanox ConnectX HCA Ethernet driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION " ("DRV_RELDATE")");
+
+static const char mlx4_en_version[] =
+	DRV_NAME ": Mellanox ConnectX HCA Ethernet driver v"
+	DRV_VERSION " (" DRV_RELDATE ")\n";
+
+#define MLX4_EN_PARM_INT(X, def_val, desc) \
+	static unsigned int X = def_val;\
+	module_param(X , uint, 0444); \
+	MODULE_PARM_DESC(X, desc);
+/* The macro above defines a static unsigned int and its mode-0444 module_param */
+
+/*
+ * Device scope module parameters
+ */
+
+
+/* Enable RSS for incoming TCP traffic (0 disables it) */
+MLX4_EN_PARM_INT(tcp_rss, 1,
+		 "Enable RSS for incomming TCP traffic or disabled (0)");
+/* Enable RSS for incoming UDP traffic (0 disables it) */
+MLX4_EN_PARM_INT(udp_rss, 1,
+		 "Enable RSS for incomming UDP traffic or disabled (0)");
+
+/* Number of LRO sessions per Rx ring (rounded up to a power of two) */
+MLX4_EN_PARM_INT(num_lro, MLX4_EN_MAX_LRO_DESCRIPTORS,
+		 "Number of LRO sessions per ring or disabled (0)");
+
+/* Allow reassembly of fragmented IP packets */
+MLX4_EN_PARM_INT(ip_reasm, 1, "Allow reassembly of fragmented IP packets (!0)");
+
+/* Priority based flow control: one bit per priority, TX and RX separately */
+MLX4_EN_PARM_INT(pfctx, 0, "Priority based Flow Control policy on TX[7:0]."
+			   " Per priority bit mask");
+MLX4_EN_PARM_INT(pfcrx, 0, "Priority based Flow Control policy on RX[7:0]."
+			   " Per priority bit mask");
+
+static int mlx4_en_get_profile(struct mlx4_en_dev *mdev)
+{
+	struct mlx4_en_profile *cfg = &mdev->profile;
+	int port;
+
+	cfg->ip_reasm = ip_reasm;
+	cfg->num_lro = min_t(int, num_lro , MLX4_EN_MAX_LRO_DESCRIPTORS);
+	cfg->tcp_rss = tcp_rss;
+	cfg->udp_rss = udp_rss;
+	if (cfg->udp_rss && !mdev->dev->caps.udp_rss) {
+		mlx4_warn(mdev, "UDP RSS is not supported on this device.\n");
+		cfg->udp_rss = 0;
+	}
+	/* Per-port defaults; prof[] is indexed by port number, starting at 1 */
+	for (port = 1; port <= MLX4_MAX_PORTS; port++) {
+		cfg->prof[port].tx_ring_size = MLX4_EN_DEF_TX_RING_SIZE;
+		cfg->prof[port].rx_ring_size = MLX4_EN_DEF_RX_RING_SIZE;
+		cfg->prof[port].tx_pause = 1;
+		cfg->prof[port].rx_pause = 1;
+		cfg->prof[port].tx_ppp = pfctx;
+		cfg->prof[port].rx_ppp = pfcrx;
+		cfg->prof[port].tx_ring_num = MLX4_EN_NUM_HASH_RINGS + 1 +
+			(pfcrx ? MLX4_EN_NUM_PPP_RINGS : 0);
+	}
+	return 0;
+}
+
+static void *get_netdev(struct mlx4_dev *dev, void *ctx, u8 port)
+{
+	struct mlx4_en_dev *mdev = ctx;	/* opaque per-device context */
+
+	return mdev->pndev[port];	/* netdev stored for this port */
+}
+
+static void mlx4_en_event(struct mlx4_dev *dev, void *endev_ptr,
+			  enum mlx4_dev_event event, int port)
+{
+	struct mlx4_en_dev *mdev = endev_ptr;
+	struct mlx4_en_priv *priv;
+
+	/* Nothing to notify when no netdev exists for this port */
+	if (mdev->pndev[port] == NULL)
+		return;
+
+	priv = netdev_priv(mdev->pndev[port]);
+
+	if (event == MLX4_DEV_EVENT_PORT_UP ||
+	    event == MLX4_DEV_EVENT_PORT_DOWN) {
+		/*
+		 * Only record the event here and let linkstate_task act on
+		 * it from the workqueue, so the link state is never changed
+		 * directly from this callback.
+		 */
+		priv->link_state = event;
+		queue_work(mdev->workqueue, &priv->linkstate_task);
+	} else if (event == MLX4_DEV_EVENT_CATASTROPHIC_ERROR) {
+		mlx4_err(mdev, "Internal error detected, restarting device\n");
+	} else {
+		mlx4_warn(mdev, "Unhandled event: %d\n", event);
+	}
+}
+
+static void mlx4_en_remove(struct mlx4_dev *dev, void *endev_ptr)
+{
+	struct mlx4_en_dev *mdev = endev_ptr;
+	int i;
+
+	mutex_lock(&mdev->state_lock);
+	mdev->device_up = false;	/* queued tasks test this flag and back off */
+	mutex_unlock(&mdev->state_lock);
+
+	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH)
+		if (mdev->pndev[i])
+			mlx4_en_destroy_netdev(mdev->pndev[i]);
+	flush_workqueue(mdev->workqueue);	/* drain work before freeing state */
+	destroy_workqueue(mdev->workqueue);
+	mlx4_mr_free(dev, &mdev->mr);
+	iounmap(mdev->uar_map);	/* undo the ioremap() done in mlx4_en_add() */
+	mlx4_uar_free(dev, &mdev->priv_uar);
+	mlx4_pd_free(dev, mdev->priv_pdn);
+	sx_destroy(&mdev->state_lock.sx);
+	mtx_destroy(&mdev->uar_lock.m);
+	kfree(mdev);
+}
+
+static void *mlx4_en_add(struct mlx4_dev *dev)
+{
+	static int mlx4_en_version_printed;
+	struct mlx4_en_dev *mdev;
+	int i;
+	int err;
+	/* Build the per-device context for dev; any failure yields NULL */
+	if (!mlx4_en_version_printed) {
+		printk(KERN_INFO "%s", mlx4_en_version);
+		mlx4_en_version_printed++;
+	}
+
+	mdev = kzalloc(sizeof *mdev, GFP_KERNEL);
+	if (!mdev) {
+		dev_err(&dev->pdev->dev, "Device struct alloc failed, "
+			"aborting.\n");
+		err = -ENOMEM;
+		goto err_free_res;
+	}
+
+	if (mlx4_pd_alloc(dev, &mdev->priv_pdn))
+		goto err_free_dev;
+
+	if (mlx4_uar_alloc(dev, &mdev->priv_uar))
+		goto err_pd;
+
+	mtx_init(&mdev->uar_lock.m, "mlx4 uar", NULL, MTX_DEF);
+	mdev->uar_map = ioremap(mdev->priv_uar.pfn << PAGE_SHIFT, PAGE_SIZE);
+	if (!mdev->uar_map)
+		goto err_uar;
+
+	mdev->dev = dev;
+	mdev->dma_device = &(dev->pdev->dev);
+	mdev->pdev = dev->pdev;
+	mdev->device_up = false;
+
+	mdev->LSO_support = !!(dev->caps.flags & (1 << 15));
+	if (!mdev->LSO_support)
+		mlx4_warn(mdev, "LSO not supported, please upgrade to later "
+				"FW version to enable LSO\n");
+
+	if (mlx4_mr_alloc(mdev->dev, mdev->priv_pdn, 0, ~0ull,
+			 MLX4_PERM_LOCAL_WRITE |  MLX4_PERM_LOCAL_READ,
+			 0, 0, &mdev->mr)) {
+		mlx4_err(mdev, "Failed allocating memory region\n");
+		goto err_map;	/* the UAR is mapped by now, so unmap it */
+	}
+	if (mlx4_mr_enable(mdev->dev, &mdev->mr)) {
+		mlx4_err(mdev, "Failed enabling memory region\n");
+		goto err_mr;
+	}
+
+	/* Build device profile according to supplied module parameters */
+	err = mlx4_en_get_profile(mdev);
+	if (err) {
+		mlx4_err(mdev, "Bad module parameters, aborting.\n");
+		goto err_mr;
+	}
+
+	/* Count the Ethernet ports this device exposes */
+	mdev->port_cnt = 0;
+	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH)
+		mdev->port_cnt++;
+
+	/* If we did not receive an explicit number of Rx rings, default to
+	 * the number of completion vectors populated by the mlx4_core */
+	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) {
+		mlx4_info(mdev, "Using %d tx rings for port:%d\n",
+			  mdev->profile.prof[i].tx_ring_num, i);
+		mdev->profile.prof[i].rx_ring_num = rounddown_pow_of_two(
+			min_t(int, dev->caps.num_comp_vectors, MAX_RX_RINGS/2)) +
+		(mdev->profile.udp_rss ? rounddown_pow_of_two(
+			min_t(int, dev->caps.num_comp_vectors, MAX_RX_RINGS/2)) : 1);
+		mlx4_info(mdev, "Defaulting to %d rx rings for port:%d\n",
+			  mdev->profile.prof[i].rx_ring_num, i);
+	}
+
+	/* Create our own workqueue for reset/multicast tasks
+	 * Note: we cannot use the shared workqueue because of deadlocks caused
+	 *       by the rtnl lock */
+	mdev->workqueue = create_singlethread_workqueue("mlx4_en");
+	if (!mdev->workqueue) {
+		err = -ENOMEM;
+		goto err_mr;
+	}
+
+	/* At this stage all non-port specific tasks are complete:
+	 * mark the card state as up */
+	sx_init(&mdev->state_lock.sx, "mlxen state");
+	mdev->device_up = true;
+
+	/* Setup ports */
+
+	/* Create a netdev for each port */
+	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) {
+		mlx4_info(mdev, "Activating port:%d\n", i);
+		if (mlx4_en_init_netdev(mdev, i, &mdev->profile.prof[i])) {
+			mdev->pndev[i] = NULL;
+			goto err_free_netdev;
+		}
+	}
+	return mdev;
+
+err_free_netdev:
+	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) {
+		if (mdev->pndev[i])
+			mlx4_en_destroy_netdev(mdev->pndev[i]);
+	}
+	mutex_lock(&mdev->state_lock);
+	mdev->device_up = false;
+	mutex_unlock(&mdev->state_lock);
+	flush_workqueue(mdev->workqueue);
+	/* Stop event queue before we drop down to release shared SW state */
+	destroy_workqueue(mdev->workqueue);
+	sx_destroy(&mdev->state_lock.sx);	/* initialised only on this path */
+
+err_mr:
+	mlx4_mr_free(dev, &mdev->mr);
+err_map:
+	iounmap(mdev->uar_map);
+err_uar:
+	mtx_destroy(&mdev->uar_lock.m);
+	mlx4_uar_free(dev, &mdev->priv_uar);
+err_pd:
+	mlx4_pd_free(dev, mdev->priv_pdn);
+err_free_dev:
+	kfree(mdev);
+err_free_res:
+	return NULL;
+}
+
+enum mlx4_query_reply mlx4_en_query(void *endev_ptr, void *int_dev)
+{
+	struct mlx4_en_dev *mdev = endev_ptr;
+	int port = 1;
+
+	/* The reply is the 1-based port whose netdev is int_dev, if any */
+	while (port <= MLX4_MAX_PORTS && mdev->pndev[port] != int_dev)
+		port++;
+	if (port > MLX4_MAX_PORTS)
+		return MLX4_QUERY_NOT_MINE;
+	return port;
+}
+
+#if 0	/* compiled out: this PCI ID table is not built into the module */
+static struct pci_device_id mlx4_en_pci_table[] = {
+	{ PCI_VDEVICE(MELLANOX, 0x6340) }, /* MT25408 "Hermon" SDR */
+	{ PCI_VDEVICE(MELLANOX, 0x634a) }, /* MT25408 "Hermon" DDR */
+	{ PCI_VDEVICE(MELLANOX, 0x6354) }, /* MT25408 "Hermon" QDR */
+	{ PCI_VDEVICE(MELLANOX, 0x6732) }, /* MT25408 "Hermon" DDR PCIe gen2 */
+	{ PCI_VDEVICE(MELLANOX, 0x673c) }, /* MT25408 "Hermon" QDR PCIe gen2 */
+	{ PCI_VDEVICE(MELLANOX, 0x6368) }, /* MT25408 "Hermon" EN 10GigE */
+	{ PCI_VDEVICE(MELLANOX, 0x6750) }, /* MT25408 "Hermon" EN 10GigE PCIe gen2 */
+	{ PCI_VDEVICE(MELLANOX, 0x6372) }, /* MT25458 ConnectX EN 10GBASE-T 10GigE */
+	{ PCI_VDEVICE(MELLANOX, 0x675a) }, /* MT25458 ConnectX EN 10GBASE-T+Gen2 10GigE */
+	{ PCI_VDEVICE(MELLANOX, 0x6764) }, /* MT26468 ConnectX EN 10GigE PCIe gen2 */
+	{ PCI_VDEVICE(MELLANOX, 0x6746) }, /* MT26438 ConnectX VPI PCIe 2.0 5GT/s - IB QDR / 10GigE Virt+ */
+	{ PCI_VDEVICE(MELLANOX, 0x676e) }, /* MT26478 ConnectX EN 40GigE PCIe 2.0 5GT/s */
+	{ PCI_VDEVICE(MELLANOX, 0x6778) }, /* MT26488 ConnectX VPI PCIe 2.0 5GT/s - IB DDR / 10GigE Virt+ */
+	{ PCI_VDEVICE(MELLANOX, 0x1000) },
+	{ PCI_VDEVICE(MELLANOX, 0x1001) },
+	{ PCI_VDEVICE(MELLANOX, 0x1002) },
+	{ PCI_VDEVICE(MELLANOX, 0x1003) },
+	{ PCI_VDEVICE(MELLANOX, 0x1004) },
+	{ PCI_VDEVICE(MELLANOX, 0x1005) },
+	{ PCI_VDEVICE(MELLANOX, 0x1006) },
+	{ PCI_VDEVICE(MELLANOX, 0x1007) },
+	{ PCI_VDEVICE(MELLANOX, 0x1008) },
+	{ PCI_VDEVICE(MELLANOX, 0x1009) },
+	{ PCI_VDEVICE(MELLANOX, 0x100a) },
+	{ PCI_VDEVICE(MELLANOX, 0x100b) },
+	{ PCI_VDEVICE(MELLANOX, 0x100c) },
+	{ PCI_VDEVICE(MELLANOX, 0x100d) },
+	{ PCI_VDEVICE(MELLANOX, 0x100e) },
+	{ PCI_VDEVICE(MELLANOX, 0x100f) },
+	{ 0, }	/* all-zero entry ends the table */
+};
+
+MODULE_DEVICE_TABLE(pci, mlx4_en_pci_table);
+#endif
+
+static struct mlx4_interface mlx4_en_interface = {
+	.protocol	= MLX4_PROT_EN,
+	.add		= mlx4_en_add,
+	.remove		= mlx4_en_remove,
+	.event		= mlx4_en_event,
+	.query		= mlx4_en_query,
+	.get_prot_dev	= get_netdev,
+};
+
+static int __init mlx4_en_init(void)
+{
+	return mlx4_register_interface(&mlx4_en_interface);	/* result is passed straight back */
+}
+
+static void __exit mlx4_en_cleanup(void)
+{
+	mlx4_unregister_interface(&mlx4_en_interface);	/* counterpart of mlx4_en_init() */
+}
+
+module_init(mlx4_en_init);
+module_exit(mlx4_en_cleanup);
+
+#undef MODULE_VERSION
+#include <sys/module.h>
+static int
+mlxen_evhand(module_t mod, int event, void *arg)
+{
+        return (0);	/* every module event is accepted; no work is done here */
+}
+static moduledata_t mlxen_mod = {	/* module descriptor handed to DECLARE_MODULE below */
+        .name = "mlxen",
+	.evhand = mlxen_evhand,
+};
+DECLARE_MODULE(mlxen, mlxen_mod, SI_SUB_SMP, SI_ORDER_ANY);
+MODULE_DEPEND(mlxen, mlx4, 1, 1, 1);	/* declares the dependency on the mlx4 module */
diff --git a/sys/ofed/drivers/net/mlx4/en_netdev.c b/sys/ofed/drivers/net/mlx4/en_netdev.c
new file mode 100644
index 0000000..531f46f
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/en_netdev.c
@@ -0,0 +1,1511 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include "mlx4_en.h"
+
+#include <linux/mlx4/driver.h>
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/cmd.h>
+#include <linux/mlx4/cq.h>
+
+#include <linux/delay.h>
+#include <net/ethernet.h>
+#include <net/if_vlan_var.h>
+#include <sys/sockio.h>
+
+static void mlx4_en_sysctl_stat(struct mlx4_en_priv *priv);
+
+static void mlx4_en_vlan_rx_add_vid(void *arg, struct net_device *dev, u16 vid)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int idx;
+	u32 field;	/* one bit per VID, 32 per word: a u8 lost bits 8..31 */
+
+	if ((vid == 0) || (vid > 4095))    /* Invalid */
+		return;
+
+	en_dbg(HW, priv, "adding VLAN:%d\n", vid);
+	/* NOTE(review): confirm the priv->vlan_* arrays are 32 bits wide */
+	spin_lock(&priv->vlan_lock);
+	priv->vlgrp_modified = true;
+	idx = vid >> 5;
+	field = 1U << (vid & 0x1f);
+	if (priv->vlan_unregister[idx] & field)	/* cancel a pending kill */
+		priv->vlan_unregister[idx] &= ~field;
+	else
+		priv->vlan_register[idx] |= field;
+	priv->vlans[idx] |= field;
+	spin_unlock(&priv->vlan_lock);
+}
+
+static void mlx4_en_vlan_rx_kill_vid(void *arg, struct net_device *dev, u16 vid)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int idx;
+	u32 field;	/* one bit per VID, 32 per word: a u8 lost bits 8..31 */
+
+	if ((vid == 0) || (vid > 4095))    /* Invalid */
+		return;
+	en_dbg(HW, priv, "Killing VID:%d\n", vid);
+	spin_lock(&priv->vlan_lock);	/* NOTE(review): confirm vlan_* arrays are 32-bit */
+	priv->vlgrp_modified = true;
+	idx = vid >> 5;
+	field = 1U << (vid & 0x1f);
+	if (priv->vlan_register[idx] & field)	/* cancel a pending add */
+		priv->vlan_register[idx] &= ~field;
+	else
+		priv->vlan_unregister[idx] |= field;
+	priv->vlans[idx] &= ~field;
+	spin_unlock(&priv->vlan_lock);
+}
+
+u64 mlx4_en_mac_to_u64(u8 *addr)
+{
+	u8 *end = addr + ETHER_ADDR_LEN;
+	u64 mac = 0;
+
+	/* Fold the bytes in order: addr[0] ends up most significant */
+	while (addr < end)
+		mac = (mac << 8) | *addr++;
+
+	return mac;
+}
+
+static int mlx4_en_cache_mclist(struct net_device *dev, u64 **mcaddrp)
+{
+	struct ifmultiaddr *ifma;
+	u64 *mcaddr;
+	int cnt;
+	int i;
+	/* Snapshot dev's Ethernet multicast list; the caller kfree()s *mcaddrp */
+	*mcaddrp = NULL;
+restart:
+	cnt = 0;
+	if_maddr_rlock(dev);	/* pass 1: count the matching entries */
+	TAILQ_FOREACH(ifma, &dev->if_multiaddrs, ifma_link) {
+		if (ifma->ifma_addr->sa_family != AF_LINK)
+			continue;
+		if (((struct sockaddr_dl *)ifma->ifma_addr)->sdl_alen !=
+		    ETHER_ADDR_LEN)
+			continue;
+		cnt++;
+	}
+	if_maddr_runlock(dev);
+	if (cnt == 0)
+		return (0);
+	mcaddr = kmalloc(sizeof(*mcaddr) * cnt, GFP_KERNEL);	/* lock is not held here */
+	if (mcaddr == NULL)
+		return (0);	/* allocation failure is reported as an empty list */
+	i = 0;
+	if_maddr_rlock(dev);	/* pass 2: copy the addresses out */
+	TAILQ_FOREACH(ifma, &dev->if_multiaddrs, ifma_link) {
+		if (ifma->ifma_addr->sa_family != AF_LINK)
+			continue;
+		if (((struct sockaddr_dl *)ifma->ifma_addr)->sdl_alen !=
+		    ETHER_ADDR_LEN)
+			continue;
+		/* Make sure the list didn't grow. */
+		if (i == cnt) {
+			if_maddr_runlock(dev);
+			kfree(mcaddr);
+			goto restart;
+		}
+		mcaddr[i++] = mlx4_en_mac_to_u64(
+		    LLADDR((struct sockaddr_dl *)ifma->ifma_addr));
+	}
+	if_maddr_runlock(dev);
+	*mcaddrp = mcaddr;
+	return (i);	/* may be below cnt if the list shrank in between */
+}
+
+
+static void mlx4_en_set_multicast(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	/* The filters are programmed by mcast_task on the driver's
+	 * workqueue; while the port is down the request is dropped */
+	if (priv->port_up)
+		queue_work(priv->mdev->workqueue, &priv->mcast_task);
+}
+
+static void mlx4_en_do_set_multicast(struct work_struct *work)
+{
+	struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv,
+						 mcast_task);
+	struct net_device *dev = priv->dev;
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err;
+	/* Work item: program promiscuous, VLAN and multicast filters from if_flags */
+	mutex_lock(&mdev->state_lock);
+	if (!mdev->device_up) {
+		en_dbg(HW, priv, "Card is not up, "
+				 "ignoring multicast change.\n");
+		goto out;
+	}
+	if (!priv->port_up) {
+		en_dbg(HW, priv, "Port is down, "
+				 "ignoring  multicast change.\n");
+		goto out;
+	}
+
+	/*
+	 * Promiscuous mode: disable all filters
+	 */
+
+	if (dev->if_flags & IFF_PROMISC) {
+		if (!(priv->flags & MLX4_EN_FLAG_PROMISC)) {
+			priv->flags |= MLX4_EN_FLAG_PROMISC;
+
+			/* Enable promiscuous mode */
+			err = mlx4_SET_PORT_qpn_calc(mdev->dev, priv->port,
+						     priv->base_qpn, 1);
+			if (err)
+				en_err(priv, "Failed enabling "
+					     "promiscous mode\n");
+
+			/* Disable port multicast filter (unconditionally) */
+			err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0,
+						  0, MLX4_MCAST_DISABLE);
+			if (err)
+				en_err(priv, "Failed disabling "
+					     "multicast filter\n");
+
+			/* Disable port VLAN filter */
+			err = mlx4_SET_VLAN_FLTR(mdev->dev, priv->port, NULL);
+			if (err)
+				en_err(priv, "Failed disabling VLAN filter\n");
+		}
+		goto out;	/* already promiscuous, or just switched: nothing else to program */
+	}
+
+	/*
+	 * Not in promiscuous mode
+	 */
+
+	if (priv->flags & MLX4_EN_FLAG_PROMISC) {
+		priv->flags &= ~MLX4_EN_FLAG_PROMISC;
+
+		/* Disable promiscuous mode */
+		err = mlx4_SET_PORT_qpn_calc(mdev->dev, priv->port,
+					     priv->base_qpn, 0);
+		if (err)
+			en_err(priv, "Failed disabling promiscous mode\n");
+
+		/* Enable port VLAN filter */
+		err = mlx4_SET_VLAN_FLTR(mdev->dev, priv->port, priv->vlans);
+		if (err)
+			en_err(priv, "Failed enabling VLAN filter\n");
+	}
+
+	/* Enable/disable the multicast filter according to IFF_ALLMULTI */
+	if (dev->if_flags & IFF_ALLMULTI) {
+		err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0,
+					  0, MLX4_MCAST_DISABLE);
+		if (err)
+			en_err(priv, "Failed disabling multicast filter\n");
+	} else {
+		u64 *mcaddr;
+		int mccount;
+		int i;
+
+		err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0,
+					  0, MLX4_MCAST_DISABLE);
+		if (err)
+			en_err(priv, "Failed disabling multicast filter\n");
+
+		/* Flush mcast filter and init it with broadcast address */
+		mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, ETH_BCAST,
+				    1, MLX4_MCAST_CONFIG);
+
+		/* Update multicast list - we cache all addresses so they won't
+		 * change while HW is updated holding the command semaphore */
+		mccount = mlx4_en_cache_mclist(dev, &mcaddr);
+		for (i = 0; i < mccount; i++)
+			mlx4_SET_MCAST_FLTR(mdev->dev, priv->port,
+					    mcaddr[i], 0, MLX4_MCAST_CONFIG);
+		err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0,
+					  0, MLX4_MCAST_ENABLE);
+		if (err)
+			en_err(priv, "Failed enabling multicast filter\n");
+
+		kfree(mcaddr);	/* mcaddr is NULL when nothing was cached */
+	}
+out:
+	mutex_unlock(&mdev->state_lock);
+}
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+static void mlx4_en_netpoll(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_cq *cq;
+	unsigned long flags;
+	int i;
+	/* Process each RX completion queue once while holding its lock */
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		cq = &priv->rx_cq[i];
+		spin_lock_irqsave(&cq->lock, flags);
+		napi_synchronize(&cq->napi);
+		if (priv->rx_ring[i].use_frags)	/* picks the handler matching the ring's buffer mode */
+			mlx4_en_process_rx_cq(dev, cq, 0);
+		else
+			mlx4_en_process_rx_cq_mb(dev, cq, 0);
+		spin_unlock_irqrestore(&cq->lock, flags);
+	}
+}
+#endif
+
+static void mlx4_en_watchdog_timeout(void *arg)
+{
+	struct mlx4_en_priv *priv = arg;
+
+	en_dbg(DRV, priv, "Scheduling watchdog\n");
+	queue_work(priv->mdev->workqueue, &priv->watchdog_task);
+	if (!priv->port_up)
+		return;		/* port is down: do not re-arm the callout */
+	callout_reset(&priv->watchdog_timer, MLX4_EN_WATCHDOG_TIMEOUT,
+	    mlx4_en_watchdog_timeout, arg);
+}
+
+
+/* XXX This clears user settings in too many cases. */
+static void mlx4_en_set_default_moderation(struct mlx4_en_priv *priv)
+{
+	struct mlx4_en_cq *cq;
+	int i;
+
+	/* If we haven't received a specific coalescing setting
+	 * (module param), we set the moderation parameters as follows:
+	 * - moder_cnt is set to the number of mtu sized packets to
+	 *   satisfy our coalescing target.
+	 * - moder_time is set to a fixed value.
+	 */
+	priv->rx_frames = MLX4_EN_RX_COAL_TARGET / priv->dev->if_mtu + 1;
+	priv->rx_usecs = MLX4_EN_RX_COAL_TIME;
+	en_dbg(INTR, priv, "Default coalesing params for mtu:%ld - "
+			   "rx_frames:%d rx_usecs:%d\n",
+		 priv->dev->if_mtu, priv->rx_frames, priv->rx_usecs);
+
+	/* Setup cq moderation params */
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		cq = &priv->rx_cq[i];
+		cq->moder_cnt = priv->rx_frames;
+		cq->moder_time = priv->rx_usecs;
+	}
+	/* TX completion queues always get the fixed TX defaults */
+	for (i = 0; i < priv->tx_ring_num; i++) {
+		cq = &priv->tx_cq[i];
+		cq->moder_cnt = MLX4_EN_TX_COAL_PKTS;
+		cq->moder_time = MLX4_EN_TX_COAL_TIME;
+	}
+
+	/* Reset auto-moderation params */
+	priv->pkt_rate_low = MLX4_EN_RX_RATE_LOW;
+	priv->rx_usecs_low = MLX4_EN_RX_COAL_TIME_LOW;
+	priv->pkt_rate_high = MLX4_EN_RX_RATE_HIGH;
+	priv->rx_usecs_high = MLX4_EN_RX_COAL_TIME_HIGH;
+	priv->sample_interval = MLX4_EN_SAMPLE_INTERVAL;
+	priv->adaptive_rx_coal = 1;	/* adaptive RX moderation is switched back on */
+	priv->last_moder_time = MLX4_EN_AUTO_CONF;
+	priv->last_moder_jiffies = 0;	/* 0 makes the next auto-moderation run a baseline sample */
+	priv->last_moder_packets = 0;
+	priv->last_moder_tx_packets = 0;
+	priv->last_moder_bytes = 0;
+}
+
+static void mlx4_en_auto_moderation(struct mlx4_en_priv *priv)
+{
+	unsigned long period = (unsigned long) (jiffies - priv->last_moder_jiffies);
+	struct mlx4_en_cq *cq;
+	unsigned long packets;
+	unsigned long rate;
+	unsigned long avg_pkt_size;
+	unsigned long rx_packets;
+	unsigned long rx_bytes;
+	unsigned long tx_packets;
+	unsigned long tx_pkt_diff;
+	unsigned long rx_pkt_diff;
+	int moder_time;
+	int i, err;
+	/* Re-tune RX cq moder_time from the packet rates seen since the last sample */
+	if (!priv->adaptive_rx_coal || period < priv->sample_interval * HZ)
+		return;
+
+	spin_lock(&priv->stats_lock);
+	rx_packets = priv->dev->if_ipackets;
+	rx_bytes = priv->dev->if_ibytes;
+	tx_packets = priv->dev->if_opackets;
+	spin_unlock(&priv->stats_lock);
+
+	if (!priv->last_moder_jiffies || !period)	/* no baseline yet: only record one */
+		goto out;
+
+	tx_pkt_diff = ((unsigned long) (tx_packets -
+					priv->last_moder_tx_packets));
+	rx_pkt_diff = ((unsigned long) (rx_packets -
+					priv->last_moder_packets));
+	packets = max(tx_pkt_diff, rx_pkt_diff);
+	rate = packets * HZ / period;
+	avg_pkt_size = packets ? ((unsigned long) (rx_bytes -
+				 priv->last_moder_bytes)) / packets : 0;
+
+	/* Apply auto-moderation only when packet rate exceeds a rate that
+	 * it matters */
+	if (rate > MLX4_EN_RX_RATE_THRESH) {
+		/* If tx and rx packet rates are not balanced, assume that
+		 * traffic is mainly BW bound and apply maximum moderation.
+		 * Otherwise, moderate according to packet rate */
+		if (2 * tx_pkt_diff > 3 * rx_pkt_diff ||
+		    2 * rx_pkt_diff > 3 * tx_pkt_diff) {
+			moder_time = priv->rx_usecs_high;
+		} else {
+			if (rate < priv->pkt_rate_low ||
+			    avg_pkt_size < MLX4_EN_AVG_PKT_SMALL)
+				moder_time = priv->rx_usecs_low;
+			else if (rate > priv->pkt_rate_high)
+				moder_time = priv->rx_usecs_high;
+			else	/* linear interpolation between the low and high settings */
+				moder_time = (rate - priv->pkt_rate_low) *
+					(priv->rx_usecs_high - priv->rx_usecs_low) /
+					(priv->pkt_rate_high - priv->pkt_rate_low) +
+					priv->rx_usecs_low;
+		}
+	} else {
+		/* When packet rate is low, use default moderation rather than
+		 * 0 to prevent interrupt storms if traffic suddenly increases */
+		moder_time = priv->rx_usecs;
+	}
+
+	en_dbg(INTR, priv, "tx rate:%lu rx_rate:%lu\n",
+	       tx_pkt_diff * HZ / period, rx_pkt_diff * HZ / period);
+
+	en_dbg(INTR, priv, "Rx moder_time changed from:%d to %d period:%lu "
+	       "[jiff] packets:%lu avg_pkt_size:%lu rate:%lu [p/s])\n",
+		 priv->last_moder_time, moder_time, period, packets,
+		 avg_pkt_size, rate);
+	/* Reprogram the RX completion queues only when the value changed */
+	if (moder_time != priv->last_moder_time) {
+		priv->last_moder_time = moder_time;
+		for (i = 0; i < priv->rx_ring_num; i++) {
+			cq = &priv->rx_cq[i];
+			cq->moder_time = moder_time;
+			err = mlx4_en_set_cq_moder(priv, cq);
+			if (err) {
+				en_err(priv, "Failed modifying moderation for cq:%d\n", i);
+				break;
+			}
+		}
+	}
+
+out:
+	priv->last_moder_packets = rx_packets;	/* baseline for the next sample */
+	priv->last_moder_tx_packets = tx_packets;
+	priv->last_moder_bytes = rx_bytes;
+	priv->last_moder_jiffies = jiffies;
+}
+
+static void mlx4_en_handle_vlans(struct mlx4_en_priv *priv)
+{
+	u32 vlan_register[VLAN_FLTR_SIZE];	/* 32 VIDs per word: u8 lost bits 8..31 */
+	u32 vlan_unregister[VLAN_FLTR_SIZE];
+	int i, j, idx;
+	u16 vid;
+	/* Apply the VLAN add/kill requests queued since the previous run */
+	/* cache the vlan data for processing
+	 * done under lock to avoid changes during work */
+	spin_lock(&priv->vlan_lock);
+	for (i = 0; i < VLAN_FLTR_SIZE; i++) {
+		vlan_register[i] = priv->vlan_register[i];
+		priv->vlan_register[i] = 0;
+		vlan_unregister[i] = priv->vlan_unregister[i];
+		priv->vlan_unregister[i] = 0;
+	}
+	priv->vlgrp_modified = false;
+	spin_unlock(&priv->vlan_lock);
+
+	/* Configure the vlan filter
+	 * The vlgrp is updated with all the vids that need to be allowed */
+	if (mlx4_SET_VLAN_FLTR(priv->mdev->dev, priv->port, priv->vlans))
+		en_err(priv, "Failed configuring VLAN filter\n");
+
+	/* Configure the VLAN table */
+	for (i = 0; i < VLAN_FLTR_SIZE; i++) {
+		for (j = 0; j < 32; j++) {
+			vid = (i << 5) + j;
+			if (vlan_register[i] & (1U << j))
+				if (mlx4_register_vlan(priv->mdev->dev, priv->port, vid, &idx))
+					en_dbg(HW, priv, "failed registering vlan %d\n", vid);
+			if (vlan_unregister[i] & (1U << j)) {
+				if (!mlx4_find_cached_vlan(priv->mdev->dev, priv->port, vid, &idx))
+					mlx4_unregister_vlan(priv->mdev->dev, priv->port, idx);
+				else
+					en_dbg(HW, priv, "could not find vid %d in cache\n", vid);
+			}
+		}
+	}
+}
+
+static void mlx4_en_do_get_stats(struct work_struct *work)
+{
+	struct delayed_work *delay = to_delayed_work(work);
+	struct mlx4_en_priv *priv = container_of(delay, struct mlx4_en_priv,
+						 stats_task);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err;
+	/* Periodic task: dump stats, then apply pending VLAN and moderation work */
+	err = mlx4_en_DUMP_ETH_STATS(mdev, priv->port, 0);
+	if (err)
+		en_dbg(HW, priv, "Could not update stats \n");
+
+
+	mutex_lock(&mdev->state_lock);
+	if (mdev->device_up) {
+		if (priv->port_up) {
+			if (priv->vlgrp_modified)
+				mlx4_en_handle_vlans(priv);
+
+			mlx4_en_auto_moderation(priv);
+		}
+		/* Re-arm: the task keeps rescheduling itself while the device is up */
+		queue_delayed_work(mdev->workqueue, &priv->stats_task, STATS_DELAY);
+	}
+	if (mdev->mac_removed[MLX4_MAX_PORTS + 1 - priv->port]) {
+		panic("mlx4_en_do_get_stats: Unexpected mac removed for %d\n",
+		    priv->port);
+		mdev->mac_removed[MLX4_MAX_PORTS + 1 - priv->port] = 0;	/* NOTE(review): looks unreachable after panic() */
+	}
+	mutex_unlock(&mdev->state_lock);
+}
+
+/*
+ * Link-state work item (priv->linkstate_task): when the recorded link state
+ * differs from the last one reported, forward the change to the network
+ * stack, then remember the new state.
+ */
+static void mlx4_en_linkstate(struct work_struct *work)
+{
+	struct mlx4_en_priv *priv;
+	struct mlx4_en_dev *mdev;
+	int new_state;
+
+	priv = container_of(work, struct mlx4_en_priv, linkstate_task);
+	mdev = priv->mdev;
+	/* Sampled once, before the lock is taken. */
+	new_state = priv->link_state;
+
+	mutex_lock(&mdev->state_lock);
+	/* Only an observable change is reported (and "up" is also logged). */
+	if (new_state != priv->last_link_state) {
+		if (new_state != MLX4_DEV_EVENT_PORT_DOWN) {
+			en_info(priv, "Link Up\n");
+			if_link_state_change(priv->dev, LINK_STATE_UP);
+		} else {
+			if_link_state_change(priv->dev, LINK_STATE_DOWN);
+		}
+	}
+	priv->last_link_state = new_state;
+	mutex_unlock(&mdev->state_lock);
+}
+
+
+/*
+ * Bring a port up.
+ *
+ * Activates the RX rings and their CQs, configures RSS steering, activates
+ * the TX CQs and rings, programs the port (general settings, default QP
+ * number, MAC), initialises it, derives the offload settings from
+ * if_capenable, and finally marks the interface running and arms the
+ * watchdog callout.
+ *
+ * Returns 0 on success (or when the port is already up).  On failure the
+ * error of the failing step is returned after the steps completed so far
+ * have been undone.
+ */
+int mlx4_en_start_port(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_cq *cq;
+	struct mlx4_en_tx_ring *tx_ring;
+	int rx_index = 0;	/* number of RX CQs fully activated */
+	int tx_index = 0;	/* number of TX CQ/ring pairs fully activated */
+	int err = 0;
+	int i;
+	int j;
+
+	if (priv->port_up) {
+		en_dbg(DRV, priv, "start port called while port already up\n");
+		return 0;
+	}
+
+	/* Calculate Rx buf size */
+	dev->if_mtu = min(dev->if_mtu, priv->max_mtu);
+	mlx4_en_calc_rx_buf(dev);
+	en_dbg(DRV, priv, "Rx buf size:%d\n", priv->rx_mb_size);
+
+	/* Configure rx cq's and rings */
+	err = mlx4_en_activate_rx_rings(priv);
+	if (err) {
+		en_err(priv, "Failed to activate RX rings\n");
+		return err;
+	}
+
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		cq = &priv->rx_cq[i];
+
+		err = mlx4_en_activate_cq(priv, cq);
+		if (err) {
+			en_err(priv, "Failed activating Rx CQ\n");
+			goto cq_err;
+		}
+		for (j = 0; j < cq->size; j++)
+			cq->buf[j].owner_sr_opcode = MLX4_CQE_OWNER_MASK;
+		err = mlx4_en_set_cq_moder(priv, cq);
+		if (err) {
+			en_err(priv, "Failed setting cq moderation parameters\n");
+			/* Not yet counted in rx_index, so undo it here. */
+			mlx4_en_deactivate_cq(priv, cq);
+			goto cq_err;
+		}
+		mlx4_en_arm_cq(priv, cq);
+		priv->rx_ring[i].cqn = cq->mcq.cqn;
+		++rx_index;
+	}
+
+	err = mlx4_en_config_rss_steer(priv);
+	if (err) {
+		en_err(priv, "Failed configuring rss steering\n");
+		goto cq_err;
+	}
+
+	/* Configure tx cq's and rings */
+	for (i = 0; i < priv->tx_ring_num; i++) {
+		/* Configure cq */
+		cq = &priv->tx_cq[i];
+		err = mlx4_en_activate_cq(priv, cq);
+		if (err) {
+			en_err(priv, "Failed allocating Tx CQ\n");
+			goto tx_err;
+		}
+		err = mlx4_en_set_cq_moder(priv, cq);
+		if (err) {
+			en_err(priv, "Failed setting cq moderation parameters\n");
+			/* Not yet counted in tx_index, so undo it here. */
+			mlx4_en_deactivate_cq(priv, cq);
+			goto tx_err;
+		}
+		en_dbg(DRV, priv, "Resetting index of collapsed CQ:%d to -1\n", i);
+		cq->buf->wqe_index = cpu_to_be16(0xffff);
+
+		/* Configure ring */
+		tx_ring = &priv->tx_ring[i];
+		err = mlx4_en_activate_tx_ring(priv, tx_ring, cq->mcq.cqn);
+		if (err) {
+			en_err(priv, "Failed allocating Tx ring\n");
+			mlx4_en_deactivate_cq(priv, cq);
+			goto tx_err;
+		}
+		/* Set initial ownership of all Tx TXBBs to SW (1) */
+		for (j = 0; j < tx_ring->buf_size; j += STAMP_STRIDE)
+			*((u32 *) (tx_ring->buf + j)) = 0xffffffff;
+		++tx_index;
+	}
+
+	/* Configure port */
+	err = mlx4_SET_PORT_general(mdev->dev, priv->port,
+				    priv->rx_mb_size + ETHER_CRC_LEN,
+				    priv->prof->tx_pause,
+				    priv->prof->tx_ppp,
+				    priv->prof->rx_pause,
+				    priv->prof->rx_ppp);
+	if (err) {
+		en_err(priv, "Failed setting port general configurations "
+			     "for port %d, with error %d\n", priv->port, err);
+		goto tx_err;
+	}
+	/* Set default qp number */
+	err = mlx4_SET_PORT_qpn_calc(mdev->dev, priv->port, priv->base_qpn, 0);
+	if (err) {
+		en_err(priv, "Failed setting default qp numbers\n");
+		goto tx_err;
+	}
+	/* Set port mac number */
+	en_dbg(DRV, priv, "Setting mac for port %d\n", priv->port);
+	err = mlx4_register_mac(mdev->dev, priv->port,
+				mlx4_en_mac_to_u64(IF_LLADDR(dev)),
+				&priv->mac_index);
+	if (err) {
+		en_err(priv, "Failed setting port mac\n");
+		goto tx_err;
+	}
+	mdev->mac_removed[priv->port] = 0;
+
+	/* Init port */
+	en_dbg(HW, priv, "Initializing port\n");
+	err = mlx4_INIT_PORT(mdev->dev, priv->port);
+	if (err) {
+		en_err(priv, "Failed Initializing port\n");
+		goto mac_err;
+	}
+
+	/* Set the various hardware offload abilities */
+	dev->if_hwassist = 0;
+	if (dev->if_capenable & IFCAP_TSO4)
+		dev->if_hwassist |= CSUM_TSO;
+	if (dev->if_capenable & IFCAP_TXCSUM)
+		dev->if_hwassist |= (CSUM_TCP | CSUM_UDP | CSUM_IP);
+	if (dev->if_capenable & IFCAP_RXCSUM)
+		priv->rx_csum = 1;
+	else
+		priv->rx_csum = 0;
+
+	priv->port_up = true;
+
+	/* Populate multicast list */
+	mlx4_en_set_multicast(dev);
+
+	/* Enable the queues. */
+	atomic_clear_int(&dev->if_drv_flags, IFF_DRV_OACTIVE);
+	atomic_set_int(&dev->if_drv_flags, IFF_DRV_RUNNING);
+
+	callout_reset(&priv->watchdog_timer, MLX4_EN_WATCHDOG_TIMEOUT,
+	    mlx4_en_watchdog_timeout, priv);
+
+	return 0;
+
+	/* Error unwinding: each label undoes the steps completed before it. */
+mac_err:
+	mlx4_unregister_mac(mdev->dev, priv->port, priv->mac_index);
+tx_err:
+	while (tx_index--) {
+		mlx4_en_deactivate_tx_ring(priv, &priv->tx_ring[tx_index]);
+		mlx4_en_deactivate_cq(priv, &priv->tx_cq[tx_index]);
+	}
+
+	mlx4_en_release_rss_steer(priv);
+cq_err:
+	while (rx_index--)
+		mlx4_en_deactivate_cq(priv, &priv->rx_cq[rx_index]);
+	for (i = 0; i < priv->rx_ring_num; i++)
+		mlx4_en_deactivate_rx_ring(priv, &priv->rx_ring[i]);
+
+	return err; /* need to close devices */
+}
+
+
+/*
+ * Take a port down: mark it inactive, unregister its MAC, deactivate the TX
+ * rings and CQs and free their buffers, release the RSS steering resources,
+ * deactivate the RX rings and CQs, close the port, stop the watchdog callout
+ * and clear IFF_DRV_RUNNING.  Does nothing if the port is already down.
+ */
+void mlx4_en_stop_port(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int i;
+
+	if (!priv->port_up) {
+		en_dbg(DRV, priv, "stop port called while port already down\n");
+		return;
+	}
+
+	/* Set port as not active */
+	priv->port_up = false;
+
+	/* Unregister Mac address for the port */
+	mlx4_unregister_mac(mdev->dev, priv->port, priv->mac_index);
+	/* Record the removal; the flag is cleared again in mlx4_en_start_port(). */
+	mdev->mac_removed[priv->port] = 1;
+
+	/* Free TX Rings */
+	for (i = 0; i < priv->tx_ring_num; i++) {
+		mlx4_en_deactivate_tx_ring(priv, &priv->tx_ring[i]);
+		mlx4_en_deactivate_cq(priv, &priv->tx_cq[i]);
+	}
+	/* NOTE(review): presumably gives in-flight work time to drain - confirm. */
+	msleep(10);
+
+	for (i = 0; i < priv->tx_ring_num; i++)
+		mlx4_en_free_tx_buf(dev, &priv->tx_ring[i]);
+
+	/* Free RSS qps */
+	mlx4_en_release_rss_steer(priv);
+
+	/* Free RX Rings */
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		mlx4_en_deactivate_rx_ring(priv, &priv->rx_ring[i]);
+		mlx4_en_deactivate_cq(priv, &priv->rx_cq[i]);
+	}
+
+	/* close port*/
+	mlx4_CLOSE_PORT(mdev->dev, priv->port);
+
+	callout_stop(&priv->watchdog_timer);
+
+	atomic_clear_int(&dev->if_drv_flags, IFF_DRV_RUNNING);
+}
+
+/*
+ * Watchdog work item (priv->watchdog_task).
+ *
+ * When at least one blocked TX ring has exceeded MLX4_EN_WATCHDOG_TIMEOUT,
+ * count a TX timeout and restart the port (stop followed by start) under
+ * mdev->state_lock.
+ */
+static void mlx4_en_restart(struct work_struct *work)
+{
+	struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv,
+						 watchdog_task);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct net_device *dev = priv->dev;
+	struct mlx4_en_tx_ring *ring;
+	int timed_out = 0;
+	int i;
+
+	/* Nothing to do unless something is blocked on a port that is up. */
+	if (!priv->blocked || !priv->port_up)
+		return;
+
+	/* Look for the first blocked ring whose watchdog has expired. */
+	for (i = 0; i < priv->tx_ring_num; i++) {
+		ring = &priv->tx_ring[i];
+		if (ring->blocked &&
+		    ring->watchdog_time + MLX4_EN_WATCHDOG_TIMEOUT < ticks) {
+			timed_out = 1;
+			break;
+		}
+	}
+	if (!timed_out)
+		return;
+
+	priv->port_stats.tx_timeout++;
+	en_dbg(DRV, priv, "Watchdog task called for port %d\n", priv->port);
+
+	mutex_lock(&mdev->state_lock);
+	/* port_up is re-checked now that the lock is held. */
+	if (priv->port_up) {
+		mlx4_en_stop_port(dev);
+		if (mlx4_en_start_port(dev)) {
+			en_err(priv, "Failed restarting port %d\n", priv->port);
+		}
+	}
+	mutex_unlock(&mdev->state_lock);
+}
+
+
+/*
+ * if_init handler: (re)initialise the interface.
+ *
+ * Under mdev->state_lock, stops the port if it is running and, provided the
+ * device is up, resets the hardware statistics and the per-ring software
+ * counters, restores the default moderation settings and starts the port.
+ */
+static void
+mlx4_en_init(void *arg)
+{
+	struct mlx4_en_priv *priv = arg;
+	struct ifnet *dev = priv->dev;
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int ring;
+
+	mutex_lock(&mdev->state_lock);
+	if (dev->if_drv_flags & IFF_DRV_RUNNING)
+		mlx4_en_stop_port(dev);
+
+	if (mdev->device_up) {
+		/* Reset HW statistics and performance counters */
+		if (mlx4_en_DUMP_ETH_STATS(mdev, priv->port, 1)) {
+			en_dbg(HW, priv, "Failed dumping statistics\n");
+		}
+
+		memset(&priv->pstats, 0, sizeof(priv->pstats));
+
+		/* Zero the per-ring software counters. */
+		for (ring = 0; ring < priv->tx_ring_num; ring++) {
+			priv->tx_ring[ring].bytes = 0;
+			priv->tx_ring[ring].packets = 0;
+		}
+		for (ring = 0; ring < priv->rx_ring_num; ring++) {
+			priv->rx_ring[ring].bytes = 0;
+			priv->rx_ring[ring].packets = 0;
+		}
+
+		mlx4_en_set_default_moderation(priv);
+		if (mlx4_en_start_port(dev)) {
+			en_err(priv, "Failed starting port:%d\n", priv->port);
+		}
+	} else {
+		en_err(priv, "Cannot open - device down/disabled\n");
+	}
+
+	mutex_unlock(&mdev->state_lock);
+}
+
+/*
+ * Destroy every TX/RX ring and completion queue that was created (a ring is
+ * recognised by its tx_info/rx_info pointer, a CQ by its buf pointer) and
+ * free the statistics sysctl context.
+ */
+void mlx4_en_free_resources(struct mlx4_en_priv *priv)
+{
+	struct mlx4_en_tx_ring *txr;
+	struct mlx4_en_rx_ring *rxr;
+	struct mlx4_en_cq *cq;
+	int n;
+
+	for (n = 0; n < priv->tx_ring_num; n++) {
+		txr = &priv->tx_ring[n];
+		if (txr->tx_info)
+			mlx4_en_destroy_tx_ring(priv, txr);
+		cq = &priv->tx_cq[n];
+		if (cq->buf)
+			mlx4_en_destroy_cq(priv, cq);
+	}
+
+	for (n = 0; n < priv->rx_ring_num; n++) {
+		rxr = &priv->rx_ring[n];
+		if (rxr->rx_info)
+			mlx4_en_destroy_rx_ring(priv, rxr);
+		cq = &priv->rx_cq[n];
+		if (cq->buf)
+			mlx4_en_destroy_cq(priv, cq);
+	}
+
+	/* The stats tree is per ring, so it goes away with the rings. */
+	if (priv->sysctl)
+		sysctl_ctx_free(&priv->stat_ctx);
+}
+
+/*
+ * Create the completion queues and rings for every TX and RX queue using the
+ * sizes from the port profile, rebuild the statistics sysctl tree and fill
+ * in the TX priority map.
+ *
+ * Returns 0 on success or -ENOMEM as soon as any creation step fails; what
+ * was already created is left in place for the caller.
+ */
+int mlx4_en_alloc_resources(struct mlx4_en_priv *priv)
+{
+	struct mlx4_en_port_profile *prof = priv->prof;
+	int ring;
+
+	/* TX side: the completion queue first, then the ring it serves. */
+	for (ring = 0; ring < priv->tx_ring_num; ring++) {
+		if (mlx4_en_create_cq(priv, &priv->tx_cq[ring],
+		    prof->tx_ring_size, ring, TX) ||
+		    mlx4_en_create_tx_ring(priv, &priv->tx_ring[ring],
+		    prof->tx_ring_size, TXBB_SIZE))
+			goto fail;
+	}
+
+	/* RX side: same order; the last udp_rings rings do not use frags. */
+	for (ring = 0; ring < priv->rx_ring_num; ring++) {
+		if (mlx4_en_create_cq(priv, &priv->rx_cq[ring],
+		    prof->rx_ring_size, ring, RX))
+			goto fail;
+
+		priv->rx_ring[ring].use_frags =
+		    (ring > priv->rx_ring_num - priv->udp_rings - 1) ? 0 : 1;
+
+		if (mlx4_en_create_rx_ring(priv, &priv->rx_ring[ring],
+		    prof->rx_ring_size))
+			goto fail;
+	}
+
+	/* The stat sysctls are per ring, so build them for the new ring set. */
+	mlx4_en_sysctl_stat(priv);
+
+	/* Populate Tx priority mappings */
+	mlx4_en_set_prio_map(priv, priv->tx_prio_map,
+	    prof->tx_ring_num - MLX4_EN_NUM_HASH_RINGS);
+
+	return 0;
+
+fail:
+	en_err(priv, "Failed to allocate NIC resources\n");
+	return -ENOMEM;
+}
+
+
+/*
+ * Tear down a network device created by mlx4_en_init_netdev().
+ *
+ * Deregisters the VLAN event handlers, detaches the interface if it was
+ * registered, frees the RX QP page if it was allocated, cancels and flushes
+ * outstanding work, unpublishes the device from mdev->pndev[], releases the
+ * rings, CQs and sysctl trees, and finally frees priv and the ifnet.
+ * Also used on the error path of mlx4_en_init_netdev(), so every step is
+ * guarded by the flag or pointer that records whether it was set up.
+ */
+void mlx4_en_destroy_netdev(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+
+	en_dbg(DRV, priv, "Destroying netdev on port:%d\n", priv->port);
+
+	if (priv->vlan_attach != NULL)
+		EVENTHANDLER_DEREGISTER(vlan_config, priv->vlan_attach);
+	if (priv->vlan_detach != NULL)
+		EVENTHANDLER_DEREGISTER(vlan_unconfig, priv->vlan_detach);
+
+	/* Unregister device - this will close the port if it was up */
+	if (priv->registered)
+		ether_ifdetach(dev);
+
+	if (priv->allocated)
+		mlx4_free_hwq_res(mdev->dev, &priv->res, MLX4_EN_PAGE_SIZE);
+
+	cancel_delayed_work(&priv->stats_task);
+	/* flush any pending task for this netdev */
+	flush_workqueue(mdev->workqueue);
+
+	/* Detach the netdev so tasks would not attempt to access it */
+	mutex_lock(&mdev->state_lock);
+	mdev->pndev[priv->port] = NULL;
+	mutex_unlock(&mdev->state_lock);
+
+	/* Releases the rings/CQs and the "stat" sysctl context. */
+	mlx4_en_free_resources(priv);
+
+	/*
+	 * The configuration context owns the root sysctl node, of which the
+	 * "stat" node (kept in stat_ctx) is a child.  sysctl_ctx_free() does
+	 * not remove a node that still has children outside its own context,
+	 * so conf_ctx can only be freed once mlx4_en_free_resources() has
+	 * released stat_ctx.  Freeing it earlier would fail and leave OIDs
+	 * that point into priv, which is freed below.
+	 */
+	if (priv->sysctl)
+		sysctl_ctx_free(&priv->conf_ctx);
+
+	mtx_destroy(&priv->stats_lock.m);
+	mtx_destroy(&priv->vlan_lock.m);
+	kfree(priv);
+	if_free(dev);
+}
+
+/*
+ * Set a new MTU on the interface.
+ *
+ * Returns -EPERM when new_mtu lies outside [MLX4_EN_MIN_MTU, priv->max_mtu].
+ * Otherwise the MTU is stored and, if the interface is running on a device
+ * that is up, the port is restarted; a failed restart is handed to the
+ * watchdog task and is not reported to the caller (0 is still returned).
+ */
+static int mlx4_en_change_mtu(struct net_device *dev, int new_mtu)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+
+	en_dbg(DRV, priv, "Change MTU called - current:%ld new:%d\n",
+	    dev->if_mtu, new_mtu);
+
+	if (new_mtu < MLX4_EN_MIN_MTU || new_mtu > priv->max_mtu) {
+		en_err(priv, "Bad MTU size:%d.\n", new_mtu);
+		return -EPERM;
+	}
+
+	mutex_lock(&mdev->state_lock);
+	dev->if_mtu = new_mtu;
+
+	/* A stopped interface picks the new MTU up on its next start. */
+	if (!(dev->if_drv_flags & IFF_DRV_RUNNING))
+		goto unlock;
+
+	if (!mdev->device_up) {
+		/* NIC is probably restarting - let watchdog task reset
+		 * the port */
+		en_dbg(DRV, priv, "Change MTU called with card down!?\n");
+		goto unlock;
+	}
+
+	mlx4_en_stop_port(dev);
+	mlx4_en_set_default_moderation(priv);
+	if (mlx4_en_start_port(dev)) {
+		en_err(priv, "Failed restarting port:%d\n",
+		    priv->port);
+		queue_work(mdev->workqueue, &priv->watchdog_task);
+	}
+
+unlock:
+	mutex_unlock(&mdev->state_lock);
+	return 0;
+}
+
+/*
+ * Build the ifmedia word describing the current link.
+ *
+ * Returns plain IFM_ETHER when the link is down or the port query fails;
+ * otherwise full duplex plus a subtype chosen from the reported link speed
+ * and transceiver type, and the pause options taken from the port profile.
+ */
+static int mlx4_en_calc_media(struct mlx4_en_priv *priv)
+{
+	int media = IFM_ETHER;
+	int xcvr;
+
+	/* The port is only queried when the link is not known to be down. */
+	if (priv->last_link_state == MLX4_DEV_EVENT_PORT_DOWN ||
+	    mlx4_en_QUERY_PORT(priv->mdev, priv->port))
+		return (media);
+
+	media |= IFM_FDX;
+	xcvr = priv->port_state.transciver;
+
+	/* XXX I don't know all of the transceiver values. */
+	if (priv->port_state.link_speed == 1000) {
+		media |= IFM_1000_T;
+	} else if (xcvr > 0 && xcvr <= 0xC) {
+		media |= IFM_10G_SR;
+	} else if (xcvr == 0x80 || xcvr == 0) {
+		media |= IFM_10G_CX4;
+	}
+
+	if (priv->prof->tx_pause)
+		media |= IFM_ETH_TXPAUSE;
+	if (priv->prof->rx_pause)
+		media |= IFM_ETH_RXPAUSE;
+
+	return (media);
+}
+
+
+/*
+ * ifmedia status callback: the status is always valid, it is active unless
+ * the last recorded link state is "port down", and the active media word
+ * comes from mlx4_en_calc_media().
+ */
+static void mlx4_en_media_status(struct ifnet *dev, struct ifmediareq *ifmr)
+{
+	struct mlx4_en_priv *priv = dev->if_softc;
+
+	if (priv->last_link_state != MLX4_DEV_EVENT_PORT_DOWN)
+		ifmr->ifm_status = IFM_AVALID | IFM_ACTIVE;
+	else
+		ifmr->ifm_status = IFM_AVALID;
+
+	ifmr->ifm_active = mlx4_en_calc_media(priv);
+}
+
+/*
+ * ifmedia change callback.
+ *
+ * Accepts only Ethernet media: "auto", or one of the fixed subtypes when it
+ * matches what the link currently runs and full duplex is requested.  The
+ * RX/TX pause options may be changed freely; a change is written to the port
+ * profile and programmed into the port.
+ *
+ * Returns 0, EINVAL for an unsupported selection, or the negated result of
+ * mlx4_SET_PORT_general().
+ */
+static int mlx4_en_media_change(struct ifnet *dev)
+{
+	struct mlx4_en_priv *priv = dev->if_softc;
+	struct ifmedia *ifm = &priv->media;
+	int want_rx;
+	int want_tx;
+
+	if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER)
+		return (EINVAL);
+
+	switch (IFM_SUBTYPE(ifm->ifm_media)) {
+	case IFM_AUTO:
+		break;
+	case IFM_10G_SR:
+	case IFM_10G_CX4:
+	case IFM_1000_T:
+		/* A fixed subtype is accepted only if it is the current one. */
+		if (IFM_SUBTYPE(ifm->ifm_media) ==
+		    IFM_SUBTYPE(mlx4_en_calc_media(priv)) &&
+		    (ifm->ifm_media & IFM_FDX))
+			break;
+		/* FALLTHROUGH */
+	default:
+		printf("%s: Only auto media type\n", if_name(dev));
+		return (EINVAL);
+	}
+
+	/* Allow user to set/clear pause */
+	want_rx = (IFM_OPTIONS(ifm->ifm_media) & IFM_ETH_RXPAUSE) ? 1 : 0;
+	want_tx = (IFM_OPTIONS(ifm->ifm_media) & IFM_ETH_TXPAUSE) ? 1 : 0;
+
+	/* Nothing to program when the pause settings are unchanged. */
+	if (priv->prof->tx_pause == want_tx && priv->prof->rx_pause == want_rx)
+		return (0);
+
+	priv->prof->tx_pause = want_tx;
+	priv->prof->rx_pause = want_rx;
+	return (-mlx4_SET_PORT_general(priv->mdev->dev, priv->port,
+	    priv->rx_mb_size + ETHER_CRC_LEN, priv->prof->tx_pause,
+	    priv->prof->tx_ppp, priv->prof->rx_pause,
+	    priv->prof->rx_ppp));
+}
+
+/*
+ * Interface ioctl handler.
+ *
+ * Handles MTU changes, interface up/down, multicast list updates, media
+ * requests and capability changes; every other command is passed on to
+ * ether_ioctl().  Returns 0 or a positive errno value.
+ */
+static int mlx4_en_ioctl(struct ifnet *dev, u_long command, caddr_t data)
+{
+	struct mlx4_en_priv *priv;
+	struct mlx4_en_dev *mdev;
+	struct ifreq *ifr;
+	int error;
+	int mask;
+
+	error = 0;
+	mask = 0;
+	priv = dev->if_softc;
+	mdev = priv->mdev;
+	ifr = (struct ifreq *) data;
+	switch (command) {
+	case SIOCSIFMTU:
+		/* mlx4_en_change_mtu() returns a negative errno. */
+		error = -mlx4_en_change_mtu(dev, ifr->ifr_mtu);
+		break;
+	case SIOCSIFFLAGS:
+		if (dev->if_flags & IFF_UP) {
+			if ((dev->if_drv_flags & IFF_DRV_RUNNING) == 0) {
+				mutex_lock(&mdev->state_lock);
+				mlx4_en_start_port(dev);
+				mutex_unlock(&mdev->state_lock);
+			} else
+				mlx4_en_set_multicast(dev);
+		} else {
+			mutex_lock(&mdev->state_lock);
+			if (dev->if_drv_flags & IFF_DRV_RUNNING) {
+				mlx4_en_stop_port(dev);
+				if_link_state_change(dev, LINK_STATE_DOWN);
+			}
+			mutex_unlock(&mdev->state_lock);
+		}
+		break;
+	case SIOCADDMULTI:
+	case SIOCDELMULTI:
+		mlx4_en_set_multicast(dev);
+		break;
+	case SIOCSIFMEDIA:
+	case SIOCGIFMEDIA:
+		error = ifmedia_ioctl(dev, ifr, &priv->media, command);
+		break;
+	case SIOCSIFCAP:
+		/* Bits that differ between the request and the current state. */
+		mask = ifr->ifr_reqcap ^ dev->if_capenable;
+		/*
+		 * Toggle TX and RX checksum offload independently: flipping
+		 * the combined IFCAP_HWCSUM mask would also change the
+		 * direction the caller did not ask for.  mlx4_en_start_port()
+		 * already treats the two bits separately.
+		 */
+		if (mask & IFCAP_TXCSUM)
+			dev->if_capenable ^= IFCAP_TXCSUM;
+		if (mask & IFCAP_RXCSUM)
+			dev->if_capenable ^= IFCAP_RXCSUM;
+		if (mask & IFCAP_TSO4)
+			dev->if_capenable ^= IFCAP_TSO4;
+		if (mask & IFCAP_LRO)
+			dev->if_capenable ^= IFCAP_LRO;
+		if (mask & IFCAP_VLAN_HWTAGGING)
+			dev->if_capenable ^= IFCAP_VLAN_HWTAGGING;
+		if (mask & IFCAP_VLAN_HWFILTER)
+			dev->if_capenable ^= IFCAP_VLAN_HWFILTER;
+		/* Restart a running port so the new settings are applied. */
+		if (dev->if_drv_flags & IFF_DRV_RUNNING)
+			mlx4_en_init(priv);
+		VLAN_CAPABILITIES(dev);
+		break;
+	default:
+		error = ether_ioctl(dev, command, data);
+		break;
+	}
+
+	return (error);
+}
+
+/*
+ * Resize the RX and TX rings.
+ *
+ * Both sizes are rounded up to a power of two and clamped to the driver
+ * limits.  If nothing changes the call returns 0 immediately; otherwise the
+ * port is stopped when it was up, all ring resources are freed and
+ * re-created with the new sizes, and the port is started again.
+ *
+ * Returns 0 on success or the error from reallocation / restart.
+ */
+static int mlx4_en_set_ring_size(struct net_device *dev,
+    int rx_size, int tx_size)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int was_up = 0;
+	int err;
+
+	/* Normalise the requested sizes. */
+	rx_size = roundup_pow_of_two(rx_size);
+	rx_size = max_t(u32, rx_size, MLX4_EN_MIN_RX_SIZE);
+	rx_size = min_t(u32, rx_size, MLX4_EN_MAX_RX_SIZE);
+	tx_size = roundup_pow_of_two(tx_size);
+	tx_size = max_t(u32, tx_size, MLX4_EN_MIN_TX_SIZE);
+	tx_size = min_t(u32, tx_size, MLX4_EN_MAX_TX_SIZE);
+
+	/* Nothing to do if both rings already have the requested size. */
+	if (rx_size == (priv->port_up ?
+	    priv->rx_ring[0].actual_size : priv->rx_ring[0].size) &&
+	    tx_size == priv->tx_ring[0].size)
+		return 0;
+
+	mutex_lock(&mdev->state_lock);
+	if (priv->port_up) {
+		was_up = 1;
+		mlx4_en_stop_port(dev);
+	}
+
+	mlx4_en_free_resources(priv);
+	priv->prof->tx_ring_size = tx_size;
+	priv->prof->rx_ring_size = rx_size;
+
+	err = mlx4_en_alloc_resources(priv);
+	if (err) {
+		en_err(priv, "Failed reallocating port resources\n");
+	} else if (was_up) {
+		/* Only a port that was running is brought back up. */
+		err = mlx4_en_start_port(dev);
+		if (err) {
+			en_err(priv, "Failed starting port\n");
+		}
+	}
+
+	mutex_unlock(&mdev->state_lock);
+	return err;
+}
+
+/*
+ * Sysctl handler for the RX ring size: reports the size from the port
+ * profile and, when a new value is written, resizes the rings while keeping
+ * the current TX size.
+ */
+static int mlx4_en_set_rx_ring_size(SYSCTL_HANDLER_ARGS)
+{
+	struct mlx4_en_priv *priv = arg1;
+	int size = priv->prof->rx_ring_size;
+	int error;
+
+	error = sysctl_handle_int(oidp, &size, 0, req);
+	/* Done on error or when this was only a read. */
+	if (error || !req->newptr)
+		return (error);
+
+	/* mlx4_en_set_ring_size() returns a negative errno. */
+	return (-mlx4_en_set_ring_size(priv->dev, size,
+	    priv->prof->tx_ring_size));
+}
+
+/*
+ * Sysctl handler for the TX ring size: reports the size from the port
+ * profile and, when a new value is written, resizes the rings while keeping
+ * the current RX size.
+ */
+static int mlx4_en_set_tx_ring_size(SYSCTL_HANDLER_ARGS)
+{
+	struct mlx4_en_priv *priv = arg1;
+	int size = priv->prof->tx_ring_size;
+	int error;
+
+	error = sysctl_handle_int(oidp, &size, 0, req);
+	/* Done on error or when this was only a read. */
+	if (error || !req->newptr)
+		return (error);
+
+	/* mlx4_en_set_ring_size() returns a negative errno. */
+	return (-mlx4_en_set_ring_size(priv->dev, priv->prof->rx_ring_size,
+	    size));
+}
+
+/*
+ * Create the per-interface configuration sysctl tree in priv->conf_ctx:
+ *
+ *   hw.<ifname>                  root node, stored in priv->sysctl
+ *   hw.<ifname>.conf             driver configuration knobs
+ *   hw.<ifname>.conf.coalesce    interrupt coalescing knobs
+ */
+static void mlx4_en_sysctl_conf(struct mlx4_en_priv *priv)
+{
+	struct net_device *dev;
+	struct sysctl_ctx_list *ctx;
+	struct sysctl_oid *node;
+	struct sysctl_oid_list *node_list;
+	struct sysctl_oid *coal;
+	struct sysctl_oid_list *coal_list;
+
+	dev = priv->dev;
+	ctx = &priv->conf_ctx;
+
+	sysctl_ctx_init(ctx);
+	priv->sysctl = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_hw),
+	    OID_AUTO, dev->if_xname, CTLFLAG_RD, 0, "mlx4 10gig ethernet");
+	node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(priv->sysctl), OID_AUTO,
+	    "conf", CTLFLAG_RD, NULL, "Configuration");
+	node_list = SYSCTL_CHILDREN(node);
+
+	SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "msg_enable",
+	    CTLFLAG_RW, &priv->msg_enable, 0,
+	    "Driver message enable bitfield");
+	/* SYSCTL_ADD_UINT supplies the type itself; only access flags here. */
+	SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "rx_rings",
+	    CTLFLAG_RD, &priv->rx_ring_num, 0,
+	    "Number of receive rings");
+	SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "tx_rings",
+	    CTLFLAG_RD, &priv->tx_ring_num, 0,
+	    "Number of transmit rings");
+	SYSCTL_ADD_PROC(ctx, node_list, OID_AUTO, "rx_size",
+	    CTLTYPE_INT | CTLFLAG_RW, priv, 0, mlx4_en_set_rx_ring_size, "I",
+	    "Receive ring size");
+	SYSCTL_ADD_PROC(ctx, node_list, OID_AUTO, "tx_size",
+	    CTLTYPE_INT | CTLFLAG_RW, priv, 0, mlx4_en_set_tx_ring_size, "I",
+	    "Transmit ring size");
+	SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "ip_reasm",
+	    CTLFLAG_RD, &priv->mdev->profile.ip_reasm, 0,
+	    "Allow reassembly of IP fragments.");
+
+	/* Add coalescer configuration. */
+	coal = SYSCTL_ADD_NODE(ctx, node_list, OID_AUTO,
+	    "coalesce", CTLFLAG_RD, NULL, "Interrupt coalesce configuration");
+	/*
+	 * Attach the knobs below to the "coalesce" node itself; taking the
+	 * children of "conf" here would leave "coalesce" empty.
+	 */
+	coal_list = SYSCTL_CHILDREN(coal);
+	SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "pkt_rate_low",
+	    CTLFLAG_RW, &priv->pkt_rate_low, 0,
+	    "Packets per-second for minimum delay");
+	SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "rx_usecs_low",
+	    CTLFLAG_RW, &priv->rx_usecs_low, 0,
+	    "Minimum RX delay in micro-seconds");
+	SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "pkt_rate_high",
+	    CTLFLAG_RW, &priv->pkt_rate_high, 0,
+	    "Packets per-second for maximum delay");
+	SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "rx_usecs_high",
+	    CTLFLAG_RW, &priv->rx_usecs_high, 0,
+	    "Maximum RX delay in micro-seconds");
+	SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "sample_interval",
+	    CTLFLAG_RW, &priv->sample_interval, 0,
+	    "adaptive frequency in units of HZ ticks");
+	SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "adaptive_rx_coal",
+	    CTLFLAG_RW, &priv->adaptive_rx_coal, 0,
+	    "Enable adaptive rx coalescing");
+}
+
+/*
+ * Create the statistics sysctl tree ("stat") below priv->sysctl.
+ *
+ * All OIDs are recorded in priv->stat_ctx, separately from the
+ * configuration tree, so that they can be freed and re-created whenever the
+ * rings are re-allocated.  Exports the port-wide counters, the per-priority
+ * packet counters and one sub-node per TX and RX ring.
+ */
+static void mlx4_en_sysctl_stat(struct mlx4_en_priv *priv)
+{
+	struct sysctl_ctx_list *ctx;
+	struct sysctl_oid *node;
+	struct sysctl_oid_list *node_list;
+	struct sysctl_oid *ring_node;
+	struct sysctl_oid_list *ring_list;
+	struct mlx4_en_tx_ring *tx_ring;
+	struct mlx4_en_rx_ring *rx_ring;
+	char namebuf[128];
+	int i;
+
+	ctx = &priv->stat_ctx;
+	sysctl_ctx_init(ctx);
+	node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(priv->sysctl), OID_AUTO,
+	    "stat", CTLFLAG_RD, NULL, "Statistics");
+	node_list = SYSCTL_CHILDREN(node);
+
+#ifdef MLX4_EN_PERF_STAT
+	/*
+	 * SYSCTL_ADD_UINT takes a value argument between the pointer and the
+	 * description; without it these calls do not compile.
+	 * NOTE(review): confirm the pstats field widths match the macros used.
+	 */
+	SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "tx_poll", CTLFLAG_RD,
+	    &priv->pstats.tx_poll, 0, "TX Poll calls");
+	SYSCTL_ADD_QUAD(ctx, node_list, OID_AUTO, "tx_pktsz_avg", CTLFLAG_RD,
+	    &priv->pstats.tx_pktsz_avg, "TX average packet size");
+	SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "inflight_avg", CTLFLAG_RD,
+	    &priv->pstats.inflight_avg, 0, "TX average packets in-flight");
+	SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "tx_coal_avg", CTLFLAG_RD,
+	    &priv->pstats.tx_coal_avg, 0, "TX average coalesced completions");
+	SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "rx_coal_avg", CTLFLAG_RD,
+	    &priv->pstats.rx_coal_avg, 0, "RX average coalesced completions");
+#endif
+
+	/* Port-wide software counters. */
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tso_packets", CTLFLAG_RD,
+	    &priv->port_stats.tso_packets, "TSO packets sent");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "queue_stopped", CTLFLAG_RD,
+	    &priv->port_stats.queue_stopped, "Queue full");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "wake_queue", CTLFLAG_RD,
+	    &priv->port_stats.wake_queue, "Queue resumed after full");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_timeout", CTLFLAG_RD,
+	    &priv->port_stats.tx_timeout, "Transmit timeouts");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_alloc_failed", CTLFLAG_RD,
+	    &priv->port_stats.rx_alloc_failed, "RX failed to allocate mbuf");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_chksum_good", CTLFLAG_RD,
+	    &priv->port_stats.rx_chksum_good, "RX checksum offload success");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_chksum_none", CTLFLAG_RD,
+	    &priv->port_stats.rx_chksum_none, "RX without checksum offload");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_chksum_offload",
+	    CTLFLAG_RD, &priv->port_stats.tx_chksum_offload,
+	    "TX checksum offloads");
+
+	/* Could strdup the names and add in a loop.  This is simpler. */
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "broadcast", CTLFLAG_RD,
+	    &priv->pkstats.broadcast, "Broadcast packets");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_prio0", CTLFLAG_RD,
+	    &priv->pkstats.tx_prio[0], "TX Priority 0 packets");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_prio1", CTLFLAG_RD,
+	    &priv->pkstats.tx_prio[1], "TX Priority 1 packets");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_prio2", CTLFLAG_RD,
+	    &priv->pkstats.tx_prio[2], "TX Priority 2 packets");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_prio3", CTLFLAG_RD,
+	    &priv->pkstats.tx_prio[3], "TX Priority 3 packets");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_prio4", CTLFLAG_RD,
+	    &priv->pkstats.tx_prio[4], "TX Priority 4 packets");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_prio5", CTLFLAG_RD,
+	    &priv->pkstats.tx_prio[5], "TX Priority 5 packets");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_prio6", CTLFLAG_RD,
+	    &priv->pkstats.tx_prio[6], "TX Priority 6 packets");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_prio7", CTLFLAG_RD,
+	    &priv->pkstats.tx_prio[7], "TX Priority 7 packets");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_prio0", CTLFLAG_RD,
+	    &priv->pkstats.rx_prio[0], "RX Priority 0 packets");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_prio1", CTLFLAG_RD,
+	    &priv->pkstats.rx_prio[1], "RX Priority 1 packets");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_prio2", CTLFLAG_RD,
+	    &priv->pkstats.rx_prio[2], "RX Priority 2 packets");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_prio3", CTLFLAG_RD,
+	    &priv->pkstats.rx_prio[3], "RX Priority 3 packets");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_prio4", CTLFLAG_RD,
+	    &priv->pkstats.rx_prio[4], "RX Priority 4 packets");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_prio5", CTLFLAG_RD,
+	    &priv->pkstats.rx_prio[5], "RX Priority 5 packets");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_prio6", CTLFLAG_RD,
+	    &priv->pkstats.rx_prio[6], "RX Priority 6 packets");
+	SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_prio7", CTLFLAG_RD,
+	    &priv->pkstats.rx_prio[7], "RX Priority 7 packets");
+
+	/* One "tx_ring<N>" node per transmit ring. */
+	for (i = 0; i < priv->tx_ring_num; i++) {
+		tx_ring = &priv->tx_ring[i];
+		snprintf(namebuf, sizeof(namebuf), "tx_ring%d", i);
+		ring_node = SYSCTL_ADD_NODE(ctx, node_list, OID_AUTO, namebuf,
+		    CTLFLAG_RD, NULL, "TX Ring");
+		ring_list = SYSCTL_CHILDREN(ring_node);
+		SYSCTL_ADD_ULONG(ctx, ring_list, OID_AUTO, "packets",
+		    CTLFLAG_RD, &tx_ring->packets, "TX packets");
+		SYSCTL_ADD_ULONG(ctx, ring_list, OID_AUTO, "bytes",
+		    CTLFLAG_RD, &tx_ring->bytes, "TX bytes");
+		SYSCTL_ADD_ULONG(ctx, ring_list, OID_AUTO, "error",
+		    CTLFLAG_RD, &tx_ring->errors, "TX soft errors");
+
+	}
+	/* One "rx_ring<N>" node per receive ring, including its LRO counters. */
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		rx_ring = &priv->rx_ring[i];
+		snprintf(namebuf, sizeof(namebuf), "rx_ring%d", i);
+		ring_node = SYSCTL_ADD_NODE(ctx, node_list, OID_AUTO, namebuf,
+		    CTLFLAG_RD, NULL, "RX Ring");
+		ring_list = SYSCTL_CHILDREN(ring_node);
+		SYSCTL_ADD_ULONG(ctx, ring_list, OID_AUTO, "packets",
+		    CTLFLAG_RD, &rx_ring->packets, "RX packets");
+		SYSCTL_ADD_ULONG(ctx, ring_list, OID_AUTO, "bytes",
+		    CTLFLAG_RD, &rx_ring->bytes, "RX bytes");
+		SYSCTL_ADD_ULONG(ctx, ring_list, OID_AUTO, "error",
+		    CTLFLAG_RD, &rx_ring->errors, "RX soft errors");
+		SYSCTL_ADD_UINT(ctx, ring_list, OID_AUTO, "lro_queued",
+		    CTLFLAG_RD, &rx_ring->lro.lro_queued, 0, "LRO Queued");
+		SYSCTL_ADD_UINT(ctx, ring_list, OID_AUTO, "lro_flushed",
+		    CTLFLAG_RD, &rx_ring->lro.lro_flushed, 0, "LRO Flushed");
+	}
+}
+
+/*
+ * Create and attach the network interface for one port.
+ *
+ * Allocates the driver private data and the ifnet, fills in the ifnet
+ * callbacks, initialises locks and work items, creates the sysctl trees and
+ * ring resources, advertises the interface capabilities, registers for VLAN
+ * events, attaches the interface as Ethernet with the port's default MAC
+ * address, and starts the periodic statistics task.
+ *
+ * Returns 0 on success.  Returns -ENOMEM if the ifnet cannot be allocated;
+ * any later failure tears everything down through mlx4_en_destroy_netdev()
+ * and returns that step's error.
+ */
+int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
+			struct mlx4_en_port_profile *prof)
+{
+	/* Unit number source for the "mlxen<N>" interface names. */
+	static volatile int mlx4_en_unit;
+	struct net_device *dev;
+	struct mlx4_en_priv *priv;
+	uint8_t dev_addr[ETHER_ADDR_LEN];
+	int err;
+	int i;
+
+	/* NOTE(review): priv is used without a NULL check - confirm kzalloc(GFP_KERNEL) cannot fail here. */
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	dev = priv->dev = if_alloc(IFT_ETHER);
+	if (dev == NULL) {
+		mlx4_err(mdev, "Net device allocation failed\n");
+		kfree(priv);
+		return -ENOMEM;
+	}
+	dev->if_softc = priv;
+	if_initname(dev, "mlxen", atomic_fetchadd_int(&mlx4_en_unit, 1));
+	dev->if_mtu = ETHERMTU;
+	dev->if_baudrate = 1000000000;
+	dev->if_init = mlx4_en_init;
+	dev->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
+	dev->if_ioctl = mlx4_en_ioctl;
+	dev->if_transmit = mlx4_en_transmit;
+	dev->if_qflush = mlx4_en_qflush;
+	dev->if_snd.ifq_maxlen = prof->tx_ring_size;
+
+	/*
+	 * Initialize driver private data
+	 */
+	priv->dev = dev;
+	priv->mdev = mdev;
+	priv->prof = prof;
+	priv->port = port;
+	priv->port_up = false;
+	priv->rx_csum = 1;
+	priv->flags = prof->flags;
+	priv->tx_ring_num = prof->tx_ring_num;
+	priv->rx_ring_num = prof->rx_ring_num;
+	priv->udp_rings = mdev->profile.udp_rss ? prof->rx_ring_num / 2 : 1;
+	priv->mac_index = -1;
+	priv->msg_enable = MLX4_EN_MSG_LEVEL;
+	mtx_init(&priv->stats_lock.m, "mlx4 stats", NULL, MTX_DEF);
+	mtx_init(&priv->vlan_lock.m, "mlx4 vlan", NULL, MTX_DEF);
+	INIT_WORK(&priv->mcast_task, mlx4_en_do_set_multicast);
+	INIT_WORK(&priv->watchdog_task, mlx4_en_restart);
+	INIT_WORK(&priv->linkstate_task, mlx4_en_linkstate);
+	INIT_DELAYED_WORK(&priv->stats_task, mlx4_en_do_get_stats);
+	callout_init(&priv->watchdog_timer, 1);
+
+	/* Query for default mac and max mtu */
+	priv->max_mtu = mdev->dev->caps.eth_mtu_cap[priv->port];
+	priv->mac = mdev->dev->caps.def_mac[priv->port];
+
+	if (ILLEGAL_MAC(priv->mac)) {
+		en_err(priv, "Port: %d, invalid mac burned: 0x%llx, quiting\n",
+			 priv->port, priv->mac);
+		err = -EINVAL;
+		goto out;
+	}
+
+	/* Creates priv->sysctl, the parent of the stat tree built below. */
+	mlx4_en_sysctl_conf(priv);
+
+	err = mlx4_en_alloc_resources(priv);
+	if (err)
+		goto out;
+
+	/* Allocate page for receive rings */
+	err = mlx4_alloc_hwq_res(mdev->dev, &priv->res,
+				MLX4_EN_PAGE_SIZE, MLX4_EN_PAGE_SIZE);
+	if (err) {
+		en_err(priv, "Failed to allocate page for rx qps\n");
+		goto out;
+	}
+	/* Tells mlx4_en_destroy_netdev() that priv->res must be freed. */
+	priv->allocated = 1;
+
+	/*
+	 * Set driver features
+	 */
+	dev->if_capabilities |= IFCAP_RXCSUM | IFCAP_TXCSUM;
+	dev->if_capabilities |= IFCAP_VLAN_MTU | IFCAP_VLAN_HWTAGGING;
+	dev->if_capabilities |= IFCAP_VLAN_HWCSUM | IFCAP_VLAN_HWFILTER;
+	dev->if_capabilities |= IFCAP_LINKSTATE | IFCAP_JUMBO_MTU;
+#if 0 /* Not yet */
+	dev->if_capabilities |= IFCAP_WOL;
+#endif
+	if (mdev->LSO_support)
+		dev->if_capabilities |= IFCAP_TSO | IFCAP_VLAN_HWTSO;
+
+	/*
+	 * Don't enable LRO unless the user requests: the enabled set is
+	 * copied before IFCAP_LRO is added to the capabilities.
+	 */
+	dev->if_capenable = dev->if_capabilities;
+
+	if (mdev->profile.num_lro)
+		dev->if_capabilities |= IFCAP_LRO;
+
+        /* Register for VLAN events */
+	priv->vlan_attach = EVENTHANDLER_REGISTER(vlan_config,
+            mlx4_en_vlan_rx_add_vid, priv, EVENTHANDLER_PRI_FIRST);
+	priv->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig,
+            mlx4_en_vlan_rx_kill_vid, priv, EVENTHANDLER_PRI_FIRST);
+
+	/* Publish the interface in the per-port table of the device. */
+	mdev->pndev[priv->port] = dev;
+
+	priv->last_link_state = MLX4_DEV_EVENT_PORT_DOWN;
+	if_link_state_change(dev, LINK_STATE_DOWN);
+
+	/* Set default MAC */
+	/* The most significant of the six address bytes goes into dev_addr[0]. */
+	for (i = 0; i < ETHER_ADDR_LEN; i++)
+		dev_addr[ETHER_ADDR_LEN - 1 - i] = (u8) (priv->mac >> (8 * i));
+
+	ether_ifattach(dev, dev_addr);
+	ifmedia_init(&priv->media, IFM_IMASK | IFM_ETH_FMASK,
+	    mlx4_en_media_change, mlx4_en_media_status);
+	ifmedia_add(&priv->media, IFM_ETHER | IFM_FDX | IFM_1000_T, 0, NULL);
+	ifmedia_add(&priv->media, IFM_ETHER | IFM_FDX | IFM_10G_SR, 0, NULL);
+	ifmedia_add(&priv->media, IFM_ETHER | IFM_FDX | IFM_10G_CX4, 0, NULL);
+	ifmedia_add(&priv->media, IFM_ETHER | IFM_AUTO, 0, NULL);
+	ifmedia_set(&priv->media, IFM_ETHER | IFM_AUTO);
+
+	en_warn(priv, "Using %d TX rings\n", prof->tx_ring_num);
+	en_warn(priv, "Using %d RX rings\n", prof->rx_ring_num);
+
+	/* Tells mlx4_en_destroy_netdev() that ether_ifdetach() is needed. */
+	priv->registered = 1;
+	queue_delayed_work(mdev->workqueue, &priv->stats_task, STATS_DELAY);
+
+	return 0;
+
+out:
+	/* Common failure path: destroy_netdev checks what was set up. */
+	mlx4_en_destroy_netdev(dev);
+	return err;
+}
+
diff --git a/sys/ofed/drivers/net/mlx4/en_params.c b/sys/ofed/drivers/net/mlx4/en_params.c
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/en_params.c
diff --git a/sys/ofed/drivers/net/mlx4/en_port.c b/sys/ofed/drivers/net/mlx4/en_port.c
new file mode 100644
index 0000000..36a53d3
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/en_port.c
@@ -0,0 +1,314 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+
+#include "mlx4_en.h"
+
+#include <linux/if_vlan.h>
+
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/cmd.h>
+
+
+/*
+ * Issue the SET_MCAST_FLTR command for @port.
+ *
+ * The first command argument is @mac with @clear OR'ed in at bit 63;
+ * @port and @mode are forwarded to mlx4_cmd() unchanged.  Returns whatever
+ * mlx4_cmd() returns.
+ */
+int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port,
+			u64 mac, u64 clear, u8 mode)
+{
+	u64 in_param = mac | (clear << 63);
+
+	return mlx4_cmd(dev, in_param, port, mode,
+			MLX4_CMD_SET_MCAST_FLTR, MLX4_CMD_TIME_CLASS_B);
+}
+
+/*
+ * Program the VLAN filter of @port with the SET_VLAN_FLTR command.
+ *
+ * When @vlans is non-NULL, VLAN_FLTR_SIZE words are read from it and
+ * stored big-endian in the command mailbox; a NULL @vlans sends an
+ * all-zero filter.  Returns the mailbox allocation error or the result
+ * of mlx4_cmd().
+ */
+int mlx4_SET_VLAN_FLTR(struct mlx4_dev *dev, u8 port, u32 *vlans)
+{
+	struct mlx4_cmd_mailbox *mbox;
+	struct mlx4_set_vlan_fltr_mbox *fltr;
+	int idx;
+	int ret;
+
+	mbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mbox))
+		return PTR_ERR(mbox);
+
+	/* Start from an all-zero filter, then fill it if a table was given. */
+	fltr = mbox->buf;
+	memset(fltr, 0, sizeof(*fltr));
+	if (vlans != NULL) {
+		for (idx = 0; idx < VLAN_FLTR_SIZE; idx++)
+			fltr->entry[idx] = cpu_to_be32(vlans[idx]);
+	}
+
+	ret = mlx4_cmd(dev, mbox->dma, port, 0, MLX4_CMD_SET_VLAN_FLTR,
+		       MLX4_CMD_TIME_CLASS_B);
+	mlx4_free_cmd_mailbox(dev, mbox);
+
+	return ret;
+}
+
+
+/*
+ * Send the "general" SET_PORT context for @port: the MTU plus the four
+ * pptx/pfctx/pprx/pfcrx bytes.
+ *
+ * @pptx (resp. @pprx) is shifted into bit 7 of its context byte only when
+ * @pfctx (resp. @pfcrx) is zero; otherwise that byte is sent as 0.
+ * Returns the mailbox allocation error or the result of mlx4_cmd().
+ */
+int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu,
+			  u8 pptx, u8 pfctx, u8 pprx, u8 pfcrx)
+{
+	struct mlx4_set_port_general_context *ctx;
+	struct mlx4_cmd_mailbox *mbox;
+	u32 in_modifier;
+	int ret;
+
+	mbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mbox))
+		return PTR_ERR(mbox);
+
+	ctx = mbox->buf;
+	memset(ctx, 0, sizeof(*ctx));
+	ctx->flags = SET_PORT_GEN_ALL_VALID;
+	ctx->mtu = cpu_to_be16(mtu);
+	ctx->pfctx = pfctx;
+	ctx->pfcrx = pfcrx;
+	ctx->pptx = (pfctx ? 0 : pptx) << 7;
+	ctx->pprx = (pfcrx ? 0 : pprx) << 7;
+
+	in_modifier = MLX4_SET_PORT_GENERAL << 8 | port;
+	ret = mlx4_cmd(dev, mbox->dma, in_modifier, 1, MLX4_CMD_SET_PORT,
+		       MLX4_CMD_TIME_CLASS_B);
+	mlx4_free_cmd_mailbox(dev, mbox);
+
+	return ret;
+}
+
+/*
+ * Send the RQP_CALC SET_PORT context for @port.
+ *
+ * @base_qpn is stored in the base_qpn field and OR'ed into both the promisc
+ * and mcast words.  @promisc is placed at SET_PORT_PROMISC_EN_SHIFT and
+ * dev->caps.mc_promisc_mode at SET_PORT_PROMISC_MODE_SHIFT.  All words are
+ * written big-endian.
+ *
+ * Returns the mailbox allocation error or the result of mlx4_cmd().
+ */
+int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn,
+			   u8 promisc)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_set_port_rqp_calc_context *context;
+	int err;
+	u32 in_mod;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	context = mailbox->buf;
+	memset(context, 0, sizeof *context);
+
+	context->base_qpn = cpu_to_be32(base_qpn);
+	/*
+	 * Shift as u32: @promisc (u8) would otherwise be promoted to signed
+	 * int, and moving a set bit into bit 31 of an int is undefined
+	 * behaviour.  The mode value is cast for the same reason.
+	 */
+	context->promisc = cpu_to_be32((u32)promisc <<
+				       SET_PORT_PROMISC_EN_SHIFT | base_qpn);
+	context->mcast = cpu_to_be32((u32)dev->caps.mc_promisc_mode <<
+				     SET_PORT_PROMISC_MODE_SHIFT | base_qpn);
+	context->intra_no_vlan = 0;
+	context->no_vlan = MLX4_NO_VLAN_IDX;
+	context->intra_vlan_miss = 0;
+	context->vlan_miss = MLX4_VLAN_MISS_IDX;
+
+	in_mod = MLX4_SET_PORT_RQP_CALC << 8 | port;
+	err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT,
+		       MLX4_CMD_TIME_CLASS_B);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+/*
+ * Run QUERY_PORT for @port and cache the result in the port's
+ * priv->port_state: link_state (0 or 1), link_speed (1000 or 10000) and
+ * the transceiver byte, which is forced to 0x80 when bit 0x400 of
+ * transceiver_code_hi is set.  port_state is not modified if the command
+ * fails.
+ *
+ * Returns the mailbox allocation error or the result of mlx4_cmd_box().
+ */
+int mlx4_en_QUERY_PORT(struct mlx4_en_dev *mdev, u8 port)
+{
+	struct mlx4_en_priv *priv = netdev_priv(mdev->pndev[port]);
+	struct mlx4_en_port_state *state = &priv->port_state;
+	struct mlx4_en_query_port_context *ctx;
+	struct mlx4_cmd_mailbox *mbox;
+	int ret;
+
+	mbox = mlx4_alloc_cmd_mailbox(mdev->dev);
+	if (IS_ERR(mbox))
+		return PTR_ERR(mbox);
+
+	ctx = mbox->buf;
+	memset(ctx, 0, sizeof(*ctx));
+	ret = mlx4_cmd_box(mdev->dev, 0, mbox->dma, port, 0,
+			   MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B);
+	if (ret == 0) {
+		/*
+		 * No locking: per the original author's note, this command
+		 * is always issued from an already synchronized Ethtool
+		 * context.
+		 */
+		state->link_state =
+		    (ctx->link_up & MLX4_EN_LINK_UP_MASK) ? 1 : 0;
+		state->link_speed =
+		    ((ctx->link_speed & MLX4_EN_SPEED_MASK) ==
+		     MLX4_EN_1G_SPEED) ? 1000 : 10000;
+		if (be32_to_cpu(ctx->transceiver_code_hi) & 0x400)
+			state->transciver = 0x80;
+		else
+			state->transciver = ctx->transceiver;
+	}
+
+	mlx4_free_cmd_mailbox(mdev->dev, mbox);
+	return ret;
+}
+
+/*
+ * Read the four traffic counters of counter set @index with QUERY_IF_STAT.
+ *
+ * On success counters[0..3] receive, in host byte order: rx frames,
+ * tx frames, rx bytes and tx bytes.  The low four bits of counter_mode
+ * select the mailbox layout: 0 uses struct mlx4_counters, 1 uses the
+ * unicast fields of struct mlx4_counters_ext.  For any other mode, or on
+ * a command failure, @counters is left unmodified.
+ *
+ * Returns 0 on success, the mailbox allocation error, the mlx4_cmd_box()
+ * error, or -EINVAL for an unknown counter mode.
+ */
+static int read_iboe_counters(struct mlx4_dev *dev, int index, u64 counters[])
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	int err;
+	int mode;
+	struct mlx4_counters_ext *ext;
+	struct mlx4_counters *reg;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);	/* propagate the real error */
+
+	err = mlx4_cmd_box(dev, 0, mailbox->dma, index, 0,
+			   MLX4_CMD_QUERY_IF_STAT, MLX4_CMD_TIME_CLASS_C);
+	if (err)
+		goto out;
+
+	mode = be32_to_cpu(((struct mlx4_counters *)mailbox->buf)->counter_mode) & 0xf;
+	switch (mode) {
+	case 0:
+		reg = mailbox->buf;
+		counters[0] = be64_to_cpu(reg->rx_frames);
+		counters[1] = be64_to_cpu(reg->tx_frames);
+		counters[2] = be64_to_cpu(reg->rx_bytes);
+		counters[3] = be64_to_cpu(reg->tx_bytes);
+		break;
+	case 1:
+		ext = mailbox->buf;
+		counters[0] = be64_to_cpu(ext->rx_uni_frames);
+		counters[1] = be64_to_cpu(ext->tx_uni_frames);
+		counters[2] = be64_to_cpu(ext->rx_uni_bytes);
+		counters[3] = be64_to_cpu(ext->tx_uni_bytes);
+		break;
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+/*
+ * Refresh the interface statistics of @port.
+ *
+ * if_ipackets/if_ibytes and if_opackets/if_obytes are rebuilt from the
+ * counters returned by read_iboe_counters() (zero when the port has no
+ * counter set or the read fails) plus the software per-ring counters.
+ * Error, multicast, broadcast and per-priority totals are taken from the
+ * DUMP_ETH_STATS mailbox.  All updates happen under priv->stats_lock.
+ *
+ * @reset is shifted left by 8 and OR'ed with @port to form the in_mod
+ * argument of the command.
+ *
+ * Returns 0 on success, the mailbox allocation error or the mlx4_cmd_box()
+ * error.  A failure to read the IBoE counters is not reported.
+ */
+int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset)
+{
+	struct mlx4_en_stat_out_mbox *mlx4_en_stats;
+	struct net_device *dev;
+	struct mlx4_en_priv *priv;
+	struct mlx4_cmd_mailbox *mailbox;
+	u64 in_mod = reset << 8 | port;
+	unsigned long oerror;
+	unsigned long ierror;
+	int err;
+	int i;
+	int counter;
+	u64 counters[4];
+
+	dev = mdev->pndev[port];
+	priv = netdev_priv(dev);
+	memset(counters, 0, sizeof counters);
+	counter = mlx4_get_iboe_counter(priv->mdev->dev, port);
+	/*
+	 * Best effort: read_iboe_counters() only writes counters[] on
+	 * success, so on failure the zeros set above are used.
+	 */
+	if (counter >= 0)
+		(void)read_iboe_counters(priv->mdev->dev, counter, counters);
+
+	mailbox = mlx4_alloc_cmd_mailbox(mdev->dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	memset(mailbox->buf, 0, sizeof(*mlx4_en_stats));
+	err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, in_mod, 0,
+			   MLX4_CMD_DUMP_ETH_STATS, MLX4_CMD_TIME_CLASS_B);
+	if (err)
+		goto out;
+
+	mlx4_en_stats = mailbox->buf;
+
+	spin_lock(&priv->stats_lock);
+
+	oerror = ierror = 0;
+	dev->if_ipackets = counters[0];
+	dev->if_ibytes = counters[2];
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		dev->if_ipackets += priv->rx_ring[i].packets;
+		dev->if_ibytes += priv->rx_ring[i].bytes;
+		ierror += priv->rx_ring[i].errors;
+	}
+	dev->if_opackets = counters[1];
+	dev->if_obytes = counters[3];
+	/* Valid ring indices are 0 .. tx_ring_num - 1, as for the RX rings. */
+	for (i = 0; i < priv->tx_ring_num; i++) {
+		dev->if_opackets += priv->tx_ring[i].packets;
+		dev->if_obytes += priv->tx_ring[i].bytes;
+		oerror += priv->tx_ring[i].errors;
+	}
+
+	dev->if_ierrors = be32_to_cpu(mlx4_en_stats->RDROP) + ierror;
+	dev->if_oerrors = be32_to_cpu(mlx4_en_stats->TDROP) + oerror;
+	dev->if_imcasts = be64_to_cpu(mlx4_en_stats->MCAST_prio_0) +
+			  be64_to_cpu(mlx4_en_stats->MCAST_prio_1) +
+			  be64_to_cpu(mlx4_en_stats->MCAST_prio_2) +
+			  be64_to_cpu(mlx4_en_stats->MCAST_prio_3) +
+			  be64_to_cpu(mlx4_en_stats->MCAST_prio_4) +
+			  be64_to_cpu(mlx4_en_stats->MCAST_prio_5) +
+			  be64_to_cpu(mlx4_en_stats->MCAST_prio_6) +
+			  be64_to_cpu(mlx4_en_stats->MCAST_prio_7) +
+			  be64_to_cpu(mlx4_en_stats->MCAST_novlan);
+	dev->if_omcasts = be64_to_cpu(mlx4_en_stats->TMCAST_prio_0) +
+			  be64_to_cpu(mlx4_en_stats->TMCAST_prio_1) +
+			  be64_to_cpu(mlx4_en_stats->TMCAST_prio_2) +
+			  be64_to_cpu(mlx4_en_stats->TMCAST_prio_3) +
+			  be64_to_cpu(mlx4_en_stats->TMCAST_prio_4) +
+			  be64_to_cpu(mlx4_en_stats->TMCAST_prio_5) +
+			  be64_to_cpu(mlx4_en_stats->TMCAST_prio_6) +
+			  be64_to_cpu(mlx4_en_stats->TMCAST_prio_7) +
+			  be64_to_cpu(mlx4_en_stats->TMCAST_novlan);
+	dev->if_collisions = 0;
+
+	priv->pkstats.broadcast =
+				be64_to_cpu(mlx4_en_stats->RBCAST_prio_0) +
+				be64_to_cpu(mlx4_en_stats->RBCAST_prio_1) +
+				be64_to_cpu(mlx4_en_stats->RBCAST_prio_2) +
+				be64_to_cpu(mlx4_en_stats->RBCAST_prio_3) +
+				be64_to_cpu(mlx4_en_stats->RBCAST_prio_4) +
+				be64_to_cpu(mlx4_en_stats->RBCAST_prio_5) +
+				be64_to_cpu(mlx4_en_stats->RBCAST_prio_6) +
+				be64_to_cpu(mlx4_en_stats->RBCAST_prio_7) +
+				be64_to_cpu(mlx4_en_stats->RBCAST_novlan);
+	priv->pkstats.rx_prio[0] = be64_to_cpu(mlx4_en_stats->RTOT_prio_0);
+	priv->pkstats.rx_prio[1] = be64_to_cpu(mlx4_en_stats->RTOT_prio_1);
+	priv->pkstats.rx_prio[2] = be64_to_cpu(mlx4_en_stats->RTOT_prio_2);
+	priv->pkstats.rx_prio[3] = be64_to_cpu(mlx4_en_stats->RTOT_prio_3);
+	priv->pkstats.rx_prio[4] = be64_to_cpu(mlx4_en_stats->RTOT_prio_4);
+	priv->pkstats.rx_prio[5] = be64_to_cpu(mlx4_en_stats->RTOT_prio_5);
+	priv->pkstats.rx_prio[6] = be64_to_cpu(mlx4_en_stats->RTOT_prio_6);
+	priv->pkstats.rx_prio[7] = be64_to_cpu(mlx4_en_stats->RTOT_prio_7);
+	priv->pkstats.tx_prio[0] = be64_to_cpu(mlx4_en_stats->TTOT_prio_0);
+	priv->pkstats.tx_prio[1] = be64_to_cpu(mlx4_en_stats->TTOT_prio_1);
+	priv->pkstats.tx_prio[2] = be64_to_cpu(mlx4_en_stats->TTOT_prio_2);
+	priv->pkstats.tx_prio[3] = be64_to_cpu(mlx4_en_stats->TTOT_prio_3);
+	priv->pkstats.tx_prio[4] = be64_to_cpu(mlx4_en_stats->TTOT_prio_4);
+	priv->pkstats.tx_prio[5] = be64_to_cpu(mlx4_en_stats->TTOT_prio_5);
+	priv->pkstats.tx_prio[6] = be64_to_cpu(mlx4_en_stats->TTOT_prio_6);
+	priv->pkstats.tx_prio[7] = be64_to_cpu(mlx4_en_stats->TTOT_prio_7);
+	spin_unlock(&priv->stats_lock);
+
+out:
+	mlx4_free_cmd_mailbox(mdev->dev, mailbox);
+	return err;
+}
+
diff --git a/sys/ofed/drivers/net/mlx4/en_port.h b/sys/ofed/drivers/net/mlx4/en_port.h
new file mode 100644
index 0000000..96551d3
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/en_port.h
@@ -0,0 +1,591 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef _MLX4_EN_PORT_H_
+#define _MLX4_EN_PORT_H_
+
+
+/* Value written to the flags byte of struct mlx4_set_port_general_context. */
+#define SET_PORT_GEN_ALL_VALID	0x7
+/*
+ * Bit positions used when building the promisc and mcast words of
+ * struct mlx4_set_port_rqp_calc_context.
+ */
+#define SET_PORT_PROMISC_EN_SHIFT	31
+#define SET_PORT_PROMISC_MODE_SHIFT	30
+
+/* Command opcodes passed to mlx4_cmd()/mlx4_cmd_box() by the port code. */
+enum {
+	MLX4_CMD_SET_VLAN_FLTR  = 0x47,
+	MLX4_CMD_SET_MCAST_FLTR = 0x48,
+	MLX4_CMD_DUMP_ETH_STATS = 0x49,
+};
+
+/*
+ * Command mailbox contents written by mlx4_SET_PORT_general().
+ * The struct is overlaid directly on the mailbox buffer, so field order
+ * and sizes must not change.
+ */
+struct mlx4_set_port_general_context {
+	u8 reserved[3];
+	u8 flags;		/* set to SET_PORT_GEN_ALL_VALID */
+	u16 reserved2;
+	__be16 mtu;		/* MTU, big-endian */
+	u8 pptx;		/* bit 7 carries pptx; 0 when pfctx != 0 */
+	u8 pfctx;		/* copied verbatim from the caller */
+	u16 reserved3;
+	u8 pprx;		/* bit 7 carries pprx; 0 when pfcrx != 0 */
+	u8 pfcrx;		/* copied verbatim from the caller */
+	u16 reserved4;
+};
+
+/*
+ * Command mailbox contents written by mlx4_SET_PORT_qpn_calc().
+ * The struct is overlaid directly on the mailbox buffer, so field order
+ * and sizes must not change.  Fields without a comment are left zero by
+ * that function.
+ */
+struct mlx4_set_port_rqp_calc_context {
+	__be32 base_qpn;	/* base QP number, big-endian */
+	__be32 flags;
+	u8 reserved[3];
+	u8 mac_miss;
+	u8 intra_no_vlan;	/* written as 0 */
+	u8 no_vlan;		/* written as MLX4_NO_VLAN_IDX */
+	u8 intra_vlan_miss;	/* written as 0 */
+	u8 vlan_miss;		/* written as MLX4_VLAN_MISS_IDX */
+	u8 reserved2[3];
+	u8 no_vlan_prio;
+	__be32 promisc;		/* promisc << SET_PORT_PROMISC_EN_SHIFT | base_qpn */
+	__be32 mcast;		/* mode << SET_PORT_PROMISC_MODE_SHIFT | base_qpn */
+};
+
+/* Number of 32-bit words in the SET_VLAN_FLTR mailbox. */
+#define VLAN_FLTR_SIZE	128
+/*
+ * Mailbox contents for mlx4_SET_VLAN_FLTR(): VLAN_FLTR_SIZE big-endian
+ * words, presumably a 4096-bit map with one bit per VLAN id (confirm the
+ * bit ordering against the device documentation).
+ */
+struct mlx4_set_vlan_fltr_mbox {
+	__be32 entry[VLAN_FLTR_SIZE];
+};
+
+
+/*
+ * Multicast filter modes; presumably the values passed as the `mode`
+ * argument of mlx4_SET_MCAST_FLTR() (confirm at the call sites).
+ */
+enum {
+	MLX4_MCAST_CONFIG       = 0,
+	MLX4_MCAST_DISABLE      = 1,
+	MLX4_MCAST_ENABLE       = 2,
+};
+
+/*
+ * Output mailbox of the QUERY_PORT command as parsed by
+ * mlx4_en_QUERY_PORT().  The struct is overlaid directly on the mailbox
+ * buffer, so field order and sizes must not change.
+ */
+struct mlx4_en_query_port_context {
+	u8 link_up;		/* link is up when MLX4_EN_LINK_UP_MASK is set */
+#define MLX4_EN_LINK_UP_MASK	0x80
+	u8 reserved;
+	__be16 mtu;
+	u8 reserved2;
+	u8 link_speed;		/* masked value MLX4_EN_1G_SPEED => 1000, else 10000 */
+#define MLX4_EN_SPEED_MASK	0x3
+#define MLX4_EN_1G_SPEED	0x2
+	u16 reserved3[5];
+	__be64 mac;
+	u8 transceiver;		/* copied to port_state by mlx4_en_QUERY_PORT() */
+	u8 reserved4[3];
+	__be32 wavelenth;
+	u32 reserved5;
+	__be32 transceiver_code_hi;	/* bit 0x400 overrides transceiver with 0x80 */
+	__be32 transceiver_code_low;
+};
+
+
+/*
+ * Output mailbox of the DUMP_ETH_STATS command.  The struct is overlaid
+ * directly on the mailbox buffer, so field order and sizes must not change.
+ *
+ * Counters are big-endian.  Most groups hold one counter per priority
+ * (prio_0..prio_7) plus a "novlan" counter; transmit groups add a "loopbk"
+ * counter.  mlx4_en_DUMP_ETH_STATS() consumes RDROP, TDROP and the MCAST_*,
+ * TMCAST_*, RBCAST_*, RTOT_prio_* and TTOT_prio_* fields.
+ *
+ * NOTE(review): the R<n> comments name a single length while the matching
+ * T<n> comments give a range (e.g. "65 to 127"); presumably the receive
+ * groups are the same size buckets -- confirm against the device
+ * documentation.
+ */
+struct mlx4_en_stat_out_mbox {
+	/* Received frames with a length of 64 octets */
+	__be64 R64_prio_0;
+	__be64 R64_prio_1;
+	__be64 R64_prio_2;
+	__be64 R64_prio_3;
+	__be64 R64_prio_4;
+	__be64 R64_prio_5;
+	__be64 R64_prio_6;
+	__be64 R64_prio_7;
+	__be64 R64_novlan;
+	/* Received frames with a length of 127 octets */
+	__be64 R127_prio_0;
+	__be64 R127_prio_1;
+	__be64 R127_prio_2;
+	__be64 R127_prio_3;
+	__be64 R127_prio_4;
+	__be64 R127_prio_5;
+	__be64 R127_prio_6;
+	__be64 R127_prio_7;
+	__be64 R127_novlan;
+	/* Received frames with a length of 255 octets */
+	__be64 R255_prio_0;
+	__be64 R255_prio_1;
+	__be64 R255_prio_2;
+	__be64 R255_prio_3;
+	__be64 R255_prio_4;
+	__be64 R255_prio_5;
+	__be64 R255_prio_6;
+	__be64 R255_prio_7;
+	__be64 R255_novlan;
+	/* Received frames with a length of 511 octets */
+	__be64 R511_prio_0;
+	__be64 R511_prio_1;
+	__be64 R511_prio_2;
+	__be64 R511_prio_3;
+	__be64 R511_prio_4;
+	__be64 R511_prio_5;
+	__be64 R511_prio_6;
+	__be64 R511_prio_7;
+	__be64 R511_novlan;
+	/* Received frames with a length of 1023 octets */
+	__be64 R1023_prio_0;
+	__be64 R1023_prio_1;
+	__be64 R1023_prio_2;
+	__be64 R1023_prio_3;
+	__be64 R1023_prio_4;
+	__be64 R1023_prio_5;
+	__be64 R1023_prio_6;
+	__be64 R1023_prio_7;
+	__be64 R1023_novlan;
+	/* Received frames with a length of 1518 octets */
+	__be64 R1518_prio_0;
+	__be64 R1518_prio_1;
+	__be64 R1518_prio_2;
+	__be64 R1518_prio_3;
+	__be64 R1518_prio_4;
+	__be64 R1518_prio_5;
+	__be64 R1518_prio_6;
+	__be64 R1518_prio_7;
+	__be64 R1518_novlan;
+	/* Received frames with a length of 1522 octets */
+	__be64 R1522_prio_0;
+	__be64 R1522_prio_1;
+	__be64 R1522_prio_2;
+	__be64 R1522_prio_3;
+	__be64 R1522_prio_4;
+	__be64 R1522_prio_5;
+	__be64 R1522_prio_6;
+	__be64 R1522_prio_7;
+	__be64 R1522_novlan;
+	/* Received frames with a length of 1548 octets */
+	__be64 R1548_prio_0;
+	__be64 R1548_prio_1;
+	__be64 R1548_prio_2;
+	__be64 R1548_prio_3;
+	__be64 R1548_prio_4;
+	__be64 R1548_prio_5;
+	__be64 R1548_prio_6;
+	__be64 R1548_prio_7;
+	__be64 R1548_novlan;
+	/* Received frames with a length of 1548 < octets < MTU */
+	__be64 R2MTU_prio_0;
+	__be64 R2MTU_prio_1;
+	__be64 R2MTU_prio_2;
+	__be64 R2MTU_prio_3;
+	__be64 R2MTU_prio_4;
+	__be64 R2MTU_prio_5;
+	__be64 R2MTU_prio_6;
+	__be64 R2MTU_prio_7;
+	__be64 R2MTU_novlan;
+	/* Received frames with a length of MTU< octets and good CRC */
+	__be64 RGIANT_prio_0;
+	__be64 RGIANT_prio_1;
+	__be64 RGIANT_prio_2;
+	__be64 RGIANT_prio_3;
+	__be64 RGIANT_prio_4;
+	__be64 RGIANT_prio_5;
+	__be64 RGIANT_prio_6;
+	__be64 RGIANT_prio_7;
+	__be64 RGIANT_novlan;
+	/* Received broadcast frames with good CRC */
+	__be64 RBCAST_prio_0;
+	__be64 RBCAST_prio_1;
+	__be64 RBCAST_prio_2;
+	__be64 RBCAST_prio_3;
+	__be64 RBCAST_prio_4;
+	__be64 RBCAST_prio_5;
+	__be64 RBCAST_prio_6;
+	__be64 RBCAST_prio_7;
+	__be64 RBCAST_novlan;
+	/* Received multicast frames with good CRC */
+	__be64 MCAST_prio_0;
+	__be64 MCAST_prio_1;
+	__be64 MCAST_prio_2;
+	__be64 MCAST_prio_3;
+	__be64 MCAST_prio_4;
+	__be64 MCAST_prio_5;
+	__be64 MCAST_prio_6;
+	__be64 MCAST_prio_7;
+	__be64 MCAST_novlan;
+	/* Received unicast not short or GIANT frames with good CRC */
+	__be64 RTOTG_prio_0;
+	__be64 RTOTG_prio_1;
+	__be64 RTOTG_prio_2;
+	__be64 RTOTG_prio_3;
+	__be64 RTOTG_prio_4;
+	__be64 RTOTG_prio_5;
+	__be64 RTOTG_prio_6;
+	__be64 RTOTG_prio_7;
+	__be64 RTOTG_novlan;
+
+	/* Count of total octets of received frames, includes framing characters */
+	__be64 RTTLOCT_prio_0;
+	/* Count of total octets of received frames, not including framing
+	   characters */
+	__be64 RTTLOCT_NOFRM_prio_0;
+	/* Count of Total number of octets received
+	   (only for frames without errors) */
+	__be64 ROCT_prio_0;
+
+	__be64 RTTLOCT_prio_1;
+	__be64 RTTLOCT_NOFRM_prio_1;
+	__be64 ROCT_prio_1;
+
+	__be64 RTTLOCT_prio_2;
+	__be64 RTTLOCT_NOFRM_prio_2;
+	__be64 ROCT_prio_2;
+
+	__be64 RTTLOCT_prio_3;
+	__be64 RTTLOCT_NOFRM_prio_3;
+	__be64 ROCT_prio_3;
+
+	__be64 RTTLOCT_prio_4;
+	__be64 RTTLOCT_NOFRM_prio_4;
+	__be64 ROCT_prio_4;
+
+	__be64 RTTLOCT_prio_5;
+	__be64 RTTLOCT_NOFRM_prio_5;
+	__be64 ROCT_prio_5;
+
+	__be64 RTTLOCT_prio_6;
+	__be64 RTTLOCT_NOFRM_prio_6;
+	__be64 ROCT_prio_6;
+
+	__be64 RTTLOCT_prio_7;
+	__be64 RTTLOCT_NOFRM_prio_7;
+	__be64 ROCT_prio_7;
+
+	__be64 RTTLOCT_novlan;
+	__be64 RTTLOCT_NOFRM_novlan;
+	__be64 ROCT_novlan;
+
+	/* Count of Total received frames including bad frames */
+	__be64 RTOT_prio_0;
+	/* Count of  Total number of received frames with 802.1Q encapsulation */
+	__be64 R1Q_prio_0;
+	__be64 reserved1;
+
+	__be64 RTOT_prio_1;
+	__be64 R1Q_prio_1;
+	__be64 reserved2;
+
+	__be64 RTOT_prio_2;
+	__be64 R1Q_prio_2;
+	__be64 reserved3;
+
+	__be64 RTOT_prio_3;
+	__be64 R1Q_prio_3;
+	__be64 reserved4;
+
+	__be64 RTOT_prio_4;
+	__be64 R1Q_prio_4;
+	__be64 reserved5;
+
+	__be64 RTOT_prio_5;
+	__be64 R1Q_prio_5;
+	__be64 reserved6;
+
+	__be64 RTOT_prio_6;
+	__be64 R1Q_prio_6;
+	__be64 reserved7;
+
+	__be64 RTOT_prio_7;
+	__be64 R1Q_prio_7;
+	__be64 reserved8;
+
+	__be64 RTOT_novlan;
+	__be64 R1Q_novlan;
+	__be64 reserved9;
+
+	/* Total number of Successfully Received Control Frames */
+	__be64 RCNTL;
+	__be64 reserved10;
+	__be64 reserved11;
+	__be64 reserved12;
+	/* Count of received frames with a length/type field  value between 46
+	   (42 for VLANtagged frames) and 1500 (also 1500 for VLAN-tagged frames),
+	   inclusive */
+	__be64 RInRangeLengthErr;
+	/* Count of received frames with length/type field between 1501 and 1535
+	   decimal, inclusive */
+	__be64 ROutRangeLengthErr;
+	/* Count of received frames that are longer than max allowed size for
+	   802.3 frames (1518/1522) */
+	__be64 RFrmTooLong;
+	/* Count frames received with PCS error */
+	__be64 PCS;
+
+	/* Transmit frames with a length of 64 octets */
+	__be64 T64_prio_0;
+	__be64 T64_prio_1;
+	__be64 T64_prio_2;
+	__be64 T64_prio_3;
+	__be64 T64_prio_4;
+	__be64 T64_prio_5;
+	__be64 T64_prio_6;
+	__be64 T64_prio_7;
+	__be64 T64_novlan;
+	__be64 T64_loopbk;
+	/* Transmit frames with a length of 65 to 127 octets. */
+	__be64 T127_prio_0;
+	__be64 T127_prio_1;
+	__be64 T127_prio_2;
+	__be64 T127_prio_3;
+	__be64 T127_prio_4;
+	__be64 T127_prio_5;
+	__be64 T127_prio_6;
+	__be64 T127_prio_7;
+	__be64 T127_novlan;
+	__be64 T127_loopbk;
+	/* Transmit frames with a length of 128 to 255 octets */
+	__be64 T255_prio_0;
+	__be64 T255_prio_1;
+	__be64 T255_prio_2;
+	__be64 T255_prio_3;
+	__be64 T255_prio_4;
+	__be64 T255_prio_5;
+	__be64 T255_prio_6;
+	__be64 T255_prio_7;
+	__be64 T255_novlan;
+	__be64 T255_loopbk;
+	/* Transmit frames with a length of 256 to 511 octets */
+	__be64 T511_prio_0;
+	__be64 T511_prio_1;
+	__be64 T511_prio_2;
+	__be64 T511_prio_3;
+	__be64 T511_prio_4;
+	__be64 T511_prio_5;
+	__be64 T511_prio_6;
+	__be64 T511_prio_7;
+	__be64 T511_novlan;
+	__be64 T511_loopbk;
+	/* Transmit frames with a length of 512 to 1023 octets */
+	__be64 T1023_prio_0;
+	__be64 T1023_prio_1;
+	__be64 T1023_prio_2;
+	__be64 T1023_prio_3;
+	__be64 T1023_prio_4;
+	__be64 T1023_prio_5;
+	__be64 T1023_prio_6;
+	__be64 T1023_prio_7;
+	__be64 T1023_novlan;
+	__be64 T1023_loopbk;
+	/* Transmit frames with a length of 1024 to 1518 octets */
+	__be64 T1518_prio_0;
+	__be64 T1518_prio_1;
+	__be64 T1518_prio_2;
+	__be64 T1518_prio_3;
+	__be64 T1518_prio_4;
+	__be64 T1518_prio_5;
+	__be64 T1518_prio_6;
+	__be64 T1518_prio_7;
+	__be64 T1518_novlan;
+	__be64 T1518_loopbk;
+	/* Counts transmit frames with a length of 1519 to 1522 bytes */
+	__be64 T1522_prio_0;
+	__be64 T1522_prio_1;
+	__be64 T1522_prio_2;
+	__be64 T1522_prio_3;
+	__be64 T1522_prio_4;
+	__be64 T1522_prio_5;
+	__be64 T1522_prio_6;
+	__be64 T1522_prio_7;
+	__be64 T1522_novlan;
+	__be64 T1522_loopbk;
+	/* Transmit frames with a length of 1523 to 1548 octets */
+	__be64 T1548_prio_0;
+	__be64 T1548_prio_1;
+	__be64 T1548_prio_2;
+	__be64 T1548_prio_3;
+	__be64 T1548_prio_4;
+	__be64 T1548_prio_5;
+	__be64 T1548_prio_6;
+	__be64 T1548_prio_7;
+	__be64 T1548_novlan;
+	__be64 T1548_loopbk;
+	/* Counts transmit frames with a length of 1549 to MTU bytes */
+	__be64 T2MTU_prio_0;
+	__be64 T2MTU_prio_1;
+	__be64 T2MTU_prio_2;
+	__be64 T2MTU_prio_3;
+	__be64 T2MTU_prio_4;
+	__be64 T2MTU_prio_5;
+	__be64 T2MTU_prio_6;
+	__be64 T2MTU_prio_7;
+	__be64 T2MTU_novlan;
+	__be64 T2MTU_loopbk;
+	/* Transmit frames with a length greater than MTU octets and a good CRC. */
+	__be64 TGIANT_prio_0;
+	__be64 TGIANT_prio_1;
+	__be64 TGIANT_prio_2;
+	__be64 TGIANT_prio_3;
+	__be64 TGIANT_prio_4;
+	__be64 TGIANT_prio_5;
+	__be64 TGIANT_prio_6;
+	__be64 TGIANT_prio_7;
+	__be64 TGIANT_novlan;
+	__be64 TGIANT_loopbk;
+	/* Transmit broadcast frames with a good CRC */
+	__be64 TBCAST_prio_0;
+	__be64 TBCAST_prio_1;
+	__be64 TBCAST_prio_2;
+	__be64 TBCAST_prio_3;
+	__be64 TBCAST_prio_4;
+	__be64 TBCAST_prio_5;
+	__be64 TBCAST_prio_6;
+	__be64 TBCAST_prio_7;
+	__be64 TBCAST_novlan;
+	__be64 TBCAST_loopbk;
+	/* Transmit multicast frames with a good CRC */
+	__be64 TMCAST_prio_0;
+	__be64 TMCAST_prio_1;
+	__be64 TMCAST_prio_2;
+	__be64 TMCAST_prio_3;
+	__be64 TMCAST_prio_4;
+	__be64 TMCAST_prio_5;
+	__be64 TMCAST_prio_6;
+	__be64 TMCAST_prio_7;
+	__be64 TMCAST_novlan;
+	__be64 TMCAST_loopbk;
+	/* Transmit good frames that are neither broadcast nor multicast */
+	__be64 TTOTG_prio_0;
+	__be64 TTOTG_prio_1;
+	__be64 TTOTG_prio_2;
+	__be64 TTOTG_prio_3;
+	__be64 TTOTG_prio_4;
+	__be64 TTOTG_prio_5;
+	__be64 TTOTG_prio_6;
+	__be64 TTOTG_prio_7;
+	__be64 TTOTG_novlan;
+	__be64 TTOTG_loopbk;
+
+	/* total octets of transmitted frames, including framing characters */
+	__be64 TTTLOCT_prio_0;
+	/* total octets of transmitted frames, not including framing characters */
+	__be64 TTTLOCT_NOFRM_prio_0;
+	/* ifOutOctets */
+	__be64 TOCT_prio_0;
+
+	__be64 TTTLOCT_prio_1;
+	__be64 TTTLOCT_NOFRM_prio_1;
+	__be64 TOCT_prio_1;
+
+	__be64 TTTLOCT_prio_2;
+	__be64 TTTLOCT_NOFRM_prio_2;
+	__be64 TOCT_prio_2;
+
+	__be64 TTTLOCT_prio_3;
+	__be64 TTTLOCT_NOFRM_prio_3;
+	__be64 TOCT_prio_3;
+
+	__be64 TTTLOCT_prio_4;
+	__be64 TTTLOCT_NOFRM_prio_4;
+	__be64 TOCT_prio_4;
+
+	__be64 TTTLOCT_prio_5;
+	__be64 TTTLOCT_NOFRM_prio_5;
+	__be64 TOCT_prio_5;
+
+	__be64 TTTLOCT_prio_6;
+	__be64 TTTLOCT_NOFRM_prio_6;
+	__be64 TOCT_prio_6;
+
+	__be64 TTTLOCT_prio_7;
+	__be64 TTTLOCT_NOFRM_prio_7;
+	__be64 TOCT_prio_7;
+
+	__be64 TTTLOCT_novlan;
+	__be64 TTTLOCT_NOFRM_novlan;
+	__be64 TOCT_novlan;
+
+	__be64 TTTLOCT_loopbk;
+	__be64 TTTLOCT_NOFRM_loopbk;
+	__be64 TOCT_loopbk;
+
+	/* Total frames transmitted with a good CRC that are not aborted  */
+	__be64 TTOT_prio_0;
+	/* Total number of frames transmitted with 802.1Q encapsulation */
+	__be64 T1Q_prio_0;
+	__be64 reserved13;
+
+	__be64 TTOT_prio_1;
+	__be64 T1Q_prio_1;
+	__be64 reserved14;
+
+	__be64 TTOT_prio_2;
+	__be64 T1Q_prio_2;
+	__be64 reserved15;
+
+	__be64 TTOT_prio_3;
+	__be64 T1Q_prio_3;
+	__be64 reserved16;
+
+	__be64 TTOT_prio_4;
+	__be64 T1Q_prio_4;
+	__be64 reserved17;
+
+	__be64 TTOT_prio_5;
+	__be64 T1Q_prio_5;
+	__be64 reserved18;
+
+	__be64 TTOT_prio_6;
+	__be64 T1Q_prio_6;
+	__be64 reserved19;
+
+	__be64 TTOT_prio_7;
+	__be64 T1Q_prio_7;
+	__be64 reserved20;
+
+	__be64 TTOT_novlan;
+	__be64 T1Q_novlan;
+	__be64 reserved21;
+
+	__be64 TTOT_loopbk;
+	__be64 T1Q_loopbk;
+	__be64 reserved22;
+
+	/* Received frames with a length greater than MTU octets and a bad CRC */
+	__be32 RJBBR;
+	/* Received frames with a bad CRC that are not runts, jabbers,
+	   or alignment errors */
+	__be32 RCRC;
+	/* Received frames with SFD with a length of less than 64 octets and a
+	   bad CRC */
+	__be32 RRUNT;
+	/* Received frames with a length less than 64 octets and a good CRC */
+	__be32 RSHORT;
+	/* Total Number of Received Packets Dropped */
+	__be32 RDROP;
+	/* Drop due to overflow  */
+	__be32 RdropOvflw;
+	/* NOTE(review): the original comment here repeated "Drop due to
+	   overflow"; by its name this is presumably drops due to frame
+	   length -- confirm against the device documentation */
+	__be32 RdropLength;
+	/* Total of good frames. Does not include frames received with
+	   frame-too-long, FCS, or length errors */
+	__be32 RTOTFRMS;
+	/* Total dropped Xmited packets */
+	__be32 TDROP;
+};
+
+enum mlx4_query_reply mlx4_en_query(void *endev_ptr, void *int_dev);
+
+#endif
diff --git a/sys/ofed/drivers/net/mlx4/en_resources.c b/sys/ofed/drivers/net/mlx4/en_resources.c
new file mode 100644
index 0000000..a147153
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/en_resources.c
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/vmalloc.h>
+#include <linux/mlx4/qp.h>
+
+#include "mlx4_en.h"
+
+/*
+ * Fill @context for one of the driver's QPs.
+ *
+ * @context is zeroed first.  A TX QP (@is_tx != 0) encodes @size and
+ * @stride into sq_size_stride; any other QP gets an sq_size_stride derived
+ * from TXBB_SIZE and, unless @rss is set, encodes @size and @stride into
+ * rq_size_stride.  @cqn is used for both the send and the receive CQ, and
+ * the doorbell record address comes from priv->res.db.dma.
+ */
+void mlx4_en_fill_qp_context(struct mlx4_en_priv *priv, int size, int stride,
+			     int is_tx, int rss, int qpn, int cqn,
+			     struct mlx4_qp_context *context)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+
+	memset(context, 0, sizeof(*context));
+
+	context->flags = cpu_to_be32(7 << 16 | rss << 13);
+	context->pd = cpu_to_be32(mdev->priv_pdn);
+	context->mtu_msgmax = 0xff;
+
+	if (is_tx) {
+		context->sq_size_stride = ilog2(size) << 3 | (ilog2(stride) - 4);
+	} else {
+		context->sq_size_stride = ilog2(TXBB_SIZE) - 4;
+		if (!rss)
+			context->rq_size_stride =
+			    ilog2(size) << 3 | (ilog2(stride) - 4);
+	}
+
+	context->usr_page = cpu_to_be32(mdev->priv_uar.index);
+	context->local_qpn = cpu_to_be32(qpn);
+
+	context->pri_path.ackto = 1 & 0x07;
+	context->pri_path.sched_queue = 0x83 | (priv->port - 1) << 6;
+	context->pri_path.counter_index = 0xff;
+
+	context->cqn_send = cpu_to_be32(cqn);
+	context->cqn_recv = cpu_to_be32(cqn);
+	context->db_rec_addr = cpu_to_be64(priv->res.db.dma << 2);
+}
+
+
+/*
+ * Give a multi-chunk @buf a single virtually contiguous mapping.
+ *
+ * Does nothing when @buf consists of one chunk or buf->direct.buf is
+ * already set.  Otherwise the pages behind buf->page_list[] are vmap()ed
+ * and the resulting address is stored in buf->direct.buf.
+ *
+ * Returns 0 on success or when nothing had to be done, -ENOMEM if the
+ * temporary page array or the mapping could not be allocated.
+ */
+int mlx4_en_map_buffer(struct mlx4_buf *buf)
+{
+	struct page **page_vec;
+	int idx;
+
+	if (buf->nbufs == 1 || buf->direct.buf != NULL)
+		return 0;
+
+	page_vec = kmalloc(buf->nbufs * sizeof(*page_vec), GFP_KERNEL);
+	if (page_vec == NULL)
+		return -ENOMEM;
+
+	for (idx = 0; idx < buf->nbufs; idx++)
+		page_vec[idx] = virt_to_page(buf->page_list[idx].buf);
+
+	/* The page array is only needed while vmap() runs. */
+	buf->direct.buf = vmap(page_vec, buf->nbufs, VM_MAP, PAGE_KERNEL);
+	kfree(page_vec);
+
+	return buf->direct.buf != NULL ? 0 : -ENOMEM;
+}
+
+/*
+ * Counterpart of mlx4_en_map_buffer().
+ *
+ * NOTE(review): the guard below returns whenever buf->direct.buf is
+ * non-NULL, so vunmap() is only ever reached with a NULL pointer and a
+ * mapping created by mlx4_en_map_buffer() is never released here.
+ * Presumably the mapping is torn down when the buffer itself is freed;
+ * confirm against mlx4_buf_free() before changing this condition.
+ */
+void mlx4_en_unmap_buffer(struct mlx4_buf *buf)
+{
+	if (buf->direct.buf != NULL || buf->nbufs == 1)
+		return;
+
+	vunmap(buf->direct.buf);
+	buf->direct.buf = NULL;
+}
+
+/* No-op QP event handler: both arguments are ignored. */
+void mlx4_en_sqp_event(struct mlx4_qp *qp, enum mlx4_event event)
+{
+}
+
diff --git a/sys/ofed/drivers/net/mlx4/en_rx.c b/sys/ofed/drivers/net/mlx4/en_rx.c
new file mode 100644
index 0000000..fc4345f
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/en_rx.c
@@ -0,0 +1,1006 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include "mlx4_en.h"
+
+#include <linux/mlx4/cq.h>
+#include <linux/mlx4/qp.h>
+
+#include <net/ethernet.h>
+#include <net/if_vlan_var.h>
+#include <sys/mbuf.h>
+
+/*
+ * File-local Rx constant.  MIN_RX_ARM is not referenced anywhere else in
+ * this file.
+ * NOTE(review): presumably a leftover arming threshold from the Linux
+ * driver this code was ported from - confirm before relying on it.
+ */
+enum {
+	MIN_RX_ARM = 1024,
+};
+
+/*
+ * Allocate and DMA-map the mbuf backing fragment 'i' of an Rx descriptor.
+ *
+ * Fragment 0 is allocated with a packet header (M_PKTHDR); the others
+ * are plain data mbufs.  On success the bus address is written to
+ * rx_desc->data[i].addr and the mbuf is stored in mb_list[i].
+ *
+ * Returns 0 on success; on allocation failure bumps
+ * port_stats.rx_alloc_failed and returns -ENOMEM, leaving the descriptor
+ * and mb_list untouched.
+ */
+static int mlx4_en_alloc_buf(struct mlx4_en_priv *priv,
+			     struct mlx4_en_rx_desc *rx_desc,
+			     struct mbuf **mb_list,
+			     int i)
+{
+	struct mlx4_en_frag_info *frag_info = &priv->frag_info[i];
+	struct mbuf *mb;
+	dma_addr_t dma;
+
+	mb = m_getjcl(M_NOWAIT, MT_DATA, (i == 0) ? M_PKTHDR : 0,
+		      frag_info->frag_size);
+	if (mb == NULL) {
+		priv->port_stats.rx_alloc_failed++;
+		return -ENOMEM;
+	}
+
+	dma = pci_map_single(priv->mdev->pdev, mb->m_data,
+			     frag_info->frag_size, PCI_DMA_FROMDEVICE);
+	rx_desc->data[i].addr = cpu_to_be64(dma);
+	mb_list[i] = mb;
+
+	return 0;
+}
+
+/*
+ * Initialise the static part (byte count and memory key) of descriptor
+ * 'index' for rings that use one mbuf per descriptor.  The buffer
+ * address is filled in later, when an mbuf is attached.
+ */
+static void
+mlx4_en_init_rx_desc_mb(struct mlx4_en_priv *priv,
+			 struct mlx4_en_rx_ring *ring, int index)
+{
+	struct mlx4_en_rx_desc *desc = ring->buf + ring->stride * index;
+
+	desc->data[0].byte_count = cpu_to_be32(priv->rx_mb_size);
+	desc->data[0].lkey = cpu_to_be32(priv->mdev->mr.key);
+}
+
+/*
+ * Initialise the static part of descriptor 'index' for rings that
+ * scatter a packet over several fragments.
+ *
+ * The first priv->num_frags data segments get their fragment size and
+ * the device memory key.  Whatever segments remain in the descriptor
+ * stride are marked as padding: zero length, zero address and the
+ * MLX4_EN_MEMTYPE_PAD key.
+ */
+static void mlx4_en_init_rx_desc(struct mlx4_en_priv *priv,
+				 struct mlx4_en_rx_ring *ring, int index)
+{
+	struct mlx4_en_rx_desc *rx_desc = ring->buf + ring->stride * index;
+	int possible_frags;
+	int seg;
+
+	/* Segments that will carry data. */
+	for (seg = 0; seg < priv->num_frags; seg++) {
+		rx_desc->data[seg].byte_count =
+			cpu_to_be32(priv->frag_info[seg].frag_size);
+		rx_desc->data[seg].lkey = cpu_to_be32(priv->mdev->mr.key);
+	}
+
+	/* Unused tail of the stride: pad segments. */
+	possible_frags = (ring->stride - sizeof(struct mlx4_en_rx_desc)) / DS_SIZE;
+	for (; seg < possible_frags; seg++) {
+		rx_desc->data[seg].byte_count = 0;
+		rx_desc->data[seg].lkey = cpu_to_be32(MLX4_EN_MEMTYPE_PAD);
+		rx_desc->data[seg].addr = 0;
+	}
+}
+
+/*
+ * Attach a fresh packet-header mbuf of priv->rx_mb_size bytes to a
+ * single-buffer Rx descriptor.
+ *
+ * When 'unmap' is non-zero the buffer currently described by rx_desc is
+ * DMA-unmapped first (the caller keeps ownership of that mbuf).  The new
+ * mbuf is returned through *pmb and its bus address is written to the
+ * descriptor.
+ *
+ * Returns 0 on success.  If no mbuf is available, rx_alloc_failed is
+ * incremented and -ENOMEM is returned with the descriptor, its mapping
+ * and *pmb left as they were.
+ */
+static int
+mlx4_en_alloc_rx_mb(struct mlx4_en_priv *priv,
+		     struct mlx4_en_rx_desc *rx_desc,
+		     struct mbuf **pmb, int unmap)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mbuf *fresh;
+	dma_addr_t bus_addr;
+	int len = priv->rx_mb_size;
+
+	fresh = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, len);
+	if (unlikely(fresh == NULL)) {
+		priv->port_stats.rx_alloc_failed++;
+		return -ENOMEM;
+	}
+
+	if (unmap) {
+		pci_unmap_single(mdev->pdev, be64_to_cpu(rx_desc->data->addr),
+				 be32_to_cpu(rx_desc->data->byte_count),
+				 PCI_DMA_FROMDEVICE);
+	}
+
+	bus_addr = pci_map_single(priv->mdev->pdev, fresh->m_data, len,
+				  DMA_FROM_DEVICE);
+	*pmb = fresh;
+	rx_desc->data->addr = cpu_to_be64(bus_addr);
+
+	return 0;
+}
+
+/*
+ * Give descriptor 'index' of a single-buffer ring its initial mbuf.
+ * rx_info is used as a flat array of mbuf pointers, one per descriptor.
+ * Returns the result of mlx4_en_alloc_rx_mb() (0 or -ENOMEM).
+ */
+static int
+mlx4_en_prepare_rx_desc_mb(struct mlx4_en_priv *priv,
+			    struct mlx4_en_rx_ring *ring, int index)
+{
+	struct mbuf **slot = (struct mbuf **) ring->rx_info + index;
+	struct mlx4_en_rx_desc *desc = ring->buf + (index * ring->stride);
+
+	/* Nothing is mapped yet, so no unmap is requested. */
+	return mlx4_en_alloc_rx_mb(priv, desc, slot, 0);
+}
+
+/*
+ * Give descriptor 'index' of a fragmented ring its initial set of
+ * priv->num_frags buffers.
+ *
+ * Returns 0 on success.  If any fragment cannot be allocated, the
+ * fragments obtained so far are DMA-unmapped and freed and -ENOMEM is
+ * returned.  The unmap mirrors mlx4_en_free_rx_desc(): every buffer
+ * handed out by mlx4_en_alloc_buf() carries a pci_map_single() mapping
+ * that must be released together with the mbuf.
+ */
+static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv *priv,
+				   struct mlx4_en_rx_ring *ring, int index)
+{
+	struct mlx4_en_rx_desc *rx_desc = ring->buf + (index * ring->stride);
+	struct mbuf **mb_list = ring->rx_info + (index << priv->log_rx_info);
+	int i;
+
+	for (i = 0; i < priv->num_frags; i++)
+		if (mlx4_en_alloc_buf(priv, rx_desc, mb_list, i))
+			goto err;
+
+	return 0;
+
+err:
+	/* Roll back fragments [0, i) in reverse order. */
+	while (i--) {
+		pci_unmap_single(priv->mdev->pdev,
+				 be64_to_cpu(rx_desc->data[i].addr),
+				 priv->frag_info[i].frag_size,
+				 PCI_DMA_FROMDEVICE);
+		m_free(mb_list[i]);
+	}
+	return -ENOMEM;
+}
+
+/*
+ * Publish the low 16 bits of the ring's producer counter in its
+ * doorbell record (stored big-endian).
+ */
+static inline void mlx4_en_update_rx_prod_db(struct mlx4_en_rx_ring *ring)
+{
+	u32 prod = ring->prod & 0xffff;
+
+	*ring->wqres.db.db = cpu_to_be32(prod);
+}
+
+/*
+ * DMA-unmap and free every buffer attached to descriptor 'index'.
+ *
+ * Single-buffer rings hold one mbuf of priv->rx_mb_size bytes per
+ * descriptor; fragmented rings hold priv->num_frags mbufs whose sizes
+ * come from priv->frag_info[].
+ */
+static void mlx4_en_free_rx_desc(struct mlx4_en_priv *priv,
+				 struct mlx4_en_rx_ring *ring,
+				 int index)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_rx_desc *rx_desc = ring->buf + (index << ring->log_stride);
+	struct mbuf **mb_list;
+	dma_addr_t dma;
+	int nr;
+
+	if (!ring->use_frags) {
+		/* rx_info is a flat array of mbuf pointers. */
+		mb_list = (struct mbuf **) ring->rx_info + index;
+		dma = be64_to_cpu(rx_desc->data->addr);
+		pci_unmap_single(mdev->pdev, dma, priv->rx_mb_size,
+				 PCI_DMA_FROMDEVICE);
+		m_free(*mb_list);
+		return;
+	}
+
+	mb_list = ring->rx_info + (index << priv->log_rx_info);
+	for (nr = 0; nr < priv->num_frags; nr++) {
+		en_dbg(DRV, priv, "Freeing fragment:%d\n", nr);
+		dma = be64_to_cpu(rx_desc->data[nr].addr);
+
+		en_dbg(DRV, priv, "Unmaping buffer at dma:0x%llx\n", (u64) dma);
+		pci_unmap_single(mdev->pdev, dma,
+				 priv->frag_info[nr].frag_size,
+				 PCI_DMA_FROMDEVICE);
+		m_free(mb_list[nr]);
+	}
+}
+
+/*
+ * Populate all Rx rings with buffers, one descriptor per ring per pass,
+ * so that a shortage hits every ring evenly.
+ *
+ * Returns -ENOMEM only if a ring could not get even its first buffer.
+ * If the shortage happens later, every ring is trimmed down to the
+ * largest power of two not exceeding the size reached by the ring that
+ * failed, the surplus buffers are released and 0 is returned.
+ */
+static int mlx4_en_fill_rx_buffers(struct mlx4_en_priv *priv)
+{
+	struct mlx4_en_rx_ring *ring;
+	int ring_ind;
+	int buf_ind;
+	int new_size;
+	int err;
+
+	for (buf_ind = 0; buf_ind < priv->prof->rx_ring_size; buf_ind++) {
+		for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) {
+			ring = &priv->rx_ring[ring_ind];
+
+			err = ring->use_frags ?
+			    mlx4_en_prepare_rx_desc(priv, ring,
+						    ring->actual_size) :
+			    mlx4_en_prepare_rx_desc_mb(priv, ring,
+						       ring->actual_size);
+			if (!err) {
+				ring->actual_size++;
+				ring->prod++;
+				continue;
+			}
+
+			if (ring->actual_size == 0) {
+				en_err(priv, "Failed to allocate enough rx buffers\n");
+				return -ENOMEM;
+			}
+
+			new_size = rounddown_pow_of_two(ring->actual_size);
+			en_warn(priv, "Only %d buffers allocated reducing ring size to %d\n",
+				ring->actual_size, new_size);
+			goto reduce_rings;
+		}
+	}
+	return 0;
+
+reduce_rings:
+	/* Drop the newest descriptors until every ring holds new_size. */
+	for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) {
+		ring = &priv->rx_ring[ring_ind];
+		for (; ring->actual_size > new_size; ) {
+			ring->actual_size--;
+			ring->prod--;
+			mlx4_en_free_rx_desc(priv, ring, ring->actual_size);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Release every buffer still posted on the ring, i.e. the descriptors
+ * between the consumer and producer counters.  On return cons == prod.
+ */
+static void mlx4_en_free_rx_buf(struct mlx4_en_priv *priv,
+				struct mlx4_en_rx_ring *ring)
+{
+	int slot;
+
+	en_dbg(DRV, priv, "Freeing Rx buf - cons:%d prod:%d\n",
+	       ring->cons, ring->prod);
+
+	/* More outstanding descriptors than the ring holds is a bug. */
+	BUG_ON((u32) (ring->prod - ring->cons) > ring->actual_size);
+
+	for (; ring->cons != ring->prod; ++ring->cons) {
+		slot = ring->cons & ring->size_mask;
+		en_dbg(DRV, priv, "Processing descriptor:%d\n", slot);
+		mlx4_en_free_rx_desc(priv, ring, slot);
+	}
+}
+
+
+/*
+ * Allocate the resources of one Rx ring: the rx_info bookkeeping array
+ * (mbuf pointers), the hardware work-queue buffer and a contiguous
+ * kernel mapping of that buffer.
+ *
+ * 'size' is the number of descriptors; size_mask is derived as size - 1.
+ * The stride is sized for the maximum number of fragments when the ring
+ * uses fragments, otherwise for a single data segment.  buf_size
+ * includes TXBB_SIZE bytes beyond size * stride.
+ *
+ * Returns 0 on success or a negative errno; on failure everything
+ * allocated here has been released and ring->rx_info is NULL.
+ */
+int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
+			   struct mlx4_en_rx_ring *ring, u32 size)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err;
+	int tmp;
+
+
+	ring->prod = 0;
+	ring->cons = 0;
+	ring->size = size;
+	ring->size_mask = size - 1;
+	ring->stride = roundup_pow_of_two(sizeof(struct mlx4_en_rx_desc) +
+					  DS_SIZE * (ring->use_frags ?
+						     MLX4_EN_MAX_RX_FRAGS : 1));
+	ring->log_stride = ffs(ring->stride) - 1;
+	ring->buf_size = ring->size * ring->stride + TXBB_SIZE;
+
+	/* Per-descriptor bookkeeping: one or several mbuf pointers. */
+	if (ring->use_frags)
+		tmp = size * roundup_pow_of_two(MLX4_EN_MAX_RX_FRAGS *
+						sizeof(struct mbuf *));
+	else
+		tmp = size * sizeof(struct mbuf *);
+
+	ring->rx_info = kmalloc(tmp, GFP_KERNEL);
+	if (!ring->rx_info) {
+		en_err(priv, "Failed allocating rx_info ring\n");
+		return -ENOMEM;
+	}
+	en_dbg(DRV, priv, "Allocated rx_info ring at addr:%p size:%d stride:%d (%d)\n",
+		 ring->rx_info, tmp, ring->stride, ring->log_stride);
+
+	err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres,
+				 ring->buf_size, 2 * PAGE_SIZE);
+	if (err)
+		goto err_ring;
+
+	err = mlx4_en_map_buffer(&ring->wqres.buf);
+	if (err) {
+		en_err(priv, "Failed to map RX buffer\n");
+		goto err_hwq;
+	}
+	ring->buf = ring->wqres.buf.direct.buf;
+
+	return 0;
+
+	/*
+	 * Mapping is the last step, so no error path needs to undo it; the
+	 * unreachable mlx4_en_unmap_buffer() call that used to sit here has
+	 * been dropped.
+	 */
+err_hwq:
+	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
+err_ring:
+	kfree(ring->rx_info);
+	ring->rx_info = NULL;
+	return err;
+}
+
+/*
+ * Bring all Rx rings of the port into a usable state: reset counters,
+ * fix up stride and buffer origin, clear and initialise every
+ * descriptor, set up LRO when enabled on the interface, and finally
+ * post receive buffers.
+ *
+ * Rings whose stride is at most TXBB_SIZE have ring->buf advanced by
+ * TXBB_SIZE here; mlx4_en_deactivate_rx_ring() moves it back.
+ *
+ * Returns 0 on success or the error from mlx4_en_fill_rx_buffers().  On
+ * failure each ring is returned to its pre-activation state (buffers
+ * released, LRO state freed, buffer origin restored) so that a later
+ * activation attempt does not shift ring->buf a second time.
+ */
+int mlx4_en_activate_rx_rings(struct mlx4_en_priv *priv)
+{
+	struct mlx4_en_rx_ring *ring;
+	int i;
+	int ring_ind;
+	int err;
+	int stride = roundup_pow_of_two(sizeof(struct mlx4_en_rx_desc) +
+					DS_SIZE * priv->num_frags);
+
+	for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) {
+		ring = &priv->rx_ring[ring_ind];
+
+		ring->prod = 0;
+		ring->cons = 0;
+		ring->actual_size = 0;
+		ring->cqn = priv->rx_cq[ring_ind].mcq.cqn;
+
+		/* Fragmented rings shrink the stride to the frags in use. */
+		if (ring->use_frags)
+			ring->stride = stride;
+		if (ring->stride <= TXBB_SIZE)
+			ring->buf += TXBB_SIZE;
+
+		ring->log_stride = ffs(ring->stride) - 1;
+		ring->buf_size = ring->size * ring->stride;
+
+		memset(ring->buf, 0, ring->buf_size);
+		mlx4_en_update_rx_prod_db(ring);
+
+		if (ring->use_frags) {
+			/* Initialize all descriptors */
+			for (i = 0; i < ring->size; i++)
+				mlx4_en_init_rx_desc(priv, ring, i);
+		} else {
+			for (i = 0; i < ring->size; i++)
+				mlx4_en_init_rx_desc_mb(priv, ring, i);
+		}
+		/* Configure lro mngr; LRO is switched off if it cannot start */
+		if (priv->dev->if_capenable & IFCAP_LRO) {
+			if (tcp_lro_init(&ring->lro))
+				priv->dev->if_capenable &= ~IFCAP_LRO;
+			else
+				ring->lro.ifp = priv->dev;
+		}
+	}
+	err = mlx4_en_fill_rx_buffers(priv);
+	if (err)
+		goto err_buffers;
+
+	for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) {
+		ring = &priv->rx_ring[ring_ind];
+
+		/* The fill step may have reduced the usable ring size. */
+		ring->size_mask = ring->actual_size - 1;
+		mlx4_en_update_rx_prod_db(ring);
+	}
+
+
+	return 0;
+
+err_buffers:
+	/* Same steps as mlx4_en_deactivate_rx_ring(), for every ring. */
+	for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) {
+		ring = &priv->rx_ring[ring_ind];
+
+		tcp_lro_free(&ring->lro);
+		mlx4_en_free_rx_buf(priv, ring);
+		if (ring->stride <= TXBB_SIZE)
+			ring->buf -= TXBB_SIZE;
+	}
+
+	return err;
+}
+
+/*
+ * Release what mlx4_en_create_rx_ring() allocated: the contiguous
+ * mapping, the hardware queue resources and the rx_info array.
+ * The queue is freed with a size of ring->buf_size + TXBB_SIZE.
+ */
+void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv,
+			     struct mlx4_en_rx_ring *ring)
+{
+	struct mlx4_dev *dev = priv->mdev->dev;
+
+	mlx4_en_unmap_buffer(&ring->wqres.buf);
+	mlx4_free_hwq_res(dev, &ring->wqres, ring->buf_size + TXBB_SIZE);
+
+	kfree(ring->rx_info);
+	ring->rx_info = NULL;
+}
+
+/*
+ * Undo mlx4_en_activate_rx_rings() for one ring: free the LRO state,
+ * release all posted buffers and, for rings with a stride of at most
+ * TXBB_SIZE, move ring->buf back to its original start.
+ */
+void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv,
+				struct mlx4_en_rx_ring *ring)
+{
+	tcp_lro_free(&ring->lro);
+	mlx4_en_free_rx_buf(priv, ring);
+
+	if (ring->stride > TXBB_SIZE)
+		return;
+	ring->buf -= TXBB_SIZE;
+}
+
+
+/*
+ * Detach the buffers of a completed descriptor that hold packet data,
+ * chain them into one mbuf chain headed by the original mb_list[0], and
+ * refill each used slot of the descriptor with a fresh buffer.
+ *
+ * 'length' is the received byte count.  Fragment nr is used when
+ * length exceeds its frag_prefix_size; the last used fragment has its
+ * m_len trimmed to the remaining bytes.
+ *
+ * Returns 0 on success.  Returns non-zero when the packet must be
+ * dropped: -EINVAL if no fragment holds data (length <= 0), -ENOMEM if
+ * a replacement buffer could not be allocated.  In the -ENOMEM case the
+ * already-detached fragments are freed, while every slot of the
+ * descriptor still owns a valid, mapped buffer and can be reused as is.
+ */
+static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv,
+				    struct mlx4_en_rx_desc *rx_desc,
+				    struct mbuf **mb_list,
+				    int length)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_frag_info *frag_info;
+	struct mbuf *head = mb_list[0];
+	struct mbuf *mb = head;
+	struct mbuf *next;
+	dma_addr_t dma;
+	int nr;
+
+	mb->m_pkthdr.len = length;
+	/* Collect used fragments while replacing them in the HW descriptors */
+	for (nr = 0; nr < priv->num_frags; nr++) {
+		frag_info = &priv->frag_info[nr];
+		if (length <= frag_info->frag_prefix_size)
+			break;
+		if (nr)
+			mb->m_next = mb_list[nr];
+		mb = mb_list[nr];
+		/*
+		 * frag_info already points at entry nr; indexing it with nr
+		 * again would read entry 2 * nr.
+		 */
+		mb->m_len = frag_info->frag_size;
+		dma = be64_to_cpu(rx_desc->data[nr].addr);
+
+		/* Allocate a replacement buffer; overwrites mb_list[nr] */
+		if (mlx4_en_alloc_buf(priv, rx_desc, mb_list, nr))
+			goto fail;
+
+		/* Unmap the buffer that was just detached */
+		pci_unmap_single(mdev->pdev, dma, frag_info->frag_size,
+				 PCI_DMA_FROMDEVICE);
+	}
+
+	/*
+	 * No fragment holds data: nothing was detached, so the descriptor
+	 * still owns all of its buffers.  Report failure rather than index
+	 * frag_info[-1] and hand a ring-owned mbuf to the caller.
+	 */
+	if (unlikely(nr == 0))
+		return -EINVAL;
+
+	/* Adjust size of last fragment to match actual length */
+	mb->m_len = length - priv->frag_info[nr - 1].frag_prefix_size;
+	mb->m_next = NULL;
+	return 0;
+
+fail:
+	/*
+	 * Fragments [0, nr) were detached and replaced; they are reachable
+	 * only through the chain starting at 'head' (mb_list[] now holds
+	 * their replacements, which must stay with the descriptor).  Free
+	 * exactly those nr mbufs.  Fragment nr itself was linked into the
+	 * chain but is still owned by the descriptor, so stop before it.
+	 */
+	mb = head;
+	while (nr > 0) {
+		nr--;
+		next = mb->m_next;
+		m_free(mb);
+		mb = next;
+	}
+	return -ENOMEM;
+}
+
+
+/*
+ * Turn a completed fragmented descriptor into an mbuf chain.
+ *
+ * The head of the chain is the mbuf stored in mb_list[0] on entry; it
+ * has to be remembered up front because completing the descriptor puts
+ * a replacement buffer into that slot.  Returns NULL when completion
+ * fails.
+ */
+static struct mbuf *mlx4_en_rx_mb(struct mlx4_en_priv *priv,
+				      struct mlx4_en_rx_desc *rx_desc,
+				      struct mbuf **mb_list,
+				      unsigned int length)
+{
+	struct mbuf *head = mb_list[0];
+
+	if (unlikely(mlx4_en_complete_rx_desc(priv, rx_desc, mb_list, length)))
+		head = NULL;
+
+	return head;
+}
+
+/*
+ * Decide whether the packet described by a CQE has to be dropped.
+ *
+ * Returns 1 when the CQE carries the error opcode (logged as an error
+ * together with both syndrome fields) or flags a bad FCS (logged at
+ * RX_ERR debug level); returns 0 otherwise.
+ */
+static inline int invalid_cqe(struct mlx4_en_priv *priv,
+			      struct mlx4_cqe *cqe)
+{
+	struct mlx4_err_cqe *err_cqe;
+	int drop = 0;
+
+	if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
+		     MLX4_CQE_OPCODE_ERROR)) {
+		err_cqe = (struct mlx4_err_cqe *) cqe;
+		en_err(priv, "CQE completed in error - vendor syndrom:%d syndrom:%d\n",
+			 err_cqe->vendor_err_syndrome,
+			 err_cqe->syndrome);
+		drop = 1;
+	} else if (unlikely(cqe->badfcs_enc & MLX4_CQE_BAD_FCS)) {
+		en_dbg(RX_ERR, priv, "Accepted frame with bad FCS\n");
+		drop = 1;
+	}
+
+	return drop;
+}
+
+/*
+ * Obtain the mbuf for a packet received on a single-buffer ring.
+ *
+ * Packets of at most SMALL_PACKET_SIZE bytes are copied into a freshly
+ * allocated header mbuf so the large ring buffer stays posted; larger
+ * packets are handed up in the ring buffer itself, which is replaced in
+ * the descriptor by a new one.
+ * NOTE(review): the copy assumes SMALL_PACKET_SIZE fits into the data
+ * area of a header mbuf - confirm against its definition.
+ *
+ * Returns the mbuf with m_len and m_pkthdr.len set to 'length', or NULL
+ * if no mbuf could be allocated (the descriptor then keeps its current
+ * buffer).
+ */
+static struct mbuf *
+mlx4_en_get_rx_mb(struct mlx4_en_priv *priv,
+		   struct mlx4_en_rx_desc *rx_desc,
+		   struct mbuf **pmb,
+		   unsigned int length)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mbuf *mb;
+	dma_addr_t dma;
+
+	if (length <= SMALL_PACKET_SIZE) {
+		/*
+		 * This runs from the Rx completion path, which must not
+		 * sleep: allocate with M_NOWAIT (M_WAITOK could block and
+		 * would make the NULL check below meaningless).
+		 */
+		mb = m_gethdr(M_NOWAIT, MT_DATA);
+		if (unlikely(mb == NULL))
+			return NULL;
+		/* We are copying all relevant data to the mb - temporarily
+		 * sync buffers for the copy */
+		dma = be64_to_cpu(rx_desc->data->addr);
+		dma_sync_single_range_for_cpu(&mdev->pdev->dev, dma, 0,
+					      length, DMA_FROM_DEVICE);
+		memcpy(mb->m_data, (*pmb)->m_data, length);
+		dma_sync_single_range_for_device(&mdev->pdev->dev, dma, 0,
+						 length, DMA_FROM_DEVICE);
+
+	} else {
+		/* Take the ring buffer; *pmb receives its replacement. */
+		mb = *pmb;
+		if (unlikely(mlx4_en_alloc_rx_mb(priv, rx_desc, pmb, 1)))
+			return NULL;
+	}
+
+	mb->m_len = length;
+	mb->m_pkthdr.len = length;
+	return mb;
+}
+
+/*
+ * Check a received frame against the self-test pattern: payload byte i,
+ * counted from the end of the Ethernet header, must equal (i & 0xff)
+ * for MLX4_LOOPBACK_TEST_PAYLOAD bytes.  On a full match
+ * priv->loopback_ok is set.  The mbuf is always consumed.
+ *
+ * Both sides of the comparison are forced to unsigned char so that
+ * pattern bytes of 0x80 and above still match on platforms where plain
+ * char is signed.
+ */
+static void validate_loopback(struct mlx4_en_priv *priv, struct mbuf *mb)
+{
+	int i;
+	int offset = ETHER_HDR_LEN;
+
+	for (i = 0; i < MLX4_LOOPBACK_TEST_PAYLOAD; i++, offset++) {
+		if ((unsigned char) *(mb->m_data + offset) !=
+		    (unsigned char) (i & 0xff))
+			goto out_loopback;
+	}
+	/* Loopback found */
+	priv->loopback_ok = 1;
+
+out_loopback:
+	m_freem(mb);
+}
+
+/*
+ * Poll an Rx completion queue whose ring uses one mbuf per descriptor.
+ *
+ * Processes completed CQEs until the queue has no more completed entries
+ * or 'budget' entries have been consumed.  Each good packet is tagged
+ * with flow id, receive interface, optional VLAN tag and checksum flags
+ * and passed to dev->if_input().  Invalid CQEs and packets for which no
+ * mbuf could be obtained are skipped but still consume their CQE.
+ *
+ * Returns the number of CQEs consumed; 0 if the port is not up.
+ */
+int mlx4_en_process_rx_cq_mb(struct net_device *dev,
+			      struct mlx4_en_cq *cq, int budget)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_cqe *cqe;
+	struct mlx4_en_rx_ring *ring = &priv->rx_ring[cq->ring];
+	struct mlx4_en_rx_desc *rx_desc;
+	struct mbuf **pmb;
+	struct mbuf *mb;
+	int index;
+	unsigned int length;
+	int polled = 0;
+
+	if (!priv->port_up)
+		return 0;
+
+	/* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx
+	 * descriptor offset can be deduced from the CQE index instead of
+	 * reading 'cqe->index' */
+	index = cq->mcq.cons_index & ring->size_mask;
+	cqe = &cq->buf[index];
+
+	/* Process all completed CQEs: loop while the CQE's owner bit and
+	 * the (cons_index & cq->size) bit agree */
+	while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK,
+		    cq->mcq.cons_index & cq->size)) {
+
+		pmb = (struct mbuf **) ring->rx_info + index;
+		rx_desc = ring->buf + (index << ring->log_stride);
+
+		/*
+		 * make sure we read the CQE after we read the ownership bit
+		 */
+		rmb();
+
+		if (invalid_cqe(priv, cqe))
+			goto next;
+
+		/*
+		 * Packet is OK - process it.
+		 */
+		length = be32_to_cpu(cqe->byte_cnt);
+
+		mb = mlx4_en_get_rx_mb(priv, rx_desc, pmb, length);
+		if (unlikely(!mb)){
+			ring->errors++;
+			goto next;
+		}
+
+		ring->bytes += length;
+		ring->packets++;
+
+		/* Self-test in progress: the packet is checked and freed
+		 * instead of being delivered */
+		if (unlikely(priv->validate_loopback)) {
+			validate_loopback(priv, mb);
+			goto next;
+		}
+		mb->m_pkthdr.flowid = cq->ring;
+		mb->m_flags |= M_FLOWID;
+		mb->m_pkthdr.rcvif = dev;
+		if (be32_to_cpu(cqe->vlan_my_qpn) &
+		    MLX4_CQE_VLAN_PRESENT_MASK) {
+			mb->m_pkthdr.ether_vtag = be16_to_cpu(cqe->sl_vid);
+			mb->m_flags |= M_VLANTAG;
+		}
+
+		if (likely(priv->rx_csum && cqe->checksum == 0xffff)) {
+			priv->port_stats.rx_chksum_good++;
+			mb->m_pkthdr.csum_flags =
+			    CSUM_IP_CHECKED | CSUM_IP_VALID |
+			    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
+			mb->m_pkthdr.csum_data = htons(0xffff);
+		} else {
+			priv->port_stats.rx_chksum_none++;
+			mb->m_pkthdr.csum_flags = 0;
+			/* With ip_reasm enabled, IPv4 packets go through
+			 * mlx4_en_rx_frags(); a zero return skips delivery
+			 * here.
+			 * NOTE(review): presumably the packet was then kept
+			 * for reassembly - confirm in mlx4_en_rx_frags() */
+			if (priv->mdev->profile.ip_reasm &&
+			    cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPV4) &&
+			    !mlx4_en_rx_frags(priv, ring, mb, cqe))
+				goto next;
+		}
+		/* Push it up the stack */
+		dev->if_input(dev, mb);
+
+next:
+		/* Every CQE, delivered or dropped, counts against budget */
+		++cq->mcq.cons_index;
+		index = (cq->mcq.cons_index) & ring->size_mask;
+		cqe = &cq->buf[index];
+		if (++polled == budget)
+			break;
+	}
+
+	/* Flush all pending IP reassembly sessions; this runs on every
+	 * exit from the loop, including when the budget ran out */
+	mlx4_en_flush_frags(priv, ring);
+
+	AVG_PERF_COUNTER(priv->pstats.rx_coal_avg, polled);
+	mlx4_cq_set_ci(&cq->mcq);
+	wmb(); /* ensure HW sees CQ consumer before we post new buffers */
+	ring->cons = cq->mcq.cons_index;
+	ring->prod += polled; /* Polled descriptors were reallocated in place */
+	mlx4_en_update_rx_prod_db(ring);
+	return polled;
+}
+
+/*
+ * Poll an Rx completion queue whose ring scatters packets over several
+ * fragments.
+ *
+ * Processes completed CQEs until the queue has no more completed entries
+ * or 'budget' entries have been consumed.  Good packets are tagged with
+ * flow id, receive interface, optional VLAN tag and checksum flags, then
+ * either absorbed by LRO or passed to dev->if_input().  Invalid CQEs and
+ * packets whose descriptor could not be completed are skipped but still
+ * consume their CQE.  All active LRO sessions are flushed before
+ * returning.
+ *
+ * Returns the number of CQEs consumed; 0 if the port is not up.
+ */
+int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int budget)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_cqe *cqe;
+	struct mlx4_en_rx_ring *ring = &priv->rx_ring[cq->ring];
+	struct mbuf **mb_list;
+	struct mlx4_en_rx_desc *rx_desc;
+	struct mbuf *mb;
+	struct lro_entry *queued;
+	int index;
+	unsigned int length;
+	int polled = 0;
+
+	if (!priv->port_up)
+		return 0;
+
+	/* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx
+	 * descriptor offset can be deduced from the CQE index instead of
+	 * reading 'cqe->index' */
+	index = cq->mcq.cons_index & ring->size_mask;
+	cqe = &cq->buf[index];
+
+	/* Process all completed CQEs: loop while the CQE's owner bit and
+	 * the (cons_index & cq->size) bit agree */
+	while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK,
+		    cq->mcq.cons_index & cq->size)) {
+
+		/* rx_info is addressed in bytes here: each descriptor owns
+		 * a block of (1 << log_rx_info) bytes of mbuf pointers */
+		mb_list = ring->rx_info + (index << priv->log_rx_info);
+		rx_desc = ring->buf + (index << ring->log_stride);
+
+		/*
+		 * make sure we read the CQE after we read the ownership bit
+		 */
+		rmb();
+
+		if (invalid_cqe(priv, cqe))
+			goto next;
+
+		/*
+		 * Packet is OK - process it.
+		 */
+		length = be32_to_cpu(cqe->byte_cnt);
+		mb = mlx4_en_rx_mb(priv, rx_desc, mb_list, length);
+		if (!mb) {
+			ring->errors++;
+			goto next;
+		}
+
+		ring->bytes += length;
+		ring->packets++;
+
+		/* Self-test in progress: the packet is checked and freed
+		 * instead of being delivered */
+                if (unlikely(priv->validate_loopback)) {
+			validate_loopback(priv, mb);
+			goto next;
+		}
+
+		mb->m_pkthdr.flowid = cq->ring;
+		mb->m_flags |= M_FLOWID;
+		mb->m_pkthdr.rcvif = dev;
+		if (be32_to_cpu(cqe->vlan_my_qpn) &
+		    MLX4_CQE_VLAN_PRESENT_MASK) {
+			mb->m_pkthdr.ether_vtag = be16_to_cpu(cqe->sl_vid);
+			mb->m_flags |= M_VLANTAG;
+		}
+		if (likely(priv->rx_csum) &&
+		    (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPOK)) &&
+		    (cqe->checksum == cpu_to_be16(0xffff))) {
+			priv->port_stats.rx_chksum_good++;
+			mb->m_pkthdr.csum_flags = 
+			    CSUM_IP_CHECKED | CSUM_IP_VALID |
+			    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
+			mb->m_pkthdr.csum_data = htons(0xffff);
+			/* This packet is eligible for LRO if it is:
+			 * - DIX Ethernet (type interpretation)
+			 * - TCP/IP (v4)
+			 * - without IP options
+			 * - not an IP fragment
+			 */
+			if (mlx4_en_can_lro(cqe->status) &&
+			    (dev->if_capenable & IFCAP_LRO)) {
+				/* A zero return from tcp_lro_rx() means LRO
+				 * took the packet */
+				if (ring->lro.lro_cnt != 0 &&
+				    tcp_lro_rx(&ring->lro, mb, 0) == 0)
+					goto next;
+			}
+
+			/* LRO not possible, complete processing here */
+			INC_PERF_COUNTER(priv->pstats.lro_misses);
+		} else {
+			mb->m_pkthdr.csum_flags = 0;
+			priv->port_stats.rx_chksum_none++;
+		}
+
+		/* Push it up the stack */
+		dev->if_input(dev, mb);
+
+next:
+		/* Every CQE, delivered or dropped, counts against budget */
+		++cq->mcq.cons_index;
+		index = (cq->mcq.cons_index) & ring->size_mask;
+		cqe = &cq->buf[index];
+		if (++polled == budget)
+			break;
+	}
+	/* Flush every LRO session that is still active */
+	while ((queued = SLIST_FIRST(&ring->lro.lro_active)) != NULL) {
+		SLIST_REMOVE_HEAD(&ring->lro.lro_active, next);
+		tcp_lro_flush(&ring->lro, queued);
+	}
+	AVG_PERF_COUNTER(priv->pstats.rx_coal_avg, polled);
+	mlx4_cq_set_ci(&cq->mcq);
+	wmb(); /* ensure HW sees CQ consumer before we post new buffers */
+	ring->cons = cq->mcq.cons_index;
+	ring->prod += polled; /* Polled descriptors were reallocated in place */
+	mlx4_en_update_rx_prod_db(ring);
+	return polled;
+}
+
+
+/*
+ * Poll one Rx CQ with the handler matching its ring type (fragmented or
+ * single-mbuf) and add the result to the CQ's running total.  Used by
+ * both the interrupt handler and the taskqueue handler below.
+ * Returns the number of completions processed.
+ */
+static int mlx4_en_poll_rx_cq(struct mlx4_en_cq *cq, int budget)
+{
+	struct net_device *dev = cq->dev;
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int done;
+
+	done = priv->rx_ring[cq->ring].use_frags ?
+	    mlx4_en_process_rx_cq(dev, cq, budget) :
+	    mlx4_en_process_rx_cq_mb(dev, cq, budget);
+
+	cq->tot_rx += done;
+
+	return done;
+}
+
+/*
+ * Taskqueue handler for an Rx CQ ('context' is the mlx4_en_cq).
+ * Keeps polling in MLX4_EN_MAX_RX_POLL sized batches for as long as a
+ * batch comes back full, then re-arms the CQ.
+ */
+void mlx4_en_rx_que(void *context, int pending)
+{
+	struct mlx4_en_cq *cq = context;
+	int done;
+
+	do {
+		done = mlx4_en_poll_rx_cq(cq, MLX4_EN_MAX_RX_POLL);
+	} while (done == MLX4_EN_MAX_RX_POLL);
+
+	mlx4_en_arm_cq(cq->dev->if_softc, cq);
+}
+
+/*
+ * Completion callback for an Rx CQ.  Polls one batch directly; if the
+ * batch was full the remaining work is deferred to the CQ's taskqueue,
+ * otherwise the CQ is re-armed right away.
+ */
+void mlx4_en_rx_irq(struct mlx4_cq *mcq)
+{
+	struct mlx4_en_cq *cq = container_of(mcq, struct mlx4_en_cq, mcq);
+	struct mlx4_en_priv *priv = netdev_priv(cq->dev);
+
+	if (mlx4_en_poll_rx_cq(cq, MLX4_EN_MAX_RX_POLL) != MLX4_EN_MAX_RX_POLL) {
+		mlx4_en_arm_cq(priv, cq);
+		return;
+	}
+
+	taskqueue_enqueue(cq->tq, &cq->cq_task);
+}
+
+
+/*
+ * Buffer size for each Rx fragment slot, in slot order.  The table has
+ * exactly MLX4_EN_MAX_RX_FRAGS entries; only 2 and 3 are supported.
+ */
+#if MLX4_EN_MAX_RX_FRAGS != 2 && MLX4_EN_MAX_RX_FRAGS != 3
+#error "Unknown MAX_RX_FRAGS"
+#endif
+static int frag_sizes[] = {
+	FRAG_SZ0,
+	FRAG_SZ1,
+#if MLX4_EN_MAX_RX_FRAGS == 3
+	FRAG_SZ2,
+#endif
+};
+
+/*
+ * Derive the Rx buffer layout from the interface MTU.
+ *
+ * The effective MTU is the interface MTU plus Ethernet header, VLAN
+ * encapsulation and LLC/SNAP overhead.  Two layouts are computed:
+ *
+ *  - the scatter list for fragmented rings: priv->frag_info[] (size and
+ *    prefix offset of each fragment), priv->num_frags and
+ *    priv->log_rx_info;
+ *  - priv->rx_mb_size, the cluster size used by single-mbuf rings.
+ *
+ * The scatter list never uses more than MLX4_EN_MAX_RX_FRAGS fragments,
+ * which is the number of entries in frag_sizes[].
+ * NOTE(review): priv->frag_info[] is assumed to have that many entries
+ * as well - confirm against its declaration.
+ * If the effective MTU does not fit, a warning is logged and the list
+ * is left short instead of running past the tables.
+ */
+void mlx4_en_calc_rx_buf(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int eff_mtu = dev->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + ETH_LLC_SNAP_SIZE;
+	int buf_size = 0;
+	int i, frag;
+
+	for (i = 0, frag = 0;
+	     frag < MLX4_EN_MAX_RX_FRAGS && buf_size < eff_mtu;
+	     frag++, i++) {
+		/*
+		 * Allocate small to large but only as much as is needed for
+		 * the tail.  i never exceeds frag, so the bound on frag
+		 * also keeps frag_sizes[i] inside the table.
+		 */
+		while (i > 0 && eff_mtu - buf_size <= frag_sizes[i - 1])
+			i--;
+		priv->frag_info[frag].frag_size = frag_sizes[i];
+		priv->frag_info[frag].frag_prefix_size = buf_size;
+		buf_size += priv->frag_info[frag].frag_size;
+	}
+	if (buf_size < eff_mtu)
+		en_warn(priv, "Effective mtu %d exceeds the %d bytes of "
+			      "Rx scatter buffers\n", eff_mtu, buf_size);
+
+	priv->num_frags = frag;
+	/*
+	 * For use_frags == 0 calculate the size extbuf we require.
+	 */
+	if (eff_mtu <= MCLBYTES)
+		priv->rx_mb_size = MCLBYTES;
+	else if (eff_mtu <= MJUMPAGESIZE)
+		priv->rx_mb_size = MJUMPAGESIZE;
+	else if (eff_mtu <= MJUM9BYTES)
+		priv->rx_mb_size = MJUM9BYTES;
+	else
+		priv->rx_mb_size = MJUM16BYTES;
+	priv->log_rx_info =
+	    ROUNDUP_LOG2(priv->num_frags * sizeof(struct mbuf *));
+
+	en_dbg(DRV, priv, "Rx buffer scatter-list (effective-mtu:%d "
+		  "num_frags:%d):\n", eff_mtu, priv->num_frags);
+	for (i = 0; i < priv->num_frags; i++) {
+		en_dbg(DRV, priv, "  frag:%d - size:%d prefix:%d\n", i,
+				priv->frag_info[i].frag_size,
+				priv->frag_info[i].frag_prefix_size);
+	}
+}
+
+/* RSS related functions */
+
+/*
+ * Allocate the QP with number 'qpn' for one Rx ring and move it to the
+ * ready state.
+ *
+ * The QP context is built in a temporary heap buffer which is released
+ * before returning.  The context's doorbell record address is taken
+ * from the ring's own doorbell.  If the QP cannot be brought to ready it
+ * is removed and freed again.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int mlx4_en_config_rss_qp(struct mlx4_en_priv *priv, int qpn,
+				 struct mlx4_en_rx_ring *ring,
+				 enum mlx4_qp_state *state,
+				 struct mlx4_qp *qp)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_qp_context *ctx;
+	int err = 0;
+
+	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+	if (ctx == NULL) {
+		en_err(priv, "Failed to allocate qp context\n");
+		return -ENOMEM;
+	}
+
+	err = mlx4_qp_alloc(mdev->dev, qpn, qp);
+	if (err) {
+		en_err(priv, "Failed to allocate qp #%x\n", qpn);
+	} else {
+		qp->event = mlx4_en_sqp_event;
+
+		memset(ctx, 0, sizeof(*ctx));
+		mlx4_en_fill_qp_context(priv, ring->actual_size, ring->stride,
+					0, 0, qpn, ring->cqn, ctx);
+		ctx->db_rec_addr = cpu_to_be64(ring->wqres.db.dma);
+
+		err = mlx4_qp_to_ready(mdev->dev, &ring->wqres.mtt, ctx, qp,
+				       state);
+		if (err) {
+			mlx4_qp_remove(mdev->dev, qp);
+			mlx4_qp_free(mdev->dev, qp);
+		}
+		/* The doorbell is refreshed whether or not the QP came up. */
+		mlx4_en_update_rx_prod_db(ring);
+	}
+
+	kfree(ctx);
+	return err;
+}
+
+/*
+ * Allocate rx qp's and configure them according to rss map.
+ *
+ * Steps:
+ *  1. reserve a range of priv->rx_ring_num QP numbers (aligned to the
+ *     next power of two) and set up one QP per Rx ring;
+ *  2. reserve and allocate one more QP, the RSS indirection QP, and
+ *     bring it to the ready state with an RSS context describing the
+ *     ring QPs.
+ *
+ * Returns 0 on success or a negative errno.  On failure everything set
+ * up so far is torn down: the labels at the end unwind in reverse order
+ * of acquisition, and only the ring QPs counted in good_qps are reset
+ * and freed.
+ */
+int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_rss_map *rss_map = &priv->rss_map;
+	struct mlx4_qp_context context;
+	struct mlx4_en_rss_context *rss_context;
+	void *ptr;
+	/* RSS flags value; a wider mask is used when UDP rings are enabled */
+	u8 rss_mask = (priv->udp_rings > 1) ? 0x3f : 0x14;
+	int i, qpn;
+	int err = 0;
+	int good_qps = 0;
+
+	en_dbg(DRV, priv, "Configuring rss steering\n");
+	err = mlx4_qp_reserve_range(mdev->dev, priv->rx_ring_num,
+				    roundup_pow_of_two(priv->rx_ring_num),
+				    &rss_map->base_qpn);
+	if (err) {
+		en_err(priv, "Failed reserving %d qps\n", priv->rx_ring_num);
+		return err;
+	}
+
+	/* One QP per Rx ring, numbered consecutively from base_qpn */
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		qpn = rss_map->base_qpn + i;
+		err = mlx4_en_config_rss_qp(priv, qpn,
+					    &priv->rx_ring[i],
+					    &rss_map->state[i],
+					    &rss_map->qps[i]);
+		if (err)
+			goto rss_err;
+
+		++good_qps;
+	}
+
+	/* Configure RSS indirection qp */
+	err = mlx4_qp_reserve_range(mdev->dev, 1, 1, &priv->base_qpn);
+	if (err) {
+		en_err(priv, "Failed to reserve range for RSS "
+			     "indirection qp\n");
+		goto rss_err;
+	}
+	err = mlx4_qp_alloc(mdev->dev, priv->base_qpn, &rss_map->indir_qp);
+	if (err) {
+		en_err(priv, "Failed to allocate RSS indirection QP\n");
+		goto reserve_err;
+	}
+	rss_map->indir_qp.event = mlx4_en_sqp_event;
+	mlx4_en_fill_qp_context(priv, 0, 0, 0, 1, priv->base_qpn,
+				priv->rx_ring[0].cqn, &context);
+
+	/* The RSS context is overlaid on the QP context at byte offset 0x3c */
+	ptr = ((void *) &context) + 0x3c;
+	rss_context = (struct mlx4_en_rss_context *) ptr;
+	/* log2 of the number of non-UDP rings in the top byte, base QPN below */
+	rss_context->base_qpn = cpu_to_be32(ilog2(priv->rx_ring_num - priv->udp_rings) << 24 |
+					    (rss_map->base_qpn));
+	/* Default QP: the first QP after the non-UDP rings */
+	rss_context->default_qpn = cpu_to_be32(rss_map->base_qpn +
+					       priv->rx_ring_num -
+					       priv->udp_rings);
+	rss_context->flags = rss_mask;
+	if (priv->udp_rings > 1)
+		rss_context->base_qpn_udp = rss_context->default_qpn;
+
+	err = mlx4_qp_to_ready(mdev->dev, &priv->res.mtt, &context,
+			       &rss_map->indir_qp, &rss_map->indir_state);
+	if (err)
+		goto indir_err;
+
+	return 0;
+
+indir_err:
+	mlx4_qp_modify(mdev->dev, NULL, rss_map->indir_state,
+		       MLX4_QP_STATE_RST, NULL, 0, 0, &rss_map->indir_qp);
+	mlx4_qp_remove(mdev->dev, &rss_map->indir_qp);
+	mlx4_qp_free(mdev->dev, &rss_map->indir_qp);
+reserve_err:
+	mlx4_qp_release_range(mdev->dev, priv->base_qpn, 1);
+rss_err:
+	/* Reset and free only the ring QPs that were fully configured */
+	for (i = 0; i < good_qps; i++) {
+		mlx4_qp_modify(mdev->dev, NULL, rss_map->state[i],
+			       MLX4_QP_STATE_RST, NULL, 0, 0, &rss_map->qps[i]);
+		mlx4_qp_remove(mdev->dev, &rss_map->qps[i]);
+		mlx4_qp_free(mdev->dev, &rss_map->qps[i]);
+	}
+	mlx4_qp_release_range(mdev->dev, rss_map->base_qpn, priv->rx_ring_num);
+	return err;
+}
+
+/*
+ * Tear down what mlx4_en_config_rss_steer() set up: first the RSS
+ * indirection QP and its reserved number, then every per-ring QP and
+ * the reserved QP range.  Each QP is moved to the reset state, removed
+ * and freed.
+ */
+void mlx4_en_release_rss_steer(struct mlx4_en_priv *priv)
+{
+	struct mlx4_en_rss_map *rss_map = &priv->rss_map;
+	struct mlx4_dev *dev = priv->mdev->dev;
+	struct mlx4_qp *qp;
+	int ring;
+
+	/* Indirection QP */
+	qp = &rss_map->indir_qp;
+	mlx4_qp_modify(dev, NULL, rss_map->indir_state,
+		       MLX4_QP_STATE_RST, NULL, 0, 0, qp);
+	mlx4_qp_remove(dev, qp);
+	mlx4_qp_free(dev, qp);
+	mlx4_qp_release_range(dev, priv->base_qpn, 1);
+
+	/* Per-ring QPs */
+	for (ring = 0; ring < priv->rx_ring_num; ring++) {
+		qp = &rss_map->qps[ring];
+		mlx4_qp_modify(dev, NULL, rss_map->state[ring],
+			       MLX4_QP_STATE_RST, NULL, 0, 0, qp);
+		mlx4_qp_remove(dev, qp);
+		mlx4_qp_free(dev, qp);
+	}
+	mlx4_qp_release_range(dev, rss_map->base_qpn, priv->rx_ring_num);
+}
diff --git a/sys/ofed/drivers/net/mlx4/en_selftest.c b/sys/ofed/drivers/net/mlx4/en_selftest.c
new file mode 100644
index 0000000..0e62027
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/en_selftest.c
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include "mlx4_en.h"
+
+#include <linux/kernel.h>
+#include <linux/ethtool.h>
+#include <linux/netdevice.h>
+#include <linux/delay.h>
+#include <linux/mlx4/driver.h>
+
+
+/*
+ * Register self-test: issue the HW_HEALTH_CHECK firmware command and
+ * return its status.
+ */
+static int mlx4_en_test_registers(struct mlx4_en_priv *priv)
+{
+	struct mlx4_dev *dev = priv->mdev->dev;
+
+	return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_HW_HEALTH_CHECK,
+			MLX4_CMD_TIME_CLASS_A);
+}
+
+/*
+ * Build and transmit the loopback self-test frame.
+ *
+ * The frame is addressed to the interface's own MAC address, has an
+ * all-zero source address and the ARP ethertype, and carries
+ * MLX4_LOOPBACK_TEST_PAYLOAD bytes where byte i has the value
+ * (i & 0xff).
+ *
+ * Returns -ENOMEM if no buffer could be allocated, otherwise the result
+ * of mlx4_en_xmit().
+ */
+static int mlx4_en_test_loopback_xmit(struct mlx4_en_priv *priv)
+{
+	struct mbuf *mb;
+	struct ethhdr *ethh;
+	unsigned char *payload;
+	unsigned int payload_len = MLX4_LOOPBACK_TEST_PAYLOAD;
+	unsigned int pos;
+
+	/* build the pkt before xmit */
+	mb = netdev_alloc_mb(priv->dev, MLX4_LOOPBACK_TEST_PAYLOAD + ETH_HLEN + NET_IP_ALIGN);
+	if (mb == NULL) {
+		en_err(priv, "-LOOPBACK_TEST_XMIT- failed to create mb for xmit\n");
+		return -ENOMEM;
+	}
+	mb_reserve(mb, NET_IP_ALIGN);
+
+	/* Header first, payload right behind it. */
+	ethh = (struct ethhdr *)mb_put(mb, sizeof(struct ethhdr));
+	payload = (unsigned char *)mb_put(mb, payload_len);
+
+	memcpy(ethh->h_dest, priv->dev->dev_addr, ETH_ALEN);
+	memset(ethh->h_source, 0, ETH_ALEN);
+	ethh->h_proto = htons(ETH_P_ARP);
+	mb_set_mac_header(mb, 0);
+
+	for (pos = 0; pos < payload_len; pos++)
+		payload[pos] = (unsigned char)(pos & 0xff);
+
+	/* xmit the pkt */
+	return mlx4_en_xmit(mb, priv->dev);
+}
+
+/*
+ * Loopback self-test: send the test frame and wait for the Rx path to
+ * recognise it (it sets priv->loopback_ok).
+ *
+ * priv->validate_loopback is raised for the duration of the test and
+ * always cleared again.  The result is polled up to
+ * MLX4_EN_LOOPBACK_RETRIES times, sleeping MLX4_EN_LOOPBACK_TIMEOUT
+ * before each check.
+ *
+ * Returns 0 if the frame came back, 1 otherwise.
+ */
+static int mlx4_en_test_loopback(struct mlx4_en_priv *priv)
+{
+	u32 arrived = 0;
+	int attempt;
+
+	priv->loopback_ok = 0;
+	priv->validate_loopback = 1;
+
+	if (mlx4_en_test_loopback_xmit(priv)) {
+		en_err(priv, "Transmitting loopback packet failed\n");
+	} else {
+		/* polling for result */
+		for (attempt = 0;
+		     attempt < MLX4_EN_LOOPBACK_RETRIES && !arrived;
+		     ++attempt) {
+			msleep(MLX4_EN_LOOPBACK_TIMEOUT);
+			if (priv->loopback_ok)
+				arrived = 1;
+		}
+		if (!arrived)
+			en_err(priv, "Loopback packet didn't arrive\n");
+	}
+
+	priv->validate_loopback = 0;
+	return (!arrived);
+}
+
+
+/*
+ * Link self-test: refresh the port state and report the link.
+ * Returns 0 when link_state is 1, 1 for any other link state, and
+ * -ENOMEM if the port query fails.
+ */
+static int mlx4_en_test_link(struct mlx4_en_priv *priv)
+{
+	if (mlx4_en_QUERY_PORT(priv->mdev, priv->port))
+		return -ENOMEM;
+
+	return (priv->port_state.link_state == 1) ? 0 : 1;
+}
+
+/*
+ * Speed self-test: refresh the port state and check the link speed.
+ * Returns 0 when the speed is SPEED_10000, the reported speed when it is
+ * anything else, and -ENOMEM if the port query fails.
+ */
+static int mlx4_en_test_speed(struct mlx4_en_priv *priv)
+{
+	int speed;
+
+	if (mlx4_en_QUERY_PORT(priv->mdev, priv->port))
+		return -ENOMEM;
+
+	/* The device currently only supports 10G speed */
+	speed = priv->port_state.link_speed;
+	if (speed == SPEED_10000)
+		return 0;
+	return speed;
+}
+
+
+/*
+ * Run the driver self-tests and store one result per test in buf[]
+ * (MLX4_EN_NUM_SELF_TEST entries, zero meaning "passed"):
+ *
+ *   buf[0] interrupts   buf[1] link   buf[2] speed
+ *   buf[3] registers    buf[4] loopback
+ *
+ * The register and loopback tests run only when offline testing is
+ * requested in *flags and the device reports loopback support.  For the
+ * offline part the carrier is switched off, the transmit rings are
+ * allowed to drain, and the carrier is restored afterwards if it was on.
+ * ETH_TEST_FL_FAILED is set in *flags if any result is non-zero.
+ */
+void mlx4_en_ex_selftest(struct net_device *dev, u32 *flags, u64 *buf)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_tx_ring *tx_ring;
+	int i, carrier_ok;
+	int busy;
+
+	memset(buf, 0, sizeof(u64) * MLX4_EN_NUM_SELF_TEST);
+
+	if (*flags & ETH_TEST_FL_OFFLINE) {
+		/* disable the interface */
+		carrier_ok = netif_carrier_ok(dev);
+
+		netif_carrier_off(dev);
+
+		/* Wait until all tx queues are empty.
+		 * there should not be any additional incoming traffic
+		 * since we turned the carrier off */
+		do {
+			msleep(200);
+			busy = 0;
+			for (i = 0; i < priv->tx_ring_num && carrier_ok; i++) {
+				tx_ring = &priv->tx_ring[i];
+				if (tx_ring->prod !=
+				    (tx_ring->cons + tx_ring->last_nr_txbb)) {
+					busy = 1;
+					break;
+				}
+			}
+		} while (busy);
+
+		if (priv->mdev->dev->caps.loopback_support) {
+			buf[3] = mlx4_en_test_registers(priv);
+			buf[4] = mlx4_en_test_loopback(priv);
+		}
+
+		if (carrier_ok)
+			netif_carrier_on(dev);
+	}
+
+	buf[0] = mlx4_test_interrupts(mdev->dev);
+	buf[1] = mlx4_en_test_link(priv);
+	buf[2] = mlx4_en_test_speed(priv);
+
+	for (i = 0; i < MLX4_EN_NUM_SELF_TEST; i++)
+		if (buf[i])
+			*flags |= ETH_TEST_FL_FAILED;
+}
diff --git a/sys/ofed/drivers/net/mlx4/en_tx.c b/sys/ofed/drivers/net/mlx4/en_tx.c
new file mode 100644
index 0000000..cd45f9d
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/en_tx.c
@@ -0,0 +1,1035 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include "mlx4_en.h"
+
+#include <linux/mlx4/cq.h>
+#include <linux/mlx4/qp.h>
+#include <linux/vmalloc.h>
+
+#include <net/ethernet.h>
+#include <net/if_vlan_var.h>
+#include <sys/mbuf.h>
+
+#include <netinet/in_systm.h>
+#include <netinet/in.h>
+#include <netinet/if_ether.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_lro.h>
+#include <netinet/udp.h>
+
+enum {
+	MAX_INLINE = 104, /* 128 - 16 - 4 - 4 */
+	MAX_BF = 256,	/* xmit takes the bf copy path only when desc_size <= MAX_BF */
+};
+/* Inline threshold in bytes; is_inline() compares m_pkthdr.len against it. */
+static int inline_thold = MAX_INLINE;
+/* Mode 0444 parameter; mlx4_en_create_tx_ring() clamps it to MAX_INLINE. */
+module_param_named(inline_thold, inline_thold, int, 0444);
+MODULE_PARM_DESC(inline_thold, "treshold for using inline data");
+
+/* Allocate one TX ring: locks, buf ring, tx_info, bounce buffer, HW queue
+ * and QP.  Returns 0 or a negative errno; on failure all of it is undone. */
+int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
+			   struct mlx4_en_tx_ring *ring, u32 size,
+			   u16 stride)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int tmp;
+	int err;
+
+	ring->size = size;
+	ring->size_mask = size - 1;
+	ring->stride = stride;
+	inline_thold = min(inline_thold, MAX_INLINE);
+
+	mtx_init(&ring->tx_lock.m, "mlx4 tx", NULL, MTX_DEF);
+	mtx_init(&ring->comp_lock.m, "mlx4 comp", NULL, MTX_DEF);
+
+	/* Allocate the buf ring */
+	ring->br = buf_ring_alloc(MLX4_EN_DEF_TX_QUEUE_SIZE, M_DEVBUF,
+	    M_WAITOK, &ring->tx_lock.m);
+	if (ring->br == NULL) {
+		en_err(priv, "Failed allocating buf ring\n");
+		err = -ENOMEM;
+		goto err_lock;
+	}
+
+	tmp = size * sizeof(struct mlx4_en_tx_info);
+	ring->tx_info = kmalloc(tmp, GFP_KERNEL);
+	if (!ring->tx_info) {
+		en_err(priv, "Failed allocating tx_info ring\n");
+		err = -ENOMEM;
+		goto err_tx;
+	}
+	en_dbg(DRV, priv, "Allocated tx_info ring at addr:%p size:%d\n",
+		 ring->tx_info, tmp);
+
+	ring->bounce_buf = kmalloc(MAX_DESC_SIZE, GFP_KERNEL);
+	if (!ring->bounce_buf) {
+		en_err(priv, "Failed allocating bounce buffer\n");
+		err = -ENOMEM;
+		goto err_tx;
+	}
+	ring->buf_size = ALIGN(size * ring->stride, MLX4_EN_PAGE_SIZE);
+	err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres, ring->buf_size,
+				 2 * PAGE_SIZE);
+	if (err) {
+		en_err(priv, "Failed allocating hwq resources\n");
+		goto err_bounce;
+	}
+	err = mlx4_en_map_buffer(&ring->wqres.buf);
+	if (err) {
+		en_err(priv, "Failed to map TX buffer\n");
+		goto err_hwq_res;
+	}
+	ring->buf = ring->wqres.buf.direct.buf;
+	en_dbg(DRV, priv, "Allocated TX ring (addr:%p) - buf:%p size:%d "
+	       "buf_size:%d dma:%llx\n", ring, ring->buf, ring->size,
+	       ring->buf_size, (unsigned long long) ring->wqres.buf.direct.map);
+
+	err = mlx4_qp_reserve_range(mdev->dev, 1, 256, &ring->qpn);
+	if (err) {
+		en_err(priv, "Failed reserving qp for tx ring.\n");
+		goto err_map;
+	}
+	err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->qp);
+	if (err) {
+		en_err(priv, "Failed allocating qp %d\n", ring->qpn);
+		goto err_reserve;
+	}
+	ring->qp.event = mlx4_en_sqp_event;
+
+	/* If mlx4_bf_alloc() fails, fall back to mdev->priv_uar; not an error. */
+	err = mlx4_bf_alloc(mdev->dev, &ring->bf);
+	if (err) {
+		ring->bf.uar = &mdev->priv_uar;
+		ring->bf.uar->map = mdev->uar_map;
+		ring->bf_enabled = false;
+	} else
+		ring->bf_enabled = true;
+	return 0;
+
+err_reserve:
+	mlx4_qp_release_range(mdev->dev, ring->qpn, 1);
+err_map:
+	mlx4_en_unmap_buffer(&ring->wqres.buf);
+err_hwq_res:
+	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
+err_bounce:
+	kfree(ring->bounce_buf);
+	ring->bounce_buf = NULL;
+err_tx:
+	buf_ring_free(ring->br, M_DEVBUF);
+	kfree(ring->tx_info);
+	ring->tx_info = NULL;
+err_lock:	/* the mutexes were initialised above and must not be leaked */
+	mtx_destroy(&ring->comp_lock.m);
+	mtx_destroy(&ring->tx_lock.m);
+	return err;
+}
+/* Release everything mlx4_en_create_tx_ring() set up for this ring. */
+void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv,
+			     struct mlx4_en_tx_ring *ring)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	en_dbg(DRV, priv, "Destroying tx ring, qpn: %d\n", ring->qpn);
+
+	buf_ring_free(ring->br, M_DEVBUF);
+	if (ring->bf_enabled)	/* set only when mlx4_bf_alloc() succeeded */
+		mlx4_bf_free(mdev->dev, &ring->bf);
+	mlx4_qp_remove(mdev->dev, &ring->qp);
+	mlx4_qp_free(mdev->dev, &ring->qp);
+	mlx4_qp_release_range(mdev->dev, ring->qpn, 1);
+	mlx4_en_unmap_buffer(&ring->wqres.buf);
+	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
+	kfree(ring->bounce_buf);
+	ring->bounce_buf = NULL;
+	kfree(ring->tx_info);
+	ring->tx_info = NULL;
+	mtx_destroy(&ring->tx_lock.m);
+	mtx_destroy(&ring->comp_lock.m);
+}
+/* Reset the ring's state and bring its QP to ready, completing on CQ cq. */
+int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv,
+			     struct mlx4_en_tx_ring *ring,
+			     int cq)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err;
+
+	ring->cqn = cq;
+	ring->prod = 0;
+	ring->cons = 0xffffffff;	/* with last_nr_txbb = 1, cons + last_nr_txbb wraps to prod (0) in 32 bits */
+	ring->last_nr_txbb = 1;
+	ring->poll_cnt = 0;
+	ring->blocked = 0;
+	memset(ring->tx_info, 0, ring->size * sizeof(struct mlx4_en_tx_info));
+	memset(ring->buf, 0, ring->buf_size);
+
+	ring->qp_state = MLX4_QP_STATE_RST;
+	ring->doorbell_qpn = swab32(ring->qp.qpn << 8);
+
+	mlx4_en_fill_qp_context(priv, ring->size, ring->stride, 1, 0, ring->qpn,
+				ring->cqn, &ring->context);
+	if (ring->bf_enabled)	/* point the context at the bf UAR when one was allocated */
+		ring->context.usr_page = cpu_to_be32(ring->bf.uar->index);
+
+	err = mlx4_qp_to_ready(mdev->dev, &ring->wqres.mtt, &ring->context,
+			       &ring->qp, &ring->qp_state);
+
+	return err;
+}
+
+/* Move the ring's QP from its current state back to MLX4_QP_STATE_RST. */
+void mlx4_en_deactivate_tx_ring(struct mlx4_en_priv *priv,
+				struct mlx4_en_tx_ring *ring)
+{
+	/* No MTT and no context are passed for this transition. */
+	mlx4_qp_modify(priv->mdev->dev, NULL, ring->qp_state,
+		       MLX4_QP_STATE_RST, NULL, 0, 0, &ring->qp);
+}
+
+/* Unmap and free the packet at TXBB index and stamp it; returns its nr_txbb. */
+static u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
+				struct mlx4_en_tx_ring *ring,
+				int index, u8 owner)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_tx_info *tx_info = &ring->tx_info[index];
+	struct mlx4_en_tx_desc *tx_desc = ring->buf + index * TXBB_SIZE;
+	struct mlx4_wqe_data_seg *data = (void *) tx_desc + tx_info->data_offset;
+	struct mbuf *mb = tx_info->mb;
+	void *end = ring->buf + ring->buf_size;
+	int frags = tx_info->nr_segs;
+	int i;
+	__be32 *ptr = (__be32 *)tx_desc;
+	__be32 stamp = cpu_to_be32(STAMP_VAL | (!!owner << STAMP_SHIFT));
+
+	/* Optimize the common case when there are no wraparounds */
+	if (likely((void *) tx_desc + tx_info->nr_txbb * TXBB_SIZE <= end)) {
+		if (!tx_info->inl) {	/* inline packets have no DMA mappings */
+			for (i = 0; i < frags; i++) {	/* sizes are stored big-endian */
+				pci_unmap_single(mdev->pdev,
+					(dma_addr_t) be64_to_cpu(data[i].addr),
+					be32_to_cpu(data[i].byte_count),
+					PCI_DMA_TODEVICE);
+			}
+		}
+		/* Stamp the freed descriptor */
+		for (i = 0; i < tx_info->nr_txbb * TXBB_SIZE; i += STAMP_STRIDE) {
+			*ptr = stamp;
+			ptr += STAMP_DWORDS;
+		}
+	} else {
+		if (!tx_info->inl) {
+			for (i = 0; i < frags; i++) {
+				/* Check for wraparound before unmapping */
+				if ((void *) data >= end)
+					data = (struct mlx4_wqe_data_seg *) ring->buf;
+				pci_unmap_single(mdev->pdev,
+					(dma_addr_t) be64_to_cpu(data->addr),
+					be32_to_cpu(data->byte_count),
+					PCI_DMA_TODEVICE);
+				++data;
+			}
+		}
+		/* Stamp the freed descriptor */
+		for (i = 0; i < tx_info->nr_txbb * TXBB_SIZE; i += STAMP_STRIDE) {
+			*ptr = stamp;
+			ptr += STAMP_DWORDS;
+			if ((void *) ptr >= end) {	/* wrapped: continue at ring start */
+				ptr = ring->buf;
+				stamp ^= cpu_to_be32(0x80000000);
+			}
+		}
+	}
+	m_freem(mb);
+	return tx_info->nr_txbb;
+}
+
+/* Free every descriptor still between cons and prod; returns how many. */
+int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int cnt = 0;
+
+	/* Skip last polled descriptor */
+	ring->cons += ring->last_nr_txbb;
+	en_dbg(DRV, priv, "Freeing Tx buf - cons:0x%x prod:0x%x\n",
+		 ring->cons, ring->prod);
+
+	if ((u32) (ring->prod - ring->cons) > ring->size) {
+		en_warn(priv, "Tx consumer passed producer!\n");
+		return 0;
+	}
+	/* Each freed descriptor reports how many TXBBs to advance cons by */
+	while (ring->cons != ring->prod) {
+		ring->last_nr_txbb = mlx4_en_free_tx_desc(priv, ring,
+						ring->cons & ring->size_mask,
+						!!(ring->cons & ring->size));
+		ring->cons += ring->last_nr_txbb;
+		cnt++;
+	}
+
+	if (cnt)
+		en_dbg(DRV, priv, "Freed %d uncompleted tx descriptors\n", cnt);
+
+	return cnt;
+}
+
+/* Map the 8 priorities to rings (numbered from 1) in consecutive groups,
+ * the first 8 % ring_num groups one larger; one ring maps all to 0.
+ * ring_num must be non-zero (it is used as a divisor). */
+void mlx4_en_set_prio_map(struct mlx4_en_priv *priv, u16 *prio_map, u32 ring_num)
+{
+	int per_ring = 8 / ring_num;
+	int leftover = 8 - (per_ring * ring_num);
+	int used = 0;
+	u16 cur = 1;
+	int prio;
+
+	if (ring_num == 1) {
+		for (prio = 0; prio < 8; prio++)
+			prio_map[prio] = 0;
+		return;
+	}
+	for (prio = 0; prio < 8; prio++, used++) {
+		/* Group is full: its quota is per_ring (+1 while leftover). */
+		if (used == (leftover ? per_ring + 1 : per_ring)) {
+			cur++;
+			used = 0;
+			if (leftover)
+				leftover--;
+		}
+		prio_map[prio] = cur;
+		en_dbg(DRV, priv, " prio:%d --> ring:%d\n", prio, cur);
+	}
+}
+/* Free completed TX descriptors up to the CQE's wqe_index, then unblock. */
+static void mlx4_en_process_tx_cq(struct net_device *dev, struct mlx4_en_cq *cq)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_cq *mcq = &cq->mcq;
+	struct mlx4_en_tx_ring *ring = &priv->tx_ring[cq->ring];
+	struct mlx4_cqe *cqe = cq->buf;
+	u16 index;
+	u16 new_index;
+	u32 txbbs_skipped = 0;
+	u32 cq_last_sav;
+
+	/* index always points to the first TXBB of the last polled descriptor */
+	index = ring->cons & ring->size_mask;
+	new_index = be16_to_cpu(cqe->wqe_index) & ring->size_mask;
+	if (index == new_index)
+		return;
+
+	if (!priv->port_up)
+		return;
+
+	/*
+	 * We use a two-stage loop:
+	 * - the first samples the HW-updated CQE
+	 * - the second frees TXBBs until the last sample
+	 * This lets us amortize CQE cache misses, while still polling the CQ
+	 * until is quiescent.
+	 */
+	cq_last_sav = mcq->cons_index;
+	do {
+		do {
+			/* Skip over last polled CQE */
+			index = (index + ring->last_nr_txbb) & ring->size_mask;
+			txbbs_skipped += ring->last_nr_txbb;
+
+			/* Poll next CQE */
+			ring->last_nr_txbb = mlx4_en_free_tx_desc(
+						priv, ring, index,
+						!!((ring->cons + txbbs_skipped) &
+						   ring->size));
+			++mcq->cons_index;
+
+		} while (index != new_index);
+		/* Re-sample the CQE; loop again if it moved meanwhile */
+		new_index = be16_to_cpu(cqe->wqe_index) & ring->size_mask;
+	} while (index != new_index);
+	AVG_PERF_COUNTER(priv->pstats.tx_coal_avg,
+			 (u32) (mcq->cons_index - cq_last_sav));
+
+	/*
+	 * To prevent CQ overflow we first update CQ consumer and only then
+	 * the ring consumer.
+	 */
+	mlx4_cq_set_ci(mcq);
+	wmb();
+	ring->cons += txbbs_skipped;
+
+	/* Wakeup Tx queue if this ring stopped it */
+	if (unlikely(ring->blocked)) {
+		if ((u32) (ring->prod - ring->cons) <=
+		     ring->size - HEADROOM - MAX_DESC_TXBBS) {
+			ring->blocked = 0;
+			if (atomic_fetchadd_int(&priv->blocked, -1) == 1)	/* last blocked ring */
+				atomic_clear_int(&dev->if_drv_flags,
+				    IFF_DRV_OACTIVE);
+			priv->port_stats.wake_queue++;
+		}
+	}
+}
+
+/* Handle a TX CQ event: process the CQ unless comp_lock is already held. */
+void mlx4_en_tx_irq(struct mlx4_cq *mcq)
+{
+	struct mlx4_en_cq *cq = container_of(mcq, struct mlx4_en_cq, mcq);
+	struct mlx4_en_priv *priv = netdev_priv(cq->dev);
+	struct mlx4_en_tx_ring *ring = &priv->tx_ring[cq->ring];
+	if (spin_trylock(&ring->comp_lock)) {
+		mlx4_en_process_tx_cq(cq->dev, cq);
+		mod_timer(&cq->timer, jiffies + 1);
+		spin_unlock(&ring->comp_lock);
+	}
+}
+
+/* Poll the TX CQ passed in data (presumably the cq->timer callback - confirm). */
+void mlx4_en_poll_tx_cq(unsigned long data)
+{
+	struct mlx4_en_cq *cq = (struct mlx4_en_cq *) data;
+	struct mlx4_en_priv *priv = netdev_priv(cq->dev);
+	struct mlx4_en_tx_ring *ring = &priv->tx_ring[cq->ring];
+	u32 inflight;
+
+	INC_PERF_COUNTER(priv->pstats.tx_poll);
+	/* comp_lock is busy: try again after another timeout */
+	if (!spin_trylock(&ring->comp_lock)) {
+		mod_timer(&cq->timer, jiffies + MLX4_EN_TX_POLL_TIMEOUT);
+		return;
+	}
+	mlx4_en_process_tx_cq(cq->dev, cq);
+	inflight = (u32) (ring->prod - ring->cons - ring->last_nr_txbb);
+
+	/* If there are still packets in flight and the timer has not already
+	 * been scheduled by the Tx routine then schedule it here to guarantee
+	 * completion processing of these packets */
+	if (inflight && priv->port_up)
+		mod_timer(&cq->timer, jiffies + MLX4_EN_TX_POLL_TIMEOUT);
+
+	spin_unlock(&ring->comp_lock);
+}
+/* Copy a descriptor built in bounce_buf into the ring across its wrap point. */
+static struct mlx4_en_tx_desc *mlx4_en_bounce_to_desc(struct mlx4_en_priv *priv,
+						      struct mlx4_en_tx_ring *ring,
+						      u32 index,
+						      unsigned int desc_size)
+{
+	u32 copy = (ring->size - index) * TXBB_SIZE;	/* bytes before the ring end */
+	int i;
+	/* Part past the ring end goes to the ring start, last dword first */
+	for (i = desc_size - copy - 4; i >= 0; i -= 4) {
+		if ((i & (TXBB_SIZE - 1)) == 0)
+			wmb();
+
+		*((u32 *) (ring->buf + i)) =
+			*((u32 *) (ring->bounce_buf + copy + i));
+	}
+	/* Part before the ring end; stops at i == 4, so dword 0 is not copied */
+	for (i = copy - 4; i >= 4 ; i -= 4) {
+		if ((i & (TXBB_SIZE - 1)) == 0)
+			wmb();
+
+		*((u32 *) (ring->buf + index * TXBB_SIZE + i)) =
+			*((u32 *) (ring->bounce_buf + i));
+	}
+
+	/* Return real descriptor location */
+	return ring->buf + index * TXBB_SIZE;
+}
+/* Arm the CQ poll timer if idle and process the TX CQ on every Nth call. */
+static inline void mlx4_en_xmit_poll(struct mlx4_en_priv *priv, int tx_ind)
+{
+	struct mlx4_en_cq *cq = &priv->tx_cq[tx_ind];
+	struct mlx4_en_tx_ring *ring = &priv->tx_ring[tx_ind];
+
+	/* If we don't have a pending timer, set one up to catch our recent
+	   post in case the interface becomes idle */
+	if (!timer_pending(&cq->timer))
+		mod_timer(&cq->timer, jiffies + MLX4_EN_TX_POLL_TIMEOUT);
+
+	/* Poll the CQ every MLX4_EN_TX_POLL_MODER calls (mask test presumes a power of two) */
+	if ((++ring->poll_cnt & (MLX4_EN_TX_POLL_MODER - 1)) == 0)
+		if (spin_trylock(&ring->comp_lock)) {
+			mlx4_en_process_tx_cq(priv->dev, cq);
+			spin_unlock(&ring->comp_lock);
+		}
+}
+
+/* 1 when mb fits within inline_thold and is not a TSO packet, else 0. */
+static int is_inline(struct mbuf *mb)
+{
+	if (inline_thold == 0)		/* a zero threshold disables inlining */
+		return 0;
+	if (mb->m_pkthdr.csum_flags & CSUM_TSO)
+		return 0;
+	return (mb->m_pkthdr.len <= inline_thold) ? 1 : 0;
+}
+
+/* Descriptor bytes to inline mb: ctrl + payload + one inline header, or
+ * two once that exceeds MLX4_INLINE_ALIGN; rounded up to 16. */
+static int inline_size(struct mbuf *mb)
+{
+	int hdrs = 1;
+	int len = mb->m_pkthdr.len;
+
+	if (len + CTRL_SIZE + sizeof(struct mlx4_wqe_inline_seg)
+	    > MLX4_INLINE_ALIGN)
+		hdrs = 2;
+	return ALIGN(len + CTRL_SIZE +
+		     hdrs * sizeof(struct mlx4_wqe_inline_seg), 16);
+}
+
+/* Length of the Ethernet + IPv4 + TCP headers when all of them fit in the
+ * first mbuf; 0 if the protocol is not TCP or m_len is too short.
+ * What follows the Ethernet header is parsed as IPv4 without a type check. */
+static int get_head_size(struct mbuf *mb)
+{
+	char *base = mtod(mb, char *);
+	struct tcphdr *tcp;
+	struct ip *iph;
+	int hlen = ETHER_HDR_LEN;
+
+	if (mb->m_len < hlen + sizeof(struct ip))
+		return (0);
+	iph = (struct ip *)(base + hlen);
+	if (iph->ip_p != IPPROTO_TCP)
+		return (0);
+	hlen += iph->ip_hl << 2;
+	if (mb->m_len < hlen + sizeof(struct tcphdr))
+		return (0);
+	tcp = (struct tcphdr *)(base + hlen);
+	hlen += tcp->th_off << 2;
+
+	/* The full TCP header (th_off words) must be in the first mbuf too. */
+	return (mb->m_len < hlen) ? 0 : hlen;
+}
+
+/* Descriptor size in bytes for mb; also stores the data segment count in
+ * *segsp and the LSO header length (0 when LSO is not used) in *lso_header_size. */
+static int get_real_size(struct mbuf *mb, struct net_device *dev, int *segsp,
+    int *lso_header_size)
+{
+	struct mbuf *cur;
+	int segs = 0;
+	int hdr = 0;
+
+	/* Only non-empty mbufs need a data segment. */
+	for (cur = mb; cur != NULL; cur = cur->m_next)
+		segs += (cur->m_len != 0);
+	if (mb->m_pkthdr.csum_flags & CSUM_TSO)
+		hdr = get_head_size(mb);
+	*lso_header_size = hdr;
+	if (hdr != 0) {
+		/* A first mbuf holding only headers carries no payload. */
+		if (mb->m_len == hdr)
+			segs--;
+		*segsp = segs;
+		return CTRL_SIZE + segs * DS_SIZE + ALIGN(hdr + 4, DS_SIZE);
+	}
+	*segsp = segs;
+	if (is_inline(mb))
+		return inline_size(mb);
+	return (CTRL_SIZE + segs * DS_SIZE);
+}
+
+/* Copy len bytes of the chain starting at offset *offp of mb into data.
+ * Returns the mbuf where copying stopped and its offset in *offp. */
+static struct mbuf *mb_copy(struct mbuf *mb, int *offp, char *data, int len)
+{
+	int chunk, pos = *offp;
+
+	for (; len != 0; len -= chunk) {
+		chunk = min(mb->m_len - pos, len);
+		if (chunk != 0)
+			memcpy(data, mb->m_data + pos, chunk);
+		data += chunk;
+		pos += chunk;
+		/* This mbuf is used up; continue at the start of the next. */
+		if (pos == mb->m_len) {
+			mb = mb->m_next;
+			pos = 0;
+		}
+	}
+	*offp = pos;
+	return (mb);
+}
+/* Copy mb's data into the descriptor as one or two inline segments. */
+static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc, struct mbuf *mb,
+			     int real_size, u16 *vlan_tag, int tx_ind)
+{
+	struct mlx4_wqe_inline_seg *inl = &tx_desc->inl;
+	int spc = MLX4_INLINE_ALIGN - CTRL_SIZE - sizeof *inl;	/* room in the first segment */
+	int len;
+	int off;
+
+	off = 0;
+	len = mb->m_pkthdr.len;
+	if (len <= spc) {
+		inl->byte_count = cpu_to_be32(1 << 31 | len);
+		mb_copy(mb, &off, (void *)(inl + 1), len);
+	} else {
+		inl->byte_count = cpu_to_be32(1 << 31 | spc);
+		mb = mb_copy(mb, &off, (void *)(inl + 1), spc);
+		inl = (void *) (inl + 1) + spc;	/* second header follows the first segment's data */
+		mb_copy(mb, &off, (void *)(inl + 1), len - spc);
+		wmb();	/* second segment's data is written before its byte_count */
+		inl->byte_count = cpu_to_be32(1 << 31 | (len - spc));
+	}
+	tx_desc->ctrl.vlan_tag = cpu_to_be16(*vlan_tag);
+	tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN * !!(*vlan_tag);
+	tx_desc->ctrl.fence_size = (real_size / 16) & 0x3f;
+}
+/* Pick a TX ring for mb: VLAN priority map first, else an IPv4 flow hash. */
+u16 mlx4_en_select_queue(struct net_device *dev, struct mbuf *mb)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_tx_hash_entry *entry;
+	struct ether_header *eth;
+	struct tcphdr *th;
+	struct ip *iph;
+	u32 hash_index;
+	int tx_ind = 0;
+	u16 vlan_tag = 0;
+	int len;
+
+	/* Obtain VLAN information if present */
+	if (mb->m_flags & M_VLANTAG) {
+		vlan_tag = mb->m_pkthdr.ether_vtag;
+		/* Set the Tx ring to use according to vlan priority */
+		tx_ind = priv->tx_prio_map[vlan_tag >> 13];
+		if (tx_ind)
+			return tx_ind;
+	}
+	if (mb->m_len <
+	    ETHER_HDR_LEN + sizeof(struct ip) + sizeof(struct tcphdr))
+		return MLX4_EN_NUM_HASH_RINGS;	/* too short to hash: ring index MLX4_EN_NUM_HASH_RINGS */
+	eth = mtod(mb, struct ether_header *);
+	/* Hashing is only done for TCP/IP or UDP/IP packets */
+	if (be16_to_cpu(eth->ether_type) != ETHERTYPE_IP)
+		return MLX4_EN_NUM_HASH_RINGS;
+	len = ETHER_HDR_LEN;
+	iph = (struct ip *)(mtod(mb, char *) + len);
+	len += iph->ip_hl << 2;
+	th = (struct tcphdr *)(mtod(mb, char *) + len);
+	hash_index = be32_to_cpu(iph->ip_dst.s_addr) & MLX4_EN_TX_HASH_MASK;
+	switch(iph->ip_p) {
+	case IPPROTO_UDP:	/* UDP hashes on the destination address only */
+		break;
+	case IPPROTO_TCP:
+		if (mb->m_len < len + sizeof(struct tcphdr))
+			return MLX4_EN_NUM_HASH_RINGS;
+		hash_index =
+		    (hash_index ^ be16_to_cpu(th->th_dport ^ th->th_sport)) &
+		    MLX4_EN_TX_HASH_MASK;
+		break;
+	default:
+		return MLX4_EN_NUM_HASH_RINGS;
+	}
+	/* Re-pick the entry's ring whenever its cnt is 0 (cnt++ below lets it wrap) */
+	entry = &priv->tx_hash[hash_index];
+	if(unlikely(!entry->cnt)) {
+		tx_ind = hash_index & (MLX4_EN_NUM_HASH_RINGS / 2 - 1);
+		if (2 * entry->small_pkts > entry->big_pkts)	/* mostly small packets: use the upper half of the rings */
+			tx_ind += MLX4_EN_NUM_HASH_RINGS / 2;
+		entry->small_pkts = entry->big_pkts = 0;
+		entry->ring = tx_ind;
+	}
+
+	entry->cnt++;
+	if (mb->m_pkthdr.len > MLX4_EN_SMALL_PKT_SIZE)
+		entry->big_pkts++;
+	else
+		entry->small_pkts++;
+	return entry->ring;
+}
+/* Copy bytecnt bytes (bytecnt / 8 64-bit words) via __iowrite64_copy(). */
+static void mlx4_bf_copy(unsigned long *dst, unsigned long *src, unsigned bytecnt)
+{
+	__iowrite64_copy(dst, src, bytecnt / 8);
+}
+/* Post *mbp on ring tx_ind: 0 = posted, EBUSY = ring full, EINVAL = dropped. */
+static int mlx4_en_xmit(struct net_device *dev, int tx_ind, struct mbuf **mbp)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_tx_ring *ring;
+	struct mlx4_en_cq *cq;
+	struct mlx4_en_tx_desc *tx_desc;
+	struct mlx4_wqe_data_seg *data;
+	struct mlx4_en_tx_info *tx_info;
+	struct mbuf *m;
+	int nr_txbb;
+	int nr_segs;
+	int desc_size;
+	int real_size;
+	dma_addr_t dma;
+	u32 index, bf_index;
+	__be32 op_own;
+	u16 vlan_tag = 0;
+	int i;
+	int lso_header_size;
+	bool bounce = false;
+	struct mbuf *mb;
+	int defrag = 1;	/* allow a single m_defrag() attempt */
+
+	ring = &priv->tx_ring[tx_ind];
+	mb = *mbp;
+	if (!priv->port_up)
+		goto tx_drop;
+
+retry:
+	real_size = get_real_size(mb, dev, &nr_segs, &lso_header_size);
+	if (unlikely(!real_size))
+		goto tx_drop;
+
+	/* Align descriptor to TXBB size */
+	desc_size = ALIGN(real_size, TXBB_SIZE);
+	nr_txbb = desc_size / TXBB_SIZE;
+	if (unlikely(nr_txbb > MAX_DESC_TXBBS)) {
+		if (defrag) {
+			mb = m_defrag(*mbp, M_DONTWAIT);
+			if (mb == NULL) {
+				mb = *mbp;
+				goto tx_drop;
+			}
+			*mbp = mb;
+			defrag = 0;
+			goto retry;
+		}
+		goto tx_drop;
+	}
+
+	/* Check available TXBBs And 2K spare for prefetch */
+	if (unlikely(((int)(ring->prod - ring->cons)) >
+		     ring->size - HEADROOM - MAX_DESC_TXBBS)) {
+		/* every full Tx ring stops queue */
+		if (ring->blocked == 0)
+			atomic_add_int(&priv->blocked, 1);
+		atomic_set_int(&dev->if_drv_flags, IFF_DRV_OACTIVE);
+		ring->blocked = 1;
+		priv->port_stats.queue_stopped++;
+
+		/* Use interrupts to find out when queue opened */
+		cq = &priv->tx_cq[tx_ind];
+		mlx4_en_arm_cq(priv, cq);
+		return EBUSY;	/* the mbuf is not freed; *mbp still points to it */
+	}
+
+	/* Track current inflight packets for performance analysis */
+	AVG_PERF_COUNTER(priv->pstats.inflight_avg,
+			 (u32) (ring->prod - ring->cons - 1));
+
+	/* Packet is good - grab an index and transmit it */
+	index = ring->prod & ring->size_mask;
+	bf_index = ring->prod;
+
+	/* See if we have enough space for whole descriptor TXBB for setting
+	 * SW ownership on next descriptor; if not, use a bounce buffer. */
+	if (likely(index + nr_txbb <= ring->size))
+		tx_desc = ring->buf + index * TXBB_SIZE;
+	else {
+		tx_desc = (struct mlx4_en_tx_desc *) ring->bounce_buf;
+		bounce = true;
+	}
+
+	/* Prepare ctrl segment apart from opcode+ownership, which depends on
+	 * whether LSO is used */
+	if (mb->m_flags & M_VLANTAG)
+		vlan_tag = mb->m_pkthdr.ether_vtag;
+	tx_desc->ctrl.vlan_tag = cpu_to_be16(vlan_tag);
+	tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN * !!vlan_tag;
+	tx_desc->ctrl.fence_size = (real_size / 16) & 0x3f;
+	tx_desc->ctrl.srcrb_flags = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE |
+						MLX4_WQE_CTRL_SOLICITED);
+	if (mb->m_pkthdr.csum_flags & (CSUM_IP|CSUM_TCP|CSUM_UDP)) {
+		tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM |
+							 MLX4_WQE_CTRL_TCP_UDP_CSUM);
+		priv->port_stats.tx_chksum_offload++;
+	}
+
+	if (unlikely(priv->validate_loopback)) {
+		/* Copy dst mac address to wqe */
+		struct ether_header *ethh;
+		u64 mac;
+		u32 mac_l, mac_h;
+
+		ethh = mtod(mb, struct ether_header *);
+		mac = mlx4_en_mac_to_u64(ethh->ether_dhost);
+		if (mac) {
+			mac_h = (u32) ((mac & 0xffff00000000ULL) >> 16);
+			mac_l = (u32) (mac & 0xffffffff);
+			tx_desc->ctrl.srcrb_flags |= cpu_to_be32(mac_h);
+			tx_desc->ctrl.imm = cpu_to_be32(mac_l);
+		}
+	}
+
+	/* Handle LSO (TSO) packets */
+	if (lso_header_size) {
+		int segsz;
+
+		/* Mark opcode as LSO */
+		op_own = cpu_to_be32(MLX4_OPCODE_LSO | (1 << 6)) |
+			((ring->prod & ring->size) ?
+				cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);
+
+		/* Fill in the LSO prefix */
+		tx_desc->lso.mss_hdr_size = cpu_to_be32(
+			mb->m_pkthdr.tso_segsz << 16 | lso_header_size);
+
+		/* Copy headers;
+		 * note that we already verified that it is linear */
+		memcpy(tx_desc->lso.header, mb->m_data, lso_header_size);
+		data = ((void *) &tx_desc->lso +
+			ALIGN(lso_header_size + 4, DS_SIZE));
+
+		priv->port_stats.tso_packets++;
+		segsz = mb->m_pkthdr.tso_segsz;
+		i = ((mb->m_pkthdr.len - lso_header_size) / segsz) +
+			!!((mb->m_pkthdr.len - lso_header_size) % segsz);
+		ring->bytes += mb->m_pkthdr.len + (i - 1) * lso_header_size;
+		ring->packets += i;
+		mb->m_data += lso_header_size;	/* hide the copied headers from the mapping loop; undone below */
+		mb->m_len -= lso_header_size;
+	} else {
+		/* Normal (Non LSO) packet */
+		op_own = cpu_to_be32(MLX4_OPCODE_SEND) |
+			((ring->prod & ring->size) ?
+			 cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);
+		data = &tx_desc->data;
+		ring->bytes += max(mb->m_pkthdr.len,
+		    (unsigned int)ETHER_MIN_LEN - ETHER_CRC_LEN);
+		ring->packets++;
+
+	}
+	AVG_PERF_COUNTER(priv->pstats.tx_pktsz_avg, mb->m_pkthdr.len);
+
+	/* Save mb in tx_info ring */
+	tx_info = &ring->tx_info[index];
+	tx_info->mb = mb;
+	tx_info->nr_txbb = nr_txbb;
+	tx_info->nr_segs = nr_segs;
+	/* valid only for non inline segments */
+	tx_info->data_offset = (void *) data - (void *) tx_desc;
+
+	if (!is_inline(mb)) {
+		for (i = 0, m = mb; i < nr_segs; i++, m = m->m_next) {
+			if (m->m_len == 0) {	/* empty mbufs get no data segment */
+				i--;
+				continue;
+			}
+			dma = pci_map_single(mdev->dev->pdev, m->m_data,
+					     m->m_len, PCI_DMA_TODEVICE);
+			data->addr = cpu_to_be64(dma);
+			data->lkey = cpu_to_be32(mdev->mr.key);
+			wmb();
+			data->byte_count = cpu_to_be32(m->m_len);
+			data++;
+		}
+		if (lso_header_size) {
+			mb->m_data -= lso_header_size;
+			mb->m_len += lso_header_size;
+		}
+		tx_info->inl = 0;
+	} else {
+		build_inline_wqe(tx_desc, mb, real_size, &vlan_tag, tx_ind);
+		tx_info->inl = 1;
+	}
+
+	ring->prod += nr_txbb;
+
+	/* If we used a bounce buffer then copy descriptor back into place */
+	if (bounce)
+		tx_desc = mlx4_en_bounce_to_desc(priv, ring, index, desc_size);
+
+	if (ring->bf_enabled && desc_size <= MAX_BF && !bounce && !vlan_tag) {
+		*(u32 *) (&tx_desc->ctrl.vlan_tag) |= ring->doorbell_qpn;
+		op_own |= htonl((bf_index & 0xffff) << 8);
+		/* Ensure new descriptor hits memory
+		* before setting ownership of this descriptor to HW */
+		wmb();
+		tx_desc->ctrl.owner_opcode = op_own;
+
+		wmb();
+
+		mlx4_bf_copy(ring->bf.reg + ring->bf.offset, (unsigned long *) &tx_desc->ctrl,
+		     desc_size);
+
+		wmb();
+
+		ring->bf.offset ^= ring->bf.buf_size;
+	} else {
+		/* Ensure new descriptor hits memory
+		* before setting ownership of this descriptor to HW */
+		wmb();
+		tx_desc->ctrl.owner_opcode = op_own;
+		wmb();
+		writel(ring->doorbell_qpn, ring->bf.uar->map + MLX4_SEND_DOORBELL);
+	}
+
+	return 0;
+
+tx_drop:
+	*mbp = NULL;	/* tells the caller the mbuf was freed here */
+	m_freem(mb);
+	ring->errors++;
+	return EINVAL;
+}
+
+/* Send m and/or the ring's queued mbufs; callers in this file hold tx_lock. */
+static int
+mlx4_en_transmit_locked(struct ifnet *dev, int tx_ind, struct mbuf *m)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_tx_ring *ring;
+	struct mbuf *next;
+	int enqueued, err = 0;
+
+	ring = &priv->tx_ring[tx_ind];
+	if ((dev->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
+	    IFF_DRV_RUNNING || priv->port_up == 0) {
+		if (m != NULL)	/* cannot send now: just queue the mbuf */
+			err = drbr_enqueue(dev, ring->br, m);
+		return (err);  
+	}
+
+	enqueued = 0;
+	if (m == NULL) {
+		next = drbr_dequeue(dev, ring->br);
+	} else if (drbr_needs_enqueue(dev, ring->br)) {
+		if ((err = drbr_enqueue(dev, ring->br, m)) != 0)
+			return (err);
+		next = drbr_dequeue(dev, ring->br);
+	} else
+		next = m;
+
+	/* Process the queue */
+	while (next != NULL) {
+		if ((err = mlx4_en_xmit(dev, tx_ind, &next)) != 0) {
+			if (next != NULL)	/* not freed by xmit (ring full): put it back */
+				err = drbr_enqueue(dev, ring->br, next);
+			break;
+		}
+		enqueued++;
+		drbr_stats_update(dev, next->m_pkthdr.len, next->m_flags);
+		/* Send a copy of the frame to the BPF listener */
+		ETHER_BPF_MTAP(dev, next);
+		if ((dev->if_drv_flags & IFF_DRV_RUNNING) == 0)
+			break;
+		next = drbr_dequeue(dev, ring->br);
+	}
+
+	if (enqueued > 0)
+		ring->watchdog_time = ticks;
+
+	return (err);
+}
+
+/* Drain the buf ring of the TX ring that the CQ passed as context serves. */
+void
+mlx4_en_tx_que(void *context, int pending)
+{
+	struct mlx4_en_cq *cq = context;
+	struct net_device *dev = cq->dev;
+	struct mlx4_en_priv *priv = dev->if_softc;
+	int tx_ind = cq->ring;
+	struct mlx4_en_tx_ring *ring = &priv->tx_ring[tx_ind];
+
+	/* Nothing to do unless the interface is running. */
+	if ((dev->if_drv_flags & IFF_DRV_RUNNING) == 0)
+		return;
+
+	mlx4_en_xmit_poll(priv, tx_ind);
+
+	/* The buf ring is drained with the ring's tx_lock held. */
+	spin_lock(&ring->tx_lock);
+	if (!drbr_empty(dev, ring->br))
+		mlx4_en_transmit_locked(dev, tx_ind, NULL);
+	spin_unlock(&ring->tx_lock);
+}
+
+/* Pick a TX ring for m, then transmit it or queue it on the ring's buf ring. */
+int
+mlx4_en_transmit(struct ifnet *dev, struct mbuf *m)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_tx_ring *ring;
+	int ring_idx, err;
+
+	/* Which queue to use */
+	if ((m->m_flags & (M_FLOWID | M_VLANTAG)) != M_FLOWID)
+		ring_idx = mlx4_en_select_queue(dev, m);
+	else
+		ring_idx = m->m_pkthdr.flowid % (MLX4_EN_NUM_HASH_RINGS - 1);
+	ring = &priv->tx_ring[ring_idx];
+
+	if (!spin_trylock(&ring->tx_lock)) {
+		/* Lock is busy: queue the mbuf and enqueue the CQ's task. */
+		struct mlx4_en_cq *cq = &priv->tx_cq[ring_idx];
+
+		err = drbr_enqueue(dev, ring->br, m);
+		taskqueue_enqueue(cq->tq, &cq->cq_task);
+		return (err);
+	}
+	err = mlx4_en_transmit_locked(dev, ring_idx, m);
+	spin_unlock(&ring->tx_lock);
+	/* Poll CQ here */
+	mlx4_en_xmit_poll(priv, ring_idx);
+	return (err);
+}
+
+/* Free every mbuf queued on each TX buf ring, then call if_qflush(). */
+void
+mlx4_en_qflush(struct ifnet *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mbuf *queued;
+	int idx;
+
+	for (idx = 0; idx < priv->tx_ring_num; idx++) {
+		struct mlx4_en_tx_ring *ring = &priv->tx_ring[idx];
+
+		spin_lock(&ring->tx_lock);
+		while ((queued = buf_ring_dequeue_sc(ring->br)) != NULL)
+			m_freem(queued);
+		spin_unlock(&ring->tx_lock);
+	}
+	if_qflush(dev);
+}
diff --git a/sys/ofed/drivers/net/mlx4/eq.c b/sys/ofed/drivers/net/mlx4/eq.c
new file mode 100644
index 0000000..42885c7
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/eq.c
@@ -0,0 +1,727 @@
+/*
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *	- Redistributions of source code must retain the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer.
+ *
+ *	- Redistributions in binary form must reproduce the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer in the documentation and/or other materials
+ *	  provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4.h"
+#include "fw.h"
+
+enum {
+	MLX4_NUM_ASYNC_EQE	= 0x100,	/* entries requested for the async EQ, before spares */
+	MLX4_NUM_SPARE_EQE	= 0x80,	/* extra entries added to every EQ; also the consumer-index update interval in mlx4_eq_int() */
+	MLX4_EQ_ENTRY_SIZE	= 0x20	/* bytes per EQ entry, used by get_eqe() and the page-count math */
+};
+
+/*
+ * EQ context as handed to the firmware: mlx4_create_eq() fills one of
+ * these in the command mailbox and passes it to the SW2HW_EQ command.
+ *
+ * NOTE(review): this comment used to say the struct "must be packed
+ * because start is 64 bits but only aligned to 32 bits".  There is no
+ * 'start' member and no packed attribute here; assuming the usual
+ * 1/2/4-byte u8/u16/u32 sizes, every member already sits on its natural
+ * alignment, so the old remark looks stale -- confirm before relying on
+ * the layout.
+ */
+struct mlx4_eq_context {
+	__be32			flags;
+	u16			reserved1[3];
+	__be16			page_offset;
+	u8			log_eq_size;
+	u8			reserved2[4];
+	u8			eq_period;
+	u8			reserved3;
+	u8			eq_max_count;
+	u8			reserved4[3];
+	u8			intr;
+	u8			log_page_size;
+	u8			reserved5[2];
+	u8			mtt_base_addr_h;
+	__be32			mtt_base_addr_l;
+	u32			reserved6[2];
+	__be32			consumer_index;
+	__be32			producer_index;
+	u32			reserved7[4];
+};
+
+/*
+ * Values for the 32-bit flags word of struct mlx4_eq_context
+ * (mlx4_create_eq() ORs MLX4_EQ_STATUS_OK with MLX4_EQ_STATE_ARMED).
+ *
+ * The constants are unsigned on purpose: "10 << 28" does not fit in a
+ * signed int, and left-shifting a signed value past INT_MAX is undefined
+ * behaviour in C.
+ */
+#define MLX4_EQ_STATUS_OK	   ( 0u << 28)
+#define MLX4_EQ_STATUS_WRITE_FAIL  (10u << 28)
+#define MLX4_EQ_OWNER_SW	   ( 0u << 24)
+#define MLX4_EQ_OWNER_HW	   ( 1u << 24)
+#define MLX4_EQ_FLAG_EC		   ( 1u << 18)
+#define MLX4_EQ_FLAG_OI		   ( 1u << 17)
+#define MLX4_EQ_STATE_ARMED	   ( 9u <<  8)
+#define MLX4_EQ_STATE_FIRED	   (10u <<  8)
+#define MLX4_EQ_STATE_ALWAYS_ARMED (11u <<  8)
+
+/*
+ * Event types routed to the asynchronous EQ: this mask is what
+ * mlx4_MAP_EQ() is called with when the async EQ is mapped or unmapped.
+ * One bit per MLX4_EVENT_TYPE_* value.
+ */
+#define MLX4_ASYNC_EVENT_MASK						    \
+	(/* QP / work-queue events */					    \
+	 (1ull << MLX4_EVENT_TYPE_PATH_MIG)				  | \
+	 (1ull << MLX4_EVENT_TYPE_PATH_MIG_FAILED)			  | \
+	 (1ull << MLX4_EVENT_TYPE_COMM_EST)				  | \
+	 (1ull << MLX4_EVENT_TYPE_SQ_DRAINED)				  | \
+	 (1ull << MLX4_EVENT_TYPE_WQ_CATAS_ERROR)			  | \
+	 (1ull << MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR)			  | \
+	 (1ull << MLX4_EVENT_TYPE_WQ_ACCESS_ERROR)			  | \
+	 (1ull << MLX4_EVENT_TYPE_EEC_CATAS_ERROR)			  | \
+	 /* CQ events */						    \
+	 (1ull << MLX4_EVENT_TYPE_CQ_ERROR)				  | \
+	 /* SRQ events */						    \
+	 (1ull << MLX4_EVENT_TYPE_SRQ_CATAS_ERROR)			  | \
+	 (1ull << MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE)			  | \
+	 (1ull << MLX4_EVENT_TYPE_SRQ_LIMIT)				  | \
+	 /* port, ECC and command-completion events */			    \
+	 (1ull << MLX4_EVENT_TYPE_PORT_CHANGE)				  | \
+	 (1ull << MLX4_EVENT_TYPE_ECC_DETECT)				  | \
+	 (1ull << MLX4_EVENT_TYPE_CMD))
+
+struct mlx4_eqe {
+	u8			reserved1;
+	u8			type;		/* MLX4_EVENT_TYPE_*; mlx4_eq_int() switches on this */
+	u8			reserved2;
+	u8			subtype;	/* examined for port-change events */
+	union {
+		u32		raw[6];		/* sizes the payload: 6 * 4 bytes */
+		struct {
+			__be32	cqn;
+		} __attribute__((packed)) comp;
+		struct {
+			u16	reserved1;
+			__be16	token;
+			u32	reserved2;
+			u8	reserved3[3];
+			u8	status;
+			__be64	out_param;
+		} __attribute__((packed)) cmd;
+		struct {
+			__be32	qpn;
+		} __attribute__((packed)) qp;
+		struct {
+			__be32	srqn;
+		} __attribute__((packed)) srq;
+		struct {
+			__be32	cqn;
+			u32	reserved1;
+			u8	reserved2[3];
+			u8	syndrome;	/* 1 is reported as "overrun", anything else as "access violation" */
+		} __attribute__((packed)) cq_err;
+		struct {
+			u32	reserved1[2];
+			__be32	port;		/* port number is taken from the top 4 bits */
+		} __attribute__((packed)) port_change;
+	}			event;		/* payload, interpreted according to 'type' */
+	u8			reserved3[3];
+	u8			owner;		/* bit 0x80 is the ownership bit tested by next_eqe_sw() */
+} __attribute__((packed));
+
+/*
+ * Write the EQ's consumer index to its doorbell register.
+ *
+ * The doorbell word carries the low 24 bits of eq->cons_index; a non-zero
+ * 'req_not' additionally sets bit 31 (callers pass 1 once they have
+ * finished draining the queue and 0 for intermediate updates --
+ * presumably "request notification / re-arm"; confirm against the
+ * hardware documentation).
+ */
+static void eq_set_ci(struct mlx4_eq *eq, int req_not)
+{
+	/* Shift as u32: "1 << 31" on a signed int is undefined behaviour. */
+	u32 db = (eq->cons_index & 0xffffff) | ((u32) req_not << 31);
+
+	__raw_writel((__force u32) cpu_to_be32(db), eq->doorbell);
+	/* We still want ordering, just not swabbing, so add a barrier */
+	mb();
+}
+
+/*
+ * Return the address of EQ slot 'entry'.  The index is reduced modulo the
+ * EQ size (eq->nent - 1 is used as a mask), turned into a byte offset and
+ * then split into a page number and an offset within that page.
+ */
+static struct mlx4_eqe *get_eqe(struct mlx4_eq *eq, u32 entry)
+{
+	u32 slot = entry & (eq->nent - 1);
+	unsigned long off = slot * MLX4_EQ_ENTRY_SIZE;
+	unsigned long page = off / PAGE_SIZE;
+	unsigned long in_page = off % PAGE_SIZE;
+
+	return eq->page_list[page].buf + in_page;
+}
+
+/*
+ * Return the entry at the current consumer index if software owns it,
+ * NULL otherwise.  Ownership is decided by comparing the entry's owner
+ * bit (0x80) with the cons_index bit just above the index mask
+ * (cons_index & nent): the entry is ours when the two agree.
+ */
+static struct mlx4_eqe *next_eqe_sw(struct mlx4_eq *eq)
+{
+	struct mlx4_eqe *eqe = get_eqe(eq, eq->cons_index);
+	int owner_bit = !!(eqe->owner & 0x80);
+	int wrap_bit = !!(eq->cons_index & eq->nent);
+
+	if (owner_bit != wrap_bit)
+		return NULL;
+
+	return eqe;
+}
+
+/*
+ * Drain one event queue.
+ *
+ * Consumes every software-owned entry starting at eq->cons_index and
+ * dispatches it by type (completion, QP, SRQ, command, port change, CQ
+ * error, ...).  The consumer index is written back every
+ * MLX4_NUM_SPARE_EQE entries while draining, and once more with the
+ * notification bit set when the queue is empty.
+ *
+ * Returns 1 if at least one entry was processed, 0 otherwise.
+ */
+static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq)
+{
+	struct mlx4_eqe *eqe;
+	int cqn;
+	int eqes_found = 0;
+	int set_ci = 0;
+	int port;
+
+	while ((eqe = next_eqe_sw(eq))) {
+		/*
+		 * Make sure we read EQ entry contents after we've
+		 * checked the ownership bit.
+		 */
+		rmb();
+
+		switch (eqe->type) {
+		case MLX4_EVENT_TYPE_COMP:
+			cqn = be32_to_cpu(eqe->event.comp.cqn) & 0xffffff;
+			mlx4_cq_completion(dev, cqn);
+			break;
+
+		case MLX4_EVENT_TYPE_PATH_MIG:
+		case MLX4_EVENT_TYPE_COMM_EST:
+		case MLX4_EVENT_TYPE_SQ_DRAINED:
+		case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE:
+		case MLX4_EVENT_TYPE_WQ_CATAS_ERROR:
+		case MLX4_EVENT_TYPE_PATH_MIG_FAILED:
+		case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+		case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR:
+			mlx4_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+				      eqe->type);
+			break;
+
+		case MLX4_EVENT_TYPE_SRQ_LIMIT:
+		case MLX4_EVENT_TYPE_SRQ_CATAS_ERROR:
+			mlx4_srq_event(dev, be32_to_cpu(eqe->event.srq.srqn) & 0xffffff,
+				      eqe->type);
+			break;
+
+		case MLX4_EVENT_TYPE_CMD:
+			mlx4_cmd_event(dev,
+				       be16_to_cpu(eqe->event.cmd.token),
+				       eqe->event.cmd.status,
+				       be64_to_cpu(eqe->event.cmd.out_param));
+			break;
+
+		case MLX4_EVENT_TYPE_PORT_CHANGE:
+			/* The port number lives in the top nibble. */
+			port = be32_to_cpu(eqe->event.port_change.port) >> 28;
+			if (eqe->subtype == MLX4_PORT_CHANGE_SUBTYPE_DOWN) {
+				mlx4_dispatch_event(dev, MLX4_DEV_EVENT_PORT_DOWN,
+						    port);
+				mlx4_priv(dev)->sense.do_sense_port[port] = 1;
+			} else {
+				mlx4_dispatch_event(dev, MLX4_DEV_EVENT_PORT_UP,
+						    port);
+				mlx4_priv(dev)->sense.do_sense_port[port] = 0;
+			}
+			break;
+
+		case MLX4_EVENT_TYPE_CQ_ERROR:
+			/*
+			 * Mask to the 24-bit CQN once and use the same value
+			 * for the log message and the event dispatch, as the
+			 * other arms of this switch do.
+			 */
+			cqn = be32_to_cpu(eqe->event.cq_err.cqn) & 0xffffff;
+			mlx4_warn(dev, "CQ %s on CQN %06x\n",
+				  eqe->event.cq_err.syndrome == 1 ?
+				  "overrun" : "access violation",
+				  cqn);
+			mlx4_cq_event(dev, cqn, eqe->type);
+			break;
+
+		case MLX4_EVENT_TYPE_EQ_OVERFLOW:
+			mlx4_warn(dev, "EQ overrun on EQN %d\n", eq->eqn);
+			break;
+
+		case MLX4_EVENT_TYPE_EEC_CATAS_ERROR:
+		case MLX4_EVENT_TYPE_ECC_DETECT:
+		default:
+			mlx4_warn(dev, "Unhandled event %02x(%02x) on EQ %d at index %u\n",
+				  eqe->type, eqe->subtype, eq->eqn, eq->cons_index);
+			break;
+		}
+
+		++eq->cons_index;
+		eqes_found = 1;
+		++set_ci;
+
+		/*
+		 * The HCA will think the queue has overflowed if we
+		 * don't tell it we've been processing events.  We
+		 * create our EQs with MLX4_NUM_SPARE_EQE extra
+		 * entries, so we must update our consumer index at
+		 * least that often.
+		 */
+		if (unlikely(set_ci >= MLX4_NUM_SPARE_EQE)) {
+			eq_set_ci(eq, 0);
+			set_ci = 0;
+		}
+	}
+
+	eq_set_ci(eq, 1);
+
+	return eqes_found;
+}
+
+/*
+ * Handler registered for the single (IRQF_SHARED) device interrupt when
+ * MSI-X is not in use: writes the clear mask to the interrupt-clear
+ * register, then drains every EQ (all completion EQs plus the async EQ).
+ * Reports "handled" only if some EQ actually had work.
+ */
+static irqreturn_t mlx4_interrupt(int irq, void *dev_ptr)
+{
+	struct mlx4_dev *dev = dev_ptr;
+	struct mlx4_eq_table *eq_table = &mlx4_priv(dev)->eq_table;
+	int nr_eqs = dev->caps.num_comp_vectors + 1;
+	int handled = 0;
+	int i;
+
+	writel(eq_table->clr_mask, eq_table->clr_int);
+
+	for (i = 0; i < nr_eqs; ++i) {
+		if (mlx4_eq_int(dev, &eq_table->eq[i]))
+			handled = 1;
+	}
+
+	return IRQ_RETVAL(handled);
+}
+
+/*
+ * Per-EQ MSI-X handler: 'eq_ptr' is the struct mlx4_eq bound to this
+ * vector.  Drains that one EQ and always reports the interrupt handled.
+ */
+static irqreturn_t mlx4_msi_x_interrupt(int irq, void *eq_ptr)
+{
+	struct mlx4_eq *eq = eq_ptr;
+
+	/* MSI-X vectors always belong to us, so the result is not needed. */
+	(void) mlx4_eq_int(eq->dev, eq);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Issue the MAP_EQ command: associate (unmap == 0) or dissociate
+ * (unmap == 1) the event types in 'event_mask' with EQ number 'eq_num'.
+ * Returns the mlx4_cmd() result.
+ */
+static int mlx4_MAP_EQ(struct mlx4_dev *dev, u64 event_mask, int unmap,
+			int eq_num)
+{
+	/* Shift as u32: "1 << 31" on a signed int is undefined behaviour. */
+	u32 in_mod = ((u32) unmap << 31) | eq_num;
+
+	return mlx4_cmd(dev, event_mask, in_mod,
+			0, MLX4_CMD_MAP_EQ, MLX4_CMD_TIME_CLASS_B);
+}
+
+/*
+ * Issue SW2HW_EQ for EQ 'eq_num', using the mailbox's DMA address as the
+ * command input.  Returns the mlx4_cmd() result.
+ */
+static int mlx4_SW2HW_EQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			 int eq_num)
+{
+	int err;
+
+	err = mlx4_cmd(dev, mailbox->dma, eq_num, 0, MLX4_CMD_SW2HW_EQ,
+		       MLX4_CMD_TIME_CLASS_A);
+
+	return err;
+}
+
+/*
+ * Issue HW2SW_EQ for EQ 'eq_num', using the mailbox's DMA address as the
+ * command output buffer.  Returns the mlx4_cmd_box() result.
+ */
+static int mlx4_HW2SW_EQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			 int eq_num)
+{
+	int err;
+
+	err = mlx4_cmd_box(dev, 0, mailbox->dma, eq_num, 0, MLX4_CMD_HW2SW_EQ,
+			   MLX4_CMD_TIME_CLASS_A);
+
+	return err;
+}
+
+/*
+ * Number of UAR pages that must be tracked for EQ doorbells.
+ *
+ * Each UAR holds 4 EQ doorbells, so an EQ number maps to UAR page
+ * eqn / 4.  The count is the distance between the page of the first
+ * non-reserved EQ number and the page of the upper bound
+ * (reserved_eqs + num_comp_vectors + 1), plus one.
+ */
+static int mlx4_num_eq_uar(struct mlx4_dev *dev)
+{
+	int first_eqn = dev->caps.reserved_eqs;
+	int bound_eqn = dev->caps.num_comp_vectors + 1 + first_eqn;
+
+	return bound_eqn / 4 - first_eqn / 4 + 1;
+}
+
+/*
+ * Return the doorbell address for 'eq', mapping the UAR page that holds
+ * it on first use and caching the mapping in eq_table.uar_map.
+ *
+ * Four EQs share a UAR page (page eqn / 4 of PCI resource 2); within the
+ * page the doorbell sits at 0x800 + 8 * (eqn % 4).  Returns NULL if the
+ * page cannot be mapped.
+ */
+static void __iomem *mlx4_get_eq_uar(struct mlx4_dev *dev, struct mlx4_eq *eq)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int uar_page = eq->eqn / 4;
+	int index = uar_page - dev->caps.reserved_eqs / 4;
+
+	if (priv->eq_table.uar_map[index] == NULL) {
+		priv->eq_table.uar_map[index] =
+			ioremap(pci_resource_start(dev->pdev, 2) +
+				(uar_page << PAGE_SHIFT),
+				PAGE_SIZE);
+
+		if (priv->eq_table.uar_map[index] == NULL) {
+			mlx4_err(dev, "Couldn't map EQ doorbell for EQN 0x%06x\n",
+				 eq->eqn);
+			return NULL;
+		}
+	}
+
+	return priv->eq_table.uar_map[index] + 0x800 + 8 * (eq->eqn % 4);
+}
+
+/*
+ * Create one event queue and hand it to the hardware.
+ *
+ * @dev:  device the EQ belongs to
+ * @nent: requested number of entries (rounded up to a power of two,
+ *        minimum 2)
+ * @intr: value stored in the EQ context's 'intr' field
+ * @eq:   EQ structure to initialise
+ *
+ * Allocates the page list and one DMA-coherent page per PAGE_SIZE of
+ * queue, reserves an EQ number, obtains the doorbell address, writes the
+ * MTT entries and finally issues SW2HW_EQ with a freshly built EQ
+ * context.  Everything acquired is released again on failure.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int mlx4_create_eq(struct mlx4_dev *dev, int nent,
+			  u8 intr, struct mlx4_eq *eq)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_eq_context *eq_context;
+	int npages;
+	u64 *dma_list = NULL;
+	dma_addr_t t;
+	u64 mtt_addr;
+	int err = -ENOMEM;
+	int i;
+
+	eq->dev   = dev;
+	eq->nent  = roundup_pow_of_two(max(nent, 2));
+	npages = PAGE_ALIGN(eq->nent * MLX4_EQ_ENTRY_SIZE) / PAGE_SIZE;
+
+	/*
+	 * kcalloc() checks the count * size multiplication and zeroes the
+	 * array, so every .buf starts out NULL for the unwind loop below.
+	 */
+	eq->page_list = kcalloc(npages, sizeof *eq->page_list, GFP_KERNEL);
+	if (!eq->page_list)
+		goto err_out;
+
+	dma_list = kcalloc(npages, sizeof *dma_list, GFP_KERNEL);
+	if (!dma_list)
+		goto err_out_free;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox)) {
+		/* Report the allocator's own error code. */
+		err = PTR_ERR(mailbox);
+		goto err_out_free;
+	}
+	eq_context = mailbox->buf;
+
+	for (i = 0; i < npages; ++i) {
+		eq->page_list[i].buf = dma_alloc_coherent(&dev->pdev->dev,
+							  PAGE_SIZE, &t, GFP_KERNEL);
+		if (!eq->page_list[i].buf)
+			goto err_out_free_pages;
+
+		dma_list[i] = t;
+		eq->page_list[i].map = t;
+
+		memset(eq->page_list[i].buf, 0, PAGE_SIZE);
+	}
+
+	eq->eqn = mlx4_bitmap_alloc(&priv->eq_table.bitmap);
+	if (eq->eqn == -1)
+		goto err_out_free_pages;
+
+	eq->doorbell = mlx4_get_eq_uar(dev, eq);
+	if (!eq->doorbell) {
+		err = -ENOMEM;
+		goto err_out_free_eq;
+	}
+
+	err = mlx4_mtt_init(dev, npages, PAGE_SHIFT, &eq->mtt);
+	if (err)
+		goto err_out_free_eq;
+
+	err = mlx4_write_mtt(dev, &eq->mtt, 0, npages, dma_list);
+	if (err)
+		goto err_out_free_mtt;
+
+	memset(eq_context, 0, sizeof *eq_context);
+	eq_context->flags	  = cpu_to_be32(MLX4_EQ_STATUS_OK   |
+						MLX4_EQ_STATE_ARMED);
+	eq_context->log_eq_size	  = ilog2(eq->nent);
+	eq_context->intr	  = intr;
+	eq_context->log_page_size = PAGE_SHIFT - MLX4_ICM_PAGE_SHIFT;
+
+	/* The MTT base address is split into a high byte and a low word. */
+	mtt_addr = mlx4_mtt_addr(dev, &eq->mtt);
+	eq_context->mtt_base_addr_h = mtt_addr >> 32;
+	eq_context->mtt_base_addr_l = cpu_to_be32(mtt_addr & 0xffffffff);
+
+	err = mlx4_SW2HW_EQ(dev, mailbox, eq->eqn);
+	if (err) {
+		mlx4_warn(dev, "SW2HW_EQ failed (%d)\n", err);
+		goto err_out_free_mtt;
+	}
+
+	/* Only needed while building the EQ; the pages stay in page_list. */
+	kfree(dma_list);
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	eq->cons_index = 0;
+
+	return err;
+
+err_out_free_mtt:
+	mlx4_mtt_cleanup(dev, &eq->mtt);
+
+err_out_free_eq:
+	mlx4_bitmap_free(&priv->eq_table.bitmap, eq->eqn);
+
+err_out_free_pages:
+	/* Pages that were never allocated still have a NULL .buf. */
+	for (i = 0; i < npages; ++i)
+		if (eq->page_list[i].buf)
+			dma_free_coherent(&dev->pdev->dev, PAGE_SIZE,
+					  eq->page_list[i].buf,
+					  eq->page_list[i].map);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+err_out_free:
+	kfree(eq->page_list);
+	kfree(dma_list);
+
+err_out:
+	return err;
+}
+
+/*
+ * Tear down one EQ: issue HW2SW_EQ, then release the MTT, the queue
+ * pages, the page list and the EQ number.
+ *
+ * If no command mailbox can be allocated the function returns without
+ * releasing anything.  A failing HW2SW_EQ is only logged; the resources
+ * are released regardless.
+ */
+static void mlx4_free_eq(struct mlx4_dev *dev,
+			 struct mlx4_eq *eq)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_cmd_mailbox *mailbox;
+	int npages = PAGE_ALIGN(MLX4_EQ_ENTRY_SIZE * eq->nent) / PAGE_SIZE;
+	int err;
+	int i;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return;
+
+	err = mlx4_HW2SW_EQ(dev, mailbox, eq->eqn);
+	if (err != 0)
+		mlx4_warn(dev, "HW2SW_EQ failed (%d)\n", err);
+
+	/* Compiled-out debugging aid: dump the context returned above. */
+	if (0) {
+		mlx4_dbg(dev, "Dumping EQ context %02x:\n", eq->eqn);
+		for (i = 0; i < sizeof (struct mlx4_eq_context) / 4; ++i) {
+			if (i % 4 == 0)
+				printk("[%02x] ", i * 4);
+			printk(" %08x", be32_to_cpup(mailbox->buf + i * 4));
+			if ((i + 1) % 4 == 0)
+				printk("\n");
+		}
+	}
+
+	mlx4_mtt_cleanup(dev, &eq->mtt);
+
+	i = 0;
+	while (i < npages) {
+		pci_free_consistent(dev->pdev, PAGE_SIZE,
+				    eq->page_list[i].buf,
+				    eq->page_list[i].map);
+		++i;
+	}
+
+	kfree(eq->page_list);
+	mlx4_bitmap_free(&priv->eq_table.bitmap, eq->eqn);
+	mlx4_free_cmd_mailbox(dev, mailbox);
+}
+
+/*
+ * Release every interrupt this driver requested: the shared device IRQ
+ * (if registered) and each per-EQ vector, then free the IRQ name buffer.
+ *
+ * All bookkeeping is reset as resources are released, so a second call
+ * is a no-op instead of a double free_irq()/kfree().
+ */
+static void mlx4_free_irqs(struct mlx4_dev *dev)
+{
+	struct mlx4_eq_table *eq_table = &mlx4_priv(dev)->eq_table;
+	int i;
+
+	if (eq_table->have_irq) {
+		free_irq(dev->pdev->irq, dev);
+		eq_table->have_irq = 0;
+	}
+	for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i)
+		if (eq_table->eq[i].have_irq) {
+			free_irq(eq_table->eq[i].irq, eq_table->eq + i);
+			eq_table->eq[i].have_irq = 0;
+		}
+
+	kfree(eq_table->irq_names);
+	eq_table->irq_names = NULL;
+}
+
+/*
+ * Map the interrupt-clear register (MLX4_CLR_INT_SIZE bytes at offset
+ * fw.clr_int_base of BAR fw.clr_int_bar) into priv->clr_base.
+ * Returns 0 on success, -ENOMEM if the mapping fails.
+ */
+static int mlx4_map_clr_int(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	priv->clr_base = ioremap(pci_resource_start(dev->pdev,
+						    priv->fw.clr_int_bar) +
+				 priv->fw.clr_int_base, MLX4_CLR_INT_SIZE);
+	if (priv->clr_base != NULL)
+		return 0;
+
+	mlx4_err(dev, "Couldn't map interrupt clear register, aborting.\n");
+	return -ENOMEM;
+}
+
+/* Undo mlx4_map_clr_int(): unmap the interrupt-clear register. */
+static void mlx4_unmap_clr_int(struct mlx4_dev *dev)
+{
+	iounmap(mlx4_priv(dev)->clr_base);
+}
+
+/*
+ * Allocate the zero-initialised array of EQ structures, one per
+ * non-reserved EQ (num_eqs - reserved_eqs).
+ * Returns 0 on success, -ENOMEM on allocation failure.
+ */
+int mlx4_alloc_eq_table(struct mlx4_dev *dev)
+{
+	struct mlx4_eq_table *eq_table = &mlx4_priv(dev)->eq_table;
+	int usable_eqs = dev->caps.num_eqs - dev->caps.reserved_eqs;
+
+	eq_table->eq = kcalloc(usable_eqs, sizeof *eq_table->eq, GFP_KERNEL);
+
+	return eq_table->eq ? 0 : -ENOMEM;
+}
+
+/* Free the EQ array allocated by mlx4_alloc_eq_table(). */
+void mlx4_free_eq_table(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	kfree(priv->eq_table.eq);
+}
+
+/*
+ * Bring up the whole EQ table: UAR map array, EQ number bitmap,
+ * interrupt-clear register, one completion EQ per completion vector plus
+ * the async EQ, and the interrupt handlers (one per EQ with MSI-X,
+ * otherwise a single shared IRQ).  Finally maps the async events to the
+ * async EQ and arms every EQ.
+ *
+ * A MAP_EQ failure for the async EQ is only logged.
+ *
+ * Returns 0 on success or a negative errno; on failure everything set up
+ * so far is released, including the interrupt-clear mapping and any EQ
+ * doorbell UAR pages mapped while creating EQs.
+ */
+int mlx4_init_eq_table(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int err;
+	int i;
+
+	/* Zeroed by kcalloc(): a NULL slot means "not mapped yet". */
+	priv->eq_table.uar_map = kcalloc(mlx4_num_eq_uar(dev),
+					 sizeof *priv->eq_table.uar_map,
+					 GFP_KERNEL);
+	if (!priv->eq_table.uar_map) {
+		err = -ENOMEM;
+		goto err_out_free;
+	}
+
+	err = mlx4_bitmap_init(&priv->eq_table.bitmap, dev->caps.num_eqs,
+			       dev->caps.num_eqs - 1, dev->caps.reserved_eqs, 0);
+	if (err)
+		goto err_out_free;
+
+	err = mlx4_map_clr_int(dev);
+	if (err)
+		goto err_out_bitmap;
+
+	priv->eq_table.clr_mask =
+		swab32(1 << (priv->eq_table.inta_pin & 31));
+	priv->eq_table.clr_int  = priv->clr_base +
+		(priv->eq_table.inta_pin < 32 ? 4 : 0);
+
+	/* 16 bytes of name per completion vector, see snprintf() below. */
+	priv->eq_table.irq_names = kmalloc(16 * dev->caps.num_comp_vectors, GFP_KERNEL);
+	if (!priv->eq_table.irq_names) {
+		err = -ENOMEM;
+		/* The clear register is already mapped: undo that too. */
+		goto err_out_clr_int;
+	}
+
+	for (i = 0; i < dev->caps.num_comp_vectors; ++i) {
+		err = mlx4_create_eq(dev, dev->caps.num_cqs + MLX4_NUM_SPARE_EQE,
+				     (dev->flags & MLX4_FLAG_MSI_X) ? i : 0,
+				     &priv->eq_table.eq[i]);
+		if (err) {
+			/* EQ i was not created; unwind from i - 1 down. */
+			--i;
+			goto err_out_unmap;
+		}
+	}
+
+	err = mlx4_create_eq(dev, MLX4_NUM_ASYNC_EQE + MLX4_NUM_SPARE_EQE,
+			     (dev->flags & MLX4_FLAG_MSI_X) ? dev->caps.num_comp_vectors : 0,
+			     &priv->eq_table.eq[dev->caps.num_comp_vectors]);
+	if (err)
+		goto err_out_comp;
+
+	if (dev->flags & MLX4_FLAG_MSI_X) {
+		static const char async_eq_name[] = DRV_NAME "(async)";
+		const char *eq_name;
+
+		for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i) {
+			if (i < dev->caps.num_comp_vectors) {
+				snprintf(priv->eq_table.irq_names + i * 16, 16,
+					 "eth-mlx4-%d", i);
+				eq_name = priv->eq_table.irq_names + i * 16;
+			} else
+				eq_name = async_eq_name;
+
+			err = request_irq(priv->eq_table.eq[i].irq,
+					  mlx4_msi_x_interrupt, 0, eq_name,
+					  priv->eq_table.eq + i);
+			if (err)
+				goto err_out_async;
+
+			priv->eq_table.eq[i].have_irq = 1;
+		}
+	} else {
+		err = request_irq(dev->pdev->irq, mlx4_interrupt,
+				  IRQF_SHARED, DRV_NAME, dev);
+		if (err)
+			goto err_out_async;
+
+		priv->eq_table.have_irq = 1;
+	}
+
+	err = mlx4_MAP_EQ(dev, MLX4_ASYNC_EVENT_MASK, 0,
+			  priv->eq_table.eq[dev->caps.num_comp_vectors].eqn);
+	if (err)
+		mlx4_warn(dev, "MAP_EQ for async EQ %d failed (%d)\n",
+			   priv->eq_table.eq[dev->caps.num_comp_vectors].eqn, err);
+
+	for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i)
+		eq_set_ci(&priv->eq_table.eq[i], 1);
+
+	return 0;
+
+err_out_async:
+	mlx4_free_eq(dev, &priv->eq_table.eq[dev->caps.num_comp_vectors]);
+
+err_out_comp:
+	i = dev->caps.num_comp_vectors - 1;
+
+err_out_unmap:
+	while (i >= 0) {
+		mlx4_free_eq(dev, &priv->eq_table.eq[i]);
+		--i;
+	}
+	/* Also frees irq_names. */
+	mlx4_free_irqs(dev);
+
+err_out_clr_int:
+	mlx4_unmap_clr_int(dev);
+
+err_out_bitmap:
+	/* Drop any doorbell pages mlx4_get_eq_uar() mapped along the way. */
+	for (i = 0; i < mlx4_num_eq_uar(dev); ++i)
+		if (priv->eq_table.uar_map[i])
+			iounmap(priv->eq_table.uar_map[i]);
+
+	mlx4_bitmap_cleanup(&priv->eq_table.bitmap);
+
+err_out_free:
+	kfree(priv->eq_table.uar_map);
+
+	return err;
+}
+
+/*
+ * Tear down what mlx4_init_eq_table() set up: unmap the async events,
+ * release the interrupts, free every EQ, then drop the interrupt-clear
+ * mapping, the doorbell UAR mappings, the EQ number bitmap and the UAR
+ * map array.
+ */
+void mlx4_cleanup_eq_table(struct mlx4_dev *dev)
+{
+	struct mlx4_eq_table *eq_table = &mlx4_priv(dev)->eq_table;
+	int nr_eqs = dev->caps.num_comp_vectors + 1;
+	int i;
+
+	/* The async EQ is the last one in the table. */
+	mlx4_MAP_EQ(dev, MLX4_ASYNC_EVENT_MASK, 1,
+		    eq_table->eq[nr_eqs - 1].eqn);
+
+	mlx4_free_irqs(dev);
+
+	for (i = 0; i < nr_eqs; ++i)
+		mlx4_free_eq(dev, &eq_table->eq[i]);
+
+	mlx4_unmap_clr_int(dev);
+
+	for (i = 0; i < mlx4_num_eq_uar(dev); ++i) {
+		if (eq_table->uar_map[i] == NULL)
+			continue;
+		iounmap(eq_table->uar_map[i]);
+	}
+
+	mlx4_bitmap_cleanup(&eq_table->bitmap);
+
+	kfree(eq_table->uar_map);
+}
+
+/* A test that verifies that we can accept interrupts on all
+ * the irq vectors of the device.
+ * Interrupts are checked using the NOP command.
+ *
+ * The async events are temporarily mapped to each completion EQ in turn
+ * and a NOP is issued in event mode; afterwards they are mapped back to
+ * the async EQ.  Without MSI-X only the initial NOP is performed.
+ *
+ * Returns 0 on success, otherwise the first error returned by
+ * mlx4_NOP() or mlx4_MAP_EQ() (the result of the final re-map is not
+ * checked).
+ */
+int mlx4_test_interrupts(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int i;
+	int err;
+
+	err = mlx4_NOP(dev);
+	/* When not in MSI_X, there is only one irq to check */
+	if (!(dev->flags & MLX4_FLAG_MSI_X))
+		return err;
+
+	/* A loop over all completion vectors, for each vector we will check
+	 * whether it works by mapping command completions to that vector
+	 * and performing a NOP command
+	 */
+	for(i = 0; !err && (i < dev->caps.num_comp_vectors); ++i) {
+		/* Temporarily use polling for command completions */
+		mlx4_cmd_use_polling(dev);
+
+		/* Map the new eq to handle all asynchronous events */
+		err = mlx4_MAP_EQ(dev, MLX4_ASYNC_EVENT_MASK, 0,
+				  priv->eq_table.eq[i].eqn);
+		if (err) {
+			mlx4_warn(dev, "Failed mapping eq for interrupt test\n");
+			mlx4_cmd_use_events(dev);
+			break;
+		}
+
+		/* Go back to using events */
+		mlx4_cmd_use_events(dev);
+		err = mlx4_NOP(dev);
+	}
+
+	/* Return to default: async events go back to the async EQ */
+	mlx4_MAP_EQ(dev, MLX4_ASYNC_EVENT_MASK, 0,
+		    priv->eq_table.eq[dev->caps.num_comp_vectors].eqn);
+	return err;
+}
+EXPORT_SYMBOL(mlx4_test_interrupts);
diff --git a/sys/ofed/drivers/net/mlx4/fw.c b/sys/ofed/drivers/net/mlx4/fw.c
new file mode 100644
index 0000000..774d261
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/fw.c
@@ -0,0 +1,1010 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx4/cmd.h>
+
+#include "fw.h"
+#include "icm.h"
+
+enum {
+	MLX4_COMMAND_INTERFACE_MIN_REV		= 2,	/* presumably the oldest command interface revision accepted -- confirm at the use site */
+	MLX4_COMMAND_INTERFACE_MAX_REV		= 3,	/* presumably the newest command interface revision accepted -- confirm at the use site */
+	MLX4_COMMAND_INTERFACE_NEW_PORT_CMDS	= 3,	/* presumably the first revision with the newer port commands -- confirm at the use site */
+};
+
+extern void __buggy_use_of_MLX4_GET(void);	/* called from the default: arm of MLX4_GET (unsupported destination size) */
+extern void __buggy_use_of_MLX4_PUT(void);	/* called from the default: arm of MLX4_PUT (unsupported source size) */
+
+static int enable_qos;	/* backing variable for the read-only (0444) "enable_qos" parameter */
+module_param(enable_qos, bool, 0444);
+MODULE_PARM_DESC(enable_qos, "Enable Quality of Service support in the HCA (default: off)");
+
+static int mlx4_pre_t11_mode = 0;	/* exposed under the parameter name "enable_pre_t11_mode" (0644) */
+module_param_named(enable_pre_t11_mode, mlx4_pre_t11_mode, int, 0644);
+MODULE_PARM_DESC(enable_pre_t11_mode, "For FCoXX, enable pre-t11 mode if non-zero (default: 0)");
+
+/*
+ * MLX4_GET - read one field from a mailbox buffer.
+ *
+ * Loads the value found 'offset' bytes into 'source' and stores it in
+ * 'dest'.  The access width is sizeof(dest); 2-, 4- and 8-byte fields are
+ * converted from big-endian to CPU order.  Any other size ends up calling
+ * __buggy_use_of_MLX4_GET().
+ */
+#define MLX4_GET(dest, source, offset)				      \
+	do {							      \
+		void *__field = (char *) (source) + (offset);	      \
+		switch (sizeof (dest)) {			      \
+		case 1:						      \
+			(dest) = *(u8 *) __field;		      \
+			break;					      \
+		case 2:						      \
+			(dest) = be16_to_cpup(__field);		      \
+			break;					      \
+		case 4:						      \
+			(dest) = be32_to_cpup(__field);		      \
+			break;					      \
+		case 8:						      \
+			(dest) = be64_to_cpup(__field);		      \
+			break;					      \
+		default:					      \
+			__buggy_use_of_MLX4_GET();		      \
+		}						      \
+	} while (0)
+
+/*
+ * MLX4_PUT - write one field into a mailbox buffer.
+ *
+ * Stores 'source' at 'offset' bytes into 'dest'.  The access width is
+ * sizeof(source); 2-, 4- and 8-byte values are converted from CPU order
+ * to big-endian.  Any other size ends up calling
+ * __buggy_use_of_MLX4_PUT().
+ */
+#define MLX4_PUT(dest, source, offset)				      \
+	do {							      \
+		void *__field = ((char *) (dest) + (offset));	      \
+		switch (sizeof(source)) {			      \
+		case 1:						      \
+			*(u8 *) __field = (source);		      \
+			break;					      \
+		case 2:						      \
+			*(__be16 *) __field = cpu_to_be16(source);    \
+			break;					      \
+		case 4:						      \
+			*(__be32 *) __field = cpu_to_be32(source);    \
+			break;					      \
+		case 8:						      \
+			*(__be64 *) __field = cpu_to_be64(source);    \
+			break;					      \
+		default:					      \
+			__buggy_use_of_MLX4_PUT();		      \
+		}						      \
+	} while (0)
+
+/*
+ * Log, at debug level, the name of every capability bit that is set in
+ * 'flags'.  Bits without an entry in the name table are silently skipped.
+ */
+static void dump_dev_cap_flags(struct mlx4_dev *dev, u64 flags)
+{
+	/* Indexed by bit position in the DEV_CAP flags word. */
+	static const char *cap_names[] = {
+		[ 0] = "RC transport",
+		[ 1] = "UC transport",
+		[ 2] = "UD transport",
+		[ 3] = "XRC transport",
+		[ 4] = "reliable multicast",
+		[ 5] = "FCoIB support",
+		[ 6] = "SRQ support",
+		[ 7] = "IPoIB checksum offload",
+		[ 8] = "P_Key violation counter",
+		[ 9] = "Q_Key violation counter",
+		[10] = "VMM",
+		[12] = "DPDP",
+		[16] = "MW support",
+		[17] = "APM support",
+		[18] = "Atomic ops support",
+		[19] = "Raw multicast support",
+		[20] = "Address vector port checking support",
+		[21] = "UD multicast support",
+		[24] = "Demand paging support",
+		[25] = "Router support",
+		[30] = "IBoE support",
+		[48] = "Basic counters support",
+		[49] = "Extended counters support",
+	};
+	int bit;
+
+	mlx4_dbg(dev, "DEV_CAP flags:\n");
+	for (bit = 0; bit < ARRAY_SIZE(cap_names); ++bit) {
+		if (!cap_names[bit])
+			continue;
+		if (!(flags & (1LL << bit)))
+			continue;
+		mlx4_dbg(dev, "    %s\n", cap_names[bit]);
+	}
+}
+
+/*
+ * Issue the MOD_STAT_CFG command.
+ *
+ * Builds a zeroed MOD_STAT_CFG_IN_SIZE-byte input mailbox carrying
+ * cfg->log_pg_sz and cfg->log_pg_sz_m at their respective offsets and
+ * sends it to the device.
+ *
+ * Returns 0 on success, the mailbox allocation error, or the mlx4_cmd()
+ * result.
+ */
+int mlx4_MOD_STAT_CFG(struct mlx4_dev *dev, struct mlx4_mod_stat_cfg *cfg)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 *inbox;
+	int err;
+
+#define MOD_STAT_CFG_IN_SIZE		0x100
+
+#define MOD_STAT_CFG_PG_SZ_M_OFFSET	0x002
+#define MOD_STAT_CFG_PG_SZ_OFFSET	0x003
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	/* Start from an all-zero input block, then fill in the two fields. */
+	inbox = mailbox->buf;
+	memset(inbox, 0, MOD_STAT_CFG_IN_SIZE);
+
+	MLX4_PUT(inbox, cfg->log_pg_sz, MOD_STAT_CFG_PG_SZ_OFFSET);
+	MLX4_PUT(inbox, cfg->log_pg_sz_m, MOD_STAT_CFG_PG_SZ_M_OFFSET);
+
+	err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_MOD_STAT_CFG,
+		       MLX4_CMD_TIME_CLASS_A);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	return err;
+}
+
+int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 *outbox;
+	u8 field;
+	u32 field32;
+	u16 size;
+	u16 stat_rate;
+	int err;
+	int i;
+	u32 in_modifier;
+	u64 out_param;
+	u32 tmp1, tmp2;
+
+#define QUERY_DEV_CAP_OUT_SIZE		       0x100
+#define QUERY_DEV_CAP_MAX_SRQ_SZ_OFFSET		0x10
+#define QUERY_DEV_CAP_MAX_QP_SZ_OFFSET		0x11
+#define QUERY_DEV_CAP_RSVD_QP_OFFSET		0x12
+#define QUERY_DEV_CAP_MAX_QP_OFFSET		0x13
+#define QUERY_DEV_CAP_RSVD_SRQ_OFFSET		0x14
+#define QUERY_DEV_CAP_MAX_SRQ_OFFSET		0x15
+#define QUERY_DEV_CAP_RSVD_EEC_OFFSET		0x16
+#define QUERY_DEV_CAP_MAX_EEC_OFFSET		0x17
+#define QUERY_DEV_CAP_MAX_CQ_SZ_OFFSET		0x19
+#define QUERY_DEV_CAP_RSVD_CQ_OFFSET		0x1a
+#define QUERY_DEV_CAP_MAX_CQ_OFFSET		0x1b
+#define QUERY_DEV_CAP_MAX_MPT_OFFSET		0x1d
+#define QUERY_DEV_CAP_RSVD_EQ_OFFSET		0x1e
+#define QUERY_DEV_CAP_MAX_EQ_OFFSET		0x1f
+#define QUERY_DEV_CAP_RSVD_MTT_OFFSET		0x20
+#define QUERY_DEV_CAP_MAX_MRW_SZ_OFFSET		0x21
+#define QUERY_DEV_CAP_RSVD_MRW_OFFSET		0x22
+#define QUERY_DEV_CAP_MAX_MTT_SEG_OFFSET	0x23
+#define QUERY_DEV_CAP_MAX_AV_OFFSET		0x27
+#define QUERY_DEV_CAP_MAX_REQ_QP_OFFSET		0x29
+#define QUERY_DEV_CAP_MAX_RES_QP_OFFSET		0x2b
+#define QUERY_DEV_CAP_MAX_GSO_OFFSET		0x2d
+#define QUERY_DEV_CAP_MAX_RDMA_OFFSET		0x2f
+#define QUERY_DEV_CAP_STAT_CFG_INL_OFFSET	0x31
+#define QUERY_DEV_CAP_RSZ_SRQ_OFFSET		0x33
+#define QUERY_DEV_CAP_ACK_DELAY_OFFSET		0x35
+#define QUERY_DEV_CAP_MTU_WIDTH_OFFSET		0x36
+#define QUERY_DEV_CAP_VL_PORT_OFFSET		0x37
+#define QUERY_DEV_CAP_MAX_MSG_SZ_OFFSET		0x38
+#define QUERY_DEV_CAP_MAX_GID_OFFSET		0x3b
+#define QUERY_DEV_CAP_RATE_SUPPORT_OFFSET	0x3c
+#define QUERY_DEV_CAP_MAX_PKEY_OFFSET		0x3f
+#define QUERY_DEV_CAP_EXT_FLAGS_OFFSET		0x40
+#define QUERY_DEV_CAP_UDP_RSS_OFFSET		0x42
+#define QUERY_DEV_CAP_ETH_UC_LOOPBACK_OFFSET	0x43
+#define QUERY_DEV_CAP_FLAGS_OFFSET		0x44
+#define QUERY_DEV_CAP_RSVD_UAR_OFFSET		0x48
+#define QUERY_DEV_CAP_UAR_SZ_OFFSET		0x49
+#define QUERY_DEV_CAP_PAGE_SZ_OFFSET		0x4b
+#define QUERY_DEV_CAP_BF_OFFSET			0x4c
+#define QUERY_DEV_CAP_LOG_BF_REG_SZ_OFFSET	0x4d
+#define QUERY_DEV_CAP_LOG_MAX_BF_REGS_PER_PAGE_OFFSET	0x4e
+#define QUERY_DEV_CAP_LOG_MAX_BF_PAGES_OFFSET	0x4f
+#define QUERY_DEV_CAP_MAX_SG_SQ_OFFSET		0x51
+#define QUERY_DEV_CAP_MAX_DESC_SZ_SQ_OFFSET	0x52
+#define QUERY_DEV_CAP_MAX_SG_RQ_OFFSET		0x55
+#define QUERY_DEV_CAP_MAX_DESC_SZ_RQ_OFFSET	0x56
+#define QUERY_DEV_CAP_MAX_QP_MCG_OFFSET		0x61
+#define QUERY_DEV_CAP_RSVD_MCG_OFFSET		0x62
+#define QUERY_DEV_CAP_MAX_MCG_OFFSET		0x63
+#define QUERY_DEV_CAP_RSVD_PD_OFFSET		0x64
+#define QUERY_DEV_CAP_MAX_PD_OFFSET		0x65
+#define QUERY_DEV_CAP_RSVD_XRC_OFFSET		0x66
+#define QUERY_DEV_CAP_MAX_XRC_OFFSET		0x67
+#define QUERY_DEV_CAP_RDMARC_ENTRY_SZ_OFFSET	0x80
+#define QUERY_DEV_CAP_QPC_ENTRY_SZ_OFFSET	0x82
+#define QUERY_DEV_CAP_AUX_ENTRY_SZ_OFFSET	0x84
+#define QUERY_DEV_CAP_ALTC_ENTRY_SZ_OFFSET	0x86
+#define QUERY_DEV_CAP_EQC_ENTRY_SZ_OFFSET	0x88
+#define QUERY_DEV_CAP_CQC_ENTRY_SZ_OFFSET	0x8a
+#define QUERY_DEV_CAP_SRQ_ENTRY_SZ_OFFSET	0x8c
+#define QUERY_DEV_CAP_C_MPT_ENTRY_SZ_OFFSET	0x8e
+#define QUERY_DEV_CAP_MTT_ENTRY_SZ_OFFSET	0x90
+#define QUERY_DEV_CAP_D_MPT_ENTRY_SZ_OFFSET	0x92
+#define QUERY_DEV_CAP_BMME_FLAGS_OFFSET		0x94
+#define QUERY_DEV_CAP_RSVD_LKEY_OFFSET		0x98
+#define QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET		0xa0
+#define QUERY_DEV_CAP_MAX_BASIC_CNT_OFFSET	0x68
+#define QUERY_DEV_CAP_MAX_EXT_CNT_OFFSET	0x6c
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	outbox = mailbox->buf;
+
+	err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_DEV_CAP,
+			   MLX4_CMD_TIME_CLASS_A);
+	if (err)
+		goto out;
+
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_QP_OFFSET);
+	dev_cap->reserved_qps = 1 << (field & 0xf);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_QP_OFFSET);
+	dev_cap->max_qps = 1 << (field & 0x1f);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_SRQ_OFFSET);
+	dev_cap->reserved_srqs = 1 << (field >> 4);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_SRQ_OFFSET);
+	dev_cap->max_srqs = 1 << (field & 0x1f);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_CQ_SZ_OFFSET);
+	dev_cap->max_cq_sz = 1 << field;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_CQ_OFFSET);
+	dev_cap->reserved_cqs = 1 << (field & 0xf);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_CQ_OFFSET);
+	dev_cap->max_cqs = 1 << (field & 0x1f);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MPT_OFFSET);
+	dev_cap->max_mpts = 1 << (field & 0x3f);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_EQ_OFFSET);
+	dev_cap->reserved_eqs = 1 << (field & 0xf);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_EQ_OFFSET);
+	dev_cap->max_eqs = 1 << (field & 0xf);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_MTT_OFFSET);
+	dev_cap->reserved_mtts = 1 << (field >> 4);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MRW_SZ_OFFSET);
+	dev_cap->max_mrw_sz = 1 << field;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_MRW_OFFSET);
+	dev_cap->reserved_mrws = 1 << (field & 0xf);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MTT_SEG_OFFSET);
+	dev_cap->max_mtt_seg = 1 << (field & 0x3f);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_REQ_QP_OFFSET);
+	dev_cap->max_requester_per_qp = 1 << (field & 0x3f);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_RES_QP_OFFSET);
+	dev_cap->max_responder_per_qp = 1 << (field & 0x3f);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_GSO_OFFSET);
+	field &= 0x1f;
+	if (!field)
+		dev_cap->max_gso_sz = 0;
+	else
+		dev_cap->max_gso_sz = 1 << field;
+
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_RDMA_OFFSET);
+	dev_cap->max_rdma_global = 1 << (field & 0x3f);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_ACK_DELAY_OFFSET);
+	dev_cap->local_ca_ack_delay = field & 0x1f;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_VL_PORT_OFFSET);
+	dev_cap->num_ports = field & 0xf;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MSG_SZ_OFFSET);
+	dev_cap->max_msg_sz = 1 << (field & 0x1f);
+	MLX4_GET(stat_rate, outbox, QUERY_DEV_CAP_RATE_SUPPORT_OFFSET);
+	dev_cap->stat_rate_support = stat_rate;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_UDP_RSS_OFFSET);
+	dev_cap->udp_rss = field & 0x1;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_ETH_UC_LOOPBACK_OFFSET);
+	dev_cap->loopback_support = field & 0x1;
+	MLX4_GET(tmp1, outbox, QUERY_DEV_CAP_EXT_FLAGS_OFFSET);
+	MLX4_GET(tmp2, outbox, QUERY_DEV_CAP_FLAGS_OFFSET);
+	dev_cap->flags = tmp2 | (u64)tmp1 << 32;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_UAR_OFFSET);
+	dev_cap->reserved_uars = field >> 4;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_UAR_SZ_OFFSET);
+	dev_cap->uar_size = 1 << ((field & 0x3f) + 20);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_PAGE_SZ_OFFSET);
+	dev_cap->min_page_sz = 1 << field;
+
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_BF_OFFSET);
+	if (field & 0x80) {
+		MLX4_GET(field, outbox, QUERY_DEV_CAP_LOG_BF_REG_SZ_OFFSET);
+		dev_cap->bf_reg_size = 1 << (field & 0x1f);
+		MLX4_GET(field, outbox, QUERY_DEV_CAP_LOG_MAX_BF_REGS_PER_PAGE_OFFSET);
+		if ((1 << (field & 0x3f)) > (PAGE_SIZE / dev_cap->bf_reg_size)) {
+			mlx4_dbg(dev, "log blue flame is invalid (%d), forcing 3\n", field & 0x1f);
+			field = 3;
+		}
+		dev_cap->bf_regs_per_page = 1 << (field & 0x3f);
+		mlx4_dbg(dev, "BlueFlame available (reg size %d, regs/page %d)\n",
+			 dev_cap->bf_reg_size, dev_cap->bf_regs_per_page);
+	} else {
+		dev_cap->bf_reg_size = 0;
+		mlx4_dbg(dev, "BlueFlame not available\n");
+	}
+
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_SG_SQ_OFFSET);
+	dev_cap->max_sq_sg = field;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_MAX_DESC_SZ_SQ_OFFSET);
+	dev_cap->max_sq_desc_sz = size;
+
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_QP_MCG_OFFSET);
+	dev_cap->max_qp_per_mcg = 1 << field;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_MCG_OFFSET);
+	dev_cap->reserved_mgms = field & 0xf;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MCG_OFFSET);
+	dev_cap->max_mcgs = 1 << field;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_PD_OFFSET);
+	dev_cap->reserved_pds = field >> 4;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_PD_OFFSET);
+	dev_cap->max_pds = 1 << (field & 0x3f);
+
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_XRC_OFFSET);
+	dev_cap->reserved_xrcds = field >> 4;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_XRC_OFFSET);
+	dev_cap->max_xrcds = 1 << (field & 0x1f);
+
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_RDMARC_ENTRY_SZ_OFFSET);
+	dev_cap->rdmarc_entry_sz = size;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_QPC_ENTRY_SZ_OFFSET);
+	dev_cap->qpc_entry_sz = size;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_AUX_ENTRY_SZ_OFFSET);
+	dev_cap->aux_entry_sz = size;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_ALTC_ENTRY_SZ_OFFSET);
+	dev_cap->altc_entry_sz = size;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_EQC_ENTRY_SZ_OFFSET);
+	dev_cap->eqc_entry_sz = size;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_CQC_ENTRY_SZ_OFFSET);
+	dev_cap->cqc_entry_sz = size;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_SRQ_ENTRY_SZ_OFFSET);
+	dev_cap->srq_entry_sz = size;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_C_MPT_ENTRY_SZ_OFFSET);
+	dev_cap->cmpt_entry_sz = size;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_MTT_ENTRY_SZ_OFFSET);
+	dev_cap->mtt_entry_sz = size;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_D_MPT_ENTRY_SZ_OFFSET);
+	dev_cap->dmpt_entry_sz = size;
+
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_SRQ_SZ_OFFSET);
+	dev_cap->max_srq_sz = 1 << field;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_QP_SZ_OFFSET);
+	dev_cap->max_qp_sz = 1 << field;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_STAT_CFG_INL_OFFSET);
+	dev_cap->inline_cfg = field & 1;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSZ_SRQ_OFFSET);
+	dev_cap->resize_srq = field & 1;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_SG_RQ_OFFSET);
+	dev_cap->max_rq_sg = field;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_MAX_DESC_SZ_RQ_OFFSET);
+	dev_cap->max_rq_desc_sz = size;
+
+	MLX4_GET(dev_cap->bmme_flags, outbox,
+		 QUERY_DEV_CAP_BMME_FLAGS_OFFSET);
+	MLX4_GET(dev_cap->reserved_lkey, outbox,
+		 QUERY_DEV_CAP_RSVD_LKEY_OFFSET);
+	MLX4_GET(dev_cap->max_icm_sz, outbox,
+		 QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET);
+	MLX4_GET(dev_cap->max_basic_counters, outbox,
+		 QUERY_DEV_CAP_MAX_BASIC_CNT_OFFSET);
+	MLX4_GET(dev_cap->max_ext_counters, outbox,
+		 QUERY_DEV_CAP_MAX_EXT_CNT_OFFSET);
+
+	if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) {
+		for (i = 1; i <= dev_cap->num_ports; ++i) {
+			MLX4_GET(field, outbox, QUERY_DEV_CAP_VL_PORT_OFFSET);
+			dev_cap->max_vl[i]	   = field >> 4;
+			MLX4_GET(field, outbox, QUERY_DEV_CAP_MTU_WIDTH_OFFSET);
+			dev_cap->ib_mtu[i]	   = field >> 4;
+			dev_cap->max_port_width[i] = field & 0xf;
+			MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_GID_OFFSET);
+			dev_cap->max_gids[i]	   = 1 << (field & 0xf);
+			MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_PKEY_OFFSET);
+			dev_cap->max_pkeys[i]	   = 1 << (field & 0xf);
+		}
+	} else {
+#define QUERY_PORT_SUPPORTED_TYPE_OFFSET	0x00
+#define QUERY_PORT_MTU_OFFSET			0x01
+#define QUERY_PORT_ETH_MTU_OFFSET		0x02
+#define QUERY_PORT_WIDTH_OFFSET			0x06
+#define QUERY_PORT_MAX_GID_PKEY_OFFSET		0x07
+#define QUERY_PORT_MAX_MACVLAN_OFFSET		0x0a
+#define QUERY_PORT_MAX_VL_OFFSET		0x0b
+#define QUERY_PORT_MAC_OFFSET			0x10
+#define QUERY_PORT_TRANS_VENDOR_OFFSET		0x18
+#define QUERY_PORT_WAVELENGTH_OFFSET		0x1c
+#define QUERY_PORT_TRANS_CODE_OFFSET		0x20
+
+#define STAT_CFG_PORT_MODE	(1 << 28)
+#define STAT_CFG_PORT_OFFSET	0x8
+#define STAT_CFG_PORT_MASK	(1 << 20)
+#define STAT_CFG_MOD_INLINE	0x3
+
+		for (i = 1; i <= dev_cap->num_ports; ++i) {
+			err = mlx4_cmd_box(dev, 0, mailbox->dma, i, 0, MLX4_CMD_QUERY_PORT,
+					   MLX4_CMD_TIME_CLASS_B);
+			if (err)
+				goto out;
+
+			MLX4_GET(field, outbox, QUERY_PORT_SUPPORTED_TYPE_OFFSET);
+			dev_cap->supported_port_types[i] = field & 3;
+			MLX4_GET(field, outbox, QUERY_PORT_MTU_OFFSET);
+			dev_cap->ib_mtu[i]	   = field & 0xf;
+			MLX4_GET(field, outbox, QUERY_PORT_WIDTH_OFFSET);
+			dev_cap->max_port_width[i] = field & 0xf;
+			MLX4_GET(field, outbox, QUERY_PORT_MAX_GID_PKEY_OFFSET);
+			dev_cap->max_gids[i]	   = 1 << (field >> 4);
+			dev_cap->max_pkeys[i]	   = 1 << (field & 0xf);
+			MLX4_GET(field, outbox, QUERY_PORT_MAX_VL_OFFSET);
+			dev_cap->max_vl[i]	   = field & 0xf;
+			MLX4_GET(field, outbox, QUERY_PORT_MAX_MACVLAN_OFFSET);
+			dev_cap->log_max_macs[i]  = field & 0xf;
+			dev_cap->log_max_vlans[i] = field >> 4;
+			MLX4_GET(dev_cap->eth_mtu[i], outbox, QUERY_PORT_ETH_MTU_OFFSET);
+			MLX4_GET(dev_cap->def_mac[i], outbox, QUERY_PORT_MAC_OFFSET);
+			MLX4_GET(field32, outbox, QUERY_PORT_TRANS_VENDOR_OFFSET);
+			dev_cap->trans_type[i] = field32 >> 24;
+			dev_cap->vendor_oui[i] = field32 & 0xffffff;
+			MLX4_GET(dev_cap->wavelength[i], outbox, QUERY_PORT_WAVELENGTH_OFFSET);
+			MLX4_GET(dev_cap->trans_code[i], outbox, QUERY_PORT_TRANS_CODE_OFFSET);
+
+			/* Query stat cfg for port enablement */
+			if (dev_cap->inline_cfg) {
+				in_modifier = STAT_CFG_PORT_MODE | i << 8 |
+							STAT_CFG_PORT_OFFSET;
+				err = mlx4_cmd_imm(dev, 0, &out_param,
+						   in_modifier,
+						   STAT_CFG_MOD_INLINE,
+						   MLX4_CMD_MOD_STAT_CFG,
+						   MLX4_CMD_TIME_CLASS_B);
+				if (!err)
+					if (!(out_param & STAT_CFG_PORT_MASK))
+						dev_cap->supported_port_types[i] = 0;
+			}
+		}
+	}
+
+	mlx4_dbg(dev, "Base MM extensions: flags %08x, rsvd L_Key %08x\n",
+		 dev_cap->bmme_flags, dev_cap->reserved_lkey);
+
+	/*
+	 * Each UAR has 4 EQ doorbells; so if a UAR is reserved, then
+	 * we can't use any EQs whose doorbell falls on that page,
+	 * even if the EQ itself isn't reserved.
+	 */
+	dev_cap->reserved_eqs = max(dev_cap->reserved_uars * 4,
+				    dev_cap->reserved_eqs);
+
+	mlx4_dbg(dev, "Max ICM size %lld MB\n",
+		 (unsigned long long) dev_cap->max_icm_sz >> 20);
+	mlx4_dbg(dev, "Max QPs: %d, reserved QPs: %d, entry size: %d\n",
+		 dev_cap->max_qps, dev_cap->reserved_qps, dev_cap->qpc_entry_sz);
+	mlx4_dbg(dev, "Max SRQs: %d, reserved SRQs: %d, entry size: %d\n",
+		 dev_cap->max_srqs, dev_cap->reserved_srqs, dev_cap->srq_entry_sz);
+	mlx4_dbg(dev, "Max CQs: %d, reserved CQs: %d, entry size: %d\n",
+		 dev_cap->max_cqs, dev_cap->reserved_cqs, dev_cap->cqc_entry_sz);
+	mlx4_dbg(dev, "Max EQs: %d, reserved EQs: %d, entry size: %d\n",
+		 dev_cap->max_eqs, dev_cap->reserved_eqs, dev_cap->eqc_entry_sz);
+	mlx4_dbg(dev, "reserved MPTs: %d, reserved MTTs: %d\n",
+		 dev_cap->reserved_mrws, dev_cap->reserved_mtts);
+	mlx4_dbg(dev, "Max PDs: %d, reserved PDs: %d, reserved UARs: %d\n",
+		 dev_cap->max_pds, dev_cap->reserved_pds, dev_cap->reserved_uars);
+	mlx4_dbg(dev, "Max QP/MCG: %d, reserved MGMs: %d\n",
+		 dev_cap->max_pds, dev_cap->reserved_mgms);
+	mlx4_dbg(dev, "Max CQEs: %d, max WQEs: %d, max SRQ WQEs: %d\n",
+		 dev_cap->max_cq_sz, dev_cap->max_qp_sz, dev_cap->max_srq_sz);
+	mlx4_dbg(dev, "Local CA ACK delay: %d, max MTU: %d, port width cap: %d\n",
+		 dev_cap->local_ca_ack_delay, 128 << dev_cap->ib_mtu[1],
+		 dev_cap->max_port_width[1]);
+	mlx4_dbg(dev, "Max SQ desc size: %d, max SQ S/G: %d\n",
+		 dev_cap->max_sq_desc_sz, dev_cap->max_sq_sg);
+	mlx4_dbg(dev, "Max RQ desc size: %d, max RQ S/G: %d\n",
+		 dev_cap->max_rq_desc_sz, dev_cap->max_rq_sg);
+	mlx4_dbg(dev, "Max GSO size: %d\n", dev_cap->max_gso_sz);
+
+	dump_dev_cap_flags(dev, dev_cap->flags);
+
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+int mlx4_map_cmd(struct mlx4_dev *dev, u16 op, struct mlx4_icm *icm, u64 virt)
+{	/* Walk the ICM area and hand its pages to firmware command 'op' in mailbox-sized batches; returns 0, -EINVAL for an under-aligned area, or the error from mailbox allocation / mlx4_cmd(). */
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_icm_iter iter;
+	__be64 *pages;	/* mailbox buffer viewed as big-endian 64-bit words, two per entry */
+	int lg;	/* log2 of the page size used for the current chunk */
+	int nent = 0;	/* entries written since the last command was issued */
+	int i;
+	int err = 0;
+	int ts = 0, tc = 0;	/* totals for the debug message: KB mapped and entries written */
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	memset(mailbox->buf, 0, MLX4_MAILBOX_SIZE);
+	pages = mailbox->buf;
+
+	for (mlx4_icm_first(icm, &iter);
+	     !mlx4_icm_last(&iter);
+	     mlx4_icm_next(&iter)) {
+		/*
+		 * We have to pass pages that are aligned to their
+		 * size, so find the least significant 1 in the
+		 * address or size and use that as our log2 size.
+		 */
+		lg = ffs(mlx4_icm_addr(&iter) | mlx4_icm_size(&iter)) - 1;
+		if (lg < MLX4_ICM_PAGE_SHIFT) {
+			mlx4_warn(dev, "Got FW area not aligned to %d (%llx/%lx).\n",
+				   MLX4_ICM_PAGE_SIZE,
+				   (unsigned long long) mlx4_icm_addr(&iter),
+				   mlx4_icm_size(&iter));
+			err = -EINVAL;
+			goto out;
+		}
+
+		for (i = 0; i < mlx4_icm_size(&iter) >> lg; ++i) {
+			if (virt != -1) {	/* -1 (as passed by mlx4_MAP_FA) means: leave the virtual-address word zero */
+				pages[nent * 2] = cpu_to_be64(virt);
+				virt += 1 << lg;
+			}
+
+			pages[nent * 2 + 1] =
+				cpu_to_be64((mlx4_icm_addr(&iter) + (i << lg)) |
+					    (lg - MLX4_ICM_PAGE_SHIFT));	/* low bits carry log2(size) relative to the ICM page size */
+			ts += 1 << (lg - 10);	/* 2^lg bytes expressed in KB */
+			++tc;
+
+			if (++nent == MLX4_MAILBOX_SIZE / 16) {	/* mailbox full at 16 bytes per entry: issue it and start refilling */
+				err = mlx4_cmd(dev, mailbox->dma, nent, 0, op,
+						MLX4_CMD_TIME_CLASS_B);
+				if (err)
+					goto out;
+				nent = 0;
+			}
+		}
+	}
+
+	if (nent)	/* flush the last, partially filled batch */
+		err = mlx4_cmd(dev, mailbox->dma, nent, 0, op, MLX4_CMD_TIME_CLASS_B);
+	if (err)
+		goto out;
+
+	switch (op) {
+	case MLX4_CMD_MAP_FA:
+		mlx4_dbg(dev, "Mapped %d chunks/%d KB for FW.\n", tc, ts);
+		break;
+	case MLX4_CMD_MAP_ICM_AUX:
+		mlx4_dbg(dev, "Mapped %d chunks/%d KB for ICM aux.\n", tc, ts);
+		break;
+	case MLX4_CMD_MAP_ICM:
+		mlx4_dbg(dev, "Mapped %d chunks/%d KB at %llx for ICM.\n",
+			  tc, ts, (unsigned long long) virt - (ts << 10));	/* virt was advanced while mapping; step back to the start address */
+		break;
+	}
+
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+int mlx4_MAP_FA(struct mlx4_dev *dev, struct mlx4_icm *icm)
+{	/* MAP_FA: run mlx4_map_cmd() with virt == -1, which makes it skip the virtual-address word of every entry. */
+	return mlx4_map_cmd(dev, MLX4_CMD_MAP_FA, icm, -1);
+}
+
+int mlx4_UNMAP_FA(struct mlx4_dev *dev)
+{	/* Issue UNMAP_FA with all command arguments zero and return mlx4_cmd()'s result. */
+	return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_UNMAP_FA, MLX4_CMD_TIME_CLASS_B);
+}
+
+
+int mlx4_RUN_FW(struct mlx4_dev *dev)
+{	/* Issue RUN_FW with all command arguments zero and return mlx4_cmd()'s result. */
+	return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_RUN_FW, MLX4_CMD_TIME_CLASS_A);
+}
+
+int mlx4_QUERY_FW(struct mlx4_dev *dev)
+{	/* QUERY_FW: read FW version, command-interface revision, command limit, catastrophic-error buffer, FW size and clear-interrupt location; returns 0, -ENODEV for an unsupported interface revision, or the mailbox/command error. */
+	struct mlx4_fw  *fw  = &mlx4_priv(dev)->fw;
+	struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd;
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 *outbox;
+	int err = 0;
+	u64 fw_ver;
+	u16 cmd_if_rev;
+	u8 lg;	/* log2 of cmd->max_cmds as reported by firmware */
+
+#define QUERY_FW_OUT_SIZE             0x100
+#define QUERY_FW_VER_OFFSET            0x00
+#define MC_PROMISC_VER		       0x2000702bcull
+#define QUERY_FW_CMD_IF_REV_OFFSET     0x0a
+#define QUERY_FW_MAX_CMD_OFFSET        0x0f
+#define QUERY_FW_ERR_START_OFFSET      0x30
+#define QUERY_FW_ERR_SIZE_OFFSET       0x38
+#define QUERY_FW_ERR_BAR_OFFSET        0x3c
+
+#define QUERY_FW_SIZE_OFFSET           0x00
+#define QUERY_FW_CLR_INT_BASE_OFFSET   0x20
+#define QUERY_FW_CLR_INT_BAR_OFFSET    0x28
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	outbox = mailbox->buf;
+
+	err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_FW,
+			    MLX4_CMD_TIME_CLASS_A);
+	if (err)
+		goto out;
+
+	MLX4_GET(fw_ver, outbox, QUERY_FW_VER_OFFSET);
+	/*
+	 * FW subminor version is at more significant bits than minor
+	 * version, so swap here.
+	 */
+	dev->caps.fw_ver = (fw_ver & 0xffff00000000ull) |
+		((fw_ver & 0xffff0000ull) >> 16) |
+		((fw_ver & 0x0000ffffull) << 16);
+	if (dev->caps.fw_ver < MC_PROMISC_VER)	/* firmware older than MC_PROMISC_VER gets mode 2, anything else mode 1 */
+		dev->caps.mc_promisc_mode = 2;
+	else
+		dev->caps.mc_promisc_mode = 1;
+
+	MLX4_GET(cmd_if_rev, outbox, QUERY_FW_CMD_IF_REV_OFFSET);
+	if (cmd_if_rev < MLX4_COMMAND_INTERFACE_MIN_REV ||
+	    cmd_if_rev > MLX4_COMMAND_INTERFACE_MAX_REV) {
+		mlx4_err(dev, "Installed FW has unsupported "
+			 "command interface revision %d.\n",
+			 cmd_if_rev);
+		mlx4_err(dev, "(Installed FW version is %d.%d.%03d)\n",
+			 (int) (dev->caps.fw_ver >> 32),
+			 (int) (dev->caps.fw_ver >> 16) & 0xffff,
+			 (int) dev->caps.fw_ver & 0xffff);
+		mlx4_err(dev, "This driver version supports only revisions %d to %d.\n",
+			 MLX4_COMMAND_INTERFACE_MIN_REV, MLX4_COMMAND_INTERFACE_MAX_REV);
+		err = -ENODEV;
+		goto out;
+	}
+
+	if (cmd_if_rev < MLX4_COMMAND_INTERFACE_NEW_PORT_CMDS)	/* remembered so port commands can take their older form */
+		dev->flags |= MLX4_FLAG_OLD_PORT_CMDS;
+
+	MLX4_GET(lg, outbox, QUERY_FW_MAX_CMD_OFFSET);
+	cmd->max_cmds = 1 << lg;
+
+	mlx4_dbg(dev, "FW version %d.%d.%03d (cmd intf rev %d), max commands %d\n",
+		 (int) (dev->caps.fw_ver >> 32),
+		 (int) (dev->caps.fw_ver >> 16) & 0xffff,
+		 (int) dev->caps.fw_ver & 0xffff,
+		 cmd_if_rev, cmd->max_cmds);
+
+	MLX4_GET(fw->catas_offset, outbox, QUERY_FW_ERR_START_OFFSET);
+	MLX4_GET(fw->catas_size,   outbox, QUERY_FW_ERR_SIZE_OFFSET);
+	MLX4_GET(fw->catas_bar,    outbox, QUERY_FW_ERR_BAR_OFFSET);
+	fw->catas_bar = (fw->catas_bar >> 6) * 2;	/* raw field is reduced to (value >> 6) * 2 before use as a BAR number */
+
+	mlx4_dbg(dev, "Catastrophic error buffer at 0x%llx, size 0x%x, BAR %d\n",
+		 (unsigned long long) fw->catas_offset, fw->catas_size, fw->catas_bar);
+
+	MLX4_GET(fw->fw_pages,     outbox, QUERY_FW_SIZE_OFFSET);	/* NOTE(review): same offset as QUERY_FW_VER_OFFSET; presumably MLX4_GET sizes the read by the destination — confirm */
+	MLX4_GET(fw->clr_int_base, outbox, QUERY_FW_CLR_INT_BASE_OFFSET);
+	MLX4_GET(fw->clr_int_bar,  outbox, QUERY_FW_CLR_INT_BAR_OFFSET);
+	fw->clr_int_bar = (fw->clr_int_bar >> 6) * 2;
+
+	mlx4_dbg(dev, "FW size %d KB\n", fw->fw_pages << (MLX4_ICM_PAGE_SHIFT - 10));	/* fw_pages still counts MLX4_ICM_PAGE_SIZE units here (it is converted just below), so KB = pages << (shift - 10) */
+
+	/*
+	 * Round up number of system pages needed in case
+	 * MLX4_ICM_PAGE_SIZE < PAGE_SIZE.
+	 */
+	fw->fw_pages =
+		ALIGN(fw->fw_pages, PAGE_SIZE / MLX4_ICM_PAGE_SIZE) >>
+		(PAGE_SHIFT - MLX4_ICM_PAGE_SHIFT);
+
+	mlx4_dbg(dev, "Clear int @ %llx, BAR %d\n",
+		 (unsigned long long) fw->clr_int_base, fw->clr_int_bar);
+
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+static void get_board_id(void *vsd, char *board_id)
+{	/* Copy the board ID out of the vendor-specific data area 'vsd' into the MLX4_BOARD_ID_LEN-byte buffer 'board_id'. */
+	u32 *src, *dst;
+	int word;
+
+#define VSD_OFFSET_SIG1		0x00
+#define VSD_OFFSET_SIG2		0xde
+#define VSD_OFFSET_MLX_BOARD_ID	0xd0
+#define VSD_OFFSET_TS_BOARD_ID	0x20
+
+#define VSD_SIGNATURE_TOPSPIN	0x5ad
+
+	memset(board_id, 0, MLX4_BOARD_ID_LEN);
+
+	/* Both signature words match: the ID is a plain string at the Topspin offset. */
+	if (be16_to_cpup(vsd + VSD_OFFSET_SIG1) == VSD_SIGNATURE_TOPSPIN &&
+	    be16_to_cpup(vsd + VSD_OFFSET_SIG2) == VSD_SIGNATURE_TOPSPIN) {
+		strlcpy(board_id, vsd + VSD_OFFSET_TS_BOARD_ID, MLX4_BOARD_ID_LEN);
+		return;
+	}
+
+	/* Otherwise the ID is a string whose 4-byte words the firmware byte-swapped; swap the first four words back. */
+	src = vsd + VSD_OFFSET_MLX_BOARD_ID;
+	dst = (u32 *) board_id;
+	for (word = 0; word < 4; ++word)
+		dst[word] = swab32(src[word]);
+}
+
+int mlx4_QUERY_ADAPTER(struct mlx4_dev *dev, struct mlx4_adapter *adapter)
+{	/* QUERY_ADAPTER: fetch the interrupt pin and board ID into *adapter; returns 0 or the mailbox/command error. */
+	struct mlx4_cmd_mailbox *mbox;
+	u32 *reply;
+	int rc;
+
+#define QUERY_ADAPTER_OUT_SIZE             0x100
+#define QUERY_ADAPTER_INTA_PIN_OFFSET      0x10
+#define QUERY_ADAPTER_VSD_OFFSET           0x20
+
+	mbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mbox))
+		return PTR_ERR(mbox);
+	reply = mbox->buf;
+
+	rc = mlx4_cmd_box(dev, 0, mbox->dma, 0, 0, MLX4_CMD_QUERY_ADAPTER,
+			  MLX4_CMD_TIME_CLASS_A);
+	if (rc == 0) {
+		MLX4_GET(adapter->inta_pin, reply, QUERY_ADAPTER_INTA_PIN_OFFSET);
+
+		/* reply is a u32 pointer, hence the byte offset of the VSD is divided by 4. */
+		get_board_id(reply + QUERY_ADAPTER_VSD_OFFSET / 4,
+			     adapter->board_id);
+	}
+
+	/* The mailbox is released whether or not the command succeeded. */
+	mlx4_free_cmd_mailbox(dev, mbox);
+	return rc;
+}
+
+int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param)
+{	/* Build the INIT_HCA input mailbox from dev->caps, the driver-wide settings and *param, issue the command, log a failure, and return the command's result. */
+	struct mlx4_cmd_mailbox *mailbox;
+	__be32 *inbox;
+	int err;
+
+#define INIT_HCA_IN_SIZE		 0x200
+#define INIT_HCA_VERSION_OFFSET		 0x000
+#define	 INIT_HCA_VERSION		 2
+#define INIT_HCA_CACHELINE_SZ_OFFSET	 0x0e
+#define INIT_HCA_X86_64_BYTE_CACHELINE_SZ	 0x40
+#define INIT_HCA_FLAGS_OFFSET		 0x014
+#define INIT_HCA_QPC_OFFSET		 0x020
+#define	 INIT_HCA_QPC_BASE_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x10)
+#define	 INIT_HCA_LOG_QP_OFFSET		 (INIT_HCA_QPC_OFFSET + 0x17)
+#define	 INIT_HCA_SRQC_BASE_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x28)
+#define	 INIT_HCA_LOG_SRQ_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x2f)
+#define	 INIT_HCA_CQC_BASE_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x30)
+#define	 INIT_HCA_LOG_CQ_OFFSET		 (INIT_HCA_QPC_OFFSET + 0x37)
+#define	 INIT_HCA_ALTC_BASE_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x40)
+#define	 INIT_HCA_AUXC_BASE_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x50)
+#define	 INIT_HCA_EQC_BASE_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x60)
+#define	 INIT_HCA_LOG_EQ_OFFSET		 (INIT_HCA_QPC_OFFSET + 0x67)
+#define	 INIT_HCA_RDMARC_BASE_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x70)
+#define	 INIT_HCA_LOG_RD_OFFSET		 (INIT_HCA_QPC_OFFSET + 0x77)
+#define INIT_HCA_MCAST_OFFSET		 0x0c0
+#define	 INIT_HCA_MC_BASE_OFFSET	 (INIT_HCA_MCAST_OFFSET + 0x00)
+#define	 INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x12)
+#define	 INIT_HCA_LOG_MC_HASH_SZ_OFFSET	 (INIT_HCA_MCAST_OFFSET + 0x16)
+#define	 INIT_HCA_LOG_MC_TABLE_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x1b)
+#define INIT_HCA_TPT_OFFSET		 0x0f0
+#define	 INIT_HCA_DMPT_BASE_OFFSET	 (INIT_HCA_TPT_OFFSET + 0x00)
+#define	 INIT_HCA_LOG_MPT_SZ_OFFSET	 (INIT_HCA_TPT_OFFSET + 0x0b)
+#define	 INIT_HCA_MTT_BASE_OFFSET	 (INIT_HCA_TPT_OFFSET + 0x10)
+#define	 INIT_HCA_CMPT_BASE_OFFSET	 (INIT_HCA_TPT_OFFSET + 0x18)
+#define INIT_HCA_UAR_OFFSET		 0x120
+#define	 INIT_HCA_LOG_UAR_SZ_OFFSET	 (INIT_HCA_UAR_OFFSET + 0x0a)
+#define  INIT_HCA_UAR_PAGE_SZ_OFFSET     (INIT_HCA_UAR_OFFSET + 0x0b)
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	inbox = mailbox->buf;
+
+	memset(inbox, 0, INIT_HCA_IN_SIZE);	/* every field, including the flag word modified below, starts out zero */
+
+	*((u8 *) mailbox->buf + INIT_HCA_VERSION_OFFSET) = INIT_HCA_VERSION;
+#if defined(__x86_64__) || defined(__PPC64__)
+	*((u8 *) mailbox->buf + INIT_HCA_CACHELINE_SZ_OFFSET) = INIT_HCA_X86_64_BYTE_CACHELINE_SZ;
+#endif
+
+#if defined(__LITTLE_ENDIAN)
+	*(inbox + INIT_HCA_FLAGS_OFFSET / 4) &= ~cpu_to_be32(1 << 1);	/* flag bit 1 stays clear on little-endian hosts */
+#elif defined(__BIG_ENDIAN)
+	*(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 1);	/* flag bit 1 is set on big-endian hosts */
+#else
+#error Host endianness not defined
+#endif
+	/* Check port for UD address vector: */
+	*(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1);
+
+	/* Enable IPoIB checksumming if we can: */
+	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IPOIB_CSUM)
+		*(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 3);
+
+	/* Enable QoS support if module parameter set */
+	if (enable_qos)
+		*(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 2);
+
+	/* counters mode */
+	*(inbox + INIT_HCA_FLAGS_OFFSET / 4) |=
+		cpu_to_be32(dev->caps.counters_mode << 4);	/* mode value is placed starting at flag bit 4 */
+
+	/* QPC/EEC/CQC/EQC/RDMARC attributes */
+
+	MLX4_PUT(inbox, param->qpc_base,      INIT_HCA_QPC_BASE_OFFSET);
+	MLX4_PUT(inbox, param->log_num_qps,   INIT_HCA_LOG_QP_OFFSET);
+	MLX4_PUT(inbox, param->srqc_base,     INIT_HCA_SRQC_BASE_OFFSET);
+	MLX4_PUT(inbox, param->log_num_srqs,  INIT_HCA_LOG_SRQ_OFFSET);
+	MLX4_PUT(inbox, param->cqc_base,      INIT_HCA_CQC_BASE_OFFSET);
+	MLX4_PUT(inbox, param->log_num_cqs,   INIT_HCA_LOG_CQ_OFFSET);
+	MLX4_PUT(inbox, param->altc_base,     INIT_HCA_ALTC_BASE_OFFSET);
+	MLX4_PUT(inbox, param->auxc_base,     INIT_HCA_AUXC_BASE_OFFSET);
+	MLX4_PUT(inbox, param->eqc_base,      INIT_HCA_EQC_BASE_OFFSET);
+	MLX4_PUT(inbox, param->log_num_eqs,   INIT_HCA_LOG_EQ_OFFSET);
+	MLX4_PUT(inbox, param->rdmarc_base,   INIT_HCA_RDMARC_BASE_OFFSET);
+	MLX4_PUT(inbox, param->log_rd_per_qp, INIT_HCA_LOG_RD_OFFSET);
+
+	/* multicast attributes */
+
+	MLX4_PUT(inbox, param->mc_base,		INIT_HCA_MC_BASE_OFFSET);
+	MLX4_PUT(inbox, param->log_mc_entry_sz, INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET);
+	MLX4_PUT(inbox, param->log_mc_hash_sz,  INIT_HCA_LOG_MC_HASH_SZ_OFFSET);
+	MLX4_PUT(inbox, param->log_mc_table_sz, INIT_HCA_LOG_MC_TABLE_SZ_OFFSET);
+
+	/* TPT attributes */
+
+	MLX4_PUT(inbox, param->dmpt_base,  INIT_HCA_DMPT_BASE_OFFSET);
+	MLX4_PUT(inbox, param->log_mpt_sz, INIT_HCA_LOG_MPT_SZ_OFFSET);
+	MLX4_PUT(inbox, param->mtt_base,   INIT_HCA_MTT_BASE_OFFSET);
+	MLX4_PUT(inbox, param->cmpt_base,  INIT_HCA_CMPT_BASE_OFFSET);
+
+	/* UAR attributes */
+
+	MLX4_PUT(inbox, (u8) (PAGE_SHIFT - 12), INIT_HCA_UAR_PAGE_SZ_OFFSET);	/* page size as log2 relative to 4 KB; NOTE(review): the (u8) cast presumably selects a one-byte write in MLX4_PUT — confirm */
+	MLX4_PUT(inbox, param->log_uar_sz,      INIT_HCA_LOG_UAR_SZ_OFFSET);
+	if (!mlx4_pre_t11_mode && dev->caps.flags & (u32) MLX4_DEV_CAP_FLAG_FC_T11)	/* flag bit 10 only when pre-T11 mode is off and the device reports FC T11 */
+		*(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 10);
+
+
+	err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_INIT_HCA, 10000);	/* literal timeout of 10000 instead of a MLX4_CMD_TIME_CLASS_* constant */
+
+	if (err)
+		mlx4_err(dev, "INIT_HCA returns %d\n", err);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+int mlx4_INIT_PORT(struct mlx4_dev *dev, int port)
+{	/* INIT_PORT for 'port': with MLX4_FLAG_OLD_PORT_CMDS set, an input mailbox is built from dev->caps; otherwise the command is issued without a mailbox. Returns the command's result (or the mailbox allocation error). */
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 *inbox;
+	int err;
+	u32 flags;
+	u16 field;
+
+	if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) {
+#define INIT_PORT_IN_SIZE          256
+#define INIT_PORT_FLAGS_OFFSET     0x00
+#define INIT_PORT_FLAG_SIG         (1 << 18)
+#define INIT_PORT_FLAG_NG          (1 << 17)
+#define INIT_PORT_FLAG_G0          (1 << 16)
+#define INIT_PORT_VL_SHIFT         4
+#define INIT_PORT_PORT_WIDTH_SHIFT 8
+#define INIT_PORT_MTU_OFFSET       0x04
+#define INIT_PORT_MAX_GID_OFFSET   0x06
+#define INIT_PORT_MAX_PKEY_OFFSET  0x0a
+#define INIT_PORT_GUID0_OFFSET     0x10
+#define INIT_PORT_NODE_GUID_OFFSET 0x18
+#define INIT_PORT_SI_GUID_OFFSET   0x20
+
+		mailbox = mlx4_alloc_cmd_mailbox(dev);
+		if (IS_ERR(mailbox))
+			return PTR_ERR(mailbox);
+		inbox = mailbox->buf;
+
+		memset(inbox, 0, INIT_PORT_IN_SIZE);	/* the FLAG_* bits and GUID fields defined above are not written here and so stay zero */
+
+		flags = 0;
+		flags |= (dev->caps.vl_cap[port] & 0xf) << INIT_PORT_VL_SHIFT;
+		flags |= (dev->caps.port_width_cap[port] & 0xf) << INIT_PORT_PORT_WIDTH_SHIFT;
+		MLX4_PUT(inbox, flags,		  INIT_PORT_FLAGS_OFFSET);
+
+		field = 128 << dev->caps.ib_mtu_cap[port];	/* the MTU capability is an exponent: value written is 128 << cap */
+		MLX4_PUT(inbox, field, INIT_PORT_MTU_OFFSET);
+		field = dev->caps.gid_table_len[port];
+		MLX4_PUT(inbox, field, INIT_PORT_MAX_GID_OFFSET);
+		field = dev->caps.pkey_table_len[port];
+		MLX4_PUT(inbox, field, INIT_PORT_MAX_PKEY_OFFSET);
+
+		err = mlx4_cmd(dev, mailbox->dma, port, 0, MLX4_CMD_INIT_PORT,
+			       MLX4_CMD_TIME_CLASS_A);
+
+		mlx4_free_cmd_mailbox(dev, mailbox);
+	} else
+		err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_INIT_PORT,
+			       MLX4_CMD_TIME_CLASS_A);	/* no mailbox in this form: only the port number is passed */
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_INIT_PORT);
+
+int mlx4_CLOSE_PORT(struct mlx4_dev *dev, int port)
+{	/* Issue CLOSE_PORT with the port number as input modifier and a literal timeout of 1000; returns mlx4_cmd()'s result. */
+	return mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT, 1000);
+}
+EXPORT_SYMBOL_GPL(mlx4_CLOSE_PORT);
+
+int mlx4_CLOSE_HCA(struct mlx4_dev *dev, int panic)
+{	/* Issue CLOSE_HCA, passing 'panic' through as the op modifier, with a literal timeout of 1000; returns mlx4_cmd()'s result. */
+	return mlx4_cmd(dev, 0, 0, panic, MLX4_CMD_CLOSE_HCA, 1000);
+}
+
+int mlx4_SET_ICM_SIZE(struct mlx4_dev *dev, u64 icm_size, u64 *aux_pages)
+{	/* SET_ICM_SIZE: send icm_size, receive the command's output in *aux_pages and convert it to system pages; returns 0 or the command error. */
+	int err;
+
+	err = mlx4_cmd_imm(dev, icm_size, aux_pages, 0, 0, MLX4_CMD_SET_ICM_SIZE,
+			   MLX4_CMD_TIME_CLASS_A);
+	if (!err) {
+		/*
+		 * Convert the returned count of ICM pages into whole system
+		 * pages, rounding up in case MLX4_ICM_PAGE_SIZE < PAGE_SIZE.
+		 */
+		*aux_pages = ALIGN(*aux_pages, PAGE_SIZE / MLX4_ICM_PAGE_SIZE) >>
+			(PAGE_SHIFT - MLX4_ICM_PAGE_SHIFT);
+	}
+
+	return err;
+}
+
+int mlx4_NOP(struct mlx4_dev *dev)
+{	/* Issue a NOP command with a literal timeout of 100 and return mlx4_cmd()'s result. */
+	/* Input modifier of 0x1f means "finish as soon as possible." */
+	return mlx4_cmd(dev, 0, 0x1f, 0, MLX4_CMD_NOP, 100);
+}
+
+int mlx4_query_diag_counters(struct mlx4_dev *dev, int array_length,
+			     u8 op_modifier, u32 in_offset[], u32 counter_out[])
+{	/* Run DIAG_RPRT with 'op_modifier' and copy the 32-bit value at each byte offset in_offset[i] of the reply into counter_out[i]; returns 0, -EINVAL for an offset outside the mailbox, or the mailbox/command error. */
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 *outbox;
+	int ret;
+	int i;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	outbox = mailbox->buf;
+
+	ret = mlx4_cmd_box(dev, 0, mailbox->dma, 0, op_modifier,
+			   MLX4_CMD_DIAG_RPRT, MLX4_CMD_TIME_CLASS_A);
+	if (ret)
+		goto out;
+
+	for (i = 0; i < array_length; i++) {
+		if (in_offset[i] > MLX4_MAILBOX_SIZE - sizeof(counter_out[i])) {	/* the whole counter, not just its first byte, must lie inside the mailbox */
+			ret = -EINVAL;
+			goto out;
+		}
+
+		MLX4_GET(counter_out[i], outbox, in_offset[i]);
+	}
+
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mlx4_query_diag_counters);
+
+void mlx4_get_fc_t11_settings(struct mlx4_dev *dev, int *enable_pre_t11, int *t11_supported)
+{	/* Report as 0/1 whether mlx4_pre_t11_mode is set and whether the device capability flags include MLX4_DEV_CAP_FLAG_FC_T11. */
+	*enable_pre_t11 = !!mlx4_pre_t11_mode;
+	*t11_supported = !!(dev->caps.flags & (u32) MLX4_DEV_CAP_FLAG_FC_T11);	/* (u32) as in mlx4_INIT_HCA(): stops a signed constant from sign-extending over the upper flag bits; NOTE(review): assumes the flag lies in the low 32 bits — confirm against its definition */
+}
+EXPORT_SYMBOL_GPL(mlx4_get_fc_t11_settings);
diff --git a/sys/ofed/drivers/net/mlx4/fw.h b/sys/ofed/drivers/net/mlx4/fw.h
new file mode 100644
index 0000000..fbdd95e
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/fw.h
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006, 2007 Cisco Systems.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_FW_H
+#define MLX4_FW_H
+
+#include "mlx4.h"
+#include "icm.h"
+
+struct mlx4_mod_stat_cfg {	/* argument type of mlx4_MOD_STAT_CFG(); NOTE(review): field meanings below are presumed from their names — confirm at the definition of that function */
+	u8 log_pg_sz;	/* presumably log2 of a page size */
+	u8 log_pg_sz_m;	/* presumably a "modify" indicator for log_pg_sz */
+};
+
+struct mlx4_dev_cap {	/* device capabilities as decoded by mlx4_QUERY_DEV_CAP(); per-port arrays are indexed by port number starting at 1, hence MLX4_MAX_PORTS + 1 slots */
+	int max_srq_sz;
+	int max_qp_sz;
+	int reserved_qps;
+	int max_qps;
+	int reserved_srqs;
+	int max_srqs;
+	int max_cq_sz;
+	int reserved_cqs;
+	int max_cqs;
+	int max_mpts;
+	int reserved_eqs;	/* raised to at least reserved_uars * 4 after the query */
+	int max_eqs;
+	int reserved_mtts;
+	int max_mrw_sz;
+	int reserved_mrws;
+	int max_mtt_seg;
+	int max_requester_per_qp;
+	int max_responder_per_qp;
+	int max_rdma_global;
+	int local_ca_ack_delay;
+	int num_ports;	/* upper bound of the 1-based port loops */
+	u32 max_msg_sz;
+	int ib_mtu[MLX4_MAX_PORTS + 1];	/* stored as an exponent; the debug output prints 128 << ib_mtu */
+	int max_port_width[MLX4_MAX_PORTS + 1];
+	int max_vl[MLX4_MAX_PORTS + 1];
+	int max_gids[MLX4_MAX_PORTS + 1];
+	int max_pkeys[MLX4_MAX_PORTS + 1];
+	u64 def_mac[MLX4_MAX_PORTS + 1];
+	u16 eth_mtu[MLX4_MAX_PORTS + 1];
+	int trans_type[MLX4_MAX_PORTS + 1];	/* top 8 bits of the 32-bit transceiver/vendor word */
+	int vendor_oui[MLX4_MAX_PORTS + 1];	/* low 24 bits of the same word */
+	u16 wavelength[MLX4_MAX_PORTS + 1];
+	u64 trans_code[MLX4_MAX_PORTS + 1];
+	u16 stat_rate_support;
+	int udp_rss;
+	int loopback_support;
+	u64 flags;	/* FLAGS word in the low 32 bits, EXT_FLAGS word in the high 32 bits */
+	int reserved_uars;
+	int uar_size;
+	int min_page_sz;
+	int bf_reg_size;	/* 0 when BlueFlame is not available */
+	int bf_regs_per_page;
+	int max_sq_sg;
+	int max_sq_desc_sz;
+	int max_rq_sg;
+	int max_rq_desc_sz;
+	int max_qp_per_mcg;
+	int reserved_mgms;
+	int max_mcgs;
+	int reserved_pds;
+	int max_pds;
+	int reserved_xrcds;
+	int max_xrcds;
+	int qpc_entry_sz;
+	int rdmarc_entry_sz;
+	int altc_entry_sz;
+	int aux_entry_sz;
+	int srq_entry_sz;
+	int cqc_entry_sz;
+	int eqc_entry_sz;
+	int dmpt_entry_sz;
+	int cmpt_entry_sz;
+	int mtt_entry_sz;
+	int inline_cfg;	/* when non-zero, MOD_STAT_CFG is queried per port for port enablement */
+	int resize_srq;
+	u32 bmme_flags;
+	u32 reserved_lkey;
+	u64 max_icm_sz;
+	int max_gso_sz;	/* 0 when the device reports a zero field, otherwise 1 << field */
+	u8  supported_port_types[MLX4_MAX_PORTS + 1];	/* low 2 bits of the QUERY_PORT type byte; cleared to 0 when the stat-cfg query shows the port mask bit unset */
+	u8  log_max_macs[MLX4_MAX_PORTS + 1];	/* low nibble of the QUERY_PORT MAC/VLAN byte */
+	u8  log_max_vlans[MLX4_MAX_PORTS + 1];	/* high nibble of the same byte */
+	u32 max_basic_counters;
+	u32 max_ext_counters;
+};
+
+struct mlx4_adapter {	/* result of mlx4_QUERY_ADAPTER() */
+	char board_id[MLX4_BOARD_ID_LEN];	/* zero-filled, then set by get_board_id() from the VSD area of the reply */
+	u8   inta_pin;	/* read from QUERY_ADAPTER_INTA_PIN_OFFSET of the reply */
+};
+
+struct mlx4_init_hca_param {	/* caller-supplied values that mlx4_INIT_HCA() copies into the INIT_HCA input mailbox with MLX4_PUT */
+	u64 qpc_base;
+	u64 rdmarc_base;
+	u64 auxc_base;
+	u64 altc_base;
+	u64 srqc_base;
+	u64 cqc_base;
+	u64 eqc_base;
+	u64 mc_base;
+	u64 dmpt_base;
+	u64 cmpt_base;
+	u64 mtt_base;
+	u16 log_mc_entry_sz;
+	u16 log_mc_hash_sz;
+	u8  log_num_qps;
+	u8  log_num_srqs;
+	u8  log_num_cqs;
+	u8  log_num_eqs;
+	u8  log_rd_per_qp;
+	u8  log_mc_table_sz;
+	u8  log_mpt_sz;
+	u8  log_uar_sz;
+};
+
+struct mlx4_init_ib_param {	/* NOTE(review): no function declared in this header takes this type; presumably port-initialisation parameters — confirm whether it is still used */
+	int port_width;
+	int vl_cap;
+	int mtu_cap;
+	u16 gid_cap;
+	u16 pkey_cap;
+	int set_guid0;	/* presumably: non-zero means guid0 is valid — confirm */
+	u64 guid0;
+	int set_node_guid;	/* presumably: non-zero means node_guid is valid — confirm */
+	u64 node_guid;
+	int set_si_guid;	/* presumably: non-zero means si_guid is valid — confirm */
+	u64 si_guid;
+};
+
+struct mlx4_set_ib_param {	/* NOTE(review): no function declared in this header takes this type — confirm whether it is still used */
+	int set_si_guid;	/* presumably: non-zero means si_guid is valid — confirm */
+	int reset_qkey_viol;
+	u64 si_guid;
+	u32 cap_mask;
+};
+
+int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap);	/* fills *dev_cap, including the per-port entries */
+int mlx4_MAP_FA(struct mlx4_dev *dev, struct mlx4_icm *icm);	/* mlx4_map_cmd() with MLX4_CMD_MAP_FA and virt == -1 */
+int mlx4_UNMAP_FA(struct mlx4_dev *dev);
+int mlx4_RUN_FW(struct mlx4_dev *dev);
+int mlx4_QUERY_FW(struct mlx4_dev *dev);	/* -ENODEV when the command interface revision is unsupported */
+int mlx4_QUERY_ADAPTER(struct mlx4_dev *dev, struct mlx4_adapter *adapter);
+int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param);
+int mlx4_CLOSE_HCA(struct mlx4_dev *dev, int panic);	/* 'panic' is passed as the command's op modifier */
+int mlx4_map_cmd(struct mlx4_dev *dev, u16 op, struct mlx4_icm *icm, u64 virt);	/* virt == -1: no virtual addresses are written */
+int mlx4_SET_ICM_SIZE(struct mlx4_dev *dev, u64 icm_size, u64 *aux_pages);	/* *aux_pages is returned in system pages */
+int mlx4_NOP(struct mlx4_dev *dev);
+int mlx4_MOD_STAT_CFG(struct mlx4_dev *dev, struct mlx4_mod_stat_cfg *cfg);
+
+#endif /* MLX4_FW_H */
diff --git a/sys/ofed/drivers/net/mlx4/icm.c b/sys/ofed/drivers/net/mlx4/icm.c
new file mode 100644
index 0000000..3a14d6b
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/icm.c
@@ -0,0 +1,455 @@
+/*
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/scatterlist.h>
+
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4.h"
+#include "icm.h"
+#include "fw.h"
+
+/*
+ * We allocate in as big chunks as we can, up to a maximum of 256 KB
+ * per chunk.
+ */
+enum {
+	/* 256 KB: largest single allocation attempted by mlx4_alloc_icm(). */
+	MLX4_ICM_ALLOC_SIZE	= 1 << 18,
+	/* 256 KB: span of table space covered by one table->icm[] entry. */
+	MLX4_TABLE_CHUNK_SIZE	= 1 << 18
+};
+
+/*
+ * Release the pages of a non-coherent ICM chunk: undo the scatterlist DMA
+ * mapping if one was established (nsg > 0), then free every page run that
+ * was placed in the chunk.
+ */
+static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct mlx4_icm_chunk *chunk)
+{
+	struct scatterlist *sg;
+	int idx;
+
+	/* nsg is only positive once pci_map_sg() succeeded for this chunk. */
+	if (chunk->nsg > 0)
+		pci_unmap_sg(dev->pdev, chunk->mem, chunk->npages,
+			     PCI_DMA_BIDIRECTIONAL);
+
+	for (idx = 0; idx < chunk->npages; ++idx) {
+		sg = &chunk->mem[idx];
+		__free_pages(sg_page(sg), get_order(sg->length));
+	}
+}
+
+/*
+ * Release every coherent DMA buffer recorded in a chunk, using the length,
+ * kernel address and DMA address stored in each scatterlist entry.
+ */
+static void mlx4_free_icm_coherent(struct mlx4_dev *dev, struct mlx4_icm_chunk *chunk)
+{
+	struct scatterlist *sg;
+	int idx;
+
+	for (idx = 0; idx < chunk->npages; ++idx) {
+		sg = &chunk->mem[idx];
+		dma_free_coherent(&dev->pdev->dev, sg->length,
+				  lowmem_page_address(sg_page(sg)),
+				  sg_dma_address(sg));
+	}
+}
+
+/*
+ * Free an ICM area: the memory held by each chunk, the chunk descriptors
+ * and finally the area itself.  A NULL icm is ignored.  coherent must
+ * match the value the area was allocated with.
+ */
+void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm, int coherent)
+{
+	struct mlx4_icm_chunk *cur, *next;
+
+	if (icm == NULL)
+		return;
+
+	list_for_each_entry_safe(cur, next, &icm->chunk_list, list) {
+		if (!coherent)
+			mlx4_free_icm_pages(dev, cur);
+		else
+			mlx4_free_icm_coherent(dev, cur);
+
+		kfree(cur);
+	}
+
+	kfree(icm);
+}
+
+/*
+ * Allocate 2^order pages and record them in scatterlist entry mem.
+ * Returns 0 on success or -ENOMEM if the page allocation fails.
+ */
+static int mlx4_alloc_icm_pages(struct scatterlist *mem, int order, gfp_t gfp_mask)
+{
+	struct page *pg = alloc_pages(gfp_mask, order);
+
+	if (pg == NULL)
+		return -ENOMEM;
+
+	sg_set_page(mem, pg, PAGE_SIZE << order, 0);
+	return 0;
+}
+
+/*
+ * Allocate a coherent DMA buffer of 2^order pages and record it in
+ * scatterlist entry mem; the DMA address is written straight into the
+ * entry by dma_alloc_coherent().
+ * Returns 0 on success or -ENOMEM if the allocation fails.
+ */
+static int mlx4_alloc_icm_coherent(struct device *dev, struct scatterlist *mem,
+				    int order, gfp_t gfp_mask)
+{
+	void *cpu_addr;
+
+	cpu_addr = dma_alloc_coherent(dev, PAGE_SIZE << order,
+				      &sg_dma_address(mem), gfp_mask);
+	if (cpu_addr == NULL)
+		return -ENOMEM;
+
+	sg_set_buf(mem, cpu_addr, PAGE_SIZE << order);
+	/* The buffer is expected to start on a page boundary. */
+	BUG_ON(mem->offset);
+	sg_dma_len(mem) = PAGE_SIZE << order;
+	return 0;
+}
+
+/*
+ * Allocate an ICM area of npages pages as a list of chunks.
+ *
+ * Memory is obtained in power-of-two page runs, starting at the order of
+ * MLX4_ICM_ALLOC_SIZE and dropping to smaller orders when an allocation
+ * fails.  For non-coherent areas each chunk is DMA-mapped with
+ * pci_map_sg() once it is full, and the final partial chunk is mapped
+ * after the loop; coherent entries receive their DMA address from
+ * dma_alloc_coherent() as they are allocated.
+ *
+ * Returns the new area with refcount 0, or NULL on failure, in which case
+ * everything allocated so far has been released via mlx4_free_icm().
+ */
+struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages,
+				gfp_t gfp_mask, int coherent)
+{
+	struct mlx4_icm *icm;
+	struct mlx4_icm_chunk *chunk = NULL;
+	int cur_order;
+	int ret;
+
+	/* We use sg_set_buf for coherent allocs, which assumes low memory */
+	BUG_ON(coherent && (gfp_mask & __GFP_HIGHMEM));
+
+	/* Bookkeeping structures never come from highmem. */
+	icm = kmalloc(sizeof *icm, gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN));
+	if (!icm)
+		return NULL;
+
+	icm->refcount = 0;
+	INIT_LIST_HEAD(&icm->chunk_list);
+
+	cur_order = get_order(MLX4_ICM_ALLOC_SIZE);
+
+	while (npages > 0) {
+		/* Start a new chunk when there is none or the last one filled up. */
+		if (!chunk) {
+			chunk = kmalloc(sizeof *chunk,
+					gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN));
+			if (!chunk)
+				goto fail;
+
+			sg_init_table(chunk->mem, MLX4_ICM_CHUNK_LEN);
+			chunk->npages = 0;
+			chunk->nsg    = 0;
+			/* Linked in immediately so the fail path frees it too. */
+			list_add_tail(&chunk->list, &icm->chunk_list);
+		}
+
+		/* Never allocate more pages than are still needed. */
+		while (1 << cur_order > npages)
+			--cur_order;
+
+		if (coherent)
+			ret = mlx4_alloc_icm_coherent(&dev->pdev->dev,
+						      &chunk->mem[chunk->npages],
+						      cur_order, gfp_mask);
+		else
+			ret = mlx4_alloc_icm_pages(&chunk->mem[chunk->npages],
+						   cur_order, gfp_mask);
+
+		if (!ret) {
+			++chunk->npages;
+
+			if (coherent)
+				++chunk->nsg;
+			else if (chunk->npages == MLX4_ICM_CHUNK_LEN) {
+				/* Chunk is full: map the whole scatterlist now. */
+				chunk->nsg = pci_map_sg(dev->pdev, chunk->mem,
+							chunk->npages,
+							PCI_DMA_BIDIRECTIONAL);
+
+				if (chunk->nsg <= 0)
+					goto fail;
+			}
+
+			if (chunk->npages == MLX4_ICM_CHUNK_LEN)
+				chunk = NULL;
+
+			npages -= 1 << cur_order;
+		} else {
+			/* Retry the same entry with a smaller allocation. */
+			--cur_order;
+			if (cur_order < 0)
+				goto fail;
+		}
+	}
+
+	/* Map the last, partially filled chunk of a non-coherent area. */
+	if (!coherent && chunk) {
+		chunk->nsg = pci_map_sg(dev->pdev, chunk->mem,
+					chunk->npages,
+					PCI_DMA_BIDIRECTIONAL);
+
+		if (chunk->nsg <= 0)
+			goto fail;
+	}
+
+	return icm;
+
+fail:
+	mlx4_free_icm(dev, icm, coherent);
+	return NULL;
+}
+
+/*
+ * Issue MLX4_CMD_MAP_ICM for the ICM area icm at ICM virtual address virt.
+ * Returns whatever mlx4_map_cmd() returns.
+ */
+static int mlx4_MAP_ICM(struct mlx4_dev *dev, struct mlx4_icm *icm, u64 virt)
+{
+	return mlx4_map_cmd(dev, MLX4_CMD_MAP_ICM, icm, virt);
+}
+
+/*
+ * Issue MLX4_CMD_UNMAP_ICM with virt as the input parameter and page_count
+ * as the input modifier.  Callers in this file pass page_count in units of
+ * MLX4_ICM_PAGE_SIZE.  Returns the result of mlx4_cmd().
+ */
+int mlx4_UNMAP_ICM(struct mlx4_dev *dev, u64 virt, u32 page_count)
+{
+	return mlx4_cmd(dev, virt, page_count, 0, MLX4_CMD_UNMAP_ICM,
+			MLX4_CMD_TIME_CLASS_B);
+}
+
+/*
+ * Map a single page at DMA address dma_addr to ICM virtual address virt.
+ *
+ * The (virt, dma_addr) pair is written big-endian into a command mailbox
+ * and submitted with MLX4_CMD_MAP_ICM and an input modifier of 1.
+ * Returns 0 on success, the mailbox allocation error, or the command's
+ * error code.
+ */
+int mlx4_MAP_ICM_page(struct mlx4_dev *dev, u64 dma_addr, u64 virt)
+{
+	struct mlx4_cmd_mailbox *mbox = mlx4_alloc_cmd_mailbox(dev);
+	__be64 *entry;
+	int rc;
+
+	if (IS_ERR(mbox))
+		return PTR_ERR(mbox);
+
+	entry    = mbox->buf;
+	entry[0] = cpu_to_be64(virt);
+	entry[1] = cpu_to_be64(dma_addr);
+
+	rc = mlx4_cmd(dev, mbox->dma, 1, 0, MLX4_CMD_MAP_ICM,
+		      MLX4_CMD_TIME_CLASS_B);
+
+	mlx4_free_cmd_mailbox(dev, mbox);
+
+	if (rc)
+		return rc;
+
+	mlx4_dbg(dev, "Mapped page at %llx to %llx for ICM.\n",
+		  (unsigned long long) dma_addr, (unsigned long long) virt);
+
+	return 0;
+}
+
+/*
+ * Issue MLX4_CMD_MAP_ICM_AUX for the area icm.  The virt argument is -1
+ * (all ones once converted to u64); how mlx4_map_cmd() treats that value
+ * is not visible here.
+ */
+int mlx4_MAP_ICM_AUX(struct mlx4_dev *dev, struct mlx4_icm *icm)
+{
+	return mlx4_map_cmd(dev, MLX4_CMD_MAP_ICM_AUX, icm, -1);
+}
+
+/*
+ * Issue MLX4_CMD_UNMAP_ICM_AUX with no parameters.
+ * Returns the result of mlx4_cmd().
+ */
+int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev)
+{
+	return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_UNMAP_ICM_AUX, MLX4_CMD_TIME_CLASS_B);
+}
+
+/*
+ * Take a reference on the ICM chunk that backs object obj, allocating the
+ * chunk and mapping it into ICM space first if it does not exist yet.
+ *
+ * Returns 0 on success, or -ENOMEM if the chunk cannot be allocated or the
+ * MAP_ICM command fails (the command's own error code is not propagated).
+ * The whole operation runs under table->mutex.
+ */
+int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj)
+{
+	/* Index of the chunk holding obj; num_obj is used as a power-of-two mask. */
+	int i = (obj & (table->num_obj - 1)) / (MLX4_TABLE_CHUNK_SIZE / table->obj_size);
+	int ret = 0;
+
+	mutex_lock(&table->mutex);
+
+	/* Chunk already present: just count the new user. */
+	if (table->icm[i]) {
+		++table->icm[i]->refcount;
+		goto out;
+	}
+
+	table->icm[i] = mlx4_alloc_icm(dev, MLX4_TABLE_CHUNK_SIZE >> PAGE_SHIFT,
+				       (table->lowmem ? GFP_KERNEL : GFP_HIGHUSER) |
+				       __GFP_NOWARN, table->coherent);
+	if (!table->icm[i]) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* The u64 cast keeps the byte offset from overflowing int. */
+	if (mlx4_MAP_ICM(dev, table->icm[i], table->virt +
+			 (u64) i * MLX4_TABLE_CHUNK_SIZE)) {
+		mlx4_free_icm(dev, table->icm[i], table->coherent);
+		table->icm[i] = NULL;
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	++table->icm[i]->refcount;
+
+out:
+	mutex_unlock(&table->mutex);
+	return ret;
+}
+
+/*
+ * Drop one reference on the ICM chunk that backs object obj.  When the
+ * last reference goes away the chunk is unmapped from ICM space, freed,
+ * and its table slot cleared.
+ *
+ * The caller must hold a reference obtained through mlx4_table_get() (or
+ * the permanent one taken by mlx4_init_icm_table()); the slot is
+ * dereferenced without a NULL check.
+ */
+void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj)
+{
+	int i;
+
+	i = (obj & (table->num_obj - 1)) / (MLX4_TABLE_CHUNK_SIZE / table->obj_size);
+
+	mutex_lock(&table->mutex);
+
+	if (--table->icm[i]->refcount == 0) {
+		/*
+		 * Compute the offset in 64 bits, as mlx4_table_get() does:
+		 * i * MLX4_TABLE_CHUNK_SIZE overflows int once the table
+		 * reaches 2 GB, which would unmap the wrong ICM range.
+		 */
+		mlx4_UNMAP_ICM(dev, table->virt + (u64) i * MLX4_TABLE_CHUNK_SIZE,
+			       MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE);
+		mlx4_free_icm(dev, table->icm[i], table->coherent);
+		table->icm[i] = NULL;
+	}
+
+	mutex_unlock(&table->mutex);
+}
+
+/*
+ * Look up the kernel virtual address of object obj in a lowmem table.
+ *
+ * If dma_handle is non-NULL it receives the DMA address of the object
+ * when a DMA segment containing it is found.  Returns NULL when the table
+ * is not lowmem, when the chunk holding the object is not allocated, or
+ * when the object lies beyond the chunk's memory.
+ *
+ * NOTE(review): idx is an int byte offset into the whole table, so it
+ * wraps if num_obj * obj_size can reach 2 GB -- confirm the table size
+ * limits before growing them.
+ */
+void *mlx4_table_find(struct mlx4_icm_table *table, int obj, dma_addr_t *dma_handle)
+{
+	int idx, offset, dma_offset, i;
+	struct mlx4_icm_chunk *chunk;
+	struct mlx4_icm *icm;
+	struct page *page = NULL;
+
+	if (!table->lowmem)
+		return NULL;
+
+	mutex_lock(&table->mutex);
+
+	/* Byte offset of the object, split into chunk index and remainder. */
+	idx = (obj & (table->num_obj - 1)) * table->obj_size;
+	icm = table->icm[idx / MLX4_TABLE_CHUNK_SIZE];
+	dma_offset = offset = idx % MLX4_TABLE_CHUNK_SIZE;
+
+	if (!icm)
+		goto out;
+
+	/*
+	 * Walk the scatterlist entries, consuming offset by CPU lengths and
+	 * dma_offset by DMA lengths until each falls inside an entry.
+	 */
+	list_for_each_entry(chunk, &icm->chunk_list, list) {
+		for (i = 0; i < chunk->npages; ++i) {
+			/* dma_offset goes negative once the DMA segment was found. */
+			if (dma_handle && dma_offset >= 0) {
+				if (sg_dma_len(&chunk->mem[i]) > dma_offset)
+					*dma_handle = sg_dma_address(&chunk->mem[i]) +
+						dma_offset;
+				dma_offset -= sg_dma_len(&chunk->mem[i]);
+			}
+			/*
+			 * DMA mapping can merge pages but not split them,
+			 * so if we found the page, dma_handle has already
+			 * been assigned to.
+			 */
+			if (chunk->mem[i].length > offset) {
+				page = sg_page(&chunk->mem[i]);
+				goto out;
+			}
+			offset -= chunk->mem[i].length;
+		}
+	}
+
+out:
+	mutex_unlock(&table->mutex);
+	return page ? lowmem_page_address(page) + offset : NULL;
+}
+
+/*
+ * Take a reference on every ICM chunk touched by objects start..end,
+ * stepping one chunk's worth of objects at a time.
+ *
+ * Returns 0 on success.  On failure the references already taken are
+ * dropped again and the error from mlx4_table_get() is returned.
+ */
+int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table,
+			 int start, int end)
+{
+	int per_chunk = MLX4_TABLE_CHUNK_SIZE / table->obj_size;
+	int obj;
+	int rc = 0;
+
+	for (obj = start; obj <= end; obj += per_chunk) {
+		rc = mlx4_table_get(dev, table, obj);
+		if (rc)
+			break;
+	}
+
+	if (!rc)
+		return 0;
+
+	/* Unwind: obj is the object whose get failed, so start below it. */
+	while (obj > start) {
+		obj -= per_chunk;
+		mlx4_table_put(dev, table, obj);
+	}
+
+	return rc;
+}
+
+/*
+ * Drop the references taken by mlx4_table_get_range() for objects
+ * start..end, one chunk's worth of objects per step.
+ */
+void mlx4_table_put_range(struct mlx4_dev *dev, struct mlx4_icm_table *table,
+			  int start, int end)
+{
+	int obj = start;
+
+	while (obj <= end) {
+		mlx4_table_put(dev, table, obj);
+		obj += MLX4_TABLE_CHUNK_SIZE / table->obj_size;
+	}
+}
+
+/*
+ * Initialize an ICM table descriptor and pre-populate the chunks that
+ * cover the first `reserved` objects.
+ *
+ * virt is the table's base address in ICM space, obj_size the size of one
+ * object in bytes and nobj the number of objects.  use_lowmem selects
+ * GFP_KERNEL rather than GFP_HIGHUSER for chunk memory and use_coherent
+ * selects coherent DMA allocations.
+ *
+ * Returns 0 on success or -ENOMEM on failure.  On failure every chunk set
+ * up so far is unmapped and freed, and the chunk pointer array itself is
+ * released (it used to be leaked), leaving table->icm NULL.
+ */
+int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table,
+			u64 virt, int obj_size,	int nobj, int reserved,
+			int use_lowmem, int use_coherent)
+{
+	int obj_per_chunk;
+	int num_icm;
+	unsigned chunk_size;
+	u64 size;
+	int i;
+
+	obj_per_chunk = MLX4_TABLE_CHUNK_SIZE / obj_size;
+	num_icm = (nobj + obj_per_chunk - 1) / obj_per_chunk;
+
+	table->icm      = kcalloc(num_icm, sizeof *table->icm, GFP_KERNEL);
+	if (!table->icm)
+		return -ENOMEM;
+	table->virt     = virt;
+	table->num_icm  = num_icm;
+	table->num_obj  = nobj;
+	table->obj_size = obj_size;
+	table->lowmem   = use_lowmem;
+	table->coherent = use_coherent;
+	mutex_init(&table->mutex);
+
+	/*
+	 * All byte offsets are computed in 64 bits: nobj * obj_size and
+	 * i * MLX4_TABLE_CHUNK_SIZE overflow int for tables of 2 GB or more.
+	 */
+	size = (u64) nobj * obj_size;
+
+	for (i = 0; (u64) i * MLX4_TABLE_CHUNK_SIZE < (u64) reserved * obj_size; ++i) {
+		chunk_size = MLX4_TABLE_CHUNK_SIZE;
+		/* The last chunk may be shorter than a full chunk. */
+		if ((u64) (i + 1) * MLX4_TABLE_CHUNK_SIZE > size)
+			chunk_size = PAGE_ALIGN((unsigned) (size -
+					(u64) i * MLX4_TABLE_CHUNK_SIZE));
+
+		table->icm[i] = mlx4_alloc_icm(dev, chunk_size >> PAGE_SHIFT,
+					       (use_lowmem ? GFP_KERNEL : GFP_HIGHUSER) |
+					       __GFP_NOWARN, use_coherent);
+		if (!table->icm[i])
+			goto err;
+		if (mlx4_MAP_ICM(dev, table->icm[i],
+				 virt + (u64) i * MLX4_TABLE_CHUNK_SIZE)) {
+			mlx4_free_icm(dev, table->icm[i], use_coherent);
+			table->icm[i] = NULL;
+			goto err;
+		}
+
+		/*
+		 * Add a reference to this ICM chunk so that it never
+		 * gets freed (since it contains reserved firmware objects).
+		 */
+		++table->icm[i]->refcount;
+	}
+
+	return 0;
+
+err:
+	for (i = 0; i < num_icm; ++i)
+		if (table->icm[i]) {
+			mlx4_UNMAP_ICM(dev, virt + (u64) i * MLX4_TABLE_CHUNK_SIZE,
+				       MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE);
+			mlx4_free_icm(dev, table->icm[i], use_coherent);
+		}
+
+	/* Do not leak the pointer array, and leave no dangling pointer. */
+	kfree(table->icm);
+	table->icm = NULL;
+
+	return -ENOMEM;
+}
+
+/*
+ * Tear down an ICM table: unmap and free every chunk that is still
+ * present, regardless of its reference count, then free the chunk
+ * pointer array.
+ */
+void mlx4_cleanup_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table)
+{
+	int i;
+
+	for (i = 0; i < table->num_icm; ++i)
+		if (table->icm[i]) {
+			/*
+			 * 64-bit offset, as in mlx4_table_get(): the int
+			 * product overflows for tables of 2 GB or more.
+			 */
+			mlx4_UNMAP_ICM(dev,
+				       table->virt + (u64) i * MLX4_TABLE_CHUNK_SIZE,
+				       MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE);
+			mlx4_free_icm(dev, table->icm[i], table->coherent);
+		}
+
+	kfree(table->icm);
+}
diff --git a/sys/ofed/drivers/net/mlx4/icm.h b/sys/ofed/drivers/net/mlx4/icm.h
new file mode 100644
index 0000000..b87f726
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/icm.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_ICM_H
+#define MLX4_ICM_H
+
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/mutex.h>
+
+/*
+ * Number of scatterlist entries per struct mlx4_icm_chunk: whatever fits
+ * in 256 bytes after the list head and the two int counters (padding not
+ * accounted for).
+ */
+#define MLX4_ICM_CHUNK_LEN						\
+	((256 - sizeof (struct list_head) - 2 * sizeof (int)) /		\
+	 (sizeof (struct scatterlist)))
+
+/*
+ * Page size (4 KB) used as the unit of the page count passed to
+ * mlx4_UNMAP_ICM(); it is a fixed constant, independent of PAGE_SIZE.
+ */
+enum {
+	MLX4_ICM_PAGE_SHIFT	= 12,
+	MLX4_ICM_PAGE_SIZE	= 1 << MLX4_ICM_PAGE_SHIFT,
+};
+
+/* One link in an ICM area's chunk list. */
+struct mlx4_icm_chunk {
+	struct list_head	list;	/* entry in mlx4_icm.chunk_list */
+	int			npages;	/* mem[] entries in use */
+	int			nsg;	/* DMA-mapped entries; 0 until mapped */
+	struct scatterlist	mem[MLX4_ICM_CHUNK_LEN];
+};
+
+/* An ICM area: a list of chunks plus a user count kept by the table code. */
+struct mlx4_icm {
+	struct list_head	chunk_list;	/* list of struct mlx4_icm_chunk */
+	int			refcount;	/* set to 0 by mlx4_alloc_icm() */
+};
+
+/*
+ * Cursor over the DMA segments of an ICM area; driven by mlx4_icm_first(),
+ * mlx4_icm_last() and mlx4_icm_next().
+ */
+struct mlx4_icm_iter {
+	struct mlx4_icm	       *icm;		/* area being walked */
+	struct mlx4_icm_chunk  *chunk;		/* current chunk, NULL at end */
+	int			page_idx;	/* index into chunk->mem[] */
+};
+
+struct mlx4_dev;
+
+/* Allocate / free an ICM area; coherent must match between the two calls. */
+struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages,
+				gfp_t gfp_mask, int coherent);
+void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm, int coherent);
+
+/* ICM table management (implemented in icm.c). */
+int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table,
+			u64 virt, int obj_size,	int nobj, int reserved,
+			int use_lowmem, int use_coherent);
+void mlx4_cleanup_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table);
+int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj);
+void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj);
+void *mlx4_table_find(struct mlx4_icm_table *table, int obj, dma_addr_t *dma_handle);
+int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table,
+			 int start, int end);
+void mlx4_table_put_range(struct mlx4_dev *dev, struct mlx4_icm_table *table,
+			  int start, int end);
+
+/*
+ * Position iter on the first entry of icm's first chunk, or mark it as
+ * finished (chunk == NULL) when the area has no chunks.
+ */
+static inline void mlx4_icm_first(struct mlx4_icm *icm,
+				  struct mlx4_icm_iter *iter)
+{
+	iter->icm      = icm;
+	iter->page_idx = 0;
+
+	if (list_empty(&icm->chunk_list))
+		iter->chunk = NULL;
+	else
+		iter->chunk = list_entry(icm->chunk_list.next,
+					 struct mlx4_icm_chunk, list);
+}
+
+/* Non-zero once the iterator has run past the final chunk. */
+static inline int mlx4_icm_last(struct mlx4_icm_iter *iter)
+{
+	return iter->chunk == NULL;
+}
+
+/*
+ * Advance to the next DMA segment, moving on to the following chunk when
+ * the current one is exhausted; chunk becomes NULL at the end of the
+ * list.  Must not be called once mlx4_icm_last() is true.
+ */
+static inline void mlx4_icm_next(struct mlx4_icm_iter *iter)
+{
+	struct list_head *following;
+
+	if (++iter->page_idx < iter->chunk->nsg)
+		return;
+
+	following = iter->chunk->list.next;
+	if (following == &iter->icm->chunk_list) {
+		iter->chunk = NULL;
+		return;
+	}
+
+	iter->chunk    = list_entry(following, struct mlx4_icm_chunk, list);
+	iter->page_idx = 0;
+}
+
+/* DMA address of the segment the iterator currently points at. */
+static inline dma_addr_t mlx4_icm_addr(struct mlx4_icm_iter *iter)
+{
+	struct scatterlist *sg = &iter->chunk->mem[iter->page_idx];
+
+	return sg_dma_address(sg);
+}
+
+/* DMA length of the segment the iterator currently points at. */
+static inline unsigned long mlx4_icm_size(struct mlx4_icm_iter *iter)
+{
+	struct scatterlist *sg = &iter->chunk->mem[iter->page_idx];
+
+	return sg_dma_len(sg);
+}
+
+/* ICM map/unmap firmware commands implemented in icm.c. */
+int mlx4_UNMAP_ICM(struct mlx4_dev *dev, u64 virt, u32 page_count);
+int mlx4_MAP_ICM_page(struct mlx4_dev *dev, u64 dma_addr, u64 virt);
+int mlx4_MAP_ICM_AUX(struct mlx4_dev *dev, struct mlx4_icm *icm);
+int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev);
+
+#endif /* MLX4_ICM_H */
diff --git a/sys/ofed/drivers/net/mlx4/intf.c b/sys/ofed/drivers/net/mlx4/intf.c
new file mode 100644
index 0000000..bdf7e7d
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/intf.c
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "mlx4.h"
+
+/* Binding of one registered interface to one device. */
+struct mlx4_device_context {
+	struct list_head	list;		/* entry in mlx4_priv.ctx_list */
+	struct mlx4_interface  *intf;		/* interface that owns context */
+	void		       *context;	/* value returned by intf->add() */
+};
+
+/*
+ * Registered interfaces and registered devices.  Both lists are only
+ * modified in this file while intf_mutex is held.
+ */
+static LIST_HEAD(intf_list);
+static LIST_HEAD(dev_list);
+static DEFINE_MUTEX(intf_mutex);
+
+/*
+ * Attach interface intf to the device behind priv.  The interface's add()
+ * callback is invoked; if it returns a context, the binding is appended to
+ * the device's context list, otherwise it is discarded.  An allocation
+ * failure is silently ignored.
+ */
+static void mlx4_add_device(struct mlx4_interface *intf, struct mlx4_priv *priv)
+{
+	struct mlx4_device_context *ctx;
+
+	ctx = kmalloc(sizeof *ctx, GFP_KERNEL);
+	if (ctx == NULL)
+		return;
+
+	ctx->intf    = intf;
+	ctx->context = intf->add(&priv->dev);
+
+	/* The interface declined this device. */
+	if (!ctx->context) {
+		kfree(ctx);
+		return;
+	}
+
+	spin_lock_irq(&priv->ctx_lock);
+	list_add_tail(&ctx->list, &priv->ctx_list);
+	spin_unlock_irq(&priv->ctx_lock);
+}
+
+/*
+ * Detach interface intf from the device behind priv: the first matching
+ * binding is unlinked under ctx_lock, the interface's remove() callback is
+ * called outside the lock, and the binding is freed.  Does nothing when
+ * the interface is not bound to this device.
+ */
+static void mlx4_remove_device(struct mlx4_interface *intf, struct mlx4_priv *priv)
+{
+	struct mlx4_device_context *ctx;
+
+	list_for_each_entry(ctx, &priv->ctx_list, list) {
+		if (ctx->intf != intf)
+			continue;
+
+		spin_lock_irq(&priv->ctx_lock);
+		list_del(&ctx->list);
+		spin_unlock_irq(&priv->ctx_lock);
+
+		intf->remove(&priv->dev, ctx->context);
+		kfree(ctx);
+		/* Returning here keeps the iteration off the freed entry. */
+		return;
+	}
+}
+
+/*
+ * Register an interface and attach it to every device already registered.
+ *
+ * Returns -EINVAL if the interface lacks an add or remove callback,
+ * otherwise 0.  A failure to attach to an individual device is not
+ * reported.
+ */
+int mlx4_register_interface(struct mlx4_interface *intf)
+{
+	struct mlx4_priv *priv;
+
+	if (!intf->add || !intf->remove)
+		return -EINVAL;
+
+	mutex_lock(&intf_mutex);
+
+	list_add_tail(&intf->list, &intf_list);
+	list_for_each_entry(priv, &dev_list, dev_list)
+		mlx4_add_device(intf, priv);
+
+	mutex_unlock(&intf_mutex);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_register_interface);
+
+/*
+ * Detach an interface from every registered device and remove it from the
+ * interface list, all under intf_mutex.
+ */
+void mlx4_unregister_interface(struct mlx4_interface *intf)
+{
+	struct mlx4_priv *priv;
+
+	mutex_lock(&intf_mutex);
+
+	list_for_each_entry(priv, &dev_list, dev_list)
+		mlx4_remove_device(intf, priv);
+
+	list_del(&intf->list);
+
+	mutex_unlock(&intf_mutex);
+}
+EXPORT_SYMBOL_GPL(mlx4_unregister_interface);
+
+/*
+ * Find the mlx4 device that owns int_dev by asking the query() callback
+ * of every bound interface context.
+ *
+ * The first reply other than MLX4_QUERY_NOT_MINE is stored in *port and
+ * the corresponding device is returned; NULL is returned when no context
+ * claims int_dev.  Interfaces without a query() callback are skipped.
+ * The callback is invoked with priv->ctx_lock (a spinlock) held.
+ */
+struct mlx4_dev *mlx4_query_interface(void *int_dev, int *port)
+{
+	struct mlx4_priv *priv;
+	struct mlx4_device_context *dev_ctx;
+	enum mlx4_query_reply r;
+	unsigned long flags;
+
+	mutex_lock(&intf_mutex);
+
+	list_for_each_entry(priv, &dev_list, dev_list) {
+		spin_lock_irqsave(&priv->ctx_lock, flags);
+		list_for_each_entry(dev_ctx, &priv->ctx_list, list) {
+			if (!dev_ctx->intf->query)
+				continue;
+			r = dev_ctx->intf->query(dev_ctx->context, int_dev);
+			if (r != MLX4_QUERY_NOT_MINE) {
+				*port = r;
+				/* Drop both locks, inner first, before returning. */
+				spin_unlock_irqrestore(&priv->ctx_lock, flags);
+				mutex_unlock(&intf_mutex);
+				return &priv->dev;
+			}
+		}
+		spin_unlock_irqrestore(&priv->ctx_lock, flags);
+	}
+
+	mutex_unlock(&intf_mutex);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(mlx4_query_interface);
+
+/*
+ * Deliver an event for the given port to every interface bound to dev
+ * that provides an event() callback.  The callbacks are invoked with
+ * priv->ctx_lock (a spinlock) held.
+ */
+void mlx4_dispatch_event(struct mlx4_dev *dev, enum mlx4_dev_event type, int port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_device_context *dev_ctx;
+	unsigned long flags;
+
+	spin_lock_irqsave(&priv->ctx_lock, flags);
+
+	list_for_each_entry(dev_ctx, &priv->ctx_list, list)
+		if (dev_ctx->intf->event)
+			dev_ctx->intf->event(dev, dev_ctx->context, type, port);
+
+	spin_unlock_irqrestore(&priv->ctx_lock, flags);
+}
+
+/*
+ * Add dev to the global device list, attach every registered interface to
+ * it, and then start catastrophic-error polling (after intf_mutex has
+ * been released).  Always returns 0.
+ */
+int mlx4_register_device(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_interface *intf;
+
+	mutex_lock(&intf_mutex);
+
+	list_add_tail(&priv->dev_list, &dev_list);
+	list_for_each_entry(intf, &intf_list, list)
+		mlx4_add_device(intf, priv);
+
+	mutex_unlock(&intf_mutex);
+	mlx4_start_catas_poll(dev);
+
+	return 0;
+}
+
+/*
+ * Reverse of mlx4_register_device(): stop catastrophic-error polling,
+ * detach every registered interface from dev and remove dev from the
+ * global device list.
+ */
+void mlx4_unregister_device(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_interface *intf;
+
+	mlx4_stop_catas_poll(dev);
+	mutex_lock(&intf_mutex);
+
+	list_for_each_entry(intf, &intf_list, list)
+		mlx4_remove_device(intf, priv);
+
+	list_del(&priv->dev_list);
+
+	mutex_unlock(&intf_mutex);
+}
+
+/*
+ * Return the protocol device for the given port from the first bound
+ * interface that implements protocol proto and provides a get_prot_dev()
+ * callback; NULL when there is none.  The callback is invoked with
+ * priv->ctx_lock (a spinlock) held.
+ */
+void *mlx4_find_get_prot_dev(struct mlx4_dev *dev, enum mlx4_prot proto, int port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_device_context *ctx;
+	unsigned long flags;
+	void *result = NULL;
+
+	spin_lock_irqsave(&priv->ctx_lock, flags);
+
+	list_for_each_entry(ctx, &priv->ctx_list, list) {
+		if (ctx->intf->protocol != proto || !ctx->intf->get_prot_dev)
+			continue;
+
+		result = ctx->intf->get_prot_dev(dev, ctx->context, port);
+		break;
+	}
+
+	spin_unlock_irqrestore(&priv->ctx_lock, flags);
+
+	return result;
+}
+
+
diff --git a/sys/ofed/drivers/net/mlx4/main.c b/sys/ofed/drivers/net/mlx4/main.c
new file mode 100644
index 0000000..44aec46
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/main.c
@@ -0,0 +1,1704 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/io-mapping.h>
+
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/doorbell.h>
+
+#include "mlx4.h"
+#include "fw.h"
+#include "icm.h"
+
+/* Module identification. */
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("Mellanox ConnectX HCA low-level driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+
+struct workqueue_struct *mlx4_wq;
+
+#ifdef CONFIG_MLX4_DEBUG
+
+/* Only present in debug builds; writable at runtime (mode 0644). */
+int mlx4_debug_level = 0;
+module_param_named(debug_level, mlx4_debug_level, int, 0644);
+MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
+
+#endif /* CONFIG_MLX4_DEBUG */
+
+/* Exposed as module parameter "block_loopback"; enabled by default. */
+int mlx4_blck_lb=1;
+module_param_named(block_loopback, mlx4_blck_lb, int, 0644);
+MODULE_PARM_DESC(block_loopback, "Block multicast loopback packets if > 0");
+
+#ifdef CONFIG_PCI_MSI
+
+static int msi_x = 1;
+module_param(msi_x, int, 0444);
+MODULE_PARM_DESC(msi_x, "attempt to use MSI-X if nonzero");
+
+#else /* CONFIG_PCI_MSI */
+
+/* Without PCI MSI support msi_x is a compile-time 0. */
+#define msi_x (0)
+
+#endif /* CONFIG_PCI_MSI */
+
+/* Version banner string. */
+static char mlx4_version[] __devinitdata =
+	DRV_NAME ": Mellanox ConnectX core driver v"
+	DRV_VERSION " (" DRV_RELDATE ")\n";
+
+struct mutex drv_mutex;
+
+/*
+ * Built-in resource counts.  process_mod_param_profile() overwrites any
+ * entry whose corresponding module parameter is non-zero.
+ */
+static struct mlx4_profile default_profile = {
+	.num_qp		= 1 << 18,
+	.num_srq	= 1 << 16,
+	.rdmarc_per_qp	= 1 << 4,
+	.num_cq		= 1 << 16,
+	.num_mcg	= 1 << 13,
+	.num_mpt	= 1 << 19,
+	.num_mtt	= 1 << 20,
+};
+
+static int log_num_mac = 2;
+module_param_named(log_num_mac, log_num_mac, int, 0444);
+MODULE_PARM_DESC(log_num_mac, "Log2 max number of MACs per ETH port (1-7)");
+
+/* NOTE(review): declared int but registered with parameter type bool. */
+static int use_prio;
+module_param_named(use_prio, use_prio, bool, 0444);
+MODULE_PARM_DESC(use_prio, "Enable steering by VLAN priority on ETH ports "
+		  "(0/1, default 0)");
+
+/*
+ * Module-parameter overrides for default_profile.  Unlike default_profile,
+ * each field here holds a log2 value; 0 means "keep the built-in default"
+ * (see process_mod_param_profile()).
+ */
+static struct mlx4_profile mod_param_profile = { 0 };
+
+module_param_named(log_num_qp, mod_param_profile.num_qp, int, 0444);
+MODULE_PARM_DESC(log_num_qp, "log maximum number of QPs per HCA");
+
+module_param_named(log_num_srq, mod_param_profile.num_srq, int, 0444);
+MODULE_PARM_DESC(log_num_srq, "log maximum number of SRQs per HCA");
+
+module_param_named(log_rdmarc_per_qp, mod_param_profile.rdmarc_per_qp, int, 0444);
+MODULE_PARM_DESC(log_rdmarc_per_qp, "log number of RDMARC buffers per QP");
+
+module_param_named(log_num_cq, mod_param_profile.num_cq, int, 0444);
+MODULE_PARM_DESC(log_num_cq, "log maximum number of CQs per HCA");
+
+module_param_named(log_num_mcg, mod_param_profile.num_mcg, int, 0444);
+MODULE_PARM_DESC(log_num_mcg, "log maximum number of multicast groups per HCA");
+
+module_param_named(log_num_mpt, mod_param_profile.num_mpt, int, 0444);
+MODULE_PARM_DESC(log_num_mpt,
+		"log maximum number of memory protection table entries per HCA");
+
+module_param_named(log_num_mtt, mod_param_profile.num_mtt, int, 0444);
+MODULE_PARM_DESC(log_num_mtt,
+		 "log maximum number of memory translation table segments per HCA");
+
+/*
+ * Used as 1 << log_mtts_per_seg in mlx4_dev_cap().  The default of 0 is
+ * outside the documented 1-7 range and gives one MTT entry per segment.
+ * NOTE(review): no range validation is visible in this part of the file.
+ */
+static int log_mtts_per_seg = 0;
+module_param_named(log_mtts_per_seg, log_mtts_per_seg, int, 0444);
+MODULE_PARM_DESC(log_mtts_per_seg, "Log2 number of MTT entries per segment (1-7)");
+
+/*
+ * Fold the log2 module parameters into default_profile: every non-zero
+ * mod_param_profile field replaces the matching default with
+ * 1 << <parameter>; zero fields leave the default untouched.
+ */
+static void process_mod_param_profile(void)
+{
+	if (mod_param_profile.num_qp)
+		default_profile.num_qp = 1 << mod_param_profile.num_qp;
+
+	if (mod_param_profile.num_srq)
+		default_profile.num_srq = 1 << mod_param_profile.num_srq;
+
+	if (mod_param_profile.rdmarc_per_qp)
+		default_profile.rdmarc_per_qp =
+			1 << mod_param_profile.rdmarc_per_qp;
+
+	if (mod_param_profile.num_cq)
+		default_profile.num_cq = 1 << mod_param_profile.num_cq;
+
+	if (mod_param_profile.num_mcg)
+		default_profile.num_mcg = 1 << mod_param_profile.num_mcg;
+
+	if (mod_param_profile.num_mpt)
+		default_profile.num_mpt = 1 << mod_param_profile.num_mpt;
+
+	if (mod_param_profile.num_mtt)
+		default_profile.num_mtt = 1 << mod_param_profile.num_mtt;
+}
+
+/*
+ * Port-type configuration record for one PCI device, kept on config_list
+ * and released by mlx4_config_cleanup().
+ */
+struct mlx4_port_config
+{
+	struct list_head list;
+	/* Sized MLX4_MAX_PORTS + 1; presumably indexed by 1-based port number. */
+	enum mlx4_port_type port_type[MLX4_MAX_PORTS + 1];
+	struct pci_dev *pdev;
+};
+static LIST_HEAD(config_list);
+
+/* Unlink and free every entry on config_list. */
+static void mlx4_config_cleanup(void)
+{
+	struct mlx4_port_config *cfg;
+
+	while (!list_empty(&config_list)) {
+		cfg = list_entry(config_list.next,
+				 struct mlx4_port_config, list);
+		list_del(&cfg->list);
+		kfree(cfg);
+	}
+}
+
+/*
+ * Exported wrapper around mlx4_find_get_prot_dev(); returns whatever that
+ * lookup returns.
+ */
+void *mlx4_get_prot_dev(struct mlx4_dev *dev, enum mlx4_prot proto, int port)
+{
+	return mlx4_find_get_prot_dev(dev, proto, port);
+}
+EXPORT_SYMBOL(mlx4_get_prot_dev);
+
+/*
+ * Record the IBoE counter index for a port.  port is 1-based and is not
+ * range-checked here.
+ */
+void mlx4_set_iboe_counter(struct mlx4_dev *dev, int index, u8 port)
+{
+	mlx4_priv(dev)->iboe_counter_index[port - 1] = index;
+}
+EXPORT_SYMBOL(mlx4_set_iboe_counter);
+
+/*
+ * Return the IBoE counter index recorded for a port.  port is 1-based and
+ * is not range-checked here.
+ */
+int mlx4_get_iboe_counter(struct mlx4_dev *dev, u8 port)
+{
+	return mlx4_priv(dev)->iboe_counter_index[port - 1];
+}
+EXPORT_SYMBOL(mlx4_get_iboe_counter);
+
+/*
+ * Validate a requested port-type assignment.
+ *
+ * port_type[] is indexed from 0 (entry i describes port i + 1).  Mixed
+ * types on neighbouring ports require MLX4_DEV_CAP_FLAG_DPDP, and an ETH
+ * port directly followed by an IB port is rejected (without a message).
+ * Every requested type must also be among the port's supported types.
+ *
+ * Returns 0 if the assignment is acceptable, -EINVAL otherwise.
+ */
+int mlx4_check_port_params(struct mlx4_dev *dev,
+			   enum mlx4_port_type *port_type)
+{
+	int i;
+
+	/* Pairwise check of neighbouring ports. */
+	for (i = 0; i < dev->caps.num_ports - 1; i++) {
+		if (port_type[i] == port_type[i + 1])
+			continue;
+
+		if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP)) {
+			mlx4_err(dev, "Only same port types supported "
+				 "on this HCA, aborting.\n");
+			return -EINVAL;
+		}
+
+		if (port_type[i] == MLX4_PORT_TYPE_ETH &&
+		    port_type[i + 1] == MLX4_PORT_TYPE_IB)
+			return -EINVAL;
+	}
+
+	/* Per-port check; supported_type[] is indexed by 1-based port. */
+	for (i = 0; i < dev->caps.num_ports; i++) {
+		if (port_type[i] & dev->caps.supported_type[i+1])
+			continue;
+
+		mlx4_err(dev, "Requested port type for port %d is not "
+			      "supported on this HCA\n", i + 1);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* Copy each port's current type into caps.port_mask (ports are 1-based). */
+static void mlx4_set_port_mask(struct mlx4_dev *dev)
+{
+	int port = 1;
+
+	while (port <= dev->caps.num_ports) {
+		dev->caps.port_mask[port] = dev->caps.port_type[port];
+		++port;
+	}
+}
+
+/*
+ * Decode the counters mode from bits 48-49 of the capability flags:
+ * 1 selects basic counters, 2 or 3 extended counters, 0 disabled.
+ */
+static u8 get_counters_mode(u64 flags)
+{
+	unsigned int mode = (flags >> 48) & 3;
+
+	if (mode == 1)
+		return MLX4_CUNTERS_BASIC;
+	if (mode == 2 || mode == 3)
+		return MLX4_CUNTERS_EXT;
+
+	return MLX4_CUNTERS_DISABLED;
+}
+
+/*
+ * Query the HCA capabilities and translate them into dev->caps.
+ *
+ * dev_cap is filled by mlx4_QUERY_DEV_CAP() and is left populated for the
+ * caller.  The reported values are validated against the kernel page size,
+ * MLX4_MAX_PORTS and the length of PCI resource 2 before anything is copied.
+ *
+ * Returns 0 on success, the error returned by mlx4_QUERY_DEV_CAP(), or
+ * -ENODEV when one of the sanity checks fails.
+ */
+static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
+{
+	int err;
+	int i;
+
+	err = mlx4_QUERY_DEV_CAP(dev, dev_cap);
+	if (err) {
+		mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n");
+		return err;
+	}
+
+	if (dev_cap->min_page_sz > PAGE_SIZE) {
+		mlx4_err(dev, "HCA minimum page size of %d bigger than "
+			 "kernel PAGE_SIZE of %d, aborting.\n",
+			 dev_cap->min_page_sz, PAGE_SIZE);
+		return -ENODEV;
+	}
+	if (dev_cap->num_ports > MLX4_MAX_PORTS) {
+		mlx4_err(dev, "HCA has %d ports, but we only support %d, "
+			 "aborting.\n",
+			 dev_cap->num_ports, MLX4_MAX_PORTS);
+		return -ENODEV;
+	}
+
+	if (dev_cap->uar_size > pci_resource_len(dev->pdev, 2)) {
+		mlx4_err(dev, "HCA reported UAR size of 0x%x bigger than "
+			 "PCI resource 2 size of 0x%llx, aborting.\n",
+			 dev_cap->uar_size,
+			 (unsigned long long) pci_resource_len(dev->pdev, 2));
+		return -ENODEV;
+	}
+
+	/* Per-port capabilities; port numbers (and array indices) start at 1. */
+	dev->caps.num_ports	     = dev_cap->num_ports;
+	for (i = 1; i <= dev->caps.num_ports; ++i) {
+		dev->caps.vl_cap[i]	    = dev_cap->max_vl[i];
+		dev->caps.ib_mtu_cap[i]	    = dev_cap->ib_mtu[i];
+		dev->caps.gid_table_len[i]  = dev_cap->max_gids[i];
+		dev->caps.pkey_table_len[i] = dev_cap->max_pkeys[i];
+		dev->caps.port_width_cap[i] = dev_cap->max_port_width[i];
+		dev->caps.eth_mtu_cap[i]    = dev_cap->eth_mtu[i];
+		dev->caps.def_mac[i]        = dev_cap->def_mac[i];
+		dev->caps.supported_type[i] = dev_cap->supported_port_types[i];
+		dev->caps.trans_type[i]	    = dev_cap->trans_type[i];
+		dev->caps.vendor_oui[i]     = dev_cap->vendor_oui[i];
+		dev->caps.wavelength[i]     = dev_cap->wavelength[i];
+		dev->caps.trans_code[i]     = dev_cap->trans_code[i];
+	}
+
+	/* Device-wide limits, mostly copied verbatim from the query result. */
+	dev->caps.num_uars	     = dev_cap->uar_size / PAGE_SIZE;
+	dev->caps.local_ca_ack_delay = dev_cap->local_ca_ack_delay;
+	dev->caps.bf_reg_size	     = dev_cap->bf_reg_size;
+	dev->caps.bf_regs_per_page   = dev_cap->bf_regs_per_page;
+	dev->caps.max_sq_sg	     = dev_cap->max_sq_sg;
+	dev->caps.max_rq_sg	     = dev_cap->max_rq_sg;
+	dev->caps.max_wqes	     = dev_cap->max_qp_sz;
+	dev->caps.max_qp_init_rdma   = dev_cap->max_requester_per_qp;
+	dev->caps.max_srq_wqes	     = dev_cap->max_srq_sz;
+	dev->caps.max_srq_sge	     = dev_cap->max_rq_sg - 1;
+	dev->caps.reserved_srqs	     = dev_cap->reserved_srqs;
+	dev->caps.max_sq_desc_sz     = dev_cap->max_sq_desc_sz;
+	dev->caps.max_rq_desc_sz     = dev_cap->max_rq_desc_sz;
+	dev->caps.num_qp_per_mgm     = MLX4_QP_PER_MGM;
+	/*
+	 * Subtract 1 from the limit because we need to allocate a
+	 * spare CQE so the HCA HW can tell the difference between an
+	 * empty CQ and a full CQ.
+	 */
+	dev->caps.max_cqes	     = dev_cap->max_cq_sz - 1;
+	dev->caps.reserved_cqs	     = dev_cap->reserved_cqs;
+	dev->caps.reserved_eqs	     = dev_cap->reserved_eqs;
+	/*
+	 * MTTs are handled in segments of 2^log_mtts_per_seg entries, so the
+	 * reserved count and the "entry" size below are expressed per segment.
+	 */
+	dev->caps.mtts_per_seg	     = 1 << log_mtts_per_seg;
+	dev->caps.reserved_mtts	     = DIV_ROUND_UP(dev_cap->reserved_mtts,
+						    dev->caps.mtts_per_seg);
+	dev->caps.reserved_mrws	     = dev_cap->reserved_mrws;
+	dev->caps.reserved_uars	     = dev_cap->reserved_uars;
+	dev->caps.reserved_pds	     = dev_cap->reserved_pds;
+	dev->caps.mtt_entry_sz	     = dev->caps.mtts_per_seg * dev_cap->mtt_entry_sz;
+	dev->caps.max_msg_sz         = dev_cap->max_msg_sz;
+	dev->caps.page_size_cap	     = ~(u32) (dev_cap->min_page_sz - 1);
+	dev->caps.flags		     = dev_cap->flags;
+	dev->caps.bmme_flags	     = dev_cap->bmme_flags;
+	dev->caps.reserved_lkey	     = dev_cap->reserved_lkey;
+	dev->caps.stat_rate_support  = dev_cap->stat_rate_support;
+	dev->caps.udp_rss	     = dev_cap->udp_rss;
+	dev->caps.loopback_support   = dev_cap->loopback_support;
+	dev->caps.max_gso_sz	     = dev_cap->max_gso_sz;
+	/* XRC domain limits are only meaningful when the XRC flag is set. */
+	dev->caps.reserved_xrcds     = (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) ?
+		dev_cap->reserved_xrcds : 0;
+	dev->caps.max_xrcds	     = (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) ?
+		dev_cap->max_xrcds : 0;
+
+	/*
+	 * log_num_mac and use_prio are defined outside this function
+	 * (presumably module parameters - confirm at their definition).
+	 */
+	dev->caps.log_num_macs  = log_num_mac;
+	dev->caps.log_num_prios = use_prio ? 3 : 0;
+
+	for (i = 1; i <= dev->caps.num_ports; ++i) {
+		/*
+		 * Initial port type: NONE when nothing is supported, ETH when
+		 * only Ethernet is supported, IB for every other value.
+		 */
+		dev->caps.port_type[i] = MLX4_PORT_TYPE_NONE;
+		if (dev->caps.supported_type[i]) {
+			if (dev->caps.supported_type[i] != MLX4_PORT_TYPE_ETH)
+				dev->caps.port_type[i] = MLX4_PORT_TYPE_IB;
+			else
+				dev->caps.port_type[i] = MLX4_PORT_TYPE_ETH;
+		}
+		dev->caps.possible_type[i] = dev->caps.port_type[i];
+		/* Sensing is allowed only when the supported type equals AUTO. */
+		mlx4_priv(dev)->sense.sense_allowed[i] =
+			dev->caps.supported_type[i] == MLX4_PORT_TYPE_AUTO;
+
+		/* Clamp the requested MAC count to what this port supports. */
+		if (dev->caps.log_num_macs > dev_cap->log_max_macs[i]) {
+			dev->caps.log_num_macs = dev_cap->log_max_macs[i];
+			mlx4_warn(dev, "Requested number of MACs is too much "
+				  "for port %d, reducing to %d.\n",
+				  i, 1 << dev->caps.log_num_macs);
+		}
+		/* Overwritten on every iteration: the last port's value is kept. */
+		dev->caps.log_num_vlans = dev_cap->log_max_vlans[i];
+	}
+
+	/* Counter limits are rounded down to a power of two. */
+	dev->caps.counters_mode = get_counters_mode(dev_cap->flags);
+	dev->caps.max_basic_counters = 1 << ilog2(dev_cap->max_basic_counters);
+	dev->caps.max_ext_counters = 1 << ilog2(dev_cap->max_ext_counters);
+
+	/*
+	 * Reserved QP ranges: the firmware region comes from the device, the
+	 * ETH and FC address regions each get one QP per
+	 * (MAC, VLAN, priority, port) combination.
+	 */
+	dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] = dev_cap->reserved_qps;
+	dev->caps.reserved_qps_cnt[MLX4_QP_REGION_ETH_ADDR] =
+		dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_ADDR] =
+		(1 << dev->caps.log_num_macs) *
+		(1 << dev->caps.log_num_vlans) *
+		(1 << dev->caps.log_num_prios) *
+		dev->caps.num_ports;
+
+	dev->caps.reserved_qps = dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] +
+		dev->caps.reserved_qps_cnt[MLX4_QP_REGION_ETH_ADDR] +
+		dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_ADDR];
+
+	return 0;
+}
+
+/* Copy the possible type of every port of dev into a saved config entry. */
+static void mlx4_fill_port_config(struct mlx4_port_config *entry,
+				  struct mlx4_dev *dev)
+{
+	int port;
+
+	for (port = 1; port <= dev->caps.num_ports; port++)
+		entry->port_type[port] = dev->caps.possible_type[port];
+}
+
+/*
+ * Remember the possible port types of this device in config_list.
+ * An entry whose pdev matches dev->pdev is updated in place; otherwise a
+ * new entry is allocated, filled and appended to the list.
+ *
+ * Returns 0 on success or -ENOMEM when a new entry cannot be allocated.
+ */
+static int mlx4_save_config(struct mlx4_dev *dev)
+{
+	struct mlx4_port_config *entry;
+
+	list_for_each_entry(entry, &config_list, list) {
+		if (entry->pdev != dev->pdev)
+			continue;
+		mlx4_fill_port_config(entry, dev);
+		return 0;
+	}
+
+	entry = kmalloc(sizeof(struct mlx4_port_config), GFP_KERNEL);
+	if (!entry)
+		return -ENOMEM;
+
+	entry->pdev = dev->pdev;
+	mlx4_fill_port_config(entry, dev);
+	list_add_tail(&entry->list, &config_list);
+
+	return 0;
+}
+
+/*
+ * Change the port configuration of the device.
+ * Every user of this function must hold the port mutex.
+ *
+ * port_types is indexed from 0 while the capability tables are indexed
+ * from 1.  When at least one requested type differs from the current one
+ * the device is unregistered, every port is closed and reprogrammed, the
+ * port mask and saved configuration are refreshed and the device is
+ * registered again.  A SET_PORT failure returns immediately, leaving the
+ * device unregistered.
+ *
+ * Returns 0 when nothing changed, otherwise the result of the first failing
+ * mlx4_SET_PORT() or of mlx4_register_device().
+ */
+int mlx4_change_port_types(struct mlx4_dev *dev,
+			   enum mlx4_port_type *port_types)
+{
+	int changed = 0;
+	int err;
+	int p;
+
+	/* Record every requested type that differs from the current one. */
+	for (p = 1; p <= dev->caps.num_ports; p++) {
+		if (dev->caps.port_type[p] == port_types[p - 1])
+			continue;
+		dev->caps.port_type[p] = port_types[p - 1];
+		changed = 1;
+	}
+	if (!changed)
+		return 0;
+
+	mlx4_unregister_device(dev);
+	for (p = 1; p <= dev->caps.num_ports; p++) {
+		mlx4_CLOSE_PORT(dev, p);
+		err = mlx4_SET_PORT(dev, p);
+		if (err) {
+			mlx4_err(dev, "Failed to set port %d, "
+				      "aborting\n", p);
+			return err;
+		}
+	}
+	mlx4_set_port_mask(dev);
+	mlx4_save_config(dev);
+
+	return mlx4_register_device(dev);
+}
+
+/*
+ * sysfs "show" handler for a per-port type attribute.
+ * Writes "ib" or "eth" for the current port type, wrapped as "auto (...)"
+ * when the possible type of the port is AUTO, and returns the number of
+ * characters placed in buf.
+ */
+static ssize_t show_port_type(struct device *dev,
+			      struct device_attribute *attr,
+			      char *buf)
+{
+	struct mlx4_port_info *info = container_of(attr, struct mlx4_port_info,
+						   port_attr);
+	struct mlx4_dev *mdev = info->dev;
+	const char *name;
+
+	if (mdev->caps.port_type[info->port] == MLX4_PORT_TYPE_IB)
+		name = "ib";
+	else
+		name = "eth";
+
+	if (mdev->caps.possible_type[info->port] == MLX4_PORT_TYPE_AUTO)
+		sprintf(buf, "auto (%s)\n", name);
+	else
+		sprintf(buf, "%s\n", name);
+
+	return strlen(buf);
+}
+
+/*
+ * sysfs "store" handler for a per-port type attribute.
+ *
+ * Accepts exactly "ib\n", "eth\n" or "auto\n".  The requested type is kept
+ * in info->tmp_type and published as the port's possible type; the new
+ * configuration for all ports is then verified and applied under the port
+ * mutex, with port sensing stopped for the duration.
+ *
+ * When priv->trig is set the change is deferred until this handler has been
+ * called once per port (counted in priv->changed_ports), so that all ports
+ * can be switched together.
+ *
+ * Returns count on success (including a deferred change) or a negative
+ * errno on failure.
+ */
+static ssize_t set_port_type(struct device *dev,
+			     struct device_attribute *attr,
+			     const char *buf, size_t count)
+{
+	struct mlx4_port_info *info = container_of(attr, struct mlx4_port_info,
+						   port_attr);
+	struct mlx4_dev *mdev = info->dev;
+	struct mlx4_priv *priv = mlx4_priv(mdev);
+	enum mlx4_port_type types[MLX4_MAX_PORTS];
+	enum mlx4_port_type new_types[MLX4_MAX_PORTS];
+	int i;
+	int err = 0;
+
+	if (!strcmp(buf, "ib\n"))
+		info->tmp_type = MLX4_PORT_TYPE_IB;
+	else if (!strcmp(buf, "eth\n"))
+		info->tmp_type = MLX4_PORT_TYPE_ETH;
+	else if (!strcmp(buf, "auto\n"))
+		info->tmp_type = MLX4_PORT_TYPE_AUTO;
+	else {
+		mlx4_err(mdev, "%s is not supported port type\n", buf);
+		return -EINVAL;
+	}
+
+	mlx4_stop_sense(mdev);
+	mutex_lock(&priv->port_mutex);
+	/* Possible type is always the one that was delivered */
+	mdev->caps.possible_type[info->port] = info->tmp_type;
+
+	/*
+	 * Build the requested type of every port (0-based array): a pending
+	 * tmp_type wins over the possible type, and AUTO falls back to the
+	 * type the port currently has.
+	 */
+	for (i = 0; i < mdev->caps.num_ports; i++) {
+		types[i] = priv->port[i+1].tmp_type ? priv->port[i+1].tmp_type :
+					mdev->caps.possible_type[i+1];
+		if (types[i] == MLX4_PORT_TYPE_AUTO)
+			types[i] = mdev->caps.port_type[i+1];
+	}
+
+	/* Triggered mode: wait until every port has been written once. */
+	if (priv->trig) {
+		if (++priv->changed_ports < mdev->caps.num_ports)
+			goto out;
+		else
+			priv->trig = priv->changed_ports = 0;
+	}
+
+	/*
+	 * Without the DPDP capability flag AUTO is rejected: any port left on
+	 * AUTO is reset to its current type and the request fails.
+	 */
+	if (!(mdev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP)) {
+		for (i = 1; i <= mdev->caps.num_ports; i++) {
+			if (mdev->caps.possible_type[i] == MLX4_PORT_TYPE_AUTO) {
+				mdev->caps.possible_type[i] = mdev->caps.port_type[i];
+				err = -EINVAL;
+			}
+		}
+	}
+	if (err) {
+		mlx4_err(mdev, "Auto sensing is not supported on this HCA. "
+			       "Set only 'eth' or 'ib' for both ports "
+			       "(should be the same)\n");
+		goto out;
+	}
+
+	mlx4_do_sense_ports(mdev, new_types, types);
+
+	err = mlx4_check_port_params(mdev, new_types);
+	if (err)
+		goto out;
+
+	/* We are about to apply the changes after the configuration
+	 * was verified, no need to remember the temporary types
+	 * any more */
+	for (i = 0; i < mdev->caps.num_ports; i++)
+		priv->port[i + 1].tmp_type = 0;
+
+	err = mlx4_change_port_types(mdev, new_types);
+
+out:
+	/* Sensing is restarted and the mutex released on every exit path. */
+	mlx4_start_sense(mdev);
+	mutex_unlock(&priv->port_mutex);
+	return err ? err : count;
+}
+
+/*
+ * sysfs "store" handler for the write-only port_trigger attribute.
+ *
+ * Any write sets priv->trig under the port mutex; the written data itself
+ * is ignored.  Returns count on success or -ENODEV when the PCI device has
+ * no driver data attached.
+ */
+static ssize_t trigger_port(struct device *dev, struct device_attribute *attr,
+			    const char *buf, size_t count)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct mlx4_dev *mdev = pci_get_drvdata(pdev);
+	struct mlx4_priv *priv;
+
+	/*
+	 * Test the driver data itself: container_of() applied to a NULL
+	 * pointer only yields NULL when the member sits at offset 0, so the
+	 * result must not be used as the NULL check.
+	 */
+	if (!mdev)
+		return -ENODEV;
+
+	priv = container_of(mdev, struct mlx4_priv, dev);
+
+	mutex_lock(&priv->port_mutex);
+	priv->trig = 1;
+	mutex_unlock(&priv->port_mutex);
+	return count;
+}
+DEVICE_ATTR(port_trigger, S_IWUGO, NULL, trigger_port);
+
+/*
+ * Allocate the firmware area, map it to the device and start the firmware.
+ * On failure everything acquired so far is released again.
+ *
+ * Returns 0 on success, -ENOMEM when the area cannot be allocated, or the
+ * error reported by mlx4_MAP_FA() / mlx4_RUN_FW().
+ */
+static int mlx4_load_fw(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int rc;
+
+	priv->fw.fw_icm = mlx4_alloc_icm(dev, priv->fw.fw_pages,
+					 GFP_HIGHUSER | __GFP_NOWARN, 0);
+	if (!priv->fw.fw_icm) {
+		mlx4_err(dev, "Couldn't allocate FW area, aborting.\n");
+		return -ENOMEM;
+	}
+
+	rc = mlx4_MAP_FA(dev, priv->fw.fw_icm);
+	if (rc) {
+		mlx4_err(dev, "MAP_FA command failed, aborting.\n");
+	} else {
+		rc = mlx4_RUN_FW(dev);
+		if (!rc)
+			return 0;
+
+		mlx4_err(dev, "RUN_FW command failed, aborting.\n");
+		/* The area was mapped, so unmap it before freeing it. */
+		mlx4_UNMAP_FA(dev);
+	}
+
+	mlx4_free_icm(dev, priv->fw.fw_icm, 0);
+	return rc;
+}
+
+/*
+ * Set up the four cMPT ICM tables (QP, SRQ, CQ and EQ, in that order).
+ * Each table starts at cmpt_base plus an offset derived from its
+ * MLX4_CMPT_TYPE_* index and the entry size.  If one table fails, the
+ * tables set up before it are cleaned up in reverse order.
+ *
+ * Returns 0 on success or the error from mlx4_init_icm_table().
+ */
+static int mlx4_init_cmpt_table(struct mlx4_dev *dev, u64 cmpt_base,
+				int cmpt_entry_sz)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int rc;
+
+	rc = mlx4_init_icm_table(dev, &priv->qp_table.cmpt_table,
+				 cmpt_base +
+				 ((u64) (MLX4_CMPT_TYPE_QP *
+					 cmpt_entry_sz) << MLX4_CMPT_SHIFT),
+				 cmpt_entry_sz, dev->caps.num_qps,
+				 dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW],
+				 0, 0);
+	if (rc)
+		return rc;
+
+	rc = mlx4_init_icm_table(dev, &priv->srq_table.cmpt_table,
+				 cmpt_base +
+				 ((u64) (MLX4_CMPT_TYPE_SRQ *
+					 cmpt_entry_sz) << MLX4_CMPT_SHIFT),
+				 cmpt_entry_sz, dev->caps.num_srqs,
+				 dev->caps.reserved_srqs, 0, 0);
+	if (rc)
+		goto undo_qp;
+
+	rc = mlx4_init_icm_table(dev, &priv->cq_table.cmpt_table,
+				 cmpt_base +
+				 ((u64) (MLX4_CMPT_TYPE_CQ *
+					 cmpt_entry_sz) << MLX4_CMPT_SHIFT),
+				 cmpt_entry_sz, dev->caps.num_cqs,
+				 dev->caps.reserved_cqs, 0, 0);
+	if (rc)
+		goto undo_srq;
+
+	rc = mlx4_init_icm_table(dev, &priv->eq_table.cmpt_table,
+				 cmpt_base +
+				 ((u64) (MLX4_CMPT_TYPE_EQ *
+					 cmpt_entry_sz) << MLX4_CMPT_SHIFT),
+				 cmpt_entry_sz,
+				 dev->caps.num_eqs, dev->caps.num_eqs, 0, 0);
+	if (!rc)
+		return 0;
+
+	mlx4_cleanup_icm_table(dev, &priv->cq_table.cmpt_table);
+
+undo_srq:
+	mlx4_cleanup_icm_table(dev, &priv->srq_table.cmpt_table);
+
+undo_qp:
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table);
+
+	return rc;
+}
+
+/*
+ * Set up all ICM (HCA context memory) tables used by the driver.
+ *
+ * The sequence is: tell the firmware the total ICM size, allocate and map
+ * the auxiliary area it asks for, then initialise the cMPT, EQ, MTT, dMPT,
+ * QP, AUXC, ALTC, RDMARC, CQ, SRQ and MCG tables at the base addresses
+ * found in init_hca with the entry sizes found in dev_cap.
+ *
+ * On failure every table initialised so far is cleaned up in reverse order
+ * and the auxiliary area is unmapped and freed.  mlx4_free_icms() is the
+ * matching teardown for the success case.
+ *
+ * Returns 0 on success, -ENOMEM when the auxiliary area cannot be
+ * allocated, or the error of the failing command / table initialisation.
+ */
+static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap,
+			 struct mlx4_init_hca_param *init_hca, u64 icm_size)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	u64 aux_pages;
+	int err;
+
+	err = mlx4_SET_ICM_SIZE(dev, icm_size, &aux_pages);
+	if (err) {
+		mlx4_err(dev, "SET_ICM_SIZE command failed, aborting.\n");
+		return err;
+	}
+
+	mlx4_dbg(dev, "%lld KB of HCA context requires %lld KB aux memory.\n",
+		 (unsigned long long) icm_size >> 10,
+		 (unsigned long long) aux_pages << 2);
+
+	priv->fw.aux_icm = mlx4_alloc_icm(dev, aux_pages,
+					  GFP_HIGHUSER | __GFP_NOWARN, 0);
+	if (!priv->fw.aux_icm) {
+		mlx4_err(dev, "Couldn't allocate aux memory, aborting.\n");
+		return -ENOMEM;
+	}
+
+	err = mlx4_MAP_ICM_AUX(dev, priv->fw.aux_icm);
+	if (err) {
+		mlx4_err(dev, "MAP_ICM_AUX command failed, aborting.\n");
+		goto err_free_aux;
+	}
+
+	err = mlx4_init_cmpt_table(dev, init_hca->cmpt_base, dev_cap->cmpt_entry_sz);
+	if (err) {
+		mlx4_err(dev, "Failed to map cMPT context memory, aborting.\n");
+		goto err_unmap_aux;
+	}
+
+	err = mlx4_init_icm_table(dev, &priv->eq_table.table,
+				  init_hca->eqc_base, dev_cap->eqc_entry_sz,
+				  dev->caps.num_eqs, dev->caps.num_eqs,
+				  0, 0);
+	if (err) {
+		mlx4_err(dev, "Failed to map EQ context memory, aborting.\n");
+		goto err_unmap_cmpt;
+	}
+
+	/*
+	 * Reserved MTT entries must be aligned up to a cacheline
+	 * boundary, since the FW will write to them, while the driver
+	 * writes to all other MTT entries. (The variable
+	 * dev->caps.mtt_entry_sz below is really the MTT segment
+	 * size, not the raw entry size)
+	 */
+	dev->caps.reserved_mtts =
+		ALIGN(dev->caps.reserved_mtts * dev->caps.mtt_entry_sz,
+		      dma_get_cache_alignment()) / dev->caps.mtt_entry_sz;
+
+	err = mlx4_init_icm_table(dev, &priv->mr_table.mtt_table,
+				  init_hca->mtt_base,
+				  dev->caps.mtt_entry_sz,
+				  dev->caps.num_mtt_segs,
+				  dev->caps.reserved_mtts, 1, 0);
+	if (err) {
+		mlx4_err(dev, "Failed to map MTT context memory, aborting.\n");
+		goto err_unmap_eq;
+	}
+
+	err = mlx4_init_icm_table(dev, &priv->mr_table.dmpt_table,
+				  init_hca->dmpt_base,
+				  dev_cap->dmpt_entry_sz,
+				  dev->caps.num_mpts,
+				  dev->caps.reserved_mrws, 1, 1);
+	if (err) {
+		mlx4_err(dev, "Failed to map dMPT context memory, aborting.\n");
+		goto err_unmap_mtt;
+	}
+
+	/* The four QP-related tables all cover num_qps entries. */
+	err = mlx4_init_icm_table(dev, &priv->qp_table.qp_table,
+				  init_hca->qpc_base,
+				  dev_cap->qpc_entry_sz,
+				  dev->caps.num_qps,
+				  dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW],
+				  0, 0);
+	if (err) {
+		mlx4_err(dev, "Failed to map QP context memory, aborting.\n");
+		goto err_unmap_dmpt;
+	}
+
+	err = mlx4_init_icm_table(dev, &priv->qp_table.auxc_table,
+				  init_hca->auxc_base,
+				  dev_cap->aux_entry_sz,
+				  dev->caps.num_qps,
+				  dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW],
+				  0, 0);
+	if (err) {
+		mlx4_err(dev, "Failed to map AUXC context memory, aborting.\n");
+		goto err_unmap_qp;
+	}
+
+	err = mlx4_init_icm_table(dev, &priv->qp_table.altc_table,
+				  init_hca->altc_base,
+				  dev_cap->altc_entry_sz,
+				  dev->caps.num_qps,
+				  dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW],
+				  0, 0);
+	if (err) {
+		mlx4_err(dev, "Failed to map ALTC context memory, aborting.\n");
+		goto err_unmap_auxc;
+	}
+
+	/* The RDMARC entry size is scaled by the per-QP rdmarc_shift. */
+	err = mlx4_init_icm_table(dev, &priv->qp_table.rdmarc_table,
+				  init_hca->rdmarc_base,
+				  dev_cap->rdmarc_entry_sz << priv->qp_table.rdmarc_shift,
+				  dev->caps.num_qps,
+				  dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW],
+				  0, 0);
+	if (err) {
+		mlx4_err(dev, "Failed to map RDMARC context memory, aborting\n");
+		goto err_unmap_altc;
+	}
+
+	err = mlx4_init_icm_table(dev, &priv->cq_table.table,
+				  init_hca->cqc_base,
+				  dev_cap->cqc_entry_sz,
+				  dev->caps.num_cqs,
+				  dev->caps.reserved_cqs, 0, 0);
+	if (err) {
+		mlx4_err(dev, "Failed to map CQ context memory, aborting.\n");
+		goto err_unmap_rdmarc;
+	}
+
+	err = mlx4_init_icm_table(dev, &priv->srq_table.table,
+				  init_hca->srqc_base,
+				  dev_cap->srq_entry_sz,
+				  dev->caps.num_srqs,
+				  dev->caps.reserved_srqs, 0, 0);
+	if (err) {
+		mlx4_err(dev, "Failed to map SRQ context memory, aborting.\n");
+		goto err_unmap_cq;
+	}
+
+	/*
+	 * It's not strictly required, but for simplicity just map the
+	 * whole multicast group table now.  The table isn't very big
+	 * and it's a lot easier than trying to track ref counts.
+	 */
+	err = mlx4_init_icm_table(dev, &priv->mcg_table.table,
+				  init_hca->mc_base, MLX4_MGM_ENTRY_SIZE,
+				  dev->caps.num_mgms + dev->caps.num_amgms,
+				  dev->caps.num_mgms + dev->caps.num_amgms,
+				  0, 0);
+	if (err) {
+		mlx4_err(dev, "Failed to map MCG context memory, aborting.\n");
+		goto err_unmap_srq;
+	}
+
+	return 0;
+
+	/* Unwind ladder: each label undoes the step before the failing one. */
+err_unmap_srq:
+	mlx4_cleanup_icm_table(dev, &priv->srq_table.table);
+
+err_unmap_cq:
+	mlx4_cleanup_icm_table(dev, &priv->cq_table.table);
+
+err_unmap_rdmarc:
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.rdmarc_table);
+
+err_unmap_altc:
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.altc_table);
+
+err_unmap_auxc:
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.auxc_table);
+
+err_unmap_qp:
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.qp_table);
+
+err_unmap_dmpt:
+	mlx4_cleanup_icm_table(dev, &priv->mr_table.dmpt_table);
+
+err_unmap_mtt:
+	mlx4_cleanup_icm_table(dev, &priv->mr_table.mtt_table);
+
+err_unmap_eq:
+	mlx4_cleanup_icm_table(dev, &priv->eq_table.table);
+
+err_unmap_cmpt:
+	/* The four cMPT tables were set up together by mlx4_init_cmpt_table(). */
+	mlx4_cleanup_icm_table(dev, &priv->eq_table.cmpt_table);
+	mlx4_cleanup_icm_table(dev, &priv->cq_table.cmpt_table);
+	mlx4_cleanup_icm_table(dev, &priv->srq_table.cmpt_table);
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table);
+
+err_unmap_aux:
+	mlx4_UNMAP_ICM_AUX(dev);
+
+err_free_aux:
+	mlx4_free_icm(dev, priv->fw.aux_icm, 0);
+
+	return err;
+}
+
+/*
+ * Tear down everything mlx4_init_icm() set up: clean up all ICM tables in
+ * the reverse of their initialisation order, then unmap and free the
+ * auxiliary ICM area.
+ */
+static void mlx4_free_icms(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	mlx4_cleanup_icm_table(dev, &priv->mcg_table.table);
+	mlx4_cleanup_icm_table(dev, &priv->srq_table.table);
+	mlx4_cleanup_icm_table(dev, &priv->cq_table.table);
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.rdmarc_table);
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.altc_table);
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.auxc_table);
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.qp_table);
+	mlx4_cleanup_icm_table(dev, &priv->mr_table.dmpt_table);
+	mlx4_cleanup_icm_table(dev, &priv->mr_table.mtt_table);
+	mlx4_cleanup_icm_table(dev, &priv->eq_table.table);
+	/* cMPT tables, in the reverse of mlx4_init_cmpt_table()'s order. */
+	mlx4_cleanup_icm_table(dev, &priv->eq_table.cmpt_table);
+	mlx4_cleanup_icm_table(dev, &priv->cq_table.cmpt_table);
+	mlx4_cleanup_icm_table(dev, &priv->srq_table.cmpt_table);
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table);
+
+	mlx4_UNMAP_ICM_AUX(dev);
+	mlx4_free_icm(dev, priv->fw.aux_icm, 0);
+}
+
+/*
+ * Create a write-combining mapping for the part of PCI resource 2 that
+ * follows the UAR pages and store it in priv->bf_mapping.
+ *
+ * Returns 0 on success or -ENOMEM when the mapping cannot be created.
+ */
+static int map_bf_area(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	resource_size_t start;
+	resource_size_t len;
+
+	/* Skip the num_uars pages at the beginning of the resource. */
+	start = pci_resource_start(dev->pdev, 2) + (dev->caps.num_uars << PAGE_SHIFT);
+	len = pci_resource_len(dev->pdev, 2) - (dev->caps.num_uars << PAGE_SHIFT);
+
+	priv->bf_mapping = io_mapping_create_wc(start, len);
+
+	return priv->bf_mapping ? 0 : -ENOMEM;
+}
+
+/* Release the mapping created by map_bf_area(), if there is one. */
+static void unmap_bf_area(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	if (!priv->bf_mapping)
+		return;
+
+	io_mapping_free(priv->bf_mapping);
+}
+
+/*
+ * Undo mlx4_init_hca(): drop the blue flame mapping, close the HCA,
+ * release all ICM tables and finally unmap and free the firmware area.
+ */
+static void mlx4_close_hca(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	unmap_bf_area(dev);
+	mlx4_CLOSE_HCA(dev, 0);
+	mlx4_free_icms(dev);
+
+	mlx4_UNMAP_FA(dev);
+	mlx4_free_icm(dev, priv->fw.fw_icm, 0);
+}
+
+/*
+ * Bring the HCA from "just reset" to "initialised".
+ *
+ * Steps, in order: QUERY_FW, load and run the firmware, try to override the
+ * log_pg_sz setting, read the device capabilities, restore any port types
+ * saved in config_list for this PCI device, compute the resource profile,
+ * map the blue flame area (optional), set up the ICM tables, INIT_HCA and
+ * QUERY_ADAPTER.
+ *
+ * Returns 0 on success or a negative errno; on failure everything acquired
+ * up to the failing step is released again.  mlx4_close_hca() is the
+ * matching teardown for the success case.
+ */
+static int mlx4_init_hca(struct mlx4_dev *dev)
+{
+	struct mlx4_priv	  *priv = mlx4_priv(dev);
+	struct mlx4_adapter	   adapter;
+	struct mlx4_dev_cap	   dev_cap;
+	struct mlx4_mod_stat_cfg   mlx4_cfg;
+	struct mlx4_profile	   profile;
+	struct mlx4_init_hca_param init_hca;
+	struct mlx4_port_config	  *config;
+	u64 icm_size;
+	int err;
+	int i;
+
+	err = mlx4_QUERY_FW(dev);
+	if (err) {
+		if (err == -EACCES)
+			mlx4_info(dev, "non-primary physical function, skipping.\n");
+		else
+			mlx4_err(dev, "QUERY_FW command failed, aborting.\n");
+		return err;
+	}
+
+	err = mlx4_load_fw(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to start FW, aborting.\n");
+		return err;
+	}
+
+	/* Best effort: a failure here is only reported as a warning. */
+	mlx4_cfg.log_pg_sz_m = 1;
+	mlx4_cfg.log_pg_sz = 0;
+	err = mlx4_MOD_STAT_CFG(dev, &mlx4_cfg);
+	if (err)
+		mlx4_warn(dev, "Failed to override log_pg_sz parameter\n");
+
+	err = mlx4_dev_cap(dev, &dev_cap);
+	if (err) {
+		mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n");
+		goto err_stop_fw;
+	}
+
+	process_mod_param_profile();
+	profile = default_profile;
+
+	/*
+	 * Restore the port types remembered by mlx4_save_config() for this
+	 * PCI device.  AUTO only affects the possible type; the current type
+	 * computed by mlx4_dev_cap() is kept in that case.
+	 */
+	list_for_each_entry(config, &config_list, list) {
+		if (config->pdev == dev->pdev) {
+			for (i = 1; i <= dev->caps.num_ports; i++) {
+				dev->caps.possible_type[i] = config->port_type[i];
+				if (config->port_type[i] != MLX4_PORT_TYPE_AUTO)
+					dev->caps.port_type[i] = config->port_type[i];
+			}
+		}
+	}
+
+	mlx4_set_port_mask(dev);
+	/* A negative value (when viewed as signed) is an error code. */
+	icm_size = mlx4_make_profile(dev, &profile, &dev_cap, &init_hca);
+	if ((long long) icm_size < 0) {
+		err = icm_size;
+		goto err_stop_fw;
+	}
+
+	/* The blue flame mapping is optional; failure is only logged. */
+	if (map_bf_area(dev))
+		mlx4_dbg(dev, "Kernel support for blue flame is not available for kernels < 2.6.28\n");
+
+	init_hca.log_uar_sz = ilog2(dev->caps.num_uars);
+
+	err = mlx4_init_icm(dev, &dev_cap, &init_hca, icm_size);
+	if (err)
+		goto err_stop_fw;
+
+	err = mlx4_INIT_HCA(dev, &init_hca);
+	if (err) {
+		mlx4_err(dev, "INIT_HCA command failed, aborting.\n");
+		goto err_free_icm;
+	}
+
+	err = mlx4_QUERY_ADAPTER(dev, &adapter);
+	if (err) {
+		mlx4_err(dev, "QUERY_ADAPTER command failed, aborting.\n");
+		goto err_close;
+	}
+
+	priv->eq_table.inta_pin = adapter.inta_pin;
+	memcpy(dev->board_id, adapter.board_id, sizeof dev->board_id);
+
+	return 0;
+
+err_close:
+	mlx4_CLOSE_HCA(dev, 0);
+
+err_free_icm:
+	mlx4_free_icms(dev);
+
+err_stop_fw:
+	/* unmap_bf_area() is a no-op when no mapping was created. */
+	unmap_bf_area(dev);
+	mlx4_UNMAP_FA(dev);
+	mlx4_free_icm(dev, priv->fw.fw_icm, 0);
+
+	return err;
+}
+
+/*
+ * Initialise the counter index bitmap, sized by the limit that belongs to
+ * the active counters mode.
+ *
+ * Returns 0 on success, -ENOENT when counters are neither in basic nor in
+ * extended mode, or the error from mlx4_bitmap_init().
+ */
+static int mlx4_init_counters_table(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int count;
+
+	if (dev->caps.counters_mode == MLX4_CUNTERS_BASIC)
+		count = dev->caps.max_basic_counters;
+	else if (dev->caps.counters_mode == MLX4_CUNTERS_EXT)
+		count = dev->caps.max_ext_counters;
+	else
+		return -ENOENT;
+
+	return mlx4_bitmap_init(&priv->counters_bitmap, count, count - 1, 0, 0);
+}
+
+/*
+ * Release the counter bitmap.  Nothing is done unless counters are in basic
+ * or extended mode, the only modes in which the bitmap is initialised.
+ */
+static void mlx4_cleanup_counters_table(struct mlx4_dev *dev)
+{
+	if (dev->caps.counters_mode != MLX4_CUNTERS_BASIC &&
+	    dev->caps.counters_mode != MLX4_CUNTERS_EXT)
+		return;
+
+	mlx4_bitmap_cleanup(&mlx4_priv(dev)->counters_bitmap);
+}
+
+/*
+ * Allocate a counter index from the counter bitmap and store it in *idx.
+ *
+ * Returns 0 on success, or -ENOMEM when counters are not in basic or
+ * extended mode or the bitmap has no free index.  *idx is only written in
+ * basic or extended mode.
+ */
+int mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	if (dev->caps.counters_mode != MLX4_CUNTERS_BASIC &&
+	    dev->caps.counters_mode != MLX4_CUNTERS_EXT)
+		return -ENOMEM;
+
+	*idx = mlx4_bitmap_alloc(&priv->counters_bitmap);
+
+	return (*idx == -1) ? -ENOMEM : 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_counter_alloc);
+
+/*
+ * Return a counter index to the counter bitmap.  The call is ignored
+ * unless counters are in basic or extended mode.
+ */
+void mlx4_counter_free(struct mlx4_dev *dev, u32 idx)
+{
+	if (dev->caps.counters_mode == MLX4_CUNTERS_BASIC ||
+	    dev->caps.counters_mode == MLX4_CUNTERS_EXT)
+		mlx4_bitmap_free(&mlx4_priv(dev)->counters_bitmap, idx);
+}
+EXPORT_SYMBOL_GPL(mlx4_counter_free);
+
+/*
+ * Initialise the driver-side resource tables of an already initialised HCA.
+ *
+ * In order: UAR table, the driver's own UAR and its kernel mapping
+ * (priv->kar), PD, XRCD, MR and EQ tables, the switch to event-driven
+ * firmware commands (verified with a NOP command), CQ, SRQ, QP, MCG and
+ * counters tables, and finally SET_PORT for every port.
+ *
+ * Returns 0 on success or a negative errno.  On failure all steps completed
+ * so far are undone in reverse order, so the function may be called again
+ * (the caller does so without MSI-X when the first attempt fails with
+ * -EBUSY).
+ */
+static int mlx4_setup_hca(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int err;
+	int port;
+	__be32 ib_port_default_caps;
+
+	err = mlx4_init_uar_table(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to initialize "
+			 "user access region table, aborting.\n");
+		return err;
+	}
+
+	err = mlx4_uar_alloc(dev, &priv->driver_uar);
+	if (err) {
+		mlx4_err(dev, "Failed to allocate driver access region, "
+			 "aborting.\n");
+		goto err_uar_table_free;
+	}
+
+	/* Map one page at the driver UAR's page frame for kernel access. */
+	priv->kar = ioremap(priv->driver_uar.pfn << PAGE_SHIFT, PAGE_SIZE);
+	if (!priv->kar) {
+		mlx4_err(dev, "Couldn't map kernel access region, "
+			 "aborting.\n");
+		err = -ENOMEM;
+		goto err_uar_free;
+	}
+
+	err = mlx4_init_pd_table(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to initialize "
+			 "protection domain table, aborting.\n");
+		goto err_kar_unmap;
+	}
+
+	err = mlx4_init_xrcd_table(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to initialize extended "
+			 "reliably connected domain table, aborting.\n");
+		goto err_pd_table_free;
+	}
+
+	err = mlx4_init_mr_table(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to initialize "
+			 "memory region table, aborting.\n");
+		goto err_xrcd_table_free;
+	}
+
+	err = mlx4_init_eq_table(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to initialize "
+			 "event queue table, aborting.\n");
+		goto err_mr_table_free;
+	}
+
+	err = mlx4_cmd_use_events(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to switch to event-driven "
+			 "firmware commands, aborting.\n");
+		goto err_eq_table_free;
+	}
+
+	/*
+	 * Interrupt self-test: a NOP command is issued in event mode.  With
+	 * MSI-X enabled a failure is only a warning, because the caller may
+	 * retry without MSI-X; otherwise it is reported as an error.
+	 */
+	err = mlx4_NOP(dev);
+	if (err) {
+		if (dev->flags & MLX4_FLAG_MSI_X) {
+			mlx4_warn(dev, "NOP command failed to generate MSI-X "
+				  "interrupt IRQ %d).\n",
+				  priv->eq_table.eq[dev->caps.num_comp_vectors].irq);
+			mlx4_warn(dev, "Trying again without MSI-X.\n");
+		} else {
+			mlx4_err(dev, "NOP command failed to generate interrupt "
+				 "(IRQ %d), aborting.\n",
+				 priv->eq_table.eq[dev->caps.num_comp_vectors].irq);
+			mlx4_err(dev, "BIOS or ACPI interrupt routing problem?\n");
+		}
+
+		goto err_cmd_poll;
+	}
+
+	mlx4_dbg(dev, "NOP command IRQ test passed\n");
+
+	err = mlx4_init_cq_table(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to initialize "
+			 "completion queue table, aborting.\n");
+		goto err_cmd_poll;
+	}
+
+	err = mlx4_init_srq_table(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to initialize "
+			 "shared receive queue table, aborting.\n");
+		goto err_cq_table_free;
+	}
+
+	err = mlx4_init_qp_table(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to initialize "
+			 "queue pair table, aborting.\n");
+		goto err_srq_table_free;
+	}
+
+	err = mlx4_init_mcg_table(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to initialize "
+			 "multicast group table, aborting.\n");
+		goto err_qp_table_free;
+	}
+
+	/* -ENOENT means counters are disabled, which is not an error here. */
+	err = mlx4_init_counters_table(dev);
+	if (err && err != -ENOENT) {
+		mlx4_err(dev, "Failed to initialize counters table, aborting.\n");
+		goto err_mcg_table_free;
+	}
+
+	for (port = 1; port <= dev->caps.num_ports; port++) {
+		/* Failing to read the default IB caps is tolerated (caps = 0). */
+		ib_port_default_caps = 0;
+		err = mlx4_get_port_ib_caps(dev, port, &ib_port_default_caps);
+		if (err)
+			mlx4_warn(dev, "failed to get port %d default "
+				  "ib capabilities (%d). Continuing with "
+				  "caps = 0\n", port, err);
+		dev->caps.ib_port_def_cap[port] = ib_port_default_caps;
+		err = mlx4_SET_PORT(dev, port);
+		if (err) {
+			mlx4_err(dev, "Failed to set port %d, aborting\n",
+				port);
+			goto err_counters_table_free;
+		}
+	}
+
+	return 0;
+
+	/* Unwind ladder: each label undoes the step before the failing one. */
+err_counters_table_free:
+	mlx4_cleanup_counters_table(dev);
+
+err_mcg_table_free:
+	mlx4_cleanup_mcg_table(dev);
+
+err_qp_table_free:
+	mlx4_cleanup_qp_table(dev);
+
+err_srq_table_free:
+	mlx4_cleanup_srq_table(dev);
+
+err_cq_table_free:
+	mlx4_cleanup_cq_table(dev);
+
+err_cmd_poll:
+	mlx4_cmd_use_polling(dev);
+
+err_eq_table_free:
+	mlx4_cleanup_eq_table(dev);
+
+err_mr_table_free:
+	mlx4_cleanup_mr_table(dev);
+
+err_xrcd_table_free:
+	mlx4_cleanup_xrcd_table(dev);
+
+err_pd_table_free:
+	mlx4_cleanup_pd_table(dev);
+
+err_kar_unmap:
+	iounmap(priv->kar);
+
+err_uar_free:
+	mlx4_uar_free(dev, &priv->driver_uar);
+
+err_uar_table_free:
+	mlx4_cleanup_uar_table(dev);
+	return err;
+}
+
+/*
+ * Choose the interrupt configuration for the device.
+ *
+ * When the msi_x setting (defined outside this function) is non-zero, the
+ * function requests min(num_eqs - reserved_eqs, num_possible_cpus() + 1)
+ * MSI-X vectors, retrying with a smaller count when pci_enable_msix()
+ * returns a positive number greater than 1.  On success the vectors are
+ * stored in the EQ table, num_comp_vectors is set to the vector count
+ * minus one and MLX4_FLAG_MSI_X is set.
+ *
+ * In every other case (MSI-X disabled, allocation failure, or MSI-X setup
+ * failure) a single completion vector is used and the first two EQs share
+ * the device's regular IRQ.
+ */
+static void mlx4_enable_msi_x(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct msix_entry *entries;
+	int nreq;
+	int err;
+	int i;
+
+	if (msi_x) {
+		nreq = min_t(int, dev->caps.num_eqs - dev->caps.reserved_eqs,
+			     num_possible_cpus() + 1);
+		entries = kcalloc(nreq, sizeof *entries, GFP_KERNEL);
+		if (!entries)
+			goto no_msi;
+
+		for (i = 0; i < nreq; ++i)
+			entries[i].entry = i;
+
+	retry:
+		err = pci_enable_msix(dev->pdev, entries, nreq);
+		if (err) {
+			/* Try again if at least 2 vectors are available */
+			if (err > 1) {
+				mlx4_info(dev, "Requested %d vectors, "
+					  "but only %d MSI-X vectors available, "
+					  "trying again\n", nreq, err);
+				nreq = err;
+				goto retry;
+			}
+			kfree(entries);
+			goto no_msi;
+		}
+
+		/* One of the nreq vectors is not counted as a completion vector. */
+		dev->caps.num_comp_vectors = nreq - 1;
+		for (i = 0; i < nreq; ++i)
+			priv->eq_table.eq[i].irq = entries[i].vector;
+
+		dev->flags |= MLX4_FLAG_MSI_X;
+
+		kfree(entries);
+		return;
+	}
+
+no_msi:
+	/* Fallback: both EQs use the device's regular interrupt line. */
+	dev->caps.num_comp_vectors = 1;
+
+	for (i = 0; i < 2; ++i)
+		priv->eq_table.eq[i].irq = dev->pdev->irq;
+}
+
+/*
+ * Prepare the per-port bookkeeping for one port: back pointers, MAC and
+ * VLAN tables, and the "mlx4_port<N>" device attribute served by
+ * show_port_type() / set_port_type().
+ *
+ * Returns 0 on success or the error from device_create_file(); in the
+ * latter case info->port is set to -1 so that cleanup skips the attribute.
+ */
+static int mlx4_init_port_info(struct mlx4_dev *dev, int port)
+{
+	struct mlx4_port_info *info = &mlx4_priv(dev)->port[port];
+	int rc;
+
+	info->dev = dev;
+	info->port = port;
+	mlx4_init_mac_table(dev, &info->mac_table);
+	mlx4_init_vlan_table(dev, &info->vlan_table);
+
+	sprintf(info->dev_name, "mlx4_port%d", port);
+	info->port_attr.attr.name = info->dev_name;
+	info->port_attr.attr.mode = S_IRUGO | S_IWUSR;
+	info->port_attr.show      = show_port_type;
+	info->port_attr.store     = set_port_type;
+
+	rc = device_create_file(&dev->pdev->dev, &info->port_attr);
+	if (!rc)
+		return 0;
+
+	mlx4_err(dev, "Failed to create file for port %d\n", port);
+	info->port = -1;
+
+	return rc;
+}
+
+/*
+ * Remove the device attribute of a port.  A negative port number marks a
+ * port whose attribute was never created, so there is nothing to remove.
+ */
+static void mlx4_cleanup_port_info(struct mlx4_port_info *info)
+{
+	if (info->port >= 0)
+		device_remove_file(&info->dev->pdev->dev, &info->port_attr);
+}
+
+/*
+ * Give the device its own copy of the port_trigger attribute and create
+ * the corresponding file.  Returns the result of device_create_file().
+ */
+static int mlx4_init_trigger(struct mlx4_priv *priv)
+{
+	struct device *parent = &priv->dev.pdev->dev;
+
+	memcpy(&priv->trigger_attr, &dev_attr_port_trigger,
+	       sizeof(struct device_attribute));
+
+	return device_create_file(parent, &priv->trigger_attr);
+}
+
+/*
+ * Probe one mlx4 PCI device.
+ *
+ * Enables the PCI device, checks and claims BAR 0 (control region, must be
+ * 1MB of memory space) and BAR 2 (UAR), sets the DMA masks, allocates the
+ * private device structure, resets the HCA, initialises the command
+ * interface and the HCA itself, sets up interrupts and the resource tables,
+ * creates the per-port and trigger attributes, registers the device with
+ * the mlx4 interfaces and starts port sensing.
+ *
+ * Returns 0 on success or a negative errno.  On failure everything acquired
+ * so far is released in reverse order and the driver data pointer of the
+ * PCI device is cleared.
+ */
+static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	struct mlx4_priv *priv;
+	struct mlx4_dev *dev;
+	int err;
+	int port;
+	int i;
+
+	printk(KERN_INFO PFX "Initializing %s\n",
+	       pci_name(pdev));
+
+	err = pci_enable_device(pdev);
+	if (err) {
+		dev_err(&pdev->dev, "Cannot enable PCI device, "
+			"aborting.\n");
+		return err;
+	}
+
+	/*
+	 * Check for BARs.  We expect 0: 1MB
+	 */
+	if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM) ||
+	    pci_resource_len(pdev, 0) != 1 << 20) {
+		dev_err(&pdev->dev, "Missing DCS, aborting.\n");
+		err = -ENODEV;
+		goto err_disable_pdev;
+	}
+	if (!(pci_resource_flags(pdev, 2) & IORESOURCE_MEM)) {
+		dev_err(&pdev->dev, "Missing UAR, aborting.\n");
+		err = -ENODEV;
+		goto err_disable_pdev;
+	}
+
+	err = pci_request_region(pdev, 0, DRV_NAME);
+	if (err) {
+		dev_err(&pdev->dev, "Cannot request control region, aborting.\n");
+		goto err_disable_pdev;
+	}
+
+	err = pci_request_region(pdev, 2, DRV_NAME);
+	if (err) {
+		dev_err(&pdev->dev, "Cannot request UAR region, aborting.\n");
+		goto err_release_bar0;
+	}
+
+	pci_set_master(pdev);
+
+	/* Prefer 64-bit DMA and fall back to 32-bit for both mask kinds. */
+	err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (err) {
+		dev_warn(&pdev->dev, "Warning: couldn't set 64-bit PCI DMA mask.\n");
+		err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+		if (err) {
+			dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting.\n");
+			goto err_release_bar2;
+		}
+	}
+	err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (err) {
+		dev_warn(&pdev->dev, "Warning: couldn't set 64-bit "
+			 "consistent PCI DMA mask.\n");
+		err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
+		if (err) {
+			dev_err(&pdev->dev, "Can't set consistent PCI DMA mask, "
+				"aborting.\n");
+			goto err_release_bar2;
+		}
+	}
+
+	priv = kzalloc(sizeof *priv, GFP_KERNEL);
+	if (!priv) {
+		dev_err(&pdev->dev, "Device struct alloc failed, "
+			"aborting.\n");
+		err = -ENOMEM;
+		goto err_release_bar2;
+	}
+
+	dev       = &priv->dev;
+	dev->pdev = pdev;
+	INIT_LIST_HEAD(&priv->ctx_list);
+	spin_lock_init(&priv->ctx_lock);
+
+	mutex_init(&priv->port_mutex);
+
+	INIT_LIST_HEAD(&priv->pgdir_list);
+	mutex_init(&priv->pgdir_mutex);
+	for (i = 0; i < MLX4_MAX_PORTS; ++i)
+		priv->iboe_counter_index[i] = -1;
+
+	INIT_LIST_HEAD(&priv->bf_list);
+	mutex_init(&priv->bf_mutex);
+
+	/*
+	 * Now reset the HCA before we touch the PCI capabilities or
+	 * attempt a firmware command, since a boot ROM may have left
+	 * the HCA in an undefined state.
+	 */
+	err = mlx4_reset(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to reset HCA, aborting.\n");
+		goto err_free_dev;
+	}
+
+	/*
+	 * Keep the error code: without it err would still be 0 from the
+	 * successful reset and the probe would report success after having
+	 * freed the device structure.
+	 */
+	err = mlx4_cmd_init(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to init command interface, aborting.\n");
+		goto err_free_dev;
+	}
+
+	err = mlx4_init_hca(dev);
+	if (err)
+		goto err_cmd;
+
+	err = mlx4_alloc_eq_table(dev);
+	if (err)
+		goto err_close;
+
+	mlx4_enable_msi_x(dev);
+
+	/* If setup fails with -EBUSY under MSI-X, retry once without it. */
+	err = mlx4_setup_hca(dev);
+	if (err == -EBUSY && (dev->flags & MLX4_FLAG_MSI_X)) {
+		dev->flags &= ~MLX4_FLAG_MSI_X;
+		pci_disable_msix(pdev);
+		err = mlx4_setup_hca(dev);
+	}
+
+	if (err)
+		goto err_free_eq;
+
+	for (port = 1; port <= dev->caps.num_ports; port++) {
+		err = mlx4_init_port_info(dev, port);
+		if (err)
+			goto err_port;
+	}
+
+	err = mlx4_register_device(dev);
+	if (err)
+		goto err_port;
+
+	err = mlx4_init_trigger(priv);
+	if (err)
+		goto err_register;
+
+	err = mlx4_sense_init(dev);
+	if (err)
+		goto err_trigger;
+
+	mlx4_start_sense(dev);
+
+	pci_set_drvdata(pdev, dev);
+
+	return 0;
+
+err_trigger:
+	device_remove_file(&dev->pdev->dev, &priv->trigger_attr);
+err_register:
+	mlx4_unregister_device(dev);
+err_port:
+	/* Only ports below the failing one have an attribute to remove. */
+	for (--port; port >= 1; --port)
+		mlx4_cleanup_port_info(&priv->port[port]);
+
+	/* Undo mlx4_setup_hca() in the same order as its own unwind path. */
+	mlx4_cleanup_counters_table(dev);
+	mlx4_cleanup_mcg_table(dev);
+	mlx4_cleanup_qp_table(dev);
+	mlx4_cleanup_srq_table(dev);
+	mlx4_cleanup_cq_table(dev);
+	mlx4_cmd_use_polling(dev);
+	mlx4_cleanup_eq_table(dev);
+	mlx4_cleanup_mr_table(dev);
+	mlx4_cleanup_xrcd_table(dev);
+	mlx4_cleanup_pd_table(dev);
+	/* The kernel access mapping and the driver UAR must be released too. */
+	iounmap(priv->kar);
+	mlx4_uar_free(dev, &priv->driver_uar);
+	mlx4_cleanup_uar_table(dev);
+
+err_free_eq:
+	mlx4_free_eq_table(dev);
+
+err_close:
+	if (dev->flags & MLX4_FLAG_MSI_X)
+		pci_disable_msix(pdev);
+
+	mlx4_close_hca(dev);
+
+err_cmd:
+	mlx4_cmd_cleanup(dev);
+
+err_free_dev:
+	kfree(priv);
+
+err_release_bar2:
+	pci_release_region(pdev, 2);
+
+err_release_bar0:
+	pci_release_region(pdev, 0);
+
+err_disable_pdev:
+	pci_disable_device(pdev);
+	pci_set_drvdata(pdev, NULL);
+	return err;
+}
+
+/*
+ * PCI probe callback.  Prints the driver version string the first time it
+ * is called, then hands the device to __mlx4_init_one().
+ */
+static int __devinit mlx4_init_one(struct pci_dev *pdev,
+				   const struct pci_device_id *id)
+{
+	static int banner_printed;	/* zero until the first probe */
+
+	if (banner_printed == 0) {
+		printk(KERN_INFO "%s", mlx4_version);
+		banner_printed = 1;
+	}
+
+	return __mlx4_init_one(pdev, id);
+}
+
+/*
+ * Tear down a device whose driver data was attached by __mlx4_init_one().
+ * Installed as the PCI remove callback and also called from
+ * mlx4_restart_one().  Does nothing when no driver data is attached to @pdev.
+ */
+static void mlx4_remove_one(struct pci_dev *pdev)
+{
+	struct mlx4_dev  *dev  = pci_get_drvdata(pdev);
+	/* Computed before the NULL check below, but only used inside it. */
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int p;
+
+	if (dev) {
+		mlx4_sense_cleanup(dev);
+		mlx4_unregister_device(dev);
+		device_remove_file(&dev->pdev->dev, &priv->trigger_attr);
+
+		/* Ports are numbered starting at 1. */
+		for (p = 1; p <= dev->caps.num_ports; p++) {
+			mlx4_cleanup_port_info(&priv->port[p]);
+			mlx4_CLOSE_PORT(dev, p);
+		}
+
+		/* Same table teardown order as the err_port path of __mlx4_init_one(). */
+                mlx4_cleanup_counters_table(dev);
+		mlx4_cleanup_mcg_table(dev);
+		mlx4_cleanup_qp_table(dev);
+		mlx4_cleanup_srq_table(dev);
+		mlx4_cleanup_cq_table(dev);
+		mlx4_cmd_use_polling(dev);
+		mlx4_cleanup_eq_table(dev);
+		mlx4_cleanup_mr_table(dev);
+		mlx4_cleanup_xrcd_table(dev);
+		mlx4_cleanup_pd_table(dev);
+
+		iounmap(priv->kar);
+		mlx4_uar_free(dev, &priv->driver_uar);
+		mlx4_cleanup_uar_table(dev);
+		mlx4_free_eq_table(dev);
+		mlx4_close_hca(dev);
+		mlx4_cmd_cleanup(dev);
+
+		if (dev->flags & MLX4_FLAG_MSI_X)
+			pci_disable_msix(pdev);
+
+		/* dev is embedded in priv, so it must not be touched after this. */
+		kfree(priv);
+		pci_release_region(pdev, 2);
+		pci_release_region(pdev, 0);
+		pci_disable_device(pdev);
+		pci_set_drvdata(pdev, NULL);
+	}
+}
+
+/*
+ * Fully remove the device and probe it again.  Returns whatever
+ * __mlx4_init_one() returns for the fresh probe.
+ */
+int mlx4_restart_one(struct pci_dev *pdev)
+{
+	int err;
+
+	mlx4_remove_one(pdev);
+	err = __mlx4_init_one(pdev, NULL);
+
+	return err;
+}
+
+/*
+ * PCI device IDs (Mellanox vendor) claimed by this driver.  The list is
+ * terminated by a zeroed entry.  The 0x1000-0x100f entries carry no model
+ * description in this file.
+ */
+static struct pci_device_id mlx4_pci_table[] = {
+	{ PCI_VDEVICE(MELLANOX, 0x6340) }, /* MT25408 "Hermon" SDR */
+	{ PCI_VDEVICE(MELLANOX, 0x634a) }, /* MT25408 "Hermon" DDR */
+	{ PCI_VDEVICE(MELLANOX, 0x6354) }, /* MT25408 "Hermon" QDR */
+	{ PCI_VDEVICE(MELLANOX, 0x6732) }, /* MT25408 "Hermon" DDR PCIe gen2 */
+	{ PCI_VDEVICE(MELLANOX, 0x673c) }, /* MT25408 "Hermon" QDR PCIe gen2 */
+	{ PCI_VDEVICE(MELLANOX, 0x6368) }, /* MT25408 "Hermon" EN 10GigE */
+	{ PCI_VDEVICE(MELLANOX, 0x6750) }, /* MT25408 "Hermon" EN 10GigE PCIe gen2 */
+	{ PCI_VDEVICE(MELLANOX, 0x6372) }, /* MT25458 ConnectX EN 10GBASE-T 10GigE */
+	{ PCI_VDEVICE(MELLANOX, 0x675a) }, /* MT25458 ConnectX EN 10GBASE-T+Gen2 10GigE */
+	{ PCI_VDEVICE(MELLANOX, 0x6764) }, /* MT26468 ConnectX EN 10GigE PCIe gen2 */
+	{ PCI_VDEVICE(MELLANOX, 0x6746) }, /* MT26438 ConnectX VPI PCIe 2.0 5GT/s - IB QDR / 10GigE Virt+ */
+	{ PCI_VDEVICE(MELLANOX, 0x676e) }, /* MT26478 ConnectX EN 40GigE PCIe 2.0 5GT/s */
+	{ PCI_VDEVICE(MELLANOX, 0x6778) }, /* MT26488 ConnectX VPI PCIe 2.0 5GT/s - IB DDR / 10GigE Virt+ */
+	{ PCI_VDEVICE(MELLANOX, 0x1000) },
+	{ PCI_VDEVICE(MELLANOX, 0x1001) },
+	{ PCI_VDEVICE(MELLANOX, 0x1002) },
+	{ PCI_VDEVICE(MELLANOX, 0x1003) },
+	{ PCI_VDEVICE(MELLANOX, 0x1004) },
+	{ PCI_VDEVICE(MELLANOX, 0x1005) },
+	{ PCI_VDEVICE(MELLANOX, 0x1006) },
+	{ PCI_VDEVICE(MELLANOX, 0x1007) },
+	{ PCI_VDEVICE(MELLANOX, 0x1008) },
+	{ PCI_VDEVICE(MELLANOX, 0x1009) },
+	{ PCI_VDEVICE(MELLANOX, 0x100a) },
+	{ PCI_VDEVICE(MELLANOX, 0x100b) },
+	{ PCI_VDEVICE(MELLANOX, 0x100c) },
+	{ PCI_VDEVICE(MELLANOX, 0x100d) },
+	{ PCI_VDEVICE(MELLANOX, 0x100e) },
+	{ PCI_VDEVICE(MELLANOX, 0x100f) },
+	{ 0, }
+};
+
+MODULE_DEVICE_TABLE(pci, mlx4_pci_table);
+
+/* PCI driver descriptor: ties mlx4_pci_table to the probe/remove callbacks. */
+static struct pci_driver mlx4_driver = {
+	.name		= DRV_NAME,
+	.probe		= mlx4_init_one,
+	.remove		= __devexit_p(mlx4_remove_one),
+	.id_table	= mlx4_pci_table,
+};
+
+/*
+ * Validate the log_num_mac and log_mtts_per_seg settings.
+ *
+ * A log_mtts_per_seg of 0 is replaced by ilog2(MLX4_MTT_ENTRY_PER_SEG)
+ * before it is range-checked.  Returns 0 when both values are acceptable,
+ * -1 (after logging a warning) otherwise.
+ */
+static int __init mlx4_verify_params(void)
+{
+	/* log_num_mac must lie within [0, 7]. */
+	if (!(log_num_mac >= 0 && log_num_mac <= 7)) {
+		printk(KERN_WARNING "mlx4_core: bad num_mac: %d\n", log_num_mac);
+		return -1;
+	}
+
+	if (!log_mtts_per_seg)
+		log_mtts_per_seg = ilog2(MLX4_MTT_ENTRY_PER_SEG);
+
+	/* log_mtts_per_seg must lie within [1, 7]. */
+	if (!(log_mtts_per_seg >= 1 && log_mtts_per_seg <= 7)) {
+		printk(KERN_WARNING "mlx4_core: bad log_mtts_per_seg: %d\n", log_mtts_per_seg);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Module load entry point.
+ *
+ * Returns 0 on success, -EINVAL when mlx4_verify_params() rejects the
+ * configuration, -ENOMEM when the "mlx4" workqueue cannot be created, or
+ * the negative error returned by pci_register_driver().
+ */
+static int __init mlx4_init(void)
+{
+	int ret;
+
+	mutex_init(&drv_mutex);
+
+	if (mlx4_verify_params())
+		return -EINVAL;
+
+	mlx4_catas_init();
+
+	mlx4_wq = create_singlethread_workqueue("mlx4");
+	if (!mlx4_wq)
+		return -ENOMEM;
+
+	ret = pci_register_driver(&mlx4_driver);
+	if (ret < 0) {
+		/*
+		 * mlx4_cleanup() is the only other place that destroys the
+		 * workqueue; release it here so a failed registration does
+		 * not leak it.
+		 */
+		destroy_workqueue(mlx4_wq);
+		mlx4_wq = NULL;
+		return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Module unload entry point.  The configuration cleanup and the PCI driver
+ * unregistration run with drv_mutex held; the workqueue created by
+ * mlx4_init() is destroyed afterwards, outside the lock.
+ */
+static void __exit mlx4_cleanup(void)
+{
+	mutex_lock(&drv_mutex);
+	mlx4_config_cleanup();
+	pci_unregister_driver(&mlx4_driver);
+	mutex_unlock(&drv_mutex);
+	destroy_workqueue(mlx4_wq);
+}
+
+module_init_order(mlx4_init, SI_ORDER_MIDDLE);
+module_exit(mlx4_cleanup);
+
+#undef MODULE_VERSION
+#include <sys/module.h>
+/*
+ * Module event handler: ignores @mod, @event and @arg and always reports
+ * success.
+ */
+static int
+mlx4_evhand(module_t mod, int event, void *arg)
+{
+	return (0);
+}
+
+/* Module description passed to DECLARE_MODULE() for the "mlx4" module. */
+static moduledata_t mlx4_mod = {
+	.name = "mlx4",
+	.evhand = mlx4_evhand,
+};
+MODULE_VERSION(mlx4, 1);
+DECLARE_MODULE(mlx4, mlx4_mod, SI_SUB_SMP, SI_ORDER_ANY);
diff --git a/sys/ofed/drivers/net/mlx4/mcg.c b/sys/ofed/drivers/net/mlx4/mcg.c
new file mode 100644
index 0000000..70493e3
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/mcg.c
@@ -0,0 +1,366 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+
+#include <linux/mlx4/cmd.h>
+#include <linux/mlx4/driver.h>
+
+#include "mlx4.h"
+
+#define MGM_QPN_MASK       0x00FFFFFF
+#define MGM_BLCK_LB_BIT    30
+
+/*
+ * Layout of one multicast group entry as read and written through the
+ * READ_MCG/WRITE_MCG mailbox buffer.  The __be32 fields are converted with
+ * be32_to_cpu()/cpu_to_be32() wherever this file touches them.
+ */
+struct mlx4_mgm {
+	/* Index of the next entry in the hash chain, stored shifted left by 6. */
+	__be32			next_gid_index;
+	/* Bits 0-23: number of QPs in qp[]; bits 30-31: protocol. */
+	__be32			members_count;
+	u32			reserved[2];
+	/* An all-zero GID marks an unused entry. */
+	u8			gid[16];
+	/* Bits 0-23: QP number (MGM_QPN_MASK); bit 30: MGM_BLCK_LB_BIT. */
+	__be32			qp[MLX4_QP_PER_MGM];
+};
+
+static const u8 zero_gid[16];	/* automatically initialized to 0 */
+
+/*
+ * Issue the READ_MCG firmware command for entry @index, using @mailbox as
+ * the command buffer.  Returns the status of mlx4_cmd_box().
+ */
+static int mlx4_READ_MCG(struct mlx4_dev *dev, int index,
+			 struct mlx4_cmd_mailbox *mailbox)
+{
+	int status;
+
+	status = mlx4_cmd_box(dev, 0, mailbox->dma, index, 0,
+			      MLX4_CMD_READ_MCG, MLX4_CMD_TIME_CLASS_A);
+
+	return status;
+}
+
+/*
+ * Issue the WRITE_MCG firmware command for entry @index, using @mailbox as
+ * the command buffer.  Returns the status of mlx4_cmd().
+ */
+static int mlx4_WRITE_MCG(struct mlx4_dev *dev, int index,
+			  struct mlx4_cmd_mailbox *mailbox)
+{
+	int status;
+
+	status = mlx4_cmd(dev, mailbox->dma, index, 0,
+			  MLX4_CMD_WRITE_MCG, MLX4_CMD_TIME_CLASS_A);
+
+	return status;
+}
+
+/*
+ * Issue the MGID_HASH firmware command with @mailbox as input.  On success
+ * the 64-bit immediate result is stored, truncated to 16 bits, in *@hash
+ * and 0 is returned; on failure *@hash is left untouched and the command
+ * status is returned.
+ */
+static int mlx4_MGID_HASH(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			  u16 *hash)
+{
+	u64 result;
+	int status = mlx4_cmd_imm(dev, mailbox->dma, &result, 0, 0,
+				  MLX4_CMD_MGID_HASH, MLX4_CMD_TIME_CLASS_A);
+
+	if (status)
+		return status;
+
+	*hash = result;
+	return 0;
+}
+
+/*
+ * Caller must hold the MCG table mutex.  gid and mgm parameters must
+ * be properly aligned for command interface.
+ *
+ * Returns 0 on success.  Fails with the error from the scratch mailbox
+ * allocation, the status of a failing firmware command, or -EINVAL when
+ * an all-zero MGID is found in an AMGM entry of the hash chain.
+ *
+ * If GID is found in MGM or MGM is empty, *index = *hash, *prev = -1
+ * and *mgm holds MGM entry.
+ *
+ * if GID is found in AMGM, *index = index in AMGM, *prev = index of
+ * previous entry in hash chain and *mgm holds AMGM entry.
+ *
+ * If no AMGM exists for given gid, *index = -1, *prev = index of last
+ * entry in hash chain and *mgm holds end of hash chain.
+ */
+static int find_mgm(struct mlx4_dev *dev,
+		    u8 *gid, enum mlx4_mcast_prot prot,
+		    struct mlx4_cmd_mailbox *mgm_mailbox,
+		    u16 *hash, int *prev, int *index)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_mgm *mgm = mgm_mailbox->buf;
+	u8 *mgid;
+	int err;
+
+	/* Scratch mailbox, needed only to hand the GID to MGID_HASH. */
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);	/* same convention as attach/detach */
+	mgid = mailbox->buf;
+
+	memcpy(mgid, gid, 16);
+
+	err = mlx4_MGID_HASH(dev, mailbox, hash);
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	if (err)
+		return err;
+
+	/* Debug print deliberately compiled out. */
+	if (0)
+		mlx4_dbg(dev, "Hash for %pI6 is %04x\n", gid, *hash);
+
+	*index = *hash;
+	*prev  = -1;
+
+	/* Walk the hash chain, starting at the MGM entry selected by the hash. */
+	do {
+		err = mlx4_READ_MCG(dev, *index, mgm_mailbox);
+		if (err)
+			return err;
+
+		if (!memcmp(mgm->gid, zero_gid, 16)) {
+			/* A zero GID is only legal in the head (MGM) entry. */
+			if (*index != *hash) {
+				mlx4_err(dev, "Found zero MGID in AMGM.\n");
+				err = -EINVAL;
+			}
+			return err;
+		}
+
+		/* A match needs both the GID and the protocol bits (31:30). */
+		if (!memcmp(mgm->gid, gid, 16) &&
+				(prot == be32_to_cpu(mgm->members_count) >> 30))
+			return err;
+
+		*prev = *index;
+		*index = be32_to_cpu(mgm->next_gid_index) >> 6;
+	} while (*index);
+
+	/* End of chain reached without a match. */
+	*index = -1;
+	return err;
+}
+
+/*
+ * Add @qp to the multicast group identified by @gid and @prot, creating
+ * the group entry (and linking a new AMGM entry into the hash chain) when
+ * it does not exist yet.
+ *
+ * Returns 0 on success or when @qp is already a member, -ENOMEM when no
+ * AMGM entry is available or the group entry is full, and otherwise the
+ * error from the mailbox allocation or from a failing firmware command.
+ *
+ * NOTE(review): @block_mcast_loopback is not consulted; the block-loopback
+ * bit is taken from the global mlx4_blck_lb instead.  Left unchanged here
+ * because callers may depend on it - confirm this is intended.
+ */
+int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
+			  int block_mcast_loopback, enum mlx4_mcast_prot prot)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_mgm *mgm;
+	u32 members_count;
+	u16 hash;
+	int index, prev;
+	int link = 0;
+	int i;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	mgm = mailbox->buf;
+
+	mutex_lock(&priv->mcg_table.mutex);
+
+	err = find_mgm(dev, gid, prot, mailbox, &hash, &prev, &index);
+	if (err)
+		goto out;
+
+	if (index != -1) {
+		/* Existing entry, or an empty head entry that we claim now. */
+		if (!memcmp(mgm->gid, zero_gid, 16))
+			memcpy(mgm->gid, gid, 16);
+	} else {
+		/* No entry for this GID: allocate an AMGM entry to chain in. */
+		link = 1;
+
+		index = mlx4_bitmap_alloc(&priv->mcg_table.bitmap);
+		if (index == -1) {
+			mlx4_err(dev, "No AMGM entries left\n");
+			err = -ENOMEM;
+			goto out;
+		}
+		index += dev->caps.num_mgms;
+
+		memset(mgm, 0, sizeof *mgm);
+		memcpy(mgm->gid, gid, 16);
+	}
+
+	members_count = be32_to_cpu(mgm->members_count) & 0xffffff;
+	/*
+	 * ">=" rather than "==": the count comes from the device, and a
+	 * value above the array size must never be used to index qp[].
+	 */
+	if (members_count >= MLX4_QP_PER_MGM) {
+		mlx4_err(dev, "MGM at index %x is full.\n", index);
+		err = -ENOMEM;
+		goto out;
+	}
+
+	for (i = 0; i < members_count; ++i)
+		if ((be32_to_cpu(mgm->qp[i]) & MGM_QPN_MASK) == qp->qpn) {
+			mlx4_dbg(dev, "QP %06x already a member of MGM\n", qp->qpn);
+			err = 0;
+			goto out;
+		}
+
+	mgm->qp[members_count++] = cpu_to_be32((qp->qpn & MGM_QPN_MASK) |
+					       (!!mlx4_blck_lb << MGM_BLCK_LB_BIT));
+
+	mgm->members_count = cpu_to_be32(members_count | ((u32) prot << 30));
+
+	err = mlx4_WRITE_MCG(dev, index, mailbox);
+	if (err)
+		goto out;
+
+	if (!link)
+		goto out;
+
+	/* Hook the new AMGM entry onto the tail of the hash chain. */
+	err = mlx4_READ_MCG(dev, prev, mailbox);
+	if (err)
+		goto out;
+
+	mgm->next_gid_index = cpu_to_be32(index << 6);
+
+	err = mlx4_WRITE_MCG(dev, prev, mailbox);
+	if (err)
+		goto out;
+
+out:
+	/* Give back the AMGM entry allocated above if linking it failed. */
+	if (err && link && index != -1) {
+		if (index < dev->caps.num_mgms)
+			mlx4_warn(dev, "Got AMGM index %d < %d\n",
+				  index, dev->caps.num_mgms);
+		else
+			mlx4_bitmap_free(&priv->mcg_table.bitmap,
+					 index - dev->caps.num_mgms);
+	}
+	mutex_unlock(&priv->mcg_table.mutex);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_multicast_attach);
+
+/*
+ * Remove @qp from the multicast group identified by @gid and @prot.  When
+ * the last member is removed the group entry itself is released: a head
+ * (MGM) entry is overwritten by its successor or zeroed, an AMGM entry is
+ * unlinked from the hash chain and returned to the bitmap.
+ *
+ * Returns 0 on success, -EINVAL when the group or the QP is not found, and
+ * otherwise the error from the mailbox allocation or from a failing
+ * firmware command.
+ */
+int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
+						enum mlx4_mcast_prot prot)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_mgm *mgm;
+	u32 members_count;
+	u16 hash;
+	int prev, index;
+	int i, loc;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	mgm = mailbox->buf;
+
+	mutex_lock(&priv->mcg_table.mutex);
+
+	err = find_mgm(dev, gid, prot, mailbox, &hash, &prev, &index);
+	if (err)
+		goto out;
+
+	if (index == -1) {
+		mlx4_err(dev, "MGID %pI6 not found\n", gid);
+		err = -EINVAL;
+		goto out;
+	}
+
+	/* After this loop i equals the member count before the removal. */
+	members_count = be32_to_cpu(mgm->members_count) & 0xffffff;
+	for (loc = -1, i = 0; i < members_count; ++i)
+		if ((be32_to_cpu(mgm->qp[i]) & MGM_QPN_MASK) == qp->qpn)
+			loc = i;
+
+	if (loc == -1) {
+		mlx4_err(dev, "QP %06x not found in MGM\n", qp->qpn);
+		err = -EINVAL;
+		goto out;
+	}
+
+
+	/* Move the last member into the freed slot and shrink the list. */
+	mgm->members_count = cpu_to_be32(--members_count | ((u32) prot << 30));
+	mgm->qp[loc]       = mgm->qp[i - 1];
+	mgm->qp[i - 1]     = 0;
+
+	/* Other members remain: just write the updated entry back. */
+	if (i != 1) {
+		err = mlx4_WRITE_MCG(dev, index, mailbox);
+		goto out;
+	}
+
+	if (prev == -1) {
+		/* Remove entry from MGM */
+		int amgm_index = be32_to_cpu(mgm->next_gid_index) >> 6;
+		if (amgm_index) {
+			/* Pull the successor into the mailbox; it becomes the head. */
+			err = mlx4_READ_MCG(dev, amgm_index, mailbox);
+			if (err)
+				goto out;
+		} else
+			memset(mgm->gid, 0, 16);
+
+		err = mlx4_WRITE_MCG(dev, index, mailbox);
+		if (err)
+			goto out;
+
+		if (amgm_index) {
+			if (amgm_index < dev->caps.num_mgms)
+				mlx4_warn(dev, "MGM entry %d had AMGM index %d < %d\n",
+					  index, amgm_index, dev->caps.num_mgms);
+			else
+				mlx4_bitmap_free(&priv->mcg_table.bitmap,
+						 amgm_index - dev->caps.num_mgms);
+		}
+	} else {
+		/* Remove entry from AMGM */
+		int cur_next_index = be32_to_cpu(mgm->next_gid_index) >> 6;
+		err = mlx4_READ_MCG(dev, prev, mailbox);
+		if (err)
+			goto out;
+
+		/* Make the predecessor skip over the entry being removed. */
+		mgm->next_gid_index = cpu_to_be32(cur_next_index << 6);
+
+		err = mlx4_WRITE_MCG(dev, prev, mailbox);
+		if (err)
+			goto out;
+
+		if (index < dev->caps.num_mgms)
+			mlx4_warn(dev, "entry %d had next AMGM index %d < %d\n",
+				  prev, index, dev->caps.num_mgms);
+		else
+			mlx4_bitmap_free(&priv->mcg_table.bitmap,
+					 index - dev->caps.num_mgms);
+	}
+
+out:
+	mutex_unlock(&priv->mcg_table.mutex);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_multicast_detach);
+
+/*
+ * Set up the multicast group table state: a bitmap with caps.num_amgms
+ * entries and, once that succeeded, the table mutex.  Returns 0 on success
+ * or the error from mlx4_bitmap_init().
+ */
+int mlx4_init_mcg_table(struct mlx4_dev *dev)
+{
+	struct mlx4_mcg_table *mcg = &mlx4_priv(dev)->mcg_table;
+	int rc;
+
+	rc = mlx4_bitmap_init(&mcg->bitmap, dev->caps.num_amgms,
+			      dev->caps.num_amgms - 1, 0, 0);
+	if (!rc)
+		mutex_init(&mcg->mutex);
+
+	return rc;
+}
+
+/* Release the bitmap set up by mlx4_init_mcg_table(). */
+void mlx4_cleanup_mcg_table(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	mlx4_bitmap_cleanup(&priv->mcg_table.bitmap);
+}
diff --git a/sys/ofed/drivers/net/mlx4/mlx4.h b/sys/ofed/drivers/net/mlx4/mlx4.h
new file mode 100644
index 0000000..d5d3da9
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/mlx4.h
@@ -0,0 +1,427 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_H
+#define MLX4_H
+
+#include <linux/mutex.h>
+#include <linux/radix-tree.h>
+#include <linux/timer.h>
+#include <linux/workqueue.h>
+
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/driver.h>
+#include <linux/mlx4/doorbell.h>
+
+#define DRV_NAME	"mlx4_core"
+#define PFX		DRV_NAME ": "
+#define DRV_VERSION	"1.0-ofed1.5.2"
+#define DRV_RELDATE	"August 4, 2010"
+
+enum {
+	MLX4_HCR_BASE		= 0x80680,
+	MLX4_HCR_SIZE		= 0x0001c,
+	MLX4_CLR_INT_SIZE	= 0x00008
+};
+
+enum {
+	MLX4_MGM_ENTRY_SIZE	=  0x100,
+	/*
+	 * Four QP slots per 16 bytes of the entry, after subtracting two
+	 * 16-byte units; evaluates to 56 for an entry size of 0x100.
+	 */
+	MLX4_QP_PER_MGM		= 4 * (MLX4_MGM_ENTRY_SIZE / 16 - 2),
+	MLX4_MTT_ENTRY_PER_SEG	= 8
+};
+
+enum {
+	MLX4_NUM_PDS		= 1 << 15
+};
+
+enum {
+	MLX4_CMPT_TYPE_QP	= 0,
+	MLX4_CMPT_TYPE_SRQ	= 1,
+	MLX4_CMPT_TYPE_CQ	= 2,
+	MLX4_CMPT_TYPE_EQ	= 3,
+	MLX4_CMPT_NUM_TYPE
+};
+
+enum {
+	MLX4_CMPT_SHIFT		= 24,
+	MLX4_NUM_CMPTS		= MLX4_CMPT_NUM_TYPE << MLX4_CMPT_SHIFT
+};
+
+#ifdef CONFIG_MLX4_DEBUG
+extern int mlx4_debug_level;
+#else /* CONFIG_MLX4_DEBUG */
+#define mlx4_debug_level	(0)
+#endif /* CONFIG_MLX4_DEBUG */
+
+/*
+ * Logging helpers that print through the PCI device embedded in an mlx4
+ * device.  @mdev is parenthesised so that an expression argument (for
+ * example "&priv->dev") binds correctly before "->pdev" is applied.
+ * mlx4_dbg() prints only when mlx4_debug_level is non-zero.
+ */
+#define mlx4_dbg(mdev, format, arg...)					\
+	do {								\
+		if (mlx4_debug_level)					\
+			dev_printk(KERN_DEBUG, &(mdev)->pdev->dev, format, ## arg); \
+	} while (0)
+
+#define mlx4_err(mdev, format, arg...) \
+	dev_err(&(mdev)->pdev->dev, format, ## arg)
+#define mlx4_info(mdev, format, arg...) \
+	dev_info(&(mdev)->pdev->dev, format, ## arg)
+#define mlx4_warn(mdev, format, arg...) \
+	dev_warn(&(mdev)->pdev->dev, format, ## arg)
+
+extern int mlx4_blck_lb;
+
+struct mlx4_bitmap {
+	u32			last;
+	u32			top;
+	u32			max;
+	u32                     reserved_top;
+	u32			mask;
+	u32			avail;
+	spinlock_t		lock;
+	unsigned long	       *table;
+};
+
+struct mlx4_buddy {
+	unsigned long	      **bits;
+	unsigned int	       *num_free;
+	int			max_order;
+	spinlock_t		lock;
+};
+
+struct mlx4_icm;
+
+struct mlx4_icm_table {
+	u64			virt;
+	int			num_icm;
+	int			num_obj;
+	int			obj_size;
+	int			lowmem;
+	int			coherent;
+	struct mutex		mutex;
+	struct mlx4_icm	      **icm;
+};
+
+struct mlx4_eq {
+	struct mlx4_dev	       *dev;
+	void __iomem	       *doorbell;
+	int			eqn;
+	u32			cons_index;
+	u16			irq;
+	u16			have_irq;
+	int			nent;
+	int			load;
+	struct mlx4_buf_list   *page_list;
+	struct mlx4_mtt		mtt;
+};
+
+struct mlx4_profile {
+	int			num_qp;
+	int			rdmarc_per_qp;
+	int			num_srq;
+	int			num_cq;
+	int			num_mcg;
+	int			num_mpt;
+	int			num_mtt;
+};
+
+struct mlx4_fw {
+	u64			clr_int_base;
+	u64			catas_offset;
+	struct mlx4_icm	       *fw_icm;
+	struct mlx4_icm	       *aux_icm;
+	u32			catas_size;
+	u16			fw_pages;
+	u8			clr_int_bar;
+	u8			catas_bar;
+};
+
+struct mlx4_cmd {
+	struct pci_pool	       *pool;
+	void __iomem	       *hcr;
+	struct mutex		hcr_mutex;
+	struct semaphore	poll_sem;
+	struct semaphore	event_sem;
+	int			max_cmds;
+	spinlock_t		context_lock;
+	int			free_head;
+	struct mlx4_cmd_context *context;
+	u16			token_mask;
+	u8			use_events;
+	u8			toggle;
+};
+
+struct mlx4_uar_table {
+	struct mlx4_bitmap	bitmap;
+};
+
+struct mlx4_mr_table {
+	struct mlx4_bitmap	mpt_bitmap;
+	struct mlx4_buddy	mtt_buddy;
+	u64			mtt_base;
+	u64			mpt_base;
+	struct mlx4_icm_table	mtt_table;
+	struct mlx4_icm_table	dmpt_table;
+};
+
+struct mlx4_cq_table {
+	struct mlx4_bitmap	bitmap;
+	spinlock_t		lock;
+	struct radix_tree_root	tree;
+	struct mlx4_icm_table	table;
+	struct mlx4_icm_table	cmpt_table;
+};
+
+struct mlx4_eq_table {
+	struct mlx4_bitmap	bitmap;
+	char		       *irq_names;
+	void __iomem	       *clr_int;
+	void __iomem	      **uar_map;
+	u32			clr_mask;
+	struct mlx4_eq	       *eq;
+	struct mlx4_icm_table	table;
+	struct mlx4_icm_table	cmpt_table;
+	int			have_irq;
+	u8			inta_pin;
+};
+
+struct mlx4_srq_table {
+	struct mlx4_bitmap	bitmap;
+	spinlock_t		lock;
+	struct mlx4_icm_table	table;
+	struct mlx4_icm_table	cmpt_table;
+};
+
+struct mlx4_qp_table {
+	struct mlx4_bitmap	bitmap;
+	u32			rdmarc_base;
+	int			rdmarc_shift;
+	spinlock_t		lock;
+	struct mlx4_icm_table	qp_table;
+	struct mlx4_icm_table	auxc_table;
+	struct mlx4_icm_table	altc_table;
+	struct mlx4_icm_table	rdmarc_table;
+	struct mlx4_icm_table	cmpt_table;
+};
+
+/* Per-device multicast group table state. */
+struct mlx4_mcg_table {
+	/* Taken by mlx4_multicast_attach()/mlx4_multicast_detach(). */
+	struct mutex		mutex;
+	/* Allocator for AMGM entries; sized from caps.num_amgms. */
+	struct mlx4_bitmap	bitmap;
+	struct mlx4_icm_table	table;
+};
+
+struct mlx4_catas_err {
+	u32 __iomem	       *map;
+	struct timer_list	timer;
+	struct list_head	list;
+};
+
+#define MLX4_MAX_MAC_NUM	128
+/* MLX4_MAX_MAC_NUM shifted left by 3, i.e. 8 bytes per __be64 entry. */
+#define MLX4_MAC_TABLE_SIZE	(MLX4_MAX_MAC_NUM << 3)
+
+/* MAC table kept for each port (see struct mlx4_port_info). */
+struct mlx4_mac_table {
+	__be64			entries[MLX4_MAX_MAC_NUM];
+	int			refs[MLX4_MAX_MAC_NUM];
+	struct mutex		mutex;
+	int			total;
+	int			max;
+};
+
+#define MLX4_MAX_VLAN_NUM	128
+/* MLX4_MAX_VLAN_NUM shifted left by 2, i.e. 4 bytes per __be32 entry. */
+#define MLX4_VLAN_TABLE_SIZE	(MLX4_MAX_VLAN_NUM << 2)
+
+/* VLAN table kept for each port (see struct mlx4_port_info). */
+struct mlx4_vlan_table {
+	__be32			entries[MLX4_MAX_VLAN_NUM];
+	int			refs[MLX4_MAX_VLAN_NUM];
+	struct mutex		mutex;
+	int			total;
+	int			max;
+};
+
+struct mlx4_port_info {
+	struct mlx4_dev	       *dev;
+	int			port;
+	char			dev_name[16];
+	struct device_attribute port_attr;
+	enum mlx4_port_type	tmp_type;
+	struct mlx4_mac_table	mac_table;
+	struct mlx4_vlan_table	vlan_table;
+};
+
+struct mlx4_sense {
+	struct mlx4_dev		*dev;
+	u8			do_sense_port[MLX4_MAX_PORTS + 1];
+	u8			sense_allowed[MLX4_MAX_PORTS + 1];
+	struct delayed_work	sense_poll;
+	struct workqueue_struct	*sense_wq;
+	u32			resched;
+};
+
+extern struct mutex drv_mutex;
+
+struct mlx4_priv {
+	struct mlx4_dev		dev;
+
+	struct list_head	dev_list;
+	struct list_head	ctx_list;
+	spinlock_t		ctx_lock;
+
+	struct list_head        pgdir_list;
+	struct mutex            pgdir_mutex;
+
+	struct mlx4_fw		fw;
+	struct mlx4_cmd		cmd;
+
+	struct mlx4_bitmap	pd_bitmap;
+	struct mlx4_bitmap	xrcd_bitmap;
+	struct mlx4_uar_table	uar_table;
+	struct mlx4_mr_table	mr_table;
+	struct mlx4_cq_table	cq_table;
+	struct mlx4_eq_table	eq_table;
+	struct mlx4_srq_table	srq_table;
+	struct mlx4_qp_table	qp_table;
+	struct mlx4_mcg_table	mcg_table;
+	struct mlx4_bitmap	counters_bitmap;
+	struct list_head	bf_list;
+	struct mutex		bf_mutex;
+
+	struct mlx4_catas_err	catas_err;
+
+	void __iomem	       *clr_base;
+
+	struct mlx4_uar		driver_uar;
+	void __iomem	       *kar;
+	struct mlx4_port_info	port[MLX4_MAX_PORTS + 1];
+	struct device_attribute trigger_attr;
+	int                     trig;
+	int                     changed_ports;
+	struct mlx4_sense       sense;
+	struct mutex		port_mutex;
+	int			iboe_counter_index[MLX4_MAX_PORTS];
+	struct io_mapping      *bf_mapping;
+};
+
+/*
+ * Recover the enclosing struct mlx4_priv from a pointer to its embedded
+ * "dev" member.
+ */
+static inline struct mlx4_priv *mlx4_priv(struct mlx4_dev *dev)
+{
+	return container_of(dev, struct mlx4_priv, dev);
+}
+
+#define MLX4_SENSE_RANGE	(HZ * 3)
+
+extern struct workqueue_struct *mlx4_wq;
+
+u32 mlx4_bitmap_alloc(struct mlx4_bitmap *bitmap);
+void mlx4_bitmap_free(struct mlx4_bitmap *bitmap, u32 obj);
+u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt, int align);
+void mlx4_bitmap_free_range(struct mlx4_bitmap *bitmap, u32 obj, int cnt);
+u32 mlx4_bitmap_avail(struct mlx4_bitmap *bitmap);
+int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, u32 num, u32 mask,
+		     u32 reserved_bot, u32 reserved_top);
+void mlx4_bitmap_cleanup(struct mlx4_bitmap *bitmap);
+
+int mlx4_reset(struct mlx4_dev *dev);
+
+int mlx4_alloc_eq_table(struct mlx4_dev *dev);
+void mlx4_free_eq_table(struct mlx4_dev *dev);
+
+int mlx4_init_pd_table(struct mlx4_dev *dev);
+int mlx4_init_xrcd_table(struct mlx4_dev *dev);
+int mlx4_init_uar_table(struct mlx4_dev *dev);
+int mlx4_init_mr_table(struct mlx4_dev *dev);
+int mlx4_init_eq_table(struct mlx4_dev *dev);
+int mlx4_init_cq_table(struct mlx4_dev *dev);
+int mlx4_init_qp_table(struct mlx4_dev *dev);
+int mlx4_init_srq_table(struct mlx4_dev *dev);
+int mlx4_init_mcg_table(struct mlx4_dev *dev);
+
+void mlx4_cleanup_pd_table(struct mlx4_dev *dev);
+void mlx4_cleanup_uar_table(struct mlx4_dev *dev);
+void mlx4_cleanup_mr_table(struct mlx4_dev *dev);
+void mlx4_cleanup_eq_table(struct mlx4_dev *dev);
+void mlx4_cleanup_cq_table(struct mlx4_dev *dev);
+void mlx4_cleanup_qp_table(struct mlx4_dev *dev);
+void mlx4_cleanup_srq_table(struct mlx4_dev *dev);
+void mlx4_cleanup_mcg_table(struct mlx4_dev *dev);
+void mlx4_cleanup_xrcd_table(struct mlx4_dev *dev);
+
+void mlx4_start_catas_poll(struct mlx4_dev *dev);
+void mlx4_stop_catas_poll(struct mlx4_dev *dev);
+void mlx4_catas_init(void);
+int mlx4_restart_one(struct pci_dev *pdev);
+int mlx4_register_device(struct mlx4_dev *dev);
+void mlx4_unregister_device(struct mlx4_dev *dev);
+void mlx4_dispatch_event(struct mlx4_dev *dev, enum mlx4_dev_event type, int port);
+void *mlx4_find_get_prot_dev(struct mlx4_dev *dev, enum mlx4_prot proto, int port);
+
+struct mlx4_dev_cap;
+struct mlx4_init_hca_param;
+
+u64 mlx4_make_profile(struct mlx4_dev *dev,
+		      struct mlx4_profile *request,
+		      struct mlx4_dev_cap *dev_cap,
+		      struct mlx4_init_hca_param *init_hca);
+
+int mlx4_cmd_init(struct mlx4_dev *dev);
+void mlx4_cmd_cleanup(struct mlx4_dev *dev);
+void mlx4_cmd_event(struct mlx4_dev *dev, u16 token, u8 status, u64 out_param);
+int mlx4_cmd_use_events(struct mlx4_dev *dev);
+void mlx4_cmd_use_polling(struct mlx4_dev *dev);
+
+void mlx4_cq_completion(struct mlx4_dev *dev, u32 cqn);
+void mlx4_cq_event(struct mlx4_dev *dev, u32 cqn, int event_type);
+
+void mlx4_qp_event(struct mlx4_dev *dev, u32 qpn, int event_type);
+
+void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type);
+
+void mlx4_handle_catas_err(struct mlx4_dev *dev);
+
+void mlx4_do_sense_ports(struct mlx4_dev *dev,
+			 enum mlx4_port_type *stype,
+			 enum mlx4_port_type *defaults);
+void mlx4_start_sense(struct mlx4_dev *dev);
+void mlx4_stop_sense(struct mlx4_dev *dev);
+int mlx4_sense_init(struct mlx4_dev *dev);
+void mlx4_sense_cleanup(struct mlx4_dev *dev);
+int mlx4_check_port_params(struct mlx4_dev *dev,
+			   enum mlx4_port_type *port_type);
+int mlx4_change_port_types(struct mlx4_dev *dev,
+			   enum mlx4_port_type *port_types);
+
+void mlx4_init_mac_table(struct mlx4_dev *dev, struct mlx4_mac_table *table);
+void mlx4_init_vlan_table(struct mlx4_dev *dev, struct mlx4_vlan_table *table);
+
+int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port);
+int mlx4_get_port_ib_caps(struct mlx4_dev *dev, u8 port, __be32 *caps);
+
+#endif /* MLX4_H */
diff --git a/sys/ofed/drivers/net/mlx4/mlx4_en.h b/sys/ofed/drivers/net/mlx4/mlx4_en.h
new file mode 100644
index 0000000..6ab258e
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/mlx4_en.h
@@ -0,0 +1,651 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef _MLX4_EN_H_
+#define _MLX4_EN_H_
+
+#include <sys/cdefs.h>
+
+#include <linux/types.h>
+#include <linux/compiler.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/netdevice.h>
+
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/qp.h>
+#include <linux/mlx4/cq.h>
+#include <linux/mlx4/srq.h>
+#include <linux/mlx4/doorbell.h>
+#include <linux/mlx4/cmd.h>
+
+#include <net/if_media.h>
+#include <netinet/tcp_lro.h>
+
+#include "en_port.h"
+
+#define DRV_NAME	"mlx4_en"
+#define DRV_VERSION	"1.5.2"
+#define DRV_RELDATE	"July 2010"
+
+/* XXX */
+#define	NETIF_MSG_LINK		0x1
+#define	NETIF_MSG_IFDOWN	0x2
+#define	NETIF_MSG_HW		0x4
+#define	NETIF_MSG_DRV		0x8
+#define	NETIF_MSG_INTR		0x10
+#define	NETIF_MSG_RX_ERR	0x20
+
+#define MLX4_EN_MSG_LEVEL	(NETIF_MSG_LINK | NETIF_MSG_IFDOWN)
+
+/*
+ * Per-interface logging helpers.
+ *
+ * en_print() prefixes the message with DRV_NAME and, once the interface is
+ * registered ((priv)->registered != 0), with the interface name; before
+ * that it falls back to the PCI device name plus the port number.
+ *
+ * en_dbg() only prints when the NETIF_MSG_<mlevel> bit is set in
+ * (priv)->msg_enable.
+ *
+ * Both macros expand to a single statement (do { } while (0)) so they are
+ * safe inside unbraced if/else bodies; invoke them with a trailing ';'.
+ * Note that 'priv' is evaluated more than once.
+ */
+#define en_print(level, priv, format, arg...)			\
+	do {							\
+		if ((priv)->registered)				\
+			printk(level "%s: %s: " format, DRV_NAME, \
+				(priv)->dev->if_xname, ## arg);	\
+		else						\
+			printk(level "%s: %s: Port %d: " format, \
+				DRV_NAME,			\
+				dev_name(&(priv)->mdev->pdev->dev), \
+				(priv)->port, ## arg);		\
+	} while (0)
+
+#define en_dbg(mlevel, priv, format, arg...)			\
+	do {							\
+		if (NETIF_MSG_##mlevel & (priv)->msg_enable)	\
+			en_print(KERN_DEBUG, priv, format, ## arg); \
+	} while (0)
+#define en_warn(priv, format, arg...) \
+	en_print(KERN_WARNING, priv, format, ## arg)
+#define en_err(priv, format, arg...) \
+	en_print(KERN_ERR, priv, format, ## arg)
+#define en_info(priv, format, arg...) \
+	en_print(KERN_INFO, priv, format, ## arg)
+
+/*
+ * Device-level logging helpers: the message is prefixed with DRV_NAME and
+ * the name of the PCI device reached through (mdev)->pdev.  The 'mdev'
+ * argument is parenthesized so that any pointer expression may be passed.
+ */
+#define mlx4_err(mdev, format, arg...) \
+	printk(KERN_ERR "%s %s: " format , DRV_NAME ,\
+		dev_name(&(mdev)->pdev->dev) , ## arg)
+#define mlx4_info(mdev, format, arg...) \
+	printk(KERN_INFO "%s %s: " format , DRV_NAME ,\
+		dev_name(&(mdev)->pdev->dev) , ## arg)
+#define mlx4_warn(mdev, format, arg...) \
+	printk(KERN_WARNING "%s %s: " format , DRV_NAME ,\
+		dev_name(&(mdev)->pdev->dev) , ## arg)
+
+/*
+ * Device constants
+ */
+
+
+#define MLX4_EN_PAGE_SHIFT	12
+#define MLX4_EN_PAGE_SIZE	(1 << MLX4_EN_PAGE_SHIFT)
+#define MAX_TX_RINGS		(MLX4_EN_NUM_HASH_RINGS + 1 + MLX4_EN_NUM_PPP_RINGS)
+#define MAX_RX_RINGS		16
+#define TXBB_SIZE		64
+#define HEADROOM		(2048 / TXBB_SIZE + 1)
+#define STAMP_STRIDE		64
+#define STAMP_DWORDS		(STAMP_STRIDE / 4)
+#define STAMP_SHIFT		31
+#define STAMP_VAL		0x7fffffff
+#define STATS_DELAY		(HZ / 4)
+
+/* Typical TSO descriptor with 16 gather entries is 352 bytes... */
+#define MAX_DESC_SIZE		512
+#define MAX_DESC_TXBBS		(MAX_DESC_SIZE / TXBB_SIZE)
+
+/*
+ * OS related constants and tunables
+ */
+
+#define MLX4_EN_WATCHDOG_TIMEOUT	(15 * HZ)
+
+#define MLX4_EN_MAX_LRO_DESCRIPTORS	32
+#define MLX4_EN_NUM_IPFRAG_SESSIONS	16
+
+/*
+ * Receive fragment sizes; we use at most 3 fragments (for 9600 byte MTU
+ * and 4K allocations).
+ *
+ * The first fragment is always a standard cluster (MCLBYTES); the
+ * remaining ones are page-sized jumbo clusters (MJUMPAGESIZE).  Only
+ * MJUMPAGESIZE values of 4096 and 8192 are supported; anything else is a
+ * build error.
+ */
+#if MJUMPAGESIZE == 4096
+enum {
+	FRAG_SZ0 = MCLBYTES,
+	FRAG_SZ1 = MJUMPAGESIZE,
+	FRAG_SZ2 = MJUMPAGESIZE,
+};
+#define MLX4_EN_MAX_RX_FRAGS	3
+#elif MJUMPAGESIZE == 8192
+enum {
+	FRAG_SZ0 = MCLBYTES,
+	FRAG_SZ1 = MJUMPAGESIZE,
+};
+#define MLX4_EN_MAX_RX_FRAGS	2
+#else
+#error	"Unknown PAGE_SIZE"
+#endif
+
+/* Maximum ring sizes */
+#define MLX4_EN_MAX_TX_SIZE	8192
+#define MLX4_EN_MAX_RX_SIZE	8192
+
+#define MLX4_EN_MIN_RX_SIZE	(128)
+#define MLX4_EN_MIN_TX_SIZE	(4096 / TXBB_SIZE)
+
+#define MLX4_EN_SMALL_PKT_SIZE		64
+#define MLX4_EN_TX_HASH_SIZE		256
+#define MLX4_EN_TX_HASH_MASK		(MLX4_EN_TX_HASH_SIZE - 1)
+#define MLX4_EN_NUM_HASH_RINGS		4
+#define MLX4_EN_NUM_PPP_RINGS		8
+#define MLX4_EN_DEF_TX_RING_SIZE	512
+#define MLX4_EN_DEF_TX_QUEUE_SIZE	4096
+#define MLX4_EN_DEF_RX_RING_SIZE  	1024
+#define	MLX4_EN_MAX_RX_POLL		16
+
+/* Target number of bytes to coalesce with interrupt moderation */
+#define MLX4_EN_RX_COAL_TARGET	0x20000
+#define MLX4_EN_RX_COAL_TIME	0x10
+
+#define MLX4_EN_TX_COAL_PKTS	5
+#define MLX4_EN_TX_COAL_TIME	0x80
+
+#define MLX4_EN_RX_RATE_LOW		400000
+#define MLX4_EN_RX_COAL_TIME_LOW	0
+#define MLX4_EN_RX_RATE_HIGH		450000
+#define MLX4_EN_RX_COAL_TIME_HIGH	128
+#define MLX4_EN_RX_SIZE_THRESH		1024
+#define MLX4_EN_RX_RATE_THRESH		(1000000 / MLX4_EN_RX_COAL_TIME_HIGH)
+#define MLX4_EN_SAMPLE_INTERVAL		0
+#define MLX4_EN_AVG_PKT_SMALL		256
+
+#define MLX4_EN_AUTO_CONF	0xffff
+
+#define MLX4_EN_DEF_RX_PAUSE	1
+#define MLX4_EN_DEF_TX_PAUSE	1
+
+/* Interval between successive polls in the Tx routine when polling is used
+   instead of interrupts (in per-core Tx rings) - should be a power of 2 */
+#define MLX4_EN_TX_POLL_MODER	16
+#define MLX4_EN_TX_POLL_TIMEOUT	(HZ / 4)
+
+#define ETH_LLC_SNAP_SIZE	8
+
+#define SMALL_PACKET_SIZE      (MHLEN)
+#define HEADER_COPY_SIZE       (128)
+#define MLX4_LOOPBACK_TEST_PAYLOAD (HEADER_COPY_SIZE - ETHER_HDR_LEN)
+
+#define MLX4_EN_MIN_MTU		46
+#define ETH_BCAST		0xffffffffffffULL
+
+#define MLX4_EN_LOOPBACK_RETRIES	5
+#define MLX4_EN_LOOPBACK_TIMEOUT	100
+
+#ifdef MLX4_EN_PERF_STAT
+/* Number of samples to 'average' */
+#define AVG_SIZE			128
+#define AVG_FACTOR			1024
+#define NUM_PERF_STATS			NUM_PERF_COUNTERS
+
+#define INC_PERF_COUNTER(cnt)		(++(cnt))
+#define ADD_PERF_COUNTER(cnt, add)	((cnt) += (add))
+#define AVG_PERF_COUNTER(cnt, sample) \
+	((cnt) = ((cnt) * (AVG_SIZE - 1) + (sample) * AVG_FACTOR) / AVG_SIZE)
+#define GET_PERF_COUNTER(cnt)		(cnt)
+#define GET_AVG_PERF_COUNTER(cnt)	((cnt) / AVG_FACTOR)
+
+#else
+
+#define NUM_PERF_STATS			0
+#define INC_PERF_COUNTER(cnt)		do {} while (0)
+#define ADD_PERF_COUNTER(cnt, add)	do {} while (0)
+#define AVG_PERF_COUNTER(cnt, sample)	do {} while (0)
+#define GET_PERF_COUNTER(cnt)		(0)
+#define GET_AVG_PERF_COUNTER(cnt)	(0)
+#endif /* MLX4_EN_PERF_STAT */
+
+/*
+ * Configurables
+ */
+
+/*
+ * Kind of completion queue.  Stored in the is_tx member of struct
+ * mlx4_en_cq and passed as the 'mode' argument of mlx4_en_create_cq().
+ */
+enum cq_type {
+	RX = 0,
+	TX = 1,
+};
+
+
+/*
+ * Useful macros
+ */
+/* log2 of x after rounding x up to a power of two */
+#define ROUNDUP_LOG2(x)		ilog2(roundup_pow_of_two(x))
+/* non-zero when x and y have the same truth value */
+#define XNOR(x, y)		(!(x) == !(y))
+/*
+ * non-zero for the all-ones (see ETH_BCAST) and the all-zero 48-bit MAC;
+ * 'addr' is evaluated twice
+ */
+#define ILLEGAL_MAC(addr)	((addr) == 0xffffffffffffULL || (addr) == 0x0)
+
+
+/*
+ * Software bookkeeping kept per TX ring entry (struct mlx4_en_tx_ring
+ * holds a pointer to an array of these in its tx_info member).
+ */
+struct mlx4_en_tx_info {
+	struct mbuf *mb;	/* mbuf tracked by this entry */
+	u32 nr_txbb;		/* presumably TXBBs consumed -- TODO confirm */
+	u8 nr_segs;
+	u8 data_offset;
+	u8 inl;			/* presumably "sent inline" flag -- TODO confirm */
+};
+
+
+#define MLX4_EN_BIT_DESC_OWN	0x80000000
+#define CTRL_SIZE	sizeof(struct mlx4_wqe_ctrl_seg)
+#define MLX4_EN_MEMTYPE_PAD	0x100
+#define DS_SIZE		sizeof(struct mlx4_wqe_data_seg)
+
+
+/*
+ * TX work queue entry: a control segment followed by exactly one of a
+ * data, LSO or inline segment, overlaid through an anonymous union.
+ */
+struct mlx4_en_tx_desc {
+	struct mlx4_wqe_ctrl_seg ctrl;
+	union {
+		struct mlx4_wqe_data_seg data; /* at least one data segment */
+		struct mlx4_wqe_lso_seg lso;
+		struct mlx4_wqe_inline_seg inl;
+	};
+};
+
+#define MLX4_EN_USE_SRQ		0x01000000
+
+/*
+ * State of one transmit ring.  struct mlx4_en_priv embeds MAX_TX_RINGS of
+ * these.  Field meanings not spelled out below are not derivable from this
+ * header alone.
+ */
+struct mlx4_en_tx_ring {
+	spinlock_t tx_lock;
+	struct mlx4_hwq_resources wqres;
+	u32 size ; /* number of TXBBs */
+	u32 size_mask;
+	u16 stride;
+	u16 cqn;	/* index of port CQ associated with this ring */
+	u32 prod;	/* presumably producer index -- TODO confirm */
+	u32 cons;	/* presumably consumer index -- TODO confirm */
+	u32 buf_size;
+	u32 doorbell_qpn;
+	void *buf;
+	u16 poll_cnt;
+	int blocked;
+	struct buf_ring *br;
+	struct mlx4_en_tx_info *tx_info;	/* per-entry software state */
+	u8 *bounce_buf;
+	u32 last_nr_txbb;
+	struct mlx4_qp qp;
+	struct mlx4_qp_context context;
+	int qpn;
+	enum mlx4_qp_state qp_state;
+	struct mlx4_srq dummy;
+	unsigned long bytes;	/* statistics counters */
+	unsigned long packets;
+	unsigned long errors;
+	spinlock_t comp_lock;
+	struct mlx4_bf bf;
+	bool bf_enabled;
+	u64 watchdog_time;
+};
+
+/*
+ * One IP-fragment tracking slot; struct mlx4_en_rx_ring holds
+ * MLX4_EN_NUM_IPFRAG_SESSIONS of these.
+ * NOTE(review): looks like per-flow reassembly state keyed by
+ * saddr/daddr/id/protocol -- confirm against the RX path.
+ */
+struct mlx4_en_ipfrag {
+	struct mbuf *fragments;
+	struct mbuf *last;
+	__be32		saddr;
+	__be32		daddr;
+	__be16		id;
+	u8		protocol;
+	int		total_len;
+	u16		offset;
+};
+
+/*
+ * RX descriptor: nothing but a variable-length run of data segments.
+ * The zero-length array is a GNU extension; the struct itself has size 0.
+ */
+struct mlx4_en_rx_desc {
+	/* actual number of entries depends on rx ring stride */
+	struct mlx4_wqe_data_seg data[0];
+};
+
+/*
+ * State of one receive ring.  struct mlx4_en_priv embeds MAX_RX_RINGS of
+ * these.
+ */
+struct mlx4_en_rx_ring {
+	struct mlx4_hwq_resources wqres;
+	u32 size ;	/* number of Rx descs*/
+	u32 actual_size;
+	u32 size_mask;
+	u16 stride;
+	u16 log_stride;
+	u16 cqn;	/* index of port CQ associated with this ring */
+	u32 prod;	/* presumably producer index -- TODO confirm */
+	u32 cons;	/* presumably consumer index -- TODO confirm */
+	u32 buf_size;
+	void *buf;
+	void *rx_info;
+	unsigned long bytes;	/* statistics counters */
+	unsigned long packets;
+	unsigned long errors;
+	unsigned int use_frags;
+	struct lro_ctrl lro;	/* LRO state from <netinet/tcp_lro.h> */
+	struct mlx4_en_ipfrag ipfrag[MLX4_EN_NUM_IPFRAG_SESSIONS];
+};
+
+
+/*
+ * Decide from a CQE status word (big-endian) whether the packet qualifies
+ * for LRO: among the protocol/validity bits examined, exactly IPV4, IPOK
+ * and TCP must be set, i.e. no IPv4 fragment, no IPv4 options, not IPv6
+ * and not UDP.  Returns non-zero when the packet qualifies.
+ */
+static inline int mlx4_en_can_lro(__be16 status)
+{
+	/* Bits of the status word that take part in the decision. */
+	const __be16 examined = cpu_to_be16(MLX4_CQE_STATUS_IPV4	|
+					    MLX4_CQE_STATUS_IPV4F	|
+					    MLX4_CQE_STATUS_IPV6	|
+					    MLX4_CQE_STATUS_IPV4OPT	|
+					    MLX4_CQE_STATUS_TCP		|
+					    MLX4_CQE_STATUS_UDP		|
+					    MLX4_CQE_STATUS_IPOK);
+	/* Value those bits must have for a plain, valid IPv4/TCP packet. */
+	const __be16 required = cpu_to_be16(MLX4_CQE_STATUS_IPV4 |
+					    MLX4_CQE_STATUS_IPOK |
+					    MLX4_CQE_STATUS_TCP);
+
+	return (status & examined) == required;
+}
+
+/*
+ * Driver-side wrapper around one hardware completion queue (mcq).
+ * struct mlx4_en_priv keeps one per TX ring and one per RX ring.
+ */
+struct mlx4_en_cq {
+	struct mlx4_cq          mcq;
+	struct mlx4_hwq_resources wqres;
+	int                     ring;
+	spinlock_t              lock;
+	struct net_device      *dev;
+	/* Per-core Tx cq processing support */
+	struct timer_list timer;
+	int size;
+	int buf_size;
+	unsigned vector;
+	enum cq_type is_tx;	/* RX or TX, see enum cq_type */
+	u16 moder_time;
+	u16 moder_cnt;
+	struct mlx4_cqe *buf;
+	struct task cq_task;
+	struct taskqueue *tq;
+#define MLX4_EN_OPCODE_ERROR	0x1e
+	u32 tot_rx;
+};
+
+/*
+ * Per-port configuration: ring counts and sizes plus pause settings.
+ * struct mlx4_en_profile holds one per port and mlx4_en_init_netdev()
+ * takes a pointer to one.
+ */
+struct mlx4_en_port_profile {
+	u32 flags;
+	u32 tx_ring_num;
+	u32 rx_ring_num;
+	u32 tx_ring_size;
+	u32 rx_ring_size;
+	u8 rx_pause;
+	u8 rx_ppp;
+	u8 tx_pause;
+	u8 tx_ppp;
+};
+
+/*
+ * Device-wide configuration plus the per-port profiles.  The prof[] array
+ * has MLX4_MAX_PORTS + 1 entries.
+ * NOTE(review): the extra slot suggests it is indexed by 1-based port
+ * number -- confirm against the users of prof[].
+ */
+struct mlx4_en_profile {
+	int rss_xor;
+	int num_lro;
+	int ip_reasm;
+	int tcp_rss;
+	int udp_rss;
+	u8 rss_mask;
+	u32 active_ports;
+	u32 small_pkt_int;
+	u8 no_reset;
+	struct mlx4_en_port_profile prof[MLX4_MAX_PORTS + 1];
+};
+
+/*
+ * Ethernet-driver state for one mlx4 device: links the core device (dev)
+ * and its PCI device (pdev) to the per-port net_devices in pndev[].
+ * pndev[] and mac_removed[] have MLX4_MAX_PORTS + 1 entries.
+ */
+struct mlx4_en_dev {
+	struct mlx4_dev         *dev;
+	struct pci_dev		*pdev;
+	struct mutex		state_lock;
+	struct net_device       *pndev[MLX4_MAX_PORTS + 1];
+	u32                     port_cnt;
+	bool			device_up;
+	struct mlx4_en_profile  profile;
+	u32			LSO_support;
+	struct workqueue_struct *workqueue;
+	struct device           *dma_device;
+	void __iomem            *uar_map;
+	struct mlx4_uar         priv_uar;
+	struct mlx4_mr		mr;
+	u32                     priv_pdn;
+	spinlock_t              uar_lock;
+	u8			mac_removed[MLX4_MAX_PORTS + 1];
+};
+
+
+/*
+ * QP bookkeeping for receive steering: one QP and its state per possible
+ * RX ring (MAX_RX_RINGS), plus a separate indirection QP and its state.
+ */
+struct mlx4_en_rss_map {
+	int base_qpn;
+	struct mlx4_qp qps[MAX_RX_RINGS];
+	enum mlx4_qp_state state[MAX_RX_RINGS];
+	struct mlx4_qp indir_qp;
+	enum mlx4_qp_state indir_state;
+};
+
+/*
+ * RSS context block.  The multi-byte members are declared big-endian
+ * (__be32).
+ * NOTE(review): presumably a device-defined layout, so member order and
+ * sizes must not be changed -- confirm against the hardware interface.
+ */
+struct mlx4_en_rss_context {
+	__be32 base_qpn;
+	__be32 default_qpn;
+	u16 reserved;
+	u8 hash_fn;
+	u8 flags;
+	__be32 rss_key[10];
+	__be32 base_qpn_udp;
+};
+
+/*
+ * Snapshot of a port's link state, speed and transceiver type; embedded
+ * in struct mlx4_en_priv as port_state.  ("transciver" is the member's
+ * actual spelling and is part of the interface.)
+ */
+struct mlx4_en_port_state {
+	int link_state;
+	int link_speed;
+	int transciver;
+};
+
+/*
+ * Packet counters: one broadcast counter plus eight RX and eight TX
+ * per-priority counters.  NUM_PKT_STATS (17) matches the total number of
+ * unsigned long counters in the struct (1 + 8 + 8); keep the two in sync.
+ */
+struct mlx4_en_pkt_stats {
+	unsigned long broadcast;
+	unsigned long rx_prio[8];
+	unsigned long tx_prio[8];
+#define NUM_PKT_STATS		17
+};
+
+/*
+ * Software-maintained per-port event counters; embedded in struct
+ * mlx4_en_priv as port_stats.
+ */
+struct mlx4_en_port_stats {
+	unsigned long tso_packets;
+	unsigned long queue_stopped;
+	unsigned long wake_queue;
+	unsigned long tx_timeout;
+	unsigned long rx_alloc_failed;
+	unsigned long rx_chksum_good;
+	unsigned long rx_chksum_none;
+	unsigned long tx_chksum_offload;
+};
+
+/*
+ * Performance counters; embedded in struct mlx4_en_priv as pstats.
+ * NOTE(review): presumably updated through the *_PERF_COUNTER macros,
+ * which expand to no-ops unless MLX4_EN_PERF_STAT is defined -- confirm
+ * against the users.
+ */
+struct mlx4_en_perf_stats {
+	u32 tx_poll;
+	u64 tx_pktsz_avg;
+	u32 inflight_avg;
+	u32 tx_coal_avg;
+	u32 rx_coal_avg;
+};
+
+/*
+ * Description of one receive fragment; struct mlx4_en_priv keeps
+ * MLX4_EN_MAX_RX_FRAGS of these in frag_info[].
+ */
+struct mlx4_en_frag_info {
+	u16 frag_size;
+	u16 frag_prefix_size;	/* presumably bytes covered by earlier fragments -- TODO confirm */
+};
+
+/*
+ * One slot of the TX hash table; struct mlx4_en_priv keeps
+ * MLX4_EN_TX_HASH_SIZE of these in tx_hash[].
+ */
+struct mlx4_en_tx_hash_entry {
+	u8 cnt;
+	unsigned int small_pkts;
+	unsigned int big_pkts;
+	unsigned int ring;
+};
+
+/*
+ * Per-interface driver state: ties one net_device (dev) to its parent
+ * mlx4_en_dev (mdev), its port number and port profile, and owns the
+ * rings, completion queues, statistics and deferred-work items of that
+ * interface.
+ */
+struct mlx4_en_priv {
+	struct mlx4_en_dev *mdev;
+	struct mlx4_en_port_profile *prof;
+	struct net_device *dev;
+	bool vlgrp_modified;
+	u32 vlan_register[VLAN_FLTR_SIZE];
+	u32 vlan_unregister[VLAN_FLTR_SIZE];
+	u32 vlans[VLAN_FLTR_SIZE];
+	spinlock_t vlan_lock;
+	struct mlx4_en_port_state port_state;
+	spinlock_t stats_lock;
+
+	/* NOTE(review): the block below looks like interrupt-moderation
+	 * history and tunables -- confirm against the users. */
+	unsigned long last_moder_packets;
+	unsigned long last_moder_tx_packets;
+	unsigned long last_moder_bytes;
+	unsigned long last_moder_jiffies;
+	int last_moder_time;
+	u16 rx_usecs;
+	u16 rx_frames;
+	u16 tx_usecs;
+	u16 tx_frames;
+	u32 pkt_rate_low;
+	u16 rx_usecs_low;
+	u32 pkt_rate_high;
+	u16 rx_usecs_high;
+	u16 sample_interval;
+	u16 adaptive_rx_coal;
+	u32 msg_enable;		/* NETIF_MSG_* bits tested by en_dbg() */
+	u32 loopback_ok;
+	u32 validate_loopback;
+
+	struct mlx4_hwq_resources res;
+	int link_state;
+	int last_link_state;
+	bool port_up;
+	int port;		/* printed as "Port %d" by en_print() */
+	int registered;		/* selects the message prefix in en_print() */
+	int allocated;
+	int rx_csum;
+	u64 mac;
+	int mac_index;
+	unsigned max_mtu;
+	int base_qpn;
+
+	struct mlx4_en_rss_map rss_map;
+	u16 tx_prio_map[8];
+	u32 flags;
+#define MLX4_EN_FLAG_PROMISC	0x1
+	u32 tx_ring_num;
+	u32 rx_ring_num;
+	u32 udp_rings;
+	u32 rx_mb_size;
+	struct mlx4_en_frag_info frag_info[MLX4_EN_MAX_RX_FRAGS];
+	u16 num_frags;
+	u16 log_rx_info;
+
+	/* Rings and CQs are sized for the compile-time maxima; presumably
+	 * tx_ring_num / rx_ring_num give the counts in use -- TODO confirm. */
+	struct mlx4_en_tx_ring tx_ring[MAX_TX_RINGS];
+	struct mlx4_en_rx_ring rx_ring[MAX_RX_RINGS];
+	struct mlx4_en_cq tx_cq[MAX_TX_RINGS];
+	struct mlx4_en_cq rx_cq[MAX_RX_RINGS];
+	struct mlx4_en_tx_hash_entry tx_hash[MLX4_EN_TX_HASH_SIZE];
+	struct work_struct mcast_task;
+	struct work_struct watchdog_task;
+	struct work_struct linkstate_task;
+	struct delayed_work stats_task;
+	struct mlx4_en_perf_stats pstats;
+	struct mlx4_en_pkt_stats pkstats;
+	struct mlx4_en_port_stats port_stats;
+	struct mlx4_en_stat_out_mbox hw_stats;
+	struct ifmedia media;
+	eventhandler_tag vlan_attach;
+	eventhandler_tag vlan_detach;
+	struct callout watchdog_timer;
+	volatile int blocked;
+	struct sysctl_oid *sysctl;
+	struct sysctl_ctx_list conf_ctx;
+	struct sysctl_ctx_list stat_ctx;
+};
+
+
+int mlx4_en_transmit(struct net_device *dev, struct mbuf *mb);
+void mlx4_en_qflush(struct net_device *dev);
+
+int mlx4_en_rx_frags(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring,
+		     struct mbuf *mb, struct mlx4_cqe *cqe);
+void mlx4_en_flush_frags(struct mlx4_en_priv *priv,
+			 struct mlx4_en_rx_ring *ring);
+void mlx4_en_destroy_netdev(struct net_device *dev);
+int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
+			struct mlx4_en_port_profile *prof);
+
+int mlx4_en_start_port(struct net_device *dev);
+void mlx4_en_stop_port(struct net_device *dev);
+
+void mlx4_en_free_resources(struct mlx4_en_priv *priv);
+int mlx4_en_alloc_resources(struct mlx4_en_priv *priv);
+
+int mlx4_en_create_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq,
+		      int entries, int ring, enum cq_type mode);
+void mlx4_en_destroy_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq);
+int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq);
+void mlx4_en_deactivate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq);
+int mlx4_en_set_cq_moder(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq);
+int mlx4_en_arm_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq);
+
+void mlx4_en_poll_tx_cq(unsigned long data);
+void mlx4_en_tx_irq(struct mlx4_cq *mcq);
+u16 mlx4_en_select_queue(struct net_device *dev, struct mbuf *mb);
+
+int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring *ring,
+			   u32 size, u16 stride);
+void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring *ring);
+int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv,
+			     struct mlx4_en_tx_ring *ring,
+			     int cq);
+void mlx4_en_deactivate_tx_ring(struct mlx4_en_priv *priv,
+				struct mlx4_en_tx_ring *ring);
+
+int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
+			   struct mlx4_en_rx_ring *ring, u32 size);
+void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv,
+			     struct mlx4_en_rx_ring *ring);
+int mlx4_en_activate_rx_rings(struct mlx4_en_priv *priv);
+void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv,
+				struct mlx4_en_rx_ring *ring);
+int mlx4_en_process_rx_cq(struct net_device *dev,
+			  struct mlx4_en_cq *cq,
+			  int budget);
+int mlx4_en_process_rx_cq_mb(struct net_device *dev,
+			      struct mlx4_en_cq *cq,
+			      int budget);
+void mlx4_en_tx_que(void *context, int pending);
+void mlx4_en_rx_que(void *context, int pending);
+void mlx4_en_fill_qp_context(struct mlx4_en_priv *priv, int size, int stride,
+			     int is_tx, int rss, int qpn, int cqn,
+			     struct mlx4_qp_context *context);
+void mlx4_en_sqp_event(struct mlx4_qp *qp, enum mlx4_event event);
+int mlx4_en_map_buffer(struct mlx4_buf *buf);
+void mlx4_en_unmap_buffer(struct mlx4_buf *buf);
+
+void mlx4_en_calc_rx_buf(struct net_device *dev);
+void mlx4_en_set_prio_map(struct mlx4_en_priv *priv, u16 *prio_map, u32 ring_num);
+int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv);
+void mlx4_en_release_rss_steer(struct mlx4_en_priv *priv);
+int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring);
+void mlx4_en_rx_irq(struct mlx4_cq *mcq);
+
+int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port, u64 mac, u64 clear, u8 mode);
+int mlx4_SET_VLAN_FLTR(struct mlx4_dev *dev, u8 port, u32 *vlans);
+int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu,
+			  u8 pptx, u8 pfctx, u8 pprx, u8 pfcrx);
+int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn,
+			   u8 promisc);
+
+int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset);
+int mlx4_en_QUERY_PORT(struct mlx4_en_dev *mdev, u8 port);
+
+#define MLX4_EN_NUM_SELF_TEST	5
+void mlx4_en_ex_selftest(struct net_device *dev, u32 *flags, u64 *buf);
+u64 mlx4_en_mac_to_u64(u8 *addr);
+
+/*
+ * Globals
+ */
+extern const struct ethtool_ops mlx4_en_ethtool_ops;
+#endif
diff --git a/sys/ofed/drivers/net/mlx4/mr.c b/sys/ofed/drivers/net/mlx4/mr.c
new file mode 100644
index 0000000..9ed610a
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/mr.c
@@ -0,0 +1,773 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+#include <linux/errno.h>
+
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4.h"
+#include "icm.h"
+
+/*
+ * Layout of the MPT entry that mlx4_mr_enable() builds in a command
+ * mailbox and passes to the SW2HW_MPT command.  Multi-byte members are
+ * big-endian (__be32 / __be64).
+ *
+ * Must be packed because mtt_seg is 64 bits but only aligned to 32 bits.
+ */
+struct mlx4_mpt_entry {
+	__be32 flags;		/* MLX4_MPT_FLAG_* plus the MR access bits */
+	__be32 qpn;
+	__be32 key;
+	__be32 pd_flags;	/* PD number plus MLX4_MPT_PD_FLAG_* */
+	__be64 start;		/* set from mr->iova */
+	__be64 length;		/* set from mr->size */
+	__be32 lkey;
+	__be32 win_cnt;
+	u8	reserved1;
+	u8	flags2;
+	u8	reserved2;
+	u8	mtt_rep;
+	__be64 mtt_seg;		/* set from mlx4_mtt_addr(), or 0 */
+	__be32 mtt_sz;
+	__be32 entity_size;	/* set from the MTT page shift */
+	__be32 first_byte_offset;
+} __attribute__((packed));
+
+#define MLX4_MPT_FLAG_SW_OWNS	    (0xfUL << 28)
+#define MLX4_MPT_FLAG_FREE	    (0x3UL << 28)
+#define MLX4_MPT_FLAG_MIO	    (1 << 17)
+#define MLX4_MPT_FLAG_BIND_ENABLE   (1 << 15)
+#define MLX4_MPT_FLAG_PHYSICAL	    (1 <<  9)
+#define MLX4_MPT_FLAG_REGION	    (1 <<  8)
+
+#define MLX4_MPT_PD_FLAG_FAST_REG   (1 << 27)
+#define MLX4_MPT_PD_FLAG_RAE	    (1 << 28)
+#define MLX4_MPT_PD_FLAG_EN_INV	    (3 << 24)
+
+#define MLX4_MPT_FLAG2_FBO_EN	     (1 <<  7)
+
+#define MLX4_MPT_STATUS_SW		0xF0
+#define MLX4_MPT_STATUS_HW		0x00
+
+/*
+ * Allocate a naturally aligned run of 2^order segments from 'buddy'.
+ *
+ * Orders are scanned from 'order' upward for the first one that has a
+ * free block.  That block is then split repeatedly, handing the unused
+ * half back to the free bitmap at each level, until it has the requested
+ * size.
+ *
+ * Returns the index of the first segment, or (u32) -1 when no block of a
+ * sufficient order is free.  Takes buddy->lock internally.
+ */
+static u32 mlx4_buddy_alloc(struct mlx4_buddy *buddy, int order)
+{
+	int cur;
+	int nbits;
+	u32 seg;
+
+	spin_lock(&buddy->lock);
+
+	for (cur = order; cur <= buddy->max_order; ++cur) {
+		if (!buddy->num_free[cur])
+			continue;
+
+		nbits = 1 << (buddy->max_order - cur);
+		seg = find_first_bit(buddy->bits[cur], nbits);
+		if (seg < nbits)
+			break;
+	}
+
+	/* Ran off the top without finding a free block. */
+	if (cur > buddy->max_order) {
+		spin_unlock(&buddy->lock);
+		return -1;
+	}
+
+	/* Claim the block at the level where it was found. */
+	clear_bit(seg, buddy->bits[cur]);
+	--buddy->num_free[cur];
+
+	/* Split down to the requested order, freeing each upper half. */
+	for (; cur > order; ) {
+		--cur;
+		seg <<= 1;
+		set_bit(seg ^ 1, buddy->bits[cur]);
+		++buddy->num_free[cur];
+	}
+
+	spin_unlock(&buddy->lock);
+
+	/* Convert the block index at 'order' into a segment index. */
+	return seg << order;
+}
+
+/*
+ * Return the 2^order segments starting at 'seg' to 'buddy'.
+ *
+ * As long as the neighbouring block of the same order is free as well,
+ * the two are merged and the check is repeated one order higher; the
+ * resulting block is then marked free.  Takes buddy->lock internally.
+ */
+static void mlx4_buddy_free(struct mlx4_buddy *buddy, u32 seg, int order)
+{
+	u32 sibling;
+
+	/* Work with block indices at the current order from here on. */
+	seg >>= order;
+
+	spin_lock(&buddy->lock);
+
+	for (;;) {
+		sibling = seg ^ 1;
+		if (!test_bit(sibling, buddy->bits[order]))
+			break;
+
+		/* Coalesce with the free sibling and move up one level. */
+		clear_bit(sibling, buddy->bits[order]);
+		--buddy->num_free[order];
+		seg >>= 1;
+		++order;
+	}
+
+	set_bit(seg, buddy->bits[order]);
+	++buddy->num_free[order];
+
+	spin_unlock(&buddy->lock);
+}
+
+/*
+ * Initialise 'buddy' to manage 2^max_order segments.
+ *
+ * Allocates one free bitmap and one free counter per order 0..max_order;
+ * the bitmap for order i has 2^(max_order - i) bits.  Initially the whole
+ * range is a single free block of order max_order.
+ *
+ * Returns 0 on success or -ENOMEM, in which case everything allocated
+ * here has been released again.
+ */
+static int mlx4_buddy_init(struct mlx4_buddy *buddy, int max_order)
+{
+	int i, s;
+
+	buddy->max_order = max_order;
+	spin_lock_init(&buddy->lock);
+
+	/*
+	 * Size both arrays from their own element type rather than from a
+	 * hand-written type name, so the allocation always matches the
+	 * declaration.
+	 */
+	buddy->bits = kzalloc((buddy->max_order + 1) * sizeof *buddy->bits,
+			      GFP_KERNEL);
+	buddy->num_free = kzalloc((buddy->max_order + 1) *
+				  sizeof *buddy->num_free, GFP_KERNEL);
+	if (!buddy->bits || !buddy->num_free)
+		goto err_out;
+
+	for (i = 0; i <= buddy->max_order; ++i) {
+		s = BITS_TO_LONGS(1 << (buddy->max_order - i));
+		buddy->bits[i] = kmalloc(s * sizeof (long), GFP_KERNEL);
+		if (!buddy->bits[i])
+			goto err_out_free;
+		bitmap_zero(buddy->bits[i], 1 << (buddy->max_order - i));
+	}
+
+	set_bit(0, buddy->bits[buddy->max_order]);
+	buddy->num_free[buddy->max_order] = 1;
+
+	return 0;
+
+err_out_free:
+	/*
+	 * bits[] was zero-allocated, so entries that were never filled in
+	 * are NULL and kfree() ignores them.
+	 */
+	for (i = 0; i <= buddy->max_order; ++i)
+		kfree(buddy->bits[i]);
+
+err_out:
+	kfree(buddy->bits);
+	kfree(buddy->num_free);
+
+	return -ENOMEM;
+}
+
+/*
+ * Release all memory owned by 'buddy': the per-order bitmaps, then the
+ * bitmap pointer array and the free-counter array.
+ */
+static void mlx4_buddy_cleanup(struct mlx4_buddy *buddy)
+{
+	int order = 0;
+
+	while (order <= buddy->max_order) {
+		kfree(buddy->bits[order]);
+		++order;
+	}
+
+	kfree(buddy->bits);
+	kfree(buddy->num_free);
+}
+
+/*
+ * Allocate 2^order MTT segments and take references on the MTT table
+ * entries that back them.
+ *
+ * Returns the first segment index, or (u32) -1 if either the buddy
+ * allocation or mlx4_table_get_range() fails (the segments are returned
+ * to the buddy allocator in the latter case).
+ */
+static u32 mlx4_alloc_mtt_range(struct mlx4_dev *dev, int order)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_buddy *buddy = &priv->mr_table.mtt_buddy;
+	u32 first;
+
+	first = mlx4_buddy_alloc(buddy, order);
+	if (first == -1)
+		return -1;
+
+	if (!mlx4_table_get_range(dev, &priv->mr_table.mtt_table, first,
+				  first + (1 << order) - 1))
+		return first;
+
+	/* Could not pin the table range: undo the allocation. */
+	mlx4_buddy_free(buddy, first, order);
+	return -1;
+}
+
+/*
+ * Set up 'mtt' to describe 'npages' pages of size 2^page_shift.
+ *
+ * With npages == 0 no segments are allocated: the order is set to -1 and
+ * the page shift to MLX4_ICM_PAGE_SHIFT.  Otherwise the order is the
+ * smallest value for which (mtts_per_seg << order) >= npages, and that
+ * many segments are allocated.
+ *
+ * Returns 0 on success, -ENOMEM if the segments cannot be allocated.
+ */
+int mlx4_mtt_init(struct mlx4_dev *dev, int npages, int page_shift,
+		  struct mlx4_mtt *mtt)
+{
+	int covered;
+
+	if (npages == 0) {
+		mtt->order      = -1;
+		mtt->page_shift = MLX4_ICM_PAGE_SHIFT;
+		return 0;
+	}
+
+	mtt->page_shift = page_shift;
+
+	/* Double the covered entry count until it reaches npages. */
+	mtt->order = 0;
+	covered = dev->caps.mtts_per_seg;
+	while (covered < npages) {
+		++mtt->order;
+		covered <<= 1;
+	}
+
+	mtt->first_seg = mlx4_alloc_mtt_range(dev, mtt->order);
+
+	return mtt->first_seg == -1 ? -ENOMEM : 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_mtt_init);
+
+/*
+ * Undo mlx4_mtt_init(): give the segments back to the buddy allocator and
+ * drop the references on the MTT table range.  Does nothing for an MTT
+ * with a negative order, which has no segments.
+ */
+void mlx4_mtt_cleanup(struct mlx4_dev *dev, struct mlx4_mtt *mtt)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	if (mtt->order >= 0) {
+		mlx4_buddy_free(&priv->mr_table.mtt_buddy, mtt->first_seg,
+				mtt->order);
+		mlx4_table_put_range(dev, &priv->mr_table.mtt_table,
+				     mtt->first_seg,
+				     mtt->first_seg + (1 << mtt->order) - 1);
+	}
+}
+EXPORT_SYMBOL_GPL(mlx4_mtt_cleanup);
+
+/*
+ * Return the offset of the MTT's first segment: first_seg multiplied by
+ * the device's MTT entry size, computed in 64 bits.
+ */
+u64 mlx4_mtt_addr(struct mlx4_dev *dev, struct mlx4_mtt *mtt)
+{
+	u64 seg = mtt->first_seg;
+
+	return seg * dev->caps.mtt_entry_sz;
+}
+EXPORT_SYMBOL_GPL(mlx4_mtt_addr);
+
+/*
+ * Convert a hardware MPT index to a memory key by rotating the 32-bit
+ * value left by 8 bits.  Inverse of key_to_hw_index().
+ */
+static u32 hw_index_to_key(u32 ind)
+{
+	u32 top_byte = ind >> 24;
+	u32 low_bits = ind << 8;
+
+	return top_byte | low_bits;
+}
+
+/*
+ * Convert a memory key to a hardware MPT index by rotating the 32-bit
+ * value right by 8 bits.  Inverse of hw_index_to_key().
+ */
+static u32 key_to_hw_index(u32 key)
+{
+	u32 low_byte = key << 24;
+	u32 high_bits = key >> 8;
+
+	return low_byte | high_bits;
+}
+
+/*
+ * Issue the SW2HW_MPT command for MPT index 'mpt_index', passing the DMA
+ * address of 'mailbox' as the input parameter.  Returns the result of
+ * mlx4_cmd().
+ */
+static int mlx4_SW2HW_MPT(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			  int mpt_index)
+{
+	int err;
+
+	err = mlx4_cmd(dev, mailbox->dma, mpt_index, 0, MLX4_CMD_SW2HW_MPT,
+		       MLX4_CMD_TIME_CLASS_B);
+	return err;
+}
+
+/*
+ * Issue the HW2SW_MPT command for MPT index 'mpt_index'.
+ *
+ * 'mailbox' may be NULL: in that case 0 is passed as the output parameter
+ * and the op modifier is 1; otherwise the mailbox DMA address is passed
+ * and the op modifier is 0.  Returns the result of mlx4_cmd_box().
+ */
+static int mlx4_HW2SW_MPT(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			  int mpt_index)
+{
+	if (mailbox)
+		return mlx4_cmd_box(dev, 0, mailbox->dma, mpt_index, 0,
+				    MLX4_CMD_HW2SW_MPT, MLX4_CMD_TIME_CLASS_B);
+
+	return mlx4_cmd_box(dev, 0, 0, mpt_index, 1,
+			    MLX4_CMD_HW2SW_MPT, MLX4_CMD_TIME_CLASS_B);
+}
+
+/*
+ * Reserve 'cnt' consecutive MPT indices with alignment 'align' from the
+ * MPT bitmap.  On success stores the first index in *base_mridx and
+ * returns 0; returns -ENOMEM (leaving *base_mridx untouched) when the
+ * bitmap cannot satisfy the request.
+ */
+int mlx4_mr_reserve_range(struct mlx4_dev *dev, int cnt, int align, u32 *base_mridx)
+{
+	struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table;
+	u32 first = mlx4_bitmap_alloc_range(&mr_table->mpt_bitmap, cnt, align);
+
+	if (first == -1)
+		return -ENOMEM;
+
+	*base_mridx = first;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_reserve_range);
+
+/*
+ * Give 'cnt' MPT indices starting at 'base_mridx' back to the MPT bitmap.
+ */
+void mlx4_mr_release_range(struct mlx4_dev *dev, u32 base_mridx, int cnt)
+{
+	struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table;
+
+	mlx4_bitmap_free_range(&mr_table->mpt_bitmap, base_mridx, cnt);
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_release_range);
+
+/*
+ * Fill in 'mr' for an MPT index the caller already owns and set up its
+ * MTT for 'npages' pages.
+ *
+ * The region starts out disabled (mr->enabled == 0) and its key is
+ * derived from 'mridx' with hw_index_to_key().  Returns the result of
+ * mlx4_mtt_init().
+ */
+int mlx4_mr_alloc_reserved(struct mlx4_dev *dev, u32 mridx, u32 pd,
+			   u64 iova, u64 size, u32 access, int npages,
+			   int page_shift, struct mlx4_mr *mr)
+{
+	/* Identity of the region. */
+	mr->key     = hw_index_to_key(mridx);
+	mr->pd      = pd;
+	mr->enabled = 0;
+
+	/* Address range and permissions. */
+	mr->iova    = iova;
+	mr->size    = size;
+	mr->access  = access;
+
+	return mlx4_mtt_init(dev, npages, page_shift, &mr->mtt);
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_alloc_reserved);
+
+/*
+ * Allocate an MPT index from the bitmap and initialise 'mr' for it via
+ * mlx4_mr_alloc_reserved().
+ *
+ * Returns 0 on success, -ENOMEM if no index is available, or the error
+ * from mlx4_mr_alloc_reserved(), in which case the index is released
+ * again.
+ */
+int mlx4_mr_alloc(struct mlx4_dev *dev, u32 pd, u64 iova, u64 size, u32 access,
+		  int npages, int page_shift, struct mlx4_mr *mr)
+{
+	struct mlx4_bitmap *mpt_bitmap = &mlx4_priv(dev)->mr_table.mpt_bitmap;
+	u32 mridx;
+	int err;
+
+	mridx = mlx4_bitmap_alloc(mpt_bitmap);
+	if (mridx == -1)
+		return -ENOMEM;
+
+	err = mlx4_mr_alloc_reserved(dev, mridx, pd, iova, size,
+				     access, npages, page_shift, mr);
+	if (!err)
+		return 0;
+
+	mlx4_bitmap_free(mpt_bitmap, mridx);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_alloc);
+
+/*
+ * Tear down 'mr' without releasing its MPT index.
+ *
+ * If the region was enabled, HW2SW_MPT is issued for it first; a failure
+ * of that command is only logged.  The MTT is cleaned up in every case.
+ */
+void mlx4_mr_free_reserved(struct mlx4_dev *dev, struct mlx4_mr *mr)
+{
+	if (mr->enabled) {
+		int rc = mlx4_HW2SW_MPT(dev, NULL,
+					key_to_hw_index(mr->key) &
+					(dev->caps.num_mpts - 1));
+
+		if (rc)
+			mlx4_warn(dev, "HW2SW_MPT failed (%d)\n", rc);
+	}
+
+	mlx4_mtt_cleanup(dev, &mr->mtt);
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_free_reserved);
+
+/*
+ * Tear down 'mr' (see mlx4_mr_free_reserved()) and then release its MPT
+ * index, recovered from mr->key, back to the MPT bitmap.
+ */
+void mlx4_mr_free(struct mlx4_dev *dev, struct mlx4_mr *mr)
+{
+	struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table;
+
+	mlx4_mr_free_reserved(dev, mr);
+	mlx4_bitmap_free(&mr_table->mpt_bitmap, key_to_hw_index(mr->key));
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_free);
+
+/*
+ * Hand a memory region over to the device.
+ *
+ * Takes a reference on the dMPT table entry for the region's index,
+ * builds a struct mlx4_mpt_entry in a freshly allocated command mailbox
+ * and issues SW2HW_MPT.  On success mr->enabled is set to 1 and 0 is
+ * returned.  On failure the mailbox (if any) is freed, the dMPT table
+ * reference is dropped and the error is returned.
+ */
+int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr)
+{
+	struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table;
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_mpt_entry *mpt_entry;
+	int err;
+
+	err = mlx4_table_get(dev, &mr_table->dmpt_table, key_to_hw_index(mr->key));
+	if (err)
+		return err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox)) {
+		err = PTR_ERR(mailbox);
+		goto err_table;
+	}
+	/* The MPT entry is built directly in the mailbox buffer. */
+	mpt_entry = mailbox->buf;
+
+	memset(mpt_entry, 0, sizeof *mpt_entry);
+
+	mpt_entry->flags = cpu_to_be32(MLX4_MPT_FLAG_MIO	 |
+				       MLX4_MPT_FLAG_REGION	 |
+				       mr->access);
+
+	mpt_entry->key	       = cpu_to_be32(key_to_hw_index(mr->key));
+	mpt_entry->pd_flags    = cpu_to_be32(mr->pd | MLX4_MPT_PD_FLAG_EN_INV);
+	mpt_entry->start       = cpu_to_be64(mr->iova);
+	mpt_entry->length      = cpu_to_be64(mr->size);
+	mpt_entry->entity_size = cpu_to_be32(mr->mtt.page_shift);
+
+	/*
+	 * A negative order means the MTT has no segments (mlx4_mtt_init()
+	 * sets -1 for npages == 0); such a region is flagged as physical.
+	 */
+	if (mr->mtt.order < 0) {
+		mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_PHYSICAL);
+		mpt_entry->mtt_seg = 0;
+	} else {
+		mpt_entry->mtt_seg = cpu_to_be64(mlx4_mtt_addr(dev, &mr->mtt));
+	}
+
+	if (mr->mtt.order >= 0 && mr->mtt.page_shift == 0) {
+		/* fast register MR in free state */
+		mpt_entry->flags    |= cpu_to_be32(MLX4_MPT_FLAG_FREE);
+		mpt_entry->pd_flags |= cpu_to_be32(MLX4_MPT_PD_FLAG_FAST_REG |
+						   MLX4_MPT_PD_FLAG_RAE);
+		/* total number of MTT entries covered by the allocation */
+		mpt_entry->mtt_sz    = cpu_to_be32((1 << mr->mtt.order) *
+						   dev->caps.mtts_per_seg);
+	} else {
+		mpt_entry->flags    |= cpu_to_be32(MLX4_MPT_FLAG_SW_OWNS);
+	}
+
+	/* The command takes the index masked to the MPT table size. */
+	err = mlx4_SW2HW_MPT(dev, mailbox,
+			     key_to_hw_index(mr->key) & (dev->caps.num_mpts - 1));
+	if (err) {
+		mlx4_warn(dev, "SW2HW_MPT failed (%d)\n", err);
+		goto err_cmd;
+	}
+
+	mr->enabled = 1;
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	return 0;
+
+err_cmd:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+err_table:
+	mlx4_table_put(dev, &mr_table->dmpt_table, key_to_hw_index(mr->key));
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_enable);
+
+/*
+ * Write 'npages' MTT entries, starting at entry 'start_index' of 'mtt',
+ * from 'page_list'.
+ *
+ * Constraints checked here (each violation returns -EINVAL):
+ *  - the first and last entry must fall into the same PAGE_SIZE-sized
+ *    group of u64 entries;
+ *  - start_index must have none of the bits of (mtts_per_seg - 1) set.
+ *
+ * Returns -ENOMEM if mlx4_table_find() yields no mapping for the target
+ * segment, 0 on success.
+ */
+static int mlx4_write_mtt_chunk(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+				int start_index, int npages, u64 *page_list)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	__be64 *mtts;
+	dma_addr_t dma_handle;
+	int i;
+	/* byte offset of start_index within the MTT's entries */
+	int s = start_index * sizeof (u64);
+
+	/* All MTTs must fit in the same page */
+	if (start_index / (PAGE_SIZE / sizeof (u64)) !=
+	    (start_index + npages - 1) / (PAGE_SIZE / sizeof (u64)))
+		return -EINVAL;
+
+	if (start_index & (dev->caps.mtts_per_seg - 1))
+		return -EINVAL;
+
+	/* Locate the segment that holds start_index in the MTT table. */
+	mtts = mlx4_table_find(&priv->mr_table.mtt_table, mtt->first_seg +
+				s / dev->caps.mtt_entry_sz, &dma_handle);
+	if (!mtts)
+		return -ENOMEM;
+
+	/* Entries are stored big-endian with the PRESENT flag or'ed in. */
+	for (i = 0; i < npages; ++i)
+		mtts[i] = cpu_to_be64(page_list[i] | MLX4_MTT_FLAG_PRESENT);
+
+	/* Sync the entries just written towards the device. */
+	dma_sync_single(&dev->pdev->dev, dma_handle, npages * sizeof (u64), DMA_TO_DEVICE);
+
+	return 0;
+}
+
+/*
+ * Write 'npages' MTT entries of 'mtt', starting at entry 'start_index',
+ * from 'page_list'.
+ *
+ * The work is split into chunks because mlx4_write_mtt_chunk() requires
+ * every chunk to stay within one PAGE_SIZE-sized group of u64 entries.
+ * The first chunk is therefore cut at the next such boundary after
+ * start_index, and every later chunk covers at most one full group.
+ * For a start_index that already sits on a boundary (e.g. 0) this is the
+ * same split as one full group per chunk.
+ *
+ * Returns 0 on success (also when npages <= 0), -EINVAL if the MTT has no
+ * segments (order < 0) or start_index is negative, or the error from
+ * mlx4_write_mtt_chunk().
+ */
+int mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+		   int start_index, int npages, u64 *page_list)
+{
+	int mtts_per_page = PAGE_SIZE / sizeof(u64);
+	int chunk;
+	int err;
+
+	if (mtt->order < 0 || start_index < 0)
+		return -EINVAL;
+
+	/* Entries left before the next boundary, capped by the request. */
+	chunk = min_t(int, mtts_per_page - start_index % mtts_per_page,
+		      npages);
+
+	while (npages > 0) {
+		err = mlx4_write_mtt_chunk(dev, mtt, start_index, chunk, page_list);
+		if (err)
+			return err;
+
+		npages      -= chunk;
+		start_index += chunk;
+		page_list   += chunk;
+
+		/* start_index is now on a boundary: take full groups. */
+		chunk = min_t(int, mtts_per_page, npages);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_write_mtt);
+
+/*
+ * Write the DMA addresses of all pages of 'buf' into 'mtt', starting at
+ * MTT entry 0.
+ *
+ * For a buffer with a non-zero direct mapping, page i is at
+ * direct.map + i * 2^page_shift; otherwise the address comes from
+ * page_list[i].map.  The page offset is computed in 64 bits so that it
+ * cannot overflow 'int' for large buffers.
+ *
+ * Returns 0 on success, -ENOMEM if the temporary address list cannot be
+ * allocated, or the error from mlx4_write_mtt().
+ */
+int mlx4_buf_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+		       struct mlx4_buf *buf)
+{
+	u64 *page_list;
+	int err;
+	int i;
+
+	page_list = kmalloc(buf->npages * sizeof *page_list, GFP_KERNEL);
+	if (!page_list)
+		return -ENOMEM;
+
+	for (i = 0; i < buf->npages; ++i)
+		if (buf->direct.map)
+			page_list[i] = buf->direct.map +
+				       ((u64) i << buf->page_shift);
+		else
+			page_list[i] = buf->page_list[i].map;
+
+	err = mlx4_write_mtt(dev, mtt, 0, buf->npages, page_list);
+
+	kfree(page_list);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_buf_write_mtt);
+
+/*
+ * Initialise the memory-region table of 'dev': the MPT index bitmap and
+ * the MTT buddy allocator.  If the device reports reserved MTTs, a range
+ * of order fls(reserved_mtts - 1) is allocated up front.
+ *
+ * Returns 0 on success, -EINVAL if num_mpts is not a power of two,
+ * -ENOMEM if the reserved range does not fit, or the error from
+ * mlx4_bitmap_init() / mlx4_buddy_init().  Everything set up here is
+ * torn down again on failure.
+ */
+int mlx4_init_mr_table(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_mr_table *mr_table = &priv->mr_table;
+	int err;
+
+	if (!is_power_of_2(dev->caps.num_mpts))
+		return -EINVAL;
+
+	err = mlx4_bitmap_init(&mr_table->mpt_bitmap, dev->caps.num_mpts,
+			       ~0, dev->caps.reserved_mrws, 0);
+	if (err)
+		return err;
+
+	err = mlx4_buddy_init(&mr_table->mtt_buddy,
+			      ilog2(dev->caps.num_mtt_segs));
+	if (err)
+		goto free_bitmap;
+
+	/* Carve the reserved MTTs out of the allocator, if there are any. */
+	if (dev->caps.reserved_mtts &&
+	    mlx4_alloc_mtt_range(dev, fls(dev->caps.reserved_mtts - 1)) == -1) {
+		mlx4_warn(dev, "MTT table of order %d is too small.\n",
+			  mr_table->mtt_buddy.max_order);
+		err = -ENOMEM;
+		goto free_buddy;
+	}
+
+	return 0;
+
+free_buddy:
+	mlx4_buddy_cleanup(&mr_table->mtt_buddy);
+
+free_bitmap:
+	mlx4_bitmap_cleanup(&mr_table->mpt_bitmap);
+
+	return err;
+}
+
+void mlx4_cleanup_mr_table(struct mlx4_dev *dev)
+{
+	struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table;
+
+	mlx4_buddy_cleanup(&mr_table->mtt_buddy);
+	mlx4_bitmap_cleanup(&mr_table->mpt_bitmap);
+}
+
+static inline int mlx4_check_fmr(struct mlx4_fmr *fmr, u64 *page_list,
+				  int npages, u64 iova)
+{
+	int i, page_mask;
+
+	if (npages > fmr->max_pages)	/* more pages than the FMR was allocated for */
+		return -EINVAL;
+
+	page_mask = (1 << fmr->page_shift) - 1;
+
+	/* We are getting page lists, so va must be page aligned. */
+	if (iova & page_mask)
+		return -EINVAL;
+
+	/* Compiled out: callers are trusted not to pass misaligned page_list entries */
+	if (0)
+		for (i = 0; i < npages; ++i) {
+			if (page_list[i] & ~page_mask)	/* NOTE(review): ~page_mask tests page-number bits, not the offset */
+				return -EINVAL;
+		}
+
+	if (fmr->maps >= fmr->max_maps)	/* map budget spent; mlx4_fmr_unmap() resets maps to 0 */
+		return -EINVAL;
+
+	return 0;
+}
+
+int mlx4_map_phys_fmr_fbo(struct mlx4_dev *dev, struct mlx4_fmr *fmr,
+			  u64 *page_list, int npages, u64 iova, u32 fbo,
+			  u32 len, u32 *lkey, u32 *rkey, int same_key)
+{
+	u32 key;
+	int i, err;
+
+	err = mlx4_check_fmr(fmr, page_list, npages, iova);
+	if (err)
+		return err;
+
+	++fmr->maps;	/* compared with max_maps by mlx4_check_fmr() */
+
+	key = key_to_hw_index(fmr->mr.key);
+	if (!same_key)
+		key += dev->caps.num_mpts;	/* a new mapping moves the key on by num_mpts */
+	*lkey = *rkey = fmr->mr.key = hw_index_to_key(key);
+
+	*(u8 *) fmr->mpt = MLX4_MPT_STATUS_SW;	/* the status is the first byte of the MPT */
+
+	/* Make sure MPT status is visible before writing MTT entries */
+	wmb();
+
+	for (i = 0; i < npages; ++i)
+		fmr->mtts[i] = cpu_to_be64(page_list[i] | MLX4_MTT_FLAG_PRESENT);
+
+	dma_sync_single(&dev->pdev->dev, fmr->dma_handle,
+			npages * sizeof(u64), DMA_TO_DEVICE);
+
+	fmr->mpt->key    = cpu_to_be32(key);
+	fmr->mpt->lkey   = cpu_to_be32(key);
+	fmr->mpt->length = cpu_to_be64(len);
+	fmr->mpt->start  = cpu_to_be64(iova);
+	fmr->mpt->first_byte_offset = cpu_to_be32(fbo & 0x001fffff);	/* low 21 bits only */
+	fmr->mpt->flags2 = (fbo ? MLX4_MPT_FLAG2_FBO_EN : 0);
+
+	/* Make sure MTT entries and MPT fields are visible before setting MPT status */
+	wmb();
+
+	*(u8 *) fmr->mpt = MLX4_MPT_STATUS_HW;
+
+	/* Make sure MPT status is visible before consumer can use FMR */
+	wmb();
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_map_phys_fmr_fbo);
+
+int mlx4_map_phys_fmr(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u64 *page_list,
+		      int npages, u64 iova, u32 *lkey, u32 *rkey)
+{
+	u32 len = npages * (1ull << fmr->page_shift);	/* NOTE(review): 64-bit product truncated to u32 */
+
+	return mlx4_map_phys_fmr_fbo(dev, fmr, page_list, npages, iova, 0,	/* fbo = 0 */
+				     len, lkey, rkey, 0);	/* same_key = 0 */
+}
+EXPORT_SYMBOL_GPL(mlx4_map_phys_fmr);
+
+/*
+ * Allocate the MR behind @fmr and look up its MTT entries.  Returns 0 or -errno.
+ */
+int mlx4_fmr_alloc(struct mlx4_dev *dev, u32 pd, u32 access, int max_pages,
+		   int max_maps, u8 page_shift, struct mlx4_fmr *fmr)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int err;
+
+	if (page_shift < (ffs(dev->caps.page_size_cap) - 1) || page_shift >= 32)
+		return -EINVAL;
+
+	/* All MTTs must fit in the same page */
+	if (max_pages * sizeof *fmr->mtts > PAGE_SIZE)
+		return -EINVAL;
+
+	fmr->page_shift = page_shift;
+	fmr->max_pages  = max_pages;
+	fmr->max_maps   = max_maps;
+	fmr->maps = 0;
+
+	err = mlx4_mr_alloc(dev, pd, 0, 0, access, max_pages,
+			    page_shift, &fmr->mr);
+	if (err)
+		return err;
+
+	fmr->mtts = mlx4_table_find(&priv->mr_table.mtt_table,
+				    fmr->mr.mtt.first_seg,
+				    &fmr->dma_handle);
+	if (!fmr->mtts) {
+		err = -ENOMEM;
+		goto err_free;
+	}
+
+	return 0;
+
+err_free:
+	mlx4_mr_free(dev, &fmr->mr);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_fmr_alloc);
+
+/*
+ * Like mlx4_fmr_alloc(), but the MR is built by mlx4_mr_alloc_reserved() at @mridx.
+ */
+int mlx4_fmr_alloc_reserved(struct mlx4_dev *dev, u32 mridx,
+			    u32 pd, u32 access, int max_pages,
+			    int max_maps, u8 page_shift, struct mlx4_fmr *fmr)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int err;
+
+	if (page_shift < (ffs(dev->caps.page_size_cap) - 1) || page_shift >= 32)
+		return -EINVAL;
+
+	/* All MTTs must fit in the same page */
+	if (max_pages * sizeof *fmr->mtts > PAGE_SIZE)
+		return -EINVAL;
+
+	fmr->page_shift = page_shift;
+	fmr->max_pages  = max_pages;
+	fmr->max_maps   = max_maps;
+	fmr->maps = 0;
+
+	err = mlx4_mr_alloc_reserved(dev, mridx, pd, 0, 0, access, max_pages,
+				     page_shift, &fmr->mr);
+	if (err)
+		return err;
+
+	fmr->mtts = mlx4_table_find(&priv->mr_table.mtt_table,
+				    fmr->mr.mtt.first_seg,
+				    &fmr->dma_handle);
+	if (!fmr->mtts) {
+		err = -ENOMEM;
+		goto err_free;
+	}
+
+	return 0;
+
+err_free:
+	mlx4_mr_free_reserved(dev, &fmr->mr);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_fmr_alloc_reserved);
+
+int mlx4_fmr_enable(struct mlx4_dev *dev, struct mlx4_fmr *fmr)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int err;
+
+	err = mlx4_mr_enable(dev, &fmr->mr);
+	if (err)
+		return err;
+	/* Keep a pointer to the MPT entry; mlx4_map_phys_fmr_fbo() rewrites it in place. */
+	fmr->mpt = mlx4_table_find(&priv->mr_table.dmpt_table,
+				    key_to_hw_index(fmr->mr.key), NULL);
+	if (!fmr->mpt)
+		return -ENOMEM;	/* NOTE(review): the MR stays enabled on this path */
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_fmr_enable);
+
+void mlx4_fmr_unmap(struct mlx4_dev *dev, struct mlx4_fmr *fmr,
+		    u32 *lkey, u32 *rkey)
+{
+	if (!fmr->maps)	/* not mapped since the last unmap: nothing to do */
+		return;
+
+	fmr->maps = 0;
+	/* Set the MPT status byte back to SW; lkey and rkey are not written here. */
+	*(u8 *) fmr->mpt = MLX4_MPT_STATUS_SW;
+}
+EXPORT_SYMBOL_GPL(mlx4_fmr_unmap);
+
+int mlx4_fmr_free(struct mlx4_dev *dev, struct mlx4_fmr *fmr)
+{
+	if (fmr->maps)	/* still mapped; mlx4_fmr_unmap() resets the count */
+		return -EBUSY;
+
+	fmr->mr.enabled = 0;	/* cleared before the free -- presumably skips HW teardown; TODO confirm */
+	mlx4_mr_free(dev, &fmr->mr);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_fmr_free);
+
+int mlx4_fmr_free_reserved(struct mlx4_dev *dev, struct mlx4_fmr *fmr)
+{
+	if (fmr->maps)	/* still mapped; mlx4_fmr_unmap() resets the count */
+		return -EBUSY;
+
+	fmr->mr.enabled = 0;	/* cleared before the free -- presumably skips HW teardown; TODO confirm */
+	mlx4_mr_free_reserved(dev, &fmr->mr);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_fmr_free_reserved);
+
+int mlx4_SYNC_TPT(struct mlx4_dev *dev)
+{
+	return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_SYNC_TPT, 1000);
+}
+EXPORT_SYMBOL_GPL(mlx4_SYNC_TPT);
diff --git a/sys/ofed/drivers/net/mlx4/pd.c b/sys/ofed/drivers/net/mlx4/pd.c
new file mode 100644
index 0000000..cce9226
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/pd.c
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/io-mapping.h>
+
+#include <asm/page.h>
+
+#include "mlx4.h"
+#include "icm.h"
+
+enum {
+	MLX4_NUM_RESERVED_UARS = 8
+};
+
+int mlx4_pd_alloc(struct mlx4_dev *dev, u32 *pdn)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	*pdn = mlx4_bitmap_alloc(&priv->pd_bitmap);
+	if (*pdn == -1)	/* -1 is converted to 0xffffffff for this u32 comparison */
+		return -ENOMEM;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_pd_alloc);
+
+void mlx4_pd_free(struct mlx4_dev *dev, u32 pdn)
+{
+	mlx4_bitmap_free(&mlx4_priv(dev)->pd_bitmap, pdn);
+}
+EXPORT_SYMBOL_GPL(mlx4_pd_free);
+
+int mlx4_init_pd_table(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	return mlx4_bitmap_init(&priv->pd_bitmap, dev->caps.num_pds,
+				(1 << 24) - 1, dev->caps.reserved_pds, 0);
+}
+
+void mlx4_cleanup_pd_table(struct mlx4_dev *dev)
+{
+	mlx4_bitmap_cleanup(&mlx4_priv(dev)->pd_bitmap);
+}
+
+
+int mlx4_uar_alloc(struct mlx4_dev *dev, struct mlx4_uar *uar)
+{
+	uar->index = mlx4_bitmap_alloc(&mlx4_priv(dev)->uar_table.bitmap);
+	if (uar->index == -1)
+		return -ENOMEM;
+
+	uar->pfn = (pci_resource_start(dev->pdev, 2) >> PAGE_SHIFT) + uar->index;	/* page uar->index of PCI resource 2 */
+	uar->map = NULL;	/* not mapped here; mlx4_bf_alloc() ioremaps it itself */
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_uar_alloc);
+
+void mlx4_uar_free(struct mlx4_dev *dev, struct mlx4_uar *uar)
+{
+	mlx4_bitmap_free(&mlx4_priv(dev)->uar_table.bitmap, uar->index);
+}
+EXPORT_SYMBOL_GPL(mlx4_uar_free);
+
+/* Hand @bf one free bf register, mapping a fresh UAR page if no partly used one is listed. */
+int mlx4_bf_alloc(struct mlx4_dev *dev, struct mlx4_bf *bf)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_uar *uar;
+	int err = 0;
+	int idx;
+
+	if (!priv->bf_mapping)
+		return -ENOMEM;
+
+	mutex_lock(&priv->bf_mutex);
+	if (!list_empty(&priv->bf_list))
+		uar = list_entry(priv->bf_list.next, struct mlx4_uar, bf_list);
+	else {
+		if (mlx4_bitmap_avail(&priv->uar_table.bitmap) < MLX4_NUM_RESERVED_UARS) {
+			err = -ENOMEM;
+			goto out;
+		}
+		uar = kmalloc(sizeof *uar, GFP_KERNEL);
+		if (!uar) {
+			err = -ENOMEM;
+			goto out;
+		}
+		err = mlx4_uar_alloc(dev, uar);
+		if (err)
+			goto free_kmalloc;
+
+		uar->map = ioremap(uar->pfn << PAGE_SHIFT, PAGE_SIZE);
+		if (!uar->map) {
+			err = -ENOMEM;
+			goto free_uar;
+		}
+
+		uar->bf_map = io_mapping_map_wc(priv->bf_mapping, uar->index << PAGE_SHIFT);
+		if (!uar->bf_map) {
+			err = -ENOMEM;
+			goto unmap_uar;
+		}
+		uar->free_bf_bmap = 0;	/* despite the name, set bits mark registers in use */
+		list_add(&uar->bf_list, &priv->bf_list);
+	}
+
+	bf->uar = uar;
+	idx = ffz(uar->free_bf_bmap);
+	uar->free_bf_bmap |= 1 << idx;
+	bf->offset = 0;
+	bf->buf_size = dev->caps.bf_reg_size / 2;
+	bf->reg = uar->bf_map + idx * dev->caps.bf_reg_size;
+	if (uar->free_bf_bmap == (1 << dev->caps.bf_regs_per_page) - 1)
+		list_del_init(&uar->bf_list);	/* page is full: hide it from later allocations */
+
+	goto out;	/* success: skip the unwind labels */
+
+unmap_uar:
+	bf->uar = NULL;
+	iounmap(uar->map);
+
+free_uar:
+	mlx4_uar_free(dev, uar);
+
+free_kmalloc:
+	kfree(uar);
+
+out:
+	mutex_unlock(&priv->bf_mutex);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_bf_alloc);
+
+void mlx4_bf_free(struct mlx4_dev *dev, struct mlx4_bf *bf)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int idx;
+
+	if (!bf->uar || !bf->uar->bf_map)
+		return;
+
+	mutex_lock(&priv->bf_mutex);
+	idx = (bf->reg - bf->uar->bf_map) / dev->caps.bf_reg_size;	/* inverse of the reg computation in mlx4_bf_alloc() */
+	bf->uar->free_bf_bmap &= ~(1 << idx);
+	if (!bf->uar->free_bf_bmap) {	/* last register of this page released: tear the page down */
+		if (!list_empty(&bf->uar->bf_list))
+			list_del(&bf->uar->bf_list);
+
+		io_mapping_unmap(bf->uar->bf_map);
+		iounmap(bf->uar->map);
+		mlx4_uar_free(dev, bf->uar);
+		kfree(bf->uar);	/* NOTE(review): bf->uar is left pointing at freed memory */
+	} else if (list_empty(&bf->uar->bf_list))
+		list_add(&bf->uar->bf_list, &priv->bf_list);	/* page was full and unlisted; offer it again */
+
+	mutex_unlock(&priv->bf_mutex);
+}
+EXPORT_SYMBOL_GPL(mlx4_bf_free);
+
+int mlx4_init_uar_table(struct mlx4_dev *dev)
+{
+	if (dev->caps.num_uars <= 128) {
+		mlx4_err(dev, "Only %d UAR pages (need more than 128)\n",
+			 dev->caps.num_uars);
+		mlx4_err(dev, "Increase firmware log2_uar_bar_megabytes?\n");
+		return -ENODEV;
+	}
+
+	return mlx4_bitmap_init(&mlx4_priv(dev)->uar_table.bitmap,
+				dev->caps.num_uars, dev->caps.num_uars - 1,
+				max(128, dev->caps.reserved_uars), 0);
+}
+
+void mlx4_cleanup_uar_table(struct mlx4_dev *dev)
+{
+	mlx4_bitmap_cleanup(&mlx4_priv(dev)->uar_table.bitmap);
+}
diff --git a/sys/ofed/drivers/net/mlx4/port.c b/sys/ofed/drivers/net/mlx4/port.c
new file mode 100644
index 0000000..c8df375
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/port.c
@@ -0,0 +1,354 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/if_ether.h>
+
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4.h"
+
+int mlx4_ib_set_4k_mtu = 0;
+module_param_named(set_4k_mtu, mlx4_ib_set_4k_mtu, int, 0444);
+MODULE_PARM_DESC(set_4k_mtu, "attempt to set 4K MTU to all ConnectX ports");
+
+#define MLX4_MAC_VALID		(1ull << 63)
+#define MLX4_MAC_MASK		0xffffffffffffULL
+
+#define MLX4_VLAN_VALID		(1u << 31)
+#define MLX4_VLAN_MASK		0xfff
+
+void mlx4_init_mac_table(struct mlx4_dev *dev, struct mlx4_mac_table *table)
+{
+	int i;
+
+	mutex_init(&table->mutex);
+	for (i = 0; i < MLX4_MAX_MAC_NUM; i++) {
+		table->entries[i] = 0;
+		table->refs[i]	 = 0;
+	}
+	table->max   = 1 << dev->caps.log_num_macs;
+	table->total = 0;
+}
+
+void mlx4_init_vlan_table(struct mlx4_dev *dev, struct mlx4_vlan_table *table)
+{
+	int i;
+
+	mutex_init(&table->mutex);
+	for (i = 0; i < MLX4_MAX_VLAN_NUM; i++) {
+		table->entries[i] = 0;
+		table->refs[i]	 = 0;
+	}
+	table->max   = 1 << dev->caps.log_num_vlans;
+	table->total = 0;
+}
+
+static int mlx4_set_port_mac_table(struct mlx4_dev *dev, u8 port,
+				   __be64 *entries)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 in_mod;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	memcpy(mailbox->buf, entries, MLX4_MAC_TABLE_SIZE);
+
+	in_mod = MLX4_SET_PORT_MAC_TABLE << 8 | port;
+	err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT,
+		       MLX4_CMD_TIME_CLASS_B);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+/* Register @mac on @port, or take another reference if already present; slot in *@index. */
+int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *index)
+{
+	struct mlx4_mac_table *table = &mlx4_priv(dev)->port[port].mac_table;
+	int i, err = 0, free = -1;
+
+	mlx4_dbg(dev, "Registering MAC: 0x%llx\n", (unsigned long long) mac);
+	mutex_lock(&table->mutex);
+	for (i = 0; i < MLX4_MAX_MAC_NUM - 1; i++) {
+		if (!table->refs[i]) {	/* unused slot: remember the first one, never match it */
+			if (free < 0)
+				free = i;
+			continue;
+		}
+		if (mac == (MLX4_MAC_MASK & be64_to_cpu(table->entries[i]))) {
+			/* MAC already registered, increase reference count */
+			*index = i;
+			++table->refs[i];
+			goto out;
+		}
+	}
+
+	if (free < 0) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	mlx4_dbg(dev, "Free MAC index is %d\n", free);
+
+	if (table->total == table->max) {
+		/* No free mac entries */
+		err = -ENOSPC;
+		goto out;
+	}
+
+	/* Register new MAC */
+	table->refs[free] = 1;
+	table->entries[free] = cpu_to_be64(mac | MLX4_MAC_VALID);
+
+	err = mlx4_set_port_mac_table(dev, port, table->entries);
+	if (unlikely(err)) {
+		mlx4_err(dev, "Failed adding MAC: 0x%llx\n", (unsigned long long) mac);
+		table->refs[free] = 0;
+		table->entries[free] = 0;
+		goto out;
+	}
+
+	*index = free;
+	++table->total;
+out:
+	mutex_unlock(&table->mutex);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_register_mac);
+
+void mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, int index)
+{
+	struct mlx4_mac_table *table = &mlx4_priv(dev)->port[port].mac_table;
+
+	mutex_lock(&table->mutex);
+	if (!table->refs[index]) {
+		mlx4_warn(dev, "No MAC entry for index %d\n", index);
+		goto out;
+	}
+	if (--table->refs[index]) {	/* other users remain: only the count changes */
+		mlx4_warn(dev, "Have more references for index %d, "
+			  "no need to modify MAC table\n", index);
+		goto out;
+	}
+	table->entries[index] = 0;	/* last reference dropped: clear the slot and push the table */
+	mlx4_set_port_mac_table(dev, port, table->entries);	/* NOTE(review): result is ignored */
+	--table->total;
+out:
+	mutex_unlock(&table->mutex);
+}
+EXPORT_SYMBOL_GPL(mlx4_unregister_mac);
+
+static int mlx4_set_port_vlan_table(struct mlx4_dev *dev, u8 port,
+				    __be32 *entries)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 in_mod;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	memcpy(mailbox->buf, entries, MLX4_VLAN_TABLE_SIZE);
+	in_mod = MLX4_SET_PORT_VLAN_TABLE << 8 | port;
+	err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT,
+		       MLX4_CMD_TIME_CLASS_B);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	return err;
+}
+
+int mlx4_find_cached_vlan(struct mlx4_dev *dev, u8 port, u16 vid, int *idx)
+{
+	struct mlx4_vlan_table *table = &mlx4_priv(dev)->port[port].vlan_table;
+	int i;
+
+	for (i = 0; i < MLX4_MAX_VLAN_NUM; ++i) {	/* NOTE(review): table->mutex is not taken here */
+		if (table->refs[i] &&
+		    (vid == (MLX4_VLAN_MASK &
+			      be32_to_cpu(table->entries[i])))) {
+			/* Vlan is registered: report its index, reference count unchanged */
+			*idx = i;
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+EXPORT_SYMBOL_GPL(mlx4_find_cached_vlan);
+
+int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index)
+{
+	struct mlx4_vlan_table *table = &mlx4_priv(dev)->port[port].vlan_table;
+	int i, err = 0;
+	int free = -1;
+
+	mutex_lock(&table->mutex);
+	for (i = MLX4_VLAN_REGULAR; i < MLX4_MAX_VLAN_NUM; i++) {	/* slots below MLX4_VLAN_REGULAR are skipped */
+		if (free < 0 && (table->refs[i] == 0)) {
+			free = i;
+			continue;
+		}
+
+		if (table->refs[i] &&
+		    (vlan == (MLX4_VLAN_MASK &
+			      be32_to_cpu(table->entries[i])))) {
+			/* Vlan already registered, increase reference count */
+			*index = i;
+			++table->refs[i];
+			goto out;
+		}
+	}
+
+	if (free < 0) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	if (table->total == table->max) {
+		/* No free vlan entries */
+		err = -ENOSPC;
+		goto out;
+	}
+
+	/* Register new vlan */
+	table->refs[free] = 1;
+	table->entries[free] = cpu_to_be32(vlan | MLX4_VLAN_VALID);
+
+	err = mlx4_set_port_vlan_table(dev, port, table->entries);
+	if (unlikely(err)) {
+		mlx4_warn(dev, "Failed adding vlan: %u\n", vlan);
+		table->refs[free] = 0;	/* roll the cached slot back; the table was not accepted */
+		table->entries[free] = 0;
+		goto out;
+	}
+
+	*index = free;
+	++table->total;
+out:
+	mutex_unlock(&table->mutex);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_register_vlan);
+
+void mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, int index)
+{
+	struct mlx4_vlan_table *table = &mlx4_priv(dev)->port[port].vlan_table;
+
+	if (index < MLX4_VLAN_REGULAR) {	/* mlx4_register_vlan() never hands these slots out */
+		mlx4_warn(dev, "Trying to free special vlan index %d\n", index);
+		return;
+	}
+
+	mutex_lock(&table->mutex);
+	if (!table->refs[index]) {
+		mlx4_warn(dev, "No vlan entry for index %d\n", index);
+		goto out;
+	}
+	if (--table->refs[index]) {	/* other users remain: only the count changes */
+		mlx4_dbg(dev, "Have more references for index %d, "
+			 "no need to modify vlan table\n", index);
+		goto out;
+	}
+	table->entries[index] = 0;	/* last reference dropped: clear the slot and push the table */
+	mlx4_set_port_vlan_table(dev, port, table->entries);	/* NOTE(review): result is ignored */
+	--table->total;
+out:
+	mutex_unlock(&table->mutex);
+}
+EXPORT_SYMBOL_GPL(mlx4_unregister_vlan);
+
+int mlx4_get_port_ib_caps(struct mlx4_dev *dev, u8 port, __be32 *caps)
+{
+	struct mlx4_cmd_mailbox *inmailbox, *outmailbox;
+	u8 *inbuf, *outbuf;
+	int err;
+
+	inmailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(inmailbox))
+		return PTR_ERR(inmailbox);
+
+	outmailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(outmailbox)) {
+		mlx4_free_cmd_mailbox(dev, inmailbox);
+		return PTR_ERR(outmailbox);
+	}
+
+	inbuf = inmailbox->buf;
+	outbuf = outmailbox->buf;
+	memset(inbuf, 0, 256);	/* only the first 256 bytes of each mailbox are cleared */
+	memset(outbuf, 0, 256);
+	inbuf[0] = 1;	/* bytes 0-3 -- presumably the leading MAD header fields; TODO confirm */
+	inbuf[1] = 1;
+	inbuf[2] = 1;
+	inbuf[3] = 1;
+	*(__be16 *) (&inbuf[16]) = cpu_to_be16(0x0015);	/* presumably the PortInfo attribute ID; TODO confirm */
+	*(__be32 *) (&inbuf[20]) = cpu_to_be32(port);
+
+	err = mlx4_cmd_box(dev, inmailbox->dma, outmailbox->dma, port, 3,
+			   MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C);
+	if (!err)
+		*caps = *(__be32 *) (outbuf + 84);	/* big-endian word at byte 84; *caps untouched on error */
+	mlx4_free_cmd_mailbox(dev, inmailbox);
+	mlx4_free_cmd_mailbox(dev, outmailbox);
+	return err;
+}
+
+int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	int err;
+
+	if (dev->caps.port_type[port] != MLX4_PORT_TYPE_IB)
+		return 0;	/* only IB ports are configured by this function */
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	memset(mailbox->buf, 0, 256);
+	/* set_4k_mtu module parameter: NOTE(review) bit layout is not documented here; 5 is presumably the 4K MTU code */
+	if (mlx4_ib_set_4k_mtu)
+		((__be32 *) mailbox->buf)[0] |= cpu_to_be32((1 << 22) | (1 << 21) | (5 << 12) | (2 << 4));
+
+	((__be32 *) mailbox->buf)[1] = dev->caps.ib_port_def_cap[port];	/* stored without a byte swap */
+	err = mlx4_cmd(dev, mailbox->dma, port, 0, MLX4_CMD_SET_PORT,
+		       MLX4_CMD_TIME_CLASS_B);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
diff --git a/sys/ofed/drivers/net/mlx4/profile.c b/sys/ofed/drivers/net/mlx4/profile.c
new file mode 100644
index 0000000..bd22df9
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/profile.c
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+
+#include "mlx4.h"
+#include "fw.h"
+
+enum {
+	MLX4_RES_QP,
+	MLX4_RES_RDMARC,
+	MLX4_RES_ALTC,
+	MLX4_RES_AUXC,
+	MLX4_RES_SRQ,
+	MLX4_RES_CQ,
+	MLX4_RES_EQ,
+	MLX4_RES_DMPT,
+	MLX4_RES_CMPT,
+	MLX4_RES_MTT,
+	MLX4_RES_MCG,
+	MLX4_RES_NUM
+};
+
+static const char *res_name[] = {
+	[MLX4_RES_QP]		= "QP",
+	[MLX4_RES_RDMARC]	= "RDMARC",
+	[MLX4_RES_ALTC]		= "ALTC",
+	[MLX4_RES_AUXC]		= "AUXC",
+	[MLX4_RES_SRQ]		= "SRQ",
+	[MLX4_RES_CQ]		= "CQ",
+	[MLX4_RES_EQ]		= "EQ",
+	[MLX4_RES_DMPT]		= "DMPT",
+	[MLX4_RES_CMPT]		= "CMPT",
+	[MLX4_RES_MTT]		= "MTT",
+	[MLX4_RES_MCG]		= "MCG",
+};
+
+u64 mlx4_make_profile(struct mlx4_dev *dev,
+		      struct mlx4_profile *request,
+		      struct mlx4_dev_cap *dev_cap,
+		      struct mlx4_init_hca_param *init_hca)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource {
+		u64 size;
+		u64 start;
+		int type;
+		int num;
+		int log_num;
+	};
+
+	u64 total_size = 0;
+	struct mlx4_resource *profile;
+	struct mlx4_resource tmp;
+	int i, j;
+
+	profile = kzalloc(MLX4_RES_NUM * sizeof *profile, GFP_KERNEL);
+	if (!profile)
+		return -ENOMEM;	/* NOTE(review): negative errno returned through a u64 */
+
+	profile[MLX4_RES_QP].size     = dev_cap->qpc_entry_sz;
+	profile[MLX4_RES_RDMARC].size = dev_cap->rdmarc_entry_sz;
+	profile[MLX4_RES_ALTC].size   = dev_cap->altc_entry_sz;
+	profile[MLX4_RES_AUXC].size   = dev_cap->aux_entry_sz;
+	profile[MLX4_RES_SRQ].size    = dev_cap->srq_entry_sz;
+	profile[MLX4_RES_CQ].size     = dev_cap->cqc_entry_sz;
+	profile[MLX4_RES_EQ].size     = dev_cap->eqc_entry_sz;
+	profile[MLX4_RES_DMPT].size   = dev_cap->dmpt_entry_sz;
+	profile[MLX4_RES_CMPT].size   = dev_cap->cmpt_entry_sz;
+	profile[MLX4_RES_MTT].size    = dev->caps.mtts_per_seg * dev_cap->mtt_entry_sz;
+	profile[MLX4_RES_MCG].size    = MLX4_MGM_ENTRY_SIZE;
+
+	profile[MLX4_RES_QP].num      = request->num_qp;
+	profile[MLX4_RES_RDMARC].num  = request->num_qp * request->rdmarc_per_qp;
+	profile[MLX4_RES_ALTC].num    = request->num_qp;
+	profile[MLX4_RES_AUXC].num    = request->num_qp;
+	profile[MLX4_RES_SRQ].num     = request->num_srq;
+	profile[MLX4_RES_CQ].num      = request->num_cq;
+	profile[MLX4_RES_EQ].num      = min_t(unsigned, dev_cap->max_eqs,
+					      dev_cap->reserved_eqs +
+					      num_possible_cpus() + 1);
+	profile[MLX4_RES_DMPT].num    = request->num_mpt;
+	profile[MLX4_RES_CMPT].num    = MLX4_NUM_CMPTS;
+	profile[MLX4_RES_MTT].num     = request->num_mtt;
+	profile[MLX4_RES_MCG].num     = request->num_mcg;
+
+	for (i = 0; i < MLX4_RES_NUM; ++i) {
+		profile[i].type     = i;
+		profile[i].num      = roundup_pow_of_two(profile[i].num);
+		profile[i].log_num  = ilog2(profile[i].num);
+		profile[i].size    *= profile[i].num;	/* size now means total bytes, not per-entry */
+		profile[i].size     = max(profile[i].size, (u64) PAGE_SIZE);	/* at least one page each */
+	}
+
+	/*
+	 * Sort the resources in decreasing order of size.  Since they
+	 * all have sizes that are powers of 2, we'll be able to keep
+	 * resources aligned to their size and pack them without gaps
+	 * using the sorted order.
+	 */
+	for (i = MLX4_RES_NUM; i > 0; --i)	/* bubble sort over MLX4_RES_NUM entries */
+		for (j = 1; j < i; ++j) {
+			if (profile[j].size > profile[j - 1].size) {
+				tmp	       = profile[j];
+				profile[j]     = profile[j - 1];
+				profile[j - 1] = tmp;
+			}
+		}
+
+	for (i = 0; i < MLX4_RES_NUM; ++i) {
+		if (profile[i].size) {
+			profile[i].start = total_size;
+			total_size	+= profile[i].size;
+		}
+
+		if (total_size > dev_cap->max_icm_sz) {
+			mlx4_err(dev, "Profile requires 0x%llx bytes; "
+				  "won't fit in 0x%llx bytes of context memory.\n",
+				  (unsigned long long) total_size,
+				  (unsigned long long) dev_cap->max_icm_sz);
+			kfree(profile);
+			return -ENOMEM;	/* NOTE(review): negative errno returned through a u64 */
+		}
+
+		if (profile[i].size)
+			mlx4_dbg(dev, "  profile[%2d] (%6s): 2^%02d entries @ 0x%10llx, "
+				  "size 0x%10llx\n",
+				 i, res_name[profile[i].type], profile[i].log_num,
+				 (unsigned long long) profile[i].start,
+				 (unsigned long long) profile[i].size);
+	}
+
+	mlx4_dbg(dev, "HCA context memory: reserving %d KB\n",
+		 (int) (total_size >> 10));
+
+	for (i = 0; i < MLX4_RES_NUM; ++i) {	/* profile[] is sorted now, so dispatch on .type, not on i */
+		switch (profile[i].type) {
+		case MLX4_RES_QP:
+			dev->caps.num_qps     = profile[i].num;
+			init_hca->qpc_base    = profile[i].start;
+			init_hca->log_num_qps = profile[i].log_num;
+			break;
+		case MLX4_RES_RDMARC:
+			for (priv->qp_table.rdmarc_shift = 0;
+			     request->num_qp << priv->qp_table.rdmarc_shift < profile[i].num;
+			     ++priv->qp_table.rdmarc_shift)
+				; /* nothing */
+			dev->caps.max_qp_dest_rdma = 1 << priv->qp_table.rdmarc_shift;
+			priv->qp_table.rdmarc_base   = (u32) profile[i].start;
+			init_hca->rdmarc_base	     = profile[i].start;
+			init_hca->log_rd_per_qp	     = priv->qp_table.rdmarc_shift;
+			break;
+		case MLX4_RES_ALTC:
+			init_hca->altc_base = profile[i].start;
+			break;
+		case MLX4_RES_AUXC:
+			init_hca->auxc_base = profile[i].start;
+			break;
+		case MLX4_RES_SRQ:
+			dev->caps.num_srqs     = profile[i].num;
+			init_hca->srqc_base    = profile[i].start;
+			init_hca->log_num_srqs = profile[i].log_num;
+			break;
+		case MLX4_RES_CQ:
+			dev->caps.num_cqs     = profile[i].num;
+			init_hca->cqc_base    = profile[i].start;
+			init_hca->log_num_cqs = profile[i].log_num;
+			break;
+		case MLX4_RES_EQ:
+			dev->caps.num_eqs     = profile[i].num;
+			init_hca->eqc_base    = profile[i].start;
+			init_hca->log_num_eqs = profile[i].log_num;
+			break;
+		case MLX4_RES_DMPT:
+			dev->caps.num_mpts	= profile[i].num;
+			priv->mr_table.mpt_base = profile[i].start;
+			init_hca->dmpt_base	= profile[i].start;
+			init_hca->log_mpt_sz	= profile[i].log_num;
+			break;
+		case MLX4_RES_CMPT:
+			init_hca->cmpt_base	 = profile[i].start;
+			break;
+		case MLX4_RES_MTT:
+			dev->caps.num_mtt_segs	 = profile[i].num;
+			priv->mr_table.mtt_base	 = profile[i].start;
+			init_hca->mtt_base	 = profile[i].start;
+			break;
+		case MLX4_RES_MCG:
+			dev->caps.num_mgms	  = profile[i].num >> 1;	/* half the entries each for MGMs and AMGMs */
+			dev->caps.num_amgms	  = profile[i].num >> 1;
+			init_hca->mc_base	  = profile[i].start;
+			init_hca->log_mc_entry_sz = ilog2(MLX4_MGM_ENTRY_SIZE);
+			init_hca->log_mc_table_sz = profile[i].log_num;
+			init_hca->log_mc_hash_sz  = profile[i].log_num - 1;
+			break;
+		default:
+			break;
+		}
+	}
+
+	/*
+	 * PDs don't take any HCA memory, but we assign them as part
+	 * of the HCA profile anyway.
+	 */
+	dev->caps.num_pds = MLX4_NUM_PDS;
+
+	kfree(profile);
+	return total_size;
+}
diff --git a/sys/ofed/drivers/net/mlx4/qp.c b/sys/ofed/drivers/net/mlx4/qp.c
new file mode 100644
index 0000000..bf1c117
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/qp.c
@@ -0,0 +1,410 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+
+#include <linux/mlx4/cmd.h>
+#include <linux/mlx4/qp.h>
+
+#include "mlx4.h"
+#include "icm.h"
+
+/*
+ * Deliver an asynchronous event to the QP identified by qpn.
+ *
+ * The QP is looked up under qp_table->lock and pinned with an extra
+ * reference so that it stays valid while its event callback runs.  The
+ * reference is dropped afterwards and qp->free is completed if it was
+ * the last one.  An unknown qpn only produces a warning.
+ */
+void mlx4_qp_event(struct mlx4_dev *dev, u32 qpn, int event_type)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_qp *target;
+
+	spin_lock(&priv->qp_table.lock);
+	target = __mlx4_qp_lookup(dev, qpn);
+	if (target != NULL)
+		atomic_inc(&target->refcount);
+	spin_unlock(&priv->qp_table.lock);
+
+	if (target == NULL) {
+		mlx4_warn(dev, "Async event for bogus QP %08x\n", qpn);
+		return;
+	}
+
+	target->event(target, event_type);
+
+	/* Release the reference taken under the lock above. */
+	if (atomic_dec_and_test(&target->refcount))
+		complete(&target->free);
+}
+
+/*
+ * Move a QP from cur_state to new_state by issuing the matching firmware
+ * command.
+ *
+ * The op[][] table maps a (current, new) state pair to a command opcode;
+ * pairs left at zero are not supported and yield -EINVAL, as do states
+ * outside the table.  Any transition that maps to MLX4_CMD_2RST_QP is
+ * issued without a mailbox.  Every other transition copies optpar (big
+ * endian, at offset 0) and *context (at offset 8) into a command mailbox,
+ * overwrites the copy's local_qpn with qp->qpn and passes the mailbox to
+ * the command.
+ *
+ * For RST -> INIT the caller's *context is updated with the MTT base
+ * address and page-size log taken from mtt before it is copied.
+ *
+ * Returns 0 on success, -EINVAL for an unsupported transition, or the
+ * error from mailbox allocation / command execution.
+ */
+int mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+		   enum mlx4_qp_state cur_state, enum mlx4_qp_state new_state,
+		   struct mlx4_qp_context *context, enum mlx4_qp_optpar optpar,
+		   int sqd_event, struct mlx4_qp *qp)
+{
+	static const u16 op[MLX4_QP_NUM_STATE][MLX4_QP_NUM_STATE] = {
+		[MLX4_QP_STATE_RST] = {
+			[MLX4_QP_STATE_RST]	= MLX4_CMD_2RST_QP,
+			[MLX4_QP_STATE_ERR]	= MLX4_CMD_2ERR_QP,
+			[MLX4_QP_STATE_INIT]	= MLX4_CMD_RST2INIT_QP,
+		},
+		[MLX4_QP_STATE_INIT]  = {
+			[MLX4_QP_STATE_RST]	= MLX4_CMD_2RST_QP,
+			[MLX4_QP_STATE_ERR]	= MLX4_CMD_2ERR_QP,
+			[MLX4_QP_STATE_INIT]	= MLX4_CMD_INIT2INIT_QP,
+			[MLX4_QP_STATE_RTR]	= MLX4_CMD_INIT2RTR_QP,
+		},
+		[MLX4_QP_STATE_RTR]   = {
+			[MLX4_QP_STATE_RST]	= MLX4_CMD_2RST_QP,
+			[MLX4_QP_STATE_ERR]	= MLX4_CMD_2ERR_QP,
+			[MLX4_QP_STATE_RTS]	= MLX4_CMD_RTR2RTS_QP,
+		},
+		[MLX4_QP_STATE_RTS]   = {
+			[MLX4_QP_STATE_RST]	= MLX4_CMD_2RST_QP,
+			[MLX4_QP_STATE_ERR]	= MLX4_CMD_2ERR_QP,
+			[MLX4_QP_STATE_RTS]	= MLX4_CMD_RTS2RTS_QP,
+			[MLX4_QP_STATE_SQD]	= MLX4_CMD_RTS2SQD_QP,
+		},
+		[MLX4_QP_STATE_SQD] = {
+			[MLX4_QP_STATE_RST]	= MLX4_CMD_2RST_QP,
+			[MLX4_QP_STATE_ERR]	= MLX4_CMD_2ERR_QP,
+			[MLX4_QP_STATE_RTS]	= MLX4_CMD_SQD2RTS_QP,
+			[MLX4_QP_STATE_SQD]	= MLX4_CMD_SQD2SQD_QP,
+		},
+		[MLX4_QP_STATE_SQER] = {
+			[MLX4_QP_STATE_RST]	= MLX4_CMD_2RST_QP,
+			[MLX4_QP_STATE_ERR]	= MLX4_CMD_2ERR_QP,
+			[MLX4_QP_STATE_RTS]	= MLX4_CMD_SQERR2RTS_QP,
+		},
+		[MLX4_QP_STATE_ERR] = {
+			[MLX4_QP_STATE_RST]	= MLX4_CMD_2RST_QP,
+			[MLX4_QP_STATE_ERR]	= MLX4_CMD_2ERR_QP,
+		}
+	};
+
+	struct mlx4_cmd_mailbox *mailbox;
+	int ret = 0;
+
+	/* A zero table entry marks a transition that is not supported. */
+	if (cur_state >= MLX4_QP_NUM_STATE || new_state >= MLX4_QP_NUM_STATE ||
+	    !op[cur_state][new_state])
+		return -EINVAL;
+
+	/* Going to reset carries no context, so no mailbox is needed. */
+	if (op[cur_state][new_state] == MLX4_CMD_2RST_QP)
+		return mlx4_cmd(dev, 0, qp->qpn, 2,
+				MLX4_CMD_2RST_QP, MLX4_CMD_TIME_CLASS_A);
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	if (cur_state == MLX4_QP_STATE_RST && new_state == MLX4_QP_STATE_INIT) {
+		u64 mtt_addr = mlx4_mtt_addr(dev, mtt);
+		context->mtt_base_addr_h = mtt_addr >> 32;
+		context->mtt_base_addr_l = cpu_to_be32(mtt_addr & 0xffffffff);
+		context->log_page_size   = mtt->page_shift - MLX4_ICM_PAGE_SHIFT;
+	}
+
+	/* Mailbox layout: optpar mask at offset 0, QP context at offset 8. */
+	*(__be32 *) mailbox->buf = cpu_to_be32(optpar);
+	memcpy(mailbox->buf + 8, context, sizeof *context);
+
+	((struct mlx4_qp_context *) (mailbox->buf + 8))->local_qpn =
+		cpu_to_be32(qp->qpn);
+
+	/*
+	 * Bit 31 of the QP number argument carries the sqd_event request.
+	 * The shift is done in u32: shifting a signed 1 into bit 31 is
+	 * undefined behaviour in C.
+	 */
+	ret = mlx4_cmd(dev, mailbox->dma,
+		       qp->qpn | ((u32) !!sqd_event << 31),
+		       new_state == MLX4_QP_STATE_RST ? 2 : 0,
+		       op[cur_state][new_state], MLX4_CMD_TIME_CLASS_C);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_modify);
+
+/*
+ * Reserve a run of cnt QP numbers from the QP bitmap with the requested
+ * alignment.
+ *
+ * On success the first number of the run is stored in *base and 0 is
+ * returned.  When the bitmap allocator reports -1, -ENOMEM is returned
+ * and *base is left untouched.
+ */
+int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, int *base)
+{
+	struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
+	int first = mlx4_bitmap_alloc_range(&qp_table->bitmap, cnt, align);
+
+	if (first == -1)
+		return -ENOMEM;
+
+	*base = first;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_reserve_range);
+
+/*
+ * Return a run of cnt QP numbers starting at base_qpn to the QP bitmap.
+ *
+ * Ranges starting below sqp_start + 8 are silently ignored; that bound is
+ * the same value mlx4_init_qp_table() passes to mlx4_bitmap_init().
+ */
+void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt)
+{
+	struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
+
+	if (base_qpn >= dev->caps.sqp_start + 8)
+		mlx4_bitmap_free_range(&qp_table->bitmap, base_qpn, cnt);
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_release_range);
+
+/*
+ * Set up driver state for QP number qpn in the caller-provided qp.
+ *
+ * Takes a reference on the entry for this QP number in the qp, auxc,
+ * altc, rdmarc and cmpt tables, inserts qp into the device radix tree
+ * (keyed by qpn masked with num_qps - 1) and then initialises the QP's
+ * reference count and completion.
+ *
+ * Returns 0 on success, -EINVAL when qpn is 0, or the error of the step
+ * that failed; on failure all table references taken so far are dropped.
+ */
+int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp)
+{
+	struct mlx4_qp_table *tbl = &mlx4_priv(dev)->qp_table;
+	int rc;
+
+	if (qpn == 0)
+		return -EINVAL;
+
+	qp->qpn = qpn;
+
+	rc = mlx4_table_get(dev, &tbl->qp_table, qp->qpn);
+	if (rc)
+		return rc;
+
+	rc = mlx4_table_get(dev, &tbl->auxc_table, qp->qpn);
+	if (rc)
+		goto undo_qp;
+
+	rc = mlx4_table_get(dev, &tbl->altc_table, qp->qpn);
+	if (rc)
+		goto undo_auxc;
+
+	rc = mlx4_table_get(dev, &tbl->rdmarc_table, qp->qpn);
+	if (rc)
+		goto undo_altc;
+
+	rc = mlx4_table_get(dev, &tbl->cmpt_table, qp->qpn);
+	if (rc)
+		goto undo_rdmarc;
+
+	spin_lock_irq(&tbl->lock);
+	rc = radix_tree_insert(&dev->qp_table_tree,
+			       qp->qpn & (dev->caps.num_qps - 1), qp);
+	spin_unlock_irq(&tbl->lock);
+	if (rc)
+		goto undo_cmpt;
+
+	atomic_set(&qp->refcount, 1);
+	init_completion(&qp->free);
+
+	return 0;
+
+	/* Unwind in the reverse order of acquisition. */
+undo_cmpt:
+	mlx4_table_put(dev, &tbl->cmpt_table, qp->qpn);
+undo_rdmarc:
+	mlx4_table_put(dev, &tbl->rdmarc_table, qp->qpn);
+undo_altc:
+	mlx4_table_put(dev, &tbl->altc_table, qp->qpn);
+undo_auxc:
+	mlx4_table_put(dev, &tbl->auxc_table, qp->qpn);
+undo_qp:
+	mlx4_table_put(dev, &tbl->qp_table, qp->qpn);
+
+	return rc;
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_alloc);
+
+/*
+ * Look up the QP stored under qpn (masked with num_qps - 1) in the
+ * device radix tree, holding qp_table->lock with interrupt state saved
+ * for the duration of the lookup.
+ *
+ * Returns whatever radix_tree_lookup() yields; no reference is taken on
+ * the QP.
+ */
+struct mlx4_qp *mlx4_qp_lookup_lock(struct mlx4_dev *dev, u32 qpn)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_qp *found;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&priv->qp_table.lock, irq_flags);
+	found = radix_tree_lookup(&dev->qp_table_tree,
+				  qpn & (dev->caps.num_qps - 1));
+	spin_unlock_irqrestore(&priv->qp_table.lock, irq_flags);
+
+	return found;
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_lookup_lock);
+
+/*
+ * Remove qp from the device radix tree (key: qpn masked with
+ * num_qps - 1) under qp_table->lock.  The table references and the QP's
+ * reference count are not touched here.
+ */
+void mlx4_qp_remove(struct mlx4_dev *dev, struct mlx4_qp *qp)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&priv->qp_table.lock, irq_flags);
+	radix_tree_delete(&dev->qp_table_tree,
+			  qp->qpn & (dev->caps.num_qps - 1));
+	spin_unlock_irqrestore(&priv->qp_table.lock, irq_flags);
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_remove);
+
+/*
+ * Drop the caller's reference on qp, wait until qp->free has been
+ * completed (i.e. the last reference is gone), then release the five
+ * table references taken by mlx4_qp_alloc(), in reverse order.
+ */
+void mlx4_qp_free(struct mlx4_dev *dev, struct mlx4_qp *qp)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_qp_table *tbl = &priv->qp_table;
+
+	/* If ours was the last reference, signal the completion ourselves. */
+	if (atomic_dec_and_test(&qp->refcount))
+		complete(&qp->free);
+
+	wait_for_completion(&qp->free);
+
+	mlx4_table_put(dev, &tbl->cmpt_table, qp->qpn);
+	mlx4_table_put(dev, &tbl->rdmarc_table, qp->qpn);
+	mlx4_table_put(dev, &tbl->altc_table, qp->qpn);
+	mlx4_table_put(dev, &tbl->auxc_table, qp->qpn);
+	mlx4_table_put(dev, &tbl->qp_table, qp->qpn);
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_free);
+
+/*
+ * Issue the CONF_SPECIAL_QP command with base_qpn.  The fourth mlx4_cmd()
+ * argument is 4 when the device advertises MLX4_DEV_CAP_FLAG_RAW_ETY and
+ * 0 otherwise.  Returns the command's result.
+ */
+static int mlx4_CONF_SPECIAL_QP(struct mlx4_dev *dev, u32 base_qpn)
+{
+	int op_mod = 0;
+
+	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_RAW_ETY)
+		op_mod = 4;
+
+	return mlx4_cmd(dev, 0, base_qpn, op_mod,
+			MLX4_CMD_CONF_SPECIAL_QP, MLX4_CMD_TIME_CLASS_B);
+}
+
+/*
+ * Initialise the QP table: lock, radix tree, reserved-region layout, QP
+ * number bitmap, and the special-QP base in firmware.
+ *
+ * Regions 1 .. MLX4_NUM_QP_REGION - 1 are carved from the top of the QP
+ * number space, largest reservation first; their bases are recorded in
+ * caps.reserved_qps_base[].  Index 0 is not placed here.
+ *
+ * Returns 0 on success or the error from mlx4_bitmap_init() /
+ * CONF_SPECIAL_QP.  If the command fails after the bitmap was set up, the
+ * bitmap is released again so a failed init leaves nothing allocated.
+ */
+int mlx4_init_qp_table(struct mlx4_dev *dev)
+{
+	struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
+	int err;
+	int reserved_from_top = 0;
+
+	spin_lock_init(&qp_table->lock);
+	INIT_RADIX_TREE(&dev->qp_table_tree, GFP_ATOMIC);
+
+	/*
+	 * We reserve 2 extra QPs per port for the special QPs.  The
+	 * block of special QPs must be aligned to a multiple of 8, so
+	 * round up.
+	 * We also reserve the MSB of the 24-bit QP number to indicate
+	 * an XRC qp.
+	 */
+	dev->caps.sqp_start =
+		ALIGN(dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW], 8);
+
+	{
+		int sort[MLX4_NUM_QP_REGION];
+		int i, j, tmp;
+		int last_base = dev->caps.num_qps;
+
+		/* sort[0] is never read; only regions 1.. are ordered. */
+		for (i = 1; i < MLX4_NUM_QP_REGION; ++i)
+			sort[i] = i;
+
+		/* Bubble sort into descending order of reserved count. */
+		for (i = MLX4_NUM_QP_REGION; i > 0; --i) {
+			for (j = 2; j < i; ++j) {
+				if (dev->caps.reserved_qps_cnt[sort[j]] >
+				    dev->caps.reserved_qps_cnt[sort[j - 1]]) {
+					tmp             = sort[j];
+					sort[j]         = sort[j - 1];
+					sort[j - 1]     = tmp;
+				}
+			}
+		}
+
+		/* Hand out bases downwards from num_qps in sorted order. */
+		for (i = 1; i < MLX4_NUM_QP_REGION; ++i) {
+			last_base -= dev->caps.reserved_qps_cnt[sort[i]];
+			dev->caps.reserved_qps_base[sort[i]] = last_base;
+			reserved_from_top +=
+				dev->caps.reserved_qps_cnt[sort[i]];
+		}
+
+	}
+
+	err = mlx4_bitmap_init(&qp_table->bitmap, dev->caps.num_qps,
+			       (1 << 23) - 1, dev->caps.sqp_start + 8,
+			       reserved_from_top);
+	if (err)
+		return err;
+
+	err = mlx4_CONF_SPECIAL_QP(dev, dev->caps.sqp_start);
+	if (err)
+		/* Do not leak the bitmap when the firmware command fails. */
+		mlx4_bitmap_cleanup(&qp_table->bitmap);
+
+	return err;
+}
+
+/*
+ * Tear down the QP table: issue CONF_SPECIAL_QP with a base of 0 (the
+ * result is ignored) and release the QP number bitmap.
+ */
+void mlx4_cleanup_qp_table(struct mlx4_dev *dev)
+{
+	struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
+
+	mlx4_CONF_SPECIAL_QP(dev, 0);
+	mlx4_bitmap_cleanup(&qp_table->bitmap);
+}
+
+/*
+ * Report the base QP number and size of a reserved QP region, as stored
+ * in caps.reserved_qps_base[] / caps.reserved_qps_cnt[].
+ *
+ * Returns 0 and fills *base_qpn and *cnt, or -EINVAL (outputs untouched)
+ * when region is outside [0, MLX4_NUM_QP_REGION).
+ */
+int mlx4_qp_get_region(struct mlx4_dev *dev, enum mlx4_qp_region region,
+			int *base_qpn, int *cnt)
+{
+	if ((region >= 0) && (region < MLX4_NUM_QP_REGION)) {
+		*base_qpn = dev->caps.reserved_qps_base[region];
+		*cnt = dev->caps.reserved_qps_cnt[region];
+		return 0;
+	}
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_get_region);
+
+/*
+ * Read the device's context for qp via the QUERY_QP command.
+ *
+ * On success, sizeof(*context) bytes starting at offset 8 of the command
+ * mailbox are copied to *context; on failure *context is left unchanged.
+ * Returns 0 or the error from mailbox allocation / command execution.
+ */
+int mlx4_qp_query(struct mlx4_dev *dev, struct mlx4_qp *qp,
+		  struct mlx4_qp_context *context)
+{
+	struct mlx4_cmd_mailbox *mbox;
+	int rc;
+
+	mbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mbox))
+		return PTR_ERR(mbox);
+
+	rc = mlx4_cmd_box(dev, 0, mbox->dma, qp->qpn, 0,
+			  MLX4_CMD_QUERY_QP, MLX4_CMD_TIME_CLASS_A);
+	if (rc == 0)
+		memcpy(context, mbox->buf + 8, sizeof(*context));
+
+	mlx4_free_cmd_mailbox(dev, mbox);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_query);
+
+/*
+ * Walk a QP through RST -> INIT -> RTR -> RTS.
+ *
+ * Before each step, bits 28..31 of context->flags are replaced with the
+ * target state and mlx4_qp_modify() is called with no optional parameters
+ * and no SQD event.  *qp_state is updated after every successful step, so
+ * on failure it holds the last state that was reached (it is not written
+ * at all if the first step fails).
+ *
+ * Returns 0 on success or the error from the failing mlx4_qp_modify().
+ */
+int mlx4_qp_to_ready(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+		     struct mlx4_qp_context *context,
+		     struct mlx4_qp *qp, enum mlx4_qp_state *qp_state)
+{
+	int err;
+	int i;
+	enum mlx4_qp_state states[] = {
+		MLX4_QP_STATE_RST,
+		MLX4_QP_STATE_INIT,
+		MLX4_QP_STATE_RTR,
+		MLX4_QP_STATE_RTS
+	};
+
+	for (i = 0; i < ARRAY_SIZE(states) - 1; i++) {
+		/*
+		 * Unsigned constant: 0xf << 28 does not fit in a signed int,
+		 * which makes the signed shift undefined behaviour.
+		 */
+		context->flags &= cpu_to_be32(~(0xfU << 28));
+		context->flags |= cpu_to_be32(states[i + 1] << 28);
+		err = mlx4_qp_modify(dev, mtt, states[i], states[i + 1],
+				     context, 0, 0, qp);
+		if (err) {
+			mlx4_err(dev, "Failed to bring QP to state: "
+				 "%d with error: %d\n",
+				 states[i + 1], err);
+			return err;
+		}
+
+		*qp_state = states[i + 1];
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_to_ready);
diff --git a/sys/ofed/drivers/net/mlx4/reset.c b/sys/ofed/drivers/net/mlx4/reset.c
new file mode 100644
index 0000000..3951b88
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/reset.c
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/jiffies.h>
+
+#include "mlx4.h"
+
+/*
+ * Reset the HCA and restore its PCI configuration afterwards.
+ *
+ * Sequence: save the first 256 bytes of config space (dwords 22 and 23
+ * excepted), wait up to MLX4_SEM_TIMEOUT_JIFFIES for the semaphore word
+ * at MLX4_SEM_OFFSET to read as zero, write MLX4_RESET_VALUE to the reset
+ * register, sleep one second, poll the vendor ID for up to
+ * MLX4_RESET_TIMEOUT_JIFFIES until it is no longer 0xffff, then write
+ * back the PCI Express device/link control words (when the capability
+ * exists) and the first 16 config dwords, PCI_COMMAND last.
+ *
+ * Returns 0 on success, -ENOMEM if the save buffer or the register
+ * mapping cannot be obtained, -EAGAIN if the semaphore is never obtained,
+ * and -ENODEV if a config-space access fails or the device does not come
+ * back after the reset.
+ */
+int mlx4_reset(struct mlx4_dev *dev)
+{
+	void __iomem *reset;
+	u32 *hca_header = NULL;
+	int pcie_cap;
+	u16 devctl;
+	u16 linkctl;
+	/*
+	 * Start out as "device absent" so the check after the poll loop is
+	 * well defined even if no config read ever stores a value.
+	 */
+	u16 vendor = 0xffff;
+	unsigned long end;
+	u32 sem;
+	int i;
+	int err = 0;
+
+#define MLX4_RESET_BASE		0xf0000
+#define MLX4_RESET_SIZE		  0x400
+#define MLX4_SEM_OFFSET		  0x3fc
+#define MLX4_RESET_OFFSET	   0x10
+#define MLX4_RESET_VALUE	swab32(1)
+
+#define MLX4_SEM_TIMEOUT_JIFFIES	(10 * HZ)
+#define MLX4_RESET_TIMEOUT_JIFFIES	(2 * HZ)
+
+	/*
+	 * Reset the chip.  This is somewhat ugly because we have to
+	 * save off the PCI header before reset and then restore it
+	 * after the chip reboots.  We skip config space offsets 22
+	 * and 23 since those have a special meaning.
+	 */
+
+	/* Do we need to save off the full 4K PCI Express header?? */
+	hca_header = kmalloc(256, GFP_KERNEL);
+	if (!hca_header) {
+		err = -ENOMEM;
+		mlx4_err(dev, "Couldn't allocate memory to save HCA "
+			  "PCI header, aborting.\n");
+		goto out;
+	}
+
+	pcie_cap = pci_find_capability(dev->pdev, PCI_CAP_ID_EXP);
+
+	/* Save 64 dwords (256 bytes) of config space, minus dwords 22/23. */
+	for (i = 0; i < 64; ++i) {
+		if (i == 22 || i == 23)
+			continue;
+		if (pci_read_config_dword(dev->pdev, i * 4, hca_header + i)) {
+			err = -ENODEV;
+			mlx4_err(dev, "Couldn't save HCA "
+				  "PCI header, aborting.\n");
+			goto out;
+		}
+	}
+
+	reset = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_RESET_BASE,
+			MLX4_RESET_SIZE);
+	if (!reset) {
+		err = -ENOMEM;
+		mlx4_err(dev, "Couldn't map HCA reset register, aborting.\n");
+		goto out;
+	}
+
+	/* grab HW semaphore to lock out flash updates */
+	end = jiffies + MLX4_SEM_TIMEOUT_JIFFIES;
+	do {
+		sem = readl(reset + MLX4_SEM_OFFSET);
+		if (!sem)
+			break;
+
+		msleep(1);
+	} while (time_before(jiffies, end));
+
+	if (sem) {
+		mlx4_err(dev, "Failed to obtain HW semaphore, aborting\n");
+		err = -EAGAIN;
+		iounmap(reset);
+		goto out;
+	}
+
+	/* actually hit reset */
+	writel(MLX4_RESET_VALUE, reset + MLX4_RESET_OFFSET);
+	iounmap(reset);
+
+	/* Docs say to wait one second before accessing device */
+	msleep(1000);
+
+	/* Poll until the vendor ID reads back as something other than ~0. */
+	end = jiffies + MLX4_RESET_TIMEOUT_JIFFIES;
+	do {
+		if (!pci_read_config_word(dev->pdev, PCI_VENDOR_ID, &vendor) &&
+		    vendor != 0xffff)
+			break;
+
+		msleep(1);
+	} while (time_before(jiffies, end));
+
+	if (vendor == 0xffff) {
+		err = -ENODEV;
+		mlx4_err(dev, "PCI device did not come back after reset, "
+			  "aborting.\n");
+		goto out;
+	}
+
+	/* Now restore the PCI headers */
+	if (pcie_cap) {
+		devctl = hca_header[(pcie_cap + PCI_EXP_DEVCTL) / 4];
+		if (pci_write_config_word(dev->pdev, pcie_cap + PCI_EXP_DEVCTL,
+					   devctl)) {
+			err = -ENODEV;
+			mlx4_err(dev, "Couldn't restore HCA PCI Express "
+				 "Device Control register, aborting.\n");
+			goto out;
+		}
+		linkctl = hca_header[(pcie_cap + PCI_EXP_LNKCTL) / 4];
+		if (pci_write_config_word(dev->pdev, pcie_cap + PCI_EXP_LNKCTL,
+					   linkctl)) {
+			err = -ENODEV;
+			mlx4_err(dev, "Couldn't restore HCA PCI Express "
+				 "Link control register, aborting.\n");
+			goto out;
+		}
+	}
+
+	/* Restore the first 16 dwords; PCI_COMMAND is written last, below. */
+	for (i = 0; i < 16; ++i) {
+		if (i * 4 == PCI_COMMAND)
+			continue;
+
+		if (pci_write_config_dword(dev->pdev, i * 4, hca_header[i])) {
+			err = -ENODEV;
+			mlx4_err(dev, "Couldn't restore HCA reg %x, "
+				  "aborting.\n", i);
+			goto out;
+		}
+	}
+
+	if (pci_write_config_dword(dev->pdev, PCI_COMMAND,
+				   hca_header[PCI_COMMAND / 4])) {
+		err = -ENODEV;
+		mlx4_err(dev, "Couldn't restore HCA COMMAND, "
+			  "aborting.\n");
+		goto out;
+	}
+
+out:
+	kfree(hca_header);
+
+	return err;
+}
diff --git a/sys/ofed/drivers/net/mlx4/sense.c b/sys/ofed/drivers/net/mlx4/sense.c
new file mode 100644
index 0000000..0fcf025
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/sense.c
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/errno.h>
+#include <linux/if_ether.h>
+
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4.h"
+
+/*
+ * Ask the device which link type it senses on a port.
+ *
+ * Issues the SENSE_PORT command for port and, when the immediate result
+ * is in the range 0..2, stores it in *type.
+ *
+ * Returns 0 on success, the command's error if it fails, or -EINVAL when
+ * the device reports a value above 2.  *type is only written on success.
+ */
+static int mlx4_SENSE_PORT(struct mlx4_dev *dev, int port,
+			   enum mlx4_port_type *type)
+{
+	u64 out_param;
+	int err = 0;
+
+	err = mlx4_cmd_imm(dev, 0, &out_param, port, 0,
+			   MLX4_CMD_SENSE_PORT, MLX4_CMD_TIME_CLASS_B);
+	if (err) {
+		mlx4_err(dev, "Sense command failed for port: %d\n", port);
+		return err;
+	}
+
+	if (out_param > 2) {
+		/* Cast so the argument matches %llx whatever u64 maps to. */
+		mlx4_err(dev, "Sense returned illegal value: 0x%llx\n",
+			 (unsigned long long) out_param);
+		/* Negative errno, like every other error in this file. */
+		return -EINVAL;
+	}
+
+	*type = out_param;
+	return 0;
+}
+
+/*
+ * Work out the port type to use for every port.
+ *
+ * stype[] and defaults[] are indexed from 0 (port N lives at N - 1),
+ * while the per-port sense/capability arrays are indexed by port number.
+ * A port is sensed only if do_sense_port and sense_allowed are set for it
+ * and its possible type is MLX4_PORT_TYPE_AUTO; otherwise, or if sensing
+ * fails, its default is used.  A zero entry means "nothing sensed" and is
+ * filled in afterwards by the rules commented below.
+ */
+void mlx4_do_sense_ports(struct mlx4_dev *dev,
+			 enum mlx4_port_type *stype,
+			 enum mlx4_port_type *defaults)
+{
+	struct mlx4_sense *sense = &mlx4_priv(dev)->sense;
+	int port;
+	int idx;
+
+	for (port = 1; port <= dev->caps.num_ports; port++) {
+		idx = port - 1;
+		stype[idx] = 0;
+
+		if (!sense->do_sense_port[port] || !sense->sense_allowed[port] ||
+		    dev->caps.possible_type[port] != MLX4_PORT_TYPE_AUTO) {
+			stype[idx] = defaults[idx];
+			continue;
+		}
+
+		if (mlx4_SENSE_PORT(dev, port, &stype[idx]))
+			stype[idx] = defaults[idx];
+	}
+
+	/*
+	 * Adjust port configuration:
+	 * if the first port is Eth, every later port that sensed nothing
+	 * becomes Eth; if the last port is IB, every earlier port that
+	 * sensed nothing becomes IB.
+	 */
+	if (stype[0] == MLX4_PORT_TYPE_ETH) {
+		for (idx = 1; idx < dev->caps.num_ports; idx++) {
+			if (!stype[idx])
+				stype[idx] = MLX4_PORT_TYPE_ETH;
+		}
+	}
+	if (stype[dev->caps.num_ports - 1] == MLX4_PORT_TYPE_IB) {
+		for (idx = 0; idx < dev->caps.num_ports - 1; idx++) {
+			if (!stype[idx])
+				stype[idx] = MLX4_PORT_TYPE_IB;
+		}
+	}
+
+	/* Anything still unset keeps its current (default) configuration. */
+	for (idx = 0; idx < dev->caps.num_ports; idx++) {
+		if (!stype[idx])
+			stype[idx] = defaults[idx];
+	}
+}
+
+/*
+ * Delayed-work handler for periodic port sensing.
+ *
+ * Under priv->port_mutex it senses all ports (using the current port
+ * types as defaults) and, if mlx4_check_port_params() accepts the result,
+ * applies it with mlx4_change_port_types().  The work re-queues itself
+ * for as long as sense->resched is set.
+ */
+static void mlx4_sense_port(struct work_struct *work)
+{
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct mlx4_sense *sense = container_of(dwork, struct mlx4_sense,
+						sense_poll);
+	struct mlx4_dev *dev = sense->dev;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	enum mlx4_port_type sensed[MLX4_MAX_PORTS];
+
+	mutex_lock(&priv->port_mutex);
+	mlx4_do_sense_ports(dev, sensed, &dev->caps.port_type[1]);
+
+	/* Only attempt the change when the parameter check passes. */
+	if (!mlx4_check_port_params(dev, sensed) &&
+	    mlx4_change_port_types(dev, sensed))
+		mlx4_err(dev, "Failed to change port_types\n");
+
+	mutex_unlock(&priv->port_mutex);
+
+	if (!sense->resched)
+		return;
+
+	queue_delayed_work(sense->sense_wq , &sense->sense_poll,
+			   round_jiffies(MLX4_SENSE_RANGE));
+}
+
+/*
+ * Start periodic port sensing.  Does nothing unless the device has
+ * MLX4_DEV_CAP_FLAG_DPDP set; otherwise it enables rescheduling and
+ * queues the first sense poll.
+ */
+void mlx4_start_sense(struct mlx4_dev *dev)
+{
+	struct mlx4_sense *sense = &mlx4_priv(dev)->sense;
+
+	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP) {
+		sense->resched = 1;
+		queue_delayed_work(sense->sense_wq , &sense->sense_poll,
+				   round_jiffies(MLX4_SENSE_RANGE));
+	}
+}
+
+
+/*
+ * Stop the sense work from re-queueing itself by clearing the resched
+ * flag.  Work that is already queued is not cancelled here.
+ */
+void mlx4_stop_sense(struct mlx4_dev *dev)
+{
+	struct mlx4_sense *sense = &mlx4_priv(dev)->sense;
+
+	sense->resched = 0;
+}
+
+/*
+ * Prepare port sensing: create the "mlx4_sense" single-threaded
+ * workqueue, enable sensing on every port and initialise the delayed
+ * work item.  Nothing is queued here.
+ *
+ * Returns 0 on success or -ENOMEM if the workqueue cannot be created.
+ */
+int mlx4_sense_init(struct mlx4_dev *dev)
+{
+	struct mlx4_sense *sense = &mlx4_priv(dev)->sense;
+	int p;
+
+	sense->dev = dev;
+	sense->sense_wq = create_singlethread_workqueue("mlx4_sense");
+	if (sense->sense_wq == NULL)
+		return -ENOMEM;
+
+	/* The per-port array is indexed by port number, starting at 1. */
+	for (p = 1; p <= dev->caps.num_ports; p++)
+		sense->do_sense_port[p] = 1;
+
+	INIT_DELAYED_WORK_DEFERRABLE(&sense->sense_poll, mlx4_sense_port);
+
+	return 0;
+}
+
+/*
+ * Shut port sensing down: stop rescheduling, cancel the pending delayed
+ * work and destroy the workqueue created by mlx4_sense_init().
+ */
+void mlx4_sense_cleanup(struct mlx4_dev *dev)
+{
+	struct mlx4_sense *sense = &mlx4_priv(dev)->sense;
+
+	mlx4_stop_sense(dev);
+	cancel_delayed_work(&sense->sense_poll);
+	destroy_workqueue(sense->sense_wq);
+}
+
diff --git a/sys/ofed/drivers/net/mlx4/srq.c b/sys/ofed/drivers/net/mlx4/srq.c
new file mode 100644
index 0000000..f856b8d
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/srq.c
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+
+#include <linux/mlx4/cmd.h>
+#include <linux/mlx4/srq.h>
+
+#include "mlx4.h"
+#include "icm.h"
+
+/*
+ * SRQ context image exchanged with the device through a command mailbox:
+ * mlx4_srq_alloc() fills one in for SW2HW_SRQ and mlx4_srq_query() reads
+ * one back after QUERY_SRQ.  Fields typed __be* hold big-endian values.
+ * NOTE(review): field order and sizes presumably mirror the device's
+ * layout and must not be rearranged -- confirm against the hardware spec.
+ */
+struct mlx4_srq_context {
+	__be32			state_logsize_srqn;	/* (ilog2(max) << 24) | srqn */
+	u8			logstride;		/* wqe_shift - 4 */
+	u8			reserved1;
+	__be16			xrc_domain;		/* xrcd passed to mlx4_srq_alloc() */
+	__be32			pg_offset_cqn;		/* low 24 bits: cqn */
+	u32			reserved2;
+	u8			log_page_size;		/* mtt->page_shift - MLX4_ICM_PAGE_SHIFT */
+	u8			reserved3[2];
+	u8			mtt_base_addr_h;	/* MTT address, bits 32 and up */
+	__be32			mtt_base_addr_l;	/* MTT address, low 32 bits */
+	__be32			pd;			/* pdn passed to mlx4_srq_alloc() */
+	__be16			limit_watermark;	/* returned by mlx4_srq_query() */
+	__be16			wqe_cnt;
+	u16			reserved4;
+	__be16			wqe_counter;
+	u32			reserved5;
+	__be64			db_rec_addr;		/* db_rec passed to mlx4_srq_alloc() */
+};
+
+/*
+ * Deliver an asynchronous event to the SRQ identified by srqn.
+ *
+ * The SRQ is looked up in the radix tree (key: srqn masked with
+ * num_srqs - 1) under srq_table->lock and pinned with an extra reference
+ * while its event callback runs.  The reference is dropped afterwards and
+ * srq->free is completed if it was the last one.  An unknown srqn only
+ * produces a warning.
+ */
+void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_srq *target;
+
+	spin_lock(&priv->srq_table.lock);
+	target = radix_tree_lookup(&dev->srq_table_tree,
+				   srqn & (dev->caps.num_srqs - 1));
+	if (target != NULL)
+		atomic_inc(&target->refcount);
+	spin_unlock(&priv->srq_table.lock);
+
+	if (target == NULL) {
+		mlx4_warn(dev, "Async event for bogus SRQ %08x\n", srqn);
+		return;
+	}
+
+	target->event(target, event_type);
+
+	/* Release the reference taken under the lock above. */
+	if (atomic_dec_and_test(&target->refcount))
+		complete(&target->free);
+}
+
+/*
+ * Issue SW2HW_SRQ for srq_num, passing the mailbox's DMA address as the
+ * command input.  Returns the command's result.
+ */
+static int mlx4_SW2HW_SRQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			  int srq_num)
+{
+	int rc;
+
+	rc = mlx4_cmd(dev, mailbox->dma, srq_num, 0,
+		      MLX4_CMD_SW2HW_SRQ, MLX4_CMD_TIME_CLASS_A);
+	return rc;
+}
+
+/*
+ * Issue HW2SW_SRQ for srq_num.  With a mailbox, its DMA address is passed
+ * as the output parameter and the op modifier is 0; with mailbox == NULL
+ * the output parameter is 0 and the op modifier is 1.  Returns the
+ * command's result.
+ */
+static int mlx4_HW2SW_SRQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			  int srq_num)
+{
+	if (mailbox)
+		return mlx4_cmd_box(dev, 0, mailbox->dma, srq_num, 0,
+				    MLX4_CMD_HW2SW_SRQ,
+				    MLX4_CMD_TIME_CLASS_A);
+
+	return mlx4_cmd_box(dev, 0, 0, srq_num, 1, MLX4_CMD_HW2SW_SRQ,
+			    MLX4_CMD_TIME_CLASS_A);
+}
+
+/*
+ * Issue ARM_SRQ for srq_num with limit_watermark as the command input.
+ * Returns the command's result.
+ */
+static int mlx4_ARM_SRQ(struct mlx4_dev *dev, int srq_num, int limit_watermark)
+{
+	int rc;
+
+	rc = mlx4_cmd(dev, limit_watermark, srq_num, 0,
+		      MLX4_CMD_ARM_SRQ, MLX4_CMD_TIME_CLASS_B);
+	return rc;
+}
+
+/*
+ * Issue QUERY_SRQ for srq_num, with the mailbox's DMA address as the
+ * output parameter.  Returns the command's result.
+ */
+static int mlx4_QUERY_SRQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			  int srq_num)
+{
+	int rc;
+
+	rc = mlx4_cmd_box(dev, 0, mailbox->dma, srq_num, 0,
+			  MLX4_CMD_QUERY_SRQ, MLX4_CMD_TIME_CLASS_A);
+	return rc;
+}
+
+/*
+ * Allocate an SRQ number for srq and hand the SRQ to the device.
+ *
+ * Steps: take a number from the SRQ bitmap, take references on its
+ * entries in the SRQ and cmpt tables, insert srq into the radix tree,
+ * build an SRQ context in a command mailbox from the arguments and issue
+ * SW2HW_SRQ.  On success the SRQ's reference count is set to 1 and its
+ * completion initialised.
+ *
+ * Returns 0 on success, -ENOMEM if no SRQ number is free, or the error of
+ * the step that failed; on failure every earlier step is undone.
+ */
+int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, u32 cqn, u16 xrcd,
+		   struct mlx4_mtt *mtt, u64 db_rec, struct mlx4_srq *srq)
+{
+	struct mlx4_srq_table *tbl = &mlx4_priv(dev)->srq_table;
+	struct mlx4_cmd_mailbox *mbox;
+	struct mlx4_srq_context *ctx;
+	u64 mtt_base;
+	int rc;
+
+	srq->srqn = mlx4_bitmap_alloc(&tbl->bitmap);
+	if (srq->srqn == -1)
+		return -ENOMEM;
+
+	rc = mlx4_table_get(dev, &tbl->table, srq->srqn);
+	if (rc)
+		goto undo_bitmap;
+
+	rc = mlx4_table_get(dev, &tbl->cmpt_table, srq->srqn);
+	if (rc)
+		goto undo_table;
+
+	spin_lock_irq(&tbl->lock);
+	rc = radix_tree_insert(&dev->srq_table_tree, srq->srqn, srq);
+	spin_unlock_irq(&tbl->lock);
+	if (rc)
+		goto undo_cmpt;
+
+	mbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mbox)) {
+		rc = PTR_ERR(mbox);
+		goto undo_radix;
+	}
+
+	/* Build the SRQ context directly in the mailbox buffer. */
+	ctx = mbox->buf;
+	memset(ctx, 0, sizeof(*ctx));
+
+	ctx->state_logsize_srqn = cpu_to_be32((ilog2(srq->max) << 24) |
+					      srq->srqn);
+	ctx->logstride = srq->wqe_shift - 4;
+	ctx->xrc_domain = cpu_to_be16(xrcd);
+	ctx->pg_offset_cqn = cpu_to_be32(cqn & 0xffffff);
+	ctx->log_page_size = mtt->page_shift - MLX4_ICM_PAGE_SHIFT;
+
+	mtt_base = mlx4_mtt_addr(dev, mtt);
+	ctx->mtt_base_addr_h = mtt_base >> 32;
+	ctx->mtt_base_addr_l = cpu_to_be32(mtt_base & 0xffffffff);
+	ctx->pd = cpu_to_be32(pdn);
+	ctx->db_rec_addr = cpu_to_be64(db_rec);
+
+	rc = mlx4_SW2HW_SRQ(dev, mbox, srq->srqn);
+	mlx4_free_cmd_mailbox(dev, mbox);
+	if (rc)
+		goto undo_radix;
+
+	atomic_set(&srq->refcount, 1);
+	init_completion(&srq->free);
+
+	return 0;
+
+	/* Unwind in the reverse order of acquisition. */
+undo_radix:
+	spin_lock_irq(&tbl->lock);
+	radix_tree_delete(&dev->srq_table_tree, srq->srqn);
+	spin_unlock_irq(&tbl->lock);
+undo_cmpt:
+	mlx4_table_put(dev, &tbl->cmpt_table, srq->srqn);
+undo_table:
+	mlx4_table_put(dev, &tbl->table, srq->srqn);
+undo_bitmap:
+	mlx4_bitmap_free(&tbl->bitmap, srq->srqn);
+
+	return rc;
+}
+EXPORT_SYMBOL_GPL(mlx4_srq_alloc);
+
+/*
+ * Take the SRQ back from the device with HW2SW_SRQ (no mailbox).  A
+ * failure is only logged; nothing is reported to the caller.
+ */
+void mlx4_srq_invalidate(struct mlx4_dev *dev, struct mlx4_srq *srq)
+{
+	int rc = mlx4_HW2SW_SRQ(dev, NULL, srq->srqn);
+
+	if (rc != 0)
+		mlx4_warn(dev, "HW2SW_SRQ failed (%d) for SRQN %06x\n", rc, srq->srqn);
+}
+EXPORT_SYMBOL_GPL(mlx4_srq_invalidate);
+
+/*
+ * Remove srq from the device radix tree under srq_table->lock.  The
+ * table references and the SRQ number stay allocated.
+ */
+void mlx4_srq_remove(struct mlx4_dev *dev, struct mlx4_srq *srq)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	spin_lock_irq(&priv->srq_table.lock);
+	radix_tree_delete(&dev->srq_table_tree, srq->srqn);
+	spin_unlock_irq(&priv->srq_table.lock);
+}
+EXPORT_SYMBOL_GPL(mlx4_srq_remove);
+
+/*
+ * Release the resources held for srq.
+ *
+ * Drops the caller's reference, waits until srq->free has been completed
+ * (i.e. the last reference is gone), then releases what mlx4_srq_alloc()
+ * acquired, in reverse order: the cmpt table reference, the SRQ table
+ * reference and the SRQ number.  mlx4_srq_alloc() takes a reference in
+ * both tables, so both must be put here or the cmpt reference leaks.
+ */
+void mlx4_srq_free(struct mlx4_dev *dev, struct mlx4_srq *srq)
+{
+	struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table;
+
+	/* If ours was the last reference, signal the completion ourselves. */
+	if (atomic_dec_and_test(&srq->refcount))
+		complete(&srq->free);
+	wait_for_completion(&srq->free);
+
+	mlx4_table_put(dev, &srq_table->cmpt_table, srq->srqn);
+	mlx4_table_put(dev, &srq_table->table, srq->srqn);
+	mlx4_bitmap_free(&srq_table->bitmap, srq->srqn);
+}
+EXPORT_SYMBOL_GPL(mlx4_srq_free);
+
+/*
+ * Arm srq with the given limit watermark via the ARM_SRQ command.
+ * Returns the command's result.
+ */
+int mlx4_srq_arm(struct mlx4_dev *dev, struct mlx4_srq *srq, int limit_watermark)
+{
+	int rc = mlx4_ARM_SRQ(dev, srq->srqn, limit_watermark);
+
+	return rc;
+}
+EXPORT_SYMBOL_GPL(mlx4_srq_arm);
+
+/*
+ * Query the device for srq's current limit watermark.
+ *
+ * On success *limit_watermark receives the limit_watermark field of the
+ * returned SRQ context, converted to CPU byte order; on failure it is
+ * left unchanged.  Returns 0 or the error from mailbox allocation /
+ * QUERY_SRQ.
+ */
+int mlx4_srq_query(struct mlx4_dev *dev, struct mlx4_srq *srq, int *limit_watermark)
+{
+	struct mlx4_cmd_mailbox *mbox;
+	struct mlx4_srq_context *ctx;
+	int rc;
+
+	mbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mbox))
+		return PTR_ERR(mbox);
+
+	ctx = mbox->buf;
+
+	rc = mlx4_QUERY_SRQ(dev, mbox, srq->srqn);
+	if (rc == 0)
+		*limit_watermark = be16_to_cpu(ctx->limit_watermark);
+
+	mlx4_free_cmd_mailbox(dev, mbox);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(mlx4_srq_query);
+
+/*
+ * Initialise the SRQ table: its lock, the device's SRQ radix tree and the
+ * SRQ number bitmap (num_srqs entries, mask num_srqs - 1, with
+ * caps.reserved_srqs passed as the bottom reservation).
+ *
+ * Returns the result of mlx4_bitmap_init().
+ */
+int mlx4_init_srq_table(struct mlx4_dev *dev)
+{
+	struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table;
+
+	spin_lock_init(&srq_table->lock);
+	INIT_RADIX_TREE(&dev->srq_table_tree, GFP_ATOMIC);
+
+	return mlx4_bitmap_init(&srq_table->bitmap, dev->caps.num_srqs,
+				dev->caps.num_srqs - 1,
+				dev->caps.reserved_srqs, 0);
+}
+
+/* Release the SRQ number bitmap set up by mlx4_init_srq_table(). */
+void mlx4_cleanup_srq_table(struct mlx4_dev *dev)
+{
+	struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table;
+
+	mlx4_bitmap_cleanup(&srq_table->bitmap);
+}
diff --git a/sys/ofed/drivers/net/mlx4/xrcd.c b/sys/ofed/drivers/net/mlx4/xrcd.c
new file mode 100644
index 0000000..d1bfc11
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/xrcd.c
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+#include <linux/errno.h>
+
+#include "mlx4.h"
+
+int mlx4_xrcd_alloc(struct mlx4_dev *dev, u32 *xrcdn)
+{
+	u32 obj = mlx4_bitmap_alloc(&mlx4_priv(dev)->xrcd_bitmap);
+
+	/* The result is stored unconditionally, so *xrcdn is written on failure too. */
+	*xrcdn = obj;
+
+	/* An all-ones value (-1 as u32) from the bitmap is treated as failure. */
+	return obj == -1 ? -ENOMEM : 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_xrcd_alloc);
+/* Return xrcdn to the device's xrcd_bitmap allocator. */
+void mlx4_xrcd_free(struct mlx4_dev *dev, u32 xrcdn)
+{
+	mlx4_bitmap_free(&mlx4_priv(dev)->xrcd_bitmap, xrcdn);
+}
+EXPORT_SYMBOL_GPL(mlx4_xrcd_free);
+
+int __devinit mlx4_init_xrcd_table(struct mlx4_dev *dev)
+{
+	const int num_xrcdn = 1 << 16;	/* bitmap size; the mask passed is size - 1 */
+
+	return mlx4_bitmap_init(&mlx4_priv(dev)->xrcd_bitmap, num_xrcdn,
+				num_xrcdn - 1, dev->caps.reserved_xrcds + 1, 0);
+}
+/* Tear down the xrcd_bitmap that mlx4_init_xrcd_table() initialised. */
+void mlx4_cleanup_xrcd_table(struct mlx4_dev *dev)
+{
+	mlx4_bitmap_cleanup(&mlx4_priv(dev)->xrcd_bitmap);
+}
+
+
diff --git a/sys/ofed/include/asm/atomic-long.h b/sys/ofed/include/asm/atomic-long.h
new file mode 100644
index 0000000..5075ad8
--- /dev/null
+++ b/sys/ofed/include/asm/atomic-long.h
@@ -0,0 +1,79 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef	_ATOMIC_LONG_H_
+#define	_ATOMIC_LONG_H_
+
+#include <sys/cdefs.h>
+#include <sys/types.h>
+#include <machine/atomic.h>
+
+typedef struct {
+	volatile u_long counter;	/* accessed via the atomic_*_long() calls below */
+} atomic_long_t;
+/* Both wrappers expand to atomic_long_add_return(), so they also yield the new value. */
+#define	atomic_long_add(i, v)		atomic_long_add_return((i), (v))
+#define	atomic_long_inc_return(v)	atomic_long_add_return(1, (v))
+/* Atomically add i to v->counter and return the resulting value (old value + i). */
+static inline long
+atomic_long_add_return(long i, atomic_long_t *v)
+{
+	return i + atomic_fetchadd_long(&v->counter, i);
+}
+/* Store i into v->counter through atomic_store_rel_long(). */
+static inline void
+atomic_long_set(atomic_long_t *v, long i)
+{
+	atomic_store_rel_long(&v->counter, i);
+}
+/* Return v->counter as loaded by atomic_load_acq_long(). */
+static inline long
+atomic_long_read(atomic_long_t *v)
+{
+	return atomic_load_acq_long(&v->counter);
+}
+/* Add one to v->counter and return the incremented value. */
+static inline long
+atomic_long_inc(atomic_long_t *v)
+{
+	return atomic_long_add_return(1, v);
+}
+/* Subtract one from v->counter and return the decremented value. */
+static inline long
+atomic_long_dec(atomic_long_t *v)
+{
+	return atomic_long_add_return(-1, v);
+}
+
+static inline long
+atomic_long_dec_and_test(atomic_long_t *v)
+{
+	/* Yields 1 only when this decrement brings the counter to zero. */
+	return atomic_long_add_return(-1, v) == 0;
+}
+
+#endif	/* _ATOMIC_LONG_H_ */
diff --git a/sys/ofed/include/asm/atomic.h b/sys/ofed/include/asm/atomic.h
new file mode 100644
index 0000000..5c5caa0
--- /dev/null
+++ b/sys/ofed/include/asm/atomic.h
@@ -0,0 +1,85 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_ASM_ATOMIC_H_
+#define	_ASM_ATOMIC_H_
+
+#include <sys/cdefs.h>
+#include <sys/types.h>
+#include <machine/atomic.h>
+#include <asm/atomic-long.h>
+
+typedef struct {
+	volatile u_int counter;	/* accessed via the atomic_*_int() calls below */
+} atomic_t;
+/* All wrappers expand to atomic_add_return()/atomic_sub_return() and use the new value. */
+#define	atomic_add(i, v)		atomic_add_return((i), (v))
+#define	atomic_sub(i, v)		atomic_sub_return((i), (v))
+#define	atomic_inc_return(v)		atomic_add_return(1, (v))
+#define	atomic_add_negative(i, v)	(atomic_add_return((i), (v)) < 0)
+#define	atomic_sub_and_test(i, v)	(atomic_sub_return((i), (v)) == 0)
+#define	atomic_dec_and_test(v)		(atomic_sub_return(1, (v)) == 0)
+#define	atomic_inc_and_test(v)		(atomic_add_return(1, (v)) == 0)
+/* Atomically add i to v->counter and return the resulting value (old value + i). */
+static inline int
+atomic_add_return(int i, atomic_t *v)
+{
+	return i + atomic_fetchadd_int(&v->counter, i);
+}
+/* Atomically subtract i from v->counter (by adding -i) and return the new value. */
+static inline int
+atomic_sub_return(int i, atomic_t *v)
+{
+	return atomic_fetchadd_int(&v->counter, -i) - i;
+}
+/* Store i into v->counter through atomic_store_rel_int(). */
+static inline void
+atomic_set(atomic_t *v, int i)
+{
+	atomic_store_rel_int(&v->counter, i);
+}
+/* Return v->counter as loaded by atomic_load_acq_int(). */
+static inline int
+atomic_read(atomic_t *v)
+{
+	return atomic_load_acq_int(&v->counter);
+}
+/* Add one to v->counter and return the incremented value. */
+static inline int
+atomic_inc(atomic_t *v)
+{
+	return atomic_add_return(1, v);
+}
+/* Subtract one from v->counter and return the decremented value. */
+static inline int
+atomic_dec(atomic_t *v)
+{
+	return atomic_sub_return(1, v);
+}
+
+#endif	/* _ASM_ATOMIC_H_ */
diff --git a/sys/ofed/include/asm/byteorder.h b/sys/ofed/include/asm/byteorder.h
new file mode 100644
index 0000000..341c548
--- /dev/null
+++ b/sys/ofed/include/asm/byteorder.h
@@ -0,0 +1,90 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef	_ASM_BYTEORDER_H_
+#define	_ASM_BYTEORDER_H_
+
+#include <sys/types.h>
+#include <sys/endian.h>
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+#define	__LITTLE_ENDIAN
+#else
+#define	__BIG_ENDIAN
+#endif	/* exactly one of the two is defined, and it has no value */
+/* Linux conversion names, aliased to the htoleNN()/beNNtoh()-style functions. */
+#define	cpu_to_le64	htole64
+#define	le64_to_cpu	le64toh
+#define	cpu_to_le32	htole32
+#define	le32_to_cpu	le32toh
+#define	cpu_to_le16	htole16
+#define	le16_to_cpu	le16toh
+#define	cpu_to_be64	htobe64
+#define	be64_to_cpu	be64toh
+#define	cpu_to_be32	htobe32
+#define	be32_to_cpu	be32toh
+#define	cpu_to_be16	htobe16
+#define	be16_to_cpu	be16toh
+#define	__be16_to_cpu	be16toh
+/* Pointer variants: x is cast as a whole, then the pointed-to value is converted. */
+#define	cpu_to_le64p(x)	htole64(*((uint64_t *)(x)))
+#define	le64_to_cpup(x)	le64toh(*((uint64_t *)(x)))
+#define	cpu_to_le32p(x)	htole32(*((uint32_t *)(x)))
+#define	le32_to_cpup(x)	le32toh(*((uint32_t *)(x)))
+#define	cpu_to_le16p(x)	htole16(*((uint16_t *)(x)))
+#define	le16_to_cpup(x)	le16toh(*((uint16_t *)(x)))
+#define	cpu_to_be64p(x)	htobe64(*((uint64_t *)(x)))
+#define	be64_to_cpup(x)	be64toh(*((uint64_t *)(x)))
+#define	cpu_to_be32p(x)	htobe32(*((uint32_t *)(x)))
+#define	be32_to_cpup(x)	be32toh(*((uint32_t *)(x)))
+#define	cpu_to_be16p(x)	htobe16(*((uint16_t *)(x)))
+#define	be16_to_cpup(x)	be16toh(*((uint16_t *)(x)))
+/* In-place variants: convert the value x points to and store it back (x expands twice). */
+#define	cpu_to_le64s(x)	do { *((uint64_t *)(x)) = cpu_to_le64p((x)); } while (0)
+#define	le64_to_cpus(x)	do { *((uint64_t *)(x)) = le64_to_cpup((x)); } while (0)
+#define	cpu_to_le32s(x)	do { *((uint32_t *)(x)) = cpu_to_le32p((x)); } while (0)
+#define	le32_to_cpus(x)	do { *((uint32_t *)(x)) = le32_to_cpup((x)); } while (0)
+#define	cpu_to_le16s(x)	do { *((uint16_t *)(x)) = cpu_to_le16p((x)); } while (0)
+#define	le16_to_cpus(x)	do { *((uint16_t *)(x)) = le16_to_cpup((x)); } while (0)
+#define	cpu_to_be64s(x)	do { *((uint64_t *)(x)) = cpu_to_be64p((x)); } while (0)
+#define	be64_to_cpus(x)	do { *((uint64_t *)(x)) = be64_to_cpup((x)); } while (0)
+#define	cpu_to_be32s(x)	do { *((uint32_t *)(x)) = cpu_to_be32p((x)); } while (0)
+#define	be32_to_cpus(x)	do { *((uint32_t *)(x)) = be32_to_cpup((x)); } while (0)
+#define	cpu_to_be16s(x)	do { *((uint16_t *)(x)) = cpu_to_be16p((x)); } while (0)
+#define	be16_to_cpus(x)	do { *((uint16_t *)(x)) = be16_to_cpup((x)); } while (0)
+/* Linux swabNN names, aliased to the bswapNN ones. */
+#define	swab16	bswap16
+#define	swab32	bswap32
+#define	swab64	bswap64
+/* Add the CPU-order val to the big-endian 16-bit value at *var, storing it back big-endian. */
+static inline void
+be16_add_cpu(u16 *var, u16 val)
+{ 
+	*var = cpu_to_be16(be16_to_cpu(*var) + val);
+}
+
+#endif	/* _ASM_BYTEORDER_H_ */
diff --git a/sys/ofed/include/asm/current.h b/sys/ofed/include/asm/current.h
new file mode 100644
index 0000000..33bd120
--- /dev/null
+++ b/sys/ofed/include/asm/current.h
@@ -0,0 +1,32 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_ASM_CURRENT_H_
+#define	_ASM_CURRENT_H_
+
+#endif	/* _ASM_CURRENT_H_ */
diff --git a/sys/ofed/include/asm/fcntl.h b/sys/ofed/include/asm/fcntl.h
new file mode 100644
index 0000000..a650f5b
--- /dev/null
+++ b/sys/ofed/include/asm/fcntl.h
@@ -0,0 +1,33 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _ASM_FCNTL_H_
+#define	_ASM_FCNTL_H_
+
+#include <sys/fcntl.h>
+
+#endif	/* _ASM_FCNTL_H_ */
diff --git a/sys/ofed/include/asm/io.h b/sys/ofed/include/asm/io.h
new file mode 100644
index 0000000..7a742d9
--- /dev/null
+++ b/sys/ofed/include/asm/io.h
@@ -0,0 +1,29 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/io.h>
diff --git a/sys/ofed/include/asm/page.h b/sys/ofed/include/asm/page.h
new file mode 100644
index 0000000..da42df7
--- /dev/null
+++ b/sys/ofed/include/asm/page.h
@@ -0,0 +1,29 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/page.h>
diff --git a/sys/ofed/include/asm/pgtable.h b/sys/ofed/include/asm/pgtable.h
new file mode 100644
index 0000000..087f525
--- /dev/null
+++ b/sys/ofed/include/asm/pgtable.h
@@ -0,0 +1,33 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _ASM_PGTABLE_H_
+#define	_ASM_PGTABLE_H_
+
+typedef int	pgprot_t;
+
+#endif	/* _ASM_PGTABLE_H_ */
diff --git a/sys/ofed/include/asm/semaphore.h b/sys/ofed/include/asm/semaphore.h
new file mode 100644
index 0000000..a60ba8c
--- /dev/null
+++ b/sys/ofed/include/asm/semaphore.h
@@ -0,0 +1,34 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_ASM_SEMAPHORE_H_
+#define	_ASM_SEMAPHORE_H_
+
+#include <linux/semaphore.h>
+
+#endif	/* _ASM_SEMAPHORE_H_ */
diff --git a/sys/ofed/include/asm/system.h b/sys/ofed/include/asm/system.h
new file mode 100644
index 0000000..e5d814e
--- /dev/null
+++ b/sys/ofed/include/asm/system.h
@@ -0,0 +1,27 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
diff --git a/sys/ofed/include/asm/types.h b/sys/ofed/include/asm/types.h
new file mode 100644
index 0000000..70dd2be
--- /dev/null
+++ b/sys/ofed/include/asm/types.h
@@ -0,0 +1,67 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef	_ASM_TYPES_H_
+#define	_ASM_TYPES_H_
+
+typedef unsigned short umode_t;
+/* Double-underscore Linux type names, defined outside the _KERNEL guard. */
+typedef __signed__ char __s8;
+typedef unsigned char __u8;
+
+typedef __signed__ short __s16;
+typedef unsigned short __u16;
+
+typedef __signed__ int __s32;
+typedef unsigned int __u32;
+
+#if defined(__GNUC__) // && !defined(__STRICT_ANSI__)
+typedef __signed__ long long __s64;
+typedef unsigned long long __u64;
+#endif
+
+#ifdef _KERNEL
+/* Short spellings, only defined when _KERNEL is set. */
+typedef signed char s8;
+typedef unsigned char u8;
+
+typedef signed short s16;
+typedef unsigned short u16;
+
+typedef signed int s32;
+typedef unsigned int u32;
+
+typedef signed long long s64;
+typedef unsigned long long u64;
+
+/* DMA addresses come in generic and 64-bit flavours; both are vm_paddr_t here. */
+typedef vm_paddr_t dma_addr_t;
+typedef vm_paddr_t dma64_addr_t;
+
+#endif	/* _KERNEL */
+
+#endif	/* _ASM_TYPES_H_ */
diff --git a/sys/ofed/include/asm/uaccess.h b/sys/ofed/include/asm/uaccess.h
new file mode 100644
index 0000000..b7c32fa
--- /dev/null
+++ b/sys/ofed/include/asm/uaccess.h
@@ -0,0 +1,49 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _ASM_UACCESS_H_
+#define _ASM_UACCESS_H_
+
+#include <linux/uaccess.h>
+
+static inline long
+copy_to_user(void *to, const void *from, unsigned long n)
+{
+	/* Any copyout() failure is reported as the full count n, success as 0. */
+	const int failed = copyout(from, to, n) != 0;
+	return failed ? n : 0;
+}
+
+static inline long
+copy_from_user(void *to, const void *from, unsigned long n)
+{
+	/* Any copyin() failure is reported as the full count n, success as 0. */
+	const int failed = copyin(from, to, n) != 0;
+	return failed ? n : 0;
+}
+
+#endif	/* _ASM_UACCESS_H_ */
diff --git a/sys/ofed/include/linux/bitmap.h b/sys/ofed/include/linux/bitmap.h
new file mode 100644
index 0000000..66059ac
--- /dev/null
+++ b/sys/ofed/include/linux/bitmap.h
@@ -0,0 +1,34 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef	_LINUX_BITMAP_H_
+#define	_LINUX_BITMAP_H_
+
+#include <linux/bitops.h>
+#include <linux/string.h>
+
+#endif	/* _LINUX_BITMAP_H_ */
diff --git a/sys/ofed/include/linux/bitops.h b/sys/ofed/include/linux/bitops.h
new file mode 100644
index 0000000..4305a3a
--- /dev/null
+++ b/sys/ofed/include/linux/bitops.h
@@ -0,0 +1,312 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef	_LINUX_BITOPS_H_
+#define	_LINUX_BITOPS_H_
+
+#ifdef __LP64__
+#define	BITS_PER_LONG		64	/* width of long when __LP64__ is defined */
+#else
+#define	BITS_PER_LONG		32	/* width of long otherwise */
+#endif
+#define	BIT_MASK(n)		(~0UL >> (BITS_PER_LONG - (n)))	/* low n bits set; n must be 1..BITS_PER_LONG (n == 0 shifts by the full width) */
+#define	BITS_TO_LONGS(n)	howmany((n), BITS_PER_LONG)	/* number of longs needed to hold n bits */
+
+static inline int	/* 0-based index of the lowest set bit of an int */
+__ffs(int mask)
+{
+	return (ffs(mask) - 1);	/* ffs() counts from 1 and yields 0 for mask == 0, so this is -1 then */
+}
+
+static inline int	/* 0-based index of the highest set bit of an int */
+__fls(int mask)
+{
+	return (fls(mask) - 1);	/* fls() counts from 1 and yields 0 for mask == 0, so this is -1 then */
+}
+
+static inline int	/* 0-based index of the lowest set bit of a long */
+__ffsl(long mask)
+{
+	return (ffsl(mask) - 1);	/* ffsl() counts from 1; result is -1 for mask == 0 */
+}
+
+static inline int	/* 0-based index of the highest set bit of a long */
+__flsl(long mask)
+{
+	return (flsl(mask) - 1);	/* flsl() counts from 1; result is -1 for mask == 0 */
+}
+
+
+#define	ffz(mask)	__ffs(~(mask))	/* index of the lowest clear bit; goes through __ffs(), which takes an int */
+
+static inline unsigned long	/* index of the first set bit in addr[0..size), or size if none */
+find_first_bit(unsigned long *addr, unsigned long size)
+{
+	long mask;
+	int bit;
+
+	for (bit = 0; size >= BITS_PER_LONG;	/* scan whole words first */
+	    size -= BITS_PER_LONG, bit += BITS_PER_LONG, addr++) {
+		if (*addr == 0)
+			continue;
+		return (bit + __ffsl(*addr));
+	}
+	if (size) {	/* partial last word: only its low "size" bits count */
+		mask = (*addr) & BIT_MASK(size);
+		if (mask)
+			bit += __ffsl(mask);
+		else
+			bit += size;	/* nothing found: bit now equals the original size */
+	}
+	return (bit);
+}
+
+static inline unsigned long	/* index of the first clear bit in addr[0..size), or size if none */
+find_first_zero_bit(unsigned long *addr, unsigned long size)
+{
+	long mask;
+	int bit;
+
+	for (bit = 0; size >= BITS_PER_LONG;	/* scan whole words first */
+	    size -= BITS_PER_LONG, bit += BITS_PER_LONG, addr++) {
+		if (~(*addr) == 0)	/* word is all ones: no clear bit here */
+			continue;
+		return (bit + __ffsl(~(*addr)));
+	}
+	if (size) {	/* partial last word: only its low "size" bits count */
+		mask = ~(*addr) & BIT_MASK(size);
+		if (mask)
+			bit += __ffsl(mask);
+		else
+			bit += size;	/* nothing found: bit now equals the original size */
+	}
+	return (bit);
+}
+
+static inline unsigned long	/* index of the last set bit in addr[0..size), or size if none */
+find_last_bit(unsigned long *addr, unsigned long size)
+{
+	long mask;
+	int offs;
+	int bit;
+	int pos;
+
+	pos = size / BITS_PER_LONG;
+	offs = size % BITS_PER_LONG;
+	bit = BITS_PER_LONG * pos;
+	addr += pos;
+	if (offs) {	/* partial top word: only its low offs bits count */
+		mask = (*addr) & BIT_MASK(offs);
+		if (mask)
+			return (bit + __flsl(mask));
+	}
+	while (pos--) {	/* walk down through every full word, word 0 included */
+		addr--;
+		bit -= BITS_PER_LONG;
+		if (*addr)	/* use the word just tested, not the stale partial-word mask */
+			return (bit + __flsl(*addr));
+	}
+	return (size);
+}
+
+static inline unsigned long	/* first set bit at index >= offset, or size if none */
+find_next_bit(unsigned long *addr, unsigned long size, unsigned long offset)
+{
+	long mask;
+	int offs;
+	int bit;
+	int pos;
+
+	if (offset >= size)
+		return (size);
+	pos = offset / BITS_PER_LONG;
+	offs = offset % BITS_PER_LONG;
+	bit = BITS_PER_LONG * pos;
+	addr += pos;
+	if (offs) {	/* offset is mid-word: ignore the bits below it */
+		mask = (*addr) & ~BIT_MASK(offs);
+		bit += (mask ? __ffsl(mask) : BITS_PER_LONG);
+		if (mask || (unsigned long)bit >= size)	/* hit, or the bitmap ends in this word */
+			return ((unsigned long)bit < size ? bit : size);
+		addr++;	/* word had no hit and more words follow */
+	}
+	for (size -= bit; size >= BITS_PER_LONG;	/* bit < size here, so no wrap-around */
+	    size -= BITS_PER_LONG, bit += BITS_PER_LONG, addr++) {
+		if (*addr == 0)
+			continue;
+		return (bit + __ffsl(*addr));
+	}
+	if (size) {	/* partial last word: only its low "size" bits count */
+		mask = (*addr) & BIT_MASK(size);
+		if (mask)
+			bit += __ffsl(mask);
+		else
+			bit += size;	/* nothing found: bit now equals the original size */
+	}
+	return (bit);
+}
+
+static inline unsigned long	/* first clear bit at index >= offset, or size if none */
+find_next_zero_bit(unsigned long *addr, unsigned long size,
+    unsigned long offset)
+{
+	long mask;
+	int offs;
+	int bit;
+	int pos;
+
+	if (offset >= size)
+		return (size);
+	pos = offset / BITS_PER_LONG;
+	offs = offset % BITS_PER_LONG;
+	bit = BITS_PER_LONG * pos;
+	addr += pos;
+	if (offs) {	/* offset is mid-word: ignore the bits below it */
+		mask = ~(*addr) & ~BIT_MASK(offs);
+		bit += (mask ? __ffsl(mask) : BITS_PER_LONG);
+		if (mask || (unsigned long)bit >= size)	/* hit, or the bitmap ends in this word */
+			return ((unsigned long)bit < size ? bit : size);
+		addr++;	/* word had no hit and more words follow */
+	}
+	for (size -= bit; size >= BITS_PER_LONG;	/* bit < size here, so no wrap-around */
+	    size -= BITS_PER_LONG, bit += BITS_PER_LONG, addr++) {
+		if (~(*addr) == 0)	/* word is all ones: no clear bit here */
+			continue;
+		return (bit + __ffsl(~(*addr)));
+	}
+	if (size) {	/* partial last word: only its low "size" bits count */
+		mask = ~(*addr) & BIT_MASK(size);
+		if (mask)
+			bit += __ffsl(mask);
+		else
+			bit += size;	/* nothing found: bit now equals the original size */
+	}
+	return (bit);
+}
+
+static inline void	/* clear all bits of a size-bit bitmap */
+bitmap_zero(unsigned long *addr, int size)
+{
+	int nbytes = BITS_TO_LONGS(size) * sizeof(long);
+
+	/* Clear every long that backs the bitmap, the last partial one too. */
+	memset(addr, 0, nbytes);
+}
+
+static inline void	/* set the low size bits of the bitmap */
+bitmap_fill(unsigned long *addr, int size)
+{
+	int words = size / BITS_PER_LONG;
+	int rem = size & (BITS_PER_LONG - 1);
+
+	/* Whole words become all-ones. */
+	memset(addr, 0xff, words * sizeof(long));
+	/* A partial last word gets only its low rem bits; the others become 0. */
+	if (rem != 0)
+		addr[words] = BIT_MASK(rem);
+}
+
+static inline int	/* 1 if all of the first size bits are set, else 0 */
+bitmap_full(unsigned long *addr, int size)
+{
+	long mask;
+	int tail;
+	int len;
+	int i;
+
+	len = size / BITS_PER_LONG;	/* number of whole words */
+	for (i = 0; i < len; i++)
+		if (addr[i] != ~0UL)
+			return (0);
+	tail = size & (BITS_PER_LONG - 1);	/* bits left over in a partial last word */
+	if (tail) {
+		mask = BIT_MASK(tail);
+		if ((addr[i] & mask) != mask)	/* i == len here: the partial word */
+			return (0);
+	}
+	return (1);
+}
+
+static inline int	/* 1 if none of the first size bits is set, else 0 */
+bitmap_empty(unsigned long *addr, int size)
+{
+	long mask;
+	int tail;
+	int len;
+	int i;
+
+	len = size / BITS_PER_LONG;	/* number of whole words */
+	for (i = 0; i < len; i++)
+		if (addr[i] != 0)
+			return (0);
+	tail = size & (BITS_PER_LONG - 1);	/* bits left over in a partial last word */
+	if (tail) {
+		mask = BIT_MASK(tail);
+		if ((addr[i] & mask) != 0)	/* i == len here: the partial word */
+			return (0);
+	}
+	return (1);
+}
+
+#define	NBINT	(NBBY * sizeof(int))	/* bits per int; the macros below address the bitmap as an array of ints */
+
+#define	set_bit(i, a)							\
+    atomic_set_int(&((volatile int *)(a))[(i)/NBINT], 1U << (i) % NBINT)	/* atomically set bit i; 1U keeps the shift defined for bit 31 */
+
+#define	clear_bit(i, a)							\
+    atomic_clear_int(&((volatile int *)(a))[(i)/NBINT], 1U << (i) % NBINT)	/* atomically clear bit i */
+
+#define	test_bit(i, a)							\
+    !!(atomic_load_acq_int(&((volatile int *)(a))[(i)/NBINT]) & 1U << ((i) % NBINT))	/* 1 if bit i is set, else 0 */
+
+static inline long	/* atomically clear a bit in *var; returns its previous value (0 or 1) */
+test_and_clear_bit(long bit, long *var)
+{
+	long val;
+
+	bit = 1UL << bit;	/* long-wide mask (was an int shift); bit must be < BITS_PER_LONG */
+	do {	/* retry until the compare-and-set succeeds on an unchanged word */
+		val = *(volatile long *)var;
+	} while (atomic_cmpset_long(var, val, val & ~bit) == 0);
+
+	return !!(val & bit);
+}
+
+static inline long	/* atomically set a bit in *var; returns its previous value (0 or 1) */
+test_and_set_bit(long bit, long *var)
+{
+	long val;
+
+	bit = 1UL << bit;	/* long-wide mask (was an int shift); bit must be < BITS_PER_LONG */
+	do {	/* retry until the compare-and-set succeeds on an unchanged word */
+		val = *(volatile long *)var;
+	} while (atomic_cmpset_long(var, val, val | bit) == 0);
+
+	return !!(val & bit);
+}
+
+#endif	/* _LINUX_BITOPS_H_ */
diff --git a/sys/ofed/include/linux/cdev.h b/sys/ofed/include/linux/cdev.h
new file mode 100644
index 0000000..cc77495
--- /dev/null
+++ b/sys/ofed/include/linux/cdev.h
@@ -0,0 +1,129 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_LINUX_CDEV_H_
+#define	_LINUX_CDEV_H_
+
+#include <linux/kobject.h>
+#include <linux/kdev_t.h>
+#include <linux/list.h>
+
+struct file_operations;
+struct inode;
+struct module;
+
+extern struct cdevsw linuxcdevsw;
+
+struct linux_cdev {	/* Linux-style character device wrapping a FreeBSD cdev */
+	struct kobject	kobj;	/* lifetime: the ktype release callbacks tear the device down */
+	struct module	*owner;
+	struct cdev	*cdev;	/* FreeBSD device node, created by cdev_add() */
+	dev_t		dev;	/* device number recorded by cdev_add() */
+	const struct file_operations *ops;	/* set by cdev_init() */
+};
+
+static inline void	/* kobject release for cdev_alloc()ed devices: destroy the node and free */
+cdev_release(struct kobject *kobj)
+{
+	struct linux_cdev *cdev;
+
+	cdev = container_of(kobj, struct linux_cdev, kobj);
+	if (cdev->cdev)	/* node may already be gone, see cdev_del() */
+		destroy_dev(cdev->cdev);
+	kfree(cdev);
+}
+
+static inline void	/* kobject release for cdev_init()ed devices: destroy the node, do not free */
+cdev_static_release(struct kobject *kobj)
+{
+	struct linux_cdev *cdev;
+
+	cdev = container_of(kobj, struct linux_cdev, kobj);
+	if (cdev->cdev)	/* node may already be gone, see cdev_del() */
+		destroy_dev(cdev->cdev);
+}
+
+static struct kobj_type cdev_ktype = {	/* used by cdev_alloc(): release also frees the object */
+	.release = cdev_release,
+};
+
+static struct kobj_type cdev_static_ktype = {	/* used by cdev_init(): storage belongs to the caller */
+	.release = cdev_static_release,
+};
+
+static inline void	/* prepare a caller-provided linux_cdev and attach its file operations */
+cdev_init(struct linux_cdev *cdev, const struct file_operations *ops)
+{
+
+	kobject_init(&cdev->kobj, &cdev_static_ktype);	/* static ktype: release will not kfree() */
+	cdev->ops = ops;
+}
+
+static inline struct linux_cdev *	/* allocate a zeroed linux_cdev that is freed on last kobject_put() */
+cdev_alloc(void)
+{
+	struct linux_cdev *cdev;
+
+	cdev = kzalloc(sizeof(struct linux_cdev), M_WAITOK);
+	if (cdev)
+		kobject_init(&cdev->kobj, &cdev_ktype);	/* cdev_ktype's release kfree()s the object */
+	return (cdev);
+}
+
+static inline void	/* drop one reference on the embedded kobject */
+cdev_put(struct linux_cdev *p)
+{
+	kobject_put(&p->kobj);
+}
+
+static inline int	/* create the FreeBSD device node for cdev; always returns 0 */
+cdev_add(struct linux_cdev *cdev, dev_t dev, unsigned count)
+{
+	if (count != 1)	/* only a single minor per cdev is implemented */
+		panic("cdev_add: Unsupported count: %u", count);
+	cdev->cdev = make_dev(&linuxcdevsw, MINOR(dev), 0, 0, 0700,
+	    "%s", kobject_name(&cdev->kobj));	/* the name is data, never a format string */
+	cdev->dev = dev;
+	cdev->cdev->si_drv1 = cdev;	/* back-pointer from the FreeBSD cdev to this object */
+
+	return (0);
+}
+
+static inline void	/* destroy the device node and drop the caller's kobject reference */
+cdev_del(struct linux_cdev *cdev)
+{
+	if (cdev->cdev) {
+		destroy_dev(cdev->cdev);
+		cdev->cdev = NULL;	/* so the release callback does not destroy it again */
+	}
+	kobject_put(&cdev->kobj);
+}
+
+#define	cdev	linux_cdev	/* from here on "struct cdev" names the Linux wrapper, not the FreeBSD struct */
+
+#endif	/* _LINUX_CDEV_H_ */
diff --git a/sys/ofed/include/linux/compat.h b/sys/ofed/include/linux/compat.h
new file mode 100644
index 0000000..cfb1671
--- /dev/null
+++ b/sys/ofed/include/linux/compat.h
@@ -0,0 +1,33 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_LINUX_COMPAT_H_
+#define	_LINUX_COMPAT_H_
+
+
+#endif	/* _LINUX_COMPAT_H_ */
diff --git a/sys/ofed/include/linux/compiler.h b/sys/ofed/include/linux/compiler.h
new file mode 100644
index 0000000..12938ba
--- /dev/null
+++ b/sys/ofed/include/linux/compiler.h
@@ -0,0 +1,65 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_LINUX_COMPILER_H_
+#define	_LINUX_COMPILER_H_
+
+#include <sys/cdefs.h>
+
+#define __user	/* the annotations below expand to nothing or to a constant */
+#define __kernel
+#define __safe
+#define __force
+#define __nocast
+#define __iomem
+#define __chk_user_ptr(x)		0
+#define __chk_io_ptr(x)			0
+#define __builtin_warning(x, y...)	(1)
+#define __acquires(x)
+#define __releases(x)
+#define __acquire(x)			0
+#define __release(x)			0
+#define __cond_lock(x,c)		(c)	/* keeps only the condition expression */
+#define	__bitwise
+#define __devinitdata
+#define __init
+#define	__devinit
+#define	__devexit
+#define __exit
+#define	__stringify(x)			#x	/* stringifies x as written; x is not macro-expanded first */
+#define	__attribute_const__		__attribute__((__const__))
+#undef __always_inline	/* drop any earlier definition before redefining */
+#define	__always_inline			inline
+
+#define	likely(x)			__builtin_expect(!!(x), 1)	/* branch hint: x is normally true */
+#define	unlikely(x)			__builtin_expect(!!(x), 0)	/* branch hint: x is normally false */
+#define typeof(x)			__typeof(x)
+
+#define	uninitialized_var(x)		x = x	/* self-initialisation, used in a declaration */
+
+#endif	/* _LINUX_COMPILER_H_ */
diff --git a/sys/ofed/include/linux/completion.h b/sys/ofed/include/linux/completion.h
new file mode 100644
index 0000000..59f36b0
--- /dev/null
+++ b/sys/ofed/include/linux/completion.h
@@ -0,0 +1,155 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef	_LINUX_COMPLETION_H_
+#define	_LINUX_COMPLETION_H_
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sleepqueue.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+
+struct completion {	/* counting completion; its address is also the sleep channel */
+	unsigned int done;	/* posted-but-unconsumed completions; accessed under sleepq_lock(c) */
+};
+
+#define	INIT_COMPLETION(c)	((c).done = 0)	/* reset; takes the object itself */
+#define	init_completion(c)	((c)->done = 0)	/* reset; takes a pointer */
+
+static inline void	/* post one completion, or (all != 0) release every current and future waiter */
+_complete_common(struct completion *c, int all)
+{
+	int wakeup_swapper;
+
+	sleepq_lock(c);
+	c->done += (all ? ~0U / 2 : 1);	/* each waiter consumes one count, so "all" must post enough for every waiter */
+	if (all)
+		wakeup_swapper = sleepq_broadcast(c, SLEEPQ_SLEEP, 0, 0);
+	else
+		wakeup_swapper = sleepq_signal(c, SLEEPQ_SLEEP, 0, 0);
+	sleepq_release(c);
+	if (wakeup_swapper)	/* the sleepqueue code asked for proc0 to be kicked */
+		kick_proc0();
+}
+
+#define	complete(c)	_complete_common(c, 0)	/* post one completion and wake one waiter */
+#define	complete_all(c)	_complete_common(c, 1)	/* post and wake all waiters */
+
+/*
+ * Indefinite wait for done != 0 with or without signals.
+ */
+static inline long	/* 0 once a completion was consumed, -ERESTARTSYS if the signal-aware sleep failed */
+_wait_for_common(struct completion *c, int flags)
+{
+
+	flags |= SLEEPQ_SLEEP;
+	for (;;) {
+		sleepq_lock(c);
+		if (c->done)	/* leave the loop with the sleepqueue lock still held */
+			break;
+		sleepq_add(c, NULL, "completion", flags, 0);
+		if (flags & SLEEPQ_INTERRUPTIBLE) {
+			if (sleepq_wait_sig(c, 0) != 0)
+				return (-ERESTARTSYS);
+		} else
+			sleepq_wait(c, 0);
+	}
+	c->done--;	/* consume exactly one completion */
+	sleepq_release(c);
+
+	return (0);
+}
+
+#define	wait_for_completion(c)	_wait_for_common(c, 0)	/* uninterruptible wait */
+#define	wait_for_completion_interuptible(c)				\
+	_wait_for_common(c, SLEEPQ_INTERRUPTIBLE)	/* NOTE: name is spelled "interuptible" (one r); kept as-is for existing users */
+
+static inline long	/* >0 ticks left on success, 0 on timeout, -ERESTARTSYS on signal */
+_wait_for_timeout_common(struct completion *c, long timeout, int flags)
+{
+	long end;
+	int error;
+
+	end = ticks + timeout;	/* absolute deadline in ticks */
+	flags |= SLEEPQ_SLEEP;
+	for (;;) {
+		sleepq_lock(c);
+		if (c->done)	/* leave the loop with the sleepqueue lock still held */
+			break;
+		sleepq_add(c, NULL, "completion", flags, 0);
+		sleepq_set_timeout(c, end - ticks);
+		error = (flags & SLEEPQ_INTERRUPTIBLE) ?
+		    sleepq_timedwait_sig(c, 0) : sleepq_timedwait(c, 0);
+		if (error != 0)	/* expired timer reports 0; anything else is a signal */
+			return (error == EWOULDBLOCK ? 0 : -ERESTARTSYS);
+	}
+	c->done--;	/* consume exactly one completion */
+	sleepq_release(c);
+	timeout = end - ticks;
+
+	return (timeout > 0 ? timeout : 1);	/* never 0 on success, so callers can tell it from a timeout */
+}
+
+#define	wait_for_completion_timeout(c, timeout)				\
+	_wait_for_timeout_common(c, timeout, 0)	/* timeout is in ticks; uninterruptible */
+#define	wait_for_completion_interruptible_timeout(c, timeout)		\
+	_wait_for_timeout_common(c, timeout, SLEEPQ_INTERRUPTIBLE)	/* timeout is in ticks; signal-aware */
+
+static inline int	/* 1 if a completion was consumed, 0 if none was pending */
+try_wait_for_completion(struct completion *c)
+{
+	int consumed;
+
+	/* Test and decrement under the sleepqueue lock; this never sleeps. */
+	sleepq_lock(c);
+	consumed = (c->done != 0);
+	if (consumed)
+		c->done--;
+	sleepq_release(c);
+
+	return (consumed);
+}
+
+static inline int	/* 1 if at least one completion is pending, else 0; consumes nothing */
+completion_done(struct completion *c)
+{
+	int pending;
+
+	/* Sample the counter under the sleepqueue lock for c. */
+	sleepq_lock(c);
+	pending = (c->done != 0);
+	sleepq_release(c);
+
+	return (pending);
+}
+
+#endif	/* _LINUX_COMPLETION_H_ */
diff --git a/sys/ofed/include/linux/ctype.h b/sys/ofed/include/linux/ctype.h
new file mode 100644
index 0000000..3ed4137
--- /dev/null
+++ b/sys/ofed/include/linux/ctype.h
@@ -0,0 +1,34 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_LINUX_CTYPE_H_
+#define	_LINUX_CTYPE_H_
+
+#include <sys/ctype.h>
+
+#endif	/* _LINUX_CTYPE_H_ */
diff --git a/sys/ofed/include/linux/delay.h b/sys/ofed/include/linux/delay.h
new file mode 100644
index 0000000..019ef8a
--- /dev/null
+++ b/sys/ofed/include/linux/delay.h
@@ -0,0 +1,43 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_DELAY_H_
+#define	_LINUX_DELAY_H_
+
+#include <linux/jiffies.h>
+
+static inline void	/* sleep for roughly ms milliseconds */
+linux_msleep(int ms)
+{
+	pause("lnxsleep", msecs_to_jiffies(ms));	/* NOTE(review): behaviour when this converts to 0 ticks depends on pause() - confirm */
+}
+
+#undef msleep	/* drop any earlier msleep macro so the Linux-style one below wins */
+#define	msleep	linux_msleep	/* Linux drivers call msleep(ms) */
+
+#endif	/* _LINUX_DELAY_H_ */
diff --git a/sys/ofed/include/linux/device.h b/sys/ofed/include/linux/device.h
new file mode 100644
index 0000000..cce46ca
--- /dev/null
+++ b/sys/ofed/include/linux/device.h
@@ -0,0 +1,388 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef	_LINUX_DEVICE_H_
+#define	_LINUX_DEVICE_H_
+
+#include <linux/types.h>
+#include <linux/kobject.h>
+#include <linux/list.h>
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/workqueue.h>
+#include <linux/sysfs.h>
+#include <linux/kdev_t.h>
+#include <asm/atomic.h>
+
+#include <sys/bus.h>
+
+enum irqreturn	{ IRQ_NONE = 0, IRQ_HANDLED, IRQ_WAKE_THREAD, };	/* interrupt handler return codes */
+typedef enum irqreturn	irqreturn_t;
+
+struct class {	/* Linux device class backed by a FreeBSD devclass */
+	const char	*name;
+	struct module	*owner;
+	struct kobject	kobj;	/* added under class_root by class_register() */
+	devclass_t	bsdclass;	/* created from name by class_register() */
+	void		(*class_release)(struct class *class);	/* called from class_release() when the kobject is released */
+	void		(*dev_release)(struct device *dev);	/* fallback used by device_release() when the device has no release */
+};
+
+struct device {	/* Linux struct device paired with a FreeBSD device_t */
+	struct device	*parent;	/* its bsddev is the newbus parent in device_register() */
+	struct list_head irqents;
+	device_t	bsddev;	/* FreeBSD device, looked up or created by device_register() */
+	dev_t		devt;	/* MINOR(devt) selects the unit; 0 means "assign one" */
+	struct class	*class;
+	void		(*release)(struct device *dev);	/* preferred release hook, see device_release() */
+	struct kobject	kobj;
+	uint64_t	*dma_mask;
+	void		*driver_data;	/* accessed via dev_get_drvdata()/dev_set_drvdata() */
+	unsigned int	irq;
+	unsigned int	msix;
+	unsigned int	msix_max;
+};
+
+extern struct device linux_rootdev;
+extern struct kobject class_root;
+
+struct class_attribute {	/* sysfs attribute of a class; dispatched by class_show()/class_store() */
+	struct attribute	attr;
+        ssize_t			(*show)(struct class *, char *);
+        ssize_t			(*store)(struct class *, const char *, size_t);
+};
+#define	CLASS_ATTR(_name, _mode, _show, _store)				\
+	struct class_attribute class_attr_##_name =			\
+	    { { #_name, NULL, _mode }, _show, _store }	/* defines class_attr_<name> with positional initialisers */
+
+struct device_attribute {	/* sysfs attribute of a device; dispatched by dev_show()/dev_store() */
+	struct attribute	attr;
+	ssize_t			(*show)(struct device *,
+				    struct device_attribute *, char *);
+	ssize_t			(*store)(struct device *,
+				    struct device_attribute *, const char *,
+				    size_t);
+};
+
+#define	DEVICE_ATTR(_name, _mode, _show, _store)			\
+	struct device_attribute dev_attr_##_name =			\
+	    { { #_name, NULL, _mode }, _show, _store }	/* defines dev_attr_<name> with positional initialisers */
+
+#define	dev_err(dev, fmt, ...)	device_printf((dev)->bsddev, fmt, ##__VA_ARGS__)	/* all levels map to device_printf() */
+#define	dev_warn(dev, fmt, ...)	device_printf((dev)->bsddev, fmt, ##__VA_ARGS__)
+#define	dev_info(dev, fmt, ...)	device_printf((dev)->bsddev, fmt, ##__VA_ARGS__)
+#define	dev_printk(lvl, dev, fmt, ...)					\
+	    device_printf((dev)->bsddev, fmt, ##__VA_ARGS__)	/* lvl is ignored */
+
+static inline void *	/* return the driver-private pointer stored in dev */
+dev_get_drvdata(struct device *dev)
+{
+
+	return dev->driver_data;
+}
+
+static inline void	/* store a driver-private pointer in dev */
+dev_set_drvdata(struct device *dev, void *data)
+{
+
+	dev->driver_data = data;
+}
+
+static inline struct device *	/* take a kobject reference; NULL is passed through */
+get_device(struct device *dev)
+{
+
+	if (dev)
+		kobject_get(&dev->kobj);
+
+	return (dev);
+}
+
+static inline char *	/* name of the device's kobject */
+dev_name(const struct device *dev)
+{
+
+ 	return kobject_name(&dev->kobj);
+}
+
+#define	dev_set_name(_dev, _fmt, ...)					\
+	kobject_set_name(&(_dev)->kobj, (_fmt), ##__VA_ARGS__)	/* printf-style naming of the device's kobject */
+
+static inline void	/* drop a kobject reference; NULL is ignored */
+put_device(struct device *dev)
+{
+
+	if (dev)
+		kobject_put(&dev->kobj);
+}
+
+static inline ssize_t	/* sysfs read: forward to the class attribute's show(), -EIO if it has none */
+class_show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+	struct class_attribute *dattr;
+	ssize_t error;
+
+	dattr = container_of(attr, struct class_attribute, attr);
+	error = -EIO;	/* result when no show handler is installed */
+	if (dattr->show)
+		error = dattr->show(container_of(kobj, struct class, kobj),
+		    buf);
+	return (error);
+}
+
+static inline ssize_t	/* sysfs write: forward to the class attribute's store(), -EIO if it has none */
+class_store(struct kobject *kobj, struct attribute *attr, const char *buf,
+    size_t count)
+{
+	struct class_attribute *dattr;
+	ssize_t error;
+
+	dattr = container_of(attr, struct class_attribute, attr);
+	error = -EIO;	/* result when no store handler is installed */
+	if (dattr->store)
+		error = dattr->store(container_of(kobj, struct class, kobj),
+		    buf, count);
+	return (error);
+}
+
+static inline void	/* kobject release for a class: run its class_release hook, if any */
+class_release(struct kobject *kobj)
+{
+	struct class *class;
+
+	class = container_of(kobj, struct class, kobj);
+	if (class->class_release)
+		class->class_release(class);
+}
+
+static struct sysfs_ops class_sysfs = {	/* attribute dispatch for class kobjects */
+	.show  = class_show,
+	.store = class_store,
+};
+static struct kobj_type class_ktype = {	/* ktype installed by class_register() */
+	.release = class_release,
+	.sysfs_ops = &class_sysfs
+};
+
+static inline int	/* create the devclass and publish the class kobject; always returns 0 */
+class_register(struct class *class)
+{
+
+	class->bsdclass = devclass_create(class->name);
+	kobject_init(&class->kobj, &class_ktype);
+	kobject_set_name(&class->kobj, "%s", class->name);	/* name is data, not a format */
+	kobject_add(&class->kobj, &class_root, "%s", class->name);
+
+	return (0);
+}
+
+static inline void	/* drop the class kobject reference; class_release() runs when it is released */
+class_unregister(struct class *class)
+{
+
+	kobject_put(&class->kobj);
+}
+
+static inline void	/* kobject release for a device: call the device's or its class's release hook */
+device_release(struct kobject *kobj)
+{
+	struct device *dev;
+
+	dev = container_of(kobj, struct device, kobj);
+	/* This is the precedence defined by linux. */
+	if (dev->release)
+		dev->release(dev);
+	else if (dev->class && dev->class->dev_release)
+		dev->class->dev_release(dev);
+}
+
+static inline ssize_t	/* sysfs read: forward to the device attribute's show(), -EIO if it has none */
+dev_show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+	struct device_attribute *dattr;
+	ssize_t error;
+
+	dattr = container_of(attr, struct device_attribute, attr);
+	error = -EIO;	/* result when no show handler is installed */
+	if (dattr->show)
+		error = dattr->show(container_of(kobj, struct device, kobj),
+		    dattr, buf);
+	return (error);
+}
+
+static inline ssize_t	/* sysfs write: forward to the device attribute's store(), -EIO if it has none */
+dev_store(struct kobject *kobj, struct attribute *attr, const char *buf,
+    size_t count)
+{
+	struct device_attribute *dattr;
+	ssize_t error;
+
+	dattr = container_of(attr, struct device_attribute, attr);
+	error = -EIO;	/* result when no store handler is installed */
+	if (dattr->store)
+		error = dattr->store(container_of(kobj, struct device, kobj),
+		    dattr, buf, count);
+	return (error);
+}
+
+static struct sysfs_ops dev_sysfs = { .show  = dev_show, .store = dev_store, };	/* attribute dispatch for device kobjects */
+static struct kobj_type dev_ktype = {	/* ktype installed by device_register() */
+	.release = device_release,
+	.sysfs_ops = &dev_sysfs
+};
+
+/*
+ * Devices are registered and created for exporting to sysfs.  create
+ * implies register and register assumes the device fields have been
+ * setup appropriately before being called.
+ */
+static inline int	/* always returns 0; dev->bsddev may be NULL afterwards */
+device_register(struct device *dev)
+{
+	device_t bsddev;
+	int unit;
+
+	bsddev = NULL;
+	if (dev->devt) {	/* explicit device number: reuse an existing unit if there is one */
+		unit = MINOR(dev->devt);
+		bsddev = devclass_get_device(dev->class->bsdclass, unit);
+	} else
+		unit = -1;	/* let device_add_child() pick the unit */
+	if (bsddev == NULL)
+		bsddev = device_add_child(dev->parent->bsddev,
+		    dev->class->kobj.name, unit);
+	if (bsddev) {
+		if (dev->devt == 0)	/* record the unit that was assigned */
+			dev->devt = makedev(0, device_get_unit(bsddev));
+		device_set_softc(bsddev, dev);	/* lets device_destroy() map bsddev back to dev */
+	}
+	dev->bsddev = bsddev;
+	kobject_init(&dev->kobj, &dev_ktype);
+	kobject_add(&dev->kobj, &dev->class->kobj, "%s", dev_name(dev));	/* name is data, not a format */
+
+	return (0);
+}
+
+static inline void	/* delete the FreeBSD child device (if any) and drop the device reference */
+device_unregister(struct device *dev)
+{
+	device_t bsddev;
+
+	bsddev = dev->bsddev;
+	mtx_lock(&Giant);	/* the newbus delete below runs under Giant */
+	if (bsddev)
+		device_delete_child(device_get_parent(bsddev), bsddev);
+	mtx_unlock(&Giant);
+	put_device(dev);
+}
+
+struct device *device_create(struct class *class, struct device *parent,
+	    dev_t devt, void *drvdata, const char *fmt, ...);
+
+static inline void	/* unregister the device of this class whose unit is MINOR(devt), if it exists */
+device_destroy(struct class *class, dev_t devt)
+{
+	device_t bsddev;
+	int unit;
+
+	unit = MINOR(devt);
+	bsddev = devclass_get_device(class->bsdclass, unit);
+	if (bsddev)	/* the softc is the struct device stored by device_register() */
+		device_unregister(device_get_softc(bsddev));
+}
+
+static inline void	/* class_release hook installed by class_create(): free the class */
+class_kfree(struct class *class)
+{
+
+	kfree(class);
+}
+
+static inline struct class *	/* allocate and register a class; NULL if registration fails */
+class_create(struct module *owner, const char *name)
+{
+	struct class *class;
+	int error;
+
+	class = kzalloc(sizeof(*class), M_WAITOK);
+	class->owner = owner;
+	class->name= name;	/* pointer is stored, not copied */
+	class->class_release = class_kfree;	/* freed when the class kobject is released */
+	error = class_register(class);
+	if (error) {
+		kfree(class);
+		return (NULL);
+	}
+
+	return (class);
+}
+
+static inline void	/* unregister a class; NULL is ignored */
+class_destroy(struct class *class)
+{
+
+	if (class == NULL)
+		return;
+	class_unregister(class);
+}
+
+static inline int	/* add a sysfs attribute file to dev; -EINVAL if dev is NULL */
+device_create_file(struct device *dev, const struct device_attribute *attr)
+{
+
+	if (dev)
+		return sysfs_create_file(&dev->kobj, &attr->attr);
+	return -EINVAL;
+}
+
+static inline void	/* remove a sysfs attribute file from dev; NULL dev is ignored */
+device_remove_file(struct device *dev, const struct device_attribute *attr)
+{
+
+	if (dev)
+		sysfs_remove_file(&dev->kobj, &attr->attr);
+}
+
+static inline int	/* add a sysfs attribute file to class; -EINVAL if class is NULL */
+class_create_file(struct class *class, const struct class_attribute *attr)
+{
+
+	if (class)
+		return sysfs_create_file(&class->kobj, &attr->attr);
+	return -EINVAL;
+}
+
+static inline void	/* remove a sysfs attribute file from class; NULL class is ignored */
+class_remove_file(struct class *class, const struct class_attribute *attr)
+{
+
+	if (class)
+		sysfs_remove_file(&class->kobj, &attr->attr);
+}
+
+#endif	/* _LINUX_DEVICE_H_ */
diff --git a/sys/ofed/include/linux/dma-attrs.h b/sys/ofed/include/linux/dma-attrs.h
new file mode 100644
index 0000000..9e625bd
--- /dev/null
+++ b/sys/ofed/include/linux/dma-attrs.h
@@ -0,0 +1,47 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef	_LINUX_DMA_ATTR_H_
+#define	_LINUX_DMA_ATTR_H_
+
+/* DMA attribute identifiers; DMA_ATTR_MAX counts the attributes above it. */
+enum dma_attr { DMA_ATTR_WRITE_BARRIER, DMA_ATTR_WEAK_ORDERING, DMA_ATTR_MAX, };
+
+/* Number of longs needed to hold one bit per attribute. */
+#define __DMA_ATTRS_LONGS BITS_TO_LONGS(DMA_ATTR_MAX)
+
+/*
+ * Attribute set.  The flags are kept in a single unsigned long here;
+ * __DMA_ATTRS_LONGS is not used to size this structure.
+ */
+struct dma_attrs {
+	unsigned long flags;
+};
+ 
+/* Define a struct dma_attrs variable named 'x' with an empty initializer. */
+#define DEFINE_DMA_ATTRS(x) struct dma_attrs x = { }
+
+/* Reset an attribute set by clearing all of its flags. */
+static inline void
+init_dma_attrs(struct dma_attrs *attrs)
+{
+	attrs->flags = 0;
+}
+
+#endif	/* _LINUX_DMA_ATTR_H_ */
diff --git a/sys/ofed/include/linux/dma-mapping.h b/sys/ofed/include/linux/dma-mapping.h
new file mode 100644
index 0000000..c653524
--- /dev/null
+++ b/sys/ofed/include/linux/dma-mapping.h
@@ -0,0 +1,263 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef	_LINUX_DMA_MAPPING_H_
+#define _LINUX_DMA_MAPPING_H_
+
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/dma-attrs.h>
+#include <linux/scatterlist.h>
+#include <linux/mm.h>
+#include <linux/page.h>
+
+#include <sys/systm.h>
+#include <sys/malloc.h>
+
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+#include <vm/pmap.h>
+
+#include <machine/bus.h>
+#include <machine/pmap.h>
+
+/*
+ * Direction argument of the DMA mapping and sync helpers.  The helpers in
+ * this header accept it but do not act on it.
+ */
+enum dma_data_direction {
+	DMA_BIDIRECTIONAL = 0,
+	DMA_TO_DEVICE = 1,
+	DMA_FROM_DEVICE = 2,
+	DMA_NONE = 3,
+};
+
+/*
+ * Table of DMA operation callbacks.  The inline helpers defined in this
+ * header do not call through this table.
+ */
+struct dma_map_ops {
+	void* (*alloc_coherent)(struct device *dev, size_t size,
+	    dma_addr_t *dma_handle, gfp_t gfp);
+	void (*free_coherent)(struct device *dev, size_t size,
+	    void *vaddr, dma_addr_t dma_handle);
+	dma_addr_t (*map_page)(struct device *dev, struct page *page,
+	    unsigned long offset, size_t size, enum dma_data_direction dir,
+	    struct dma_attrs *attrs);
+	void (*unmap_page)(struct device *dev, dma_addr_t dma_handle,
+	    size_t size, enum dma_data_direction dir, struct dma_attrs *attrs);
+	int (*map_sg)(struct device *dev, struct scatterlist *sg,
+	    int nents, enum dma_data_direction dir, struct dma_attrs *attrs);
+	void (*unmap_sg)(struct device *dev, struct scatterlist *sg, int nents,
+	    enum dma_data_direction dir, struct dma_attrs *attrs);
+	void (*sync_single_for_cpu)(struct device *dev, dma_addr_t dma_handle,
+	    size_t size, enum dma_data_direction dir);
+	void (*sync_single_for_device)(struct device *dev,
+	    dma_addr_t dma_handle, size_t size, enum dma_data_direction dir);
+	void (*sync_single_range_for_cpu)(struct device *dev,
+	    dma_addr_t dma_handle, unsigned long offset, size_t size,
+	    enum dma_data_direction dir);
+	void (*sync_single_range_for_device)(struct device *dev,
+	    dma_addr_t dma_handle, unsigned long offset, size_t size,
+	    enum dma_data_direction dir);
+	void (*sync_sg_for_cpu)(struct device *dev, struct scatterlist *sg,
+	    int nents, enum dma_data_direction dir);
+	void (*sync_sg_for_device)(struct device *dev, struct scatterlist *sg,
+	    int nents, enum dma_data_direction dir);
+	int (*mapping_error)(struct device *dev, dma_addr_t dma_addr);
+	int (*dma_supported)(struct device *dev, u64 mask);
+	int is_phys;
+};
+
+/*
+ * Mask with the low 'n' bits set.  n == 64 is special-cased because shifting
+ * a 64-bit value by 64 is undefined in C.
+ */
+#define	DMA_BIT_MASK(n)	(((n) == 64) ? ~0ULL : ((1ULL << (n)) - 1))
+
+/*
+ * Report whether 'dev' can use the DMA mask 'mask'.  Always returns 1; both
+ * arguments are ignored.
+ */
+static inline int
+dma_supported(struct device *dev, u64 mask)
+{
+
+	/* XXX busdma takes care of this elsewhere. */
+	return (1);
+}
+ 
+/*
+ * Store 'dma_mask' as the DMA mask of 'dev'.
+ *
+ * Returns 0 on success, or -EIO when the device has no dma_mask storage or
+ * dma_supported() rejects the mask.
+ */
+static inline int
+dma_set_mask(struct device *dev, u64 dma_mask)
+{
+
+	if (dev->dma_mask == NULL)
+		return (-EIO);
+	if (!dma_supported(dev, dma_mask))
+		return (-EIO);
+	*dev->dma_mask = dma_mask;
+	return (0);
+}
+
+/*
+ * Validate a coherent DMA mask for 'dev'.  Nothing is stored.
+ *
+ * Returns 0 when dma_supported() accepts the mask and -EIO otherwise.
+ */
+static inline int
+dma_set_coherent_mask(struct device *dev, u64 mask)
+{
+
+	/* XXX Currently we don't support a separate coherent mask. */
+	return (dma_supported(dev, mask) ? 0 : -EIO);
+}
+
+/*
+ * Allocate 'size' bytes of physically contiguous kernel memory for DMA.
+ *
+ * dev:        device whose dma_mask, when set, is the highest physical
+ *             address allowed.  May be NULL; then, as when no mask is set,
+ *             the limit is BUS_SPACE_MAXADDR_32BIT.
+ * size:       number of bytes requested.
+ * dma_handle: receives the physical address of the memory, or 0 on failure.
+ * flag:       allocation flags handed to kmem_alloc_contig().
+ *
+ * Returns the kernel virtual address of the memory, or NULL on failure.
+ */
+static inline void *
+dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
+    gfp_t flag)
+{
+	vm_paddr_t high;
+	size_t align;
+	void *mem;
+
+	/* Never dereference a missing device; use the 32-bit fallback. */
+	if (dev != NULL && dev->dma_mask)
+		high = *dev->dma_mask;
+	else
+		high = BUS_SPACE_MAXADDR_32BIT;
+	/* Alignment is PAGE_SIZE scaled by the order computed for 'size'. */
+	align = PAGE_SIZE << get_order(size);
+	mem = (void *)kmem_alloc_contig(kmem_map, size, flag, 0, high, align,
+	    0, VM_MEMATTR_DEFAULT);
+	if (mem)
+		*dma_handle = vtophys(mem);
+	else
+		*dma_handle = 0;
+	return (mem);
+}
+                       
+/*
+ * Release memory obtained from dma_alloc_coherent(): 'size' bytes at
+ * 'cpu_addr' are returned to kmem_map.  'dev' and 'dma_handle' are unused.
+ */
+static inline void
+dma_free_coherent(struct device *dev, size_t size, void *cpu_addr,
+    dma_addr_t dma_handle)
+{
+
+	kmem_free(kmem_map, (vm_offset_t)cpu_addr, size);
+}
+
+/*
+ * Map a kernel buffer for DMA by returning the physical address of 'ptr'.
+ * 'dev', 'size', 'dir' and 'attrs' are ignored.
+ *
+ * NOTE(review): only the address of 'ptr' itself is translated, so a buffer
+ * spanning several pages is presumably expected to be physically
+ * contiguous -- confirm against callers.
+ */
+/* XXX This only works with no iommu. */
+static inline dma_addr_t
+dma_map_single_attrs(struct device *dev, void *ptr, size_t size,
+    enum dma_data_direction dir, struct dma_attrs *attrs)
+{
+
+	return vtophys(ptr);
+}
+
+/* Counterpart of dma_map_single_attrs(); a no-op in this implementation. */
+static inline void
+dma_unmap_single_attrs(struct device *dev, dma_addr_t addr, size_t size,
+    enum dma_data_direction dir, struct dma_attrs *attrs)
+{
+}
+
+/*
+ * Map a scatterlist for DMA: the DMA address of each of the 'nents' entries
+ * is set to the entry's physical address.  'dev', 'dir' and 'attrs' are
+ * ignored.
+ *
+ * Returns 'nents'.
+ */
+static inline int
+dma_map_sg_attrs(struct device *dev, struct scatterlist *sgl, int nents,
+    enum dma_data_direction dir, struct dma_attrs *attrs)
+{
+	struct scatterlist *cur;
+	int idx;
+
+	for_each_sg(sgl, cur, nents, idx) {
+		sg_dma_address(cur) = sg_phys(cur);
+	}
+	return (nents);
+}
+
+/* Counterpart of dma_map_sg_attrs(); a no-op in this implementation. */
+static inline void
+dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, int nents,
+    enum dma_data_direction dir, struct dma_attrs *attrs)
+{
+}
+ 
+/*
+ * Map part of a page for DMA: returns the physical address of 'page' plus
+ * 'offset'.  'dev', 'size' and 'direction' are ignored.
+ */
+static inline dma_addr_t
+dma_map_page(struct device *dev, struct page *page,
+    unsigned long offset, size_t size, enum dma_data_direction direction)
+{
+
+	return VM_PAGE_TO_PHYS(page) + offset;
+}
+
+/* Counterpart of dma_map_page(); a no-op in this implementation. */
+static inline void
+dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
+    enum dma_data_direction direction)
+{
+}
+
+/* Sync a single mapping for CPU access; a no-op in this implementation. */
+static inline void
+dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
+    enum dma_data_direction direction)
+{
+}
+
+/* Forwards all of its arguments to dma_sync_single_for_cpu(). */
+static inline void
+dma_sync_single(struct device *dev, dma_addr_t addr, size_t size,
+    enum dma_data_direction dir)
+{
+	dma_sync_single_for_cpu(dev, addr, size, dir);
+}
+
+/* Sync a single mapping for device access; a no-op in this implementation. */
+static inline void
+dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle,
+    size_t size, enum dma_data_direction direction)
+{
+}
+
+/* Sync a scatterlist for CPU access; a no-op in this implementation. */
+static inline void
+dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
+    enum dma_data_direction direction)
+{
+}
+
+/* Sync a scatterlist for device access; a no-op in this implementation. */
+static inline void
+dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
+    enum dma_data_direction direction)
+{
+}
+
+/*
+ * Sync part of a single mapping for CPU access; a no-op in this
+ * implementation.  Note that 'direction' is a plain int here.
+ */
+static inline void
+dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle,
+    unsigned long offset, size_t size, int direction)
+{
+}
+
+/*
+ * Sync part of a single mapping for device access; a no-op in this
+ * implementation.  Note that 'direction' is a plain int here.
+ */
+static inline void
+dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle,
+    unsigned long offset, size_t size, int direction)
+{
+}
+
+/*
+ * Report whether 'dma_addr' denotes a failed mapping.  Always returns 0;
+ * both arguments are ignored.
+ */
+static inline int
+dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+
+	return (0);
+}
+
+/* Attribute-less forms of the *_attrs helpers: they pass NULL as 'attrs'. */
+#define dma_map_single(d, a, s, r) dma_map_single_attrs(d, a, s, r, NULL)
+#define dma_unmap_single(d, a, s, r) dma_unmap_single_attrs(d, a, s, r, NULL)
+#define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, NULL)
+#define dma_unmap_sg(d, s, n, r) dma_unmap_sg_attrs(d, s, n, r, NULL)
+
+/*
+ * Helpers for keeping unmap state inside a structure: the DEFINE_ macros
+ * declare a member called 'name' (dma_addr_t for the address, __u32 for the
+ * length); the accessors read or assign that member through pointer 'p'.
+ */
+#define	DEFINE_DMA_UNMAP_ADDR(name)		dma_addr_t name
+#define	DEFINE_DMA_UNMAP_LEN(name)		__u32 name
+#define	dma_unmap_addr(p, name)			((p)->name)
+#define	dma_unmap_addr_set(p, name, v)		(((p)->name) = (v))
+#define	dma_unmap_len(p, name)			((p)->name)
+#define	dma_unmap_len_set(p, name, v)		(((p)->name) = (v))
+
+/*
+ * dma_get_cache_alignment() evaluates to the uma_align_cache variable.
+ * NOTE(review): if uma_align_cache holds an alignment mask (size - 1) rather
+ * than a byte count, callers expecting a size get one less -- confirm.
+ */
+extern int uma_align_cache;
+#define	dma_get_cache_alignment()	uma_align_cache
+
+#endif	/* _LINUX_DMA_MAPPING_H_ */
diff --git a/sys/ofed/include/linux/dmapool.h b/sys/ofed/include/linux/dmapool.h
new file mode 100644
index 0000000..3b58164
--- /dev/null
+++ b/sys/ofed/include/linux/dmapool.h
@@ -0,0 +1,85 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_DMAPOOL_H_
+#define	_LINUX_DMAPOOL_H_
+
+#include <linux/types.h>
+#include <linux/io.h>
+#include <linux/scatterlist.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+
+/* A DMA pool; its buffers come from a single UMA zone. */
+struct dma_pool {
+	uma_zone_t	pool_zone;
+};
+
+/*
+ * Create a pool of fixed-size DMA buffers backed by a UMA zone.
+ *
+ * name:     zone name handed to uma_zcreate().
+ * dev:      unused.
+ * size:     size of each buffer.
+ * align:    required alignment in bytes (a power of two); 0 is treated as 1,
+ *           i.e. no alignment constraint, as Linux does.
+ * boundary: unused, see the XXX note below.
+ *
+ * Returns the new pool.
+ */
+static inline struct dma_pool *
+dma_pool_create(char *name, struct device *dev, size_t size,
+    size_t align, size_t boundary)
+{
+	struct dma_pool *pool;
+
+	pool = kmalloc(sizeof(*pool), GFP_KERNEL);
+	/*
+	 * uma_zcreate() takes an alignment mask (alignment - 1).  Guard the
+	 * zero case so the decrement cannot wrap to an all-ones mask.
+	 */
+	if (align == 0)
+		align = 1;
+	align--;
+	/*
+	 * XXX Eventually this could use a separate allocf to honor boundary
+	 * and physical address requirements of the device.
+	 */
+	pool->pool_zone = uma_zcreate(name, size, NULL, NULL, NULL, NULL,
+	    align, UMA_ZONE_OFFPAGE|UMA_ZONE_HASH);
+
+	return (pool);
+}
+
+/* Destroy the pool's UMA zone, then free the pool structure itself. */
+static inline void
+dma_pool_destroy(struct dma_pool *pool)
+{
+	uma_zdestroy(pool->pool_zone);
+	kfree(pool);
+}
+
+/*
+ * Allocate one buffer from 'pool' with the allocation flags 'mem_flags'.
+ *
+ * On success the buffer's physical address is stored in '*handle' and its
+ * virtual address is returned.  On failure NULL is returned and '*handle' is
+ * left untouched.
+ */
+static inline void *
+dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, dma_addr_t *handle)
+{
+	void *buf;
+
+	buf = uma_zalloc(pool->pool_zone, mem_flags);
+	if (buf == NULL)
+		return (NULL);
+	*handle = vtophys(buf);
+	return (buf);
+}
+
+/* Return buffer 'vaddr' to the pool's UMA zone; 'addr' is unused. */
+static inline void
+dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t addr)
+{
+	uma_zfree(pool->pool_zone, vaddr);
+}
+
+
+#endif /* _LINUX_DMAPOOL_H_ */
diff --git a/sys/ofed/include/linux/err.h b/sys/ofed/include/linux/err.h
new file mode 100644
index 0000000..858931d
--- /dev/null
+++ b/sys/ofed/include/linux/err.h
@@ -0,0 +1,60 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_LINUX_ERR_H_
+#define	_LINUX_ERR_H_
+
+/* Largest errno value that can be encoded in a pointer. */
+#define MAX_ERRNO	4095
+
+/*
+ * True when 'x' lies in the top MAX_ERRNO values of the unsigned long range,
+ * i.e. when it is a value in [-MAX_ERRNO, -1] converted to unsigned long.
+ */
+#define IS_ERR_VALUE(x) ((x) >= (unsigned long)-MAX_ERRNO)
+
+/* Encode the error number 'error' as a pointer value. */
+static inline void *
+ERR_PTR(long error)
+{
+	void *encoded = (void *)error;
+
+	return (encoded);
+}
+
+/* Recover the number stored in a pointer by ERR_PTR(). */
+static inline long
+PTR_ERR(const void *ptr)
+{
+	long error = (long)ptr;
+
+	return (error);
+}
+
+/*
+ * Return non-zero when 'ptr' is an encoded error, i.e. when its value passes
+ * IS_ERR_VALUE().
+ */
+static inline long
+IS_ERR(const void *ptr)
+{
+	unsigned long value = (unsigned long)ptr;
+
+	return (IS_ERR_VALUE(value));
+}
+
+/*
+ * Pass an encoded error pointer through as a plain 'void *' so it can be
+ * returned from a function with a different pointer type.  The parameter is
+ * const-qualified, like those of PTR_ERR() and IS_ERR(), so that
+ * const-qualified pointers can be handed in as well; the value is returned
+ * unchanged.
+ */
+static inline void *
+ERR_CAST(const void *ptr)
+{
+	return (void *)ptr;
+}
+
+#endif	/* _LINUX_ERR_H_ */
diff --git a/sys/ofed/include/linux/errno.h b/sys/ofed/include/linux/errno.h
new file mode 100644
index 0000000..b107c45
--- /dev/null
+++ b/sys/ofed/include/linux/errno.h
@@ -0,0 +1,39 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_LINUX_ERRNO_H_
+#define	_LINUX_ERRNO_H_
+
+#include <sys/errno.h>
+
+/* Linux errno names used by the ported code, aliased to <sys/errno.h> values. */
+#define	ECOMM		ESTALE
+#define	ENODATA		ECONNREFUSED
+#define	ENOIOCTLCMD	ENOIOCTL		/* XXX this is negative */
+#define ERESTARTSYS     ERESTART		/* XXX this is negative */
+
+#endif	/* _LINUX_ERRNO_H_ */
diff --git a/sys/ofed/include/linux/ethtool.h b/sys/ofed/include/linux/ethtool.h
new file mode 100644
index 0000000..a267209
--- /dev/null
+++ b/sys/ofed/include/linux/ethtool.h
@@ -0,0 +1,31 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef	_LINUX_ETHTOOL_H_
+#define	_LINUX_ETHTOOL_H_
+
+#endif	/* _LINUX_ETHTOOL_H_ */
diff --git a/sys/ofed/include/linux/file.h b/sys/ofed/include/linux/file.h
new file mode 100644
index 0000000..12858d7
--- /dev/null
+++ b/sys/ofed/include/linux/file.h
@@ -0,0 +1,120 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef	_LINUX_FILE_H_
+#define	_LINUX_FILE_H_
+
+#include <sys/param.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/refcount.h>
+#include <sys/proc.h>
+
+#include <linux/fs.h>
+
+struct linux_file;
+
+/*
+ * 'file' is defined as linux_file elsewhere in these headers; drop that
+ * definition so the functions below can name the native 'struct file'.  It
+ * is defined again at the end of this header.
+ */
+#undef file
+
+/* File operations table handed to finit() by fd_install(). */
+extern struct fileops linuxfileops;
+
+/*
+ * Look up descriptor 'fd' in the current process and return the Linux file
+ * stored in the native file's f_data.
+ *
+ * Returns NULL when the descriptor does not resolve to a file, instead of
+ * dereferencing the failed lookup.
+ *
+ * NOTE(review): whether the lookup leaves a reference held that the caller
+ * must drop with fput() is not visible here -- confirm.
+ */
+static inline struct linux_file *
+linux_fget(unsigned int fd)
+{
+	struct file *file;
+
+	file = fget_unlocked(curthread->td_proc->p_fd, fd);
+	if (file == NULL)
+		return (NULL);
+	return (struct linux_file *)file->f_data;
+}
+
+/*
+ * Drop a reference on a Linux file.
+ *
+ * A file that has no native file attached (_file is NULL) is freed right
+ * away.  Otherwise one reference on the native file is released; when that
+ * was the last one the native file is dropped with _fdrop() and 'filp' is
+ * freed.
+ */
+static inline void
+fput(struct linux_file *filp)
+{
+	struct file *fp;
+
+	fp = filp->_file;
+	if (fp != NULL) {
+		/* Other references remain: leave both objects alone. */
+		if (!refcount_release(&fp->f_count))
+			return;
+		_fdrop(fp, curthread);
+	}
+	kfree(filp);
+}
+
+/*
+ * Close descriptor 'fd' of the current process with fdclose().  Nothing is
+ * done when the descriptor does not resolve to a file.
+ */
+static inline void
+put_unused_fd(unsigned int fd)
+{
+	struct filedesc *fdp;
+	struct file *fp;
+
+	fdp = curthread->td_proc->p_fd;
+	fp = fget_unlocked(fdp, fd);
+	if (fp != NULL)
+		fdclose(fdp, fp, fd, curthread);
+}
+
+/*
+ * Attach the Linux file 'filp' to descriptor 'fd' of the current process:
+ * the native file behind 'fd' is recorded in filp->_file and initialized
+ * with finit() to use 'filp' as its data and linuxfileops as its operations.
+ *
+ * NOTE(review): the lookup result is not checked; put_unused_fd() shows it
+ * can be NULL, so 'fd' presumably has to come from get_unused_fd() --
+ * confirm against callers.
+ */
+static inline void
+fd_install(unsigned int fd, struct linux_file *filp)
+{
+	struct file *file;
+
+	file = fget_unlocked(curthread->td_proc->p_fd, fd);
+	filp->_file = file;
+        finit(file, filp->f_mode, DTYPE_DEV, filp, &linuxfileops);
+}
+
+/*
+ * Allocate a new file and descriptor for the current thread with falloc().
+ *
+ * Returns the descriptor number on success, or the negated falloc() error.
+ */
+static inline int
+get_unused_fd(void)
+{
+	struct file *fp;
+	int fd, rc;
+
+	rc = falloc(curthread, &fp, &fd);
+	return (rc != 0 ? -rc : fd);
+}
+
+/*
+ * Allocate a zeroed Linux file and record its operations table and mode.
+ *
+ * Returns the new file, or NULL when the allocation returns NULL.
+ */
+static inline struct linux_file *
+_alloc_file(int mode, const struct file_operations *fops)
+{
+	struct linux_file *lf;
+
+	lf = kzalloc(sizeof(*lf), GFP_KERNEL);
+	if (lf != NULL) {
+		lf->f_op = fops;
+		lf->f_mode = mode;
+	}
+	return (lf);
+}
+
+/* alloc_file() drops its 'mnt' and 'root' arguments. */
+#define	alloc_file(mnt, root, mode, fops)	_alloc_file((mode), (fops))
+
+/* From here on 'file' names linux_file and fget() is linux_fget(). */
+#define	file	linux_file
+#define	fget	linux_fget
+
+#endif	/* _LINUX_FILE_H_ */
diff --git a/sys/ofed/include/linux/fs.h b/sys/ofed/include/linux/fs.h
new file mode 100644
index 0000000..4e667cc
--- /dev/null
+++ b/sys/ofed/include/linux/fs.h
@@ -0,0 +1,182 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef	_LINUX_FS_H_
+#define	_LINUX_FS_H_
+
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+#include <linux/semaphore.h>
+
+struct module;
+struct kiocb;
+struct iovec;
+struct dentry;
+struct page;
+struct file_lock;
+struct pipe_inode_info;
+struct vm_area_struct;
+struct poll_table_struct;
+struct files_struct;
+
+/* The Linux inode is mapped onto the vnode, and i_cdev onto v_rdev. */
+#define	inode	vnode
+#define	i_cdev	v_rdev
+
+/* Read / write permission bits for user, group and other combined. */
+#define	S_IRUGO	(S_IRUSR | S_IRGRP | S_IROTH)
+#define	S_IWUGO	(S_IWUSR | S_IWGRP | S_IWOTH)
+
+
+typedef struct files_struct *fl_owner_t;
+
+/* Minimal dentry: only carries the inode pointer. */
+struct dentry {
+	struct inode	*d_inode;
+};
+
+struct file_operations;
+
+/*
+ * Linux view of an open file.  _file points at the native file once one has
+ * been attached, f_op is the driver's operations table and f_mode the mode
+ * given at allocation.  fasync_helper() hands out the address of f_sigio.
+ */
+struct linux_file {
+	struct file	*_file;
+	const struct file_operations	*f_op;
+	void 		*private_data;
+	int		f_flags;
+	int		f_mode;	/* Just starting mode. */
+	struct dentry	*f_dentry;
+	struct dentry	f_dentry_store;
+	struct selinfo	f_selinfo;
+	struct sigio	*f_sigio;
+};
+
+/* 'struct file' in ported code names struct linux_file. */
+#define	file		linux_file
+/* 'struct fasync_struct' expands to 'struct sigio *' (note the pointer). */
+#define	fasync_struct	sigio *
+
+/*
+ * When 'on' is non-zero store the address of filp->f_sigio in *queue,
+ * otherwise store NULL.  'fd' is unused.  The expression evaluates to 0.
+ */
+#define	fasync_helper(fd, filp, on, queue)				\
+({									\
+	if ((on))							\
+		*(queue) = &(filp)->f_sigio;				\
+	else								\
+		*(queue) = NULL;					\
+	0;								\
+})
+
+/*
+ * Deliver 'sig' through pgsigio() when *queue is set; 'pollstat' is unused.
+ */
+#define	kill_fasync(queue, sig, pollstat)				\
+do {									\
+	if (*(queue) != NULL)						\
+		pgsigio(*(queue), (sig), 0);				\
+} while (0)
+
+typedef int (*filldir_t)(void *, const char *, int, loff_t, u64, unsigned);
+
+/*
+ * File operations a driver may provide.  Only the methods above the #if 0
+ * exist as members; the remaining Linux methods are compiled out so that
+ * code assigning them does not build.
+ */
+struct file_operations {
+	struct module *owner;
+	ssize_t (*read)(struct file *, char __user *, size_t, loff_t *);
+	ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *);
+	unsigned int (*poll) (struct file *, struct poll_table_struct *);
+	long (*unlocked_ioctl)(struct file *, unsigned int, unsigned long);
+	int (*mmap)(struct file *, struct vm_area_struct *);
+	int (*open)(struct inode *, struct file *);
+	int (*release)(struct inode *, struct file *);
+	int (*fasync)(int, struct file *, int);
+#if 0
+	/* We do not support these methods.  Don't permit them to compile. */
+	loff_t (*llseek)(struct file *, loff_t, int);
+	ssize_t (*aio_read)(struct kiocb *, const struct iovec *,
+	    unsigned long, loff_t);
+	ssize_t (*aio_write)(struct kiocb *, const struct iovec *,
+	    unsigned long, loff_t);
+	int (*readdir)(struct file *, void *, filldir_t);
+	int (*ioctl)(struct inode *, struct file *, unsigned int,
+	    unsigned long);
+	long (*compat_ioctl)(struct file *, unsigned int, unsigned long);
+	int (*flush)(struct file *, fl_owner_t id);
+	int (*fsync)(struct file *, struct dentry *, int datasync);
+	int (*aio_fsync)(struct kiocb *, int datasync);
+	int (*lock)(struct file *, int, struct file_lock *);
+	ssize_t (*sendpage)(struct file *, struct page *, int, size_t,
+	    loff_t *, int);
+	unsigned long (*get_unmapped_area)(struct file *, unsigned long,
+	    unsigned long, unsigned long, unsigned long);
+	int (*check_flags)(int);
+	int (*flock)(struct file *, int, struct file_lock *);
+	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
+	    loff_t *, size_t, unsigned int);
+	ssize_t (*splice_read)(struct file *, loff_t *,
+	    struct pipe_inode_info *, size_t, unsigned int);
+	int (*setlease)(struct file *, long, struct file_lock **);
+#endif
+};
+/* fops_get() evaluates to its argument; no reference is taken. */
+#define	fops_get(fops)	(fops)
+
+/* Linux file mode flags expressed as the native F* flags. */
+#define	FMODE_READ	FREAD
+#define	FMODE_WRITE	FWRITE
+#define	FMODE_EXEC	FEXEC
+
+/* Stub: performs no registration and always returns 0. */
+static inline int
+register_chrdev_region(dev_t dev, unsigned range, const char *name)
+{
+
+	return 0;
+}
+
+/* Stub counterpart of register_chrdev_region(); does nothing. */
+static inline void
+unregister_chrdev_region(dev_t dev, unsigned range)
+{
+
+	return;
+}
+
+/* Return the unit number (dev2unit()) of the device behind 'inode'. */
+static inline dev_t
+iminor(struct inode *inode)
+{
+
+	return dev2unit(inode->v_rdev);
+}
+
+/*
+ * Acquire 'inode' with vget() (flags 0) on behalf of the current thread.
+ *
+ * Returns 'inode' on success, or NULL when vget() reports an error.
+ */
+static inline struct inode *
+igrab(struct inode *inode)
+{
+
+	if (vget(inode, 0, curthread) != 0)
+		return (NULL);
+	return (inode);
+}
+
+/* Release 'inode' with vrele(). */
+static inline void
+iput(struct inode *inode)
+{
+
+	vrele(inode);
+}
+
+#endif	/* _LINUX_FS_H_ */
diff --git a/sys/ofed/include/linux/gfp.h b/sys/ofed/include/linux/gfp.h
new file mode 100644
index 0000000..7f8a24f
--- /dev/null
+++ b/sys/ofed/include/linux/gfp.h
@@ -0,0 +1,122 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_LINUX_GFP_H_
+#define	_LINUX_GFP_H_
+
+#include <sys/systm.h>
+#include <sys/malloc.h>
+
+#include <linux/page.h>
+
+#include <vm/vm_object.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+
+/*
+ * Linux GFP flags expressed as M_* allocation flags.  __GFP_NOWARN and
+ * __GFP_HIGHMEM expand to 0 and therefore have no effect.
+ */
+#define	__GFP_NOWARN	0
+#define	__GFP_HIGHMEM	0
+#define	__GFP_ZERO	M_ZERO
+
+#define	GFP_NOWAIT	M_NOWAIT
+#define	GFP_ATOMIC	(M_NOWAIT | M_USE_RESERVE)
+#define	GFP_KERNEL	M_WAITOK
+#define	GFP_USER	M_WAITOK
+#define	GFP_HIGHUSER	M_WAITOK
+#define	GFP_HIGHUSER_MOVABLE	M_WAITOK
+#define	GFP_IOFS	M_NOWAIT
+
+/*
+ * Return the kernel virtual address of 'page'.
+ *
+ * Only pages that belong to kmem_object or kernel_object are translated:
+ * their address is VM_MIN_KERNEL_ADDRESS plus the byte offset of the page
+ * index.  NULL is returned for any other page.
+ */
+static inline void *
+page_address(struct page *page)
+{
+
+	if (page->object == kmem_object || page->object == kernel_object)
+		return ((void *)(VM_MIN_KERNEL_ADDRESS +
+		    IDX_TO_OFF(page->pindex)));
+	return (NULL);
+}
+
+/*
+ * Allocate PAGE_SIZE bytes from kmem_map with the flags 'mask' and return
+ * the result of kmem_malloc() as an unsigned long.
+ */
+static inline unsigned long
+_get_page(gfp_t mask)
+{
+
+	return kmem_malloc(kmem_map, PAGE_SIZE, mask);
+}
+
+/*
+ * Single-page allocators built on _get_page(): get_zeroed_page() adds
+ * M_ZERO, alloc_page() converts the address with virt_to_page(), and
+ * __get_free_page() returns the address as is.
+ */
+#define	get_zeroed_page(mask)	_get_page((mask) | M_ZERO)
+#define	alloc_page(mask)	virt_to_page(_get_page((mask)))
+#define	__get_free_page(mask)	_get_page((mask))
+
+/*
+ * Free the PAGE_SIZE allocation at address 'page' back to kmem_map.  An
+ * address of 0 is ignored.
+ */
+static inline void
+free_page(unsigned long page)
+{
+
+	if (page != 0)
+		kmem_free(kmem_map, page, PAGE_SIZE);
+}
+
+/*
+ * Free a single page given its struct page.  The page must belong to
+ * kmem_object; anything else panics.  The address handed to kmem_free() is
+ * obtained from page_address().
+ */
+static inline void
+__free_page(struct page *m)
+{
+	vm_offset_t va;
+
+	if (m->object != kmem_object)
+		panic("__free_page:  Freed page %p not allocated via wrappers.",
+		    m);
+	va = (vm_offset_t)page_address(m);
+	kmem_free(kmem_map, va, PAGE_SIZE);
+}
+
+/*
+ * Free the (PAGE_SIZE << order) bytes at kernel address 'p' back to
+ * kmem_map.  A NULL 'p' is ignored.
+ */
+static inline void
+__free_pages(void *p, unsigned int order)
+{
+	size_t size;
+
+	if (p != NULL) {
+		size = PAGE_SIZE << order;
+		kmem_free(kmem_map, (vm_offset_t)p, size);
+	}
+}
+
+/*
+ * On Linux alloc_pages() draws from the buddy allocator: 'order' selects a
+ * power-of-two number of pages and the result is expected to be aligned on
+ * its own size.  Here the same is requested from kmem_alloc_contig() by
+ * passing the allocation size as the alignment as well.
+ *
+ * Returns the page for the start of the allocation, or NULL on failure.
+ */
+static inline struct page *
+alloc_pages(gfp_t gfp_mask, unsigned int order)
+{
+	unsigned long addr;
+	size_t size;
+
+	size = PAGE_SIZE << order;
+	addr = kmem_alloc_contig(kmem_map, size, gfp_mask, 0, -1,
+	    size, 0, VM_MEMATTR_DEFAULT);
+	if (addr != 0)
+		return (virt_to_page(addr));
+	return (NULL);
+}
+
+#endif	/* _LINUX_GFP_H_ */
diff --git a/sys/ofed/include/linux/hardirq.h b/sys/ofed/include/linux/hardirq.h
new file mode 100644
index 0000000..4c3aeba
--- /dev/null
+++ b/sys/ofed/include/linux/hardirq.h
@@ -0,0 +1,39 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_HARDIRQ_H_
+#define	_LINUX_HARDIRQ_H_
+
+#include <linux/types.h>
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/interrupt.h>
+
+/* Linux synchronize_irq() is provided by calling _intr_drain() on the irq. */
+#define	synchronize_irq(irq)	_intr_drain((irq))
+
+#endif	/* _LINUX_HARDIRQ_H_ */
diff --git a/sys/ofed/include/linux/idr.h b/sys/ofed/include/linux/idr.h
new file mode 100644
index 0000000..40b25b6
--- /dev/null
+++ b/sys/ofed/include/linux/idr.h
@@ -0,0 +1,70 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_LINUX_IDR_H_
+#define	_LINUX_IDR_H_
+
+#include <sys/kernel.h>
+
+/* Each idr layer has IDR_SIZE (1 << IDR_BITS) slots; IDR_MASK selects one. */
+#define	IDR_BITS	5
+#define	IDR_SIZE	(1 << IDR_BITS)
+#define	IDR_MASK	(IDR_SIZE - 1)
+
+/* Id width: every bit of an int except the topmost one. */
+#define	MAX_ID_SHIFT	((sizeof(int) * NBBY) - 1)
+#define	MAX_ID_BIT	(1U << MAX_ID_SHIFT)
+#define	MAX_ID_MASK	(MAX_ID_BIT - 1)
+/*
+ * Number of IDR_BITS-wide layers needed to cover MAX_ID_SHIFT bits, rounded
+ * up.  The whole expression is parenthesized so the macro keeps its value
+ * when it appears inside a larger expression (e.g. as a divisor).
+ */
+#define	MAX_LEVEL	((MAX_ID_SHIFT + IDR_BITS - 1) / IDR_BITS)
+
+/*
+ * One node of the idr tree: a bitmap word plus IDR_SIZE pointer slots.
+ * NOTE(review): the slots presumably point at child layers on interior
+ * levels and at the stored objects on the last level, with bitmap tracking
+ * slot use -- confirm against the idr implementation.
+ */
+struct idr_layer {
+	unsigned long		bitmap;
+	struct idr_layer	*ary[IDR_SIZE];
+};
+
+/*
+ * An idr instance.
+ * NOTE(review): the roles below are inferred from the field names; the
+ * code that uses them lives outside this header -- confirm there.
+ *   lock	mutex, presumably protecting the remaining fields
+ *   top	presumably the root layer of the tree
+ *   free	presumably a list of spare layers
+ *   layers	presumably the current depth of the tree
+ */
+struct idr {
+	struct mtx		lock;
+	struct idr_layer	*top;
+	struct idr_layer	*free;
+	int			layers;
+};
+
+/*
+ * Define a struct idr called name and register a SYSINIT (SI_SUB_DRIVERS,
+ * SI_ORDER_FIRST) that calls idr_init() on it.  The expansion already ends
+ * in a semicolon.
+ */
+#define DEFINE_IDR(name)						\
+	struct idr name;						\
+	SYSINIT(name##_idr_sysinit, SI_SUB_DRIVERS, SI_ORDER_FIRST,	\
+	    idr_init, &(name));
+
+void	*idr_find(struct idr *idp, int id);
+int	idr_pre_get(struct idr *idp, gfp_t gfp_mask);
+int	idr_get_new(struct idr *idp, void *ptr, int *id);
+int	idr_get_new_above(struct idr *idp, void *ptr, int starting_id, int *id);
+void	*idr_replace(struct idr *idp, void *ptr, int id);
+void	idr_remove(struct idr *idp, int id);
+void	idr_remove_all(struct idr *idp);
+void	idr_destroy(struct idr *idp);
+void	idr_init(struct idr *idp);
+
+#endif	/* _LINUX_IDR_H_ */
diff --git a/sys/ofed/include/linux/if_arp.h b/sys/ofed/include/linux/if_arp.h
new file mode 100644
index 0000000..c82a2c5
--- /dev/null
+++ b/sys/ofed/include/linux/if_arp.h
@@ -0,0 +1,32 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef	_LINUX_IF_ARP_H_
+#define	_LINUX_IF_ARP_H_
+#include <sys/socket.h>
+#include <net/if_arp.h>
+#endif	/* _LINUX_IF_ARP_H_ */
diff --git a/sys/ofed/include/linux/if_ether.h b/sys/ofed/include/linux/if_ether.h
new file mode 100644
index 0000000..9608657
--- /dev/null
+++ b/sys/ofed/include/linux/if_ether.h
@@ -0,0 +1,37 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef	_LINUX_IF_ETHER_H_
+#define	_LINUX_IF_ETHER_H_
+
+#include <linux/types.h>
+
+#include <net/ethernet.h>
+
+/* Linux name for the 802.1Q ethertype, taken from ETHERTYPE_VLAN. */
+#define	ETH_P_8021Q	ETHERTYPE_VLAN
+
+#endif	/* _LINUX_IF_ETHER_H_ */
diff --git a/sys/ofed/include/linux/if_vlan.h b/sys/ofed/include/linux/if_vlan.h
new file mode 100644
index 0000000..bb7eee0
--- /dev/null
+++ b/sys/ofed/include/linux/if_vlan.h
@@ -0,0 +1,35 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_LINUX_IF_VLAN_H_
+#define	_LINUX_IF_VLAN_H_
+
+#include <net/ethernet.h>
+#include <net/if_vlan_var.h>
+
+#endif	/* _LINUX_IF_VLAN_H_ */
diff --git a/sys/ofed/include/linux/in.h b/sys/ofed/include/linux/in.h
new file mode 100644
index 0000000..8fa3dc2
--- /dev/null
+++ b/sys/ofed/include/linux/in.h
@@ -0,0 +1,37 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef	_LINUX_IN_H_
+#define	_LINUX_IN_H_
+
+#include <netinet/in.h>
+#include <asm/byteorder.h>
+
+/*
+ * Linux's ipv4_is_zeronet()/ipv4_is_loopback() are handed addresses in
+ * network byte order, whereas the BSD IN_ZERONET()/IN_LOOPBACK() macros
+ * test a host byte order value.  Convert with ntohl() before classifying
+ * so the result is correct on little-endian machines as well.
+ */
+#define	ipv4_is_zeronet(be)	IN_ZERONET(ntohl(be))
+#define	ipv4_is_loopback(be)	IN_LOOPBACK(ntohl(be))
+
+#endif	/* _LINUX_IN_H_ */
diff --git a/sys/ofed/include/linux/in6.h b/sys/ofed/include/linux/in6.h
new file mode 100644
index 0000000..2032b61
--- /dev/null
+++ b/sys/ofed/include/linux/in6.h
@@ -0,0 +1,36 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_LINUX_IN6_H_
+#define	_LINUX_IN6_H_
+
+#ifndef KLD_MODULE
+#include "opt_inet6.h"
+#endif
+
+#endif	/* _LINUX_IN6_H_ */
diff --git a/sys/ofed/include/linux/inet.h b/sys/ofed/include/linux/inet.h
new file mode 100644
index 0000000..07fcc73
--- /dev/null
+++ b/sys/ofed/include/linux/inet.h
@@ -0,0 +1,31 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_LINUX_INET_H_
+#define	_LINUX_INET_H_
+#endif	/* _LINUX_INET_H_ */
diff --git a/sys/ofed/include/linux/inetdevice.h b/sys/ofed/include/linux/inetdevice.h
new file mode 100644
index 0000000..c7fe1d2
--- /dev/null
+++ b/sys/ofed/include/linux/inetdevice.h
@@ -0,0 +1,56 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_LINUX_INETDEVICE_H_
+#define	_LINUX_INETDEVICE_H_
+
+#include <linux/netdevice.h>
+
+/*
+ * Find the interface that owns the IPv4 address addr.
+ *
+ * The address is wrapped in an AF_INET sockaddr and looked up with
+ * ifa_ifwithaddr().  On a match the owning interface is referenced with
+ * if_ref(), ifa_free() is called on the address entry, and the interface is
+ * returned; with no match NULL is returned.  The net argument is unused.
+ */
+static inline struct net_device *
+ip_dev_find(struct net *net, uint32_t addr)
+{
+	struct sockaddr_in sin;
+	struct ifaddr *ifa;
+	struct ifnet *ifp;
+
+	memset(&sin, 0, sizeof(sin));
+	sin.sin_len = sizeof(sin);
+	sin.sin_family = AF_INET;
+	sin.sin_port = 0;
+	sin.sin_addr.s_addr = addr;
+
+	ifa = ifa_ifwithaddr((struct sockaddr *)&sin);
+	if (ifa == NULL)
+		return (NULL);
+	ifp = ifa->ifa_ifp;
+	if_ref(ifp);
+	ifa_free(ifa);
+	return (ifp);
+}
+
+#endif	/* _LINUX_INETDEVICE_H_ */
diff --git a/sys/ofed/include/linux/init.h b/sys/ofed/include/linux/init.h
new file mode 100644
index 0000000..d7c2bb1
--- /dev/null
+++ b/sys/ofed/include/linux/init.h
@@ -0,0 +1,31 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef	_LINUX_INIT_H_
+#define	_LINUX_INIT_H_
+
+#endif	/* _LINUX_INIT_H_ */
diff --git a/sys/ofed/include/linux/interrupt.h b/sys/ofed/include/linux/interrupt.h
new file mode 100644
index 0000000..e35882c
--- /dev/null
+++ b/sys/ofed/include/linux/interrupt.h
@@ -0,0 +1,139 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_LINUX_INTERRUPT_H_
+#define	_LINUX_INTERRUPT_H_
+
+#include <linux/device.h>
+#include <linux/pci.h>
+
+#include <sys/bus.h>
+#include <sys/rman.h>
+
+/* Linux-style interrupt handler: called with the irq number and a cookie. */
+typedef	irqreturn_t	(*irq_handler_t)(int, void *);
+
+/* True when x is anything other than IRQ_NONE. */
+#define	IRQ_RETVAL(x)	((x) != IRQ_NONE)
+
+/* Linux shared-interrupt flag, expressed as the RF_SHAREABLE resource flag. */
+#define	IRQF_SHARED	RF_SHAREABLE
+
+/*
+ * Bookkeeping for one interrupt registered through request_irq().  Entries
+ * are linked onto the owning device's irqents list and looked up by irq.
+ */
+struct irq_ent {
+	struct list_head	links;	/* entry on dev->irqents */
+	struct device	*dev;		/* device the irq was resolved to */
+	struct resource	*res;		/* allocated SYS_RES_IRQ resource */
+	void		*arg;		/* cookie passed to handler */
+	irqreturn_t	(*handler)(int, void *);	/* caller's handler */
+	void		*tag;		/* cookie filled in by bus_setup_intr() */
+	int		 irq;		/* irq number given to request_irq() */
+};
+
+/*
+ * Translate an irq number into the resource id used with SYS_RES_IRQ.
+ * The device's own irq maps to rid 0; any other number maps to
+ * irq - dev->msix + 1.
+ */
+static inline int
+_irq_rid(struct device *dev, int irq)
+{
+
+	return (irq == dev->irq ? 0 : irq - dev->msix + 1);
+}
+
+/*
+ * Trampoline installed with bus_setup_intr(): unpacks the irq_ent cookie
+ * and invokes the registered handler with its irq number and argument.
+ * The handler's return value is discarded.
+ */
+static void
+_irq_handler(void *ent)
+{
+	struct irq_ent *entry = ent;
+
+	(*entry->handler)(entry->irq, entry->arg);
+}
+
+/*
+ * Search dev's irqents list for the entry registered for irq.
+ * Returns the entry, or NULL when none matches.
+ */
+static inline struct irq_ent *
+_irq_ent(struct device *dev, int irq)
+{
+	struct irq_ent *cur;
+
+	list_for_each_entry(cur, &dev->irqents, links) {
+		if (cur->irq == irq)
+			return (cur);
+	}
+	return (NULL);
+}
+
+/*
+ * Linux request_irq() on top of the bus interrupt API.
+ *
+ * The irq number is resolved to its device with _pci_find_irq_dev(), the
+ * matching SYS_RES_IRQ resource is allocated (flags | RF_ACTIVE), and an
+ * irq_ent recording the handler and its argument is set up so that
+ * _irq_handler() can dispatch to it.  On success the entry is added to the
+ * device's irqents list.  The name argument is accepted but not used.
+ *
+ * Returns 0 on success, -ENXIO when the device or the resource cannot be
+ * found, or the negated bus_setup_intr() error; in the last case the
+ * resource and the entry are released again.
+ */
+static inline int
+request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags,
+    const char *name, void *arg)
+{
+	struct irq_ent *ent;
+	struct resource *irqres;
+	struct device *dev;
+	int rid;
+	int rc;
+
+	dev = _pci_find_irq_dev(irq);
+	if (dev == NULL)
+		return (-ENXIO);
+	rid = _irq_rid(dev, irq);
+	irqres = bus_alloc_resource_any(dev->bsddev, SYS_RES_IRQ, &rid,
+	    flags | RF_ACTIVE);
+	if (irqres == NULL)
+		return (-ENXIO);
+
+	ent = kmalloc(sizeof(*ent), GFP_KERNEL);
+	ent->dev = dev;
+	ent->res = irqres;
+	ent->arg = arg;
+	ent->handler = handler;
+	ent->irq = irq;
+	rc = bus_setup_intr(dev->bsddev, irqres, INTR_TYPE_NET | INTR_MPSAFE,
+	    NULL, _irq_handler, ent, &ent->tag);
+	if (rc == 0) {
+		list_add(&ent->links, &dev->irqents);
+		return (0);
+	}
+
+	/* Hooking the interrupt failed: undo the allocations made above. */
+	bus_release_resource(dev->bsddev, SYS_RES_IRQ, rid, irqres);
+	kfree(ent);
+	return (-rc);
+}
+
+/*
+ * Undo request_irq() for irq: tear down the interrupt handler, release the
+ * SYS_RES_IRQ resource, unlink the irq_ent from the device and free it.
+ * Nothing happens when the irq has no device or no registered entry.  The
+ * device argument is not used.
+ */
+static inline void
+free_irq(unsigned int irq, void *device)
+{
+	struct irq_ent *ent;
+	struct device *dev;
+	int rid;
+
+	dev = _pci_find_irq_dev(irq);
+	if (dev == NULL)
+		return;
+	ent = _irq_ent(dev, irq);
+	if (ent == NULL)
+		return;
+
+	rid = _irq_rid(dev, irq);
+	bus_teardown_intr(dev->bsddev, ent->res, ent->tag);
+	bus_release_resource(dev->bsddev, SYS_RES_IRQ, rid, ent->res);
+	list_del(&ent->links);
+	kfree(ent);
+}
+
+#endif	/* _LINUX_INTERRUPT_H_ */
diff --git a/sys/ofed/include/linux/io-mapping.h b/sys/ofed/include/linux/io-mapping.h
new file mode 100644
index 0000000..0753bbc
--- /dev/null
+++ b/sys/ofed/include/linux/io-mapping.h
@@ -0,0 +1,77 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_LINUX_IO_MAPPING_H_
+#define	_LINUX_IO_MAPPING_H_
+
+#include <linux/types.h>
+#include <linux/io.h>
+
+struct io_mapping;
+
+/*
+ * Create a write-combining mapping of size bytes at base.  The value
+ * returned by ioremap_wc() is used directly as the io_mapping handle.
+ */
+static inline struct io_mapping *
+io_mapping_create_wc(resource_size_t base, unsigned long size)
+{
+	struct io_mapping *map;
+
+	map = ioremap_wc(base, size);
+	return (map);
+}
+
+/* Destroy an io_mapping by passing the handle to iounmap(). */
+static inline void
+io_mapping_free(struct io_mapping *mapping)
+{
+	void *va = mapping;
+
+	iounmap(va);
+}
+
+/*
+ * Return the address offset bytes into the mapping.  The handle itself is
+ * treated as the base address, so this is plain pointer arithmetic.
+ */
+static inline void *
+io_mapping_map_atomic_wc(struct io_mapping *mapping, unsigned long offset)
+{
+	char *base = (char *)mapping;
+
+	return (base + offset);
+}
+
+/* Intentionally empty: nothing is done with vaddr. */
+static inline void
+io_mapping_unmap_atomic(void *vaddr)
+{
+
+}
+
+/*
+ * Return the address offset bytes into the mapping; the handle is used as
+ * the base address.
+ */
+static inline void *
+io_mapping_map_wc(struct io_mapping *mapping, unsigned long offset)
+{
+	char *base = (char *)mapping;
+
+	return (base + offset);
+}
+
+/* Intentionally empty: nothing is done with vaddr. */
+static inline void
+io_mapping_unmap(void *vaddr)
+{
+
+}
+
+#endif	/* _LINUX_IO_MAPPING_H_ */
diff --git a/sys/ofed/include/linux/io.h b/sys/ofed/include/linux/io.h
new file mode 100644
index 0000000..5405be7
--- /dev/null
+++ b/sys/ofed/include/linux/io.h
@@ -0,0 +1,125 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_LINUX_IO_H_
+#define	_LINUX_IO_H_
+
+#include <machine/vm.h>
+
+/* Read a 32-bit value from addr with a volatile load; no byte swapping. */
+static inline uint32_t
+__raw_readl(const volatile void *addr)
+{
+	const volatile uint32_t *reg = addr;
+
+	return (*reg);
+}
+
+/* Store the 32-bit value b at addr with a volatile write; no byte swapping. */
+static inline void
+__raw_writel(uint32_t b, volatile void *addr)
+{
+	volatile uint32_t *reg = addr;
+
+	*reg = b;
+}
+
+/* Read a 64-bit value from addr with a volatile load; no byte swapping. */
+static inline uint64_t
+__raw_readq(const volatile void *addr)
+{
+	const volatile uint64_t *reg = addr;
+
+	return (*reg);
+}
+
+/* Store the 64-bit value b at addr with a volatile write; no byte swapping. */
+static inline void
+__raw_writeq(uint64_t b, volatile void *addr)
+{
+	volatile uint64_t *reg = addr;
+
+	*reg = b;
+}
+
+/*
+ * XXX This is all x86 specific.  It should be bus space access.
+ */
+#define mmiowb()
+
+#undef writel
+/* Store the 32-bit value b at addr through a volatile pointer. */
+static inline void
+writel(uint32_t b, void *addr)
+{
+	volatile uint32_t *reg = addr;
+
+	*reg = b;
+}
+
+#undef writeq
+/* Store the 64-bit value b at addr through a volatile pointer. */
+static inline void
+writeq(uint64_t b, void *addr)
+{
+	volatile uint64_t *reg = addr;
+
+	*reg = b;
+}
+
+#undef writeb
+/* Store the 8-bit value b at addr through a volatile pointer. */
+static inline void
+writeb(uint8_t b, void *addr)
+{
+	volatile uint8_t *reg = addr;
+
+	*reg = b;
+}
+
+#undef writew
+/* Store the 16-bit value b at addr through a volatile pointer. */
+static inline void
+writew(uint16_t b, void *addr)
+{
+	volatile uint16_t *reg = addr;
+
+	*reg = b;
+}
+
+/*
+ * ioremap family.  _ioremap_attr() and iounmap() are only declared here;
+ * their definitions are elsewhere.  The wrappers differ only in the memory
+ * attribute they pass: ioremap_nocache() uses VM_MEMATTR_UNCACHED,
+ * ioremap_wc() uses VM_MEMATTR_WRITE_COMBINING, and plain ioremap() is an
+ * alias for ioremap_nocache().
+ */
+void *_ioremap_attr(vm_paddr_t phys_addr, unsigned long size, int attr);
+#define	ioremap_nocache(addr, size)					\
+    _ioremap_attr((addr), (size), VM_MEMATTR_UNCACHED)
+#define	ioremap_wc(addr, size)						\
+    _ioremap_attr((addr), (size), VM_MEMATTR_WRITE_COMBINING)
+#define	ioremap	ioremap_nocache
+void iounmap(void *addr);
+
+/* The Linux *_io memory helpers are plain memset()/memcpy() here. */
+#define	memset_io(a, b, c)	memset((a), (b), (c))
+#define	memcpy_fromio(a, b, c)	memcpy((a), (b), (c))
+#define	memcpy_toio(a, b, c)	memcpy((a), (b), (c))
+
+/*
+ * Copy count 64-bit words from from to to using the raw store helpers.
+ *
+ * With __LP64__ defined each word is written with __raw_writeq();
+ * otherwise the same data is written as 2 * count 32-bit words with
+ * __raw_writel().  The loop index is a size_t so that it has the same type
+ * as count rather than being a signed int compared against an unsigned
+ * length.
+ */
+static inline void
+__iowrite64_copy(void *to, void *from, size_t count)
+{
+#ifdef __LP64__
+	uint64_t *src;
+	uint64_t *dst;
+	size_t i;
+
+	for (i = 0, src = from, dst = to; i < count; i++, src++, dst++)
+		__raw_writeq(*src, dst);
+#else
+	uint32_t *src;
+	uint32_t *dst;
+	size_t i;
+
+	/* Each 64-bit word is moved as two 32-bit stores. */
+	count *= 2;
+	for (i = 0, src = from, dst = to; i < count; i++, src++, dst++)
+		__raw_writel(*src, dst);
+#endif
+}
+
+
+#endif	/* _LINUX_IO_H_ */
diff --git a/sys/ofed/include/linux/ioctl.h b/sys/ofed/include/linux/ioctl.h
new file mode 100644
index 0000000..9e00b7f
--- /dev/null
+++ b/sys/ofed/include/linux/ioctl.h
@@ -0,0 +1,34 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_LINUX_IOCTL_H_
+#define	_LINUX_IOCTL_H_
+
+#include <sys/ioccom.h>
+
+#endif	/* _LINUX_IOCTL_H_ */
diff --git a/sys/ofed/include/linux/jhash.h b/sys/ofed/include/linux/jhash.h
new file mode 100644
index 0000000..ff6ff09
--- /dev/null
+++ b/sys/ofed/include/linux/jhash.h
@@ -0,0 +1,143 @@
+#ifndef	_LINUX_JHASH_H_
+#define	_LINUX_JHASH_H_
+
+/* jhash.h: Jenkins hash support.
+ *
+ * Copyright (C) 1996 Bob Jenkins (bob_jenkins@burtleburtle.net)
+ *
+ * http://burtleburtle.net/bob/hash/
+ *
+ * These are the credits from Bob's sources:
+ *
+ * lookup2.c, by Bob Jenkins, December 1996, Public Domain.
+ * hash(), hash2(), hash3, and mix() are externally useful functions.
+ * Routines to test the hash are included if SELF_TEST is defined.
+ * You can use this free for any purpose.  It has no warranty.
+ *
+ * Copyright (C) 2003 David S. Miller (davem@redhat.com)
+ *
+ * I've modified Bob's hash to be useful in the Linux kernel, and
+ * any bugs present are surely my fault.  -DaveM
+ */
+
+/* NOTE: Arguments are modified. */
+#define __jhash_mix(a, b, c) \
+{ \
+  a -= b; a -= c; a ^= (c>>13); \
+  b -= c; b -= a; b ^= (a<<8); \
+  c -= a; c -= b; c ^= (b>>13); \
+  a -= b; a -= c; a ^= (c>>12);  \
+  b -= c; b -= a; b ^= (a<<16); \
+  c -= a; c -= b; c ^= (b>>5); \
+  a -= b; a -= c; a ^= (c>>3);  \
+  b -= c; b -= a; b ^= (a<<10); \
+  c -= a; c -= b; c ^= (b>>15); \
+}
+
+/* The golden ratio: an arbitrary value */
+#define JHASH_GOLDEN_RATIO	0x9e3779b9
+
+/* The most generic version, hashes an arbitrary sequence
+ * of bytes.  No alignment or length assumptions are made about
+ * the input key.
+ */
+static inline u32 jhash(const void *key, u32 length, u32 initval)
+{
+	u32 a, b, c, len;
+	const u8 *k = key;
+
+	len = length;
+	a = b = JHASH_GOLDEN_RATIO;
+	c = initval;
+	/* Mix full 12-byte groups, each read as three little-endian words. */
+	while (len >= 12) {
+		a += (k[0] +((u32)k[1]<<8) +((u32)k[2]<<16) +((u32)k[3]<<24));
+		b += (k[4] +((u32)k[5]<<8) +((u32)k[6]<<16) +((u32)k[7]<<24));
+		c += (k[8] +((u32)k[9]<<8) +((u32)k[10]<<16)+((u32)k[11]<<24));
+
+		__jhash_mix(a,b,c);
+
+		k += 12;
+		len -= 12;
+	}
+	/* Add the total length, then the 0-11 tail bytes; cases fall through. */
+	c += length;
+	switch (len) {
+	case 11: c += ((u32)k[10]<<24);
+	case 10: c += ((u32)k[9]<<16);
+	case 9 : c += ((u32)k[8]<<8);
+	case 8 : b += ((u32)k[7]<<24);
+	case 7 : b += ((u32)k[6]<<16);
+	case 6 : b += ((u32)k[5]<<8);
+	case 5 : b += k[4];
+	case 4 : a += ((u32)k[3]<<24);
+	case 3 : a += ((u32)k[2]<<16);
+	case 2 : a += ((u32)k[1]<<8);
+	case 1 : a += k[0];
+	};
+
+	__jhash_mix(a,b,c);
+
+	return c;
+}
+
+/* A special optimized version that handles 1 or more of u32s.
+ * The length parameter here is the number of u32s in the key.
+ */
+static inline u32 jhash2(const u32 *k, u32 length, u32 initval)
+{
+	u32 a, b, c, len;
+
+	a = b = JHASH_GOLDEN_RATIO;
+	c = initval;
+	len = length;
+
+	while (len >= 3) {
+		a += k[0];
+		b += k[1];
+		c += k[2];
+		__jhash_mix(a, b, c);
+		k += 3; len -= 3;
+	}
+
+	c += length * 4;
+
+	switch (len) {
+	case 2 : b += k[1];
+	case 1 : a += k[0];
+	};
+
+	__jhash_mix(a,b,c);
+
+	return c;
+}
+
+
+/* Special ultra-optimized versions that know they are hashing exactly
+ * 3, 2 or 1 word(s).
+ *
+ * NOTE: In particular the "c += length; __jhash_mix(a,b,c);" normally
+ *       done at the end is not done here.
+ */
+static inline u32 jhash_3words(u32 a, u32 b, u32 c, u32 initval)
+{
+	a += JHASH_GOLDEN_RATIO;
+	b += JHASH_GOLDEN_RATIO;
+	c += initval;
+
+	__jhash_mix(a, b, c);
+
+	return c;
+}
+
+static inline u32 jhash_2words(u32 a, u32 b, u32 initval)
+{
+	return jhash_3words(a, b, 0, initval);
+}
+
+static inline u32 jhash_1word(u32 a, u32 initval)
+{
+	return jhash_3words(a, 0, 0, initval);
+}
+
+#endif	/* _LINUX_JHASH_H_ */
diff --git a/sys/ofed/include/linux/jiffies.h b/sys/ofed/include/linux/jiffies.h
new file mode 100644
index 0000000..7ca6337
--- /dev/null
+++ b/sys/ofed/include/linux/jiffies.h
@@ -0,0 +1,56 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef	_LINUX_JIFFIES_H_
+#define	_LINUX_JIFFIES_H_
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+
+#include <sys/time.h>
+#include <sys/kernel.h>
+
+static inline int
+msecs_to_jiffies(int msec)
+{
+	struct timeval tv;
+
+	tv.tv_sec = msec / 1000;		/* whole seconds */
+	tv.tv_usec = (msec % 1000) * 1000;	/* remainder, in microseconds */
+	return (tvtohz(&tv));			/* result comes from tvtohz() */
+}
+
+#define	jiffies	ticks
+
+#define	time_after(a, b)	((long)(b) - (long)(a) < 0)
+#define	time_before(a, b)	time_after(b,a)
+#define	time_after_eq(a, b)	((long)(a) - (long)(b) >= 0)
+#define	time_before_eq(a, b)	time_after_eq(b, a)
+
+#define	HZ	hz
+
+#endif	/* _LINUX_JIFFIES_H_ */
diff --git a/sys/ofed/include/linux/kdev_t.h b/sys/ofed/include/linux/kdev_t.h
new file mode 100644
index 0000000..4b4f43e
--- /dev/null
+++ b/sys/ofed/include/linux/kdev_t.h
@@ -0,0 +1,36 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_LINUX_KDEV_T_H_
+#define	_LINUX_KDEV_T_H_
+
+#define MAJOR(dev)      major((dev))
+#define MINOR(dev)      minor((dev))
+#define MKDEV(ma, mi)   makedev((ma), (mi))
+
+#endif	/* _LINUX_KDEV_T_H_ */
diff --git a/sys/ofed/include/linux/kernel.h b/sys/ofed/include/linux/kernel.h
new file mode 100644
index 0000000..f49036e
--- /dev/null
+++ b/sys/ofed/include/linux/kernel.h
@@ -0,0 +1,88 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef	_LINUX_KERNEL_H_
+#define	_LINUX_KERNEL_H_
+
+#include <sys/systm.h>
+#include <sys/param.h>
+#include <sys/libkern.h>
+#include <sys/stat.h>
+#include <sys/smp.h>
+
+#include <linux/bitops.h>
+#include <linux/compiler.h>
+#include <linux/errno.h>
+#include <linux/stddef.h>
+#include <linux/kthread.h>
+#include <linux/types.h>
+#include <linux/jiffies.h>
+#include <linux/wait.h>
+#include <linux/fs.h>
+#include <linux/notifier.h>
+#include <linux/log2.h>
+#include <asm/byteorder.h>
+
+#define	KERN_EMERG	"<0>"
+#define	KERN_ALERT	"<1>"
+#define	KERN_CRIT	"<2>"
+#define	KERN_ERR	"<3>"
+#define	KERN_WARNING	"<4>"
+#define	KERN_NOTICE	"<5>"
+#define	KERN_INFO	"<6>"
+#define	KERN_DEBUG	"<7>"
+
+#define BUG()			panic("BUG")
+#define BUG_ON(condition)	do { if (condition) BUG(); } while(0)
+#define	WARN_ON			BUG_ON
+
+#undef	ALIGN
+#define	ALIGN(x, y)		roundup2((x), (y))
+#define	DIV_ROUND_UP		howmany
+/* printk-style logging; pr_debug pastes KERN_DEBUG onto a literal fmt. */
+#define	printk(X...)		printf(X)
+#define	pr_debug(fmt, ...)	printk(KERN_DEBUG fmt, ##__VA_ARGS__)
+#define udelay(t)       	DELAY(t)
+
+#define container_of(ptr, type, member)				\
+({								\
+	__typeof(((type *)0)->member) *_p = (ptr);		\
+	(type *)((char *)_p - offsetof(type, member));		\
+})
+  
+#define	ARRAY_SIZE(x)	(sizeof(x) / sizeof((x)[0]))
+
+#define	simple_strtoul	strtoul
+
+#define min(x, y)	((x) < (y) ? (x) : (y))	/* args evaluated twice */
+#define max(x, y)	((x) > (y) ? (x) : (y))	/* args evaluated twice */
+#define min_t(type, _x, _y)	((type)(_x) < (type)(_y) ? (type)(_x) : (type)(_y))
+#define max_t(type, _x, _y)	((type)(_x) > (type)(_y) ? (type)(_x) : (type)(_y))
+
+#define	num_possible_cpus()	mp_ncpus
+
+#endif	/* _LINUX_KERNEL_H_ */
diff --git a/sys/ofed/include/linux/kobject.h b/sys/ofed/include/linux/kobject.h
new file mode 100644
index 0000000..5872c05
--- /dev/null
+++ b/sys/ofed/include/linux/kobject.h
@@ -0,0 +1,153 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef	_LINUX_KOBJECT_H_
+#define	_LINUX_KOBJECT_H_
+
+#include <machine/stdarg.h>
+
+#include <linux/kernel.h>
+#include <linux/kref.h>
+#include <linux/slab.h>
+
+struct kobject;
+struct sysctl_oid;
+
+struct kobj_type {
+	void (*release)(struct kobject *kobj);
+	const struct sysfs_ops *sysfs_ops;
+	struct attribute **default_attrs;
+};
+
+extern struct kobj_type kfree_type;
+
+struct kobject {
+	struct kobject		*parent;
+	char			*name;
+	struct kref		kref;
+	struct kobj_type	*ktype;
+	struct list_head	entry;
+	struct sysctl_oid	*oidp;
+};
+
+static inline void
+kobject_init(struct kobject *kobj, struct kobj_type *ktype)
+{
+
+	kref_init(&kobj->kref);
+	INIT_LIST_HEAD(&kobj->entry);
+	kobj->ktype = ktype;
+	kobj->oidp = NULL;
+}
+
+static inline void kobject_put(struct kobject *kobj);
+void kobject_release(struct kref *kref);
+
+static inline void
+kobject_put(struct kobject *kobj)
+{
+
+	if (kobj)
+		kref_put(&kobj->kref, kobject_release);
+}
+
+static inline struct kobject *
+kobject_get(struct kobject *kobj)
+{
+
+	if (kobj)
+		kref_get(&kobj->kref);
+	return kobj;
+}
+
+static inline int
+kobject_set_name_vargs(struct kobject *kobj, const char *fmt, va_list args)
+{
+	char *old;
+	char *name;
+
+	old = kobj->name;
+	/* A kobject that is already named keeps its name when fmt is NULL. */
+	if (old && !fmt)
+		return 0;
+	/* Build the new name in a fresh MAXPATHLEN buffer, then swap it in. */
+	name = kzalloc(MAXPATHLEN, GFP_KERNEL);
+	if (!name)
+		return -ENOMEM;
+	vsnprintf(name, MAXPATHLEN, fmt, args);
+	kobj->name = name;
+	kfree(old);
+	for (; *name != '\0'; name++)	/* rewrite every '/' as '!' */
+		if (*name == '/')
+			*name = '!';
+	return (0);
+}
+
+int	kobject_add(struct kobject *kobj, struct kobject *parent,
+	    const char *fmt, ...);
+
+static inline struct kobject *
+kobject_create(void)
+{
+	struct kobject *kobj;
+
+	kobj = kzalloc(sizeof(*kobj), GFP_KERNEL);
+	if (kobj == NULL)
+		return (NULL);
+	kobject_init(kobj, &kfree_type);
+
+	return (kobj);
+}
+
+static inline struct kobject *
+kobject_create_and_add(const char *name, struct kobject *parent)
+{
+	struct kobject *kobj;
+
+	kobj = kobject_create();
+	if (kobj == NULL)
+		return (NULL);
+	if (kobject_add(kobj, parent, "%s", name) == 0)
+		return (kobj);
+	kobject_put(kobj);
+
+	return (NULL);
+}
+
+
+static inline char *
+kobject_name(const struct kobject *kobj)
+{
+
+	return kobj->name;
+}
+
+int	kobject_set_name(struct kobject *kobj, const char *fmt, ...);
+int	kobject_init_and_add(struct kobject *kobj, struct kobj_type *ktype,
+	    struct kobject *parent, const char *fmt, ...);
+
+#endif /* _LINUX_KOBJECT_H_ */
diff --git a/sys/ofed/include/linux/kref.h b/sys/ofed/include/linux/kref.h
new file mode 100644
index 0000000..14346c1
--- /dev/null
+++ b/sys/ofed/include/linux/kref.h
@@ -0,0 +1,62 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_KREF_H_
+#define _LINUX_KREF_H_
+
+#include <sys/refcount.h>
+
+struct kref {
+        volatile u_int count;
+};
+
+static inline void
+kref_init(struct kref *kref)
+{
+
+	refcount_init(&kref->count, 1);
+}
+
+static inline void
+kref_get(struct kref *kref)
+{
+
+	refcount_acquire(&kref->count);
+}
+
+static inline int
+kref_put(struct kref *kref, void (*rel)(struct kref *kref))
+{
+	/* Returns 1 if rel() was invoked for this put, 0 otherwise. */
+	if (refcount_release(&kref->count)) {
+		rel(kref);
+		return 1;
+	}
+	return 0;
+}
+
+#endif /* _LINUX_KREF_H_ */
diff --git a/sys/ofed/include/linux/kthread.h b/sys/ofed/include/linux/kthread.h
new file mode 100644
index 0000000..e2882958
--- /dev/null
+++ b/sys/ofed/include/linux/kthread.h
@@ -0,0 +1,104 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef	_LINUX_KTHREAD_H_
+#define	_LINUX_KTHREAD_H_
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/sleepqueue.h>
+
+#include <linux/slab.h>
+#include <linux/sched.h>
+
+static inline void
+_kthread_fn(void *arg)
+{
+	struct task_struct *task;
+
+	task = arg;
+	task_struct_set(curthread, task);
+	if (task->should_stop == 0)	/* body runs only if no stop is pending */
+		task->task_ret = task->task_fn(task->task_data);
+	PROC_LOCK(task->task_thread->td_proc);
+	task->should_stop = TASK_STOPPED;	/* kthread_stop() polls for this */
+	wakeup(task);			/* kthread_stop() sleeps on task */
+	PROC_UNLOCK(task->task_thread->td_proc);
+	kthread_exit();
+}
+
+static inline struct task_struct *
+_kthread_create(int (*threadfn)(void *data), void *data)
+{
+	struct task_struct *task;
+
+	task = kzalloc(sizeof(*task), GFP_KERNEL);
+	task->task_fn = threadfn;
+	task->task_data = data;
+
+	return (task);
+}
+
+struct task_struct *kthread_create(int (*threadfn)(void *data),
+                                   void *data,
+                                   const char namefmt[], ...)
+        __attribute__((format(printf, 3, 4)));
+
+#define	kthread_run(fn, data, fmt, ...)					\
+({									\
+	struct task_struct *_task;					\
+									\
+	_task = _kthread_create((fn), (data));				\
+	if (kthread_add(_kthread_fn, _task, NULL, &_task->task_thread,	\
+	    0, 0, fmt, ## __VA_ARGS__)) {				\
+		kfree(_task);						\
+		_task = NULL;						\
+	} else								\
+		task_struct_set(_task->task_thread, _task);		\
+	_task;								\
+})
+
+#define	kthread_should_stop()	current->should_stop
+
+static inline int
+kthread_stop(struct task_struct *task)
+{
+	/* Request a stop, then sleep until _kthread_fn marks TASK_STOPPED. */
+	PROC_LOCK(task->task_thread->td_proc);
+	task->should_stop = TASK_SHOULD_STOP;
+	wake_up_process(task);
+	while (task->should_stop != TASK_STOPPED)
+		msleep(task, &task->task_thread->td_proc->p_mtx, PWAIT,
+		    "kstop", hz);
+	PROC_UNLOCK(task->task_thread->td_proc);
+	return task->task_ret;
+}
+
+#endif	/* _LINUX_KTHREAD_H_ */
diff --git a/sys/ofed/include/linux/linux_compat.c b/sys/ofed/include/linux/linux_compat.c
new file mode 100644
index 0000000..98ad807
--- /dev/null
+++ b/sys/ofed/include/linux/linux_compat.c
@@ -0,0 +1,695 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/bus.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filio.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/stdarg.h>
+#include <machine/pmap.h>
+
+#include <linux/kobject.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/cdev.h>
+#include <linux/file.h>
+#include <linux/sysfs.h>
+#include <linux/mm.h>
+#include <linux/io.h>
+#include <linux/vmalloc.h>
+
+#include <vm/vm_pager.h>
+
+MALLOC_DEFINE(M_KMALLOC, "linux", "Linux kmalloc compat");
+
+#include <linux/rbtree.h>
+/* Undo Linux compat changes. */
+#undef RB_ROOT
+#undef file
+#undef cdev
+#define	RB_ROOT(head)	(head)->rbh_root
+#undef LIST_HEAD
+/* From sys/queue.h */
+#define LIST_HEAD(name, type)						\
+struct name {								\
+	struct type *lh_first;	/* first element */			\
+}
+
+struct kobject class_root;
+struct device linux_rootdev;
+struct class miscclass;
+struct list_head pci_drivers;
+struct list_head pci_devices;
+spinlock_t pci_lock;
+
+int
+panic_cmp(struct rb_node *one, struct rb_node *two)
+{
+	panic("no cmp");
+}
+
+RB_GENERATE(linux_root, rb_node, __entry, panic_cmp);
+ 
+int
+kobject_set_name(struct kobject *kobj, const char *fmt, ...)
+{
+	va_list args;
+	int error;
+
+	va_start(args, fmt);
+	error = kobject_set_name_vargs(kobj, fmt, args);
+	va_end(args);
+
+	return (error);
+}
+
+static inline int
+kobject_add_complete(struct kobject *kobj, struct kobject *parent)
+{
+	struct kobj_type *t;
+	int error;
+
+	kobj->parent = kobject_get(parent);	/* takes a ref on parent, if any */
+	error = sysfs_create_dir(kobj);
+	if (error == 0 && kobj->ktype && kobj->ktype->default_attrs) {
+		struct attribute **attr;
+		t = kobj->ktype;
+		/* Create each default attribute; stop at the first failure. */
+		for (attr = t->default_attrs; *attr != NULL; attr++) {
+			error = sysfs_create_file(kobj, *attr);
+			if (error)
+				break;
+		}
+		/* A failed attribute takes the directory back down. */
+		if (error)
+			sysfs_remove_dir(kobj);
+	}
+	return (error);
+}
+
+int
+kobject_add(struct kobject *kobj, struct kobject *parent, const char *fmt, ...)
+{
+	va_list args;
+	int error;
+
+	va_start(args, fmt);
+	error = kobject_set_name_vargs(kobj, fmt, args);
+	va_end(args);
+	if (error)
+		return (error);
+
+	return kobject_add_complete(kobj, parent);
+}
+
+void
+kobject_release(struct kref *kref)
+{
+	struct kobject *kobj;
+	char *name;
+
+	kobj = container_of(kref, struct kobject, kref);
+	sysfs_remove_dir(kobj);
+	if (kobj->parent)
+		kobject_put(kobj->parent);	/* drop the ref held on the parent */
+	kobj->parent = NULL;
+	name = kobj->name;	/* saved first: release() may free kobj itself */
+	if (kobj->ktype && kobj->ktype->release)
+		kobj->ktype->release(kobj);
+	kfree(name);
+}
+
+static void
+kobject_kfree(struct kobject *kobj)
+{
+
+	kfree(kobj);
+}
+
+struct kobj_type kfree_type = { .release = kobject_kfree };
+
+struct device *
+device_create(struct class *class, struct device *parent, dev_t devt,
+    void *drvdata, const char *fmt, ...)
+{
+	struct device *dev;
+	va_list args;
+
+	dev = kzalloc(sizeof(*dev), M_WAITOK);
+	dev->parent = parent;
+	dev->class = class;
+	dev->devt = devt;
+	dev->driver_data = drvdata;
+	va_start(args, fmt);
+	kobject_set_name_vargs(&dev->kobj, fmt, args);
+	va_end(args);
+	device_register(dev);
+
+	return (dev);
+}
+
+int
+kobject_init_and_add(struct kobject *kobj, struct kobj_type *ktype,
+    struct kobject *parent, const char *fmt, ...)
+{
+	va_list args;
+	int error;
+
+	kobject_init(kobj, ktype);
+	kobj->ktype = ktype;
+	kobj->parent = parent;
+	kobj->name = NULL;
+
+	va_start(args, fmt);
+	error = kobject_set_name_vargs(kobj, fmt, args);
+	va_end(args);
+	if (error)
+		return (error);
+	return kobject_add_complete(kobj, parent);
+}
+
+static void
+linux_file_dtor(void *cdp)
+{
+	struct linux_file *filp;
+
+	filp = cdp;
+	filp->f_op->release(curthread->td_fpop->f_vnode, filp);
+	kfree(filp);
+}
+
+static int
+linux_dev_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
+{
+	struct linux_cdev *ldev;
+	struct linux_file *filp;
+	struct file *file;
+	int error;
+
+	file = curthread->td_fpop;
+	ldev = dev->si_drv1;
+	if (ldev == NULL)
+		return (ENODEV);
+	filp = kzalloc(sizeof(*filp), GFP_KERNEL);
+	filp->f_dentry = &filp->f_dentry_store;
+	filp->f_op = ldev->ops;
+	filp->f_flags = file->f_flag;
+	if (filp->f_op->open) {		/* the open method is optional */
+		error = -filp->f_op->open(file->f_vnode, filp);
+		if (error) {
+			kfree(filp);
+			return (error);
+		}
+	}
+	error = devfs_set_cdevpriv(filp, linux_file_dtor);
+	if (error) {			/* not registered: undo the open here */
+		filp->f_op->release(file->f_vnode, filp);
+		kfree(filp);
+		return (error);
+	}
+	/* filp is now registered with linux_file_dtor() as its destructor. */
+	return 0;
+}
+
+static int
+linux_dev_close(struct cdev *dev, int fflag, int devtype, struct thread *td)
+{
+	struct linux_cdev *ldev;
+	struct linux_file *filp;
+	struct file *file;
+	int error;
+
+	file = curthread->td_fpop;
+	ldev = dev->si_drv1;
+	if (ldev == NULL)
+		return (0);
+	if ((error = devfs_get_cdevpriv((void **)&filp)) != 0)
+		return (error);
+	filp->f_flags = file->f_flag;
+	devfs_clear_cdevpriv();
+
+	return (0);
+}
+
+static int
+linux_dev_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag,
+    struct thread *td)
+{
+	struct linux_cdev *ldev;
+	struct linux_file *filp;
+	struct file *file;
+	int error;
+
+	file = curthread->td_fpop;
+	ldev = dev->si_drv1;
+	if (ldev == NULL)
+		return (0);
+	if ((error = devfs_get_cdevpriv((void **)&filp)) != 0)
+		return (error);
+	filp->f_flags = file->f_flag;
+	/*
+	 * Linux does not have a generic ioctl copyin/copyout layer.  All
+	 * linux ioctls must be converted to void ioctls which pass a
+	 * pointer to the address of the data.  We want the actual user
+	 * address so we dereference here.
+	 */
+	data = *(void **)data;
+	if (filp->f_op->unlocked_ioctl)
+		error = -filp->f_op->unlocked_ioctl(filp, cmd, (u_long)data);
+	else
+		error = ENOTTY;
+
+	return (error);
+}
+
+static int
+linux_dev_read(struct cdev *dev, struct uio *uio, int ioflag)
+{
+	struct linux_cdev *ldev;
+	struct linux_file *filp;
+	struct file *file;
+	ssize_t bytes;
+	int error;
+
+	file = curthread->td_fpop;
+	ldev = dev->si_drv1;
+	if (ldev == NULL)
+		return (0);
+	if ((error = devfs_get_cdevpriv((void **)&filp)) != 0)
+		return (error);
+	filp->f_flags = file->f_flag;
+	if (uio->uio_iovcnt != 1)
+		panic("linux_dev_read: uio %p iovcnt %d",
+		    uio, uio->uio_iovcnt);
+	if (filp->f_op->read) {
+		bytes = filp->f_op->read(filp, uio->uio_iov->iov_base,
+		    uio->uio_iov->iov_len, &uio->uio_offset);
+		if (bytes >= 0) {
+			uio->uio_iov->iov_base += bytes;
+			uio->uio_iov->iov_len -= bytes;
+			uio->uio_resid -= bytes;
+		} else
+			error = -bytes;
+	} else
+		error = ENXIO;
+
+	return (error);
+}
+
+static int
+linux_dev_write(struct cdev *dev, struct uio *uio, int ioflag)
+{
+	struct linux_cdev *ldev;
+	struct linux_file *filp;
+	struct file *file;
+	ssize_t bytes;
+	int error;
+
+	file = curthread->td_fpop;
+	ldev = dev->si_drv1;
+	if (ldev == NULL)
+		return (0);
+	if ((error = devfs_get_cdevpriv((void **)&filp)) != 0)
+		return (error);
+	filp->f_flags = file->f_flag;
+	if (uio->uio_iovcnt != 1)
+		panic("linux_dev_write: uio %p iovcnt %d",
+		    uio, uio->uio_iovcnt);
+	if (filp->f_op->write) {
+		bytes = filp->f_op->write(filp, uio->uio_iov->iov_base,
+		    uio->uio_iov->iov_len, &uio->uio_offset);
+		if (bytes >= 0) {
+			uio->uio_iov->iov_base += bytes;
+			uio->uio_iov->iov_len -= bytes;
+			uio->uio_resid -= bytes;
+		} else
+			error = -bytes;
+	} else
+		error = ENXIO;
+
+	return (error);
+}
+
+static int
+linux_dev_poll(struct cdev *dev, int events, struct thread *td)
+{
+	struct linux_cdev *ldev;
+	struct linux_file *filp;
+	struct file *file;
+	int revents;
+	int error;
+
+	file = curthread->td_fpop;
+	ldev = dev->si_drv1;
+	if (ldev == NULL)
+		return (0);
+	if ((error = devfs_get_cdevpriv((void **)&filp)) != 0)
+		return (error);
+	filp->f_flags = file->f_flag;
+	if (filp->f_op->poll)
+		revents = filp->f_op->poll(filp, NULL) & events;
+	else
+		revents = 0;
+
+	return (revents);
+}
+
+static int
+linux_dev_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr,
+    int nprot, vm_memattr_t *memattr)
+{
+
+	/* XXX memattr not honored. */
+	*paddr = offset;
+	return (0);
+}
+
+static int
+linux_dev_mmap_single(struct cdev *dev, vm_ooffset_t *offset,
+    vm_size_t size, struct vm_object **object, int nprot)
+{
+	struct linux_cdev *ldev;
+	struct linux_file *filp;
+	struct file *file;
+	struct vm_area_struct vma;
+	vm_paddr_t paddr;
+	vm_page_t m;
+	int error;
+
+	file = curthread->td_fpop;
+	ldev = dev->si_drv1;
+	if (ldev == NULL)
+		return (ENODEV);
+	if (size != PAGE_SIZE)
+		return (EINVAL);
+	if ((error = devfs_get_cdevpriv((void **)&filp)) != 0)
+		return (error);
+	filp->f_flags = file->f_flag;
+	vma.vm_start = 0;
+	vma.vm_end = PAGE_SIZE;
+	vma.vm_pgoff = *offset / PAGE_SIZE;
+	vma.vm_pfn = 0;
+	vma.vm_page_prot = 0;
+	if (filp->f_op->mmap) {
+		error = -filp->f_op->mmap(filp, &vma);
+		if (error == 0) {
+			paddr = (vm_paddr_t)vma.vm_pfn << PAGE_SHIFT;
+			*offset = paddr;
+			m = PHYS_TO_VM_PAGE(paddr);
+			*object = vm_pager_allocate(OBJT_DEVICE, dev,
+			    PAGE_SIZE, nprot, *offset, curthread->td_ucred);
+		        if (*object == NULL)
+               			 return (EINVAL);
+			if (vma.vm_page_prot != VM_MEMATTR_DEFAULT)
+				pmap_page_set_memattr(m, vma.vm_page_prot);
+		}
+	} else
+		error = ENODEV;
+
+	return (error);
+}
+
+struct cdevsw linuxcdevsw = {
+	.d_version = D_VERSION,
+	.d_flags = D_TRACKCLOSE,
+	.d_open = linux_dev_open,
+	.d_close = linux_dev_close,
+	.d_read = linux_dev_read,
+	.d_write = linux_dev_write,
+	.d_ioctl = linux_dev_ioctl,
+	.d_mmap_single = linux_dev_mmap_single,
+	.d_mmap = linux_dev_mmap,
+	.d_poll = linux_dev_poll,
+};
+
+/*
+ * fo_read handler: pass a single-segment uio to the linux file's read
+ * operation and advance the uio by the number of bytes it reports.
+ *
+ * Panics when the uio has more than one iovec.  Returns ENXIO when the
+ * file has no read operation, the negated linux error when the read
+ * fails, and 0 otherwise.
+ */
+static int
+linux_file_read(struct file *file, struct uio *uio, struct ucred *active_cred,
+    int flags, struct thread *td)
+{
+	struct linux_file *filp;
+	struct iovec *iov;
+	ssize_t bytes;
+
+	filp = (struct linux_file *)file->f_data;
+	filp->f_flags = file->f_flag;
+	if (uio->uio_iovcnt != 1)
+		panic("linux_file_read: uio %p iovcnt %d",
+		    uio, uio->uio_iovcnt);
+	if (filp->f_op->read == NULL)
+		return (ENXIO);
+
+	iov = uio->uio_iov;
+	bytes = filp->f_op->read(filp, iov->iov_base, iov->iov_len,
+	    &uio->uio_offset);
+	if (bytes < 0)
+		return (-bytes);
+
+	/* Consume the bytes the linux read reported. */
+	iov->iov_base += bytes;
+	iov->iov_len -= bytes;
+	uio->uio_resid -= bytes;
+
+	return (0);
+}
+
+/*
+ * fo_poll handler: ask the linux file's poll operation which events are
+ * pending and return the subset the caller asked about.
+ *
+ * Returns the ready events masked by 'events', or 0 when the file has no
+ * poll operation.  The previous version computed this mask and then
+ * returned 0 unconditionally, so callers never saw the file become ready.
+ */
+static int
+linux_file_poll(struct file *file, int events, struct ucred *active_cred,
+    struct thread *td)
+{
+	struct linux_file *filp;
+	int revents;
+
+	filp = (struct linux_file *)file->f_data;
+	filp->f_flags = file->f_flag;
+	if (filp->f_op->poll)
+		revents = filp->f_op->poll(filp, NULL) & events;
+	else
+		revents = 0;
+
+	return (revents);
+}
+
+/*
+ * fo_close handler: run the linux release operation, drop any SIGIO
+ * ownership and free the linux_file.  Returns the negated result of the
+ * release operation.
+ */
+static int
+linux_file_close(struct file *file, struct thread *td)
+{
+	struct linux_file *filp;
+	int error;
+
+	filp = (struct linux_file *)file->f_data;
+	filp->f_flags = file->f_flag;
+	/* NOTE(review): release is called without a NULL check - confirm
+	 * every f_op used here provides one. */
+	error = -filp->f_op->release(NULL, filp);
+	funsetown(&filp->f_sigio);
+	kfree(filp);
+
+	return (error);
+}
+
+/*
+ * fo_ioctl handler for linux files.  Only the generic descriptor ioctls
+ * are handled:
+ *
+ *	FIONBIO		accepted, nothing to do.
+ *	FIOASYNC	forwarded to the linux fasync operation, if any.
+ *	FIOSETOWN	records the SIGIO owner, then refreshes fasync, if any.
+ *	FIOGETOWN	reports the SIGIO owner.
+ *
+ * Any other command returns ENOTTY.  A file without a fasync operation
+ * is not an error for either FIOASYNC or FIOSETOWN; FIOSETOWN previously
+ * called through the pointer without checking it.
+ */
+static int
+linux_file_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *cred,
+    struct thread *td)
+{
+	struct linux_file *filp;
+	int error;
+
+	filp = (struct linux_file *)fp->f_data;
+	filp->f_flags = fp->f_flag;
+	error = 0;
+
+	switch (cmd) {
+	case FIONBIO:
+		break;
+	case FIOASYNC:
+		if (filp->f_op->fasync == NULL)
+			break;
+		error = filp->f_op->fasync(0, filp, fp->f_flag & FASYNC);
+		break;
+	case FIOSETOWN:
+		error = fsetown(*(int *)data, &filp->f_sigio);
+		/* Same optional-fasync rule as FIOASYNC above. */
+		if (error == 0 && filp->f_op->fasync != NULL)
+			error = filp->f_op->fasync(0, filp,
+			    fp->f_flag & FASYNC);
+		break;
+	case FIOGETOWN:
+		*(int *)data = fgetown(&filp->f_sigio);
+		break;
+	default:
+		error = ENOTTY;
+		break;
+	}
+	return (error);
+}
+
+/*
+ * File operations vector routing descriptor operations to the
+ * linux_file_* handlers above.  Only read, poll, close and ioctl are
+ * provided.
+ */
+struct fileops linuxfileops = {
+	.fo_read = linux_file_read,
+	.fo_poll = linux_file_poll,
+	.fo_close = linux_file_close,
+	.fo_ioctl = linux_file_ioctl
+};
+
+/*
+ * Hash of vmmap addresses.  This is infrequently accessed and does not
+ * need to be particularly large.  This is done because we must store the
+ * caller's idea of the map size to properly unmap.
+ */
+/* One recorded mapping: the address handed out and the size requested. */
+struct vmmap {
+	LIST_ENTRY(vmmap)	vm_next;	/* Hash chain linkage. */
+	void 			*vm_addr;	/* Address given to the caller. */
+	unsigned long		vm_size;	/* Size passed at map time. */
+};
+
+LIST_HEAD(vmmaphd, vmmap);
+#define	VMMAP_HASH_SIZE	64
+#define	VMMAP_HASH_MASK	(VMMAP_HASH_SIZE - 1)
+/*
+ * Bucket index for a mapping address: the page number masked to the table
+ * size.  The whole expansion is parenthesized so the macro can be used
+ * safely inside larger expressions.
+ */
+#define	VM_HASH(addr)	(((uintptr_t)(addr) >> PAGE_SHIFT) & VMMAP_HASH_MASK)
+/* Hash table of recorded mappings and the mutex that protects it. */
+static struct vmmaphd vmmaphead[VMMAP_HASH_SIZE];
+static struct mtx vmmaplock;
+
+/*
+ * Record that 'addr' was mapped with 'size' bytes so the matching unmap
+ * can recover the size later.
+ */
+static void
+vmmap_add(void *addr, unsigned long size)
+{
+	struct vmmaphd *bucket;
+	struct vmmap *entry;
+
+	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+	mtx_lock(&vmmaplock);
+	entry->vm_addr = addr;
+	entry->vm_size = size;
+	bucket = &vmmaphead[VM_HASH(addr)];
+	LIST_INSERT_HEAD(bucket, entry, vm_next);
+	mtx_unlock(&vmmaplock);
+}
+
+/*
+ * Unlink and return the record for 'addr', or NULL when the address was
+ * never recorded.  The caller owns (and must free) the returned record.
+ */
+static struct vmmap *
+vmmap_remove(void *addr)
+{
+	struct vmmaphd *bucket;
+	struct vmmap *entry;
+
+	bucket = &vmmaphead[VM_HASH(addr)];
+	mtx_lock(&vmmaplock);
+	LIST_FOREACH(entry, bucket, vm_next) {
+		if (entry->vm_addr == addr) {
+			LIST_REMOVE(entry, vm_next);
+			break;
+		}
+	}
+	mtx_unlock(&vmmaplock);
+
+	/* LIST_FOREACH leaves entry NULL when nothing matched. */
+	return (entry);
+}
+
+/*
+ * Map 'size' bytes of device memory at 'phys_addr' with memory attribute
+ * 'attr' and record the mapping so iounmap() can find its size.
+ * Returns the mapped address, or NULL when pmap_mapdev_attr() fails.
+ */
+void *
+_ioremap_attr(vm_paddr_t phys_addr, unsigned long size, int attr)
+{
+	void *addr;
+
+	addr = pmap_mapdev_attr(phys_addr, size, attr);
+	if (addr == NULL)
+		return (NULL);
+	vmmap_add(addr, size);
+
+	return (addr);
+}
+
+/*
+ * Undo a mapping made by _ioremap_attr().  Addresses that were never
+ * recorded are silently ignored.
+ */
+void
+iounmap(void *addr)
+{
+	struct vmmap *vmmap;
+
+	vmmap = vmmap_remove(addr);
+	if (vmmap == NULL)
+		return;
+	/* The recorded size is what pmap_unmapdev() needs. */
+	pmap_unmapdev((vm_offset_t)addr, vmmap->vm_size);
+	kfree(vmmap);
+}
+
+
+/*
+ * Map 'count' pages contiguously into kernel virtual address space and
+ * record the mapping for vunmap().  'flags' and 'prot' are not used.
+ * Returns the mapped address, or NULL when no address space is available.
+ */
+void *
+vmap(struct page **pages, unsigned int count, unsigned long flags, int prot)
+{
+	vm_offset_t off;
+	size_t size;
+
+	size = count * PAGE_SIZE;
+	off = kmem_alloc_nofault(kernel_map, size);
+	if (off == 0)
+		return (NULL);
+	vmmap_add((void *)off, size);
+	pmap_qenter(off, pages, count);
+
+	return ((void *)off);
+}
+
+/*
+ * Undo a mapping made by vmap().  Addresses that were never recorded are
+ * silently ignored.
+ */
+void
+vunmap(void *addr)
+{
+	struct vmmap *vmmap;
+
+	vmmap = vmmap_remove(addr);
+	if (vmmap == NULL)
+		return;
+	/* The recorded byte size gives both the page count and free size. */
+	pmap_qremove((vm_offset_t)addr, vmmap->vm_size / PAGE_SIZE);
+	kmem_free(kernel_map, (vm_offset_t)addr, vmmap->vm_size);
+	kfree(vmmap);
+}
+
+/*
+ * One-time setup of the linux compatibility layer: creates the "sys",
+ * "sys.class" and "sys.device" sysctl nodes, initializes the class and
+ * device root kobjects, registers the "misc" class, and initializes the
+ * PCI lists/lock and the mapping hash table with its mutex.
+ */
+static void
+linux_compat_init(void)
+{
+	struct sysctl_oid *rootoid;
+	int i;
+
+	rootoid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(),
+	    OID_AUTO, "sys", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "sys");
+	kobject_init(&class_root, &class_ktype);
+	kobject_set_name(&class_root, "class");
+	class_root.oidp = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(rootoid),
+	    OID_AUTO, "class", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "class");
+	kobject_init(&linux_rootdev.kobj, &dev_ktype);
+	kobject_set_name(&linux_rootdev.kobj, "device");
+	linux_rootdev.kobj.oidp = SYSCTL_ADD_NODE(NULL,
+	    SYSCTL_CHILDREN(rootoid), OID_AUTO, "device", CTLFLAG_RD, NULL,
+	    "device");
+	linux_rootdev.bsddev = root_bus;
+	miscclass.name = "misc";
+	class_register(&miscclass);
+	INIT_LIST_HEAD(&pci_drivers);
+	INIT_LIST_HEAD(&pci_devices);
+	spin_lock_init(&pci_lock);
+	mtx_init(&vmmaplock, "IO Map lock", NULL, MTX_DEF);
+	for (i = 0; i < VMMAP_HASH_SIZE; i++)
+		LIST_INIT(&vmmaphead[i]);
+}
+
+/* Run the setup above during SI_SUB_DRIVERS. */
+SYSINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_init, NULL);
diff --git a/sys/ofed/include/linux/linux_idr.c b/sys/ofed/include/linux/linux_idr.c
new file mode 100644
index 0000000..5cfaff5
--- /dev/null
+++ b/sys/ofed/include/linux/linux_idr.c
@@ -0,0 +1,447 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+
+#include <machine/stdarg.h>
+
+#include <linux/bitops.h>
+#include <linux/kobject.h>
+#include <linux/slab.h>
+#include <linux/idr.h>
+#include <linux/err.h>
+
+/*
+ * IDR Implementation.
+ *
+ * This is quick and dirty and not as re-entrant as the Linux version;
+ * however, it should be fairly fast.  It is basically a radix tree with
+ * a built-in bitmap for allocation.
+ */
+/* Malloc type used for every idr_layer allocation in this file. */
+MALLOC_DEFINE(M_IDR, "idr", "Linux IDR compat");
+
+/*
+ * Largest id the tree can address with its current number of layers.
+ * NOTE(review): the shift is done in int; if layers * IDR_BITS can reach
+ * the width of int this overflows - confirm against MAX_LEVEL/IDR_BITS.
+ */
+static inline int
+idr_max(struct idr *idr)
+{
+	return (1 << (idr->layers * IDR_BITS)) - 1;
+}
+
+/* Slot index within a node at 'layer' for 'id' (layer 0 is the leaf). */
+static inline int
+idr_pos(int id, int layer)
+{
+	return (id >> (IDR_BITS * layer)) & IDR_MASK;
+}
+
+/* Zero the idr and initialize its mutex; the tree starts with no layers. */
+void
+idr_init(struct idr *idr)
+{
+	bzero(idr, sizeof(*idr));
+	mtx_init(&idr->lock, "idr", NULL, MTX_DEF);
+}
+
+/*
+ * Free the layers cached on the idr's free list.  Layers that are part
+ * of the tree are not touched; use idr_remove_all() for those.
+ *
+ * The free list head is cleared afterwards so that a later idr_pre_get()
+ * or allocation on the same idr does not walk freed memory.
+ * NOTE(review): the mutex is left initialized; confirm whether callers
+ * expect to reuse the idr after this call before adding mtx_destroy().
+ */
+void
+idr_destroy(struct idr *idr)
+{
+	struct idr_layer *il, *iln;
+
+	mtx_lock(&idr->lock);
+	for (il = idr->free; il != NULL; il = iln) {
+		/* Free layers are chained through ary[0]. */
+		iln = il->ary[0];
+		free(il, M_IDR);
+	}
+	idr->free = NULL;
+	mtx_unlock(&idr->lock);
+}
+
+/*
+ * Recursively free the subtree rooted at 'il', which sits at 'layer'
+ * (0 is a leaf).  Leaf slots hold consumer pointers and are not followed.
+ *
+ * Interior nodes are freed after their children; the previous version
+ * only freed leaves, leaking every interior layer of the tree.
+ */
+static void
+idr_remove_layer(struct idr_layer *il, int layer)
+{
+	int i;
+
+	if (il == NULL)
+		return;
+	if (layer > 0) {
+		for (i = 0; i < IDR_SIZE; i++)
+			if (il->ary[i])
+				idr_remove_layer(il->ary[i], layer - 1);
+	}
+	free(il, M_IDR);
+}
+
+/*
+ * Tear down the whole tree and reset the idr to zero layers.  The cached
+ * free list is left alone.
+ */
+void
+idr_remove_all(struct idr *idr)
+{
+
+	mtx_lock(&idr->lock);
+	idr_remove_layer(idr->top, idr->layers - 1);
+	idr->top = NULL;
+	idr->layers = 0;
+	mtx_unlock(&idr->lock);
+}
+
+/*
+ * Release 'id'.  Ids beyond the current tree range (or any id on an
+ * empty tree) are ignored; an id inside the range that is not currently
+ * allocated causes a panic.
+ */
+void
+idr_remove(struct idr *idr, int id)
+{
+	struct idr_layer *il;
+	int layer;
+	int idx;
+
+	id &= MAX_ID_MASK;
+	mtx_lock(&idr->lock);
+	il = idr->top;
+	layer = idr->layers - 1;
+	if (il == NULL || id > idr_max(idr)) {
+		mtx_unlock(&idr->lock);
+		return;
+	}
+	/*
+	 * Walk down the tree to this item setting bitmaps along the way
+	 * as we know at least one item will be free along this path.
+	 */
+	while (layer && il) {
+		idx = idr_pos(id, layer);
+		il->bitmap |= 1 << idx;
+		il = il->ary[idx];
+		layer--;
+	}
+	idx = id & IDR_MASK;
+	/*
+	 * At this point we've set free space bitmaps up the whole tree.
+	 * We could make this non-fatal and unwind but linux dumps a stack
+	 * and a warning so I don't think it's necessary.
+	 */
+	/* A set bitmap bit means the slot is free, i.e. not allocated. */
+	if (il == NULL || (il->bitmap & (1 << idx)) != 0)
+		panic("idr_remove: Item %d not allocated (%p, %p)\n",
+		    id, idr, il);
+	il->ary[idx] = NULL;
+	il->bitmap |= 1 << idx;
+	mtx_unlock(&idr->lock);
+	return;
+}
+
+/*
+ * Replace the pointer stored for an allocated 'id' with 'ptr'.
+ *
+ * Returns the previous pointer on success, or ERR_PTR(-EINVAL) when the
+ * id is outside the tree or not currently allocated.
+ *
+ * A set bit in a layer's bitmap marks a FREE slot (layers start with the
+ * bitmap filled and allocation clears the bit), so the slot is allocated
+ * only when its bit is clear.  The previous test was inverted: it
+ * refused every allocated id and overwrote free slots instead.
+ */
+void *
+idr_replace(struct idr *idr, void *ptr, int id)
+{
+	struct idr_layer *il;
+	void *res;
+	int layer;
+	int idx;
+
+	res = ERR_PTR(-EINVAL);
+	id &= MAX_ID_MASK;
+	mtx_lock(&idr->lock);
+	il = idr->top;
+	layer = idr->layers - 1;
+	if (il == NULL || id > idr_max(idr))
+		goto out;
+	/* Descend to the leaf layer that holds this id. */
+	while (layer && il) {
+		il = il->ary[idr_pos(id, layer)];
+		layer--;
+	}
+	idx = id & IDR_MASK;
+	/*
+	 * Replace still returns an error if the item was not allocated.
+	 */
+	if (il != NULL && (il->bitmap & (1 << idx)) == 0) {
+		res = il->ary[idx];
+		il->ary[idx] = ptr;
+	}
+out:
+	mtx_unlock(&idr->lock);
+	return (res);
+}
+
+/*
+ * Look up the pointer stored for 'id'.  Returns NULL when the tree is
+ * empty, the id is beyond the tree's current range, or no layer exists
+ * on the path to it.
+ */
+void *
+idr_find(struct idr *idr, int id)
+{
+	struct idr_layer *node;
+	void *found;
+	int level;
+
+	id &= MAX_ID_MASK;
+	found = NULL;
+	mtx_lock(&idr->lock);
+	node = idr->top;
+	if (node != NULL && id <= idr_max(idr)) {
+		/* Descend from the top layer to the leaf layer. */
+		for (level = idr->layers - 1; level != 0 && node != NULL;
+		    level--)
+			node = node->ary[idr_pos(id, level)];
+		if (node != NULL)
+			found = node->ary[id & IDR_MASK];
+	}
+	mtx_unlock(&idr->lock);
+	return (found);
+}
+
+/*
+ * Make sure the idr's free list holds at least layers + 1 spare layers
+ * so a following allocation can grow the tree without sleeping.
+ *
+ * Returns 1 when enough layers are cached and 0 when nothing could be
+ * allocated.  Allocation happens with the lock dropped, so the
+ * requirement is recomputed after each batch is spliced in.
+ *
+ * The free list may already hold MORE layers than needed (for example
+ * after idr_remove_all() resets the layer count), which makes 'need'
+ * negative.  That case must stop here: the old 'need == 0' test let a
+ * negative count fall into the allocation loop, which only ends when the
+ * counter reaches zero.
+ */
+int
+idr_pre_get(struct idr *idr, gfp_t gfp_mask)
+{
+	struct idr_layer *il, *iln;
+	struct idr_layer *head;
+	int need;
+
+	mtx_lock(&idr->lock);
+	for (;;) {
+		need = idr->layers + 1;
+		for (il = idr->free; il != NULL; il = il->ary[0])
+			need--;
+		mtx_unlock(&idr->lock);
+		if (need <= 0)
+			break;
+		/* Build a private chain (linked via ary[0]) unlocked. */
+		for (head = NULL; need; need--) {
+			iln = malloc(sizeof(*iln), M_IDR, M_ZERO | gfp_mask);
+			if (iln == NULL)
+				break;
+			bitmap_fill(&iln->bitmap, IDR_SIZE);
+			if (head != NULL) {
+				il->ary[0] = iln;
+				il = iln;
+			} else
+				head = il = iln;
+		}
+		if (head == NULL)
+			return (0);
+		/* Splice the new chain onto the front of the free list. */
+		mtx_lock(&idr->lock);
+		il->ary[0] = idr->free;
+		idr->free = head;
+	}
+	return (1);
+}
+
+/*
+ * Get an empty layer: take one from the idr's free list when available,
+ * otherwise try a non-sleeping allocation.
+ *
+ * Returns NULL when the free list is empty and the allocation fails;
+ * callers already check for that.  The bitmap is only filled when the
+ * allocation succeeded (the old code dereferenced the NULL result).
+ */
+static inline struct idr_layer *
+idr_get(struct idr *idr)
+{
+	struct idr_layer *il;
+
+	il = idr->free;
+	if (il) {
+		idr->free = il->ary[0];
+		il->ary[0] = NULL;
+		return (il);
+	}
+	il = malloc(sizeof(*il), M_IDR, M_ZERO | M_NOWAIT);
+	if (il != NULL)
+		bitmap_fill(&il->bitmap, IDR_SIZE);
+	return (il);
+}
+
+/*
+ * Allocate the lowest free id and associate it with 'ptr'.
+ *
+ * Returns 0 and stores the id in *idp on success, -EAGAIN when a layer
+ * could not be obtained, and -ENOSPC when the tree cannot grow further.
+ *
+ * Could be implemented as get_new_above(idr, ptr, 0, idp) but written
+ * first for simplicity's sake.
+ */
+int
+idr_get_new(struct idr *idr, void *ptr, int *idp)
+{
+	/* NOTE(review): layers may reach MAX_LEVEL + 1 per the check below
+	 * while this array has MAX_LEVEL entries - confirm reachability. */
+	struct idr_layer *stack[MAX_LEVEL];
+	struct idr_layer *il;
+	int error;
+	int layer;
+	int idx;
+	int id;
+
+	error = -EAGAIN;
+	mtx_lock(&idr->lock);
+	/*
+	 * Expand the tree until there is free space.
+	 */
+	if (idr->top == NULL || idr->top->bitmap == 0) {
+		if (idr->layers == MAX_LEVEL + 1) {
+			error = -ENOSPC;
+			goto out;
+		}
+		il = idr_get(idr);
+		if (il == NULL)
+			goto out;
+		/* The old top becomes slot 0 of the new top. */
+		il->ary[0] = idr->top;
+		/* An existing top only gets here when it is full. */
+		if (idr->top)
+			il->bitmap &= ~1;
+		idr->top = il;
+		idr->layers++;
+	}
+	il = idr->top;
+	id = 0;
+	/*
+	 * Walk the tree following free bitmaps, record our path.
+	 */
+	for (layer = idr->layers - 1;; layer--) {
+		stack[layer] = il;
+		/* ffsl() is 1-based; 0 means no free bit in this layer. */
+		idx = ffsl(il->bitmap);
+		if (idx == 0)
+			panic("idr_get_new: Invalid leaf state (%p, %p)\n",
+			    idr, il);
+		idx--;
+		id |= idx << (layer * IDR_BITS);
+		if (layer == 0)
+			break;
+		if (il->ary[idx] == NULL) {
+			il->ary[idx] = idr_get(idr);
+			if (il->ary[idx] == NULL)
+				goto out;
+		}
+		il = il->ary[idx];
+	}
+	/*
+	 * Allocate the leaf to the consumer.
+	 */
+	il->bitmap &= ~(1 << idx);
+	il->ary[idx] = ptr;
+	*idp = id;
+	/*
+	 * Clear bitmaps potentially up to the root.
+	 */
+	while (il->bitmap == 0 && ++layer < idr->layers) {
+		il = stack[layer];
+		il->bitmap &= ~(1 << idr_pos(id, layer));
+	}
+	error = 0;
+out:
+	mtx_unlock(&idr->lock);
+#ifdef INVARIANTS
+	/* Cross-check that the new id resolves back to ptr. */
+	if (error == 0 && idr_find(idr, id) != ptr) {
+		panic("idr_get_new: Failed for idr %p, id %d, ptr %p\n",
+		    idr, id, ptr);
+	}
+#endif
+	return (error);
+}
+
+/*
+ * Allocate the lowest free id that is >= starting_id and associate it
+ * with 'ptr'.
+ *
+ * Returns 0 and stores the id in *idp on success, -EAGAIN when a layer
+ * could not be obtained, and -ENOSPC when the tree cannot grow far enough.
+ *
+ * Fixes relative to the previous version:
+ *  - the restart id was computed as 1 << (layer + 1 * IDR_BITS), i.e.
+ *    1 << (layer + IDR_BITS), because of operator precedence; the intent
+ *    is to step to the next subtree one level up, which spans
+ *    1 << ((layer + 1) * IDR_BITS) ids;
+ *  - the leaf-state panic named idr_get_new instead of this function.
+ */
+int
+idr_get_new_above(struct idr *idr, void *ptr, int starting_id, int *idp)
+{
+	struct idr_layer *stack[MAX_LEVEL];
+	struct idr_layer *il;
+	int error;
+	int layer;
+	int idx, sidx;
+	int id;
+
+	error = -EAGAIN;
+	mtx_lock(&idr->lock);
+	/*
+	 * Compute the layers required to support starting_id and the mask
+	 * at the top layer.
+	 */
+restart:
+	idx = starting_id;
+	layer = 0;
+	while (idx & ~IDR_MASK) {
+		layer++;
+		idx >>= IDR_BITS;
+	}
+	if (layer == MAX_LEVEL + 1) {
+		error = -ENOSPC;
+		goto out;
+	}
+	/*
+	 * Expand the tree until there is free space at or beyond starting_id.
+	 */
+	while (idr->layers <= layer ||
+	    idr->top->bitmap < (1 << idr_pos(starting_id, idr->layers - 1))) {
+		if (idr->layers == MAX_LEVEL + 1) {
+			error = -ENOSPC;
+			goto out;
+		}
+		il = idr_get(idr);
+		if (il == NULL)
+			goto out;
+		/* The old top becomes slot 0 of the new top. */
+		il->ary[0] = idr->top;
+		/* Slot 0 is only marked full when the old top is full. */
+		if (idr->top && idr->top->bitmap == 0)
+			il->bitmap &= ~1;
+		idr->top = il;
+		idr->layers++;
+	}
+	il = idr->top;
+	id = 0;
+	/*
+	 * Walk the tree following free bitmaps, record our path.
+	 */
+	for (layer = idr->layers - 1;; layer--) {
+		stack[layer] = il;
+		sidx = idr_pos(starting_id, layer);
+		/* Returns index numbered from 0 or size if none exists. */
+		idx = find_next_bit(&il->bitmap, IDR_SIZE, sidx);
+		if (idx == IDR_SIZE && sidx == 0)
+			panic("idr_get_new_above: Invalid leaf state (%p, %p)\n",
+			    idr, il);
+		/*
+		 * We may have walked a path where there was a free bit but
+		 * it was lower than what we wanted.  Restart the search with
+		 * a larger starting id.  id contains the progress we made so
+		 * far.  Search the leaf one above this level.  This may
+		 * restart as many as MAX_LEVEL times but that is expected
+		 * to be rare.
+		 */
+		if (idx == IDR_SIZE) {
+			starting_id = id + (1 << ((layer + 1) * IDR_BITS));
+			goto restart;
+		}
+		if (idx > sidx)
+			starting_id = 0;	/* Search the whole subtree. */
+		id |= idx << (layer * IDR_BITS);
+		if (layer == 0)
+			break;
+		if (il->ary[idx] == NULL) {
+			il->ary[idx] = idr_get(idr);
+			if (il->ary[idx] == NULL)
+				goto out;
+		}
+		il = il->ary[idx];
+	}
+	/*
+	 * Allocate the leaf to the consumer.
+	 */
+	il->bitmap &= ~(1 << idx);
+	il->ary[idx] = ptr;
+	*idp = id;
+	/*
+	 * Clear bitmaps potentially up to the root.
+	 */
+	while (il->bitmap == 0 && ++layer < idr->layers) {
+		il = stack[layer];
+		il->bitmap &= ~(1 << idr_pos(id, layer));
+	}
+	error = 0;
+out:
+	mtx_unlock(&idr->lock);
+#ifdef INVARIANTS
+	/* Cross-check that the new id resolves back to ptr. */
+	if (error == 0 && idr_find(idr, id) != ptr) {
+		panic("idr_get_new_above: Failed for idr %p, id %d, ptr %p\n",
+		    idr, id, ptr);
+	}
+#endif
+	return (error);
+}
diff --git a/sys/ofed/include/linux/linux_radix.c b/sys/ofed/include/linux/linux_radix.c
new file mode 100644
index 0000000..e642eae
--- /dev/null
+++ b/sys/ofed/include/linux/linux_radix.c
@@ -0,0 +1,170 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/radix-tree.h>
+#include <linux/err.h>
+
+/* Malloc type used for every radix tree node allocated in this file. */
+MALLOC_DEFINE(M_RADIX, "radix", "Linux radix compat");
+
+/*
+ * Largest index the tree can address at its current height.
+ * NOTE(review): the shift is done in int and the result is compared
+ * against unsigned long indexes by the callers - confirm that
+ * height * RADIX_TREE_MAP_SHIFT stays below the width of int.
+ */
+static inline int
+radix_max(struct radix_tree_root *root)
+{
+	return (1 << (root->height * RADIX_TREE_MAP_SHIFT)) - 1;
+}
+
+/* Slot index within a node at 'height' for 'id' (height 0 is the leaf). */
+static inline int
+radix_pos(long id, int height)
+{
+	return (id >> (RADIX_TREE_MAP_SHIFT * height)) & RADIX_TREE_MAP_MASK;
+}
+
+/*
+ * Return the item stored at 'index', or NULL when the index is beyond
+ * the tree's current range or no node exists on the path to it.
+ */
+void *
+radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
+{
+	struct radix_tree_node *cur;
+	int level;
+
+	if (index > radix_max(root))
+		return (NULL);
+	/* Descend from the root node to the leaf level. */
+	cur = root->rnode;
+	for (level = root->height - 1; level != 0 && cur != NULL; level--)
+		cur = cur->slots[radix_pos(index, level)];
+	if (cur == NULL)
+		return (NULL);
+
+	return (cur->slots[radix_pos(index, 0)]);
+}
+
+/*
+ * Remove and return the item stored at 'index', or NULL when nothing is
+ * stored there.  Nodes whose slot count drops to zero are freed on the
+ * way back up; when the root node itself is freed the tree is reset to
+ * height 0.
+ */
+void *
+radix_tree_delete(struct radix_tree_root *root, unsigned long index)
+{
+	struct radix_tree_node *stack[RADIX_TREE_MAX_HEIGHT];
+	struct radix_tree_node *node;
+	void *item;
+	int height;
+	int idx;
+
+	item = NULL;
+	node = root->rnode;
+	height = root->height - 1;
+	if (index > radix_max(root))
+		goto out;
+	/*
+	 * Find the node and record the path in stack.
+	 */
+	while (height && node) {
+		stack[height] = node;
+		node = node->slots[radix_pos(index, height--)];
+	}
+	idx = radix_pos(index, 0);
+	if (node)
+		item = node->slots[idx];
+	/*
+	 * If we removed something reduce the height of the tree.
+	 */
+	if (item)
+		for (;;) {
+			node->slots[idx] = NULL;
+			node->count--;
+			if (node->count > 0)
+				break;
+			free(node, M_RADIX);
+			/* Only the pointer value is compared after the free. */
+			if (node == root->rnode) {
+				root->rnode = NULL;
+				root->height = 0;
+				break;
+			}
+			/* Step back up to the parent recorded on the way down. */
+			height++;
+			node = stack[height];
+			idx = radix_pos(index, height);
+		}
+out:
+	return (item);
+}
+
+/*
+ * Store 'item' at 'index', growing the tree as needed.
+ *
+ * Returns 0 on success, -ENOMEM when a node cannot be allocated and
+ * -EEXIST when the slot is already occupied.
+ * NOTE(review): nodes added before an -ENOMEM failure stay in the tree.
+ */
+int
+radix_tree_insert(struct radix_tree_root *root, unsigned long index, void *item)
+{
+	struct radix_tree_node *node;
+	int height;
+	int idx;
+
+	/*
+	 * Expand the tree to fit indexes as big as requested.
+	 */
+	while (root->rnode == NULL || radix_max(root) < index) {
+		node = malloc(sizeof(*node), M_RADIX, root->gfp_mask | M_ZERO);
+		if (node == NULL)
+			return (-ENOMEM);
+		/* The old root becomes slot 0 of the new root. */
+		node->slots[0] = root->rnode;
+		if (root->rnode)
+			node->count++;
+		root->rnode = node;
+		root->height++;
+	}
+	node = root->rnode;
+	height = root->height - 1;
+	/*
+	 * Walk down the tree finding the correct node and allocating any
+	 * missing nodes along the way.
+	 */
+	while (height) {
+		idx = radix_pos(index, height);
+		if (node->slots[idx] == NULL) {
+			node->slots[idx] = malloc(sizeof(*node), M_RADIX,
+			    root->gfp_mask | M_ZERO);
+			if (node->slots[idx] == NULL)
+				return (-ENOMEM);
+			node->count++;
+		}
+		node = node->slots[idx];
+		height--;
+	}
+	/*
+	 * Insert and adjust count if the item does not already exist.
+	 */
+	idx = radix_pos(index, 0);
+	if (node->slots[idx])
+		return (-EEXIST);
+	node->slots[idx] = item;
+	node->count++;
+	
+	return (0);
+}
diff --git a/sys/ofed/include/linux/list.h b/sys/ofed/include/linux/list.h
new file mode 100644
index 0000000..f6f9404
--- /dev/null
+++ b/sys/ofed/include/linux/list.h
@@ -0,0 +1,331 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_LIST_H_
+#define _LINUX_LIST_H_
+
+/*
+ * Since LIST_HEAD conflicts with the linux definition we must include any
+ * FreeBSD header which requires it here so it is resolved with the correct
+ * definition prior to the undef.
+ */
+#include <linux/types.h>
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/queue.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/conf.h>
+#include <sys/socket.h>
+#include <sys/mbuf.h>
+
+#include <net/bpf.h>
+#include <net/if.h>
+#include <net/if_types.h>
+#include <net/if_media.h>
+
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+
+#include <netinet6/in6_var.h>
+#include <netinet6/nd6.h>
+
+#include <vm/vm.h>
+#include <vm/vm_object.h>
+
+/* prefetch() expands to nothing here. */
+#define	prefetch(x)
+
+/*
+ * Node of a circular doubly linked list.  The same structure serves as
+ * the list head; an empty list points at itself.
+ */
+struct list_head {
+	struct list_head *next;
+	struct list_head *prev;
+};
+
+/* Make 'list' an empty list: both links point back at the head itself. */
+static inline void
+INIT_LIST_HEAD(struct list_head *list)
+{
+
+	list->prev = list;
+	list->next = list;
+}
+ 
+/* Non-zero when the list headed by 'head' has no entries. */
+static inline int
+list_empty(const struct list_head *head)
+{
+
+	return (head->next == head);
+}
+
+/*
+ * Unlink 'entry' from its list.  The entry's own links are left as they
+ * were.
+ */
+static inline void
+list_del(struct list_head *entry)
+{
+	struct list_head *before = entry->prev;
+	struct list_head *after = entry->next;
+
+	after->prev = before;
+	before->next = after;
+}
+
+/* Link 'new' between the adjacent entries 'prev' and 'next'. */
+static inline void
+_list_add(struct list_head *new, struct list_head *prev,
+    struct list_head *next)
+{
+
+	new->prev = prev;
+	new->next = next;
+	prev->next = new;
+	next->prev = new;
+}
+
+/* Unlink 'entry' and reinitialize it as an empty list. */
+static inline void
+list_del_init(struct list_head *entry)
+{	
+
+	list_del(entry);
+	INIT_LIST_HEAD(entry);
+}
+
+/* Recover the structure that embeds the list_head 'ptr' as 'field'. */
+#define	list_entry(ptr, type, field)	container_of(ptr, type, field)
+
+/* Iterate 'p' over the list_head nodes of 'head'. */
+#define	list_for_each(p, head)						\
+	for (p = (head)->next; p != (head); p = p->next)
+
+/* As list_for_each; 'n' holds the next node so 'p' may be unlinked. */
+#define	list_for_each_safe(p, n, head)					\
+	for (p = (head)->next, n = p->next; p != (head); p = n, n = p->next)
+
+/* Iterate 'p' over the structures embedding the nodes of list 'h'. */
+#define list_for_each_entry(p, h, field)				\
+	for (p = list_entry((h)->next, typeof(*p), field); &p->field != (h); \
+	    p = list_entry(p->field.next, typeof(*p), field))
+
+/* As list_for_each_entry; 'n' holds the next entry so 'p' may be removed. */
+#define list_for_each_entry_safe(p, n, h, field)			\
+	for (p = list_entry((h)->next, typeof(*p), field), 		\
+	    n = list_entry(p->field.next, typeof(*p), field); &p->field != (h);\
+	    p = n, n = list_entry(n->field.next, typeof(*n), field))
+
+/* As list_for_each_entry, walking from the tail towards the head. */
+#define	list_for_each_entry_reverse(p, h, field)			\
+	for (p = list_entry((h)->prev, typeof(*p), field); &p->field != (h); \
+	    p = list_entry(p->field.prev, typeof(*p), field))
+
+/* As list_for_each, walking from the tail towards the head. */
+#define	list_for_each_prev(p, h) for (p = (h)->prev; p != (h); p = p->prev)
+
+/* Insert 'new' right after 'head', i.e. at the front of the list. */
+static inline void
+list_add(struct list_head *new, struct list_head *head)
+{
+
+	_list_add(new, head, head->next);
+}
+
+/* Insert 'new' right before 'head', i.e. at the end of the list. */
+static inline void
+list_add_tail(struct list_head *new, struct list_head *head)
+{
+
+	_list_add(new, head->prev, head);
+}
+
+/* Unlink 'list' from where it is and insert it at the front of 'head'. */
+static inline void
+list_move(struct list_head *list, struct list_head *head)
+{
+
+	list_del(list);
+	list_add(list, head);
+}
+
+/* Unlink 'entry' from where it is and insert it at the end of 'head'. */
+static inline void
+list_move_tail(struct list_head *entry, struct list_head *head)
+{
+
+	list_del(entry);
+	list_add_tail(entry, head);
+}
+
+/*
+ * Link all entries of 'list' between the adjacent entries 'prev' and
+ * 'next'.  An empty 'list' is a no-op; the head of 'list' itself is not
+ * modified.
+ */
+static inline void
+_list_splice(const struct list_head *list, struct list_head *prev,
+    struct list_head *next)
+{
+	struct list_head *head_entry;
+	struct list_head *tail_entry;
+
+	if (list_empty(list))
+		return;
+	head_entry = list->next;
+	tail_entry = list->prev;
+	/* Attach the front of the chain to prev ... */
+	prev->next = head_entry;
+	head_entry->prev = prev;
+	/* ... and the back of the chain to next. */
+	next->prev = tail_entry;
+	tail_entry->next = next;
+}
+
+/* Insert the entries of 'list' at the front of 'head'. */
+static inline void
+list_splice(const struct list_head *list, struct list_head *head)
+{
+
+	_list_splice(list, head, head->next);
+} 
+
+/* Insert the entries of 'list' at the end of 'head'. */
+static inline void
+list_splice_tail(struct list_head *list, struct list_head *head)
+{
+
+	_list_splice(list, head->prev, head);
+}
+ 
+/* Insert the entries of 'list' at the front of 'head' and empty 'list'. */
+static inline void
+list_splice_init(struct list_head *list, struct list_head *head)
+{
+
+	_list_splice(list, head, head->next);
+	INIT_LIST_HEAD(list);   
+}
+ 
+/* Insert the entries of 'list' at the end of 'head' and empty 'list'. */
+static inline void
+list_splice_tail_init(struct list_head *list, struct list_head *head)
+{
+
+	_list_splice(list, head->prev, head);
+	INIT_LIST_HEAD(list);
+}
+
+/*
+ * Replace the sys/queue.h LIST_HEAD with the linux form, which defines
+ * an empty list_head named 'name'.
+ */
+#undef LIST_HEAD
+#define LIST_HEAD(name)	struct list_head name = { &(name), &(name) }
+
+
+/* Head of a hash list: a single pointer to the first node. */
+struct hlist_head {
+	struct hlist_node *first;
+};
+
+/*
+ * Hash list node.  pprev points at the pointer that references this node
+ * (the head's 'first' or the previous node's 'next').
+ */
+struct hlist_node {
+	struct hlist_node *next, **pprev;
+};
+
+#define	HLIST_HEAD_INIT { }
+#define	HLIST_HEAD(name) struct hlist_head name = HLIST_HEAD_INIT
+#define	INIT_HLIST_HEAD(head) (head)->first = NULL
+/* Mark a node as not being on any list. */
+#define	INIT_HLIST_NODE(node)						\
+do {									\
+	(node)->next = NULL;						\
+	(node)->pprev = NULL;						\
+} while (0)
+
+/* Non-zero when 'h' has no back pointer, i.e. is not on a list. */
+static inline int
+hlist_unhashed(const struct hlist_node *h)
+{
+
+	return !h->pprev;
+}
+
+/* Non-zero when the hash list 'h' has no nodes. */
+static inline int
+hlist_empty(const struct hlist_head *h)
+{
+
+	return !h->first;
+}
+
+/*
+ * Unlink 'n' from its hash list.  The node's own links are left as they
+ * were; n must currently be on a list.
+ */
+static inline void
+hlist_del(struct hlist_node *n)
+{
+	struct hlist_node *after = n->next;
+	struct hlist_node **backp = n->pprev;
+
+	if (after)
+		after->pprev = backp;
+	*backp = after;
+}
+
+/* Unlink 'n' if it is on a list and reset it to the unhashed state. */
+static inline void
+hlist_del_init(struct hlist_node *n)
+{
+
+	if (hlist_unhashed(n))
+		return;
+	hlist_del(n);
+	INIT_HLIST_NODE(n);
+}
+
+/* Insert 'n' as the first node of hash list 'h'. */
+static inline void
+hlist_add_head(struct hlist_node *n, struct hlist_head *h)
+{
+
+	n->next = h->first;
+	/* The old first node is now referenced from n->next. */
+	if (h->first)
+		h->first->pprev = &n->next;
+	h->first = n;
+	n->pprev = &h->first;
+}
+
+/* Insert 'n' immediately before 'next', which must be on a list. */
+static inline void
+hlist_add_before(struct hlist_node *n, struct hlist_node *next)
+{
+
+	n->pprev = next->pprev;
+	n->next = next;
+	next->pprev = &n->next;
+	/* Redirect whatever pointed at 'next' to point at 'n'. */
+	*(n->pprev) = n;
+}
+ 
+/*
+ * Insert 'next' immediately after 'n'.  Note the argument order: the
+ * existing node comes first and the node being added second.
+ */
+static inline void
+hlist_add_after(struct hlist_node *n, struct hlist_node *next)
+{
+
+	next->next = n->next;
+	n->next = next;
+	next->pprev = &n->next;
+	if (next->next)
+		next->next->pprev = &next->next;
+}
+ 
+/* Move every node from 'old' to 'new', leaving 'old' empty. */
+static inline void
+hlist_move_list(struct hlist_head *old, struct hlist_head *new)
+{
+
+	new->first = old->first;
+	/* The first node's back pointer must follow it to the new head. */
+	if (new->first)
+		new->first->pprev = &new->first;
+	old->first = NULL;
+}
+ 
+/* Recover the structure that embeds the hlist_node 'ptr' as 'field'. */
+#define	hlist_entry(ptr, type, field)	container_of(ptr, type, field)
+
+/* Iterate 'p' over the nodes of hash list 'head'. */
+#define	hlist_for_each(p, head)						\
+	for (p = (head)->first; p; p = p->next)
+
+/* As hlist_for_each; 'n' holds the next node so 'p' may be unlinked. */
+#define	hlist_for_each_safe(p, n, head)					\
+	for (p = (head)->first; p && ({ n = p->next; 1; }); p = n)
+
+/* Iterate node 'p' and its enclosing structure 'tp' over list 'head'. */
+#define	hlist_for_each_entry(tp, p, head, field)			\
+	for (p = (head)->first;						\
+	    p ? (tp = hlist_entry(p, typeof(*tp), field)): NULL; p = p->next)
+ 
+/* Continue an entry iteration with the node after 'p'. */
+#define hlist_for_each_entry_continue(tp, p, field)			\
+	for (p = (p)->next;						\
+	    p ? (tp = hlist_entry(p, typeof(*tp), field)): NULL; p = p->next)
+
+/* Continue an entry iteration starting at 'p' itself. */
+#define	hlist_for_each_entry_from(tp, p, field)				\
+	for (; p ? (tp = hlist_entry(p, typeof(*tp), field)): NULL; p = p->next)
+
+#define	hlist_for_each_entry_safe(tp, p, n, head, field)		\
+	for (p = (head)->first;	p ?					\
+	    (n = p->next) | (tp = hlist_entry(p, typeof(*tp), field)) :	\
+	    NULL; p = n)
+
+#endif /* _LINUX_LIST_H_ */
diff --git a/sys/ofed/include/linux/lockdep.h b/sys/ofed/include/linux/lockdep.h
new file mode 100644
index 0000000..8ddb079
--- /dev/null
+++ b/sys/ofed/include/linux/lockdep.h
@@ -0,0 +1,37 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_LINUX_LOCKDEP_H_
+#define	_LINUX_LOCKDEP_H_
+
+/* Empty placeholder so code declaring lockdep keys still compiles. */
+struct lock_class_key {
+};
+
+/* Expands to nothing. */
+#define lockdep_set_class(lock, key)
+
+#endif	/* _LINUX_LOCKDEP_H_ */
diff --git a/sys/ofed/include/linux/log2.h b/sys/ofed/include/linux/log2.h
new file mode 100644
index 0000000..0a8315a
--- /dev/null
+++ b/sys/ofed/include/linux/log2.h
@@ -0,0 +1,60 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_LINUX_LOG2_H_
+#define	_LINUX_LOG2_H_
+
+#include <linux/types.h>
+
+#include <sys/libkern.h>
+
+static inline unsigned long	/* smallest power of two >= x for x >= 1, assuming flsl() = 1-based index of the highest set bit (0 for none) */
+roundup_pow_of_two(unsigned long x)
+{
+	return (1UL << flsl(x - 1));	/* NOTE(review): x == 0 wraps x - 1 to all-ones, so the shift count would equal the word width */
+}
+
+static inline int	/* 1 if exactly one bit of n is set, otherwise 0 (so n == 0 yields 0) */
+is_power_of_2(unsigned long n)
+{
+	return (n != 0 && (n & (n - 1)) == 0);	/* bit test: no shift involved, defined for every n */
+}
+
+static inline unsigned long	/* largest power of two <= x for x >= 1, assuming flsl() = 1-based index of the highest set bit */
+rounddown_pow_of_two(unsigned long x)
+{
+        return (1UL << (flsl(x) - 1));	/* NOTE(review): if flsl(0) is 0, x == 0 gives a shift count of -1 */
+}
+
+static inline unsigned long	/* zero-based index of the highest set bit of x, assuming flsl() is 1-based */
+ilog2(unsigned long x)
+{
+	return (flsl(x) - 1);	/* NOTE(review): if flsl(0) is 0, x == 0 converts -1 to the largest unsigned long */
+}
+
+#endif	/* _LINUX_LOG2_H_ */
diff --git a/sys/ofed/include/linux/miscdevice.h b/sys/ofed/include/linux/miscdevice.h
new file mode 100644
index 0000000..e6a4435
--- /dev/null
+++ b/sys/ofed/include/linux/miscdevice.h
@@ -0,0 +1,72 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_LINUX_MISCDEVICE_H_
+#define	_LINUX_MISCDEVICE_H_
+
+#define	MISC_DYNAMIC_MINOR	-1	/* presumably a value for miscdevice.minor; misc_register() in this header never reads that field */
+
+#include <linux/device.h>
+#include <linux/cdev.h>
+
+struct miscdevice  {	/* caller fills name and fops; misc_register() fills this_device and cdev */
+	const char	*name;	/* passed to device_create() and kobject_set_name() by misc_register() */
+	struct device	*this_device;	/* set by misc_register() from device_create() */
+	const struct file_operations *fops;	/* copied into cdev->ops by misc_register() */
+	struct cdev	*cdev;	/* set by misc_register() from cdev_alloc() */
+	int		minor;	/* not read by misc_register() or misc_deregister() in this header */
+};
+
+extern struct class	miscclass;
+
+static inline int	/* returns 0, -ENOMEM when cdev_alloc() fails, or -EINVAL when cdev_add() fails */
+misc_register(struct miscdevice *misc)
+{
+	misc->this_device = device_create(&miscclass, &linux_rootdev, 0, misc, 
+	    misc->name);	/* NOTE(review): the result is not checked before ->devt is read below */
+	misc->cdev = cdev_alloc();
+	if (misc->cdev == NULL)
+		return -ENOMEM;	/* NOTE(review): this_device created above is not destroyed on this path */
+	misc->cdev->owner = THIS_MODULE;
+	misc->cdev->ops = misc->fops;
+	kobject_set_name(&misc->cdev->kobj, misc->name);
+        if (cdev_add(misc->cdev, misc->this_device->devt, 1))
+		return -EINVAL;	/* NOTE(review): neither the cdev nor this_device is released on this path */
+	return (0);
+}
+
+static inline int	/* always returns 0 */
+misc_deregister(struct miscdevice *misc)
+{
+	device_destroy(&miscclass, misc->this_device->devt);	/* presumably the counterpart of device_create() in misc_register() */
+	cdev_del(misc->cdev);	/* misc->cdev and misc->this_device are both used without a NULL check */
+
+	return (0);
+}
+
+#endif	/* _LINUX_MISCDEVICE_H_ */
diff --git a/sys/ofed/include/linux/mlx4/cmd.h b/sys/ofed/include/linux/mlx4/cmd.h
new file mode 100644
index 0000000..60d3036
--- /dev/null
+++ b/sys/ofed/include/linux/mlx4/cmd.h
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2006 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_CMD_H
+#define MLX4_CMD_H
+
+#include <linux/dma-mapping.h>
+
+enum {
+	/* initialization and general commands */
+	MLX4_CMD_SYS_EN		 = 0x1,
+	MLX4_CMD_SYS_DIS	 = 0x2,
+	MLX4_CMD_MAP_FA		 = 0xfff,
+	MLX4_CMD_UNMAP_FA	 = 0xffe,
+	MLX4_CMD_RUN_FW		 = 0xff6,
+	MLX4_CMD_MOD_STAT_CFG	 = 0x34,
+	MLX4_CMD_QUERY_DEV_CAP	 = 0x3,
+	MLX4_CMD_QUERY_FW	 = 0x4,
+	MLX4_CMD_ENABLE_LAM	 = 0xff8,
+	MLX4_CMD_DISABLE_LAM	 = 0xff7,
+	MLX4_CMD_QUERY_DDR	 = 0x5,
+	MLX4_CMD_QUERY_ADAPTER	 = 0x6,
+	MLX4_CMD_INIT_HCA	 = 0x7,
+	MLX4_CMD_CLOSE_HCA	 = 0x8,
+	MLX4_CMD_INIT_PORT	 = 0x9,
+	MLX4_CMD_CLOSE_PORT	 = 0xa,
+	MLX4_CMD_QUERY_HCA	 = 0xb,
+	MLX4_CMD_QUERY_PORT	 = 0x43,
+	MLX4_CMD_SENSE_PORT	 = 0x4d,
+	MLX4_CMD_HW_HEALTH_CHECK = 0x50,
+	MLX4_CMD_SET_PORT	 = 0xc,
+	MLX4_CMD_SET_NODE	 = 0x5a,
+	MLX4_CMD_ACCESS_DDR	 = 0x2e,
+	MLX4_CMD_MAP_ICM	 = 0xffa,
+	MLX4_CMD_UNMAP_ICM	 = 0xff9,
+	MLX4_CMD_MAP_ICM_AUX	 = 0xffc,
+	MLX4_CMD_UNMAP_ICM_AUX	 = 0xffb,
+	MLX4_CMD_SET_ICM_SIZE	 = 0xffd,
+
+	/* TPT commands */
+	MLX4_CMD_SW2HW_MPT	 = 0xd,
+	MLX4_CMD_QUERY_MPT	 = 0xe,
+	MLX4_CMD_HW2SW_MPT	 = 0xf,
+	MLX4_CMD_READ_MTT	 = 0x10,
+	MLX4_CMD_WRITE_MTT	 = 0x11,
+	MLX4_CMD_SYNC_TPT	 = 0x2f,
+
+	/* EQ commands */
+	MLX4_CMD_MAP_EQ		 = 0x12,
+	MLX4_CMD_SW2HW_EQ	 = 0x13,
+	MLX4_CMD_HW2SW_EQ	 = 0x14,
+	MLX4_CMD_QUERY_EQ	 = 0x15,
+
+	/* CQ commands */
+	MLX4_CMD_SW2HW_CQ	 = 0x16,
+	MLX4_CMD_HW2SW_CQ	 = 0x17,
+	MLX4_CMD_QUERY_CQ	 = 0x18,
+	MLX4_CMD_MODIFY_CQ	 = 0x2c,
+
+	/* SRQ commands */
+	MLX4_CMD_SW2HW_SRQ	 = 0x35,
+	MLX4_CMD_HW2SW_SRQ	 = 0x36,
+	MLX4_CMD_QUERY_SRQ	 = 0x37,
+	MLX4_CMD_ARM_SRQ	 = 0x40,
+
+	/* QP/EE commands */
+	MLX4_CMD_RST2INIT_QP	 = 0x19,
+	MLX4_CMD_INIT2RTR_QP	 = 0x1a,
+	MLX4_CMD_RTR2RTS_QP	 = 0x1b,
+	MLX4_CMD_RTS2RTS_QP	 = 0x1c,
+	MLX4_CMD_SQERR2RTS_QP	 = 0x1d,
+	MLX4_CMD_2ERR_QP	 = 0x1e,
+	MLX4_CMD_RTS2SQD_QP	 = 0x1f,
+	MLX4_CMD_SQD2SQD_QP	 = 0x38,
+	MLX4_CMD_SQD2RTS_QP	 = 0x20,
+	MLX4_CMD_2RST_QP	 = 0x21,
+	MLX4_CMD_QUERY_QP	 = 0x22,
+	MLX4_CMD_INIT2INIT_QP	 = 0x2d,
+	MLX4_CMD_SUSPEND_QP	 = 0x32,
+	MLX4_CMD_UNSUSPEND_QP	 = 0x33,
+	/* special QP and management commands */
+	MLX4_CMD_CONF_SPECIAL_QP = 0x23,
+	MLX4_CMD_MAD_IFC	 = 0x24,
+
+	/* multicast commands */
+	MLX4_CMD_READ_MCG	 = 0x25,
+	MLX4_CMD_WRITE_MCG	 = 0x26,
+	MLX4_CMD_MGID_HASH	 = 0x27,
+
+	/* miscellaneous commands */
+	MLX4_CMD_DIAG_RPRT	 = 0x30,
+	MLX4_CMD_NOP		 = 0x31,
+
+	/* debug commands */
+	MLX4_CMD_QUERY_DEBUG_MSG = 0x2a,
+	MLX4_CMD_SET_DEBUG_MSG	 = 0x2b,
+
+	/* statistics commands */
+	MLX4_CMD_QUERY_IF_STAT	 = 0X54,
+	MLX4_CMD_SET_IF_STAT	 = 0X55,
+};
+
+enum {	/* command time classes; A, B and C all carry the same value here (units are not stated in this header) */
+	MLX4_CMD_TIME_CLASS_A	= 10000,
+	MLX4_CMD_TIME_CLASS_B	= 10000,
+	MLX4_CMD_TIME_CLASS_C	= 10000,
+};
+
+enum {
+	MLX4_MAILBOX_SIZE	=  4096
+};
+
+enum {
+	/* set port opcode modifiers */
+	MLX4_SET_PORT_GENERAL   = 0x0,
+	MLX4_SET_PORT_RQP_CALC  = 0x1,
+	MLX4_SET_PORT_MAC_TABLE = 0x2,
+	MLX4_SET_PORT_VLAN_TABLE = 0x3,
+	MLX4_SET_PORT_PRIO_MAP  = 0x4,
+	MLX4_SET_PORT_GID_TABLE = 0x5,
+};
+
+struct mlx4_dev;
+
+struct mlx4_cmd_mailbox {	/* returned by mlx4_alloc_cmd_mailbox(), handed back to mlx4_free_cmd_mailbox() */
+	void		       *buf;	/* presumably the CPU-visible mailbox memory -- confirm in the allocator */
+	dma_addr_t		dma;	/* presumably the bus address matching buf -- confirm in the allocator */
+};
+
+int __mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param,
+	       int out_is_imm, u32 in_modifier, u8 op_modifier,
+	       u16 op, unsigned long timeout);
+
+/* Invoke a command with no output parameter: calls __mlx4_cmd() with out_param NULL and out_is_imm 0, returning its result. */
+static inline int mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u32 in_modifier,
+			   u8 op_modifier, u16 op, unsigned long timeout)
+{
+	return __mlx4_cmd(dev, in_param, NULL, 0, in_modifier,
+			  op_modifier, op, timeout);
+}
+
+/* Invoke a command with an output mailbox: out_param arrives by value and the address of that local copy is passed on with out_is_imm 0. */
+static inline int mlx4_cmd_box(struct mlx4_dev *dev, u64 in_param, u64 out_param,
+			       u32 in_modifier, u8 op_modifier, u16 op,
+			       unsigned long timeout)
+{
+	return __mlx4_cmd(dev, in_param, &out_param, 0, in_modifier,
+			  op_modifier, op, timeout);
+}
+
+/*
+ * Invoke a command whose result is an immediate value: __mlx4_cmd()
+ * is called with out_is_imm set to 1 and the caller's out_param
+ * pointer, through which the output is returned after execution.
+ */
+static inline int mlx4_cmd_imm(struct mlx4_dev *dev, u64 in_param, u64 *out_param,
+			       u32 in_modifier, u8 op_modifier, u16 op,
+			       unsigned long timeout)
+{
+	return __mlx4_cmd(dev, in_param, out_param, 1, in_modifier,
+			  op_modifier, op, timeout);
+}
+
+struct mlx4_cmd_mailbox *mlx4_alloc_cmd_mailbox(struct mlx4_dev *dev);
+void mlx4_free_cmd_mailbox(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox);
+
+#endif /* MLX4_CMD_H */
diff --git a/sys/ofed/include/linux/mlx4/cq.h b/sys/ofed/include/linux/mlx4/cq.h
new file mode 100644
index 0000000..6f65b2c
--- /dev/null
+++ b/sys/ofed/include/linux/mlx4/cq.h
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *	- Redistributions of source code must retain the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer.
+ *
+ *	- Redistributions in binary form must reproduce the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer in the documentation and/or other materials
+ *	  provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_CQ_H
+#define MLX4_CQ_H
+
+#include <linux/types.h>
+
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/doorbell.h>
+
+struct mlx4_cqe {
+	__be32			vlan_my_qpn;
+	__be32			immed_rss_invalid;
+	__be32			g_mlpath_rqpn;
+	__be16			sl_vid;
+	__be16			rlid;
+	__be16			status;
+	u8			ipv6_ext_mask;
+	u8			badfcs_enc;
+	__be32			byte_cnt;
+	__be16			wqe_index;
+	__be16			checksum;
+	u8			reserved[3];
+	u8			owner_sr_opcode;
+};
+
+struct mlx4_err_cqe {
+	__be32			my_qpn;
+	u32			reserved1[5];
+	__be16			wqe_index;
+	u8			vendor_err_syndrome;
+	u8			syndrome;
+	u8			reserved2[3];
+	u8			owner_sr_opcode;
+};
+
+enum {
+	MLX4_CQE_VLAN_PRESENT_MASK	= 1 << 29,
+	MLX4_CQE_QPN_MASK		= 0xffffff,
+};
+
+enum {
+	MLX4_CQE_OWNER_MASK	= 0x80,
+	MLX4_CQE_IS_SEND_MASK	= 0x40,
+	MLX4_CQE_OPCODE_MASK	= 0x1f
+};
+
+enum {
+	MLX4_CQE_SYNDROME_LOCAL_LENGTH_ERR		= 0x01,
+	MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR		= 0x02,
+	MLX4_CQE_SYNDROME_LOCAL_PROT_ERR		= 0x04,
+	MLX4_CQE_SYNDROME_WR_FLUSH_ERR			= 0x05,
+	MLX4_CQE_SYNDROME_MW_BIND_ERR			= 0x06,
+	MLX4_CQE_SYNDROME_BAD_RESP_ERR			= 0x10,
+	MLX4_CQE_SYNDROME_LOCAL_ACCESS_ERR		= 0x11,
+	MLX4_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR		= 0x12,
+	MLX4_CQE_SYNDROME_REMOTE_ACCESS_ERR		= 0x13,
+	MLX4_CQE_SYNDROME_REMOTE_OP_ERR			= 0x14,
+	MLX4_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR	= 0x15,
+	MLX4_CQE_SYNDROME_RNR_RETRY_EXC_ERR		= 0x16,
+	MLX4_CQE_SYNDROME_REMOTE_ABORTED_ERR		= 0x22,
+};
+
+enum {
+	MLX4_CQE_STATUS_IPV4		= 1 << 6,
+	MLX4_CQE_STATUS_IPV4F		= 1 << 7,
+	MLX4_CQE_STATUS_IPV6		= 1 << 8,
+	MLX4_CQE_STATUS_IPV4OPT		= 1 << 9,
+	MLX4_CQE_STATUS_TCP		= 1 << 10,
+	MLX4_CQE_STATUS_UDP		= 1 << 11,
+	MLX4_CQE_STATUS_IPOK		= 1 << 12,
+};
+
+enum {
+	MLX4_CQE_LLC                     = 1,
+	MLX4_CQE_SNAP                    = 1 << 1,
+	MLX4_CQE_BAD_FCS                 = 1 << 4,
+};
+
+static inline void mlx4_cq_arm(struct mlx4_cq *cq, u32 cmd,
+			       void __iomem *uar_page,
+			       spinlock_t *doorbell_lock)
+{
+	/* Update cq's arm doorbell record, then ring the CQ doorbell in uar_page. */
+	u32 seq_cmd = (cq->arm_sn & 3) << 28 | cmd;	/* 2-bit arm sequence at bits 28-29, OR-ed with cmd */
+	u32 cons = cq->cons_index & 0xffffff;		/* consumer index, low 24 bits */
+	__be32 db_words[2];
+
+	*cq->arm_db = cpu_to_be32(seq_cmd | cons);
+
+	/*
+	 * Ordering matters: the record stored above has to be visible
+	 * in host memory before the device sees the MMIO doorbell
+	 * written below.
+	 */
+	wmb();
+
+	db_words[0] = cpu_to_be32(seq_cmd | cq->cqn);
+	db_words[1] = cpu_to_be32(cons);
+
+	/* doorbell_lock is handed through to mlx4_write64() unchanged. */
+	mlx4_write64(db_words, uar_page + MLX4_CQ_DOORBELL, doorbell_lock);
+}
+
+static inline void mlx4_cq_set_ci(struct mlx4_cq *cq)
+{
+	*cq->set_ci_db = cpu_to_be32(cq->cons_index & 0xffffff);	/* store the low 24 bits of cons_index, big-endian, through set_ci_db */
+}
+
+enum {
+	MLX4_CQ_DB_REQ_NOT_SOL		= 1 << 24,
+	MLX4_CQ_DB_REQ_NOT		= 2 << 24
+};
+
+int mlx4_cq_modify(struct mlx4_dev *dev, struct mlx4_cq *cq,
+		   u16 count, u16 period);
+int mlx4_cq_resize(struct mlx4_dev *dev, struct mlx4_cq *cq,
+		   int entries, struct mlx4_mtt *mtt);
+
+#endif /* MLX4_CQ_H */
diff --git a/sys/ofed/include/linux/mlx4/device.h b/sys/ofed/include/linux/mlx4/device.h
new file mode 100644
index 0000000..5272e5f
--- /dev/null
+++ b/sys/ofed/include/linux/mlx4/device.h
@@ -0,0 +1,619 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *	- Redistributions of source code must retain the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer.
+ *
+ *	- Redistributions in binary form must reproduce the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer in the documentation and/or other materials
+ *	  provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_DEVICE_H
+#define MLX4_DEVICE_H
+
+#include <linux/pci.h>
+#include <linux/completion.h>
+#include <linux/radix-tree.h>
+
+#include <asm/atomic.h>
+
+#include <linux/mlx4/driver.h>
+
+enum {
+	MLX4_FLAG_MSI_X		= 1 << 0,
+	MLX4_FLAG_OLD_PORT_CMDS	= 1 << 1,
+};
+
+enum {
+	MLX4_MAX_PORTS		= 2
+};
+
+enum {
+	MLX4_BOARD_ID_LEN = 64
+};
+
+enum {	/* device capability bits; this header tests them against the u64 mlx4_caps.flags field */
+	MLX4_DEV_CAP_FLAG_RC		= 1 <<  0,
+	MLX4_DEV_CAP_FLAG_UC		= 1 <<  1,
+	MLX4_DEV_CAP_FLAG_UD		= 1 <<  2,
+	MLX4_DEV_CAP_FLAG_XRC		= 1 <<  3,
+	MLX4_DEV_CAP_FLAG_SRQ		= 1 <<  6,
+	MLX4_DEV_CAP_FLAG_IPOIB_CSUM	= 1 <<  7,
+	MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR	= 1 <<  8,
+	MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR	= 1 <<  9,
+	MLX4_DEV_CAP_FLAG_DPDP		= 1 << 12,
+	MLX4_DEV_CAP_FLAG_RAW_ETY	= 1 << 13,
+	MLX4_DEV_CAP_FLAG_BLH		= 1 << 15,
+	MLX4_DEV_CAP_FLAG_MEM_WINDOW	= 1 << 16,
+	MLX4_DEV_CAP_FLAG_APM		= 1 << 17,
+	MLX4_DEV_CAP_FLAG_ATOMIC	= 1 << 18,
+	MLX4_DEV_CAP_FLAG_RAW_MCAST	= 1 << 19,
+	MLX4_DEV_CAP_FLAG_UD_AV_PORT	= 1 << 20,
+	MLX4_DEV_CAP_FLAG_UD_MCAST	= 1 << 21,
+	MLX4_DEV_CAP_FLAG_IBOE		= 1 << 30,
+	MLX4_DEV_CAP_FLAG_FC_T11	= 1U << 31	/* unsigned: 1 << 31 overflows int and the negative result would sign-extend over bits 32-63 of a u64 mask */
+};
+
+enum {
+	MLX4_BMME_FLAG_LOCAL_INV	= 1 <<  6,
+	MLX4_BMME_FLAG_REMOTE_INV	= 1 <<  7,
+	MLX4_BMME_FLAG_TYPE_2_WIN	= 1 <<  9,
+	MLX4_BMME_FLAG_RESERVED_LKEY	= 1 << 10,
+	MLX4_BMME_FLAG_FAST_REG_WR	= 1 << 11,
+};
+
+enum mlx4_event {
+	MLX4_EVENT_TYPE_COMP		   = 0x00,
+	MLX4_EVENT_TYPE_PATH_MIG	   = 0x01,
+	MLX4_EVENT_TYPE_COMM_EST	   = 0x02,
+	MLX4_EVENT_TYPE_SQ_DRAINED	   = 0x03,
+	MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE	   = 0x13,
+	MLX4_EVENT_TYPE_SRQ_LIMIT	   = 0x14,
+	MLX4_EVENT_TYPE_CQ_ERROR	   = 0x04,
+	MLX4_EVENT_TYPE_WQ_CATAS_ERROR	   = 0x05,
+	MLX4_EVENT_TYPE_EEC_CATAS_ERROR	   = 0x06,
+	MLX4_EVENT_TYPE_PATH_MIG_FAILED	   = 0x07,
+	MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR = 0x10,
+	MLX4_EVENT_TYPE_WQ_ACCESS_ERROR	   = 0x11,
+	MLX4_EVENT_TYPE_SRQ_CATAS_ERROR	   = 0x12,
+	MLX4_EVENT_TYPE_LOCAL_CATAS_ERROR  = 0x08,
+	MLX4_EVENT_TYPE_PORT_CHANGE	   = 0x09,
+	MLX4_EVENT_TYPE_EQ_OVERFLOW	   = 0x0f,
+	MLX4_EVENT_TYPE_ECC_DETECT	   = 0x0e,
+	MLX4_EVENT_TYPE_CMD		   = 0x0a
+};
+
+enum {
+	MLX4_PORT_CHANGE_SUBTYPE_DOWN	= 1,
+	MLX4_PORT_CHANGE_SUBTYPE_ACTIVE	= 4
+};
+
+enum {
+	MLX4_PERM_LOCAL_READ	= 1 << 10,
+	MLX4_PERM_LOCAL_WRITE	= 1 << 11,
+	MLX4_PERM_REMOTE_READ	= 1 << 12,
+	MLX4_PERM_REMOTE_WRITE	= 1 << 13,
+	MLX4_PERM_ATOMIC	= 1 << 14
+};
+
+enum {
+	MLX4_OPCODE_NOP			= 0x00,
+	MLX4_OPCODE_SEND_INVAL		= 0x01,
+	MLX4_OPCODE_RDMA_WRITE		= 0x08,
+	MLX4_OPCODE_RDMA_WRITE_IMM	= 0x09,
+	MLX4_OPCODE_SEND		= 0x0a,
+	MLX4_OPCODE_SEND_IMM		= 0x0b,
+	MLX4_OPCODE_LSO			= 0x0e,
+	MLX4_OPCODE_BIG_LSO		= 0x2e,
+	MLX4_OPCODE_RDMA_READ		= 0x10,
+	MLX4_OPCODE_ATOMIC_CS		= 0x11,
+	MLX4_OPCODE_ATOMIC_FA		= 0x12,
+	MLX4_OPCODE_MASKED_ATOMIC_CS	= 0x14,
+	MLX4_OPCODE_MASKED_ATOMIC_FA	= 0x15,
+	MLX4_OPCODE_BIND_MW		= 0x18,
+	MLX4_OPCODE_FMR			= 0x19,
+	MLX4_OPCODE_LOCAL_INVAL		= 0x1b,
+	MLX4_OPCODE_CONFIG_CMD		= 0x1f,
+
+	MLX4_RECV_OPCODE_RDMA_WRITE_IMM	= 0x00,
+	MLX4_RECV_OPCODE_SEND		= 0x01,
+	MLX4_RECV_OPCODE_SEND_IMM	= 0x02,
+	MLX4_RECV_OPCODE_SEND_INVAL	= 0x03,
+
+	MLX4_CQE_OPCODE_ERROR		= 0x1e,
+	MLX4_CQE_OPCODE_RESIZE		= 0x16,
+};
+
+enum {
+	MLX4_STAT_RATE_OFFSET	= 5
+};
+
+enum {
+	MLX4_MTT_FLAG_PRESENT		= 1
+};
+
+enum mlx4_qp_region {
+	MLX4_QP_REGION_FW = 0,
+	MLX4_QP_REGION_ETH_ADDR,
+	MLX4_QP_REGION_FC_ADDR,
+	MLX4_NUM_QP_REGION
+};
+
+enum mlx4_port_type {
+	MLX4_PORT_TYPE_NONE	= 0,
+	MLX4_PORT_TYPE_IB	= 1,
+	MLX4_PORT_TYPE_ETH	= 2,
+	MLX4_PORT_TYPE_AUTO	= 3
+};
+
+enum mlx4_special_vlan_idx {
+	MLX4_NO_VLAN_IDX        = 0,
+	MLX4_VLAN_MISS_IDX,
+	MLX4_VLAN_REGULAR
+};
+#define MLX4_LEAST_ATTACHED_VECTOR	0xffffffff
+
+enum {	/* NOTE(review): "CUNTERS" looks like a misspelling of COUNTERS; presumably values for mlx4_caps.counters_mode -- confirm */
+	MLX4_CUNTERS_DISABLED,
+	MLX4_CUNTERS_BASIC,
+	MLX4_CUNTERS_EXT
+};
+
+enum {
+	MAX_FAST_REG_PAGES = 511,
+};
+
+static inline u64 mlx4_fw_ver(u64 major, u64 minor, u64 subminor)
+{
+	return (major << 32) | (minor << 16) | subminor;	/* major from bit 32 up, minor from bit 16 up, subminor in the low bits; fields are OR-ed without masking */
+}
+
+struct mlx4_caps {
+	u64			fw_ver;
+	int			num_ports;
+	int			vl_cap[MLX4_MAX_PORTS + 1];
+	int			ib_mtu_cap[MLX4_MAX_PORTS + 1];
+	__be32			ib_port_def_cap[MLX4_MAX_PORTS + 1];
+	u64			def_mac[MLX4_MAX_PORTS + 1];
+	int			eth_mtu_cap[MLX4_MAX_PORTS + 1];
+	int			gid_table_len[MLX4_MAX_PORTS + 1];
+	int			pkey_table_len[MLX4_MAX_PORTS + 1];
+	int			trans_type[MLX4_MAX_PORTS + 1];
+	int			vendor_oui[MLX4_MAX_PORTS + 1];
+	int			wavelength[MLX4_MAX_PORTS + 1];
+	u64			trans_code[MLX4_MAX_PORTS + 1];
+	int			local_ca_ack_delay;
+	int			num_uars;
+	int			bf_reg_size;
+	int			bf_regs_per_page;
+	int			max_sq_sg;
+	int			max_rq_sg;
+	int			num_qps;
+	int			max_wqes;
+	int			max_sq_desc_sz;
+	int			max_rq_desc_sz;
+	int			max_qp_init_rdma;
+	int			max_qp_dest_rdma;
+	int			sqp_start;
+	int			num_srqs;
+	int			max_srq_wqes;
+	int			max_srq_sge;
+	int			reserved_srqs;
+	int			num_cqs;
+	int			max_cqes;
+	int			reserved_cqs;
+	int			num_eqs;
+	int			reserved_eqs;
+	int			num_comp_vectors;
+	int			num_mpts;
+	int			num_mtt_segs;
+	int			mtts_per_seg;
+	int			fmr_reserved_mtts;
+	int			reserved_mtts;
+	int			reserved_mrws;
+	int			reserved_uars;
+	int			num_mgms;
+	int			num_amgms;
+	int			reserved_mcgs;
+	int			num_qp_per_mgm;
+	int			num_pds;
+	int			reserved_pds;
+	int			mtt_entry_sz;
+	int			reserved_xrcds;
+	int			max_xrcds;
+	u32			max_msg_sz;
+	u32			page_size_cap;
+	u64			flags;
+	u32			bmme_flags;
+	u32			reserved_lkey;
+	u16			stat_rate_support;
+	int			udp_rss;
+	int			loopback_support;
+	u8			port_width_cap[MLX4_MAX_PORTS + 1];
+	int			max_gso_sz;
+	int                     reserved_qps_cnt[MLX4_NUM_QP_REGION];
+	int			reserved_qps;
+	int                     reserved_qps_base[MLX4_NUM_QP_REGION];
+	int                     log_num_macs;
+	int                     log_num_vlans;
+	int                     log_num_prios;
+	enum mlx4_port_type	port_type[MLX4_MAX_PORTS + 1];
+	u8			supported_type[MLX4_MAX_PORTS + 1];
+	enum mlx4_port_type	port_mask[MLX4_MAX_PORTS + 1];
+	enum mlx4_port_type	possible_type[MLX4_MAX_PORTS + 1];
+	u8			counters_mode;
+	u32			max_basic_counters;
+	u32			max_ext_counters;
+	u32			mc_promisc_mode;
+};
+
+struct mlx4_buf_list {	/* one buffer chunk; used for mlx4_buf.direct and for each mlx4_buf.page_list entry */
+	void		       *buf;	/* CPU pointer that mlx4_buf_offset() adds an offset to */
+	dma_addr_t		map;	/* presumably the DMA mapping of buf -- confirm in the allocator */
+};
+
+struct mlx4_buf {	/* filled by mlx4_buf_alloc(); addressed through mlx4_buf_offset() */
+	struct mlx4_buf_list	direct;	/* when direct.buf is non-NULL, mlx4_buf_offset() indexes it directly */
+	struct mlx4_buf_list   *page_list;	/* otherwise mlx4_buf_offset() picks entry offset >> PAGE_SHIFT */
+	int			nbufs;
+	int			npages;
+	int			page_shift;	/* NOTE(review): mlx4_buf_offset() uses PAGE_SHIFT, not this field */
+};
+
+struct mlx4_mtt {
+	u32			first_seg;
+	int			order;
+	int			page_shift;
+};
+
+enum {
+	MLX4_DB_PER_PAGE = PAGE_SIZE / 4
+};
+
+struct mlx4_db_pgdir {
+	struct list_head	list;
+	DECLARE_BITMAP(order0, MLX4_DB_PER_PAGE);
+	DECLARE_BITMAP(order1, MLX4_DB_PER_PAGE / 2);
+	unsigned long	       *bits[2];
+	__be32		       *db_page;
+	dma_addr_t		db_dma;
+};
+
+struct mlx4_ib_user_db_page;
+
+struct mlx4_db {
+	__be32			*db;
+	union {
+		struct mlx4_db_pgdir		*pgdir;
+		struct mlx4_ib_user_db_page	*user_page;
+	}			u;
+	dma_addr_t		dma;
+	int			index;
+	int			order;
+};
+
+struct mlx4_hwq_resources {
+	struct mlx4_db		db;
+	struct mlx4_mtt		mtt;
+	struct mlx4_buf		buf;
+};
+
+struct mlx4_mr {
+	struct mlx4_mtt		mtt;
+	u64			iova;
+	u64			size;
+	u32			key;
+	u32			pd;
+	u32			access;
+	int			enabled;
+};
+
+struct mlx4_fmr {
+	struct mlx4_mr		mr;
+	struct mlx4_mpt_entry  *mpt;
+	__be64		       *mtts;
+	dma_addr_t		dma_handle;
+	int			max_pages;
+	int			max_maps;
+	int			maps;
+	u8			page_shift;
+};
+
+struct mlx4_uar {
+	unsigned long		pfn;
+	int			index;
+	struct list_head	bf_list;
+	unsigned		free_bf_bmap;
+	void __iomem	       *map;
+	void __iomem	       *bf_map;
+};
+
+struct mlx4_bf {
+	unsigned long		offset;
+	int			buf_size;
+	struct mlx4_uar	       *uar;
+	void __iomem	       *reg;
+};
+
+struct mlx4_cq {	/* completion queue state; the doorbell fields are used by mlx4_cq_arm() and mlx4_cq_set_ci() */
+	void (*comp)		(struct mlx4_cq *);
+	void (*event)		(struct mlx4_cq *, enum mlx4_event);
+
+	struct mlx4_uar	       *uar;
+
+	u32			cons_index;	/* low 24 bits are written to both doorbell records */
+
+	__be32		       *set_ci_db;	/* written by mlx4_cq_set_ci() */
+	__be32		       *arm_db;	/* written by mlx4_cq_arm() before the MMIO doorbell */
+	int			arm_sn;	/* only the low 2 bits are used by mlx4_cq_arm() */
+
+	int			cqn;	/* OR-ed into the first doorbell word by mlx4_cq_arm() */
+	unsigned		vector;
+
+	atomic_t		refcount;
+	struct completion	free;
+};
+
+struct mlx4_qp {
+	void (*event)		(struct mlx4_qp *, enum mlx4_event);
+
+	int			qpn;
+
+	atomic_t		refcount;
+	struct completion	free;
+};
+
+struct mlx4_srq {
+	void (*event)		(struct mlx4_srq *, enum mlx4_event);
+
+	int			srqn;
+	int			max;
+	int			max_gs;
+	int			wqe_shift;
+
+	atomic_t		refcount;
+	struct completion	free;
+};
+
+struct mlx4_av {
+	__be32			port_pd;
+	u8			reserved1;
+	u8			g_slid;
+	__be16			dlid;
+	u8			reserved2;
+	u8			gid_index;
+	u8			stat_rate;
+	u8			hop_limit;
+	__be32			sl_tclass_flowlabel;
+	u8			dgid[16];
+};
+
+struct mlx4_eth_av {
+	__be32		port_pd;
+	u8		reserved1;
+	u8		smac_idx;
+	u16		reserved2;
+	u8		reserved3;
+	u8		gid_index;
+	u8		stat_rate;
+	u8		hop_limit;
+	__be32		sl_tclass_flowlabel;
+	u8		dgid[16];
+	u32		reserved4[2];
+	__be16		vlan;
+	u8		mac[6];
+};
+
+union mlx4_ext_av {
+	struct mlx4_av		ib;
+	struct mlx4_eth_av	eth;
+};
+
+struct mlx4_counters {
+	__be32	counter_mode;
+	__be32	num_ifc;
+	u32	reserved[2];
+	__be64	rx_frames;
+	__be64	rx_bytes;
+	__be64	tx_frames;
+	__be64	tx_bytes;
+};
+
+struct mlx4_counters_ext {
+	__be32	counter_mode;
+	__be32	num_ifc;
+	u32	reserved[2];
+	__be64	rx_uni_frames;
+	__be64	rx_uni_bytes;
+	__be64	rx_mcast_frames;
+	__be64	rx_mcast_bytes;
+	__be64	rx_bcast_frames;
+	__be64	rx_bcast_bytes;
+	__be64	rx_nobuf_frames;
+	__be64	rx_nobuf_bytes;
+	__be64	rx_err_frames;
+	__be64	rx_err_bytes;
+	__be64	tx_uni_frames;
+	__be64	tx_uni_bytes;
+	__be64	tx_mcast_frames;
+	__be64	tx_mcast_bytes;
+	__be64	tx_bcast_frames;
+	__be64	tx_bcast_bytes;
+	__be64	tx_nobuf_frames;
+	__be64	tx_nobuf_bytes;
+	__be64	tx_err_frames;
+	__be64	tx_err_bytes;
+};
+
+struct mlx4_dev {
+	struct pci_dev	       *pdev;
+	unsigned long		flags;
+	struct mlx4_caps	caps;
+	struct radix_tree_root	qp_table_tree;
+	struct radix_tree_root	srq_table_tree;
+	u32			rev_id;
+	char			board_id[MLX4_BOARD_ID_LEN];
+};
+
+struct mlx4_init_port_param {
+	int			set_guid0;
+	int			set_node_guid;
+	int			set_si_guid;
+	u16			mtu;
+	int			port_width_cap;
+	u16			vl_cap;
+	u16			max_gid;
+	u16			max_pkey;
+	u64			guid0;
+	u64			node_guid;
+	u64			si_guid;
+};
+
+static inline void mlx4_query_steer_cap(struct mlx4_dev *dev, int *log_mac,
+					int *log_vlan, int *log_prio)
+{	/* copies three log_num_* capability fields out; all three pointers are written unconditionally */
+	*log_mac = dev->caps.log_num_macs;
+	*log_vlan = dev->caps.log_num_vlans;
+	*log_prio = dev->caps.log_num_prios;
+}
+
+#define mlx4_foreach_port(port, dev, type)	/* loop header: the statement that follows runs for ports 1..num_ports whose port_mask equals type */ \
+	for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++)	\
+		if ((type) == (dev)->caps.port_mask[(port)])	/* ends in an open if: an else written after the loop body binds to it */
+
+#define mlx4_foreach_ib_transport_port(port, dev) /* loop header: visits ports 1..num_ports whose port_mask is MLX4_PORT_TYPE_IB, or every port when the IBOE flag is set */ \
+	for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++)       \
+		if (((dev)->caps.port_mask[port] == MLX4_PORT_TYPE_IB) || \
+			((dev)->caps.flags & MLX4_DEV_CAP_FLAG_IBOE))	/* ends in an open if: an else written after the loop body binds to it */
+
+int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct,
+		   struct mlx4_buf *buf);
+void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf);
+static inline void *mlx4_buf_offset(struct mlx4_buf *buf, int offset)
+{	/* Turn a byte offset within buf into a pointer. */
+	if (buf->direct.buf)	/* one contiguous buffer: index it directly */
+		return buf->direct.buf + offset;
+	/* Paged buffer: select the page, then the byte within that page. */
+	return buf->page_list[offset >> PAGE_SHIFT].buf +
+		(offset & (PAGE_SIZE - 1));
+}
+
+int mlx4_pd_alloc(struct mlx4_dev *dev, u32 *pdn);
+void mlx4_pd_free(struct mlx4_dev *dev, u32 pdn);
+
+int mlx4_xrcd_alloc(struct mlx4_dev *dev, u32 *xrcdn);
+void mlx4_xrcd_free(struct mlx4_dev *dev, u32 xrcdn);
+
+int mlx4_uar_alloc(struct mlx4_dev *dev, struct mlx4_uar *uar);
+void mlx4_uar_free(struct mlx4_dev *dev, struct mlx4_uar *uar);
+int mlx4_bf_alloc(struct mlx4_dev *dev, struct mlx4_bf *bf);
+void mlx4_bf_free(struct mlx4_dev *dev, struct mlx4_bf *bf);
+
+int mlx4_mtt_init(struct mlx4_dev *dev, int npages, int page_shift,
+		  struct mlx4_mtt *mtt);
+void mlx4_mtt_cleanup(struct mlx4_dev *dev, struct mlx4_mtt *mtt);
+u64 mlx4_mtt_addr(struct mlx4_dev *dev, struct mlx4_mtt *mtt);
+
+int mlx4_mr_reserve_range(struct mlx4_dev *dev, int cnt, int align, u32 *base_mridx);
+void mlx4_mr_release_range(struct mlx4_dev *dev, u32 base_mridx, int cnt);
+int mlx4_mr_alloc_reserved(struct mlx4_dev *dev, u32 mridx, u32 pd,
+			   u64 iova, u64 size, u32 access, int npages,
+			   int page_shift, struct mlx4_mr *mr);
+int mlx4_mr_alloc(struct mlx4_dev *dev, u32 pd, u64 iova, u64 size, u32 access,
+		  int npages, int page_shift, struct mlx4_mr *mr);
+void mlx4_mr_free_reserved(struct mlx4_dev *dev, struct mlx4_mr *mr);
+void mlx4_mr_free(struct mlx4_dev *dev, struct mlx4_mr *mr);
+int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr);
+int mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+		   int start_index, int npages, u64 *page_list);
+int mlx4_buf_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+		       struct mlx4_buf *buf);
+
+int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, int order);
+void mlx4_db_free(struct mlx4_dev *dev, struct mlx4_db *db);
+
+int mlx4_alloc_hwq_res(struct mlx4_dev *dev, struct mlx4_hwq_resources *wqres,
+		       int size, int max_direct);
+void mlx4_free_hwq_res(struct mlx4_dev *mdev, struct mlx4_hwq_resources *wqres,
+		       int size);
+
+int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt,
+		  struct mlx4_uar *uar, u64 db_rec, struct mlx4_cq *cq,
+		  unsigned vector, int collapsed);
+void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq);
+
+int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, int *base);
+void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt);
+
+int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp);
+void mlx4_qp_free(struct mlx4_dev *dev, struct mlx4_qp *qp);
+
+int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, u32 cqn, u16 xrcd,
+		   struct mlx4_mtt *mtt, u64 db_rec, struct mlx4_srq *srq);
+void mlx4_srq_free(struct mlx4_dev *dev, struct mlx4_srq *srq);
+int mlx4_srq_arm(struct mlx4_dev *dev, struct mlx4_srq *srq, int limit_watermark);
+int mlx4_srq_query(struct mlx4_dev *dev, struct mlx4_srq *srq, int *limit_watermark);
+
+int mlx4_INIT_PORT(struct mlx4_dev *dev, int port);
+int mlx4_CLOSE_PORT(struct mlx4_dev *dev, int port);
+
+int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
+			  int block_mcast_loopback, enum mlx4_mcast_prot prot);
+int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
+				enum mlx4_mcast_prot prot);
+
+int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *index);
+void mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, int index);
+
+int mlx4_find_cached_vlan(struct mlx4_dev *dev, u8 port, u16 vid, int *idx);
+int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index);
+void mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, int index);
+
+int mlx4_map_phys_fmr_fbo(struct mlx4_dev *dev, struct mlx4_fmr *fmr,
+			  u64 *page_list, int npages, u64 iova, u32 fbo,
+			  u32 len, u32 *lkey, u32 *rkey, int same_key);
+int mlx4_map_phys_fmr(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u64 *page_list,
+		      int npages, u64 iova, u32 *lkey, u32 *rkey);
+int mlx4_fmr_alloc_reserved(struct mlx4_dev *dev, u32 mridx, u32 pd,
+			    u32 access, int max_pages, int max_maps,
+			    u8 page_shift, struct mlx4_fmr *fmr);
+int mlx4_fmr_alloc(struct mlx4_dev *dev, u32 pd, u32 access, int max_pages,
+		   int max_maps, u8 page_shift, struct mlx4_fmr *fmr);
+int mlx4_fmr_enable(struct mlx4_dev *dev, struct mlx4_fmr *fmr);
+void mlx4_fmr_unmap(struct mlx4_dev *dev, struct mlx4_fmr *fmr,
+		    u32 *lkey, u32 *rkey);
+int mlx4_fmr_free_reserved(struct mlx4_dev *dev, struct mlx4_fmr *fmr);
+int mlx4_fmr_free(struct mlx4_dev *dev, struct mlx4_fmr *fmr);
+int mlx4_SYNC_TPT(struct mlx4_dev *dev);
+int mlx4_query_diag_counters(struct mlx4_dev *mlx4_dev, int array_length,
+			     u8 op_modifier, u32 in_offset[], u32 counter_out[]);
+int mlx4_test_interrupts(struct mlx4_dev *dev);
+
+void mlx4_get_fc_t11_settings(struct mlx4_dev *dev, int *enable_pre_t11, int *t11_supported);
+
+int mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx);
+void mlx4_counter_free(struct mlx4_dev *dev, u32 idx);
+
+#endif /* MLX4_DEVICE_H */
diff --git a/sys/ofed/include/linux/mlx4/doorbell.h b/sys/ofed/include/linux/mlx4/doorbell.h
new file mode 100644
index 0000000..f31bba2
--- /dev/null
+++ b/sys/ofed/include/linux/mlx4/doorbell.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_DOORBELL_H
+#define MLX4_DOORBELL_H
+
+#include <linux/types.h>
+#include <linux/io.h>
+
+#define MLX4_SEND_DOORBELL    0x14
+#define MLX4_CQ_DOORBELL      0x20
+
+#if BITS_PER_LONG == 64
+/*
+ * Assume that we can just write a 64-bit doorbell atomically.  s390
+ * actually doesn't have writeq() but S/390 systems don't even have
+ * PCI so we won't worry about it.
+ */
+
+#define MLX4_DECLARE_DOORBELL_LOCK(name)	/* expands to nothing in this branch */
+#define MLX4_INIT_DOORBELL_LOCK(ptr)    do { } while (0)
+#define MLX4_GET_DOORBELL_LOCK(ptr)      (NULL)
+/* 64-bit variant: val[0..1] are written as one u64 store; doorbell_lock is unused. */
+static inline void mlx4_write64(__be32 val[2], void __iomem *dest,
+				spinlock_t *doorbell_lock)
+{
+	__raw_writeq(*(u64 *) val, dest);
+}
+
+#else
+
+/*
+ * Just fall back to a spinlock to protect the doorbell if
+ * BITS_PER_LONG is 32 -- there's no portable way to do atomic 64-bit
+ * MMIO writes.
+ */
+
+#define MLX4_DECLARE_DOORBELL_LOCK(name) spinlock_t name;
+#define MLX4_INIT_DOORBELL_LOCK(ptr)     spin_lock_init(ptr)
+#define MLX4_GET_DOORBELL_LOCK(ptr)      (ptr)
+/* 32-bit variant: two 32-bit stores made while holding doorbell_lock (irqsave). */
+static inline void mlx4_write64(__be32 val[2], void __iomem *dest,
+				spinlock_t *doorbell_lock)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(doorbell_lock, flags);
+	__raw_writel((__force u32) val[0], dest);	/* first word at dest */
+	__raw_writel((__force u32) val[1], dest + 4);	/* second word 4 bytes further */
+	spin_unlock_irqrestore(doorbell_lock, flags);
+}
+
+#endif
+
+#endif /* MLX4_DOORBELL_H */
diff --git a/sys/ofed/include/linux/mlx4/driver.h b/sys/ofed/include/linux/mlx4/driver.h
new file mode 100644
index 0000000..15c8319
--- /dev/null
+++ b/sys/ofed/include/linux/mlx4/driver.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2006 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_DRIVER_H
+#define MLX4_DRIVER_H
+
+#include <linux/device.h>
+
+struct mlx4_dev;
+
+enum mlx4_dev_event {
+	MLX4_DEV_EVENT_CATASTROPHIC_ERROR,
+	MLX4_DEV_EVENT_PORT_UP,
+	MLX4_DEV_EVENT_PORT_DOWN,
+	MLX4_DEV_EVENT_PORT_REINIT,
+};
+
+enum mlx4_query_reply {
+	MLX4_QUERY_NOT_MINE	= -1,
+	MLX4_QUERY_MINE_NOPORT 	= 0
+};
+
+enum mlx4_prot {
+	MLX4_PROT_IB,
+	MLX4_PROT_EN,
+};
+
+enum mlx4_mcast_prot {
+	MLX4_MCAST_PROT_IB = 0,
+	MLX4_MCAST_PROT_EN = 1,
+};
+
+struct mlx4_interface {	/* callback table taken by mlx4_register_interface() */
+	void *			(*add)	 (struct mlx4_dev *dev);
+	void			(*remove)(struct mlx4_dev *dev, void *context);
+	void			(*event) (struct mlx4_dev *dev, void *context,
+					  enum mlx4_dev_event event, int port);
+	void *  (*get_prot_dev) (struct mlx4_dev *dev, void *context, u8 port);
+	enum mlx4_prot          protocol;
+	/* NOTE(review): context is presumably the pointer returned by add() -- confirm */
+	enum mlx4_query_reply	(*query) (void *context, void *);
+	struct list_head	list;
+};
+
+int mlx4_register_interface(struct mlx4_interface *intf);
+void mlx4_unregister_interface(struct mlx4_interface *intf);
+void *mlx4_get_prot_dev(struct mlx4_dev *dev, enum mlx4_prot proto, int port);
+
+struct mlx4_dev *mlx4_query_interface(void *, int *port);
+void mlx4_set_iboe_counter(struct mlx4_dev *dev, int index, u8 port);
+int mlx4_get_iboe_counter(struct mlx4_dev *dev, u8 port);
+
+#endif /* MLX4_DRIVER_H */
diff --git a/sys/ofed/include/linux/mlx4/qp.h b/sys/ofed/include/linux/mlx4/qp.h
new file mode 100644
index 0000000..3fe2bc5
--- /dev/null
+++ b/sys/ofed/include/linux/mlx4/qp.h
@@ -0,0 +1,346 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *	- Redistributions of source code must retain the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer.
+ *
+ *	- Redistributions in binary form must reproduce the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer in the documentation and/or other materials
+ *	  provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_QP_H
+#define MLX4_QP_H
+
+#include <linux/types.h>
+
+#include <linux/mlx4/device.h>
+
+#define MLX4_INVALID_LKEY	0x100
+
+enum mlx4_qp_optpar {
+	MLX4_QP_OPTPAR_ALT_ADDR_PATH		= 1 << 0,
+	MLX4_QP_OPTPAR_RRE			= 1 << 1,
+	MLX4_QP_OPTPAR_RAE			= 1 << 2,
+	MLX4_QP_OPTPAR_RWE			= 1 << 3,
+	MLX4_QP_OPTPAR_PKEY_INDEX		= 1 << 4,
+	MLX4_QP_OPTPAR_Q_KEY			= 1 << 5,
+	MLX4_QP_OPTPAR_RNR_TIMEOUT		= 1 << 6,
+	MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH	= 1 << 7,
+	MLX4_QP_OPTPAR_SRA_MAX			= 1 << 8,
+	MLX4_QP_OPTPAR_RRA_MAX			= 1 << 9,
+	MLX4_QP_OPTPAR_PM_STATE			= 1 << 10,
+	MLX4_QP_OPTPAR_RETRY_COUNT		= 1 << 12,
+	MLX4_QP_OPTPAR_RNR_RETRY		= 1 << 13,
+	MLX4_QP_OPTPAR_ACK_TIMEOUT		= 1 << 14,
+	MLX4_QP_OPTPAR_SCHED_QUEUE		= 1 << 16,
+	MLX4_QP_OPTPAR_COUNTER_INDEX		= 1 << 20
+};
+
+enum mlx4_qp_state {
+	MLX4_QP_STATE_RST			= 0,
+	MLX4_QP_STATE_INIT			= 1,
+	MLX4_QP_STATE_RTR			= 2,
+	MLX4_QP_STATE_RTS			= 3,
+	MLX4_QP_STATE_SQER			= 4,
+	MLX4_QP_STATE_SQD			= 5,
+	MLX4_QP_STATE_ERR			= 6,
+	MLX4_QP_STATE_SQ_DRAINING		= 7,
+	MLX4_QP_NUM_STATE
+};
+
+enum {
+	MLX4_QP_ST_RC				= 0x0,
+	MLX4_QP_ST_UC				= 0x1,
+	MLX4_QP_ST_RD				= 0x2,
+	MLX4_QP_ST_UD				= 0x3,
+	MLX4_QP_ST_XRC				= 0x6,
+	MLX4_QP_ST_MLX				= 0x7
+};
+
+enum {
+	MLX4_QP_PM_MIGRATED			= 0x3,
+	MLX4_QP_PM_ARMED			= 0x0,
+	MLX4_QP_PM_REARM			= 0x1
+};
+
+enum {
+	/* params1 */
+	MLX4_QP_BIT_SRE				= 1 << 15,
+	MLX4_QP_BIT_SWE				= 1 << 14,
+	MLX4_QP_BIT_SAE				= 1 << 13,
+	/* params2 */
+	MLX4_QP_BIT_RRE				= 1 << 15,
+	MLX4_QP_BIT_RWE				= 1 << 14,
+	MLX4_QP_BIT_RAE				= 1 << 13,
+	MLX4_QP_BIT_RIC				= 1 <<	4,
+};
+
+struct mlx4_qp_path {
+	u8			fl;
+	u8			reserved1[2];
+	u8			pkey_index;
+	u8			counter_index;
+	u8			grh_mylmc;
+	__be16			rlid;
+	u8			ackto;
+	u8			mgid_index;
+	u8			static_rate;
+	u8			hop_limit;
+	__be32			tclass_flowlabel;
+	u8			rgid[16];
+	u8			sched_queue;
+	u8			vlan_index;
+	u8			reserved3[2];
+	u8			reserved4[2];
+	u8			dmac[6];
+};
+
+struct mlx4_qp_context {
+	__be32			flags;
+	__be32			pd;
+	u8			mtu_msgmax;
+	u8			rq_size_stride;
+	u8			sq_size_stride;
+	u8			rlkey;
+	__be32			usr_page;
+	__be32			local_qpn;
+	__be32			remote_qpn;
+	struct			mlx4_qp_path pri_path;
+	struct			mlx4_qp_path alt_path;
+	__be32			params1;
+	u32			reserved1;
+	__be32			next_send_psn;
+	__be32			cqn_send;
+	u32			reserved2[2];
+	__be32			last_acked_psn;
+	__be32			ssn;
+	__be32			params2;
+	__be32			rnr_nextrecvpsn;
+	__be32			xrcd;
+	__be32			cqn_recv;
+	__be64			db_rec_addr;
+	__be32			qkey;
+	__be32			srqn;
+	__be32			msn;
+	__be16			rq_wqe_counter;
+	__be16			sq_wqe_counter;
+	u32			reserved3[2];
+	__be32			param3;
+	__be32			nummmcpeers_basemkey;
+	u8			log_page_size;
+	u8			reserved4[2];
+	u8			mtt_base_addr_h;
+	__be32			mtt_base_addr_l;
+	u8			VE;
+	u8			reserved5;
+	__be16			VFT_id_prio;
+	u8			reserved6;
+	u8			exch_size;
+	__be16			exch_base;
+	u8			VFT_hop_cnt;
+	u8			my_fc_id_idx;
+	__be16			reserved7;
+	u32			reserved8[7];
+};
+
+/* Which firmware version adds support for NEC (NoErrorCompletion) bit */
+#define MLX4_FW_VER_WQE_CTRL_NEC mlx4_fw_ver(2, 2, 232)
+
+enum {
+	MLX4_WQE_CTRL_NEC		= 1 << 29,
+	MLX4_WQE_CTRL_FENCE		= 1 << 6,
+	MLX4_WQE_CTRL_CQ_UPDATE		= 3 << 2,	/* two-bit field, bits [3:2] */
+	MLX4_WQE_CTRL_SOLICITED		= 1 << 1,
+	MLX4_WQE_CTRL_IP_CSUM		= 1 << 4,
+	MLX4_WQE_CTRL_TCP_UDP_CSUM	= 1 << 5,
+	MLX4_WQE_CTRL_INS_VLAN		= 1 << 6,	/* same value as _FENCE; presumably used on a different field -- TODO confirm */
+	MLX4_WQE_CTRL_STRONG_ORDER	= 1 << 7,
+	MLX4_WQE_CTRL_FORCE_LOOPBACK	= 1 << 0,
+};
+
+struct mlx4_wqe_ctrl_seg {
+	__be32			owner_opcode;
+	__be16			vlan_tag;
+	u8			ins_vlan;
+	u8			fence_size;
+	/*
+	 * High 24 bits are SRC remote buffer; low 8 bits are flags:
+	 * [7]   SO (strong ordering)
+	 * [5]   TCP/UDP checksum
+	 * [4]   IP checksum
+	 * [3:2] C (generate completion queue entry)
+	 * [1]   SE (solicited event)
+	 */
+	__be32			srcrb_flags;
+	/*
+	 * imm is immediate data for send/RDMA write w/ immediate;
+	 * also invalidation key for send with invalidate; input
+	 * modifier for WQEs on CCQs.
+	 */
+	__be32			imm;
+};
+
+enum {
+	MLX4_WQE_MLX_VL15	= 1 << 17,
+	MLX4_WQE_MLX_SLR	= 1 << 16,
+	MLX4_WQE_MLX_ICRC	= 1 << 4
+};
+
+struct mlx4_wqe_mlx_seg {
+	u8			owner;
+	u8			reserved1[2];
+	u8			opcode;
+	u8			reserved2[3];
+	u8			size;
+	/*
+	 * [17]    VL15
+	 * [16]    SLR
+	 * [15:12] static rate
+	 * [11:8]  SL
+	 * [4]     ICRC
+	 * [3:2]   C
+	 * [0]     FL (force loopback)
+	 */
+	__be32			flags;
+	__be16			rlid;
+	u16			reserved3;
+};
+
+struct mlx4_wqe_datagram_seg {
+	__be32			av[8];
+	__be32			dqpn;
+	__be32			qkey;
+	__be16			vlan;
+	u8			mac[6];
+};
+
+struct mlx4_wqe_lso_seg {
+	__be32			mss_hdr_size;
+	__be32			header[0];	/* zero-length trailing array; presumably header words follow the struct -- confirm */
+};
+
+struct mlx4_wqe_bind_seg {
+	__be32			flags1;
+	__be32			flags2;
+	__be32			new_rkey;
+	__be32			lkey;
+	__be64			addr;
+	__be64			length;
+};
+
+enum {
+	MLX4_WQE_FMR_PERM_LOCAL_READ	= 1 << 27,
+	MLX4_WQE_FMR_PERM_LOCAL_WRITE	= 1 << 28,
+	MLX4_WQE_FMR_PERM_REMOTE_READ	= 1 << 29,
+	MLX4_WQE_FMR_PERM_REMOTE_WRITE	= 1 << 30,
+	MLX4_WQE_FMR_PERM_ATOMIC	= 1 << 31	/* NOTE(review): shifts a signed int 1 into bit 31 */
+};
+
+struct mlx4_wqe_fmr_seg {
+	__be32			flags;
+	__be32			mem_key;
+	__be64			buf_list;
+	__be64			start_addr;
+	__be64			reg_len;
+	__be32			offset;
+	__be32			page_size;
+	u32			reserved[2];
+};
+
+struct mlx4_wqe_fmr_ext_seg {
+	u8			flags;
+	u8			reserved;
+	__be16			app_mask;
+	__be16			wire_app_tag;
+	__be16			mem_app_tag;
+	__be32			wire_ref_tag_base;
+	__be32			mem_ref_tag_base;
+};
+
+struct mlx4_wqe_local_inval_seg {
+	__be32			flags;
+	u32			reserved1;
+	__be32			mem_key;
+	u32			reserved2[2];
+	__be32			guest_id;
+	__be64			pa;
+};
+
+struct mlx4_wqe_raddr_seg {
+	__be64			raddr;
+	__be32			rkey;
+	u32			reserved;
+};
+
+struct mlx4_wqe_atomic_seg {
+	__be64			swap_add;
+	__be64			compare;
+};
+
+struct mlx4_wqe_masked_atomic_seg {
+	__be64			swap_add;
+	__be64			compare;
+	__be64			swap_add_mask;
+	__be64			compare_mask;
+};
+
+struct mlx4_wqe_data_seg {
+	__be32			byte_count;
+	__be32			lkey;
+	__be64			addr;
+};
+
+enum {
+	MLX4_INLINE_ALIGN	= 64,
+	MLX4_INLINE_SEG		= 1 << 31,	/* NOTE(review): shifts a signed int 1 into bit 31 */
+};
+
+struct mlx4_wqe_inline_seg {
+	__be32			byte_count;
+};
+
+int mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+		   enum mlx4_qp_state cur_state, enum mlx4_qp_state new_state,
+		   struct mlx4_qp_context *context, enum mlx4_qp_optpar optpar,
+		   int sqd_event, struct mlx4_qp *qp);
+
+int mlx4_qp_query(struct mlx4_dev *dev, struct mlx4_qp *qp,
+		  struct mlx4_qp_context *context);
+
+int mlx4_qp_to_ready(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+		     struct mlx4_qp_context *context,
+		     struct mlx4_qp *qp, enum mlx4_qp_state *qp_state);
+
+static inline struct mlx4_qp *__mlx4_qp_lookup(struct mlx4_dev *dev, u32 qpn)
+{	/* NOTE(review): the mask only works as a modulo if caps.num_qps is a power of two -- confirm */
+	return radix_tree_lookup(&dev->qp_table_tree, qpn & (dev->caps.num_qps - 1));
+}
+
+struct mlx4_qp *mlx4_qp_lookup_lock(struct mlx4_dev *dev, u32 qpn);
+void mlx4_qp_remove(struct mlx4_dev *dev, struct mlx4_qp *qp);
+int mlx4_qp_get_region(struct mlx4_dev *dev, enum mlx4_qp_region region,
+			int *base_qpn, int *cnt);
+
+#endif /* MLX4_QP_H */
diff --git a/sys/ofed/include/linux/mlx4/srq.h b/sys/ofed/include/linux/mlx4/srq.h
new file mode 100644
index 0000000..5e041e5
--- /dev/null
+++ b/sys/ofed/include/linux/mlx4/srq.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *	- Redistributions of source code must retain the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer.
+ *
+ *	- Redistributions in binary form must reproduce the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer in the documentation and/or other materials
+ *	  provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_SRQ_H
+#define MLX4_SRQ_H
+
+#include <linux/types.h>
+#include <linux/mlx4/device.h>
+
+struct mlx4_wqe_srq_next_seg {
+	u16			reserved1;
+	__be16			next_wqe_index;
+	u32			reserved2[3];
+};
+
+void mlx4_srq_invalidate(struct mlx4_dev *dev, struct mlx4_srq *srq);
+void mlx4_srq_remove(struct mlx4_dev *dev, struct mlx4_srq *srq);
+
+static inline struct mlx4_srq *__mlx4_srq_lookup(struct mlx4_dev *dev, u32 srqn)
+{	/* NOTE(review): the mask only works as a modulo if caps.num_srqs is a power of two -- confirm */
+	return radix_tree_lookup(&dev->srq_table_tree,
+				 srqn & (dev->caps.num_srqs - 1));
+}
+
+#endif /* MLX4_SRQ_H */
diff --git a/sys/ofed/include/linux/mm.h b/sys/ofed/include/linux/mm.h
new file mode 100644
index 0000000..13b749b
--- /dev/null
+++ b/sys/ofed/include/linux/mm.h
@@ -0,0 +1,84 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef	_LINUX_MM_H_
+#define	_LINUX_MM_H_
+
+#include <linux/spinlock.h>
+#include <linux/gfp.h>
+#include <linux/kernel.h>
+
+#define	PAGE_ALIGN(x)	ALIGN(x, PAGE_SIZE)
+
+struct vm_area_struct {
+	vm_offset_t	vm_start;
+	vm_offset_t	vm_end;
+	vm_offset_t	vm_pgoff;
+	vm_paddr_t	vm_pfn;		/* PFN For mmap; stored by io_remap_pfn_range(). */
+	vm_memattr_t	vm_page_prot;	/* stored by io_remap_pfn_range() */
+};
+
+/*
+ * Return the smallest order for which (1 << order) pages cover size bytes.
+ * size is expected to be non-zero: size - 1 wraps around for 0.
+ */
+static inline int
+get_order(unsigned long size)
+{
+	unsigned long pages_left;
+	int shift_count = 0;
+
+	/* Count the significant bits in the index of the last page needed. */
+	pages_left = (size - 1) >> PAGE_SHIFT;
+	for (; pages_left != 0; pages_left >>= 1)
+		shift_count++;
+
+	return (shift_count);
+}
+
+static inline void *
+lowmem_page_address(struct page *page)
+{
+	/* Thin wrapper: hand the page straight to page_address(). */
+	return (page_address(page));
+}
+
+/*
+ * Only works via mmap ops: stores pfn and prot in the vma, ignores addr/size.
+ */
+static inline int
+io_remap_pfn_range(struct vm_area_struct *vma,
+    unsigned long addr, unsigned long pfn, unsigned long size,
+    vm_memattr_t prot)
+{
+	vma->vm_pfn = pfn;
+	vma->vm_page_prot = prot;
+
+	return (0);
+}
+
+#endif	/* _LINUX_MM_H_ */
diff --git a/sys/ofed/include/linux/module.h b/sys/ofed/include/linux/module.h
new file mode 100644
index 0000000..1e3a682
--- /dev/null
+++ b/sys/ofed/include/linux/module.h
@@ -0,0 +1,87 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef	_LINUX_MODULE_H_
+#define	_LINUX_MODULE_H_
+
+#include <linux/list.h>
+#include <linux/compiler.h>
+#include <linux/kobject.h>
+#include <linux/moduleparam.h>
+#include <linux/slab.h>
+
+#define MODULE_AUTHOR(name)
+#define MODULE_DESCRIPTION(name)
+#define MODULE_LICENSE(name)
+#define	MODULE_VERSION(name)
+
+#define	THIS_MODULE	((struct module *)0)
+
+#define	EXPORT_SYMBOL(name)
+#define	EXPORT_SYMBOL_GPL(name)
+
+#include <sys/linker.h>
+
+static inline void
+_module_run(void *arg)
+{
+	void (*fn)(void);	/* routine to invoke, recovered from arg below */
+#ifdef OFED_DEBUG_INIT
+	char name[1024];
+	caddr_t pc;
+	long offset;
+	/* Debug aid: print the symbol name (or ???) of the routine about to run. */
+	pc = (caddr_t)arg;
+	if (linker_search_symbol_name(pc, name, sizeof(name), &offset) != 0)
+		printf("Running ??? (%p)\n", pc);
+	else
+		printf("Running %s (%p)\n", name, pc);
+#endif
+	fn = arg;	/* arg is the fn that module_init()/module_exit() pass to SYSINIT/SYSUNINIT */
+	DROP_GIANT();	/* fn() runs between DROP_GIANT() and PICKUP_GIANT(); keep them paired */
+	fn();
+	PICKUP_GIANT();
+}
+
+#define	module_init(fn)							\
+	SYSINIT(fn, SI_SUB_RUN_SCHEDULER, SI_ORDER_FIRST, _module_run, (fn))
+
+/*
+ * XXX This is a FreeBSD-ism designed to work around not having a module
+ * load order resolver built in: the caller passes the SYSINIT order itself.
+ */
+#define	module_init_order(fn, order)					\
+	SYSINIT(fn, SI_SUB_RUN_SCHEDULER, (order), _module_run, (fn))
+
+#define	module_exit(fn)						\
+	SYSUNINIT(fn, SI_SUB_RUN_SCHEDULER, SI_ORDER_FIRST, _module_run, (fn))
+
+#define	module_get(module)
+#define	module_put(module)
+#define	try_module_get(module)	1
+
+#endif	/* _LINUX_MODULE_H_ */
diff --git a/sys/ofed/include/linux/moduleparam.h b/sys/ofed/include/linux/moduleparam.h
new file mode 100644
index 0000000..2c541a6
--- /dev/null
+++ b/sys/ofed/include/linux/moduleparam.h
@@ -0,0 +1,226 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef	_LINUX_MODULEPARAM_H_
+#define	_LINUX_MODULEPARAM_H_
+
+#include <linux/types.h>
+
+/*
+ * These are presently not hooked up to anything.  In Linux the parameters
+ * can be set when modules are loaded.  On FreeBSD these could be mapped
+ * to kenv in the future.
+ */
+struct kernel_param;
+
+typedef int (*param_set_fn)(const char *val, struct kernel_param *kp);
+typedef int (*param_get_fn)(char *buffer, struct kernel_param *kp);
+
+struct kernel_param {
+	const char	*name;	/* stringified parameter name (#name in module_param_call) */
+	u16		perm;
+	u16		flags;	/* module_param_call initialises this to 0 */
+	param_set_fn	set;
+	param_get_fn	get;
+	union { 
+		void	*arg;	/* first member: the one module_param_call's { arg } initialises */
+		struct kparam_string	*str;
+		struct kparam_array	*arr;
+	} un;
+};
+
+#define	KPARAM_ISBOOL	2
+
+struct kparam_string {
+	unsigned int maxlen;
+	char *string;
+};
+
+struct kparam_array
+{
+	unsigned int	max;
+	unsigned int	*num;
+	param_set_fn	set;
+	param_get_fn	get;
+	unsigned int	elemsize;
+	void 		*elem;
+};
+
+static inline void
+param_sysinit(struct kernel_param *param)
+{	/* deliberately empty: parameters are not hooked up to anything yet */
+}
+
+#define	module_param_call(name, set, get, arg, perm)			\
+	static struct kernel_param __param_##name =			\
+	    { #name, perm, 0, set, get, { arg } };			\
+	SYSINIT(name##_param_sysinit, SI_SUB_DRIVERS, SI_ORDER_FIRST,	\
+	    param_sysinit, &__param_##name);
+
+#define	module_param_named(name, var, type, mode)			\
+	module_param_call(name, param_set_##type, param_get_##type, &var, mode)
+
+#define	module_param(var, type, mode)					\
+	module_param_named(var, var, type, mode)
+
+#define	MODULE_PARM_DESC(name, desc)
+
+static inline int
+param_set_byte(const char *val, struct kernel_param *kp)
+{
+	/* Stub: val is ignored and nothing is stored. */
+	return (0);
+}
+
+static inline int
+param_get_byte(char *buffer, struct kernel_param *kp)
+{
+	/* Stub: buffer is left untouched. */
+	return (0);
+}
+
+
+static inline int
+param_set_short(const char *val, struct kernel_param *kp)
+{
+	/* Stub: val is ignored and nothing is stored. */
+	return (0);
+}
+
+static inline int
+param_get_short(char *buffer, struct kernel_param *kp)
+{
+	/* Stub: buffer is left untouched. */
+	return (0);
+}
+
+
+static inline int
+param_set_ushort(const char *val, struct kernel_param *kp)
+{
+	/* Stub: val is ignored and nothing is stored. */
+	return (0);
+}
+
+static inline int
+param_get_ushort(char *buffer, struct kernel_param *kp)
+{
+	/* Stub: buffer is left untouched. */
+	return (0);
+}
+
+
+static inline int
+param_set_int(const char *val, struct kernel_param *kp)
+{
+	/* Stub: val is ignored and nothing is stored. */
+	return (0);
+}
+
+static inline int
+param_get_int(char *buffer, struct kernel_param *kp)
+{
+	/* Stub: buffer is left untouched. */
+	return (0);
+}
+
+
+static inline int
+param_set_uint(const char *val, struct kernel_param *kp)
+{
+	/* Stub: val is ignored and nothing is stored. */
+	return (0);
+}
+
+static inline int
+param_get_uint(char *buffer, struct kernel_param *kp)
+{
+	/* Stub: buffer is left untouched. */
+	return (0);
+}
+
+
+static inline int
+param_set_long(const char *val, struct kernel_param *kp)
+{
+	/* Stub: val is ignored and nothing is stored. */
+	return (0);
+}
+
+static inline int
+param_get_long(char *buffer, struct kernel_param *kp)
+{
+	/* Stub: buffer is left untouched. */
+	return (0);
+}
+
+
+static inline int
+param_set_ulong(const char *val, struct kernel_param *kp)
+{
+	/* Stub: val is ignored and nothing is stored. */
+	return (0);
+}
+
+static inline int
+param_get_ulong(char *buffer, struct kernel_param *kp)
+{
+	/* Stub: buffer is left untouched. */
+	return (0);
+}
+
+
+static inline int
+param_set_charp(const char *val, struct kernel_param *kp)
+{
+	/* Stub: val is ignored and nothing is stored. */
+	return (0);
+}
+
+static inline int
+param_get_charp(char *buffer, struct kernel_param *kp)
+{
+	/* Stub: buffer is left untouched. */
+	return (0);
+}
+
+
+static inline int
+param_set_bool(const char *val, struct kernel_param *kp)
+{
+	/* Stub: val is ignored and nothing is stored. */
+	return (0);
+}
+
+static inline int
+param_get_bool(char *buffer, struct kernel_param *kp)
+{
+	/* Stub: buffer is left untouched. */
+	return (0);
+}
+
+#endif	/* _LINUX_MODULEPARAM_H_ */
diff --git a/sys/ofed/include/linux/mount.h b/sys/ofed/include/linux/mount.h
new file mode 100644
index 0000000..33db94e
--- /dev/null
+++ b/sys/ofed/include/linux/mount.h
@@ -0,0 +1,33 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_LINUX_MOUNT_H_
+#define	_LINUX_MOUNT_H_
+
+
+#endif	/* _LINUX_MOUNT_H_ */
diff --git a/sys/ofed/include/linux/mutex.h b/sys/ofed/include/linux/mutex.h
new file mode 100644
index 0000000..ef65816
--- /dev/null
+++ b/sys/ofed/include/linux/mutex.h
@@ -0,0 +1,61 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef	_LINUX_MUTEX_H_
+#define	_LINUX_MUTEX_H_
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/sx.h>
+
+#include <linux/spinlock.h>
+
+typedef struct mutex {	/* Linux-style mutex wrapping a single sx lock */
+	struct sx sx;
+} mutex_t;
+
+#define	mutex_lock(_m)			sx_xlock(&(_m)->sx)
+#define	mutex_lock_nested(_m, _s)	mutex_lock(_m)	/* subclass _s is ignored */
+#define	mutex_lock_interruptible(_m)	({ mutex_lock((_m)); 0; })	/* plain mutex_lock(); always yields 0 */
+#define	mutex_unlock(_m)		sx_xunlock(&(_m)->sx)
+#define	mutex_trylock(_m)		!!sx_try_xlock(&(_m)->sx)	/* normalised to 0 or 1 */
+
+#define DEFINE_MUTEX(lock)						\
+	mutex_t lock;							\
+	SX_SYSINIT_FLAGS(lock, &(lock).sx, "lnxmtx", SX_NOWITNESS)
+
+static inline void
+linux_mutex_init(mutex_t *m)
+{
+	struct sx *sxp = &m->sx;	/* sx lock backing this mutex */
+	memset(sxp, 0, sizeof(*sxp));	/* start from zeroed storage */
+	sx_init_flags(sxp, "lnxmtx", SX_NOWITNESS);
+}
+
+#define	mutex_init	linux_mutex_init
+
+#endif	/* _LINUX_MUTEX_H_ */
diff --git a/sys/ofed/include/linux/net.h b/sys/ofed/include/linux/net.h
new file mode 100644
index 0000000..6e2aff3
--- /dev/null
+++ b/sys/ofed/include/linux/net.h
@@ -0,0 +1,73 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_LINUX_NET_H_
+#define	_LINUX_NET_H_
+
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+
+static inline int
+sock_create_kern(int family, int type, int proto, struct socket **res)
+{
+	struct thread *td = curthread;	/* creator; errno is negated below */
+	return (-socreate(family, res, type, proto, td->td_ucred, td));
+}
+
+static inline int
+sock_getname(struct socket *so, struct sockaddr *addr, int *sockaddr_len,
+    int peer)
+{
+	struct sockaddr *nam = NULL;	/* filled in by the protocol below */
+	int error;
+
+	/* Copy so's peer (peer != 0) or local address into *addr. */
+	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0)
+		return (-ENOTCONN);
+	/* Hand the protocol &nam; the sockaddr it stores there is freed below. */
+	if (peer)
+		error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &nam);
+	else
+		error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &nam);
+	if (error)
+		return (-error);
+	*addr = *nam;	/* struct copy: only sizeof(*addr) bytes are taken */
+	*sockaddr_len = addr->sa_len;
+
+	free(nam, M_SONAME);
+	return (0);
+}
+
+static inline void
+sock_release(struct socket *so)
+{
+	soclose(so);	/* return value is discarded */
+}
+
+#endif	/* _LINUX_NET_H_ */
diff --git a/sys/ofed/include/linux/netdevice.h b/sys/ofed/include/linux/netdevice.h
new file mode 100644
index 0000000..b02a9dd
--- /dev/null
+++ b/sys/ofed/include/linux/netdevice.h
@@ -0,0 +1,159 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef	_LINUX_NETDEVICE_H_
+#define	_LINUX_NETDEVICE_H_
+
+#include <linux/types.h>
+
+#include <sys/socket.h>
+
+#include <net/if_types.h>
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_dl.h>
+
+#include <linux/completion.h>
+#include <linux/device.h>
+#include <linux/ethtool.h>
+#include <linux/workqueue.h>
+#include <linux/net.h>
+#include <linux/notifier.h>
+
+struct net {
+};
+
+extern struct net init_net;
+
+#define	MAX_ADDR_LEN		20
+
+#define	net_device	ifnet
+
+#define	dev_get_by_index(n, idx)	ifnet_byindex_ref((idx))	/* n unused */
+#define	dev_hold(d)	if_ref((d))
+#define	dev_put(d)	if_rele((d))
+/* State predicates, each 0 or 1; carrier_ok is simply netif_running. */
+#define	netif_running(dev)	!!((dev)->if_drv_flags & IFF_DRV_RUNNING)
+#define	netif_oper_up(dev)	!!((dev)->if_flags & IFF_UP)
+#define	netif_carrier_ok(dev)	netif_running(dev)
+
+static inline void *
+netdev_priv(const struct net_device *dev)
+{
+	return (dev->if_softc);	/* net_device is ifnet here: hand back softc */
+}
+
+static inline void
+_handle_ifnet_link_event(void *arg, struct ifnet *ifp, int linkstate)
+{
+	struct notifier_block *nb = arg;
+	unsigned long event;
+
+	/* LINK_STATE_UP maps to NETDEV_UP; any other state is NETDEV_DOWN. */
+	event = (linkstate == LINK_STATE_UP) ? NETDEV_UP : NETDEV_DOWN;
+	/* The callback's return value is ignored. */
+	nb->notifier_call(nb, event, ifp);
+}
+
+static inline void
+_handle_ifnet_arrival_event(void *arg, struct ifnet *ifp)
+{
+	struct notifier_block *nb = arg;
+
+	/* An ifnet arrival is forwarded to the callback as NETDEV_REGISTER. */
+	nb->notifier_call(nb, NETDEV_REGISTER, ifp);
+}
+
+static inline void
+_handle_ifnet_departure_event(void *arg, struct ifnet *ifp)
+{
+	struct notifier_block *nb = arg;
+
+	/* An ifnet departure is forwarded as NETDEV_UNREGISTER. */
+	nb->notifier_call(nb, NETDEV_UNREGISTER, ifp);
+}
+
+static inline int
+register_netdevice_notifier(struct notifier_block *nb)
+{
+	/* One handler per ifnet event; NETDEV_DOWN shares the NETDEV_UP tag. */
+	nb->tags[NETDEV_UP] = EVENTHANDLER_REGISTER(
+	    ifnet_link_event, _handle_ifnet_link_event, nb, 0);
+	nb->tags[NETDEV_REGISTER] = EVENTHANDLER_REGISTER(
+	    ifnet_arrival_event, _handle_ifnet_arrival_event, nb, 0);
+	nb->tags[NETDEV_UNREGISTER] = EVENTHANDLER_REGISTER(
+	    ifnet_departure_event, _handle_ifnet_departure_event, nb, 0);
+	return (0);
+}
+
+static inline int
+unregister_netdevice_notifier(struct notifier_block *nb)
+{
+	/* Drop the three handlers stored by register_netdevice_notifier(). */
+        EVENTHANDLER_DEREGISTER(ifnet_link_event, nb->tags[NETDEV_UP]);
+        EVENTHANDLER_DEREGISTER(ifnet_arrival_event, nb->tags[NETDEV_REGISTER]);
+        EVENTHANDLER_DEREGISTER(ifnet_departure_event,
+	    nb->tags[NETDEV_UNREGISTER]);
+	return (0);
+}
+
+#define	rtnl_lock()	/* expands to nothing */
+#define	rtnl_unlock()	/* expands to nothing */
+
+static inline int
+dev_mc_delete(struct net_device *dev, void *addr, int alen, int all)
+{
+	struct sockaddr_dl lladdr;	/* AF_LINK wrapper for if_delmulti() */
+
+	if (sizeof(lladdr.sdl_data) < alen)
+		return (-EINVAL);
+	memset(&lladdr, 0, sizeof(lladdr));
+	lladdr.sdl_family = AF_LINK;
+	lladdr.sdl_len = sizeof(lladdr);
+	lladdr.sdl_alen = alen;
+	memcpy(lladdr.sdl_data, addr, alen);
+	/* all is unused; the FreeBSD errno is negated for Linux callers. */
+	return (-if_delmulti(dev, (struct sockaddr *)&lladdr));
+}
+
+static inline int
+dev_mc_add(struct net_device *dev, void *addr, int alen, int newonly)
+{
+	struct sockaddr_dl lladdr;	/* AF_LINK wrapper for if_addmulti() */
+
+	if (sizeof(lladdr.sdl_data) < alen)
+		return (-EINVAL);
+	memset(&lladdr, 0, sizeof(lladdr));
+	lladdr.sdl_family = AF_LINK;
+	lladdr.sdl_len = sizeof(lladdr);
+	lladdr.sdl_alen = alen;
+	memcpy(lladdr.sdl_data, addr, alen);
+	/* newonly is unused; the FreeBSD errno is negated for Linux callers. */
+	return (-if_addmulti(dev, (struct sockaddr *)&lladdr, NULL));
+}
+
+#endif	/* _LINUX_NETDEVICE_H_ */
diff --git a/sys/ofed/include/linux/notifier.h b/sys/ofed/include/linux/notifier.h
new file mode 100644
index 0000000..eeef8e7
--- /dev/null
+++ b/sys/ofed/include/linux/notifier.h
@@ -0,0 +1,54 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_LINUX_NOTIFIER_H_
+#define	_LINUX_NOTIFIER_H_
+
+#include <sys/eventhandler.h>
+
+/*
+ * Max number of FreeBSD events to map to Linux events per notify type.
+ */
+#define	NOTIFY_DONE	0
+#define	_NOTIFY_COUNT	5
+
+struct notifier_block {
+	int (*notifier_call)(struct notifier_block *, unsigned long, void *);
+	struct notifier_block	*next;
+	int			priority;
+	eventhandler_tag	tags[_NOTIFY_COUNT];	/* indexed by NETDEV_* value */
+};
+
+/* Values must be less than _NOTIFY_COUNT */
+#define	NETDEV_UP		0x0001
+#define	NETDEV_DOWN		0x0002
+#define	NETDEV_REGISTER		0x0003
+#define	NETDEV_UNREGISTER	0x0004
+
+
+#endif	/* _LINUX_NOTIFIER_H_ */
diff --git a/sys/ofed/include/linux/page.h b/sys/ofed/include/linux/page.h
new file mode 100644
index 0000000..0c9052c
--- /dev/null
+++ b/sys/ofed/include/linux/page.h
@@ -0,0 +1,49 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef	_LINUX_PAGE_H_
+#define _LINUX_PAGE_H_
+
+#include <linux/types.h>
+
+#include <sys/param.h>
+
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+
+#define page	vm_page
+/* Physical address of x, then the vm_page that backs it. */
+#define	virt_to_page(x)	PHYS_TO_VM_PAGE(vtophys((x)))
+/* The pgprot_* macros ignore prot and yield a fixed VM_MEMATTR_* value. */
+#define	clear_page(page)		memset((page), 0, PAGE_SIZE)
+#define	pgprot_noncached(prot)		VM_MEMATTR_UNCACHED
+#define	pgprot_writecombine(prot)	VM_MEMATTR_WRITE_COMBINING
+/* Replace any earlier PAGE_MASK with one that clears the in-page offset. */
+#undef	PAGE_MASK
+#define	PAGE_MASK	(~(PAGE_SIZE-1))
+
+#endif	/* _LINUX_PAGE_H_ */
diff --git a/sys/ofed/include/linux/pci.h b/sys/ofed/include/linux/pci.h
new file mode 100644
index 0000000..e4fb5f5
--- /dev/null
+++ b/sys/ofed/include/linux/pci.h
@@ -0,0 +1,580 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_LINUX_PCI_H_
+#define	_LINUX_PCI_H_
+
+#define	CONFIG_PCI_MSI
+
+#include <linux/types.h>
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/pciio.h>
+#include <sys/rman.h>
+#include <dev/pci/pcivar.h>
+#include <dev/pci/pcireg.h>
+#include <dev/pci/pci_private.h>
+
+#include <machine/resource.h>
+
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/dmapool.h>
+#include <linux/dma-mapping.h>
+#include <linux/compiler.h>
+#include <linux/errno.h>
+#include <asm/atomic.h>
+#include <linux/device.h>
+
+struct pci_device_id {
+	uint32_t	vendor;		/* 0 terminates an id table */
+	uint32_t	device;
+        uint32_t	subvendor;	/* not compared by linux_pci_find() */
+	uint32_t	subdevice;	/* not compared by linux_pci_find() */
+	uint32_t	class_mask;
+	uintptr_t	driver_data;
+};
+
+#define	MODULE_DEVICE_TABLE(bus, table)
+#define	PCI_ANY_ID		(-1)
+#define	PCI_VENDOR_ID_MELLANOX			0x15b3
+#define	PCI_VENDOR_ID_TOPSPIN			0x1867
+#define	PCI_DEVICE_ID_MELLANOX_TAVOR		0x5a44
+#define	PCI_DEVICE_ID_MELLANOX_TAVOR_BRIDGE	0x5a46
+#define	PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT	0x6278
+#define	PCI_DEVICE_ID_MELLANOX_ARBEL		0x6282
+#define	PCI_DEVICE_ID_MELLANOX_SINAI_OLD	0x5e8c
+#define	PCI_DEVICE_ID_MELLANOX_SINAI		0x6274
+
+/* struct pci_device_id initialisers: any subvendor/subdevice, rest zero. */
+#define PCI_VDEVICE(vendor, device)					\
+	    PCI_VENDOR_ID_##vendor, (device), PCI_ANY_ID, PCI_ANY_ID, 0, 0
+#define	PCI_DEVICE(vendor, device)					\
+	    (vendor), (device), PCI_ANY_ID, PCI_ANY_ID, 0, 0
+
+#define	to_pci_dev(n)	container_of(n, struct pci_dev, dev)
+
+#define	PCI_VENDOR_ID	PCIR_DEVVENDOR
+#define	PCI_COMMAND	PCIR_COMMAND
+#define	PCI_EXP_DEVCTL	PCIR_EXPRESS_DEVICE_CTL
+#define	PCI_EXP_LNKCTL	PCIR_EXPRESS_LINK_CTL
+
+#define	IORESOURCE_MEM	SYS_RES_MEMORY
+#define	IORESOURCE_IO	SYS_RES_IOPORT
+#define	IORESOURCE_IRQ	SYS_RES_IRQ
+
+struct pci_dev;
+
+struct pci_driver {
+	struct list_head		links;	/* entry on pci_drivers */
+	char				*name;	/* also used as newbus driver name */
+	struct pci_device_id		*id_table; /* ends at vendor == 0 */
+	int  (*probe)(struct pci_dev *dev, const struct pci_device_id *id);
+	void (*remove)(struct pci_dev *dev);
+	driver_t			driver;	/* filled by pci_register_driver() */
+	devclass_t			bsdclass; /* set by devclass_add_driver() */
+};
+
+extern struct list_head pci_drivers;
+extern struct list_head pci_devices;
+extern spinlock_t pci_lock;
+
+#define	__devexit_p(x)	x
+
+struct pci_dev {
+	struct device		dev;	/* embedded; see to_pci_dev() */
+	struct list_head	links;	/* entry on pci_devices */
+	struct pci_driver	*pdrv;	/* driver chosen in linux_pci_attach() */
+	uint64_t		dma_mask; /* storage behind dev.dma_mask */
+	uint16_t		device;	/* copied from the matched id entry */
+	uint16_t		vendor;	/* copied from the matched id entry */
+	unsigned int		irq;	/* copy of dev.irq */
+};
+
+static inline struct resource_list_entry *
+_pci_get_rle(struct pci_dev *pdev, int type, int rid)
+{
+	struct pci_devinfo *ivars;
+
+	/* Search the resource list held in the device's PCI ivars. */
+	/* Callers in this file treat a NULL result as "no such entry". */
+	ivars = device_get_ivars(pdev->dev.bsddev);
+	return (resource_list_find(&ivars->resources, type, rid));
+}
+
+static inline struct resource_list_entry *
+_pci_get_bar(struct pci_dev *pdev, int bar)
+{
+	struct resource_list_entry *mem;
+
+	bar = PCIR_BAR(bar);	/* BAR index -> resource id */
+	mem = _pci_get_rle(pdev, SYS_RES_MEMORY, bar);
+	/* Fall back to the I/O port entry when there is no memory entry. */
+	return (mem != NULL ? mem : _pci_get_rle(pdev, SYS_RES_IOPORT, bar));
+}
+
+static inline struct device *
+_pci_find_irq_dev(unsigned int irq)
+{
+	struct pci_dev *pdev;
+	struct device *found = NULL;	/* the list cursor is never NULL */
+
+	spin_lock(&pci_lock);
+	list_for_each_entry(pdev, &pci_devices, links) {
+		if (irq == pdev->dev.irq ||
+		    (irq >= pdev->dev.msix && irq < pdev->dev.msix_max)) {
+			found = &pdev->dev;
+			break;
+		}
+	}
+	spin_unlock(&pci_lock);
+	return (found);	/* NULL when no device owns irq */
+}
+
+static inline unsigned long
+pci_resource_start(struct pci_dev *pdev, int bar)
+{
+	struct resource_list_entry *rle;
+
+	/* A BAR without a resource list entry reports 0. */
+	rle = _pci_get_bar(pdev, bar);
+	return (rle == NULL ? 0 : rle->start);
+}
+
+static inline unsigned long
+pci_resource_len(struct pci_dev *pdev, int bar)
+{
+	struct resource_list_entry *rle;
+
+	/* A BAR without a resource list entry reports a length of 0. */
+	rle = _pci_get_bar(pdev, bar);
+	return (rle == NULL ? 0 : rle->count);
+}
+
+/*
+ * All drivers just seem to want to inspect the type not flags.
+ */
+static inline int
+pci_resource_flags(struct pci_dev *pdev, int bar)
+{
+	struct resource_list_entry *rle;
+
+	/* 0 stands for "no such BAR"; otherwise the entry's resource type. */
+	rle = _pci_get_bar(pdev, bar);
+	return (rle == NULL ? 0 : rle->type);
+}
+
+static inline const char *
+pci_name(struct pci_dev *d)
+{
+	/* Reports the newbus description string of the underlying device. */
+	return device_get_desc(d->dev.bsddev);
+}
+
+static inline void *
+pci_get_drvdata(struct pci_dev *pdev)
+{
+	/* Driver data lives on the embedded struct device. */
+	return dev_get_drvdata(&pdev->dev);
+}
+
+static inline void
+pci_set_drvdata(struct pci_dev *pdev, void *data)
+{
+	/* Driver data lives on the embedded struct device. */
+	dev_set_drvdata(&pdev->dev, data);
+}
+
+static inline int
+pci_enable_device(struct pci_dev *pdev)
+{
+	/* Results of pci_enable_io() are discarded; 0 is always returned. */
+	pci_enable_io(pdev->dev.bsddev, SYS_RES_IOPORT);
+	pci_enable_io(pdev->dev.bsddev, SYS_RES_MEMORY);
+	return (0);
+}
+
+static inline void
+pci_disable_device(struct pci_dev *pdev)
+{	/* no-op: nothing is undone here */
+}
+
+static inline int
+pci_set_master(struct pci_dev *pdev)
+{
+	/* The result of pci_enable_busmaster() is not checked. */
+	pci_enable_busmaster(pdev->dev.bsddev);
+	return (0);
+}
+
+static inline int
+pci_request_region(struct pci_dev *pdev, int bar, const char *res_name)
+{
+	int rid, type;
+
+	/* res_name is unused here; type 0 means the BAR has no entry. */
+	if ((type = pci_resource_flags(pdev, bar)) == 0)
+		return (-ENODEV);
+	rid = PCIR_BAR(bar);
+	/* The handle is not kept; pci_release_region() uses rle->res. */
+	if (bus_alloc_resource_any(pdev->dev.bsddev, type, &rid,
+	    RF_ACTIVE) == NULL)
+		return (-EINVAL);
+	return (0);
+}
+
+static inline void
+pci_release_region(struct pci_dev *pdev, int bar)
+{
+	struct resource_list_entry *rle;
+
+	if ((rle = _pci_get_bar(pdev, bar)) == NULL || rle->res == NULL)
+		return;	/* BAR is absent or holds no resource to release */
+	bus_release_resource(pdev->dev.bsddev, rle->type, rle->rid, rle->res);
+}
+
+static inline void
+pci_release_regions(struct pci_dev *pdev)
+{
+	int bar;	/* BAR index, 0 through PCIR_MAX_BAR_0 inclusive */
+
+	for (bar = 0; bar <= PCIR_MAX_BAR_0; bar++)
+		pci_release_region(pdev, bar);
+}
+
+static inline int
+pci_request_regions(struct pci_dev *pdev, const char *res_name)
+{
+	int bar, error;
+
+	/* -ENODEV only means "no such BAR"; any other failure unwinds. */
+	for (bar = 0; bar <= PCIR_MAX_BAR_0; bar++) {
+		error = pci_request_region(pdev, bar, res_name);
+		if (error == 0 || error == -ENODEV)
+			continue;
+		pci_release_regions(pdev);
+		return (error);
+	}
+	return (0);
+}
+
+static inline void
+pci_disable_msix(struct pci_dev *pdev)
+{
+	/* The result of pci_release_msi() is not checked. */
+	pci_release_msi(pdev->dev.bsddev);
+}
+
+#define	PCI_CAP_ID_EXP	PCIY_EXPRESS
+#define	PCI_CAP_ID_PCIX	PCIY_PCIX
+
+static inline int
+pci_find_capability(struct pci_dev *pdev, int capid)
+{
+	int reg;
+
+	/* Non-zero from pci_find_extcap() means the lookup failed: report 0, */
+	/* otherwise report the value it stored in reg. */
+	return (pci_find_extcap(pdev->dev.bsddev, capid, &reg) ? 0 : reg);
+}
+
+static inline int
+pci_read_config_byte(struct pci_dev *pdev, int where, u8 *val)
+{
+	/* 1-byte config read at offset where; always reports success. */
+	*val = (u8)pci_read_config(pdev->dev.bsddev, where, 1);
+	return (0);
+}
+
+static inline int
+pci_read_config_word(struct pci_dev *pdev, int where, u16 *val)
+{
+	/* 2-byte config read at offset where; always reports success. */
+	*val = (u16)pci_read_config(pdev->dev.bsddev, where, 2);
+	return (0);
+}
+
+static inline int
+pci_read_config_dword(struct pci_dev *pdev, int where, u32 *val)
+{
+	/* 4-byte config read at offset where; always reports success. */
+	*val = (u32)pci_read_config(pdev->dev.bsddev, where, 4);
+	return (0);
+}
+
+static inline int
+pci_write_config_byte(struct pci_dev *pdev, int where, u8 val)
+{
+	/* 1-byte config write at offset where; always reports success. */
+	pci_write_config(pdev->dev.bsddev, where, val, 1);
+	return (0);
+}
+
+static inline int
+pci_write_config_word(struct pci_dev *pdev, int where, u16 val)
+{
+	/* 2-byte config write at offset where; always reports success. */
+	pci_write_config(pdev->dev.bsddev, where, val, 2);
+	return (0);
+}
+
+static inline int
+pci_write_config_dword(struct pci_dev *pdev, int where, u32 val)
+{
+	/* 4-byte config write at offset where; always reports success. */
+	pci_write_config(pdev->dev.bsddev, where, val, 4);
+	return (0);
+}
+
+static inline struct pci_driver *
+linux_pci_find(device_t dev, struct pci_device_id **idp)
+{
+	struct pci_device_id *id;
+	struct pci_driver *pdrv;
+	uint16_t vendor, device;
+
+	/* First driver whose id table lists dev's vendor/device pair wins and
+	 * its entry is stored in *idp; on no match return NULL, *idp unset. */
+	vendor = pci_get_vendor(dev);
+	device = pci_get_device(dev);
+	spin_lock(&pci_lock);
+	list_for_each_entry(pdrv, &pci_drivers, links) {
+		for (id = pdrv->id_table; id->vendor != 0; id++) {
+			if (vendor == id->vendor && device == id->device) {
+				*idp = id;
+				spin_unlock(&pci_lock);
+				return (pdrv);
+			}
+		}
+	}
+	spin_unlock(&pci_lock);
+	return (NULL);
+}
+
+static inline int
+linux_pci_probe(device_t dev)
+{
+	struct pci_device_id *id;
+	struct pci_driver *pdrv;
+
+	/* Accept dev only when the matching driver is the one set on dev. */
+	pdrv = linux_pci_find(dev, &id);
+	if (pdrv == NULL || device_get_driver(dev) != &pdrv->driver)
+		return (ENXIO);
+	device_set_desc(dev, pdrv->name);
+	return (0);
+}
+
+static inline int
+linux_pci_attach(device_t dev)
+{
+	struct resource_list_entry *rle;
+	struct pci_dev *pdev;
+	struct pci_driver *pdrv;
+	struct pci_device_id *id;
+	int error;
+	/* Build the Linux pci_dev in the softc, then run the driver's probe. */
+	pdrv = linux_pci_find(dev, &id);	/* NOTE(review): NULL not handled */
+	pdev = device_get_softc(dev);
+	pdev->dev.parent = &linux_rootdev;
+	pdev->dev.bsddev = dev;
+	INIT_LIST_HEAD(&pdev->dev.irqents);
+	pdev->device = id->device;
+	pdev->vendor = id->vendor;
+	pdev->dev.dma_mask = &pdev->dma_mask;
+	pdev->pdrv = pdrv;
+	kobject_init(&pdev->dev.kobj, &dev_ktype);
+	kobject_set_name(&pdev->dev.kobj, device_get_nameunit(dev));
+	kobject_add(&pdev->dev.kobj, &linux_rootdev.kobj,
+	    kobject_name(&pdev->dev.kobj));
+	rle = _pci_get_rle(pdev, SYS_RES_IRQ, 0);	/* rid 0 IRQ entry */
+	if (rle)
+		pdev->dev.irq = rle->start;
+	else
+		pdev->dev.irq = 0;
+	pdev->irq = pdev->dev.irq;
+	mtx_unlock(&Giant);	/* Giant is dropped around the probe callback */
+	spin_lock(&pci_lock);
+	list_add(&pdev->links, &pci_devices);
+	spin_unlock(&pci_lock);
+	error = pdrv->probe(pdev, id);
+	mtx_lock(&Giant);
+	if (error) {
+		spin_lock(&pci_lock);
+		list_del(&pdev->links);	/* undo the list_add above */
+		spin_unlock(&pci_lock);
+		put_device(&pdev->dev);
+		return (-error);	/* sign of the probe result is flipped */
+	}
+	return (0);
+}
+
+static inline int
+linux_pci_detach(device_t dev)
+{
+	struct pci_dev *pdev;
+
+	pdev = device_get_softc(dev);
+	mtx_unlock(&Giant);	/* Giant is dropped around the remove callback */
+	pdev->pdrv->remove(pdev);
+	mtx_lock(&Giant);
+	spin_lock(&pci_lock);
+	list_del(&pdev->links);	/* take the device off pci_devices */
+	spin_unlock(&pci_lock);
+	put_device(&pdev->dev);
+
+	return (0);
+}
+
+static device_method_t pci_methods[] = {	/* shared by all pci_drivers */
+	DEVMETHOD(device_probe, linux_pci_probe),
+	DEVMETHOD(device_attach, linux_pci_attach),
+	DEVMETHOD(device_detach, linux_pci_detach),
+	{0, 0}	/* table terminator */
+};
+
+static inline int
+pci_register_driver(struct pci_driver *pdrv)
+{
+	devclass_t bus;
+	int error;
+
+	/* Put the driver on pci_drivers, where linux_pci_find() looks. */
+	spin_lock(&pci_lock);
+	list_add(&pdrv->links, &pci_drivers);
+	spin_unlock(&pci_lock);
+	/* Fill in the embedded newbus driver and add it to the "pci" class. */
+	pdrv->driver.name = pdrv->name;
+	pdrv->driver.methods = pci_methods;
+	pdrv->driver.size = sizeof(struct pci_dev);
+	bus = devclass_find("pci");
+	mtx_lock(&Giant);
+	error = devclass_add_driver(bus, &pdrv->driver, BUS_PASS_DEFAULT,
+	    &pdrv->bsdclass);
+	mtx_unlock(&Giant);
+	return (-error);	/* NOTE(review): on failure pdrv stays listed */
+}
+
+static inline void
+pci_unregister_driver(struct pci_driver *pdrv)
+{
+	/* pci_drivers is modified under pci_lock, as in pci_register_driver(). */
+	spin_lock(&pci_lock);
+	list_del(&pdrv->links);
+	spin_unlock(&pci_lock);
+	mtx_lock(&Giant);
+	devclass_delete_driver(devclass_find("pci"), &pdrv->driver);
+	mtx_unlock(&Giant);
+}
+
+struct msix_entry {
+	int entry;	/* not read or written by pci_enable_msix() here */
+	int vector;	/* IRQ number stored by pci_enable_msix() */
+};
+
+/*
+ * Enable MSI-X.  Returns 0 on success, a positive count of usable vectors
+ * when fewer than nreq can be had (nothing stays allocated), or -errno.
+ */
+static inline int
+pci_enable_msix(struct pci_dev *pdev, struct msix_entry *entries, int nreq)
+{
+	struct resource_list_entry *rle;
+	int avail, error, i;
+
+	avail = pci_msix_count(pdev->dev.bsddev);
+	if (avail < nreq)
+		return (avail == 0 ? -EINVAL : avail);
+	avail = nreq;
+	if ((error = -pci_alloc_msix(pdev->dev.bsddev, &avail)) != 0)
+		return (error);
+	/* pci_alloc_msix() may succeed with fewer vectors than requested. */
+	rle = _pci_get_rle(pdev, SYS_RES_IRQ, 1);
+	if (avail < nreq || rle == NULL) {
+		pci_release_msi(pdev->dev.bsddev);
+		return (rle == NULL ? -EINVAL : avail);
+	}
+	pdev->dev.msix = rle->start;
+	pdev->dev.msix_max = rle->start + avail;
+	for (i = 0; i < nreq; i++)
+		entries[i].vector = pdev->dev.msix + i;
+	return (0);
+}
+
+/* XXX This should not be necessary. */
+#define	pcix_set_mmrbc(d, v)	0
+#define	pcix_get_max_mmrbc(d)	0
+#define	pcie_set_readrq(d, v)	0
+/* The pci_* DMA wrappers below cast these to enum dma_data_direction. */
+#define	PCI_DMA_BIDIRECTIONAL	0
+#define	PCI_DMA_TODEVICE	1
+#define	PCI_DMA_FROMDEVICE	2
+#define	PCI_DMA_NONE		3
+
+#define	pci_pool		dma_pool
+#define pci_pool_destroy	dma_pool_destroy
+#define pci_pool_alloc		dma_pool_alloc
+#define pci_pool_free		dma_pool_free
+#define	pci_pool_create(_name, _pdev, _size, _align, _alloc)		\
+	    dma_pool_create(_name, &(_pdev)->dev, _size, _align, _alloc)
+#define	pci_free_consistent(_hwdev, _size, _vaddr, _dma_handle)		\
+	    dma_free_coherent((_hwdev) == NULL ? NULL : &(_hwdev)->dev,	\
+		_size, _vaddr, _dma_handle)
+#define	pci_map_sg(_hwdev, _sg, _nents, _dir)				\
+	    dma_map_sg((_hwdev) == NULL ? NULL : &(_hwdev)->dev,	\
+		_sg, _nents, (enum dma_data_direction)_dir) /* NULL dev if no _hwdev */
+#define	pci_map_single(_hwdev, _ptr, _size, _dir)			\
+	    dma_map_single((_hwdev) == NULL ? NULL : &(_hwdev)->dev,	\
+		(_ptr), (_size), (enum dma_data_direction)_dir) /* likewise */
+#define	pci_unmap_single(_hwdev, _addr, _size, _dir)			\
+	    dma_unmap_single((_hwdev) == NULL ? NULL : &(_hwdev)->dev,	\
+		_addr, _size, (enum dma_data_direction)_dir)
+#define	pci_unmap_sg(_hwdev, _sg, _nents, _dir)				\
+	    dma_unmap_sg((_hwdev) == NULL ? NULL : &(_hwdev)->dev,	\
+		_sg, _nents, (enum dma_data_direction)_dir)
+#define	pci_map_page(_hwdev, _page, _offset, _size, _dir)		\
+	    dma_map_page((_hwdev) == NULL ? NULL : &(_hwdev)->dev, _page,\
+		_offset, _size, (enum dma_data_direction)_dir)
+#define	pci_unmap_page(_hwdev, _dma_address, _size, _dir)		\
+	    dma_unmap_page((_hwdev) == NULL ? NULL : &(_hwdev)->dev,	\
+		_dma_address, _size, (enum dma_data_direction)_dir)
+#define	pci_set_dma_mask(_pdev, mask)	dma_set_mask(&(_pdev)->dev, (mask))
+#define	pci_dma_mapping_error(_pdev, _dma_addr)				\
+	    dma_mapping_error(&(_pdev)->dev, _dma_addr)
+#define	pci_set_consistent_dma_mask(_pdev, _mask)			\
+	    dma_set_coherent_mask(&(_pdev)->dev, (_mask))
+#define	DECLARE_PCI_UNMAP_ADDR(x)	DEFINE_DMA_UNMAP_ADDR(x);
+#define	DECLARE_PCI_UNMAP_LEN(x)	DEFINE_DMA_UNMAP_LEN(x);
+#define	pci_unmap_addr		dma_unmap_addr
+#define	pci_unmap_addr_set	dma_unmap_addr_set
+#define	pci_unmap_len		dma_unmap_len
+#define	pci_unmap_len_set	dma_unmap_len_set
+
+
+#endif	/* _LINUX_PCI_H_ */
diff --git a/sys/ofed/include/linux/poll.h b/sys/ofed/include/linux/poll.h
new file mode 100644
index 0000000..5b7f34e
--- /dev/null
+++ b/sys/ofed/include/linux/poll.h
@@ -0,0 +1,44 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_LINUX_POLL_H_
+#define	_LINUX_POLL_H_
+
+#include <sys/poll.h>
+#include <sys/fcntl.h>
+
+typedef struct poll_table_struct {
+} poll_table;
+
+static inline void
+poll_wait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p)
+{
+	selrecord(curthread, &filp->f_selinfo);	/* wait_address, p unused */
+}
+
+#endif	/* _LINUX_POLL_H_ */
diff --git a/sys/ofed/include/linux/radix-tree.h b/sys/ofed/include/linux/radix-tree.h
new file mode 100644
index 0000000..a02a90f
--- /dev/null
+++ b/sys/ofed/include/linux/radix-tree.h
@@ -0,0 +1,60 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_LINUX_RADIX_TREE_H_
+#define	_LINUX_RADIX_TREE_H_
+
+#define	RADIX_TREE_MAP_SHIFT	6
+#define	RADIX_TREE_MAP_SIZE	(1 << RADIX_TREE_MAP_SHIFT)
+#define	RADIX_TREE_MAP_MASK	(RADIX_TREE_MAP_SIZE - 1)
+#define	RADIX_TREE_MAX_HEIGHT						\
+	    DIV_ROUND_UP((sizeof(long) * NBBY), RADIX_TREE_MAP_SHIFT)
+
+struct radix_tree_node {
+	void		*slots[RADIX_TREE_MAP_SIZE];	/* 1 << MAP_SHIFT slots */
+	int		count;
+};
+
+struct radix_tree_root {
+	struct radix_tree_node	*rnode;		/* NULL right after init */
+	gfp_t			gfp_mask;
+	int			height;		/* 0 right after init */
+};
+
+#define	RADIX_TREE_INIT(mask)						\
+	    { .rnode = NULL, .gfp_mask = mask, .height = 0 };
+#define	INIT_RADIX_TREE(root, mask)					\
+	    { (root)->rnode = NULL; (root)->gfp_mask = mask; (root)->height = 0; }
+#define	RADIX_TREE(name, mask)						\
+	    struct radix_tree_root name = RADIX_TREE_INIT(mask)
+
+void	*radix_tree_lookup(struct radix_tree_root *, unsigned long);
+void	*radix_tree_delete(struct radix_tree_root *, unsigned long);
+int	radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
+
+#endif	/* _LINUX_RADIX_TREE_H_ */
diff --git a/sys/ofed/include/linux/random.h b/sys/ofed/include/linux/random.h
new file mode 100644
index 0000000..84a24c8
--- /dev/null
+++ b/sys/ofed/include/linux/random.h
@@ -0,0 +1,40 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_LINUX_RANDOM_H_
+#define	_LINUX_RANDOM_H_
+
+#include <sys/random.h>
+
+static inline void
+get_random_bytes(void *buf, int nbytes)
+{
+	read_random(buf, nbytes);	/* forwards to read_random() from <sys/random.h> */
+}
+
+#endif	/* _LINUX_RANDOM_H_ */
diff --git a/sys/ofed/include/linux/rbtree.h b/sys/ofed/include/linux/rbtree.h
new file mode 100644
index 0000000..ea9afc3
--- /dev/null
+++ b/sys/ofed/include/linux/rbtree.h
@@ -0,0 +1,111 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef	_LINUX_RBTREE_H_
+#define	_LINUX_RBTREE_H_
+
+#include <sys/stddef.h>
+#include <sys/tree.h>
+
+struct rb_node {
+	RB_ENTRY(rb_node)	__entry;
+};
+#define	rb_left		__entry.rbe_left
+#define	rb_right	__entry.rbe_right
+
+/*
+ * We provide a false structure that has the same bit pattern as tree.h
+ * presents so it matches the member names expected by linux.
+ */
+struct rb_root {
+	struct	rb_node	*rb_node;
+};
+
+/*
+ * In linux all of the comparisons are done by the caller.
+ */
+int panic_cmp(struct rb_node *one, struct rb_node *two);
+
+RB_HEAD(linux_root, rb_node);
+RB_PROTOTYPE(linux_root, rb_node, __entry, panic_cmp);
+
+#define	rb_parent(r)	RB_PARENT(r, __entry)
+#define	rb_color(r)	RB_COLOR(r, __entry)
+#define	rb_is_red(r)	(rb_color(r) == RB_RED)
+#define	rb_is_black(r)	(rb_color(r) == RB_BLACK)
+#define	rb_set_parent(r, p)	rb_parent((r)) = (p)
+#define	rb_set_color(r, c)	rb_color((r)) = (c)
+#define	rb_entry(ptr, type, member)	container_of(ptr, type, member)
+
+#define RB_EMPTY_ROOT(root)     ((root)->rb_node == NULL) /* not RB_EMPTY(): RB_ROOT is redefined below */
+#define RB_EMPTY_NODE(node)     (rb_parent(node) == (node))
+#define RB_CLEAR_NODE(node)     (rb_set_parent(node, node))
+
+#define	rb_insert_color(node, root)					\
+	linux_root_RB_INSERT_COLOR((struct linux_root *)(root), (node))
+#define	rb_erase(node, root)						\
+	linux_root_RB_REMOVE((struct linux_root *)(root), (node))
+#define	rb_next(node)	RB_NEXT(linux_root, NULL, (node))
+#define	rb_prev(node)	RB_PREV(linux_root, NULL, (node))
+#define	rb_first(root)	RB_MIN(linux_root, (struct linux_root *)(root))
+#define	rb_last(root)	RB_MAX(linux_root, (struct linux_root *)(root))
+
+static inline void
+rb_link_node(struct rb_node *node, struct rb_node *parent,
+    struct rb_node **rb_link)
+{
+	rb_set_parent(node, parent);
+	rb_set_color(node, RB_RED);
+	node->__entry.rbe_left = node->__entry.rbe_right = NULL;
+	*rb_link = node;
+}
+
+static inline void
+rb_replace_node(struct rb_node *victim, struct rb_node *new,
+    struct rb_root *root)
+{
+	struct rb_node *p;
+	/* Put new in victim's position; no recoloring or rebalancing is done. */
+	p = rb_parent(victim);
+	if (p) {	/* repoint whichever child link of the parent held victim */
+		if (p->rb_left == victim)
+			p->rb_left = new;
+		else
+			p->rb_right = new;
+	} else	/* a NULL parent is treated as "victim is the root" */
+		root->rb_node = new;
+	if (victim->rb_left)	/* children must point back at new */
+		rb_set_parent(victim->rb_left, new);
+	if (victim->rb_right)
+		rb_set_parent(victim->rb_right, new);
+	*new = *victim;	/* new takes over victim's entire __entry */
+}
+
+#undef RB_ROOT
+#define RB_ROOT		(struct rb_root) { NULL }
+
+#endif	/* _LINUX_RBTREE_H_ */
diff --git a/sys/ofed/include/linux/rtnetlink.h b/sys/ofed/include/linux/rtnetlink.h
new file mode 100644
index 0000000..e5d814e
--- /dev/null
+++ b/sys/ofed/include/linux/rtnetlink.h
@@ -0,0 +1,27 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
diff --git a/sys/ofed/include/linux/rwlock.h b/sys/ofed/include/linux/rwlock.h
new file mode 100644
index 0000000..0162455
--- /dev/null
+++ b/sys/ofed/include/linux/rwlock.h
@@ -0,0 +1,63 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef	_LINUX_RWLOCK_H_
+#define	_LINUX_RWLOCK_H_
+
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+
+typedef struct {
+	struct rwlock rw;
+} rwlock_t;
+
+#define	read_lock(_l)		rw_rlock(&(_l)->rw)
+#define	write_lock(_l)		rw_wlock(&(_l)->rw)
+#define	read_unlock(_l)		rw_runlock(&(_l)->rw)
+#define	write_unlock(_l)	rw_wunlock(&(_l)->rw)
+#define	read_lock_irq(lock)	read_lock((lock))
+#define	read_unlock_irq(lock)	read_unlock((lock))
+#define	write_lock_irq(lock)	write_lock((lock))
+#define	write_unlock_irq(lock)	write_unlock((lock))
+#define	read_lock_irqsave(lock, flags)   				\
+    do {(flags) = 0; read_lock(lock); } while (0)
+#define	write_lock_irqsave(lock, flags)   				\
+    do {(flags) = 0; write_lock(lock); } while (0)
+#define	read_unlock_irqrestore(lock, flags)				\
+    do { read_unlock(lock); } while (0)
+#define	write_unlock_irqrestore(lock, flags)				\
+    do { write_unlock(lock); } while (0)
+
+static inline void
+rwlock_init(rwlock_t *lock)
+{
+	/* Zero the rwlock, then rw_init_flags() it with RW_NOWITNESS. */
+	memset(&lock->rw, 0, sizeof(lock->rw));
+	rw_init_flags(&lock->rw, "lnxrw", RW_NOWITNESS);
+}
+
+#endif	/* _LINUX_RWLOCK_H_ */
diff --git a/sys/ofed/include/linux/rwsem.h b/sys/ofed/include/linux/rwsem.h
new file mode 100644
index 0000000..f87c9d9
--- /dev/null
+++ b/sys/ofed/include/linux/rwsem.h
@@ -0,0 +1,56 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef	_LINUX_RWSEM_H_
+#define	_LINUX_RWSEM_H_
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/sx.h>
+
+struct rw_semaphore {
+	struct sx sx;
+};
+
+#define	down_write(_rw)			sx_xlock(&(_rw)->sx)
+#define	up_write(_rw)			sx_xunlock(&(_rw)->sx)
+#define	down_read(_rw)			sx_slock(&(_rw)->sx)
+#define	up_read(_rw)			sx_sunlock(&(_rw)->sx)
+#define	down_read_trylock(_rw)		!!sx_try_slock(&(_rw)->sx)
+#define	down_write_trylock(_rw)		!!sx_try_xlock(&(_rw)->sx)
+#define	downgrade_write(_rw)		sx_downgrade(&(_rw)->sx)
+#define	down_read_nested(_rw, _sc)	down_read(_rw)
+
+static inline void
+init_rwsem(struct rw_semaphore *rw)
+{
+	/* Zero the sx lock, then sx_init_flags() it with SX_NOWITNESS. */
+	memset(&rw->sx, 0, sizeof(rw->sx));
+	sx_init_flags(&rw->sx, "lnxrwsem", SX_NOWITNESS);
+}
+
+#endif	/* _LINUX_RWSEM_H_ */
diff --git a/sys/ofed/include/linux/scatterlist.h b/sys/ofed/include/linux/scatterlist.h
new file mode 100644
index 0000000..611ad56
--- /dev/null
+++ b/sys/ofed/include/linux/scatterlist.h
@@ -0,0 +1,98 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef	_LINUX_SCATTERLIST_H_
+#define	_LINUX_SCATTERLIST_H_
+
+#include <linux/string.h>
+#include <linux/page.h>
+
+struct scatterlist {
+	union {
+		struct page		*page;
+		struct scatterlist	*sg;
+	} sl_un;
+	unsigned long	address;
+	unsigned long	offset;
+	uint32_t	length;
+	uint32_t	flags;
+};
+
+#define	sg_dma_address(sg)	(sg)->address
+#define	sg_dma_len(sg)		(sg)->length
+#define	sg_page(sg)		(sg)->sl_un.page
+#define	sg_scatternext(sg)	(sg)->sl_un.sg
+
+#define	SG_END		0x01
+#define	SG_CHAIN	0x02
+
+static inline void
+sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len,
+    unsigned int offset)
+{
+	sg_page(sg) = page;	/* record page, length and offset in the entry */
+	sg_dma_len(sg) = len;
+	sg->offset = offset;
+	if (offset > PAGE_SIZE)	/* offset is unsigned, so report it with %u */
+		panic("sg_set_page: Invalid offset %u\n", offset);
+}
+
+static inline void
+sg_set_buf(struct scatterlist *sg, const void *buf, unsigned int buflen)
+{
+	sg_set_page(sg, virt_to_page(buf), buflen,
+	    ((uintptr_t)buf) & ~PAGE_MASK);
+}
+
+static inline void
+sg_init_table(struct scatterlist *sg, unsigned int nents)
+{
+	bzero(sg, sizeof(*sg) * nents);
+	sg[nents - 1].flags = SG_END;
+}
+
+static inline struct scatterlist *
+sg_next(struct scatterlist *sg)
+{
+	if (sg->flags & SG_END)
+		return (NULL);
+	sg++;
+	if (sg->flags & SG_CHAIN)
+		sg = sg_scatternext(sg);
+	return (sg);
+}
+
+static inline vm_paddr_t
+sg_phys(struct scatterlist *sg)
+{
+	return sg_page(sg)->phys_addr + sg->offset;
+}
+
+#define	for_each_sg(sglist, sg, sgmax, _itr)				\
+	for (_itr = 0, sg = (sglist); _itr < (sgmax); _itr++, sg = sg_next(sg))
+
+#endif	/* _LINUX_SCATTERLIST_H_ */
diff --git a/sys/ofed/include/linux/sched.h b/sys/ofed/include/linux/sched.h
new file mode 100644
index 0000000..414b0ac
--- /dev/null
+++ b/sys/ofed/include/linux/sched.h
@@ -0,0 +1,109 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef	_LINUX_SCHED_H_
+#define	_LINUX_SCHED_H_
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/sched.h>
+#include <sys/sleepqueue.h>
+
+#define	MAX_SCHEDULE_TIMEOUT	LONG_MAX
+
+#define	TASK_RUNNING		0
+#define	TASK_INTERRUPTIBLE	1
+#define	TASK_UNINTERRUPTIBLE	2
+#define	TASK_DEAD		64
+#define	TASK_WAKEKILL		128
+#define	TASK_WAKING		256
+
+#define	TASK_SHOULD_STOP	1
+#define	TASK_STOPPED		2
+
+/*
+ * A task_struct is only provided for those tasks created with kthread.
+ * Using these routines with threads not started via kthread will cause
+ * panics because no task_struct is allocated and td_retval[1] is
+ * overwritten by syscalls which kernel threads will not make use of.
+ */
+struct task_struct {
+	struct	thread *task_thread;
+	int	(*task_fn)(void *data);
+	void	*task_data;
+	int	task_ret;
+	int	state;
+	int	should_stop;
+};
+
+#define	current			((struct task_struct *)curthread->td_retval[1])
+#define	task_struct_get(x)	((struct task_struct *)(x)->td_retval[1]) /* parenthesized so "->field" can follow */
+#define	task_struct_set(x, y)	((x)->td_retval[1] = (register_t)(y))
+
+#define	set_current_state(x)						\
+	atomic_store_rel_int((volatile int *)&current->state, (x))
+#define	__set_current_state(x)	current->state = (x)
+
+
+#define	schedule()							\
+do {									\
+	void *c;							\
+									\
+	if (cold)							\
+		break;							\
+	c = curthread;							\
+	sleepq_lock(c);							\
+	if (current->state == TASK_INTERRUPTIBLE ||			\
+	    current->state == TASK_UNINTERRUPTIBLE) {			\
+		sleepq_add(c, NULL, "task", SLEEPQ_SLEEP, 0);		\
+		sleepq_wait(c, 0);					\
+	} else {							\
+		sleepq_release(c);					\
+		sched_relinquish(curthread);				\
+	}								\
+} while (0)
+
+#define	wake_up_process(x)						\
+do {									\
+	int wakeup_swapper;						\
+	void *c;							\
+									\
+	c = (x)->task_thread;						\
+	sleepq_lock(c);							\
+	(x)->state = TASK_RUNNING;					\
+	wakeup_swapper = sleepq_signal(c, SLEEPQ_SLEEP, 0, 0);		\
+	sleepq_release(c);						\
+	if (wakeup_swapper)						\
+		kick_proc0();						\
+} while (0)
+/* Yield the CPU unless `cold` is set; do/while keeps this a single statement. */
+#define	cond_resched()	do { if (!cold) sched_relinquish(curthread); } while (0)
+
+#define	sched_yield()	sched_relinquish(curthread)
+
+#endif	/* _LINUX_SCHED_H_ */
diff --git a/sys/ofed/include/linux/semaphore.h b/sys/ofed/include/linux/semaphore.h
new file mode 100644
index 0000000..4b9fd56
--- /dev/null
+++ b/sys/ofed/include/linux/semaphore.h
@@ -0,0 +1,66 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_SEMAPHORE_H_
+#define _LINUX_SEMAPHORE_H_
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/sema.h>
+
+/*
+ * XXX BSD semaphores are disused and slow.  They also do not provide a
+ * sema_wait_sig method.  This must be resolved eventually.
+ */
+struct semaphore {
+	struct sema	sema;
+};
+
+#define	down(_sem)			sema_wait(&(_sem)->sema)
+#define	down_interruptible(_sem)	(sema_wait(&(_sem)->sema), 0) /* plain wait; value is always 0 */
+#define	down_trylock(_sem)		(!sema_trywait(&(_sem)->sema)) /* negates sema_trywait()'s result */
+#define	up(_sem)			sema_post(&(_sem)->sema)
+
+static inline void
+linux_sema_init(struct semaphore *sem, int val)
+{
+	/* Start from zeroed storage, then initialize the count to val. */
+	memset(&sem->sema, 0, sizeof(sem->sema));
+	sema_init(&sem->sema, val, "lnxsema");
+}
+
+static inline void
+init_MUTEX(struct semaphore *sem)
+{
+	/* Identical setup to linux_sema_init(), with the initial */
+	/* count fixed at 1. */
+	linux_sema_init(sem, 1);
+}
+
+#define	sema_init	linux_sema_init
+
+#endif /* _LINUX_SEMAPHORE_H_ */
diff --git a/sys/ofed/include/linux/slab.h b/sys/ofed/include/linux/slab.h
new file mode 100644
index 0000000..5e7e608
--- /dev/null
+++ b/sys/ofed/include/linux/slab.h
@@ -0,0 +1,102 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef	_LINUX_SLAB_H_
+#define	_LINUX_SLAB_H_
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <vm/uma.h>
+
+#include <linux/types.h>
+#include <linux/gfp.h>
+
+MALLOC_DECLARE(M_KMALLOC);
+
+#define	kmalloc(size, flags)	malloc((size), M_KMALLOC, (flags))
+#define	kzalloc(size, flags)	kmalloc((size), (flags) | M_ZERO)
+#define	kfree(ptr)		free(__DECONST(void *, (ptr)), M_KMALLOC)
+#define	krealloc(ptr, size, flags) realloc((ptr), (size), M_KMALLOC, (flags))
+#define	kcalloc(n, size, flags)	kmalloc((n) * (size), (flags) | M_ZERO) /* NOTE(review): (n) * (size) is not overflow-checked */
+
+struct kmem_cache {
+	uma_zone_t	cache_zone;
+	void		(*cache_ctor)(void *);
+};
+
+#define	SLAB_HWCACHE_ALIGN	0x0001
+
+static inline int
+kmem_ctor(void *mem, int size, void *arg, int flags)
+{
+	void (*ctor)(void *);
+
+	ctor = arg;
+	ctor(mem);
+
+	return (0);
+}
+
+static inline struct kmem_cache *
+kmem_cache_create(char *name, size_t size, size_t align, u_long flags,
+    void (*ctor)(void *))
+{
+	struct kmem_cache *c;
+
+	c = malloc(sizeof(*c), M_KMALLOC, M_WAITOK);
+	if (align)
+		align--;
+	if (flags & SLAB_HWCACHE_ALIGN)
+		align = UMA_ALIGN_CACHE;
+	c->cache_zone = uma_zcreate(name, size, ctor ? kmem_ctor : NULL,
+	    NULL, NULL, NULL, align, 0);
+	c->cache_ctor = ctor;
+
+	return c;
+}
+
+static inline void *
+kmem_cache_alloc(struct kmem_cache *c, int flags)
+{
+	return uma_zalloc_arg(c->cache_zone, c->cache_ctor, flags);
+}
+
+static inline void
+kmem_cache_free(struct kmem_cache *c, void *m)
+{
+	uma_zfree(c->cache_zone, m);
+}
+
+static inline void
+kmem_cache_destroy(struct kmem_cache *c)
+{
+	uma_zdestroy(c->cache_zone);
+	free(c, M_KMALLOC);
+}
+
+#endif	/* _LINUX_SLAB_H_ */
diff --git a/sys/ofed/include/linux/socket.h b/sys/ofed/include/linux/socket.h
new file mode 100644
index 0000000..e14c982
--- /dev/null
+++ b/sys/ofed/include/linux/socket.h
@@ -0,0 +1,66 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef	_LINUX_SOCKET_H_
+#define	_LINUX_SOCKET_H_
+
+#include <sys/socket.h>
+
+#ifdef notyet
+static inline int
+memcpy_toiovec(struct iovec *v, unsigned char *kdata, int len)
+{
+	struct uio uio;
+	int error;
+
+	uio.uio_iov = v;
+	uio.uio_iovcnt = -1;
+	uio.uio_offset = 0;
+	uio.uio_resid = len;
+	uio.uio_segflag = UIO_USERSPACE;
+	uio.uio_rw = UIO_READ;
+	error = -uiomove(kdata, len, &uio);
+	return (error);
+}
+
+static inline int
+memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len)
+{
+	struct uio uio;
+
+	uio.uio_iov = iov;
+	uio.uio_iovcnt = -1;
+	uio.uio_offset = 0;
+	uio.uio_resid = len;
+	uio.uio_segflag = UIO_USERSPACE;
+	uio.uio_rw = UIO_WRITE;
+	/* Move len bytes from iov into kdata; returns uiomove()'s result negated. */
+	return (-uiomove(kdata, len, &uio));
+}
+#endif
+
+#endif	/* _LINUX_SOCKET_H_ */
diff --git a/sys/ofed/include/linux/spinlock.h b/sys/ofed/include/linux/spinlock.h
new file mode 100644
index 0000000..4b972f4
--- /dev/null
+++ b/sys/ofed/include/linux/spinlock.h
@@ -0,0 +1,68 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef	_LINUX_SPINLOCK_H_
+#define	_LINUX_SPINLOCK_H_
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/lockdep.h>
+#include <linux/rwlock.h>
+
+typedef struct {
+	struct mtx m;
+} spinlock_t;
+
+#define	spin_lock(_l)		mtx_lock(&(_l)->m)
+#define	spin_unlock(_l)		mtx_unlock(&(_l)->m)
+#define	spin_trylock(_l)	mtx_trylock(&(_l)->m)
+#define	spin_lock_nested(_l, _n) mtx_lock_flags(&(_l)->m, MTX_DUPOK)
+#define	spin_lock_irq(lock)	spin_lock(lock)
+#define	spin_unlock_irq(lock)	spin_unlock(lock)
+#define	spin_lock_irqsave(lock, flags)   				\
+    do {(flags) = 0; spin_lock(lock); } while (0)
+#define	spin_unlock_irqrestore(lock, flags)				\
+    do { spin_unlock(lock); } while (0)
+
+static inline void
+spin_lock_init(spinlock_t *lock)
+{
+	/* Zero the mutex, then mtx_init() it with MTX_DEF | MTX_NOWITNESS. */
+	memset(&lock->m, 0, sizeof(lock->m));
+	mtx_init(&lock->m, "lnxspin", NULL, MTX_DEF | MTX_NOWITNESS);
+}
+
+#define	DEFINE_SPINLOCK(lock)						\
+	spinlock_t lock;						\
+	MTX_SYSINIT(lock, &(lock).m, "lnxspin", MTX_DEF)
+
+#endif	/* _LINUX_SPINLOCK_H_ */
diff --git a/sys/ofed/include/linux/stddef.h b/sys/ofed/include/linux/stddef.h
new file mode 100644
index 0000000..22bf938
--- /dev/null
+++ b/sys/ofed/include/linux/stddef.h
@@ -0,0 +1,34 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_LINUX_STDDEF_H_
+#define	_LINUX_STDDEF_H_
+
+#include <sys/stddef.h>
+
+#endif	/* _LINUX_STDDEF_H_ */
diff --git a/sys/ofed/include/linux/string.h b/sys/ofed/include/linux/string.h
new file mode 100644
index 0000000..b14a5c6
--- /dev/null
+++ b/sys/ofed/include/linux/string.h
@@ -0,0 +1,49 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_LINUX_STRING_H_
+#define	_LINUX_STRING_H_
+
+#include <linux/types.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+
+#include <sys/libkern.h>
+
+static inline void *
+kmemdup(const void *src, size_t len, gfp_t gfp)
+{
+	void *copy;
+
+	/* Hand back a freshly allocated copy of len bytes of src, or NULL. */
+	if ((copy = kmalloc(len, gfp)) != NULL)
+		memcpy(copy, src, len);
+	return (copy);
+}
+
+#endif	/* _LINUX_STRING_H_ */
diff --git a/sys/ofed/include/linux/sysfs.h b/sys/ofed/include/linux/sysfs.h
new file mode 100644
index 0000000..698f75e
--- /dev/null
+++ b/sys/ofed/include/linux/sysfs.h
@@ -0,0 +1,182 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_LINUX_SYSFS_H_
+#define	_LINUX_SYSFS_H_
+
+#include <sys/sysctl.h>
+
+struct attribute {
+	const char 	*name;		/* used as the sysctl OID name */
+	struct module	*owner;
+	mode_t		mode;		/* set by the __ATTR* initializers */
+};
+
+struct sysfs_ops {	/* attribute callbacks, reached via kobj->ktype */
+	ssize_t (*show)(struct kobject *, struct attribute *, char *);
+	ssize_t (*store)(struct kobject *, struct attribute *, const char *,
+	    size_t);
+};
+
+struct attribute_group {
+	const char		*name;	/* name of the group's sysctl node */
+	mode_t                  (*is_visible)(struct kobject *,
+				    struct attribute *, int);
+	struct attribute	**attrs;	/* NULL-terminated array */
+};
+/* Initializers for structures embedding a struct attribute as '.attr'. */
+#define	__ATTR(_name, _mode, _show, _store) {				\
+	.attr = { .name = __stringify(_name), .mode = _mode },		\
+        .show = _show, .store  = _store,				\
+}
+
+#define	__ATTR_RO(_name) {						\
+	.attr = { .name = __stringify(_name), .mode = 0444 },		\
+	.show   = _name##_show,						\
+}
+
+#define	__ATTR_NULL	{ .attr = { .name = NULL } }
+
+/*
+ * Sysctl handler backing a sysfs attribute: arg1 is the kobject and arg2
+ * the attribute.  Copies out what 'show' produced (at least one NUL byte)
+ * and, when new data was supplied, hands less than a page of it to 'store'.
+ */
+
+static inline int
+sysctl_handle_attr(SYSCTL_HANDLER_ARGS)
+{
+	struct kobject *kobj;
+	struct attribute *attr;
+	const struct sysfs_ops *ops;
+	char *buf;
+	int error;
+	ssize_t len;
+
+	kobj = arg1;
+	attr = (struct attribute *)arg2;
+	if (kobj->ktype == NULL || kobj->ktype->sysfs_ops == NULL)
+		return (ENODEV);	/* Nothing allocated yet, no leak. */
+	ops = kobj->ktype->sysfs_ops;
+	buf = (char *)get_zeroed_page(GFP_KERNEL);
+	if (buf == NULL)
+		return (ENOMEM);
+	len = 1;	/* Copy out a NULL byte at least. */
+	if (ops->show) {
+		len = ops->show(kobj, attr, buf);
+		if (len < 0) {		/* Only EIO falls through to output. */
+			error = -len;
+			len = 1;
+			if (error != EIO)
+				goto out;
+		}
+	}
+	error = SYSCTL_OUT(req, buf, len);
+	if (error || !req->newptr || ops->store == NULL)
+		goto out;
+	if (req->newlen >= PAGE_SIZE) {	/* Keep room for the NUL below. */
+		error = EINVAL;
+		goto out;
+	}
+	error = SYSCTL_IN(req, buf, req->newlen);
+	if (error)
+		goto out;
+	buf[req->newlen] = '\0';
+	len = ops->store(kobj, attr, buf, req->newlen);
+	if (len < 0)
+		error = -len;
+out:
+	free_page((unsigned long)buf);
+
+	return (error);
+}
+
+static inline int
+sysfs_create_file(struct kobject *kobj, const struct attribute *attr)
+{
+	/* Expose the attribute as a read/write string sysctl under kobj. */
+	sysctl_add_oid(NULL, SYSCTL_CHILDREN(kobj->oidp), OID_AUTO, attr->name,
+	    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, kobj, (uintptr_t)attr,
+	    sysctl_handle_attr, "A", "");
+
+	return (0);
+}
+
+static inline void
+sysfs_remove_file(struct kobject *kobj, const struct attribute *attr)
+{
+	/* A kobject without a sysctl node has nothing to remove. */
+	if (kobj->oidp != NULL)
+		sysctl_remove_name(kobj->oidp, attr->name, 1, 1);
+}
+
+static inline void
+sysfs_remove_group(struct kobject *kobj, const struct attribute_group *grp)
+{
+	/* Remove the node named after the group, if kobj has a sysctl node. */
+	if (kobj->oidp != NULL)
+		sysctl_remove_name(kobj->oidp, grp->name, 1, 1);
+}
+
+static inline int
+sysfs_create_group(struct kobject *kobj, const struct attribute_group *grp)
+{
+	struct sysctl_oid *node;
+	struct attribute **ap;
+
+	/* One node named after the group, then one string OID per attribute. */
+	node = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(kobj->oidp), OID_AUTO,
+	    grp->name, CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, grp->name);
+	for (ap = grp->attrs; *ap != NULL; ap++)
+		sysctl_add_oid(NULL, SYSCTL_CHILDREN(node), OID_AUTO,
+		    (*ap)->name, CTLTYPE_STRING|CTLFLAG_RW|CTLFLAG_MPSAFE,
+		    kobj, (uintptr_t)*ap, sysctl_handle_attr, "A", "");
+
+	return (0);
+}
+
+static inline int
+sysfs_create_dir(struct kobject *kobj)
+{
+	/* Create the kobject's own node beneath its parent's; always 0. */
+	kobj->oidp = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(kobj->parent->oidp),
+	    OID_AUTO, kobj->name, CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, kobj->name);
+
+	return (0);
+}
+
+static inline void
+sysfs_remove_dir(struct kobject *kobj)
+{
+
+	/* Only tear down a sysctl node that is actually recorded. */
+	if (kobj->oidp != NULL)
+		sysctl_remove_oid(kobj->oidp, 1, 1);
+}
+
+#endif	/* _LINUX_SYSFS_H_ */
diff --git a/sys/ofed/include/linux/timer.h b/sys/ofed/include/linux/timer.h
new file mode 100644
index 0000000..ed4ed4a
--- /dev/null
+++ b/sys/ofed/include/linux/timer.h
@@ -0,0 +1,87 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_TIMER_H_
+#define _LINUX_TIMER_H_
+
+#include <linux/types.h>
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/callout.h>
+
+struct timer_list {
+	struct callout	timer_callout;	/* backing FreeBSD callout */
+	void		(*function)(unsigned long);	/* run by _timer_fn */
+        unsigned long	data;		/* argument passed to function */
+};
+
+#define	expires	timer_callout.c_time	/* maps timer->expires to c_time */
+
+static inline void
+_timer_fn(void *context)
+{
+	struct timer_list *tl = context;
+
+	/* Callout trampoline: invoke the stored handler with its argument. */
+	(*tl->function)(tl->data);
+}
+/* Record the handler and its argument, then initialize an MPSAFE callout. */
+#define	setup_timer(timer, func, dat)					\
+do {									\
+	(timer)->function = (func);					\
+	(timer)->data = (dat);						\
+	callout_init(&(timer)->timer_callout, CALLOUT_MPSAFE);		\
+} while (0)
+/* Like setup_timer() but with no handler or argument recorded yet. */
+#define	init_timer(timer)						\
+do {									\
+	(timer)->function = NULL;					\
+	(timer)->data = 0;						\
+	callout_init(&(timer)->timer_callout, CALLOUT_MPSAFE);		\
+} while (0)
+/* (Re)arm the callout to run _timer_fn in (expire - jiffies) ticks. */
+#define	mod_timer(timer, expire)					\
+	callout_reset(&(timer)->timer_callout, (expire) - jiffies,	\
+	    _timer_fn, (timer))
+/* Arm the callout from the c_time value stored through 'expires'. */
+#define	add_timer(timer)						\
+	callout_reset(&(timer)->timer_callout,				\
+	    (timer)->timer_callout.c_time - jiffies, _timer_fn, (timer))
+
+#define	del_timer(timer)	callout_stop(&(timer)->timer_callout)
+#define	del_timer_sync(timer)	callout_drain(&(timer)->timer_callout)
+
+#define	timer_pending(timer)	callout_pending(&(timer)->timer_callout)
+
+static inline unsigned long
+round_jiffies(unsigned long j)
+{
+	return roundup(j, hz);	/* j rounded via roundup() to a multiple of hz */
+}
+
+#endif /* _LINUX_TIMER_H_ */
diff --git a/sys/ofed/include/linux/types.h b/sys/ofed/include/linux/types.h
new file mode 100644
index 0000000..496d6f9
--- /dev/null
+++ b/sys/ofed/include/linux/types.h
@@ -0,0 +1,55 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef	_LINUX_TYPES_H_
+#define	_LINUX_TYPES_H_
+
+#include <sys/cdefs.h>
+#include <sys/types.h>
+#include <linux/compiler.h>
+#include <asm/types.h>
+
+typedef __u16 __le16;	/* endian-tagged names alias the plain types */
+typedef __u16 __be16;
+typedef __u32 __le32;
+typedef __u32 __be32;
+typedef __u64 __le64;
+typedef __u64 __be64;
+typedef _Bool bool;
+#define	true	TRUE
+#define	false	FALSE
+
+typedef unsigned long kernel_ulong_t;
+typedef unsigned int    uint;
+typedef unsigned gfp_t;
+typedef uint64_t loff_t;	/* NOTE(review): Linux loff_t is signed; confirm */
+typedef vm_paddr_t resource_size_t;
+/* Declare 'n' as an array of longs large enough to hold 'bits' bits. */
+#define	DECLARE_BITMAP(n, bits)						\
+	unsigned long n[howmany(bits, sizeof(long) * 8)]
+
+#endif	/* _LINUX_TYPES_H_ */
diff --git a/sys/ofed/include/linux/uaccess.h b/sys/ofed/include/linux/uaccess.h
new file mode 100644
index 0000000..9015b1e
--- /dev/null
+++ b/sys/ofed/include/linux/uaccess.h
@@ -0,0 +1,34 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef	_LINUX_UACCESS_H_
+#define	_LINUX_UACCESS_H_
+/* Both yield the negated copyin()/copyout() result; _x must be an lvalue. */
+#define	get_user(_x, _p)	-copyin((_p), &(_x), sizeof(*(_p)))
+#define	put_user(_x, _p)	-copyout(&(_x), (_p), sizeof(*(_p)))
+
+#endif	/* _LINUX_UACCESS_H_ */
diff --git a/sys/ofed/include/linux/vmalloc.h b/sys/ofed/include/linux/vmalloc.h
new file mode 100644
index 0000000..4a94a5c
--- /dev/null
+++ b/sys/ofed/include/linux/vmalloc.h
@@ -0,0 +1,41 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_VMALLOC_H_
+#define	_LINUX_VMALLOC_H_
+
+#include <asm/page.h>
+
+#define	VM_MAP		0x0000
+#define	PAGE_KERNEL	0x0000
+
+void *vmap(struct page **pages, unsigned int count, unsigned long flags,
+    int prot);
+void vunmap(void *addr);
+
+#endif	/* _LINUX_VMALLOC_H_ */
diff --git a/sys/ofed/include/linux/wait.h b/sys/ofed/include/linux/wait.h
new file mode 100644
index 0000000..b02014e
--- /dev/null
+++ b/sys/ofed/include/linux/wait.h
@@ -0,0 +1,112 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef	_LINUX_WAIT_H_
+#define	_LINUX_WAIT_H_
+
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sleepqueue.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+
+struct __wait_queue_head {
+	unsigned int	wchan;	/* its address is the sleepqueue channel */
+};
+typedef struct __wait_queue_head wait_queue_head_t;
+
+#define	init_waitqueue_head(x)	/* expands to nothing */
+
+static inline void
+__wake_up(struct __wait_queue_head *q, int all)
+{
+	int kick;
+	void *chan;
+
+	/* The address of wchan is the sleepqueue wait channel. */
+	chan = &q->wchan;
+	sleepq_lock(chan);
+	kick = all ? sleepq_broadcast(chan, SLEEPQ_SLEEP, 0, 0) :
+	    sleepq_signal(chan, SLEEPQ_SLEEP, 0, 0);
+	sleepq_release(chan);
+	/* A non-zero result from the wakeup call means: kick proc0. */
+	if (kick)
+		kick_proc0();
+}
+/* 'nr' is ignored: the _nr and _all forms wake all sleepers, the rest one. */
+#define	wake_up(q)				__wake_up(q, 0)
+#define	wake_up_nr(q, nr)			__wake_up(q, 1)
+#define	wake_up_all(q)				__wake_up(q, 1)
+#define	wake_up_interruptible(q)		__wake_up(q, 0)
+#define	wake_up_interruptible_nr(q, nr)		__wake_up(q, 1)
+#define	wake_up_interruptible_all(q, nr)	__wake_up(q, 1)
+/* Sleep until cond holds; cond is re-tested with the sleepqueue locked. */
+#define	wait_event(q, cond)						\
+do {									\
+	void *c = &(q).wchan;						\
+	if (!(cond)) {							\
+		for (;;) {						\
+			sleepq_lock(c);					\
+			if (cond) {					\
+				sleepq_release(c);			\
+				break;					\
+			}						\
+			sleepq_add(c, NULL, "completion", SLEEPQ_SLEEP, 0); \
+			sleepq_wait(c, 0);				\
+		}							\
+	}								\
+} while (0)
+/* Interruptible wait_event: yields 0, or ERESTARTSYS when the sleep fails. */
+#define	wait_event_interruptible(q, cond)				\
+({									\
+	void *c = &(q).wchan;						\
+	int _error;							\
+									\
+	_error = 0;							\
+	if (!(cond)) {							\
+		for (; _error == 0;) {					\
+			sleepq_lock(c);					\
+			if (cond) {					\
+				sleepq_release(c);			\
+				break;					\
+			}						\
+			sleepq_add(c, NULL, "completion",		\
+			    SLEEPQ_SLEEP | SLEEPQ_INTERRUPTIBLE, 0);	\
+			if (sleepq_wait_sig(c, 0))			\
+				_error = -ERESTARTSYS;			\
+		}							\
+	}								\
+	-_error;							\
+})
+
+#define	DEFINE_WAIT(x)
+
+#endif	/* _LINUX_WAIT_H_ */
diff --git a/sys/ofed/include/linux/workqueue.h b/sys/ofed/include/linux/workqueue.h
new file mode 100644
index 0000000..6b48f9c
--- /dev/null
+++ b/sys/ofed/include/linux/workqueue.h
@@ -0,0 +1,191 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef	_LINUX_WORKQUEUE_H_
+#define	_LINUX_WORKQUEUE_H_
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/slab.h>
+
+#include <sys/taskqueue.h>
+
+struct workqueue_struct {
+	struct taskqueue	*taskqueue;	/* backing FreeBSD taskqueue */
+};
+
+struct work_struct {
+	struct	task 		work_task;	/* runs _work_fn when dequeued */
+	struct	taskqueue	*taskqueue;	/* NULL until first queued */
+	void			(*fn)(struct work_struct *);
+};
+
+struct delayed_work {
+	struct work_struct	work;
+	struct callout		timer;	/* defers the enqueue of 'work' */
+};
+
+static inline struct delayed_work *
+to_delayed_work(struct work_struct *work)
+{
+	/* 'work' must be the member embedded in a struct delayed_work. */
+	return (container_of(work, struct delayed_work, work));
+}
+
+
+static inline void
+_work_fn(void *context, int pending)
+{
+	struct work_struct *ws = context;
+
+	/* Taskqueue trampoline: 'pending' is not passed on to the handler. */
+	(*ws->fn)(ws);
+}
+/* Bind 'func' to the work item; no taskqueue is associated until queued. */
+#define	INIT_WORK(work, func) 	 					\
+do {									\
+	(work)->fn = (func);						\
+	(work)->taskqueue = NULL;					\
+	TASK_INIT(&(work)->work_task, 0, _work_fn, (work));		\
+} while (0)
+/* INIT_WORK on the embedded work item plus an MPSAFE callout for the delay. */
+#define	INIT_DELAYED_WORK(_work, func)					\
+do {									\
+	INIT_WORK(&(_work)->work, func);				\
+	callout_init(&(_work)->timer, CALLOUT_MPSAFE);			\
+} while (0)
+
+#define	INIT_DELAYED_WORK_DEFERRABLE	INIT_DELAYED_WORK
+/* Enqueue on the shared taskqueue_thread, recording it for later cancel. */
+#define	schedule_work(work)						\
+do {									\
+	(work)->taskqueue = taskqueue_thread;				\
+	taskqueue_enqueue(taskqueue_thread, &(work)->work_task);	\
+} while (0)
+
+#define	flush_scheduled_work()	flush_taskqueue(taskqueue_thread)
+/* Enqueue on q's taskqueue, recording it in the work item first. */
+#define	queue_work(q, work)						\
+do {									\
+	(work)->taskqueue = (q)->taskqueue;				\
+	taskqueue_enqueue((q)->taskqueue, &(work)->work_task);		\
+} while (0)
+
+static inline void
+_delayed_work_fn(void *arg)
+{
+	struct delayed_work *dw = arg;
+
+	/* Callout handler: hand the embedded task to its recorded taskqueue. */
+	taskqueue_enqueue(dw->work.taskqueue, &dw->work.work_task);
+}
+
+static inline int
+queue_delayed_work(struct workqueue_struct *wq, struct delayed_work *work,
+    unsigned long delay)
+{
+	/* Sample the pending count before (re)arming; it decides the result. */
+	int was_pending = work->work.work_task.ta_pending;
+
+	work->work.taskqueue = wq->taskqueue;
+	if (delay == 0)
+		_delayed_work_fn(work);		/* no delay: enqueue right now */
+	else
+		callout_reset(&work->timer, delay, _delayed_work_fn, work);
+
+	return (was_pending == 0);
+}
+
+static inline struct workqueue_struct *
+_create_workqueue_common(char *name, int cpus)
+{
+	struct workqueue_struct *wq;
+
+	/* "%s" keeps a '%' in name from being parsed as a thread-name format. */
+	wq = kmalloc(sizeof(*wq), M_WAITOK);
+	wq->taskqueue = taskqueue_create(name, M_WAITOK,
+	    taskqueue_thread_enqueue, &wq->taskqueue);
+	taskqueue_start_threads(&wq->taskqueue, cpus, PWAIT, "%s", name);
+	return (wq);
+}
+
+
+#define	create_singlethread_workqueue(name)				\
+	_create_workqueue_common(name, 1)
+
+#define	create_workqueue(name)						\
+	_create_workqueue_common(name, MAXCPU)
+
+static inline void
+destroy_workqueue(struct workqueue_struct *wq)
+{
+	taskqueue_free(wq->taskqueue);	/* release the backing taskqueue */
+	kfree(wq);			/* then the wrapper itself */
+}
+
+#define	flush_workqueue(wq)	flush_taskqueue((wq)->taskqueue)
+
+static inline void
+_flush_fn(void *context, int pending)
+{	/* Intentionally empty: flush_taskqueue() only waits for it to run. */
+}
+
+static inline void
+flush_taskqueue(struct taskqueue *tq)
+{
+	struct task marker;	/* no-op task used as a completion marker */
+
+	TASK_INIT(&marker, 0, _flush_fn, NULL);
+	taskqueue_enqueue(tq, &marker);
+	taskqueue_drain(tq, &marker);
+}
+
+static inline int
+cancel_work_sync(struct work_struct *work)
+{
+	if (work->taskqueue != NULL &&
+	    taskqueue_cancel(work->taskqueue, &work->work_task, NULL) != 0)
+		taskqueue_drain(work->taskqueue, &work->work_task);
+	return (0);	/* always 0, whether or not anything was cancelled */
+}
+
+/*
+ * Stop the timer and cancel the queued task, draining it when
+ * taskqueue_cancel() returns non-zero.  Always returns 0.
+ * This may leave work running on another CPU as it does on Linux.
+ */
+static inline int
+cancel_delayed_work(struct delayed_work *work)
+{
+	callout_stop(&work->timer);
+	if (work->work.taskqueue != NULL && taskqueue_cancel(
+	    work->work.taskqueue, &work->work.work_task, NULL) != 0)
+		taskqueue_drain(work->work.taskqueue, &work->work.work_task);
+	return (0);
+}
+
+#endif	/* _LINUX_WORKQUEUE_H_ */
diff --git a/sys/ofed/include/net/addrconf.h b/sys/ofed/include/net/addrconf.h
new file mode 100644
index 0000000..e5d814e
--- /dev/null
+++ b/sys/ofed/include/net/addrconf.h
@@ -0,0 +1,27 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
diff --git a/sys/ofed/include/net/arp.h b/sys/ofed/include/net/arp.h
new file mode 100644
index 0000000..e5d814e
--- /dev/null
+++ b/sys/ofed/include/net/arp.h
@@ -0,0 +1,27 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
diff --git a/sys/ofed/include/net/ip.h b/sys/ofed/include/net/ip.h
new file mode 100644
index 0000000..8b29d62
--- /dev/null
+++ b/sys/ofed/include/net/ip.h
@@ -0,0 +1,77 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_NET_IP_H_
+#define	_LINUX_NET_IP_H_
+
+#include <sys/types.h>
+#include <sys/socket.h>
+
+#include <net/if_types.h>
+#include <net/if.h>
+#include <net/if_var.h>
+
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+
+static inline void inet_get_local_port_range(int *low, int *high)
+{	/* Reports V_ipport_firstauto and V_ipport_lastauto as the range. */
+	*low = V_ipport_firstauto;
+	*high = V_ipport_lastauto;
+}
+
+static inline void
+ip_ib_mc_map(uint32_t addr, const unsigned char *bcast, char *buf)
+{
+	const unsigned char scope = bcast[5] & 0xF;
+	int i;
+
+	/*
+	 * Fill the 20-byte buffer, in ascending byte order:
+	 *   byte 0        zero
+	 *   bytes 1-4     0xff
+	 *   byte 5        0x10 | low nibble of bcast[5]
+	 *   bytes 6-7     0x40 0x1b
+	 *   bytes 8-9     copied from bcast[8] and bcast[9]
+	 *   bytes 10-15   zero
+	 *   bytes 16-19   low 28 bits of ntohl(addr), high byte first
+	 */
+	addr = ntohl(addr) & 0x0fffffff;
+	buf[0] = 0;
+	for (i = 1; i <= 4; i++)
+		buf[i] = 0xff;
+	buf[5] = 0x10 | scope;
+	buf[6] = 0x40;
+	buf[7] = 0x1b;
+	buf[8] = bcast[8];
+	buf[9] = bcast[9];
+	for (i = 10; i <= 15; i++)
+		buf[i] = 0;
+	for (i = 16; i <= 19; i++)
+		buf[i] = (addr >> (8 * (19 - i))) & 0xff;
+}
+
+#endif	/* _LINUX_NET_IP_H_ */
diff --git a/sys/ofed/include/net/ip6_route.h b/sys/ofed/include/net/ip6_route.h
new file mode 100644
index 0000000..e5d814e
--- /dev/null
+++ b/sys/ofed/include/net/ip6_route.h
@@ -0,0 +1,27 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
diff --git a/sys/ofed/include/net/ipv6.h b/sys/ofed/include/net/ipv6.h
new file mode 100644
index 0000000..6f02555
--- /dev/null
+++ b/sys/ofed/include/net/ipv6.h
@@ -0,0 +1,62 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_NET_IPV6_H_
+#define	_LINUX_NET_IPV6_H_
+
+#ifndef KLD_MODULE
+#include "opt_inet6.h"
+#endif
+
+#define	ipv6_addr_loopback IN6_IS_ADDR_LOOPBACK	/* plain alias: same argument and result as IN6_IS_ADDR_LOOPBACK */
+#define	ipv6_addr_copy(dst, src)					\
+	memcpy((dst), (src), sizeof(struct in6_addr))	/* copies exactly one struct in6_addr; evaluates to memcpy's result */
+
+#ifdef INET6
+static inline void
+ipv6_ib_mc_map(const struct in6_addr *addr, const unsigned char *broadcast,
+    char *buf)
+{	/* Fills buf[0..19] from addr and broadcast; buf must hold at least 20 bytes. */
+	unsigned char scope;
+
+	scope = broadcast[5] & 0xF;	/* low nibble of broadcast[5] */
+	buf[0]  = 0;
+	buf[1]  = 0xff;	/* NOTE(review): bytes 1-3 look like the IPoIB multicast QPN of RFC 4391 - confirm */
+	buf[2]  = 0xff;
+	buf[3]  = 0xff;
+	buf[4]  = 0xff;
+	buf[5]  = 0x10 | scope;	/* high nibble fixed at 1, low nibble taken from broadcast */
+	buf[6]  = 0x60;	/* NOTE(review): 0x60 0x1b is presumably the RFC 4391 IPv6 signature - confirm */
+	buf[7]  = 0x1b;
+	buf[8]  = broadcast[8];	/* bytes 8-9 are copied verbatim from broadcast */
+	buf[9]  = broadcast[9];
+	memcpy(&buf[10], &addr->s6_addr[6], 10);	/* s6_addr[6..15] -> buf[10..19] */
+}
+#endif
+
+#endif	/* _LINUX_NET_IPV6_H_ */
diff --git a/sys/ofed/include/net/neighbour.h b/sys/ofed/include/net/neighbour.h
new file mode 100644
index 0000000..e5d814e
--- /dev/null
+++ b/sys/ofed/include/net/neighbour.h
@@ -0,0 +1,27 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
diff --git a/sys/ofed/include/net/netevent.h b/sys/ofed/include/net/netevent.h
new file mode 100644
index 0000000..db5b50e
--- /dev/null
+++ b/sys/ofed/include/net/netevent.h
@@ -0,0 +1,71 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef	_LINUX_NET_NETEVENT_H_
+#define	_LINUX_NET_NETEVENT_H_
+
+#include <netinet/if_ether.h>
+
+enum netevent_notif_type {	/* event codes handed to notifier_block.notifier_call */
+	NETEVENT_NEIGH_UPDATE = 0,	/* also used as the index into notifier_block.tags[] */
+#if 0 /* Unsupported events. */
+        NETEVENT_PMTU_UPDATE,
+        NETEVENT_REDIRECT,
+#endif
+};
+
+struct llentry;
+
+static inline void
+_handle_arp_update_event(void *arg, struct llentry *lle)
+{	/* arp_update_event handler: forwards the event to a notifier_block callback. */
+	struct notifier_block *nb;
+
+	nb = arg;	/* arg is the notifier_block supplied by register_netevent_notifier */
+	nb->notifier_call(nb, NETEVENT_NEIGH_UPDATE, lle);	/* callback result is discarded */
+}
+
+static inline int
+register_netevent_notifier(struct notifier_block *nb)
+{	/* Hooks nb onto arp_update_event; the tag is kept in nb->tags[] for later removal. */
+	nb->tags[NETEVENT_NEIGH_UPDATE] = EVENTHANDLER_REGISTER(
+	    arp_update_event, _handle_arp_update_event, nb, 0);	/* nb becomes the handler argument; last argument 0 is the priority */
+	return (0);	/* always 0; the stored tag is not checked here */
+}
+
+static inline int
+unregister_netevent_notifier(struct notifier_block *nb)
+{	/* Removes the arp_update_event handler recorded in nb->tags[]. */
+
+	EVENTHANDLER_DEREGISTER(arp_update_event,
+	    nb->tags[NETEVENT_NEIGH_UPDATE]);	/* tag was stored by register_netevent_notifier */
+
+	return (0);	/* always 0 */
+}
+
+#endif /* _LINUX_NET_NETEVENT_H_ */
diff --git a/sys/ofed/include/net/tcp.h b/sys/ofed/include/net/tcp.h
new file mode 100644
index 0000000..75da3f8
--- /dev/null
+++ b/sys/ofed/include/net/tcp.h
@@ -0,0 +1,38 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_NET_TCP_H_
+#define	_LINUX_NET_TCP_H_
+
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/kref.h>
+
+#include <net/ip.h>
+
+#endif	/* _LINUX_NET_TCP_H_ */
diff --git a/sys/ofed/include/rdma/Kbuild b/sys/ofed/include/rdma/Kbuild
new file mode 100644
index 0000000..e7c0432
--- /dev/null
+++ b/sys/ofed/include/rdma/Kbuild
@@ -0,0 +1 @@
+header-y += ib_user_mad.h
diff --git a/sys/ofed/include/rdma/ib_addr.h b/sys/ofed/include/rdma/ib_addr.h
new file mode 100644
index 0000000..61b0a7c
--- /dev/null
+++ b/sys/ofed/include/rdma/ib_addr.h
@@ -0,0 +1,312 @@
+/*
+ * Copyright (c) 2005 Voltaire Inc.  All rights reserved.
+ * Copyright (c) 2005 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(IB_ADDR_H)
+#define IB_ADDR_H
+
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/if_arp.h>
+#include <linux/netdevice.h>
+#include <linux/socket.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_pack.h>
+#include <linux/ethtool.h>
+#include <linux/if_vlan.h>
+
+struct rdma_addr_client {	/* handle passed to rdma_addr_register_client / rdma_resolve_ip */
+	atomic_t refcount;	/* NOTE(review): presumably counts outstanding users of the client - confirm in the implementation */
+	struct completion comp;	/* NOTE(review): presumably completed once refcount drains - confirm */
+};
+
+/**
+ * rdma_addr_register_client - Register an address client.
+ */
+void rdma_addr_register_client(struct rdma_addr_client *client);
+
+/**
+ * rdma_addr_unregister_client - Deregister an address client.
+ * @client: Client object to deregister.
+ */
+void rdma_addr_unregister_client(struct rdma_addr_client *client);
+
+struct rdma_dev_addr {	/* link-layer addressing state read and written by the helpers in this header */
+	unsigned char src_dev_addr[MAX_ADDR_LEN];	/* source GID lives at offset rdma_addr_gid_offset() */
+	unsigned char dst_dev_addr[MAX_ADDR_LEN];	/* destination GID lives at the same offset */
+	unsigned char broadcast[MAX_ADDR_LEN];	/* bytes 8-9 hold the pkey; the mgid starts at byte 4 */
+	unsigned short dev_type;	/* compared against ARPHRD_INFINIBAND */
+	int bound_dev_if;	/* interface index given to dev_get_by_index() */
+	enum rdma_transport_type transport;	/* compared against RDMA_TRANSPORT_IB */
+};
+
+/**
+ * rdma_translate_ip - Translate a local IP address to an RDMA hardware
+ *   address.
+ */
+int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr);
+
+/**
+ * rdma_resolve_ip - Resolve source and destination IP addresses to
+ *   RDMA hardware addresses.
+ * @client: Address client associated with request.
+ * @src_addr: An optional source address to use in the resolution.  If a
+ *   source address is not provided, a usable address will be returned via
+ *   the callback.
+ * @dst_addr: The destination address to resolve.
+ * @addr: A reference to a data location that will receive the resolved
+ *   addresses.  The data location must remain valid until the callback has
+ *   been invoked.
+ * @timeout_ms: Amount of time to wait for the address resolution to complete.
+ * @callback: Call invoked once address resolution has completed, timed out,
+ *   or been canceled.  A status of 0 indicates success.
+ * @context: User-specified context associated with the call.
+ */
+int rdma_resolve_ip(struct rdma_addr_client *client,
+		    struct sockaddr *src_addr, struct sockaddr *dst_addr,
+		    struct rdma_dev_addr *addr, int timeout_ms,
+		    void (*callback)(int status, struct sockaddr *src_addr,
+				     struct rdma_dev_addr *addr, void *context),
+		    void *context);
+
+void rdma_addr_cancel(struct rdma_dev_addr *addr);
+
+int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev,
+	      const unsigned char *dst_dev_addr);
+
+static inline int ip_addr_size(struct sockaddr *addr)
+{	/* Size of the sockaddr variant selected by addr->sa_family. */
+	return addr->sa_family == AF_INET6 ?
+	       sizeof(struct sockaddr_in6) : sizeof(struct sockaddr_in);	/* every family other than AF_INET6 gets the sockaddr_in size */
+}
+
+static inline u16 ib_addr_get_pkey(struct rdma_dev_addr *dev_addr)
+{	/* Reads the 16-bit pkey stored in broadcast[8..9]. */
+	return ((u16)dev_addr->broadcast[8] << 8) | (u16)dev_addr->broadcast[9];	/* byte 8 is the high byte, byte 9 the low byte */
+}
+
+static inline void ib_addr_set_pkey(struct rdma_dev_addr *dev_addr, u16 pkey)
+{	/* Writes pkey into broadcast[8..9]; inverse of ib_addr_get_pkey. */
+	dev_addr->broadcast[8] = pkey >> 8;	/* high byte */
+	dev_addr->broadcast[9] = (unsigned char) pkey;	/* low byte */
+}
+
+static inline void ib_addr_get_mgid(struct rdma_dev_addr *dev_addr,
+				    union ib_gid *gid)
+{	/* Copies the GID embedded in the broadcast address into *gid. */
+	memcpy(gid, dev_addr->broadcast + 4, sizeof *gid);	/* the GID starts 4 bytes into broadcast[] */
+}
+
+static inline int rdma_addr_gid_offset(struct rdma_dev_addr *dev_addr)
+{	/* Byte offset of the GID inside src_dev_addr / dst_dev_addr. */
+	return dev_addr->dev_type == ARPHRD_INFINIBAND ? 4 : 0;	/* 4 for ARPHRD_INFINIBAND, 0 for every other dev_type */
+}
+
+static inline void iboe_mac_vlan_to_ll(union ib_gid *gid, u8 *mac, u16 vid)
+{	/* Builds a 16-byte GID in gid->raw from a 6-byte mac and a VLAN id. */
+	memset(gid->raw, 0, 16);	/* bytes not set below stay zero */
+	*((u32 *)gid->raw) = cpu_to_be32(0xfe800000);	/* raw[0..3] = fe 80 00 00 */
+	if (vid < 0x1000) {	/* vid fits in 12 bits: embed it, high byte first */
+		gid->raw[12] = vid & 0xff;
+		gid->raw[11] = vid >> 8;
+	} else {	/* any larger vid (callers pass 0xffff): use the ff fe filler */
+		gid->raw[12] = 0xfe;
+		gid->raw[11] = 0xff;
+	}
+
+	memcpy(gid->raw + 13, mac + 3, 3);	/* mac[3..5] -> raw[13..15] */
+	memcpy(gid->raw + 8, mac, 3);	/* mac[0..2] -> raw[8..10] */
+	gid->raw[8] ^= 2;	/* toggles bit 0x02 of the first MAC octet; NOTE(review): presumably the EUI-64 universal/local bit - confirm */
+}
+
+static inline u16 rdma_vlan_dev_vlan_id(const struct net_device *dev)
+{	/* Returns the VLAN id reported for dev, or 0xffff when there is none. */
+#ifdef __linux__
+	return dev->priv_flags & IFF_802_1Q_VLAN ?
+		vlan_dev_vlan_id(dev) : 0xffff;	/* devices without IFF_802_1Q_VLAN yield 0xffff */
+#else
+	uint16_t tag;
+
+	if (VLAN_TAG(__DECONST(struct ifnet *, dev), &tag) != 0)	/* a nonzero VLAN_TAG result is treated as no VLAN */
+		return 0xffff;
+	return tag;
+#endif
+}
+
+static inline void iboe_addr_get_sgid(struct rdma_dev_addr *dev_addr,
+				      union ib_gid *gid)
+{	/* Derives the source GID from src_dev_addr and the VLAN id of the bound interface. */
+	struct net_device *dev;
+	u16 vid = 0xffff;	/* stays 0xffff when the interface lookup fails */
+
+	dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if);
+	if (dev) {
+		vid = rdma_vlan_dev_vlan_id(dev);
+		dev_put(dev);	/* paired with the dev_get_by_index() lookup above */
+	}
+
+	iboe_mac_vlan_to_ll(gid, dev_addr->src_dev_addr, vid);	/* src_dev_addr is passed as the MAC argument */
+}
+
+static inline void rdma_addr_get_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid)
+{	/* Fetches the source GID, computing it when the device is not native InfiniBand. */
+	if (dev_addr->transport == RDMA_TRANSPORT_IB &&
+	    dev_addr->dev_type != ARPHRD_INFINIBAND)	/* IB transport on a non-InfiniBand link type */
+		iboe_addr_get_sgid(dev_addr, gid);
+	else
+		memcpy(gid, dev_addr->src_dev_addr +
+		       rdma_addr_gid_offset(dev_addr), sizeof *gid);	/* GID is stored directly in src_dev_addr */
+}
+
+static inline void rdma_addr_set_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid)
+{	/* Stores *gid into src_dev_addr at the offset chosen by rdma_addr_gid_offset(). */
+	memcpy(dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), gid, sizeof *gid);
+}
+
+static inline void rdma_addr_get_dgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid)
+{	/* Copies the destination GID out of dst_dev_addr; unlike the sgid getter there is no IBoE special case. */
+	memcpy(gid, dev_addr->dst_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof *gid);
+}
+
+static inline void rdma_addr_set_dgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid)
+{	/* Stores *gid into dst_dev_addr at the offset chosen by rdma_addr_gid_offset(). */
+	memcpy(dev_addr->dst_dev_addr + rdma_addr_gid_offset(dev_addr), gid, sizeof *gid);
+}
+
+static inline enum ib_mtu iboe_get_mtu(int mtu)
+{	/* Maps an interface MTU to the largest enum ib_mtu value that still fits. */
+	/*
+	 * Subtract the IB headers from the effective IBoE MTU. 28 is the
+	 * atomic header, the biggest possible header after the BTH.
+	 */
+	mtu = mtu - IB_GRH_BYTES - IB_BTH_BYTES - 28;
+
+	if (mtu >= ib_mtu_enum_to_int(IB_MTU_4096))	/* thresholds are tested from largest to smallest */
+		return IB_MTU_4096;
+	else if (mtu >= ib_mtu_enum_to_int(IB_MTU_2048))
+		return IB_MTU_2048;
+	else if (mtu >= ib_mtu_enum_to_int(IB_MTU_1024))
+		return IB_MTU_1024;
+	else if (mtu >= ib_mtu_enum_to_int(IB_MTU_512))
+		return IB_MTU_512;
+	else if (mtu >= ib_mtu_enum_to_int(IB_MTU_256))
+		return IB_MTU_256;
+	else
+		return 0;	/* nothing fits: plain 0, which is not one of the IB_MTU_* names used above */
+}
+
+#ifdef __linux__
+static inline int iboe_get_rate(struct net_device *dev)
+{	/* Linux variant: derives the IB rate from the ethtool link speed. */
+	struct ethtool_cmd cmd;
+
+	if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings ||
+	    dev->ethtool_ops->get_settings(dev, &cmd))	/* no ethtool support, or the query returned nonzero */
+		return IB_RATE_PORT_CURRENT;
+
+	if (cmd.speed >= 40000)	/* NOTE(review): thresholds suggest cmd.speed is in Mb/s - confirm */
+		return IB_RATE_40_GBPS;
+	else if (cmd.speed >= 30000)
+		return IB_RATE_30_GBPS;
+	else if (cmd.speed >= 20000)
+		return IB_RATE_20_GBPS;
+	else if (cmd.speed >= 10000)
+		return IB_RATE_10_GBPS;
+	else
+		return IB_RATE_PORT_CURRENT;	/* below the 10000 threshold */
+}
+#else
+static inline int iboe_get_rate(struct net_device *dev)
+{	/* Non-Linux variant: same ladder, driven by dev->if_baudrate. */
+	if (dev->if_baudrate >= IF_Gbps(40ULL))
+		return IB_RATE_40_GBPS;
+	else if (dev->if_baudrate >= IF_Gbps(30ULL))
+		return IB_RATE_30_GBPS;
+	else if (dev->if_baudrate >= IF_Gbps(20ULL))
+		return IB_RATE_20_GBPS;
+	else if (dev->if_baudrate >= IF_Gbps(10ULL))
+		return IB_RATE_10_GBPS;
+	else
+		return IB_RATE_PORT_CURRENT;	/* below IF_Gbps(10) */
+}
+#endif
+
+static inline int rdma_link_local_addr(struct in6_addr *addr)
+{	/* Returns 1 when the first 8 bytes of addr are fe 80 00 00 00 00 00 00, else 0. */
+	if (addr->s6_addr32[0] == cpu_to_be32(0xfe800000) &&
+	    addr->s6_addr32[1] == 0)	/* both leading 32-bit words must match */
+		return 1;
+
+	return 0;
+}
+
+static inline void rdma_get_ll_mac(struct in6_addr *addr, u8 *mac)
+{	/* Extracts a 6-byte MAC from addr; reverses the byte placement done by iboe_mac_vlan_to_ll. */
+	memcpy(mac, &addr->s6_addr[8], 3);	/* s6_addr[8..10] -> mac[0..2] */
+	memcpy(mac + 3, &addr->s6_addr[13], 3);	/* s6_addr[13..15] -> mac[3..5] */
+	mac[0] ^= 2;	/* undoes the bit 0x02 toggle applied when the address was built */
+}
+
+static inline int rdma_is_multicast_addr(struct in6_addr *addr)
+{	/* Nonzero when the first address byte is 0xff. */
+	return addr->s6_addr[0] == 0xff;
+}
+
+static inline void rdma_get_mcast_mac(struct in6_addr *addr, u8 *mac)
+{	/* Writes a 6-byte MAC: 33 33 followed by the last four bytes of addr. */
+	int i;
+
+	mac[0] = 0x33;
+	mac[1] = 0x33;
+	for (i = 2; i < 6; ++i)
+		mac[i] = addr->s6_addr[i + 10];	/* mac[2..5] <- s6_addr[12..15] */
+}
+
+static inline u16 rdma_get_vlan_id(union ib_gid *dgid)
+{	/* Recovers the VLAN id that iboe_mac_vlan_to_ll places in raw[11..12]. */
+	u16 vid;
+
+	vid = dgid->raw[11] << 8 | dgid->raw[12];	/* raw[11] is the high byte */
+	return vid < 0x1000 ? vid  : 0xffff;	/* values of 0x1000 and above (e.g. the ff fe filler) mean no VLAN */
+}
+
+static inline struct net_device *rdma_vlan_dev_real_dev(const struct net_device *dev)
+{	/* Returns the device underneath a VLAN interface. */
+#ifdef __linux__
+	return dev->priv_flags & IFF_802_1Q_VLAN ?
+		vlan_dev_real_dev(dev) : 0;	/* null for devices without IFF_802_1Q_VLAN */
+#else
+	return VLAN_TRUNKDEV(__DECONST(struct ifnet *, dev));	/* NOTE(review): presumably NULL when dev is not a VLAN interface - confirm */
+#endif
+}
+
+#endif /* IB_ADDR_H */
diff --git a/sys/ofed/include/rdma/ib_cache.h b/sys/ofed/include/rdma/ib_cache.h
new file mode 100644
index 0000000..00a2b8e
--- /dev/null
+++ b/sys/ofed/include/rdma/ib_cache.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IB_CACHE_H
+#define _IB_CACHE_H
+
+#include <rdma/ib_verbs.h>
+
+/**
+ * ib_get_cached_gid - Returns a cached GID table entry
+ * @device: The device to query.
+ * @port_num: The port number of the device to query.
+ * @index: The index into the cached GID table to query.
+ * @gid: The GID value found at the specified index.
+ *
+ * ib_get_cached_gid() fetches the specified GID table entry stored in
+ * the local software cache.
+ */
+int ib_get_cached_gid(struct ib_device    *device,
+		      u8                   port_num,
+		      int                  index,
+		      union ib_gid        *gid);
+
+/**
+ * ib_find_cached_gid - Returns the port number and GID table index where
+ *   a specified GID value occurs.
+ * @device: The device to query.
+ * @gid: The GID value to search for.
+ * @port_num: The port number of the device where the GID value was found.
+ * @index: The index into the cached GID table where the GID was found.  This
+ *   parameter may be NULL.
+ *
+ * ib_find_cached_gid() searches for the specified GID value in
+ * the local software cache.
+ */
+int ib_find_cached_gid(struct ib_device *device,
+		       union ib_gid	*gid,
+		       u8               *port_num,
+		       u16              *index);
+
+/**
+ * ib_get_cached_pkey - Returns a cached PKey table entry
+ * @device_handle: The device to query.
+ * @port_num: The port number of the device to query.
+ * @index: The index into the cached PKey table to query.
+ * @pkey: The PKey value found at the specified index.
+ *
+ * ib_get_cached_pkey() fetches the specified PKey table entry stored in
+ * the local software cache.
+ */
+int ib_get_cached_pkey(struct ib_device    *device_handle,
+		       u8                   port_num,
+		       int                  index,
+		       u16                 *pkey);
+
+/**
+ * ib_find_cached_pkey - Returns the PKey table index where a specified
+ *   PKey value occurs.
+ * @device: The device to query.
+ * @port_num: The port number of the device to search for the PKey.
+ * @pkey: The PKey value to search for.
+ * @index: The index into the cached PKey table where the PKey was found.
+ *
+ * ib_find_cached_pkey() searches the specified PKey table in
+ * the local software cache.
+ */
+int ib_find_cached_pkey(struct ib_device    *device,
+			u8                   port_num,
+			u16                  pkey,
+			u16                 *index);
+
+/**
+ * ib_get_cached_lmc - Returns a cached lmc table entry
+ * @device: The device to query.
+ * @port_num: The port number of the device to query.
+ * @lmc: The lmc value for the specified port for that device.
+ *
+ * ib_get_cached_lmc() fetches the specified lmc table entry stored in
+ * the local software cache.
+ */
+int ib_get_cached_lmc(struct ib_device *device,
+		      u8                port_num,
+		      u8                *lmc);
+
+#endif /* _IB_CACHE_H */
diff --git a/sys/ofed/include/rdma/ib_cm.h b/sys/ofed/include/rdma/ib_cm.h
new file mode 100644
index 0000000..9388583
--- /dev/null
+++ b/sys/ofed/include/rdma/ib_cm.h
@@ -0,0 +1,589 @@
+/*
+ * Copyright (c) 2004, 2005 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if !defined(IB_CM_H)
+#define IB_CM_H
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_sa.h>
+
+enum ib_cm_state {	/* type of ib_cm_id.state; values are numbered implicitly from 0 */
+	IB_CM_IDLE,
+	IB_CM_LISTEN,
+	IB_CM_REQ_SENT,
+	IB_CM_REQ_RCVD,
+	IB_CM_MRA_REQ_SENT,
+	IB_CM_MRA_REQ_RCVD,
+	IB_CM_REP_SENT,
+	IB_CM_REP_RCVD,
+	IB_CM_MRA_REP_SENT,
+	IB_CM_MRA_REP_RCVD,
+	IB_CM_ESTABLISHED,
+	IB_CM_DREQ_SENT,
+	IB_CM_DREQ_RCVD,
+	IB_CM_TIMEWAIT,
+	IB_CM_SIDR_REQ_SENT,
+	IB_CM_SIDR_REQ_RCVD
+};
+
+enum ib_cm_lap_state {	/* type of ib_cm_id.lap_state; values are numbered implicitly from 0 */
+	IB_CM_LAP_UNINIT,
+	IB_CM_LAP_IDLE,
+	IB_CM_LAP_SENT,
+	IB_CM_LAP_RCVD,
+	IB_CM_MRA_LAP_SENT,
+	IB_CM_MRA_LAP_RCVD,
+};
+
+enum ib_cm_event_type {	/* type of ib_cm_event.event; values are numbered implicitly from 0 */
+	IB_CM_REQ_ERROR,
+	IB_CM_REQ_RECEIVED,
+	IB_CM_REP_ERROR,
+	IB_CM_REP_RECEIVED,
+	IB_CM_RTU_RECEIVED,
+	IB_CM_USER_ESTABLISHED,
+	IB_CM_DREQ_ERROR,
+	IB_CM_DREQ_RECEIVED,
+	IB_CM_DREP_RECEIVED,
+	IB_CM_TIMEWAIT_EXIT,
+	IB_CM_MRA_RECEIVED,
+	IB_CM_REJ_RECEIVED,
+	IB_CM_LAP_ERROR,
+	IB_CM_LAP_RECEIVED,
+	IB_CM_APR_RECEIVED,
+	IB_CM_SIDR_REQ_ERROR,
+	IB_CM_SIDR_REQ_RECEIVED,
+	IB_CM_SIDR_REP_RECEIVED
+};
+
+enum ib_cm_data_size {	/* NOTE(review): presumably byte counts of the per-message private-data and info areas - confirm */
+	IB_CM_REQ_PRIVATE_DATA_SIZE	 = 92,
+	IB_CM_MRA_PRIVATE_DATA_SIZE	 = 222,
+	IB_CM_REJ_PRIVATE_DATA_SIZE	 = 148,
+	IB_CM_REP_PRIVATE_DATA_SIZE	 = 196,
+	IB_CM_RTU_PRIVATE_DATA_SIZE	 = 224,
+	IB_CM_DREQ_PRIVATE_DATA_SIZE	 = 220,
+	IB_CM_DREP_PRIVATE_DATA_SIZE	 = 224,
+	IB_CM_REJ_ARI_LENGTH		 = 72,	/* NOTE(review): presumably bounds ib_cm_rej_event_param.ari_length - confirm */
+	IB_CM_LAP_PRIVATE_DATA_SIZE	 = 168,
+	IB_CM_APR_PRIVATE_DATA_SIZE	 = 148,
+	IB_CM_APR_INFO_LENGTH		 = 72,	/* NOTE(review): presumably bounds ib_cm_apr_event_param.info_len - confirm */
+	IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE = 216,
+	IB_CM_SIDR_REP_PRIVATE_DATA_SIZE = 136,
+	IB_CM_SIDR_REP_INFO_LENGTH	 = 72,	/* NOTE(review): presumably bounds ib_cm_sidr_rep_event_param.info_len - confirm */
+	IB_CM_COMPARE_SIZE		 = 64
+};
+
+struct ib_cm_id;
+
+struct ib_cm_req_event_param {	/* carried as ib_cm_event.param.req_rcvd */
+	struct ib_cm_id		*listen_id;
+	u8			port;
+
+	struct ib_sa_path_rec	*primary_path;
+	struct ib_sa_path_rec	*alternate_path;
+
+	__be64			remote_ca_guid;	/* big-endian, per the __be64 type */
+	u32			remote_qkey;
+	u32			remote_qpn;
+	enum ib_qp_type		qp_type;
+
+	u32			starting_psn;
+	u8			responder_resources;
+	u8			initiator_depth;
+	unsigned int		local_cm_response_timeout:5;	/* the six bitfields from here on total 18 bits */
+	unsigned int		flow_control:1;
+	unsigned int		remote_cm_response_timeout:5;
+	unsigned int		retry_count:3;
+	unsigned int		rnr_retry_count:3;
+	unsigned int		srq:1;
+};
+
+struct ib_cm_rep_event_param {	/* carried as ib_cm_event.param.rep_rcvd */
+	__be64			remote_ca_guid;	/* big-endian, per the __be64 type */
+	u32			remote_qkey;
+	u32			remote_qpn;
+	u32			starting_psn;
+	u8			responder_resources;
+	u8			initiator_depth;
+	unsigned int		target_ack_delay:5;	/* the five bitfields from here on total 12 bits */
+	unsigned int		failover_accepted:2;
+	unsigned int		flow_control:1;
+	unsigned int		rnr_retry_count:3;
+	unsigned int		srq:1;
+};
+
+enum ib_cm_rej_reason {	/* type of ib_cm_rej_event_param.reason; explicit values 1..33, 0 is unassigned */
+	IB_CM_REJ_NO_QP				= 1,	/* NOTE(review): values presumably mirror the REJ reason codes of the IB specification - confirm */
+	IB_CM_REJ_NO_EEC			= 2,
+	IB_CM_REJ_NO_RESOURCES			= 3,
+	IB_CM_REJ_TIMEOUT			= 4,
+	IB_CM_REJ_UNSUPPORTED			= 5,
+	IB_CM_REJ_INVALID_COMM_ID		= 6,
+	IB_CM_REJ_INVALID_COMM_INSTANCE		= 7,
+	IB_CM_REJ_INVALID_SERVICE_ID		= 8,
+	IB_CM_REJ_INVALID_TRANSPORT_TYPE	= 9,
+	IB_CM_REJ_STALE_CONN			= 10,
+	IB_CM_REJ_RDC_NOT_EXIST			= 11,
+	IB_CM_REJ_INVALID_GID			= 12,
+	IB_CM_REJ_INVALID_LID			= 13,
+	IB_CM_REJ_INVALID_SL			= 14,
+	IB_CM_REJ_INVALID_TRAFFIC_CLASS		= 15,
+	IB_CM_REJ_INVALID_HOP_LIMIT		= 16,
+	IB_CM_REJ_INVALID_PACKET_RATE		= 17,
+	IB_CM_REJ_INVALID_ALT_GID		= 18,
+	IB_CM_REJ_INVALID_ALT_LID		= 19,
+	IB_CM_REJ_INVALID_ALT_SL		= 20,
+	IB_CM_REJ_INVALID_ALT_TRAFFIC_CLASS	= 21,
+	IB_CM_REJ_INVALID_ALT_HOP_LIMIT		= 22,
+	IB_CM_REJ_INVALID_ALT_PACKET_RATE	= 23,
+	IB_CM_REJ_PORT_CM_REDIRECT		= 24,
+	IB_CM_REJ_PORT_REDIRECT			= 25,
+	IB_CM_REJ_INVALID_MTU			= 26,
+	IB_CM_REJ_INSUFFICIENT_RESP_RESOURCES	= 27,
+	IB_CM_REJ_CONSUMER_DEFINED		= 28,
+	IB_CM_REJ_INVALID_RNR_RETRY		= 29,
+	IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID	= 30,
+	IB_CM_REJ_INVALID_CLASS_VERSION		= 31,
+	IB_CM_REJ_INVALID_FLOW_LABEL		= 32,
+	IB_CM_REJ_INVALID_ALT_FLOW_LABEL	= 33
+};
+
+struct ib_cm_rej_event_param {	/* carried as ib_cm_event.param.rej_rcvd */
+	enum ib_cm_rej_reason	reason;
+	void			*ari;	/* NOTE(review): presumably the additional rejection information buffer - confirm */
+	u8			ari_length;	/* NOTE(review): presumably the number of valid bytes at ari - confirm */
+};
+
+struct ib_cm_mra_event_param {	/* carried as ib_cm_event.param.mra_rcvd */
+	u8	service_timeout;
+};
+
+struct ib_cm_lap_event_param {	/* carried as ib_cm_event.param.lap_rcvd */
+	struct ib_sa_path_rec	*alternate_path;
+};
+
+enum ib_cm_apr_status {	/* type of ib_cm_apr_event_param.ap_status; values are numbered implicitly from 0 */
+	IB_CM_APR_SUCCESS,
+	IB_CM_APR_INVALID_COMM_ID,
+	IB_CM_APR_UNSUPPORTED,
+	IB_CM_APR_REJECT,
+	IB_CM_APR_REDIRECT,
+	IB_CM_APR_IS_CURRENT,
+	IB_CM_APR_INVALID_QPN_EECN,
+	IB_CM_APR_INVALID_LID,
+	IB_CM_APR_INVALID_GID,
+	IB_CM_APR_INVALID_FLOW_LABEL,
+	IB_CM_APR_INVALID_TCLASS,
+	IB_CM_APR_INVALID_HOP_LIMIT,
+	IB_CM_APR_INVALID_PACKET_RATE,
+	IB_CM_APR_INVALID_SL
+};
+
+struct ib_cm_apr_event_param {	/* carried as ib_cm_event.param.apr_rcvd */
+	enum ib_cm_apr_status	ap_status;
+	void			*apr_info;
+	u8			info_len;	/* NOTE(review): presumably the number of valid bytes at apr_info - confirm */
+};
+
+struct ib_cm_sidr_req_event_param {	/* carried as ib_cm_event.param.sidr_req_rcvd */
+	struct ib_cm_id		*listen_id;
+	u8			port;
+	u16			pkey;
+};
+
+enum ib_cm_sidr_status {	/* type of ib_cm_sidr_rep_event_param.status; values are numbered implicitly from 0 */
+	IB_SIDR_SUCCESS,
+	IB_SIDR_UNSUPPORTED,
+	IB_SIDR_REJECT,
+	IB_SIDR_NO_QP,
+	IB_SIDR_REDIRECT,
+	IB_SIDR_UNSUPPORTED_VERSION
+};
+
+struct ib_cm_sidr_rep_event_param {	/* carried as ib_cm_event.param.sidr_rep_rcvd */
+	enum ib_cm_sidr_status	status;
+	u32			qkey;
+	u32			qpn;
+	void			*info;
+	u8			info_len;	/* NOTE(review): presumably the number of valid bytes at info - confirm */
+};
+
+struct ib_cm_event {	/* event record passed to an ib_cm_handler callback */
+	enum ib_cm_event_type	event;	/* NOTE(review): presumably selects which param member is meaningful - confirm */
+	union {
+		struct ib_cm_req_event_param	req_rcvd;
+		struct ib_cm_rep_event_param	rep_rcvd;
+		/* No data for RTU received events. */
+		struct ib_cm_rej_event_param	rej_rcvd;
+		struct ib_cm_mra_event_param	mra_rcvd;
+		struct ib_cm_lap_event_param	lap_rcvd;
+		struct ib_cm_apr_event_param	apr_rcvd;
+		/* No data for DREQ/DREP received events. */
+		struct ib_cm_sidr_req_event_param sidr_req_rcvd;
+		struct ib_cm_sidr_rep_event_param sidr_rep_rcvd;
+		enum ib_wc_status		send_status;	/* NOTE(review): presumably used by the *_ERROR event types - confirm */
+	} param;
+
+	void			*private_data;	/* sits outside the union, so it is present for every event type */
+};
+
+/**
+ * ib_cm_handler - User-defined callback to process communication events.
+ * @cm_id: Communication identifier associated with the reported event.
+ * @event: Information about the communication event.
+ *
+ * IB_CM_REQ_RECEIVED and IB_CM_SIDR_REQ_RECEIVED communication events
+ * generated as a result of listen requests result in the allocation of a
+ * new @cm_id.  The new @cm_id is returned to the user through this callback.
+ * Clients are responsible for destroying the new @cm_id.  For peer-to-peer
+ * IB_CM_REQ_RECEIVED and all other events, the returned @cm_id corresponds
+ * to a user's existing communication identifier.
+ *
+ * Users may not call ib_destroy_cm_id while in the context of this callback;
+ * however, returning a non-zero value instructs the communication manager to
+ * destroy the @cm_id after the callback completes.
+ */
+typedef int (*ib_cm_handler)(struct ib_cm_id *cm_id,
+			     struct ib_cm_event *event);
+
+struct ib_cm_id {
+	ib_cm_handler		cm_handler;
+	void			*context;
+	struct ib_device	*device;
+	__be64			service_id;
+	__be64			service_mask;
+	enum ib_cm_state	state;		/* internal CM/debug use */
+	enum ib_cm_lap_state	lap_state;	/* internal CM/debug use */
+	__be32			local_id;
+	__be32			remote_id;
+	u32			remote_cm_qpn;  /* 1 unless redirected */
+};
+
+/**
+ * ib_create_cm_id - Allocate a communication identifier.
+ * @device: Device associated with the cm_id.  All related communication will
+ * be associated with the specified device.
+ * @cm_handler: Callback invoked to notify the user of CM events.
+ * @context: User specified context associated with the communication
+ *   identifier.
+ *
+ * Communication identifiers are used to track connection states, service
+ * ID resolution requests, and listen requests.
+ */
+struct ib_cm_id *ib_create_cm_id(struct ib_device *device,
+				 ib_cm_handler cm_handler,
+				 void *context);
+
+/**
+ * ib_destroy_cm_id - Destroy a connection identifier.
+ * @cm_id: Connection identifier to destroy.
+ *
+ * This call blocks until the connection identifier is destroyed.
+ */
+void ib_destroy_cm_id(struct ib_cm_id *cm_id);
+
+#define IB_SERVICE_ID_AGN_MASK	cpu_to_be64(0xFF00000000000000ULL)
+#define IB_CM_ASSIGN_SERVICE_ID	cpu_to_be64(0x0200000000000000ULL)
+#define IB_CMA_SERVICE_ID	cpu_to_be64(0x0000000001000000ULL)
+#define IB_CMA_SERVICE_ID_MASK	cpu_to_be64(0xFFFFFFFFFF000000ULL)
+#define IB_SDP_SERVICE_ID	cpu_to_be64(0x0000000000010000ULL)
+#define IB_SDP_SERVICE_ID_MASK	cpu_to_be64(0xFFFFFFFFFFFF0000ULL)
+
+struct ib_cm_compare_data {
+	u8  data[IB_CM_COMPARE_SIZE];
+	u8  mask[IB_CM_COMPARE_SIZE];
+};
+
+/**
+ * ib_cm_listen - Initiates listening on the specified service ID for
+ *   connection and service ID resolution requests.
+ * @cm_id: Connection identifier associated with the listen request.
+ * @service_id: Service identifier matched against incoming connection
+ *   and service ID resolution requests.  The service ID should be specified
+ *   in network-byte order.  If set to IB_CM_ASSIGN_SERVICE_ID, the CM will
+ *   assign a service ID to the caller.
+ * @service_mask: Mask applied to service ID used to listen across a
+ *   range of service IDs.  If set to 0, the service ID is matched
+ *   exactly.  This parameter is ignored if %service_id is set to
+ *   IB_CM_ASSIGN_SERVICE_ID.
+ * @compare_data: This parameter is optional.  It specifies data that must
+ *   appear in the private data of a connection request for the specified
+ *   listen request.
+ */
+int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
+		 struct ib_cm_compare_data *compare_data);
+
+struct ib_cm_req_param {
+	struct ib_sa_path_rec	*primary_path;
+	struct ib_sa_path_rec	*alternate_path;
+	__be64			service_id;
+	u32			qp_num;
+	enum ib_qp_type		qp_type;
+	u32			starting_psn;
+	const void		*private_data;
+	u8			private_data_len;
+	u8			peer_to_peer;
+	u8			responder_resources;
+	u8			initiator_depth;
+	u8			remote_cm_response_timeout;
+	u8			flow_control;
+	u8			local_cm_response_timeout;
+	u8			retry_count;
+	u8			rnr_retry_count;
+	u8			max_cm_retries;
+	u8			srq;
+};
+
+/**
+ * ib_send_cm_req - Sends a connection request to the remote node.
+ * @cm_id: Connection identifier that will be associated with the
+ *   connection request.
+ * @param: Connection request information needed to establish the
+ *   connection.
+ */
+int ib_send_cm_req(struct ib_cm_id *cm_id,
+		   struct ib_cm_req_param *param);
+
+struct ib_cm_rep_param {
+	u32		qp_num;
+	u32		starting_psn;
+	const void	*private_data;
+	u8		private_data_len;
+	u8		responder_resources;
+	u8		initiator_depth;
+	u8		failover_accepted;
+	u8		flow_control;
+	u8		rnr_retry_count;
+	u8		srq;
+};
+
+/**
+ * ib_send_cm_rep - Sends a connection reply in response to a connection
+ *   request.
+ * @cm_id: Connection identifier that will be associated with the
+ *   connection request.
+ * @param: Connection reply information needed to establish the
+ *   connection.
+ */
+int ib_send_cm_rep(struct ib_cm_id *cm_id,
+		   struct ib_cm_rep_param *param);
+
+/**
+ * ib_send_cm_rtu - Sends a connection ready to use message in response
+ *   to a connection reply message.
+ * @cm_id: Connection identifier associated with the connection request.
+ * @private_data: Optional user-defined private data sent with the
+ *   ready to use message.
+ * @private_data_len: Size of the private data buffer, in bytes.
+ */
+int ib_send_cm_rtu(struct ib_cm_id *cm_id,
+		   const void *private_data,
+		   u8 private_data_len);
+
+/**
+ * ib_send_cm_dreq - Sends a disconnection request for an existing
+ *   connection.
+ * @cm_id: Connection identifier associated with the connection being
+ *   released.
+ * @private_data: Optional user-defined private data sent with the
+ *   disconnection request message.
+ * @private_data_len: Size of the private data buffer, in bytes.
+ */
+int ib_send_cm_dreq(struct ib_cm_id *cm_id,
+		    const void *private_data,
+		    u8 private_data_len);
+
+/**
+ * ib_send_cm_drep - Sends a disconnection reply to a disconnection request.
+ * @cm_id: Connection identifier associated with the connection being
+ *   released.
+ * @private_data: Optional user-defined private data sent with the
+ *   disconnection reply message.
+ * @private_data_len: Size of the private data buffer, in bytes.
+ *
+ * If the cm_id is in the correct state, the CM will transition the connection
+ * to the timewait state, even if an error occurs sending the DREP message.
+ */
+int ib_send_cm_drep(struct ib_cm_id *cm_id,
+		    const void *private_data,
+		    u8 private_data_len);
+
+/**
+ * ib_cm_notify - Notifies the CM of an event reported to the consumer.
+ * @cm_id: Connection identifier to transition to established.
+ * @event: Type of event.
+ *
+ * This routine should be invoked by users to notify the CM of relevant
+ * communication events.  Events that should be reported to the CM and
+ * when to report them are:
+ *
+ * IB_EVENT_COMM_EST - Used when a message is received on a connected
+ *    QP before an RTU has been received.
+ * IB_EVENT_PATH_MIG - Notifies the CM that the connection has failed over
+ *   to the alternate path.
+ */
+int ib_cm_notify(struct ib_cm_id *cm_id, enum ib_event_type event);
+
+/**
+ * ib_send_cm_rej - Sends a connection rejection message to the
+ *   remote node.
+ * @cm_id: Connection identifier associated with the connection being
+ *   rejected.
+ * @reason: Reason for the connection request rejection.
+ * @ari: Optional additional rejection information.
+ * @ari_length: Size of the additional rejection information, in bytes.
+ * @private_data: Optional user-defined private data sent with the
+ *   rejection message.
+ * @private_data_len: Size of the private data buffer, in bytes.
+ */
+int ib_send_cm_rej(struct ib_cm_id *cm_id,
+		   enum ib_cm_rej_reason reason,
+		   void *ari,
+		   u8 ari_length,
+		   const void *private_data,
+		   u8 private_data_len);
+
+#define IB_CM_MRA_FLAG_DELAY 0x80  /* Send MRA only after a duplicate msg */
+
+/**
+ * ib_send_cm_mra - Sends a message receipt acknowledgement to a connection
+ *   message.
+ * @cm_id: Connection identifier associated with the connection message.
+ * @service_timeout: The lower 5-bits specify the maximum time required for
+ *   the sender to reply to the connection message.  The upper 3-bits
+ *   specify additional control flags.
+ * @private_data: Optional user-defined private data sent with the
+ *   message receipt acknowledgement.
+ * @private_data_len: Size of the private data buffer, in bytes.
+ */
+int ib_send_cm_mra(struct ib_cm_id *cm_id,
+		   u8 service_timeout,
+		   const void *private_data,
+		   u8 private_data_len);
+
+/**
+ * ib_send_cm_lap - Sends a load alternate path request.
+ * @cm_id: Connection identifier associated with the load alternate path
+ *   message.
+ * @alternate_path: A path record that identifies the alternate path to
+ *   load.
+ * @private_data: Optional user-defined private data sent with the
+ *   load alternate path message.
+ * @private_data_len: Size of the private data buffer, in bytes.
+ */
+int ib_send_cm_lap(struct ib_cm_id *cm_id,
+		   struct ib_sa_path_rec *alternate_path,
+		   const void *private_data,
+		   u8 private_data_len);
+
+/**
+ * ib_cm_init_qp_attr - Initializes the QP attributes for use in transitioning
+ *   to a specified QP state.
+ * @cm_id: Communication identifier associated with the QP attributes to
+ *   initialize.
+ * @qp_attr: On input, specifies the desired QP state.  On output, the
+ *   mandatory and desired optional attributes will be set in order to
+ *   modify the QP to the specified state.
+ * @qp_attr_mask: The QP attribute mask that may be used to transition the
+ *   QP to the specified state.
+ *
+ * Users must set the @qp_attr->qp_state to the desired QP state.  This call
+ * will set all required attributes for the given transition, along with
+ * known optional attributes.  Users may override the attributes returned from
+ * this call before calling ib_modify_qp.
+ */
+int ib_cm_init_qp_attr(struct ib_cm_id *cm_id,
+		       struct ib_qp_attr *qp_attr,
+		       int *qp_attr_mask);
+
+/**
+ * ib_send_cm_apr - Sends an alternate path response message in response to
+ *   a load alternate path request.
+ * @cm_id: Connection identifier associated with the alternate path response.
+ * @status: Reply status sent with the alternate path response.
+ * @info: Optional additional information sent with the alternate path
+ *   response.
+ * @info_length: Size of the additional information, in bytes.
+ * @private_data: Optional user-defined private data sent with the
+ *   alternate path response message.
+ * @private_data_len: Size of the private data buffer, in bytes.
+ */
+int ib_send_cm_apr(struct ib_cm_id *cm_id,
+		   enum ib_cm_apr_status status,
+		   void *info,
+		   u8 info_length,
+		   const void *private_data,
+		   u8 private_data_len);
+
+struct ib_cm_sidr_req_param {
+	struct ib_sa_path_rec	*path;
+	__be64			service_id;
+	int			timeout_ms;
+	const void		*private_data;
+	u8			private_data_len;
+	u8			max_cm_retries;
+};
+
+/**
+ * ib_send_cm_sidr_req - Sends a service ID resolution request to the
+ *   remote node.
+ * @cm_id: Communication identifier that will be associated with the
+ *   service ID resolution request.
+ * @param: Service ID resolution request information.
+ */
+int ib_send_cm_sidr_req(struct ib_cm_id *cm_id,
+			struct ib_cm_sidr_req_param *param);
+
+struct ib_cm_sidr_rep_param {
+	u32			qp_num;
+	u32			qkey;
+	enum ib_cm_sidr_status	status;
+	const void		*info;
+	u8			info_length;
+	const void		*private_data;
+	u8			private_data_len;
+};
+
+/**
+ * ib_send_cm_sidr_rep - Sends a service ID resolution reply to the
+ *   remote node.
+ * @cm_id: Communication identifier associated with the received service ID
+ *   resolution request.
+ * @param: Service ID resolution reply information.
+ */
+int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id,
+			struct ib_cm_sidr_rep_param *param);
+
+#endif /* IB_CM_H */
diff --git a/sys/ofed/include/rdma/ib_fmr_pool.h b/sys/ofed/include/rdma/ib_fmr_pool.h
new file mode 100644
index 0000000..f62b842
--- /dev/null
+++ b/sys/ofed/include/rdma/ib_fmr_pool.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(IB_FMR_POOL_H)
+#define IB_FMR_POOL_H
+
+#include <rdma/ib_verbs.h>
+
+struct ib_fmr_pool;
+
+/**
+ * struct ib_fmr_pool_param - Parameters for creating FMR pool
+ * @max_pages_per_fmr:Maximum number of pages per map request.
+ * @page_shift: Log2 of sizeof "pages" mapped by this fmr
+ * @access:Access flags for FMRs in pool.
+ * @pool_size:Number of FMRs to allocate for pool.
+ * @dirty_watermark:Flush is triggered when @dirty_watermark dirty
+ *     FMRs are present.
+ * @flush_function:Callback called when unmapped FMRs are flushed and
+ *     more FMRs are possibly available for mapping
+ * @flush_arg:Context passed to user's flush function.
+ * @cache:If set, FMRs may be reused after unmapping for identical map
+ *     requests.
+ */
+struct ib_fmr_pool_param {
+	int                     max_pages_per_fmr;
+	int                     page_shift;
+	enum ib_access_flags    access;
+	int                     pool_size;
+	int                     dirty_watermark;
+	void                  (*flush_function)(struct ib_fmr_pool *pool,
+						void               *arg);
+	void                   *flush_arg;
+	unsigned                cache:1;
+};
+
+struct ib_pool_fmr {
+	struct ib_fmr      *fmr;
+	struct ib_fmr_pool *pool;
+	struct list_head    list;
+	struct hlist_node   cache_node;
+	int                 ref_count;
+	int                 remap_count;
+	u64                 io_virtual_address;
+	int                 page_list_len;
+	u64                 page_list[0];
+};
+
+struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd             *pd,
+				       struct ib_fmr_pool_param *params);
+
+void ib_destroy_fmr_pool(struct ib_fmr_pool *pool);
+
+int ib_flush_fmr_pool(struct ib_fmr_pool *pool);
+
+struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle,
+					 u64                *page_list,
+					 int                 list_len,
+					 u64                 io_virtual_address);
+
+int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr);
+
+#endif /* IB_FMR_POOL_H */
diff --git a/sys/ofed/include/rdma/ib_mad.h b/sys/ofed/include/rdma/ib_mad.h
new file mode 100644
index 0000000..d3b9401
--- /dev/null
+++ b/sys/ofed/include/rdma/ib_mad.h
@@ -0,0 +1,655 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd.  All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation.  All rights reserved.
+ * Copyright (c) 2004 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004-2006 Voltaire Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(IB_MAD_H)
+#define IB_MAD_H
+
+#include <linux/list.h>
+
+#include <rdma/ib_verbs.h>
+
+/* Management base version */
+#define IB_MGMT_BASE_VERSION			1
+
+/* Management classes */
+#define IB_MGMT_CLASS_SUBN_LID_ROUTED		0x01
+#define IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE	0x81
+#define IB_MGMT_CLASS_SUBN_ADM			0x03
+#define IB_MGMT_CLASS_PERF_MGMT			0x04
+#define IB_MGMT_CLASS_BM			0x05
+#define IB_MGMT_CLASS_DEVICE_MGMT		0x06
+#define IB_MGMT_CLASS_CM			0x07
+#define IB_MGMT_CLASS_SNMP			0x08
+#define IB_MGMT_CLASS_DEVICE_ADM		0x10
+#define IB_MGMT_CLASS_BOOT_MGMT			0x11
+#define IB_MGMT_CLASS_BIS			0x12
+#define IB_MGMT_CLASS_CONG_MGMT			0x21
+#define IB_MGMT_CLASS_VENDOR_RANGE2_START	0x30
+#define IB_MGMT_CLASS_VENDOR_RANGE2_END		0x4F
+
+#define	IB_OPENIB_OUI				(0x001405)
+
+/* Management methods */
+#define IB_MGMT_METHOD_GET			0x01
+#define IB_MGMT_METHOD_SET			0x02
+#define IB_MGMT_METHOD_GET_RESP			0x81
+#define IB_MGMT_METHOD_SEND			0x03
+#define IB_MGMT_METHOD_TRAP			0x05
+#define IB_MGMT_METHOD_REPORT			0x06
+#define IB_MGMT_METHOD_REPORT_RESP		0x86
+#define IB_MGMT_METHOD_TRAP_REPRESS		0x07
+
+#define IB_MGMT_METHOD_RESP			0x80
+#define IB_BM_ATTR_MOD_RESP			cpu_to_be32(1)
+
+#define IB_MGMT_MAX_METHODS			128
+
+/* RMPP information */
+#define IB_MGMT_RMPP_VERSION			1
+
+#define IB_MGMT_RMPP_TYPE_DATA			1
+#define IB_MGMT_RMPP_TYPE_ACK			2
+#define IB_MGMT_RMPP_TYPE_STOP			3
+#define IB_MGMT_RMPP_TYPE_ABORT			4
+
+#define IB_MGMT_RMPP_FLAG_ACTIVE		1
+#define IB_MGMT_RMPP_FLAG_FIRST			(1<<1)
+#define IB_MGMT_RMPP_FLAG_LAST			(1<<2)
+
+#define IB_MGMT_RMPP_NO_RESPTIME		0x1F
+
+#define	IB_MGMT_RMPP_STATUS_SUCCESS		0
+#define	IB_MGMT_RMPP_STATUS_RESX		1
+#define	IB_MGMT_RMPP_STATUS_ABORT_MIN		118
+#define	IB_MGMT_RMPP_STATUS_T2L			118
+#define	IB_MGMT_RMPP_STATUS_BAD_LEN		119
+#define	IB_MGMT_RMPP_STATUS_BAD_SEG		120
+#define	IB_MGMT_RMPP_STATUS_BADT		121
+#define	IB_MGMT_RMPP_STATUS_W2S			122
+#define	IB_MGMT_RMPP_STATUS_S2B			123
+#define	IB_MGMT_RMPP_STATUS_BAD_STATUS		124
+#define	IB_MGMT_RMPP_STATUS_UNV			125
+#define	IB_MGMT_RMPP_STATUS_TMR			126
+#define	IB_MGMT_RMPP_STATUS_UNSPEC		127
+#define	IB_MGMT_RMPP_STATUS_ABORT_MAX		127
+
+#define IB_QP0		0
+#define IB_QP1		cpu_to_be32(1)
+#define IB_QP1_QKEY	0x80010000
+#define IB_QP_SET_QKEY	0x80000000
+
+#define IB_DEFAULT_PKEY_PARTIAL 0x7FFF
+#define IB_DEFAULT_PKEY_FULL	0xFFFF
+
+enum {
+	IB_MGMT_MAD_HDR = 24,
+	IB_MGMT_MAD_DATA = 232,
+	IB_MGMT_RMPP_HDR = 36,
+	IB_MGMT_RMPP_DATA = 220,
+	IB_MGMT_VENDOR_HDR = 40,
+	IB_MGMT_VENDOR_DATA = 216,
+	IB_MGMT_SA_HDR = 56,
+	IB_MGMT_SA_DATA = 200,
+	IB_MGMT_DEVICE_HDR = 64,
+	IB_MGMT_DEVICE_DATA = 192,
+};
+
+struct ib_mad_hdr {
+	u8	base_version;
+	u8	mgmt_class;
+	u8	class_version;
+	u8	method;
+	__be16	status;
+	__be16	class_specific;
+	__be64	tid;
+	__be16	attr_id;
+	__be16	resv;
+	__be32	attr_mod;
+};
+
+struct ib_rmpp_hdr {
+	u8	rmpp_version;
+	u8	rmpp_type;
+	u8	rmpp_rtime_flags;
+	u8	rmpp_status;
+	__be32	seg_num;
+	__be32	paylen_newwin;
+};
+
+typedef u64 __bitwise ib_sa_comp_mask;
+
+#define IB_SA_COMP_MASK(n) ((__force ib_sa_comp_mask) cpu_to_be64(1ull << (n)))
+
+/*
+ * ib_sa_hdr and ib_sa_mad structures must be packed because they have
+ * 64-bit fields that are only 32-bit aligned. 64-bit architectures will
+ * lay them out wrong otherwise.  (And unfortunately they are sent on
+ * the wire so we can't change the layout)
+ */
+struct ib_sa_hdr {
+	__be64			sm_key;
+	__be16			attr_offset;
+	__be16			reserved;
+	ib_sa_comp_mask		comp_mask;
+} __attribute__ ((packed));
+
+struct ib_mad {
+	struct ib_mad_hdr	mad_hdr;
+	u8			data[IB_MGMT_MAD_DATA];
+};
+
+struct ib_rmpp_mad {
+	struct ib_mad_hdr	mad_hdr;
+	struct ib_rmpp_hdr	rmpp_hdr;
+	u8			data[IB_MGMT_RMPP_DATA];
+};
+
+struct ib_sa_mad {
+	struct ib_mad_hdr	mad_hdr;
+	struct ib_rmpp_hdr	rmpp_hdr;
+	struct ib_sa_hdr	sa_hdr;
+	u8			data[IB_MGMT_SA_DATA];
+} __attribute__ ((packed));
+
+struct ib_vendor_mad {
+	struct ib_mad_hdr	mad_hdr;
+	struct ib_rmpp_hdr	rmpp_hdr;
+	u8			reserved;
+	u8			oui[3];
+	u8			data[IB_MGMT_VENDOR_DATA];
+};
+
+struct ib_class_port_info {
+	u8			base_version;
+	u8			class_version;
+	__be16			capability_mask;
+	u8			reserved[3];
+	u8			resp_time_value;
+	u8			redirect_gid[16];
+	__be32			redirect_tcslfl;
+	__be16			redirect_lid;
+	__be16			redirect_pkey;
+	__be32			redirect_qp;
+	__be32			redirect_qkey;
+	u8			trap_gid[16];
+	__be32			trap_tcslfl;
+	__be16			trap_lid;
+	__be16			trap_pkey;
+	__be32			trap_hlqp;
+	__be32			trap_qkey;
+};
+
+/**
+ * ib_mad_send_buf - MAD data buffer and work request for sends.
+ * @next: A pointer used to chain together MADs for posting.
+ * @mad: References an allocated MAD data buffer for MADs that do not have
+ *   RMPP active.  For MADs using RMPP, references the common and management
+ *   class specific headers.
+ * @mad_agent: MAD agent that allocated the buffer.
+ * @ah: The address handle to use when sending the MAD.
+ * @context: User-controlled context fields.
+ * @hdr_len: Indicates the size of the data header of the MAD.  This length
+ *   includes the common MAD, RMPP, and class specific headers.
+ * @data_len: Indicates the total size of user-transferred data.
+ * @seg_count: The number of RMPP segments allocated for this send.
+ * @seg_size: Size of each RMPP segment.
+ * @timeout_ms: Time to wait for a response.
+ * @retries: Number of times to retry a request for a response.  For MADs
+ *   using RMPP, this applies per window.  On completion, returns the number
+ *   of retries needed to complete the transfer.
+ *
+ * Users are responsible for initializing the MAD buffer itself, with the
+ * exception of any RMPP header.  Additional segment buffer space allocated
+ * beyond data_len is padding.
+ */
+struct ib_mad_send_buf {
+	struct ib_mad_send_buf	*next;
+	void			*mad;
+	struct ib_mad_agent	*mad_agent;
+	struct ib_ah		*ah;
+	void			*context[2];
+	int			hdr_len;
+	int			data_len;
+	int			seg_count;
+	int			seg_size;
+	int			timeout_ms;
+	int			retries;
+};
+
+/**
+ * ib_response_mad - Returns if the specified MAD has been generated in
+ *   response to a sent request or trap.
+ */
+int ib_response_mad(struct ib_mad *mad);
+
+/**
+ * ib_get_rmpp_resptime - RMPP response time (rmpp_rtime_flags bits 7:3).
+ * @rmpp_hdr: An RMPP header; only read, and dereferenced unconditionally.
+ */
+static inline u8 ib_get_rmpp_resptime(struct ib_rmpp_hdr *rmpp_hdr)
+{
+	return rmpp_hdr->rmpp_rtime_flags >> 3;
+}
+
+/**
+ * ib_get_rmpp_flags - RMPP flags (rmpp_rtime_flags bits 2:0).
+ * @rmpp_hdr: An RMPP header; only read, and dereferenced unconditionally.
+ */
+static inline u8 ib_get_rmpp_flags(struct ib_rmpp_hdr *rmpp_hdr)
+{
+	return rmpp_hdr->rmpp_rtime_flags & 0x7;
+}
+
+/**
+ * ib_set_rmpp_resptime - Sets the response time, keeping the RMPP flags.
+ * @rmpp_hdr: An RMPP header; its rmpp_rtime_flags field is rewritten.
+ * @rtime: The response time to set; stored shifted left by 3 bits.
+ */
+static inline void ib_set_rmpp_resptime(struct ib_rmpp_hdr *rmpp_hdr, u8 rtime)
+{
+	rmpp_hdr->rmpp_rtime_flags = ib_get_rmpp_flags(rmpp_hdr) | (rtime << 3);
+}
+
+/**
+ * ib_set_rmpp_flags - Sets the flags in an RMPP header, keeping bits 7:3.
+ * @rmpp_hdr: An RMPP header; its rmpp_rtime_flags field is rewritten.
+ * @flags: The flags to set; only the low 3 bits are used.
+ */
+static inline void ib_set_rmpp_flags(struct ib_rmpp_hdr *rmpp_hdr, u8 flags)
+{
+	rmpp_hdr->rmpp_rtime_flags = (rmpp_hdr->rmpp_rtime_flags & 0xF8) |
+				     (flags & 0x7);
+}
+
+struct ib_mad_agent;
+struct ib_mad_send_wc;
+struct ib_mad_recv_wc;
+
+/**
+ * ib_mad_send_handler - callback handler for a sent MAD.
+ * @mad_agent: MAD agent that sent the MAD.
+ * @mad_send_wc: Send work completion information on the sent MAD.
+ */
+typedef void (*ib_mad_send_handler)(struct ib_mad_agent *mad_agent,
+				    struct ib_mad_send_wc *mad_send_wc);
+
+/**
+ * ib_mad_snoop_handler - Callback handler for snooping sent MADs.
+ * @mad_agent: MAD agent that snooped the MAD.
+ * @send_buf: Send buffer information on the sent MAD.
+ * @mad_send_wc: Work completion information on the sent MAD.  Valid
+ *   only for snooping that occurs on a send completion.
+ *
+ * Clients snooping MADs should not modify data referenced by the @send_buf
+ * or @mad_send_wc.
+ */
+typedef void (*ib_mad_snoop_handler)(struct ib_mad_agent *mad_agent,
+				     struct ib_mad_send_buf *send_buf,
+				     struct ib_mad_send_wc *mad_send_wc);
+
+/**
+ * ib_mad_recv_handler - callback handler for a received MAD.
+ * @mad_agent: MAD agent requesting the received MAD.
+ * @mad_recv_wc: Received work completion information on the received MAD.
+ *
+ * MADs received in response to a send request operation will be handed to
+ * the user before the send operation completes.  All data buffers given
+ * to registered agents through this routine are owned by the receiving
+ * client, except for snooping agents.  Clients snooping MADs should not
+ * modify the data referenced by @mad_recv_wc.
+ */
+typedef void (*ib_mad_recv_handler)(struct ib_mad_agent *mad_agent,
+				    struct ib_mad_recv_wc *mad_recv_wc);
+
+/**
+ * ib_mad_agent - Used to track MAD registration with the access layer.
+ * @device: Reference to device registration is on.
+ * @qp: Reference to QP used for sending and receiving MADs.
+ * @mr: Memory region for system memory usable for DMA.
+ * @recv_handler: Callback handler for a received MAD.
+ * @send_handler: Callback handler for a sent MAD.
+ * @snoop_handler: Callback handler for snooped sent MADs.
+ * @context: User-specified context associated with this registration.
+ * @hi_tid: Access layer assigned transaction ID for this client.
+ *   Unsolicited MADs sent by this client will have the upper 32-bits
+ *   of their TID set to this value.
+ * @port_num: Port number on which QP is registered
+ * @rmpp_version: If set, indicates the RMPP version used by this agent.
+ */
+struct ib_mad_agent {
+	struct ib_device	*device;
+	struct ib_qp		*qp;
+	struct ib_mr		*mr;
+	ib_mad_recv_handler	recv_handler;
+	ib_mad_send_handler	send_handler;
+	ib_mad_snoop_handler	snoop_handler;
+	void			*context;
+	u32			hi_tid;
+	u8			port_num;
+	u8			rmpp_version;
+};
+
+/**
+ * ib_mad_send_wc - MAD send completion information.
+ * @send_buf: Send MAD data buffer associated with the send MAD request.
+ * @status: Completion status.
+ * @vendor_err: Optional vendor error information returned with a failed
+ *   request.
+ */
+struct ib_mad_send_wc {
+	struct ib_mad_send_buf	*send_buf;
+	enum ib_wc_status	status;
+	u32			vendor_err;
+};
+
+/**
+ * ib_mad_recv_buf - received MAD buffer information.
+ * @list: Reference to next data buffer for a received RMPP MAD.
+ * @grh: References a data buffer containing the global route header.
+ *   The data referenced by this buffer is only valid if the GRH is
+ *   valid.
+ * @mad: References the start of the received MAD.
+ */
+struct ib_mad_recv_buf {
+	struct list_head	list;
+	struct ib_grh		*grh;
+	struct ib_mad		*mad;
+};
+
+/**
+ * ib_mad_recv_wc - received MAD information.
+ * @wc: Completion information for the received data.
+ * @recv_buf: Specifies the location of the received data buffer(s).
+ * @rmpp_list: Specifies a list of RMPP reassembled received MAD buffers.
+ * @mad_len: The length of the received MAD, without duplicated headers.
+ *
+ * For received response, the wr_id contains a pointer to the ib_mad_send_buf
+ *   for the corresponding send request.
+ */
+struct ib_mad_recv_wc {
+	struct ib_wc		*wc;
+	struct ib_mad_recv_buf	recv_buf;
+	struct list_head	rmpp_list;
+	int			mad_len;
+};
+
+/**
+ * ib_mad_reg_req - MAD registration request
+ * @mgmt_class: Indicates which management class of MADs should be received
+ *   by the caller.  This field is only required if the user wishes to
+ *   receive unsolicited MADs, otherwise it should be 0.
+ * @mgmt_class_version: Indicates which version of MADs for the given
+ *   management class to receive.
+ * @oui: Indicates IEEE OUI when mgmt_class is a vendor class
+ *   in the range from 0x30 to 0x4f. Otherwise not used.
+ * @method_mask: The caller will receive unsolicited MADs for any method
+ *   where @method_mask = 1.
+ */
+struct ib_mad_reg_req {
+	u8	mgmt_class;
+	u8	mgmt_class_version;
+	u8	oui[3];
+	DECLARE_BITMAP(method_mask, IB_MGMT_MAX_METHODS);
+};
+
+/**
+ * ib_register_mad_agent - Register to send/receive MADs.
+ * @device: The device to register with.
+ * @port_num: The port on the specified device to use.
+ * @qp_type: Specifies which QP to access.  Must be either
+ *   IB_QPT_SMI or IB_QPT_GSI.
+ * @mad_reg_req: Specifies which unsolicited MADs should be received
+ *   by the caller.  This parameter may be NULL if the caller only
+ *   wishes to receive solicited responses.
+ * @rmpp_version: If set, indicates that the client will send
+ *   and receive MADs that contain the RMPP header for the given version.
+ *   If set to 0, indicates that RMPP is not used by this client.
+ * @send_handler: The completion callback routine invoked after a send
+ *   request has completed.
+ * @recv_handler: The completion callback routine invoked for a received
+ *   MAD.
+ * @context: User specified context associated with the registration.
+ */
+struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
+					   u8 port_num,
+					   enum ib_qp_type qp_type,
+					   struct ib_mad_reg_req *mad_reg_req,
+					   u8 rmpp_version,
+					   ib_mad_send_handler send_handler,
+					   ib_mad_recv_handler recv_handler,
+					   void *context);
+
+enum ib_mad_snoop_flags {
+	/*IB_MAD_SNOOP_POSTED_SENDS	   = 1,*/
+	/*IB_MAD_SNOOP_RMPP_SENDS	   = (1<<1),*/
+	IB_MAD_SNOOP_SEND_COMPLETIONS	   = (1<<2),
+	/*IB_MAD_SNOOP_RMPP_SEND_COMPLETIONS = (1<<3),*/
+	IB_MAD_SNOOP_RECVS		   = (1<<4)
+	/*IB_MAD_SNOOP_RMPP_RECVS	   = (1<<5),*/
+	/*IB_MAD_SNOOP_REDIRECTED_QPS	   = (1<<6)*/
+};
+
+/**
+ * ib_register_mad_snoop - Register to snoop sent and received MADs.
+ * @device: The device to register with.
+ * @port_num: The port on the specified device to use.
+ * @qp_type: Specifies which QP traffic to snoop.  Must be either
+ *   IB_QPT_SMI or IB_QPT_GSI.
+ * @mad_snoop_flags: Specifies information where snooping occurs.
+ * @snoop_handler: The callback routine invoked for a snooped send.
+ * @recv_handler: The callback routine invoked for a snooped receive.
+ * @context: User specified context associated with the registration.
+ */
+struct ib_mad_agent *ib_register_mad_snoop(struct ib_device *device,
+					   u8 port_num,
+					   enum ib_qp_type qp_type,
+					   int mad_snoop_flags,
+					   ib_mad_snoop_handler snoop_handler,
+					   ib_mad_recv_handler recv_handler,
+					   void *context);
+
+/**
+ * ib_unregister_mad_agent - Unregisters a client from using MAD services.
+ * @mad_agent: Corresponding MAD registration request to deregister.
+ *
+ * After invoking this routine, MAD services are no longer usable by the
+ * client on the associated QP.
+ */
+int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent);
+
+/**
+ * ib_post_send_mad - Posts MAD(s) to the send queue of the QP associated
+ *   with the registered client.
+ * @send_buf: Specifies the information needed to send the MAD(s).
+ * @bad_send_buf: Specifies the MAD on which an error was encountered.  This
+ *   parameter is optional if only a single MAD is posted.
+ *
+ * Sent MADs are not guaranteed to complete in the order that they were posted.
+ *
+ * If the MAD requires RMPP, the data buffer should contain a single copy
+ * of the common MAD, RMPP, and class specific headers, followed by the class
+ * defined data.  If the class defined data would not divide evenly into
+ * RMPP segments, then space must be allocated at the end of the referenced
+ * buffer for any required padding.  To indicate the amount of class defined
+ * data being transferred, the paylen_newwin field in the RMPP header should
+ * be set to the size of the class specific header plus the amount of class
+ * defined data being transferred.  The paylen_newwin field should be
+ * specified in network-byte order.
+ */
+int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
+		     struct ib_mad_send_buf **bad_send_buf);
+
+
+/**
+ * ib_free_recv_mad - Returns data buffers used to receive a MAD.
+ * @mad_recv_wc: Work completion information for a received MAD.
+ *
+ * Clients receiving MADs through their ib_mad_recv_handler must call this
+ * routine to return the work completion buffers to the access layer.
+ */
+void ib_free_recv_mad(struct ib_mad_recv_wc *mad_recv_wc);
+
+/**
+ * ib_cancel_mad - Cancels an outstanding send MAD operation.
+ * @mad_agent: Specifies the registration associated with sent MAD.
+ * @send_buf: Indicates the MAD to cancel.
+ *
+ * MADs will be returned to the user through the corresponding
+ * ib_mad_send_handler.
+ */
+void ib_cancel_mad(struct ib_mad_agent *mad_agent,
+		   struct ib_mad_send_buf *send_buf);
+
+/**
+ * ib_modify_mad - Modifies an outstanding send MAD operation.
+ * @mad_agent: Specifies the registration associated with sent MAD.
+ * @send_buf: Indicates the MAD to modify.
+ * @timeout_ms: New timeout value for sent MAD.
+ *
+ * This call will reset the timeout value for a sent MAD to the specified
+ * value.
+ */
+int ib_modify_mad(struct ib_mad_agent *mad_agent,
+		  struct ib_mad_send_buf *send_buf, u32 timeout_ms);
+
+/**
+ * ib_redirect_mad_qp - Registers a QP for MAD services.
+ * @qp: Reference to a QP that requires MAD services.
+ * @rmpp_version: If set, indicates that the client will send
+ *   and receive MADs that contain the RMPP header for the given version.
+ *   If set to 0, indicates that RMPP is not used by this client.
+ * @send_handler: The completion callback routine invoked after a send
+ *   request has completed.
+ * @recv_handler: The completion callback routine invoked for a received
+ *   MAD.
+ * @context: User specified context associated with the registration.
+ *
+ * Use of this call allows clients to use MAD services, such as RMPP,
+ * on user-owned QPs.  After calling this routine, users may send
+ * MADs on the specified QP by calling ib_post_send_mad.
+ */
+struct ib_mad_agent *ib_redirect_mad_qp(struct ib_qp *qp,
+					u8 rmpp_version,
+					ib_mad_send_handler send_handler,
+					ib_mad_recv_handler recv_handler,
+					void *context);
+
+/**
+ * ib_process_mad_wc - Processes a work completion associated with a
+ *   MAD sent or received on a redirected QP.
+ * @mad_agent: Specifies the registered MAD service using the redirected QP.
+ * @wc: References a work completion associated with a sent or received
+ *   MAD segment.
+ *
+ * This routine is used to complete or continue processing on a MAD request.
+ * If the work completion is associated with a send operation, calling
+ * this routine is required to continue an RMPP transfer or to wait for a
+ * corresponding response, if it is a request.  If the work completion is
+ * associated with a receive operation, calling this routine is required to
+ * process an inbound or outbound RMPP transfer, or to match a response MAD
+ * with its corresponding request.
+ */
+int ib_process_mad_wc(struct ib_mad_agent *mad_agent,
+		      struct ib_wc *wc);
+
+/**
+ * ib_create_send_mad - Allocate and initialize a data buffer and work request
+ *   for sending a MAD.
+ * @mad_agent: Specifies the registered MAD service to associate with the MAD.
+ * @remote_qpn: Specifies the QPN of the receiving node.
+ * @pkey_index: Specifies which PKey the MAD will be sent using.  This field
+ *   is valid only if the remote_qpn is QP 1.
+ * @rmpp_active: Indicates if the send will enable RMPP.
+ * @hdr_len: Indicates the size of the data header of the MAD.  This length
+ *   should include the common MAD header, RMPP header, plus any class
+ *   specific header.
+ * @data_len: Indicates the size of any user-transferred data.  The call will
+ *   automatically adjust the allocated buffer size to account for any
+ *   additional padding that may be necessary.
+ * @gfp_mask: GFP mask used for the memory allocation.
+ *
+ * This routine allocates a MAD for sending.  The returned MAD send buffer
+ * will reference a data buffer usable for sending a MAD, along
+ * with an initialized work request structure.  Users may modify the returned
+ * MAD data buffer before posting the send.
+ *
+ * The returned MAD header, class specific headers, and any padding will be
+ * cleared.  Users are responsible for initializing the common MAD header,
+ * any class specific header, and MAD data area.
+ * If @rmpp_active is set, the RMPP header will be initialized for sending.
+ */
+struct ib_mad_send_buf *ib_create_send_mad(struct ib_mad_agent *mad_agent,
+					   u32 remote_qpn, u16 pkey_index,
+					   int rmpp_active,
+					   int hdr_len, int data_len,
+					   gfp_t gfp_mask);
+
+/**
+ * ib_is_mad_class_rmpp - returns whether given management class
+ * supports RMPP.
+ * @mgmt_class: management class
+ *
+ * This routine returns whether the management class supports RMPP.
+ */
+int ib_is_mad_class_rmpp(u8 mgmt_class);
+
+/**
+ * ib_get_mad_data_offset - returns the data offset for a given
+ * management class.
+ * @mgmt_class: management class
+ *
+ * This routine returns the data offset in the MAD for the management
+ * class requested.
+ */
+int ib_get_mad_data_offset(u8 mgmt_class);
+
+/**
+ * ib_get_rmpp_segment - returns the data buffer for a given RMPP segment.
+ * @send_buf: Previously allocated send data buffer.
+ * @seg_num: Number of the segment to return.
+ *
+ * This routine returns a pointer to the data buffer of an RMPP MAD.
+ * Users must provide synchronization to @send_buf around this call.
+ */
+void *ib_get_rmpp_segment(struct ib_mad_send_buf *send_buf, int seg_num);
+
+/**
+ * ib_free_send_mad - Returns data buffers used to send a MAD.
+ * @send_buf: Previously allocated send data buffer.
+ */
+void ib_free_send_mad(struct ib_mad_send_buf *send_buf);
+
+#endif /* IB_MAD_H */
diff --git a/sys/ofed/include/rdma/ib_marshall.h b/sys/ofed/include/rdma/ib_marshall.h
new file mode 100644
index 0000000..db03720
--- /dev/null
+++ b/sys/ofed/include/rdma/ib_marshall.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2005-2006 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(IB_USER_MARSHALL_H)
+#define IB_USER_MARSHALL_H
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_sa.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_user_sa.h>
+
+void ib_copy_qp_attr_to_user(struct ib_uverbs_qp_attr *dst,
+			     struct ib_qp_attr *src);
+
+void ib_copy_ah_attr_to_user(struct ib_uverbs_ah_attr *dst,
+			     struct ib_ah_attr *src);
+
+void ib_copy_path_rec_to_user(struct ib_user_path_rec *dst,
+			      struct ib_sa_path_rec *src);
+
+void ib_copy_path_rec_from_user(struct ib_sa_path_rec *dst,
+				struct ib_user_path_rec *src);
+
+#endif /* IB_USER_MARSHALL_H */
diff --git a/sys/ofed/include/rdma/ib_pack.h b/sys/ofed/include/rdma/ib_pack.h
new file mode 100644
index 0000000..af615a4
--- /dev/null
+++ b/sys/ofed/include/rdma/ib_pack.h
@@ -0,0 +1,269 @@
+/*
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IB_PACK_H
+#define IB_PACK_H
+
+#include <rdma/ib_verbs.h>
+
+enum {
+	IB_LRH_BYTES  = 8,
+	IB_ETH_BYTES  = 14,
+	IB_VLAN_BYTES = 4,
+	IB_GRH_BYTES  = 40,
+	IB_BTH_BYTES  = 12,
+	IB_DETH_BYTES = 8
+};
+
+struct ib_field {
+	size_t struct_offset_bytes;
+	size_t struct_size_bytes;
+	int    offset_words;
+	int    offset_bits;
+	int    size_bits;
+	char  *field_name;
+};
+
+#define RESERVED \
+	.field_name          = "reserved"
+
+/*
+ * This macro cleans up the definitions of constants for BTH opcodes.
+ * IB_OPCODE(UD, SEND_ONLY), for example, expands to the enumerator
+ * definition IB_OPCODE_UD_SEND_ONLY = IB_OPCODE_UD + IB_OPCODE_SEND_ONLY,
+ * i.e. the transport type value plus the operation value.
+ *
+ * In short, user code should use the constants defined using the
+ * macro rather than worrying about adding together other constants.
+ */
+#define IB_OPCODE(transport, op) \
+	IB_OPCODE_ ## transport ## _ ## op = \
+		IB_OPCODE_ ## transport + IB_OPCODE_ ## op
+
+enum {
+	/* transport types -- just used to define real constants */
+	IB_OPCODE_RC                                = 0x00,
+	IB_OPCODE_UC                                = 0x20,
+	IB_OPCODE_RD                                = 0x40,
+	IB_OPCODE_UD                                = 0x60,
+
+	/* operations -- just used to define real constants */
+	IB_OPCODE_SEND_FIRST                        = 0x00,
+	IB_OPCODE_SEND_MIDDLE                       = 0x01,
+	IB_OPCODE_SEND_LAST                         = 0x02,
+	IB_OPCODE_SEND_LAST_WITH_IMMEDIATE          = 0x03,
+	IB_OPCODE_SEND_ONLY                         = 0x04,
+	IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE          = 0x05,
+	IB_OPCODE_RDMA_WRITE_FIRST                  = 0x06,
+	IB_OPCODE_RDMA_WRITE_MIDDLE                 = 0x07,
+	IB_OPCODE_RDMA_WRITE_LAST                   = 0x08,
+	IB_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE    = 0x09,
+	IB_OPCODE_RDMA_WRITE_ONLY                   = 0x0a,
+	IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE    = 0x0b,
+	IB_OPCODE_RDMA_READ_REQUEST                 = 0x0c,
+	IB_OPCODE_RDMA_READ_RESPONSE_FIRST          = 0x0d,
+	IB_OPCODE_RDMA_READ_RESPONSE_MIDDLE         = 0x0e,
+	IB_OPCODE_RDMA_READ_RESPONSE_LAST           = 0x0f,
+	IB_OPCODE_RDMA_READ_RESPONSE_ONLY           = 0x10,
+	IB_OPCODE_ACKNOWLEDGE                       = 0x11,
+	IB_OPCODE_ATOMIC_ACKNOWLEDGE                = 0x12,
+	IB_OPCODE_COMPARE_SWAP                      = 0x13,
+	IB_OPCODE_FETCH_ADD                         = 0x14,
+
+	/* real constants follow -- see the comment above the IB_OPCODE()
+	   macro for more details */
+
+	/* RC */
+	IB_OPCODE(RC, SEND_FIRST),
+	IB_OPCODE(RC, SEND_MIDDLE),
+	IB_OPCODE(RC, SEND_LAST),
+	IB_OPCODE(RC, SEND_LAST_WITH_IMMEDIATE),
+	IB_OPCODE(RC, SEND_ONLY),
+	IB_OPCODE(RC, SEND_ONLY_WITH_IMMEDIATE),
+	IB_OPCODE(RC, RDMA_WRITE_FIRST),
+	IB_OPCODE(RC, RDMA_WRITE_MIDDLE),
+	IB_OPCODE(RC, RDMA_WRITE_LAST),
+	IB_OPCODE(RC, RDMA_WRITE_LAST_WITH_IMMEDIATE),
+	IB_OPCODE(RC, RDMA_WRITE_ONLY),
+	IB_OPCODE(RC, RDMA_WRITE_ONLY_WITH_IMMEDIATE),
+	IB_OPCODE(RC, RDMA_READ_REQUEST),
+	IB_OPCODE(RC, RDMA_READ_RESPONSE_FIRST),
+	IB_OPCODE(RC, RDMA_READ_RESPONSE_MIDDLE),
+	IB_OPCODE(RC, RDMA_READ_RESPONSE_LAST),
+	IB_OPCODE(RC, RDMA_READ_RESPONSE_ONLY),
+	IB_OPCODE(RC, ACKNOWLEDGE),
+	IB_OPCODE(RC, ATOMIC_ACKNOWLEDGE),
+	IB_OPCODE(RC, COMPARE_SWAP),
+	IB_OPCODE(RC, FETCH_ADD),
+
+	/* UC */
+	IB_OPCODE(UC, SEND_FIRST),
+	IB_OPCODE(UC, SEND_MIDDLE),
+	IB_OPCODE(UC, SEND_LAST),
+	IB_OPCODE(UC, SEND_LAST_WITH_IMMEDIATE),
+	IB_OPCODE(UC, SEND_ONLY),
+	IB_OPCODE(UC, SEND_ONLY_WITH_IMMEDIATE),
+	IB_OPCODE(UC, RDMA_WRITE_FIRST),
+	IB_OPCODE(UC, RDMA_WRITE_MIDDLE),
+	IB_OPCODE(UC, RDMA_WRITE_LAST),
+	IB_OPCODE(UC, RDMA_WRITE_LAST_WITH_IMMEDIATE),
+	IB_OPCODE(UC, RDMA_WRITE_ONLY),
+	IB_OPCODE(UC, RDMA_WRITE_ONLY_WITH_IMMEDIATE),
+
+	/* RD */
+	IB_OPCODE(RD, SEND_FIRST),
+	IB_OPCODE(RD, SEND_MIDDLE),
+	IB_OPCODE(RD, SEND_LAST),
+	IB_OPCODE(RD, SEND_LAST_WITH_IMMEDIATE),
+	IB_OPCODE(RD, SEND_ONLY),
+	IB_OPCODE(RD, SEND_ONLY_WITH_IMMEDIATE),
+	IB_OPCODE(RD, RDMA_WRITE_FIRST),
+	IB_OPCODE(RD, RDMA_WRITE_MIDDLE),
+	IB_OPCODE(RD, RDMA_WRITE_LAST),
+	IB_OPCODE(RD, RDMA_WRITE_LAST_WITH_IMMEDIATE),
+	IB_OPCODE(RD, RDMA_WRITE_ONLY),
+	IB_OPCODE(RD, RDMA_WRITE_ONLY_WITH_IMMEDIATE),
+	IB_OPCODE(RD, RDMA_READ_REQUEST),
+	IB_OPCODE(RD, RDMA_READ_RESPONSE_FIRST),
+	IB_OPCODE(RD, RDMA_READ_RESPONSE_MIDDLE),
+	IB_OPCODE(RD, RDMA_READ_RESPONSE_LAST),
+	IB_OPCODE(RD, RDMA_READ_RESPONSE_ONLY),
+	IB_OPCODE(RD, ACKNOWLEDGE),
+	IB_OPCODE(RD, ATOMIC_ACKNOWLEDGE),
+	IB_OPCODE(RD, COMPARE_SWAP),
+	IB_OPCODE(RD, FETCH_ADD),
+
+	/* UD */
+	IB_OPCODE(UD, SEND_ONLY),
+	IB_OPCODE(UD, SEND_ONLY_WITH_IMMEDIATE)
+};
+
+enum {
+	IB_LNH_RAW        = 0,
+	IB_LNH_IP         = 1,
+	IB_LNH_IBA_LOCAL  = 2,
+	IB_LNH_IBA_GLOBAL = 3
+};
+
+struct ib_unpacked_lrh {
+	u8        virtual_lane;
+	u8        link_version;
+	u8        service_level;
+	u8        link_next_header;
+	__be16    destination_lid;
+	__be16    packet_length;
+	__be16    source_lid;
+};
+
+struct ib_unpacked_grh {
+	u8    	     ip_version;
+	u8    	     traffic_class;
+	__be32 	     flow_label;
+	__be16       payload_length;
+	u8    	     next_header;
+	u8    	     hop_limit;
+	union ib_gid source_gid;
+	union ib_gid destination_gid;
+};
+
+struct ib_unpacked_bth {
+	u8           opcode;
+	u8           solicited_event;
+	u8           mig_req;
+	u8           pad_count;
+	u8           transport_header_version;
+	__be16       pkey;
+	__be32       destination_qpn;
+	u8           ack_req;
+	__be32       psn;
+};
+
+struct ib_unpacked_deth {
+	__be32       qkey;
+	__be32       source_qpn;
+};
+
+struct ib_unpacked_eth {
+	u8	dmac_h[4];
+	u8	dmac_l[2];
+	u8	smac_h[2];
+	u8	smac_l[4];
+	__be16	type;
+};
+
+struct ib_unpacked_vlan {
+	__be16  tag;
+	__be16  type;
+};
+
+struct ib_ud_header {
+	int                     lrh_present;
+	struct ib_unpacked_lrh  lrh;
+	int                     eth_present;
+	struct ib_unpacked_eth  eth;
+	int                     vlan_present;
+	struct ib_unpacked_vlan vlan;
+	int                     grh_present;
+	struct ib_unpacked_grh  grh;
+	struct ib_unpacked_bth  bth;
+	struct ib_unpacked_deth deth;
+	int            		immediate_present;
+	__be32         		immediate_data;
+};
+
+void ib_pack(const struct ib_field        *desc,
+	     int                           desc_len,
+	     void                         *structure,
+	     void                         *buf);
+
+void ib_unpack(const struct ib_field        *desc,
+	       int                           desc_len,
+	       void                         *buf,
+	       void                         *structure);
+
+void ib_ud_header_init(int     		    payload_bytes,
+		       int		    lrh_present,
+		       int		    eth_present,
+		       int		    vlan_present,
+		       int    		    grh_present,
+		       int		    immediate_present,
+		       struct ib_ud_header *header);
+
+int ib_ud_header_pack(struct ib_ud_header *header,
+		      void                *buf);
+
+int ib_ud_header_unpack(void                *buf,
+			struct ib_ud_header *header);
+int ib_lrh_header_pack(struct ib_unpacked_lrh *lrh, void *buf);
+int ib_lrh_header_unpack(void *buf, struct ib_unpacked_lrh *lrh);
+
+#endif /* IB_PACK_H */
diff --git a/sys/ofed/include/rdma/ib_sa.h b/sys/ofed/include/rdma/ib_sa.h
new file mode 100644
index 0000000..5a8f2ce
--- /dev/null
+++ b/sys/ofed/include/rdma/ib_sa.h
@@ -0,0 +1,559 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc.  All rights reserved.
+ * Copyright (c) 2006 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IB_SA_H
+#define IB_SA_H
+
+#include <linux/completion.h>
+#include <linux/compiler.h>
+
+#include <asm/atomic.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_mad.h>
+
+enum {
+	IB_SA_CLASS_VERSION		= 2,	/* IB spec version 1.1/1.2 */
+
+	IB_SA_METHOD_GET_TABLE		= 0x12,
+	IB_SA_METHOD_GET_TABLE_RESP	= 0x92,
+	IB_SA_METHOD_DELETE		= 0x15,
+	IB_SA_METHOD_DELETE_RESP	= 0x95,
+	IB_SA_METHOD_GET_MULTI		= 0x14,
+	IB_SA_METHOD_GET_MULTI_RESP	= 0x94,
+	IB_SA_METHOD_GET_TRACE_TBL	= 0x13
+};
+
+enum {
+	IB_SA_ATTR_CLASS_PORTINFO    = 0x01,
+	IB_SA_ATTR_NOTICE	     = 0x02,
+	IB_SA_ATTR_INFORM_INFO	     = 0x03,
+	IB_SA_ATTR_NODE_REC	     = 0x11,
+	IB_SA_ATTR_PORT_INFO_REC     = 0x12,
+	IB_SA_ATTR_SL2VL_REC	     = 0x13,
+	IB_SA_ATTR_SWITCH_REC	     = 0x14,
+	IB_SA_ATTR_LINEAR_FDB_REC    = 0x15,
+	IB_SA_ATTR_RANDOM_FDB_REC    = 0x16,
+	IB_SA_ATTR_MCAST_FDB_REC     = 0x17,
+	IB_SA_ATTR_SM_INFO_REC	     = 0x18,
+	IB_SA_ATTR_LINK_REC	     = 0x20,
+	IB_SA_ATTR_GUID_INFO_REC     = 0x30,
+	IB_SA_ATTR_SERVICE_REC	     = 0x31,
+	IB_SA_ATTR_PARTITION_REC     = 0x33,
+	IB_SA_ATTR_PATH_REC	     = 0x35,
+	IB_SA_ATTR_VL_ARB_REC	     = 0x36,
+	IB_SA_ATTR_MC_MEMBER_REC     = 0x38,
+	IB_SA_ATTR_TRACE_REC	     = 0x39,
+	IB_SA_ATTR_MULTI_PATH_REC    = 0x3a,
+	IB_SA_ATTR_SERVICE_ASSOC_REC = 0x3b,
+	IB_SA_ATTR_INFORM_INFO_REC   = 0xf3
+};
+
+enum ib_sa_selector {
+	IB_SA_GT   = 0,
+	IB_SA_LT   = 1,
+	IB_SA_EQ   = 2,
+	/*
+	 * The meaning of "best" depends on the attribute: for
+	 * example, for MTU best will return the largest available
+	 * MTU, while for packet life time, best will return the
+	 * smallest available life time.
+	 */
+	IB_SA_BEST = 3
+};
+
+/*
+ * Structures for SA records are named "struct ib_sa_xxx_rec."  No
+ * attempt is made to pack structures to match the physical layout of
+ * SA records in SA MADs; all packing and unpacking is handled by the
+ * SA query code.
+ *
+ * For a record with structure ib_sa_xxx_rec, the naming convention
+ * for the component mask value for field yyy is IB_SA_XXX_REC_YYY (we
+ * never use different abbreviations or otherwise change the spelling
+ * of xxx/yyy between ib_sa_xxx_rec.yyy and IB_SA_XXX_REC_YYY).
+ *
+ * Reserved rows are indicated with comments to help maintainability.
+ */
+
+#define IB_SA_PATH_REC_SERVICE_ID		       (IB_SA_COMP_MASK( 0) |\
+							IB_SA_COMP_MASK( 1))
+#define IB_SA_PATH_REC_DGID				IB_SA_COMP_MASK( 2)
+#define IB_SA_PATH_REC_SGID				IB_SA_COMP_MASK( 3)
+#define IB_SA_PATH_REC_DLID				IB_SA_COMP_MASK( 4)
+#define IB_SA_PATH_REC_SLID				IB_SA_COMP_MASK( 5)
+#define IB_SA_PATH_REC_RAW_TRAFFIC			IB_SA_COMP_MASK( 6)
+/* reserved:								 7 */
+#define IB_SA_PATH_REC_FLOW_LABEL       		IB_SA_COMP_MASK( 8)
+#define IB_SA_PATH_REC_HOP_LIMIT			IB_SA_COMP_MASK( 9)
+#define IB_SA_PATH_REC_TRAFFIC_CLASS			IB_SA_COMP_MASK(10)
+#define IB_SA_PATH_REC_REVERSIBLE			IB_SA_COMP_MASK(11)
+#define IB_SA_PATH_REC_NUMB_PATH			IB_SA_COMP_MASK(12)
+#define IB_SA_PATH_REC_PKEY				IB_SA_COMP_MASK(13)
+#define IB_SA_PATH_REC_QOS_CLASS			IB_SA_COMP_MASK(14)
+#define IB_SA_PATH_REC_SL				IB_SA_COMP_MASK(15)
+#define IB_SA_PATH_REC_MTU_SELECTOR			IB_SA_COMP_MASK(16)
+#define IB_SA_PATH_REC_MTU				IB_SA_COMP_MASK(17)
+#define IB_SA_PATH_REC_RATE_SELECTOR			IB_SA_COMP_MASK(18)
+#define IB_SA_PATH_REC_RATE				IB_SA_COMP_MASK(19)
+#define IB_SA_PATH_REC_PACKET_LIFE_TIME_SELECTOR	IB_SA_COMP_MASK(20)
+#define IB_SA_PATH_REC_PACKET_LIFE_TIME			IB_SA_COMP_MASK(21)
+#define IB_SA_PATH_REC_PREFERENCE			IB_SA_COMP_MASK(22)
+
+struct ib_sa_path_rec {
+	__be64       service_id;
+	union ib_gid dgid;
+	union ib_gid sgid;
+	__be16       dlid;
+	__be16       slid;
+	int          raw_traffic;
+	/* reserved */
+	__be32       flow_label;
+	u8           hop_limit;
+	u8           traffic_class;
+	int          reversible;
+	u8           numb_path;
+	__be16       pkey;
+	__be16       qos_class;
+	u8           sl;
+	u8           mtu_selector;
+	u8           mtu;
+	u8           rate_selector;
+	u8           rate;
+	u8           packet_life_time_selector;
+	u8           packet_life_time;
+	u8           preference;
+};
+
+#define IB_SA_MCMEMBER_REC_MGID				IB_SA_COMP_MASK( 0)
+#define IB_SA_MCMEMBER_REC_PORT_GID			IB_SA_COMP_MASK( 1)
+#define IB_SA_MCMEMBER_REC_QKEY				IB_SA_COMP_MASK( 2)
+#define IB_SA_MCMEMBER_REC_MLID				IB_SA_COMP_MASK( 3)
+#define IB_SA_MCMEMBER_REC_MTU_SELECTOR			IB_SA_COMP_MASK( 4)
+#define IB_SA_MCMEMBER_REC_MTU				IB_SA_COMP_MASK( 5)
+#define IB_SA_MCMEMBER_REC_TRAFFIC_CLASS		IB_SA_COMP_MASK( 6)
+#define IB_SA_MCMEMBER_REC_PKEY				IB_SA_COMP_MASK( 7)
+#define IB_SA_MCMEMBER_REC_RATE_SELECTOR		IB_SA_COMP_MASK( 8)
+#define IB_SA_MCMEMBER_REC_RATE				IB_SA_COMP_MASK( 9)
+#define IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR	IB_SA_COMP_MASK(10)
+#define IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME		IB_SA_COMP_MASK(11)
+#define IB_SA_MCMEMBER_REC_SL				IB_SA_COMP_MASK(12)
+#define IB_SA_MCMEMBER_REC_FLOW_LABEL			IB_SA_COMP_MASK(13)
+#define IB_SA_MCMEMBER_REC_HOP_LIMIT			IB_SA_COMP_MASK(14)
+#define IB_SA_MCMEMBER_REC_SCOPE			IB_SA_COMP_MASK(15)
+#define IB_SA_MCMEMBER_REC_JOIN_STATE			IB_SA_COMP_MASK(16)
+#define IB_SA_MCMEMBER_REC_PROXY_JOIN			IB_SA_COMP_MASK(17)
+
+struct ib_sa_mcmember_rec {
+	union ib_gid mgid;
+	union ib_gid port_gid;
+	__be32       qkey;
+	__be16       mlid;
+	u8           mtu_selector;
+	u8           mtu;
+	u8           traffic_class;
+	__be16       pkey;
+	u8 	     rate_selector;
+	u8 	     rate;
+	u8 	     packet_life_time_selector;
+	u8 	     packet_life_time;
+	u8           sl;
+	__be32       flow_label;
+	u8           hop_limit;
+	u8           scope;
+	u8           join_state;
+	int          proxy_join;
+};
+
+/* Service Record Component Mask Sec 15.2.5.14 Ver 1.1	*/
+#define IB_SA_SERVICE_REC_SERVICE_ID			IB_SA_COMP_MASK( 0)
+#define IB_SA_SERVICE_REC_SERVICE_GID			IB_SA_COMP_MASK( 1)
+#define IB_SA_SERVICE_REC_SERVICE_PKEY			IB_SA_COMP_MASK( 2)
+/* reserved:								 3 */
+#define IB_SA_SERVICE_REC_SERVICE_LEASE			IB_SA_COMP_MASK( 4)
+#define IB_SA_SERVICE_REC_SERVICE_KEY			IB_SA_COMP_MASK( 5)
+#define IB_SA_SERVICE_REC_SERVICE_NAME			IB_SA_COMP_MASK( 6)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_0		IB_SA_COMP_MASK( 7)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_1		IB_SA_COMP_MASK( 8)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_2		IB_SA_COMP_MASK( 9)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_3		IB_SA_COMP_MASK(10)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_4		IB_SA_COMP_MASK(11)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_5		IB_SA_COMP_MASK(12)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_6		IB_SA_COMP_MASK(13)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_7		IB_SA_COMP_MASK(14)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_8		IB_SA_COMP_MASK(15)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_9		IB_SA_COMP_MASK(16)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_10		IB_SA_COMP_MASK(17)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_11		IB_SA_COMP_MASK(18)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_12		IB_SA_COMP_MASK(19)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_13		IB_SA_COMP_MASK(20)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_14		IB_SA_COMP_MASK(21)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_15		IB_SA_COMP_MASK(22)
+#define IB_SA_SERVICE_REC_SERVICE_DATA16_0		IB_SA_COMP_MASK(23)
+#define IB_SA_SERVICE_REC_SERVICE_DATA16_1		IB_SA_COMP_MASK(24)
+#define IB_SA_SERVICE_REC_SERVICE_DATA16_2		IB_SA_COMP_MASK(25)
+#define IB_SA_SERVICE_REC_SERVICE_DATA16_3		IB_SA_COMP_MASK(26)
+#define IB_SA_SERVICE_REC_SERVICE_DATA16_4		IB_SA_COMP_MASK(27)
+#define IB_SA_SERVICE_REC_SERVICE_DATA16_5		IB_SA_COMP_MASK(28)
+#define IB_SA_SERVICE_REC_SERVICE_DATA16_6		IB_SA_COMP_MASK(29)
+#define IB_SA_SERVICE_REC_SERVICE_DATA16_7		IB_SA_COMP_MASK(30)
+#define IB_SA_SERVICE_REC_SERVICE_DATA32_0		IB_SA_COMP_MASK(31)
+#define IB_SA_SERVICE_REC_SERVICE_DATA32_1		IB_SA_COMP_MASK(32)
+#define IB_SA_SERVICE_REC_SERVICE_DATA32_2		IB_SA_COMP_MASK(33)
+#define IB_SA_SERVICE_REC_SERVICE_DATA32_3		IB_SA_COMP_MASK(34)
+#define IB_SA_SERVICE_REC_SERVICE_DATA64_0		IB_SA_COMP_MASK(35)
+#define IB_SA_SERVICE_REC_SERVICE_DATA64_1		IB_SA_COMP_MASK(36)
+
+#define IB_DEFAULT_SERVICE_LEASE 	0xFFFFFFFF
+
+struct ib_sa_service_rec {
+	u64		id;
+	union ib_gid	gid;
+	__be16 		pkey;
+	/* reserved */
+	u32		lease;
+	u8		key[16];
+	u8		name[64];
+	u8		data8[16];
+	u16		data16[8];
+	u32		data32[4];
+	u64		data64[2];
+};
+
+enum {
+	IB_SA_EVENT_TYPE_FATAL		= 0x0,
+	IB_SA_EVENT_TYPE_URGENT		= 0x1,
+	IB_SA_EVENT_TYPE_SECURITY	= 0x2,
+	IB_SA_EVENT_TYPE_SM		= 0x3,
+	IB_SA_EVENT_TYPE_INFO		= 0x4,
+	IB_SA_EVENT_TYPE_EMPTY		= 0x7F,
+	IB_SA_EVENT_TYPE_ALL		= 0xFFFF
+};
+
+enum {
+	IB_SA_EVENT_PRODUCER_TYPE_CA		= 0x1,
+	IB_SA_EVENT_PRODUCER_TYPE_SWITCH	= 0x2,
+	IB_SA_EVENT_PRODUCER_TYPE_ROUTER	= 0x3,
+	IB_SA_EVENT_PRODUCER_TYPE_CLASS_MANAGER	= 0x4,
+	IB_SA_EVENT_PRODUCER_TYPE_ALL		= 0xFFFFFF
+};
+
+enum {
+	IB_SA_SM_TRAP_GID_IN_SERVICE			= 64,
+	IB_SA_SM_TRAP_GID_OUT_OF_SERVICE		= 65,
+	IB_SA_SM_TRAP_CREATE_MC_GROUP			= 66,
+	IB_SA_SM_TRAP_DELETE_MC_GROUP			= 67,
+	IB_SA_SM_TRAP_PORT_CHANGE_STATE			= 128,
+	IB_SA_SM_TRAP_LINK_INTEGRITY			= 129,
+	IB_SA_SM_TRAP_EXCESSIVE_BUFFER_OVERRUN		= 130,
+	IB_SA_SM_TRAP_FLOW_CONTROL_UPDATE_EXPIRED	= 131,
+	IB_SA_SM_TRAP_BAD_M_KEY				= 256,
+	IB_SA_SM_TRAP_BAD_P_KEY				= 257,
+	IB_SA_SM_TRAP_BAD_Q_KEY				= 258,
+	IB_SA_SM_TRAP_SWITCH_BAD_P_KEY			= 259,
+	IB_SA_SM_TRAP_ALL				= 0xFFFF
+};
+
+struct ib_sa_inform {
+	union ib_gid	gid;
+	__be16		lid_range_begin;
+	__be16		lid_range_end;
+	u8		is_generic;
+	u8		subscribe;
+	__be16		type;
+	union {
+		struct {
+			__be16	trap_num;
+			__be32	qpn;
+			u8	resp_time;
+			__be32	producer_type;
+		} generic;
+		struct {
+			__be16	device_id;
+			__be32	qpn;
+			u8	resp_time;
+			__be32	vendor_id;
+		} vendor;
+	} trap;
+};
+
+struct ib_sa_notice {
+	u8		is_generic;
+	u8		type;
+	union {
+		struct {
+			__be32	producer_type;
+			__be16	trap_num;
+		} generic;
+		struct {
+			__be32	vendor_id;
+			__be16	device_id;
+		} vendor;
+	} trap;
+	__be16		issuer_lid;
+	__be16		notice_count;
+	u8		notice_toggle;
+	/*
+	 * Align data 16 bits off 64 bit field to match InformInfo definition.
+	 * Data contained within this field will then align properly.
+	 * See IB spec 1.2, sections 13.4.8.2 and 14.2.5.1.
+	 */
+	u8		reserved[5];
+	u8		data_details[54];
+	union ib_gid	issuer_gid;
+};
+
+/*
+ * SM notice data details for:
+ *
+ * IB_SA_SM_TRAP_GID_IN_SERVICE		= 64
+ * IB_SA_SM_TRAP_GID_OUT_OF_SERVICE	= 65
+ * IB_SA_SM_TRAP_CREATE_MC_GROUP	= 66
+ * IB_SA_SM_TRAP_DELETE_MC_GROUP	= 67
+ */
+struct ib_sa_notice_data_gid {
+	u8	reserved[6];
+	u8	gid[16];
+	u8	padding[32];
+};
+
+/*
+ * SM notice data details for:
+ *
+ * IB_SA_SM_TRAP_PORT_CHANGE_STATE	= 128
+ */
+struct ib_sa_notice_data_port_change {
+	__be16	lid;
+	u8	padding[52];
+};
+
+/*
+ * SM notice data details for:
+ *
+ * IB_SA_SM_TRAP_LINK_INTEGRITY			= 129
+ * IB_SA_SM_TRAP_EXCESSIVE_BUFFER_OVERRUN	= 130
+ * IB_SA_SM_TRAP_FLOW_CONTROL_UPDATE_EXPIRED	= 131
+ */
+struct ib_sa_notice_data_port_error {
+	u8	reserved[2];
+	__be16	lid;
+	u8	port_num;
+	u8	padding[49];
+};
+
+struct ib_sa_client {
+	atomic_t users;
+	struct completion comp;
+};
+
+/**
+ * ib_sa_register_client - Register an SA client.
+ */
+void ib_sa_register_client(struct ib_sa_client *client);
+
+/**
+ * ib_sa_unregister_client - Deregister an SA client.
+ * @client: Client object to deregister.
+ */
+void ib_sa_unregister_client(struct ib_sa_client *client);
+
+struct ib_sa_query;
+
+void ib_sa_cancel_query(int id, struct ib_sa_query *query);
+
+int ib_sa_path_rec_get(struct ib_sa_client *client,
+		       struct ib_device *device, u8 port_num,
+		       struct ib_sa_path_rec *rec,
+		       ib_sa_comp_mask comp_mask,
+		       int timeout_ms, gfp_t gfp_mask,
+		       void (*callback)(int status,
+					struct ib_sa_path_rec *resp,
+					void *context),
+		       void *context,
+		       struct ib_sa_query **query);
+
+int ib_sa_service_rec_query(struct ib_sa_client *client,
+			 struct ib_device *device, u8 port_num,
+			 u8 method,
+			 struct ib_sa_service_rec *rec,
+			 ib_sa_comp_mask comp_mask,
+			 int timeout_ms, gfp_t gfp_mask,
+			 void (*callback)(int status,
+					  struct ib_sa_service_rec *resp,
+					  void *context),
+			 void *context,
+			 struct ib_sa_query **sa_query);
+
+struct ib_sa_multicast {
+	struct ib_sa_mcmember_rec rec;
+	ib_sa_comp_mask		comp_mask;
+	int			(*callback)(int status,
+					    struct ib_sa_multicast *multicast);
+	void			*context;	/* user context, see ib_sa_join_multicast() */
+};
+
+/**
+ * ib_sa_join_multicast - Initiates a join request to the specified multicast
+ *   group.
+ * @client: SA client
+ * @device: Device associated with the multicast group.
+ * @port_num: Port on the specified device to associate with the multicast
+ *   group.
+ * @rec: SA multicast member record specifying group attributes.
+ * @comp_mask: Component mask indicating which group attributes of %rec are
+ *   valid.
+ * @gfp_mask: GFP mask for memory allocations.
+ * @callback: User callback invoked once the join operation completes.
+ * @context: User specified context stored with the ib_sa_multicast structure.
+ *
+ * This call initiates a multicast join request with the SA for the specified
+ * multicast group.  If the join operation is started successfully, it returns
+ * an ib_sa_multicast structure that is used to track the multicast operation.
+ * Users must free this structure by calling ib_sa_free_multicast, even if the
+ * join operation later fails.  (The callback status is non-zero.)
+ *
+ * If the join operation fails, status will be non-zero, with the following
+ * failures possible:
+ * -ETIMEDOUT: The request timed out.
+ * -EIO: An error occurred sending the query.
+ * -EINVAL: The MCMemberRecord values differed from the existing group's.
+ * -ENETRESET: Indicates that a fatal error has occurred on the multicast
+ *   group, and the user must rejoin the group to continue using it.
+ */
+struct ib_sa_multicast *ib_sa_join_multicast(struct ib_sa_client *client,
+					     struct ib_device *device, u8 port_num,
+					     struct ib_sa_mcmember_rec *rec,
+					     ib_sa_comp_mask comp_mask, gfp_t gfp_mask,
+					     int (*callback)(int status,
+							     struct ib_sa_multicast
+								    *multicast),
+					     void *context);
+
+/**
+ * ib_sa_free_multicast - Frees the multicast tracking structure, and releases
+ *    any reference on the multicast group.
+ * @multicast: Multicast tracking structure allocated by ib_sa_join_multicast.
+ *
+ * This call blocks until the multicast identifier is destroyed.  It may
+ * not be called from within the multicast callback; however, returning a non-
+ * zero value from the callback will result in destroying the multicast
+ * tracking structure.
+ */
+void ib_sa_free_multicast(struct ib_sa_multicast *multicast);
+
+/**
+ * ib_sa_get_mcmember_rec - Looks up a multicast member record by its MGID and
+ *   returns it if found.
+ * @device: Device associated with the multicast group.
+ * @port_num: Port on the specified device to associate with the multicast
+ *   group.
+ * @mgid: MGID of multicast group.
+ * @rec: Location to copy SA multicast member record.
+ */
+int ib_sa_get_mcmember_rec(struct ib_device *device, u8 port_num,
+			   union ib_gid *mgid, struct ib_sa_mcmember_rec *rec);
+
+/**
+ * ib_init_ah_from_mcmember - Initialize address handle attributes based on
+ * an SA multicast member record.
+ */
+int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num,
+			     struct ib_sa_mcmember_rec *rec,
+			     struct ib_ah_attr *ah_attr);
+
+/**
+ * ib_init_ah_from_path - Initialize address handle attributes based on an SA
+ *   path record.
+ */
+int ib_init_ah_from_path(struct ib_device *device, u8 port_num,
+			 struct ib_sa_path_rec *rec,
+			 struct ib_ah_attr *ah_attr);
+
+/**
+ * ib_sa_unpack_path - Convert a path record from MAD format to struct
+ * ib_sa_path_rec.
+ */
+void ib_sa_unpack_path(void *attribute, struct ib_sa_path_rec *rec);
+
+struct ib_inform_info {
+	void		*context;	/* user context, see ib_sa_register_inform_info() */
+	int		(*callback)(int status,
+				    struct ib_inform_info *info,
+				    struct ib_sa_notice *notice);
+	u16		trap_number;	/* InformInfo trap number registered for */
+};
+
+/**
+ * ib_sa_register_inform_info - Registers to receive notice events.
+ * @device: Device associated with the registration.
+ * @port_num: Port on the specified device to associate with the registration.
+ * @trap_number: InformInfo trap number to register for.
+ * @gfp_mask: GFP mask for memory allocations.
+ * @callback: User callback invoked once the registration completes and to
+ *   report noticed events.
+ * @context: User specified context stored with the ib_inform_info structure.
+ *
+ * This call initiates a registration request with the SA for the specified
+ * trap number.  If the operation is started successfully, it returns
+ * an ib_inform_info structure that is used to track the registration operation.
+ * Users must free this structure by calling ib_sa_unregister_inform_info,
+ * even if the operation later fails.  (The callback status is non-zero.)
+ *
+ * If the registration fails, status will be non-zero.  If the registration
+ * succeeds, the callback status will be zero, but the notice parameter will
+ * be NULL.  If the notice parameter is not NULL, a trap or notice is being
+ * reported to the user.
+ *
+ * A status of -ENETRESET indicates that an error occurred which requires
+ * reregistration.
+ */
+struct ib_inform_info *
+ib_sa_register_inform_info(struct ib_sa_client *client,
+			   struct ib_device *device, u8 port_num,
+			   u16 trap_number, gfp_t gfp_mask,
+			   int (*callback)(int status,
+					   struct ib_inform_info *info,
+					   struct ib_sa_notice *notice),
+			   void *context);
+
+/**
+ * ib_sa_unregister_inform_info - Releases an InformInfo registration.
+ * @info: InformInfo registration tracking structure.
+ *
+ * This call blocks until the registration request is destroyed.  It may
+ * not be called from within the registration callback.
+ */
+void ib_sa_unregister_inform_info(struct ib_inform_info *info);
+
+#endif /* IB_SA_H */
diff --git a/sys/ofed/include/rdma/ib_smi.h b/sys/ofed/include/rdma/ib_smi.h
new file mode 100644
index 0000000..98b9086
--- /dev/null
+++ b/sys/ofed/include/rdma/ib_smi.h
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd.  All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation.  All rights reserved.
+ * Copyright (c) 2004 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(IB_SMI_H)
+#define IB_SMI_H
+
+#include <rdma/ib_mad.h>
+
+#define IB_SMP_DATA_SIZE			64
+#define IB_SMP_MAX_PATH_HOPS			64
+
+struct ib_smp {
+	u8	base_version;
+	u8	mgmt_class;
+	u8	class_version;
+	u8	method;
+	__be16	status;		/* IB_SMP_DIRECTION bit is read by ib_get_smp_direction() */
+	u8	hop_ptr;
+	u8	hop_cnt;
+	__be64	tid;
+	__be16	attr_id;
+	__be16	resv;
+	__be32	attr_mod;
+	__be64	mkey;
+	__be16	dr_slid;
+	__be16	dr_dlid;
+	u8	reserved[28];
+	u8	data[IB_SMP_DATA_SIZE];
+	u8	initial_path[IB_SMP_MAX_PATH_HOPS];
+	u8	return_path[IB_SMP_MAX_PATH_HOPS];
+} __attribute__ ((packed));	/* 256 bytes if the uN/__beN types have their nominal widths */
+
+#define IB_SMP_DIRECTION			cpu_to_be16(0x8000)	/* tested in ib_smp.status */
+
+/* Subnet management attributes */
+#define IB_SMP_ATTR_NOTICE			cpu_to_be16(0x0002)
+#define IB_SMP_ATTR_NODE_DESC			cpu_to_be16(0x0010)
+#define IB_SMP_ATTR_NODE_INFO			cpu_to_be16(0x0011)
+#define IB_SMP_ATTR_SWITCH_INFO			cpu_to_be16(0x0012)
+#define IB_SMP_ATTR_GUID_INFO			cpu_to_be16(0x0014)
+#define IB_SMP_ATTR_PORT_INFO			cpu_to_be16(0x0015)
+#define IB_SMP_ATTR_PKEY_TABLE			cpu_to_be16(0x0016)
+#define IB_SMP_ATTR_SL_TO_VL_TABLE		cpu_to_be16(0x0017)
+#define IB_SMP_ATTR_VL_ARB_TABLE		cpu_to_be16(0x0018)
+#define IB_SMP_ATTR_LINEAR_FORWARD_TABLE	cpu_to_be16(0x0019)
+#define IB_SMP_ATTR_RANDOM_FORWARD_TABLE	cpu_to_be16(0x001A)
+#define IB_SMP_ATTR_MCAST_FORWARD_TABLE		cpu_to_be16(0x001B)
+#define IB_SMP_ATTR_SM_INFO			cpu_to_be16(0x0020)
+#define IB_SMP_ATTR_VENDOR_DIAG			cpu_to_be16(0x0030)
+#define IB_SMP_ATTR_LED_INFO			cpu_to_be16(0x0031)
+#define IB_SMP_ATTR_VENDOR_MASK			cpu_to_be16(0xFF00)
+
+struct ib_port_info {
+	__be64 mkey;
+	__be64 gid_prefix;
+	__be16 lid;
+	__be16 sm_lid;
+	__be32 cap_mask;
+	__be16 diag_code;
+	__be16 mkey_lease_period;
+	u8 local_port_num;
+	u8 link_width_enabled;
+	u8 link_width_supported;
+	u8 link_width_active;
+	u8 linkspeed_portstate;			/* 4 bits, 4 bits */
+	u8 portphysstate_linkdown;		/* 4 bits, 4 bits */
+	u8 mkeyprot_resv_lmc;			/* 2 bits, 3, 3 */
+	u8 linkspeedactive_enabled;		/* 4 bits, 4 bits */
+	u8 neighbormtu_mastersmsl;		/* 4 bits, 4 bits */
+	u8 vlcap_inittype;			/* 4 bits, 4 bits */
+	u8 vl_high_limit;
+	u8 vl_arb_high_cap;
+	u8 vl_arb_low_cap;
+	u8 inittypereply_mtucap;		/* 4 bits, 4 bits */
+	u8 vlstallcnt_hoqlife;			/* 3 bits, 5 bits */
+	u8 operationalvl_pei_peo_fpi_fpo;	/* 4 bits, 1, 1, 1, 1 */
+	__be16 mkey_violations;
+	__be16 pkey_violations;
+	__be16 qkey_violations;
+	u8 guid_cap;
+	u8 clientrereg_resv_subnetto;		/* 1 bit, 2 bits, 5 */
+	u8 resv_resptimevalue;			/* 3 bits, 5 bits */
+	u8 localphyerrors_overrunerrors;	/* 4 bits, 4 bits */
+	__be16 max_credit_hint;
+	u8 resv;
+	u8 link_roundtrip_latency[3];
+};
+
+static inline u8
+ib_get_smp_direction(struct ib_smp *smp)
+{
+	return (smp->status & IB_SMP_DIRECTION) ? 1 : 0;	/* 1 iff the direction bit is set */
+}
+
+#endif /* IB_SMI_H */
diff --git a/sys/ofed/include/rdma/ib_umem.h b/sys/ofed/include/rdma/ib_umem.h
new file mode 100644
index 0000000..afa09f9
--- /dev/null
+++ b/sys/ofed/include/rdma/ib_umem.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2007 Cisco Systems.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IB_UMEM_H
+#define IB_UMEM_H
+
+#include <linux/list.h>
+#include <linux/scatterlist.h>
+#include <linux/workqueue.h>
+#include <linux/dma-attrs.h>
+
+struct ib_ucontext;
+
+struct ib_umem {
+	struct ib_ucontext     *context;
+	size_t			length;
+	int			offset;
+	int			page_size;
+	int                     writable;
+	int                     hugetlb;
+	struct list_head	chunk_list;
+#ifdef __linux__
+	struct work_struct	work;
+	struct mm_struct       *mm;
+#else
+	unsigned long		start;
+#endif
+	unsigned long		diff;
+};
+
+struct ib_umem_chunk {
+	struct list_head	list;
+	int                     nents;
+	int                     nmap;
+	struct dma_attrs	attrs;
+	struct scatterlist      page_list[0];
+};
+
+struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
+			    size_t size, int access, int dmasync);
+void ib_umem_release(struct ib_umem *umem);
+int ib_umem_page_count(struct ib_umem *umem);
+
+#endif /* IB_UMEM_H */
diff --git a/sys/ofed/include/rdma/ib_user_cm.h b/sys/ofed/include/rdma/ib_user_cm.h
new file mode 100644
index 0000000..bd3d380
--- /dev/null
+++ b/sys/ofed/include/rdma/ib_user_cm.h
@@ -0,0 +1,324 @@
+/*
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IB_USER_CM_H
+#define IB_USER_CM_H
+
+#include <rdma/ib_user_sa.h>
+
+#define IB_USER_CM_ABI_VERSION 5
+
+enum {
+	IB_USER_CM_CMD_CREATE_ID,
+	IB_USER_CM_CMD_DESTROY_ID,
+	IB_USER_CM_CMD_ATTR_ID,
+
+	IB_USER_CM_CMD_LISTEN,
+	IB_USER_CM_CMD_NOTIFY,
+
+	IB_USER_CM_CMD_SEND_REQ,
+	IB_USER_CM_CMD_SEND_REP,
+	IB_USER_CM_CMD_SEND_RTU,
+	IB_USER_CM_CMD_SEND_DREQ,
+	IB_USER_CM_CMD_SEND_DREP,
+	IB_USER_CM_CMD_SEND_REJ,
+	IB_USER_CM_CMD_SEND_MRA,
+	IB_USER_CM_CMD_SEND_LAP,
+	IB_USER_CM_CMD_SEND_APR,
+	IB_USER_CM_CMD_SEND_SIDR_REQ,
+	IB_USER_CM_CMD_SEND_SIDR_REP,
+
+	IB_USER_CM_CMD_EVENT,
+	IB_USER_CM_CMD_INIT_QP_ATTR,
+};
+/*
+ * command ABI structures.
+ */
+struct ib_ucm_cmd_hdr {
+	__u32 cmd;
+	__u16 in;
+	__u16 out;
+};
+
+struct ib_ucm_create_id {
+	__u64 uid;
+	__u64 response;
+};
+
+struct ib_ucm_create_id_resp {
+	__u32 id;
+};
+
+struct ib_ucm_destroy_id {
+	__u64 response;
+	__u32 id;
+	__u32 reserved;
+};
+
+struct ib_ucm_destroy_id_resp {
+	__u32 events_reported;
+};
+
+struct ib_ucm_attr_id {
+	__u64 response;
+	__u32 id;
+	__u32 reserved;
+};
+
+struct ib_ucm_attr_id_resp {
+	__be64 service_id;
+	__be64 service_mask;
+	__be32 local_id;
+	__be32 remote_id;
+};
+
+struct ib_ucm_init_qp_attr {
+	__u64 response;
+	__u32 id;
+	__u32 qp_state;
+};
+
+struct ib_ucm_listen {
+	__be64 service_id;
+	__be64 service_mask;
+	__u32 id;
+	__u32 reserved;
+};
+
+struct ib_ucm_notify {
+	__u32 id;
+	__u32 event;
+};
+
+struct ib_ucm_private_data {
+	__u64 data;
+	__u32 id;
+	__u8  len;
+	__u8  reserved[3];
+};
+
+struct ib_ucm_req {
+	__u32 id;
+	__u32 qpn;
+	__u32 qp_type;
+	__u32 psn;
+	__be64 sid;
+	__u64 data;
+	__u64 primary_path;
+	__u64 alternate_path;
+	__u8  len;
+	__u8  peer_to_peer;
+	__u8  responder_resources;
+	__u8  initiator_depth;
+	__u8  remote_cm_response_timeout;
+	__u8  flow_control;
+	__u8  local_cm_response_timeout;
+	__u8  retry_count;
+	__u8  rnr_retry_count;
+	__u8  max_cm_retries;
+	__u8  srq;
+	__u8  reserved[5];
+};
+
+struct ib_ucm_rep {
+	__u64 uid;
+	__u64 data;
+	__u32 id;
+	__u32 qpn;
+	__u32 psn;
+	__u8  len;
+	__u8  responder_resources;
+	__u8  initiator_depth;
+	__u8  target_ack_delay;
+	__u8  failover_accepted;
+	__u8  flow_control;
+	__u8  rnr_retry_count;
+	__u8  srq;
+	__u8  reserved[4];
+};
+
+struct ib_ucm_info {
+	__u32 id;
+	__u32 status;
+	__u64 info;
+	__u64 data;
+	__u8  info_len;
+	__u8  data_len;
+	__u8  reserved[6];
+};
+
+struct ib_ucm_mra {
+	__u64 data;
+	__u32 id;
+	__u8  len;
+	__u8  timeout;
+	__u8  reserved[2];
+};
+
+struct ib_ucm_lap {
+	__u64 path;
+	__u64 data;
+	__u32 id;
+	__u8  len;
+	__u8  reserved[3];
+};
+
+struct ib_ucm_sidr_req {
+	__u32 id;
+	__u32 timeout;
+	__be64 sid;
+	__u64 data;
+	__u64 path;
+	__u16 reserved_pkey;
+	__u8  len;
+	__u8  max_cm_retries;
+	__u8  reserved[4];
+};
+
+struct ib_ucm_sidr_rep {
+	__u32 id;
+	__u32 qpn;
+	__u32 qkey;
+	__u32 status;
+	__u64 info;
+	__u64 data;
+	__u8  info_len;
+	__u8  data_len;
+	__u8  reserved[6];
+};
+/*
+ * event notification ABI structures.
+ */
+struct ib_ucm_event_get {
+	__u64 response;
+	__u64 data;
+	__u64 info;
+	__u8  data_len;
+	__u8  info_len;
+	__u8  reserved[6];
+};
+
+struct ib_ucm_req_event_resp {
+	struct ib_user_path_rec primary_path;
+	struct ib_user_path_rec alternate_path;
+	__be64                 remote_ca_guid;
+	__u32                  remote_qkey;
+	__u32                  remote_qpn;
+	__u32                  qp_type;
+	__u32                  starting_psn;
+	__u8  responder_resources;
+	__u8  initiator_depth;
+	__u8  local_cm_response_timeout;
+	__u8  flow_control;
+	__u8  remote_cm_response_timeout;
+	__u8  retry_count;
+	__u8  rnr_retry_count;
+	__u8  srq;
+	__u8  port;
+	__u8  reserved[7];
+};
+
+struct ib_ucm_rep_event_resp {
+	__be64 remote_ca_guid;
+	__u32 remote_qkey;
+	__u32 remote_qpn;
+	__u32 starting_psn;
+	__u8  responder_resources;
+	__u8  initiator_depth;
+	__u8  target_ack_delay;
+	__u8  failover_accepted;
+	__u8  flow_control;
+	__u8  rnr_retry_count;
+	__u8  srq;
+	__u8  reserved[5];
+};
+
+struct ib_ucm_rej_event_resp {
+	__u32 reason;
+	/* ari in ib_ucm_event_get info field. */
+};
+
+struct ib_ucm_mra_event_resp {
+	__u8  timeout;
+	__u8  reserved[3];
+};
+
+struct ib_ucm_lap_event_resp {
+	struct ib_user_path_rec path;
+};
+
+struct ib_ucm_apr_event_resp {
+	__u32 status;
+	/* apr info in ib_ucm_event_get info field. */
+};
+
+struct ib_ucm_sidr_req_event_resp {
+	__u16 pkey;
+	__u8  port;
+	__u8  reserved;
+};
+
+struct ib_ucm_sidr_rep_event_resp {
+	__u32 status;
+	__u32 qkey;
+	__u32 qpn;
+	/* info in ib_ucm_event_get info field. */
+};
+
+#define IB_UCM_PRES_DATA      0x01
+#define IB_UCM_PRES_INFO      0x02
+#define IB_UCM_PRES_PRIMARY   0x04
+#define IB_UCM_PRES_ALTERNATE 0x08
+
+struct ib_ucm_event_resp {
+	__u64 uid;
+	__u32 id;
+	__u32 event;
+	__u32 present;
+	__u32 reserved;
+	union {
+		struct ib_ucm_req_event_resp req_resp;
+		struct ib_ucm_rep_event_resp rep_resp;
+		struct ib_ucm_rej_event_resp rej_resp;
+		struct ib_ucm_mra_event_resp mra_resp;
+		struct ib_ucm_lap_event_resp lap_resp;
+		struct ib_ucm_apr_event_resp apr_resp;
+
+		struct ib_ucm_sidr_req_event_resp sidr_req_resp;
+		struct ib_ucm_sidr_rep_event_resp sidr_rep_resp;
+
+		__u32                             send_status;
+	} u;
+};
+
+#endif /* IB_USER_CM_H */
diff --git a/sys/ofed/include/rdma/ib_user_mad.h b/sys/ofed/include/rdma/ib_user_mad.h
new file mode 100644
index 0000000..3595ae2
--- /dev/null
+++ b/sys/ofed/include/rdma/ib_user_mad.h
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IB_USER_MAD_H
+#define IB_USER_MAD_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define IB_USER_MAD_ABI_VERSION	5
+
+/*
+ * Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ */
+
+/**
+ * ib_user_mad_hdr_old - Old version of MAD packet header without pkey_index
+ * @id - ID of agent MAD received with/to be sent with
+ * @status - 0 on successful receive, ETIMEDOUT if no response
+ *   received (transaction ID in data[] will be set to TID of original
+ *   request) (ignored on send)
+ * @timeout_ms - Milliseconds to wait for response (unset on receive)
+ * @retries - Number of automatic retries to attempt
+ * @qpn - Remote QP number received from/to be sent to
+ * @qkey - Remote Q_Key to be sent with (unset on receive)
+ * @lid - Remote lid received from/to be sent to
+ * @sl - Service level received with/to be sent with
+ * @path_bits - Local path bits received with/to be sent with
+ * @grh_present - If set, GRH was received/should be sent
+ * @gid_index - Local GID index to send with (unset on receive)
+ * @hop_limit - Hop limit in GRH
+ * @traffic_class - Traffic class in GRH
+ * @gid - Remote GID in GRH
+ * @flow_label - Flow label in GRH
+ */
+struct ib_user_mad_hdr_old {
+	__u32	id;
+	__u32	status;
+	__u32	timeout_ms;
+	__u32	retries;
+	__u32	length;		/* NOTE(review): undocumented in the kernel-doc above */
+	__be32	qpn;
+	__be32  qkey;
+	__be16	lid;
+	__u8	sl;
+	__u8	path_bits;
+	__u8	grh_present;
+	__u8	gid_index;
+	__u8	hop_limit;
+	__u8	traffic_class;
+	__u8	gid[16];
+	__be32	flow_label;
+};
+
+/**
+ * ib_user_mad_hdr - MAD packet header
+ *   This layout allows specifying/receiving the P_Key index.  To use
+ *   this capability, an application must call the
+ *   IB_USER_MAD_ENABLE_PKEY ioctl on the user MAD file handle before
+ *   any other actions with the file handle.
+ * @id - ID of agent MAD received with/to be sent with
+ * @status - 0 on successful receive, ETIMEDOUT if no response
+ *   received (transaction ID in data[] will be set to TID of original
+ *   request) (ignored on send)
+ * @timeout_ms - Milliseconds to wait for response (unset on receive)
+ * @retries - Number of automatic retries to attempt
+ * @qpn - Remote QP number received from/to be sent to
+ * @qkey - Remote Q_Key to be sent with (unset on receive)
+ * @lid - Remote lid received from/to be sent to
+ * @sl - Service level received with/to be sent with
+ * @path_bits - Local path bits received with/to be sent with
+ * @grh_present - If set, GRH was received/should be sent
+ * @gid_index - Local GID index to send with (unset on receive)
+ * @hop_limit - Hop limit in GRH
+ * @traffic_class - Traffic class in GRH
+ * @gid - Remote GID in GRH
+ * @flow_label - Flow label in GRH
+ * @pkey_index - P_Key index
+ */
+struct ib_user_mad_hdr {
+	__u32	id;
+	__u32	status;
+	__u32	timeout_ms;
+	__u32	retries;
+	__u32	length;		/* NOTE(review): undocumented in the kernel-doc above */
+	__be32	qpn;
+	__be32  qkey;
+	__be16	lid;
+	__u8	sl;
+	__u8	path_bits;
+	__u8	grh_present;
+	__u8	gid_index;
+	__u8	hop_limit;
+	__u8	traffic_class;
+	__u8	gid[16];
+	__be32	flow_label;
+	__u16	pkey_index;
+	__u8	reserved[6];
+};
+
+/**
+ * ib_user_mad - MAD packet
+ * @hdr - MAD packet header
+ * @data - Contents of MAD
+ *
+ */
+struct ib_user_mad {
+	struct ib_user_mad_hdr hdr;
+	__u64	data[0];
+};
+
+/*
+ * Earlier versions of this interface definition declared the
+ * method_mask[] member as an array of __u32 but treated it as a
+ * bitmap made up of longs in the kernel.  This ambiguity meant that
+ * 32-bit big-endian applications that can run on both 32-bit and
+ * 64-bit kernels had no consistent ABI to rely on, and 64-bit
+ * big-endian applications that treated method_mask as being made up
+ * of 32-bit words would have their bitmap misinterpreted.
+ *
+ * To clear up this confusion, we change the declaration of
+ * method_mask[] to use unsigned long and handle the conversion from
+ * 32-bit userspace to 64-bit kernel for big-endian systems in the
+ * compat_ioctl method.  Unfortunately, to keep the structure layout
+ * the same, we need the method_mask[] array to be aligned only to 4
+ * bytes even when long is 64 bits, which forces us into this ugly
+ * typedef.
+ */
+typedef unsigned long __attribute__((aligned(4))) packed_ulong;
+#define IB_USER_MAD_LONGS_PER_METHOD_MASK (128 / (8 * sizeof (long)))
+
+/**
+ * ib_user_mad_reg_req - MAD registration request
+ * @id - Set by the kernel; used to identify agent in future requests.
+ * @qpn - Queue pair number; must be 0 or 1.
+ * @method_mask - The caller will receive unsolicited MADs for any method
+ *   where @method_mask = 1.
+ * @mgmt_class - Indicates which management class of MADs should be received
+ *   by the caller.  This field is only required if the user wishes to
+ *   receive unsolicited MADs, otherwise it should be 0.
+ * @mgmt_class_version - Indicates which version of MADs for the given
+ *   management class to receive.
+ * @oui: Indicates IEEE OUI when mgmt_class is a vendor class
+ *   in the range from 0x30 to 0x4f. Otherwise not used.
+ * @rmpp_version: If set, indicates the RMPP version used.
+ *
+ */
+struct ib_user_mad_reg_req {
+	__u32	id;
+	packed_ulong method_mask[IB_USER_MAD_LONGS_PER_METHOD_MASK];	/* 128 bits */
+	__u8	qpn;
+	__u8	mgmt_class;
+	__u8	mgmt_class_version;
+	__u8    oui[3];
+	__u8	rmpp_version;
+};
+
+#define IB_IOCTL_MAGIC		0x1b
+
+#define IB_USER_MAD_REGISTER_AGENT	_IO(IB_IOCTL_MAGIC, 1)
+
+#define IB_USER_MAD_UNREGISTER_AGENT	_IO(IB_IOCTL_MAGIC, 2)
+
+#define IB_USER_MAD_ENABLE_PKEY		_IO(IB_IOCTL_MAGIC, 3)
+
+#endif /* IB_USER_MAD_H */
diff --git a/sys/ofed/include/rdma/ib_user_sa.h b/sys/ofed/include/rdma/ib_user_sa.h
new file mode 100644
index 0000000..cfc7c9b
--- /dev/null
+++ b/sys/ofed/include/rdma/ib_user_sa.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2005 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IB_USER_SA_H
+#define IB_USER_SA_H
+
+#include <linux/types.h>
+
+enum {
+	IB_PATH_GMP		= 1,
+	IB_PATH_PRIMARY		= (1<<1),
+	IB_PATH_ALTERNATE	= (1<<2),
+	IB_PATH_OUTBOUND	= (1<<3),
+	IB_PATH_INBOUND		= (1<<4),
+	IB_PATH_INBOUND_REVERSE = (1<<5),
+	IB_PATH_BIDIRECTIONAL	= IB_PATH_OUTBOUND | IB_PATH_INBOUND_REVERSE
+};
+
+struct ib_path_rec_data {
+	__u32	flags;
+	__u32	reserved;
+	__u32	path_rec[16];
+};
+
+struct ib_user_path_rec {
+	__u8	dgid[16];
+	__u8	sgid[16];
+	__be16	dlid;
+	__be16	slid;
+	__u32	raw_traffic;
+	__be32	flow_label;
+	__u32	reversible;
+	__u32	mtu;
+	__be16	pkey;
+	__u8	hop_limit;
+	__u8	traffic_class;
+	__u8	numb_path;
+	__u8	sl;
+	__u8	mtu_selector;
+	__u8	rate_selector;
+	__u8	rate;
+	__u8	packet_life_time_selector;
+	__u8	packet_life_time;
+	__u8	preference;
+};
+
+#endif /* IB_USER_SA_H */
diff --git a/sys/ofed/include/rdma/ib_user_verbs.h b/sys/ofed/include/rdma/ib_user_verbs.h
new file mode 100644
index 0000000..b2721c7
--- /dev/null
+++ b/sys/ofed/include/rdma/ib_user_verbs.h
@@ -0,0 +1,801 @@
+/*
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 PathScale, Inc.  All rights reserved.
+ * Copyright (c) 2006 Mellanox Technologies.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IB_USER_VERBS_H
+#define IB_USER_VERBS_H
+
+#include <linux/types.h>
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define IB_USER_VERBS_ABI_VERSION	6
+
+enum {	/* values are implicit (0..44, declaration order): inserting or reordering entries renumbers every later command */
+	IB_USER_VERBS_CMD_GET_CONTEXT,	/* = 0 */
+	IB_USER_VERBS_CMD_QUERY_DEVICE,
+	IB_USER_VERBS_CMD_QUERY_PORT,
+	IB_USER_VERBS_CMD_ALLOC_PD,
+	IB_USER_VERBS_CMD_DEALLOC_PD,
+	IB_USER_VERBS_CMD_CREATE_AH,
+	IB_USER_VERBS_CMD_MODIFY_AH,
+	IB_USER_VERBS_CMD_QUERY_AH,
+	IB_USER_VERBS_CMD_DESTROY_AH,
+	IB_USER_VERBS_CMD_REG_MR,
+	IB_USER_VERBS_CMD_REG_SMR,
+	IB_USER_VERBS_CMD_REREG_MR,
+	IB_USER_VERBS_CMD_QUERY_MR,
+	IB_USER_VERBS_CMD_DEREG_MR,
+	IB_USER_VERBS_CMD_ALLOC_MW,
+	IB_USER_VERBS_CMD_BIND_MW,
+	IB_USER_VERBS_CMD_DEALLOC_MW,
+	IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL,
+	IB_USER_VERBS_CMD_CREATE_CQ,
+	IB_USER_VERBS_CMD_RESIZE_CQ,
+	IB_USER_VERBS_CMD_DESTROY_CQ,
+	IB_USER_VERBS_CMD_POLL_CQ,
+	IB_USER_VERBS_CMD_PEEK_CQ,
+	IB_USER_VERBS_CMD_REQ_NOTIFY_CQ,
+	IB_USER_VERBS_CMD_CREATE_QP,
+	IB_USER_VERBS_CMD_QUERY_QP,
+	IB_USER_VERBS_CMD_MODIFY_QP,
+	IB_USER_VERBS_CMD_DESTROY_QP,
+	IB_USER_VERBS_CMD_POST_SEND,
+	IB_USER_VERBS_CMD_POST_RECV,
+	IB_USER_VERBS_CMD_ATTACH_MCAST,
+	IB_USER_VERBS_CMD_DETACH_MCAST,
+	IB_USER_VERBS_CMD_CREATE_SRQ,
+	IB_USER_VERBS_CMD_MODIFY_SRQ,
+	IB_USER_VERBS_CMD_QUERY_SRQ,
+	IB_USER_VERBS_CMD_DESTROY_SRQ,
+	IB_USER_VERBS_CMD_POST_SRQ_RECV,
+	IB_USER_VERBS_CMD_CREATE_XRC_SRQ,	/* = 37 */
+	IB_USER_VERBS_CMD_OPEN_XRC_DOMAIN,
+	IB_USER_VERBS_CMD_CLOSE_XRC_DOMAIN,
+	IB_USER_VERBS_CMD_CREATE_XRC_RCV_QP,
+	IB_USER_VERBS_CMD_MODIFY_XRC_RCV_QP,
+	IB_USER_VERBS_CMD_QUERY_XRC_RCV_QP,
+	IB_USER_VERBS_CMD_REG_XRC_RCV_QP,
+	IB_USER_VERBS_CMD_UNREG_XRC_RCV_QP,	/* = 44 */
+};
+
+/*
+ * Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ * Specifically:
+ *  - Do not use pointer types -- pass pointers in __u64 instead.
+ *  - Make sure that any structure larger than 4 bytes is padded to a
+ *    multiple of 8 bytes.  Otherwise the structure size will be
+ *    different between 32-bit and 64-bit architectures.
+ */
+
+struct ib_uverbs_async_event_desc {	/* 8 + 4 + 4 = 16 bytes */
+	__u64 element;
+	__u32 event_type;	/* enum ib_event_type */
+	__u32 reserved;		/* explicit padding: keeps the size a multiple of 8 */
+};
+
+struct ib_uverbs_comp_event_desc {
+	__u64 cq_handle;
+};
+
+/*
+ * All commands from userspace should start with a __u32 command field
+ * followed by __u16 in_words and out_words fields (which give the
+ * length of the command block and response buffer if any in 32-bit
+ * words).  The kernel driver will read these fields first and read
+ * the rest of the command struct based on these values.
+ */
+
+struct ib_uverbs_cmd_hdr {	/* 4 + 2 + 2 = 8 bytes */
+	__u32 command;
+	__u16 in_words;		/* length of the command block, in 32-bit words */
+	__u16 out_words;	/* length of the response buffer (if any), in 32-bit words */
+};
+
+struct ib_uverbs_get_context {
+	__u64 response;		/* presumably the user address of the response buffer, carried as __u64 rather than a pointer -- TODO confirm */
+	__u64 driver_data[0];	/* zero-length array (GNU extension): adds no size, names the offset just past the fixed members */
+};
+
+struct ib_uverbs_get_context_resp {
+	__u32 async_fd;
+	__u32 num_comp_vectors;
+};
+
+struct ib_uverbs_query_device {
+	__u64 response;
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_query_device_resp {
+	__u64 fw_ver;
+	__be64 node_guid;
+	__be64 sys_image_guid;
+	__u64 max_mr_size;
+	__u64 page_size_cap;
+	__u32 vendor_id;
+	__u32 vendor_part_id;
+	__u32 hw_ver;
+	__u32 max_qp;
+	__u32 max_qp_wr;
+	__u32 device_cap_flags;
+	__u32 max_sge;
+	__u32 max_sge_rd;
+	__u32 max_cq;
+	__u32 max_cqe;
+	__u32 max_mr;
+	__u32 max_pd;
+	__u32 max_qp_rd_atom;
+	__u32 max_ee_rd_atom;
+	__u32 max_res_rd_atom;
+	__u32 max_qp_init_rd_atom;
+	__u32 max_ee_init_rd_atom;
+	__u32 atomic_cap;
+	__u32 max_ee;
+	__u32 max_rdd;
+	__u32 max_mw;
+	__u32 max_raw_ipv6_qp;
+	__u32 max_raw_ethy_qp;
+	__u32 max_mcast_grp;
+	__u32 max_mcast_qp_attach;
+	__u32 max_total_mcast_qp_attach;
+	__u32 max_ah;
+	__u32 max_fmr;
+	__u32 max_map_per_fmr;
+	__u32 max_srq;
+	__u32 max_srq_wr;
+	__u32 max_srq_sge;
+	__u16 max_pkeys;
+	__u8  local_ca_ack_delay;
+	__u8  phys_port_cnt;
+	__u8  reserved[4];
+};
+
+struct ib_uverbs_query_port {
+	__u64 response;
+	__u8  port_num;
+	__u8  reserved[7];
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_query_port_resp {
+	__u32 port_cap_flags;
+	__u32 max_msg_sz;
+	__u32 bad_pkey_cntr;
+	__u32 qkey_viol_cntr;
+	__u32 gid_tbl_len;
+	__u16 pkey_tbl_len;
+	__u16 lid;
+	__u16 sm_lid;
+	__u8  state;
+	__u8  max_mtu;
+	__u8  active_mtu;
+	__u8  lmc;
+	__u8  max_vl_num;
+	__u8  sm_sl;
+	__u8  subnet_timeout;
+	__u8  init_type_reply;
+	__u8  active_width;
+	__u8  active_speed;
+	__u8  phys_state;
+	__u8  link_layer;
+	__u8  reserved[2];
+};
+
+struct ib_uverbs_alloc_pd {
+	__u64 response;
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_alloc_pd_resp {
+	__u32 pd_handle;
+};
+
+struct ib_uverbs_dealloc_pd {
+	__u32 pd_handle;
+};
+
+struct ib_uverbs_reg_mr {
+	__u64 response;
+	__u64 start;
+	__u64 length;
+	__u64 hca_va;
+	__u32 pd_handle;
+	__u32 access_flags;
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_reg_mr_resp {	/* 12 bytes: not a multiple of 8, but with only 32-bit members the size does not depend on 64-bit alignment */
+	__u32 mr_handle;
+	__u32 lkey;
+	__u32 rkey;
+};
+
+struct ib_uverbs_dereg_mr {
+	__u32 mr_handle;
+};
+
+struct ib_uverbs_create_comp_channel {
+	__u64 response;
+};
+
+struct ib_uverbs_create_comp_channel_resp {
+	__u32 fd;
+};
+
+struct ib_uverbs_create_cq {
+	__u64 response;
+	__u64 user_handle;
+	__u32 cqe;
+	__u32 comp_vector;
+	__s32 comp_channel;
+	__u32 reserved;
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_create_cq_resp {
+	__u32 cq_handle;
+	__u32 cqe;
+};
+
+struct ib_uverbs_resize_cq {
+	__u64 response;
+	__u32 cq_handle;
+	__u32 cqe;
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_resize_cq_resp {
+	__u32 cqe;
+	__u32 reserved;
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_poll_cq {
+	__u64 response;
+	__u32 cq_handle;
+	__u32 ne;
+};
+
+struct ib_uverbs_wc {	/* 48 bytes, no implicit padding */
+	__u64 wr_id;
+	__u32 status;
+	__u32 opcode;
+	__u32 vendor_err;
+	__u32 byte_len;
+	union {
+		__u32 imm_data;
+		__u32 invalidate_rkey;
+	} ex;	/* both members are __u32, so the union occupies 4 bytes */
+	__u32 qp_num;
+	__u32 src_qp;
+	__u32 wc_flags;
+	__u16 pkey_index;
+	__u16 slid;
+	__u8 sl;
+	__u8 dlid_path_bits;
+	__u8 port_num;
+	__u8 reserved;	/* offset 47: rounds the struct up to 48 bytes */
+};
+
+struct ib_uverbs_poll_cq_resp {
+	__u32 count;		/* presumably the number of wc[] entries that follow -- TODO confirm */
+	__u32 reserved;		/* puts wc[] at offset 8, suiting the __u64 that leads struct ib_uverbs_wc */
+	struct ib_uverbs_wc wc[0];	/* zero-length array (GNU extension): adds no size */
+};
+
+struct ib_uverbs_req_notify_cq {
+	__u32 cq_handle;
+	__u32 solicited_only;
+};
+
+struct ib_uverbs_destroy_cq {
+	__u64 response;
+	__u32 cq_handle;
+	__u32 reserved;
+};
+
+struct ib_uverbs_destroy_cq_resp {
+	__u32 comp_events_reported;
+	__u32 async_events_reported;
+};
+
+struct ib_uverbs_global_route {	/* 16 + 4 + 4*1 = 24 bytes */
+	__u8  dgid[16];
+	__u32 flow_label;
+	__u8  sgid_index;
+	__u8  hop_limit;
+	__u8  traffic_class;
+	__u8  reserved;		/* explicit padding byte at offset 23 */
+};
+
+struct ib_uverbs_ah_attr {	/* 24 (grh) + 2 + 6*1 = 32 bytes */
+	struct ib_uverbs_global_route grh;
+	__u16 dlid;
+	__u8  sl;
+	__u8  src_path_bits;
+	__u8  static_rate;
+	__u8  is_global;
+	__u8  port_num;
+	__u8  reserved;		/* explicit padding byte at offset 31 */
+};
+
+struct ib_uverbs_qp_attr {
+	__u32	qp_attr_mask;
+	__u32	qp_state;
+	__u32	cur_qp_state;
+	__u32	path_mtu;
+	__u32	path_mig_state;
+	__u32	qkey;
+	__u32	rq_psn;
+	__u32	sq_psn;
+	__u32	dest_qp_num;
+	__u32	qp_access_flags;
+
+	struct ib_uverbs_ah_attr ah_attr;
+	struct ib_uverbs_ah_attr alt_ah_attr;
+
+	/* ib_qp_cap */
+	__u32	max_send_wr;
+	__u32	max_recv_wr;
+	__u32	max_send_sge;
+	__u32	max_recv_sge;
+	__u32	max_inline_data;
+
+	__u16	pkey_index;
+	__u16	alt_pkey_index;
+	__u8	en_sqd_async_notify;
+	__u8	sq_draining;
+	__u8	max_rd_atomic;
+	__u8	max_dest_rd_atomic;
+	__u8	min_rnr_timer;
+	__u8	port_num;
+	__u8	timeout;
+	__u8	retry_cnt;
+	__u8	rnr_retry;
+	__u8	alt_port_num;
+	__u8	alt_timeout;
+	__u8	reserved[5];
+};
+
+struct ib_uverbs_create_qp {
+	__u64 response;
+	__u64 user_handle;
+	__u32 pd_handle;
+	__u32 send_cq_handle;
+	__u32 recv_cq_handle;
+	__u32 srq_handle;
+	__u32 max_send_wr;
+	__u32 max_recv_wr;
+	__u32 max_send_sge;
+	__u32 max_recv_sge;
+	__u32 max_inline_data;
+	__u8  sq_sig_all;
+	__u8  qp_type;
+	__u8  is_srq;
+	__u8  reserved;
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_create_qp_resp {
+	__u32 qp_handle;
+	__u32 qpn;
+	__u32 max_send_wr;
+	__u32 max_recv_wr;
+	__u32 max_send_sge;
+	__u32 max_recv_sge;
+	__u32 max_inline_data;
+	__u32 reserved;
+};
+
+/*
+ * This struct needs to remain a multiple of 8 bytes to keep the
+ * alignment of the modify QP parameters.
+ */
+struct ib_uverbs_qp_dest {	/* 16 + 4 + 2 + 2 + 8*1 = 32 bytes */
+	__u8  dgid[16];
+	__u32 flow_label;
+	__u16 dlid;
+	__u16 reserved;		/* explicit padding at offset 22 */
+	__u8  sgid_index;
+	__u8  hop_limit;
+	__u8  traffic_class;
+	__u8  sl;
+	__u8  src_path_bits;
+	__u8  static_rate;
+	__u8  is_global;
+	__u8  port_num;
+};
+
+struct ib_uverbs_query_qp {
+	__u64 response;
+	__u32 qp_handle;
+	__u32 attr_mask;
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_query_qp_resp {
+	struct ib_uverbs_qp_dest dest;
+	struct ib_uverbs_qp_dest alt_dest;
+	__u32 max_send_wr;
+	__u32 max_recv_wr;
+	__u32 max_send_sge;
+	__u32 max_recv_sge;
+	__u32 max_inline_data;
+	__u32 qkey;
+	__u32 rq_psn;
+	__u32 sq_psn;
+	__u32 dest_qp_num;
+	__u32 qp_access_flags;
+	__u16 pkey_index;
+	__u16 alt_pkey_index;
+	__u8  qp_state;
+	__u8  cur_qp_state;
+	__u8  path_mtu;
+	__u8  path_mig_state;
+	__u8  sq_draining;
+	__u8  max_rd_atomic;
+	__u8  max_dest_rd_atomic;
+	__u8  min_rnr_timer;
+	__u8  port_num;
+	__u8  timeout;
+	__u8  retry_cnt;
+	__u8  rnr_retry;
+	__u8  alt_port_num;
+	__u8  alt_timeout;
+	__u8  sq_sig_all;
+	__u8  reserved[5];
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_modify_qp {
+	struct ib_uverbs_qp_dest dest;
+	struct ib_uverbs_qp_dest alt_dest;
+	__u32 qp_handle;
+	__u32 attr_mask;
+	__u32 qkey;
+	__u32 rq_psn;
+	__u32 sq_psn;
+	__u32 dest_qp_num;
+	__u32 qp_access_flags;
+	__u16 pkey_index;
+	__u16 alt_pkey_index;
+	__u8  qp_state;
+	__u8  cur_qp_state;
+	__u8  path_mtu;
+	__u8  path_mig_state;
+	__u8  en_sqd_async_notify;
+	__u8  max_rd_atomic;
+	__u8  max_dest_rd_atomic;
+	__u8  min_rnr_timer;
+	__u8  port_num;
+	__u8  timeout;
+	__u8  retry_cnt;
+	__u8  rnr_retry;
+	__u8  alt_port_num;
+	__u8  alt_timeout;
+	__u8  reserved[2];
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_modify_qp_resp {	/* no members: an empty struct is a GNU C extension, not ISO C */
+};
+
+struct ib_uverbs_destroy_qp {
+	__u64 response;
+	__u32 qp_handle;
+	__u32 reserved;
+};
+
+struct ib_uverbs_destroy_qp_resp {
+	__u32 events_reported;
+};
+
+/*
+ * The ib_uverbs_sge structure isn't used anywhere, since we assume
+ * the ib_sge structure is packed the same way on 32-bit and 64-bit
+ * architectures in both kernel and user space.  It's just here to
+ * document the ABI.
+ */
+struct ib_uverbs_sge {
+	__u64 addr;
+	__u32 length;
+	__u32 lkey;
+};
+
+struct ib_uverbs_send_wr {	/* 24 bytes of fixed members + 32-byte union = 56 bytes */
+	__u64 wr_id;
+	__u32 num_sge;
+	__u32 opcode;
+	__u32 send_flags;
+	union {
+		__u32 imm_data;
+		__u32 invalidate_rkey;
+	} ex;	/* 4 bytes; ends at offset 24, so the next union is 8-byte aligned */
+	union {
+		struct {
+			__u64 remote_addr;
+			__u32 rkey;
+			__u32 reserved;
+		} rdma;		/* 16 bytes */
+		struct {
+			__u64 remote_addr;
+			__u64 compare_add;
+			__u64 swap;
+			__u32 rkey;
+			__u32 reserved;
+		} atomic;	/* 32 bytes: the largest member, so it sets the union size */
+		struct {
+			__u32 ah;
+			__u32 remote_qpn;
+			__u32 remote_qkey;
+			__u32 reserved;
+		} ud;		/* 16 bytes */
+	} wr;	/* presumably the active member is selected by opcode -- TODO confirm */
+};
+
+struct ib_uverbs_post_send {
+	__u64 response;
+	__u32 qp_handle;
+	__u32 wr_count;
+	__u32 sge_count;
+	__u32 wqe_size;
+	struct ib_uverbs_send_wr send_wr[0];
+};
+
+struct ib_uverbs_post_send_resp {
+	__u32 bad_wr;
+};
+
+struct ib_uverbs_recv_wr {
+	__u64 wr_id;
+	__u32 num_sge;
+	__u32 reserved;
+};
+
+struct ib_uverbs_post_recv {
+	__u64 response;
+	__u32 qp_handle;
+	__u32 wr_count;
+	__u32 sge_count;
+	__u32 wqe_size;
+	struct ib_uverbs_recv_wr recv_wr[0];
+};
+
+struct ib_uverbs_post_recv_resp {
+	__u32 bad_wr;
+};
+
+struct ib_uverbs_post_srq_recv {
+	__u64 response;
+	__u32 srq_handle;
+	__u32 wr_count;
+	__u32 sge_count;
+	__u32 wqe_size;
+	struct ib_uverbs_recv_wr recv[0];
+};
+
+struct ib_uverbs_post_srq_recv_resp {
+	__u32 bad_wr;
+};
+
+struct ib_uverbs_create_ah {
+	__u64 response;
+	__u64 user_handle;
+	__u32 pd_handle;
+	__u32 reserved;
+	struct ib_uverbs_ah_attr attr;
+};
+
+struct ib_uverbs_create_ah_resp {
+	__u32 ah_handle;
+};
+
+struct ib_uverbs_destroy_ah {
+	__u32 ah_handle;
+};
+
+struct ib_uverbs_attach_mcast {
+	__u8  gid[16];
+	__u32 qp_handle;
+	__u16 mlid;
+	__u16 reserved;
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_detach_mcast {
+	__u8  gid[16];
+	__u32 qp_handle;
+	__u16 mlid;
+	__u16 reserved;
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_create_srq {
+	__u64 response;
+	__u64 user_handle;
+	__u32 pd_handle;
+	__u32 max_wr;
+	__u32 max_sge;
+	__u32 srq_limit;
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_create_xrc_srq {
+	__u64 response;
+	__u64 user_handle;
+	__u32 pd_handle;
+	__u32 max_wr;
+	__u32 max_sge;
+	__u32 srq_limit;
+	__u32 xrcd_handle;
+	__u32 xrc_cq;
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_create_srq_resp {
+	__u32 srq_handle;
+	__u32 max_wr;
+	__u32 max_sge;
+	__u32 reserved;
+};
+
+struct ib_uverbs_modify_srq {
+	__u32 srq_handle;
+	__u32 attr_mask;
+	__u32 max_wr;
+	__u32 srq_limit;
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_query_srq {
+	__u64 response;
+	__u32 srq_handle;
+	__u32 reserved;
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_query_srq_resp {
+	__u32 max_wr;
+	__u32 max_sge;
+	__u32 srq_limit;
+	__u32 reserved;
+};
+
+struct ib_uverbs_destroy_srq {
+	__u64 response;
+	__u32 srq_handle;
+	__u32 reserved;
+};
+
+struct ib_uverbs_destroy_srq_resp {
+	__u32 events_reported;
+};
+
+struct ib_uverbs_open_xrc_domain {
+	__u64 response;
+	__u32 fd;
+	__u32 oflags;
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_open_xrc_domain_resp {
+	__u32 xrcd_handle;
+};
+
+struct ib_uverbs_close_xrc_domain {
+	__u64 response;
+	__u32 xrcd_handle;
+	__u32 reserved;
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_create_xrc_rcv_qp {
+	__u64 response;
+	__u64 user_handle;
+	__u32 xrc_domain_handle;
+	__u32 max_send_wr;
+	__u32 max_recv_wr;
+	__u32 max_send_sge;
+	__u32 max_recv_sge;
+	__u32 max_inline_data;
+	__u8  sq_sig_all;
+	__u8  qp_type;
+	__u8  reserved[6];
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_create_xrc_rcv_qp_resp {
+	__u32 qpn;
+	__u32 reserved;
+};
+
+struct ib_uverbs_modify_xrc_rcv_qp {
+	__u32 xrc_domain_handle;
+	__u32 qp_num;
+	struct ib_uverbs_qp_dest dest;
+	struct ib_uverbs_qp_dest alt_dest;
+	__u32 attr_mask;
+	__u32 qkey;
+	__u32 rq_psn;
+	__u32 sq_psn;
+	__u32 dest_qp_num;
+	__u32 qp_access_flags;
+	__u16 pkey_index;
+	__u16 alt_pkey_index;
+	__u8  qp_state;
+	__u8  cur_qp_state;
+	__u8  path_mtu;
+	__u8  path_mig_state;
+	__u8  en_sqd_async_notify;
+	__u8  max_rd_atomic;
+	__u8  max_dest_rd_atomic;
+	__u8  min_rnr_timer;
+	__u8  port_num;
+	__u8  timeout;
+	__u8  retry_cnt;
+	__u8  rnr_retry;
+	__u8  alt_port_num;
+	__u8  alt_timeout;
+	__u8  reserved[6];
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_query_xrc_rcv_qp {
+	__u64 response;
+	__u32 xrc_domain_handle;
+	__u32 qp_num;
+	__u32 attr_mask;
+	__u32 reserved;
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_reg_xrc_rcv_qp {
+	__u32 xrc_domain_handle;
+	__u32 qp_num;
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_unreg_xrc_rcv_qp {
+	__u32 xrc_domain_handle;
+	__u32 qp_num;
+	__u64 driver_data[0];
+};
+
+
+#endif /* IB_USER_VERBS_H */
diff --git a/sys/ofed/include/rdma/ib_verbs.h b/sys/ofed/include/rdma/ib_verbs.h
new file mode 100644
index 0000000..f5b054a
--- /dev/null
+++ b/sys/ofed/include/rdma/ib_verbs.h
@@ -0,0 +1,2170 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd.  All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation.  All rights reserved.
+ * Copyright (c) 2004 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(IB_VERBS_H)
+#define IB_VERBS_H
+
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/rwsem.h>
+#include <linux/scatterlist.h>
+
+#include <asm/atomic.h>
+#include <asm/uaccess.h>
+#include <linux/rbtree.h>
+#include <linux/mutex.h>
+
+union ib_gid {	/* 16 bytes, viewable as raw bytes or as two big-endian-typed 64-bit halves */
+	u8	raw[16];
+	struct {
+		__be64	subnet_prefix;	/* overlays raw[0..7] */
+		__be64	interface_id;	/* overlays raw[8..15] */
+	} global;
+};
+
+enum rdma_node_type {	/* numbering starts at 1; 0 is not a named node type */
+	/* IB values map to NodeInfo:NodeType. */
+	RDMA_NODE_IB_CA 	= 1,
+	RDMA_NODE_IB_SWITCH,	/* = 2 */
+	RDMA_NODE_IB_ROUTER,	/* = 3 */
+	RDMA_NODE_RNIC		/* = 4 */
+};
+
+enum rdma_transport_type {
+	RDMA_TRANSPORT_IB,
+	RDMA_TRANSPORT_IWARP
+};
+
+enum rdma_transport_type
+rdma_node_get_transport(enum rdma_node_type node_type) __attribute_const__;
+
+enum rdma_link_layer {
+	IB_LINK_LAYER_UNSPECIFIED,
+	IB_LINK_LAYER_INFINIBAND,
+	IB_LINK_LAYER_ETHERNET,
+};
+
+enum ib_device_cap_flags {	/* one bit per flag; bits 0..22 are all assigned */
+	IB_DEVICE_RESIZE_MAX_WR		= 1,	/* bit 0 */
+	IB_DEVICE_BAD_PKEY_CNTR		= (1<<1),
+	IB_DEVICE_BAD_QKEY_CNTR		= (1<<2),
+	IB_DEVICE_RAW_MULTI		= (1<<3),
+	IB_DEVICE_AUTO_PATH_MIG		= (1<<4),
+	IB_DEVICE_CHANGE_PHY_PORT	= (1<<5),
+	IB_DEVICE_UD_AV_PORT_ENFORCE	= (1<<6),
+	IB_DEVICE_CURR_QP_STATE_MOD	= (1<<7),
+	IB_DEVICE_SHUTDOWN_PORT		= (1<<8),
+	IB_DEVICE_INIT_TYPE		= (1<<9),
+	IB_DEVICE_PORT_ACTIVE_EVENT	= (1<<10),
+	IB_DEVICE_SYS_IMAGE_GUID	= (1<<11),
+	IB_DEVICE_RC_RNR_NAK_GEN	= (1<<12),
+	IB_DEVICE_SRQ_RESIZE		= (1<<13),
+	IB_DEVICE_N_NOTIFY_CQ		= (1<<14),
+	IB_DEVICE_LOCAL_DMA_LKEY	= (1<<15),
+	IB_DEVICE_RESERVED		= (1<<16), /* old SEND_W_INV */
+	IB_DEVICE_MEM_WINDOW		= (1<<17),
+	/*
+	 * Devices should set IB_DEVICE_UD_IP_CSUM if they support
+	 * insertion of UDP and TCP checksum on outgoing UD IPoIB
+	 * messages and can verify the validity of checksum for
+	 * incoming messages.  Setting this flag implies that the
+	 * IPoIB driver may set NETIF_F_IP_CSUM for datagram mode.
+	 */
+	IB_DEVICE_UD_IP_CSUM		= (1<<18),
+	IB_DEVICE_UD_TSO		= (1<<19),
+	IB_DEVICE_XRC			= (1<<20),
+	IB_DEVICE_MEM_MGT_EXTENSIONS	= (1<<21),
+	IB_DEVICE_BLOCK_MULTICAST_LOOPBACK = (1<<22),
+};
+
+enum ib_atomic_cap {
+	IB_ATOMIC_NONE,
+	IB_ATOMIC_HCA,
+	IB_ATOMIC_GLOB
+};
+
+struct ib_device_attr {
+	u64			fw_ver;
+	__be64			sys_image_guid;
+	u64			max_mr_size;
+	u64			page_size_cap;
+	u32			vendor_id;
+	u32			vendor_part_id;
+	u32			hw_ver;
+	int			max_qp;
+	int			max_qp_wr;
+	int			device_cap_flags;
+	int			max_sge;
+	int			max_sge_rd;
+	int			max_cq;
+	int			max_cqe;
+	int			max_mr;
+	int			max_pd;
+	int			max_qp_rd_atom;
+	int			max_ee_rd_atom;
+	int			max_res_rd_atom;
+	int			max_qp_init_rd_atom;
+	int			max_ee_init_rd_atom;
+	enum ib_atomic_cap	atomic_cap;
+	enum ib_atomic_cap	masked_atomic_cap;
+	int			max_ee;
+	int			max_rdd;
+	int			max_mw;
+	int			max_raw_ipv6_qp;
+	int			max_raw_ethy_qp;
+	int			max_mcast_grp;
+	int			max_mcast_qp_attach;
+	int			max_total_mcast_qp_attach;
+	int			max_ah;
+	int			max_fmr;
+	int			max_map_per_fmr;
+	int			max_srq;
+	int			max_srq_wr;
+	int			max_srq_sge;
+	unsigned int		max_fast_reg_page_list_len;
+	u16			max_pkeys;
+	u8			local_ca_ack_delay;
+};
+
+enum ib_mtu {	/* codes 1..5; the size in bytes named by each constant equals 128 << code */
+	IB_MTU_256  = 1,
+	IB_MTU_512  = 2,
+	IB_MTU_1024 = 3,
+	IB_MTU_2048 = 4,
+	IB_MTU_4096 = 5
+};
+
+static inline int ib_mtu_enum_to_int(enum ib_mtu mtu)	/* MTU code -> the number in the constant's name */
+{
+	switch (mtu) {
+	case IB_MTU_256:  return  256;
+	case IB_MTU_512:  return  512;
+	case IB_MTU_1024: return 1024;
+	case IB_MTU_2048: return 2048;
+	case IB_MTU_4096: return 4096;
+	default: 	  return -1;	/* any value that is not one of the five IB_MTU_* codes */
+	}
+}
+
+enum ib_port_state {
+	IB_PORT_NOP		= 0,
+	IB_PORT_DOWN		= 1,
+	IB_PORT_INIT		= 2,
+	IB_PORT_ARMED		= 3,
+	IB_PORT_ACTIVE		= 4,
+	IB_PORT_ACTIVE_DEFER	= 5
+};
+
+enum ib_port_cap_flags {	/* one bit per flag; bit 0 and bits 13..15 have no name here */
+	IB_PORT_SM				= 1 <<  1,
+	IB_PORT_NOTICE_SUP			= 1 <<  2,
+	IB_PORT_TRAP_SUP			= 1 <<  3,
+	IB_PORT_OPT_IPD_SUP                     = 1 <<  4,
+	IB_PORT_AUTO_MIGR_SUP			= 1 <<  5,
+	IB_PORT_SL_MAP_SUP			= 1 <<  6,
+	IB_PORT_MKEY_NVRAM			= 1 <<  7,
+	IB_PORT_PKEY_NVRAM			= 1 <<  8,
+	IB_PORT_LED_INFO_SUP			= 1 <<  9,
+	IB_PORT_SM_DISABLED			= 1 << 10,
+	IB_PORT_SYS_IMAGE_GUID_SUP		= 1 << 11,
+	IB_PORT_PKEY_SW_EXT_PORT_TRAP_SUP	= 1 << 12,
+	IB_PORT_CM_SUP				= 1 << 16,	/* numbering resumes at bit 16 */
+	IB_PORT_SNMP_TUNNEL_SUP			= 1 << 17,
+	IB_PORT_REINIT_SUP			= 1 << 18,
+	IB_PORT_DEVICE_MGMT_SUP			= 1 << 19,
+	IB_PORT_VENDOR_CLASS_SUP		= 1 << 20,
+	IB_PORT_DR_NOTICE_SUP			= 1 << 21,
+	IB_PORT_CAP_MASK_NOTICE_SUP		= 1 << 22,
+	IB_PORT_BOOT_MGMT_SUP			= 1 << 23,
+	IB_PORT_LINK_LATENCY_SUP		= 1 << 24,
+	IB_PORT_CLIENT_REG_SUP			= 1 << 25
+};
+
+enum ib_port_width {	/* each width is a separate bit (1, 2, 4, 8), not the lane count itself */
+	IB_WIDTH_1X	= 1,
+	IB_WIDTH_4X	= 2,
+	IB_WIDTH_8X	= 4,
+	IB_WIDTH_12X	= 8
+};
+
+static inline int ib_width_enum_to_int(enum ib_port_width width)	/* width code -> the multiplier in the constant's name */
+{
+	switch (width) {
+	case IB_WIDTH_1X:  return  1;
+	case IB_WIDTH_4X:  return  4;
+	case IB_WIDTH_8X:  return  8;
+	case IB_WIDTH_12X: return 12;
+	default: 	  return -1;	/* anything else, including an OR of several width bits */
+	}
+}
+
+struct ib_protocol_stats {	/* currently has no members: an empty struct is a GNU C extension, not ISO C */
+	/* TBD... */
+};
+
+struct iw_protocol_stats {
+	u64	ipInReceives;
+	u64	ipInHdrErrors;
+	u64	ipInTooBigErrors;
+	u64	ipInNoRoutes;
+	u64	ipInAddrErrors;
+	u64	ipInUnknownProtos;
+	u64	ipInTruncatedPkts;
+	u64	ipInDiscards;
+	u64	ipInDelivers;
+	u64	ipOutForwDatagrams;
+	u64	ipOutRequests;
+	u64	ipOutDiscards;
+	u64	ipOutNoRoutes;
+	u64	ipReasmTimeout;
+	u64	ipReasmReqds;
+	u64	ipReasmOKs;
+	u64	ipReasmFails;
+	u64	ipFragOKs;
+	u64	ipFragFails;
+	u64	ipFragCreates;
+	u64	ipInMcastPkts;
+	u64	ipOutMcastPkts;
+	u64	ipInBcastPkts;
+	u64	ipOutBcastPkts;
+
+	u64	tcpRtoAlgorithm;
+	u64	tcpRtoMin;
+	u64	tcpRtoMax;
+	u64	tcpMaxConn;
+	u64	tcpActiveOpens;
+	u64	tcpPassiveOpens;
+	u64	tcpAttemptFails;
+	u64	tcpEstabResets;
+	u64	tcpCurrEstab;
+	u64	tcpInSegs;
+	u64	tcpOutSegs;
+	u64	tcpRetransSegs;
+	u64	tcpInErrs;
+	u64	tcpOutRsts;
+};
+
+union rdma_protocol_stats {
+	struct ib_protocol_stats	ib;
+	struct iw_protocol_stats	iw;
+};
+
+struct ib_port_attr {
+	enum ib_port_state	state;
+	enum ib_mtu		max_mtu;
+	enum ib_mtu		active_mtu;
+	int			gid_tbl_len;
+	u32			port_cap_flags;
+	u32			max_msg_sz;
+	u32			bad_pkey_cntr;
+	u32			qkey_viol_cntr;
+	u16			pkey_tbl_len;
+	u16			lid;
+	u16			sm_lid;
+	u8			lmc;
+	u8			max_vl_num;
+	u8			sm_sl;
+	u8			subnet_timeout;
+	u8			init_type_reply;
+	u8			active_width;
+	u8			active_speed;
+	u8                      phys_state;
+	enum rdma_link_layer	link_layer;
+};
+
+enum ib_device_modify_flags {
+	IB_DEVICE_MODIFY_SYS_IMAGE_GUID	= 1 << 0,
+	IB_DEVICE_MODIFY_NODE_DESC	= 1 << 1
+};
+
+struct ib_device_modify {
+	u64	sys_image_guid;
+	char	node_desc[64];
+};
+
+enum ib_port_modify_flags {	/* bits 0, 2 and 3; bit 1 has no name here */
+	IB_PORT_SHUTDOWN		= 1,
+	IB_PORT_INIT_TYPE		= (1<<2),
+	IB_PORT_RESET_QKEY_CNTR		= (1<<3)
+};
+
+struct ib_port_modify {
+	u32	set_port_cap_mask;
+	u32	clr_port_cap_mask;
+	u8	init_type;
+};
+
+enum ib_event_type {
+	IB_EVENT_CQ_ERR,
+	IB_EVENT_QP_FATAL,
+	IB_EVENT_QP_REQ_ERR,
+	IB_EVENT_QP_ACCESS_ERR,
+	IB_EVENT_COMM_EST,
+	IB_EVENT_SQ_DRAINED,
+	IB_EVENT_PATH_MIG,
+	IB_EVENT_PATH_MIG_ERR,
+	IB_EVENT_DEVICE_FATAL,
+	IB_EVENT_PORT_ACTIVE,
+	IB_EVENT_PORT_ERR,
+	IB_EVENT_LID_CHANGE,
+	IB_EVENT_PKEY_CHANGE,
+	IB_EVENT_SM_CHANGE,
+	IB_EVENT_SRQ_ERR,
+	IB_EVENT_SRQ_LIMIT_REACHED,
+	IB_EVENT_QP_LAST_WQE_REACHED,
+	IB_EVENT_CLIENT_REREGISTER,
+	IB_EVENT_GID_CHANGE,
+};
+
+enum ib_event_flags {
+	IB_XRC_QP_EVENT_FLAG = 0x80000000,	/* bit 31: exceeds INT_MAX when int is 32 bits, so this enumerator relies on a compiler extension (ISO C before C23 requires enumerators to fit in int) */
+};
+
+struct ib_event {
+	struct ib_device	*device;
+	union {
+		struct ib_cq	*cq;
+		struct ib_qp	*qp;
+		struct ib_srq	*srq;
+		u8		port_num;
+		u32		xrc_qp_num;
+	} element;
+	enum ib_event_type	event;
+};
+
+struct ib_event_handler {
+	struct ib_device *device;
+	void            (*handler)(struct ib_event_handler *, struct ib_event *);
+	struct list_head  list;
+};
+
+#define INIT_IB_EVENT_HANDLER(_ptr, _device, _handler)	/* _ptr is expanded three times: pass a side-effect-free expression */	\
+	do {							\
+		(_ptr)->device  = _device;			\
+		(_ptr)->handler = _handler;			\
+		INIT_LIST_HEAD(&(_ptr)->list);			\
+	} while (0)	/* no trailing ';': the caller supplies it */
+
+struct ib_global_route {
+	union ib_gid	dgid;
+	u32		flow_label;
+	u8		sgid_index;
+	u8		hop_limit;
+	u8		traffic_class;
+};
+
+struct ib_grh {
+	__be32		version_tclass_flow;
+	__be16		paylen;
+	u8		next_hdr;
+	u8		hop_limit;
+	union ib_gid	sgid;
+	union ib_gid	dgid;
+};
+
+enum {
+	IB_MULTICAST_QPN = 0xffffff	/* all ones in the low 24 bits */
+};
+
+#define IB_LID_PERMISSIVE	cpu_to_be16(0xFFFF)
+
+enum ib_ah_flags {
+	IB_AH_GRH	= 1
+};
+
+enum ib_rate {	/* values are codes, not ordered by speed: e.g. IB_RATE_10_GBPS (3) < IB_RATE_5_GBPS (5); code 1 is unused */
+	IB_RATE_PORT_CURRENT = 0,
+	IB_RATE_2_5_GBPS = 2,
+	IB_RATE_5_GBPS   = 5,
+	IB_RATE_10_GBPS  = 3,
+	IB_RATE_20_GBPS  = 6,
+	IB_RATE_30_GBPS  = 4,
+	IB_RATE_40_GBPS  = 7,
+	IB_RATE_60_GBPS  = 8,
+	IB_RATE_80_GBPS  = 9,
+	IB_RATE_120_GBPS = 10
+};
+
+/**
+ * ib_rate_to_mult - Convert the IB rate enum to a multiple of the
+ * base rate of 2.5 Gbit/sec.  For example, IB_RATE_5_GBPS will be
+ * converted to 2, since 5 Gbit/sec is 2 * 2.5 Gbit/sec.
+ * @rate: rate to convert.
+ */
+int ib_rate_to_mult(enum ib_rate rate) __attribute_const__;
+
+/**
+ * mult_to_ib_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate
+ * enum.
+ * @mult: multiple to convert.
+ */
+enum ib_rate mult_to_ib_rate(int mult) __attribute_const__;
+
+struct ib_ah_attr {
+	struct ib_global_route	grh;
+	u16			dlid;
+	u8			sl;
+	u8			src_path_bits;
+	u8			static_rate;
+	u8			ah_flags;
+	u8			port_num;
+};
+
+enum ib_wc_status {
+	IB_WC_SUCCESS,
+	IB_WC_LOC_LEN_ERR,
+	IB_WC_LOC_QP_OP_ERR,
+	IB_WC_LOC_EEC_OP_ERR,
+	IB_WC_LOC_PROT_ERR,
+	IB_WC_WR_FLUSH_ERR,
+	IB_WC_MW_BIND_ERR,
+	IB_WC_BAD_RESP_ERR,
+	IB_WC_LOC_ACCESS_ERR,
+	IB_WC_REM_INV_REQ_ERR,
+	IB_WC_REM_ACCESS_ERR,
+	IB_WC_REM_OP_ERR,
+	IB_WC_RETRY_EXC_ERR,
+	IB_WC_RNR_RETRY_EXC_ERR,
+	IB_WC_LOC_RDD_VIOL_ERR,
+	IB_WC_REM_INV_RD_REQ_ERR,
+	IB_WC_REM_ABORT_ERR,
+	IB_WC_INV_EECN_ERR,
+	IB_WC_INV_EEC_STATE_ERR,
+	IB_WC_FATAL_ERR,
+	IB_WC_RESP_TIMEOUT_ERR,
+	IB_WC_GENERAL_ERR
+};
+
+enum ib_wc_opcode {
+	IB_WC_SEND,	/* = 0 */
+	IB_WC_RDMA_WRITE,
+	IB_WC_RDMA_READ,
+	IB_WC_COMP_SWAP,
+	IB_WC_FETCH_ADD,
+	IB_WC_BIND_MW,
+	IB_WC_LSO,
+	IB_WC_LOCAL_INV,
+	IB_WC_FAST_REG_MR,
+	IB_WC_MASKED_COMP_SWAP,
+	IB_WC_MASKED_FETCH_ADD,	/* = 10: every opcode up to here has bit 7 clear */
+/*
+ * Set value of IB_WC_RECV so consumers can test if a completion is a
+ * receive by testing (opcode & IB_WC_RECV).
+ */
+	IB_WC_RECV			= 1 << 7,	/* = 128 */
+	IB_WC_RECV_RDMA_WITH_IMM	/* = 129, so bit 7 is set here too */
+};
+
+enum ib_wc_flags {
+	IB_WC_GRH		= 1,
+	IB_WC_WITH_IMM		= (1<<1),
+	IB_WC_WITH_INVALIDATE	= (1<<2),
+};
+
+struct ib_wc {
+	u64			wr_id;
+	enum ib_wc_status	status;
+	enum ib_wc_opcode	opcode;
+	u32			vendor_err;
+	u32			byte_len;
+	struct ib_qp	       *qp;
+	union {
+		__be32		imm_data;
+		u32		invalidate_rkey;
+	} ex;
+	u32			src_qp;
+	int			wc_flags;
+	u16			pkey_index;
+	u16			slid;
+	u8			sl;
+	u8			dlid_path_bits;
+	u8			port_num;	/* valid only for DR SMPs on switches */
+	int			csum_ok;
+};
+
+enum ib_cq_notify_flags {
+	IB_CQ_SOLICITED			= 1 << 0,
+	IB_CQ_NEXT_COMP			= 1 << 1,
+	IB_CQ_SOLICITED_MASK		= IB_CQ_SOLICITED | IB_CQ_NEXT_COMP,	/* = 3: covers both flags above */
+	IB_CQ_REPORT_MISSED_EVENTS	= 1 << 2,	/* = 4: outside IB_CQ_SOLICITED_MASK */
+};
+
+enum ib_srq_attr_mask {
+	IB_SRQ_MAX_WR	= 1 << 0,
+	IB_SRQ_LIMIT	= 1 << 1,
+};
+
+struct ib_srq_attr {
+	u32	max_wr;
+	u32	max_sge;
+	u32	srq_limit;
+};
+
+struct ib_srq_init_attr {
+	void		      (*event_handler)(struct ib_event *, void *);
+	void		       *srq_context;
+	struct ib_srq_attr	attr;
+};
+
+struct ib_qp_cap {
+	u32	max_send_wr;
+	u32	max_recv_wr;
+	u32	max_send_sge;
+	u32	max_recv_sge;
+	u32	max_inline_data;
+};
+
+enum ib_sig_type {
+	IB_SIGNAL_ALL_WR,
+	IB_SIGNAL_REQ_WR
+};
+
+enum ib_qp_type {
+	/*
+	 * IB_QPT_SMI and IB_QPT_GSI have to be the first two entries
+	 * here (and in that order) since the MAD layer uses them as
+	 * indices into a 2-entry table.
+	 */
+	IB_QPT_SMI,	/* = 0 */
+	IB_QPT_GSI,	/* = 1 */
+
+	IB_QPT_RC,
+	IB_QPT_UC,
+	IB_QPT_UD,
+	IB_QPT_XRC,
+	IB_QPT_RAW_IPV6,
+	IB_QPT_RAW_ETY,
+	IB_QPT_RAW_ETH	/* = 8 */
+};
+
+enum ib_qp_create_flags {
+	IB_QP_CREATE_IPOIB_UD_LSO		= 1 << 0,
+	IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK	= 1 << 1,
+};
+
+struct ib_qp_init_attr {
+	void                  (*event_handler)(struct ib_event *, void *);
+	void		       *qp_context;
+	struct ib_cq	       *send_cq;
+	struct ib_cq	       *recv_cq;
+	struct ib_srq	       *srq;
+	struct ib_qp_cap	cap;
+	enum ib_sig_type	sq_sig_type;
+	enum ib_qp_type		qp_type;
+	enum ib_qp_create_flags	create_flags;
+	struct ib_xrcd	       *xrc_domain; /* XRC qp's only */
+	u8			port_num; /* special QP types only */
+};
+
+enum ib_rnr_timeout {
+	IB_RNR_TIMER_655_36 =  0,
+	IB_RNR_TIMER_000_01 =  1,
+	IB_RNR_TIMER_000_02 =  2,
+	IB_RNR_TIMER_000_03 =  3,
+	IB_RNR_TIMER_000_04 =  4,
+	IB_RNR_TIMER_000_06 =  5,
+	IB_RNR_TIMER_000_08 =  6,
+	IB_RNR_TIMER_000_12 =  7,
+	IB_RNR_TIMER_000_16 =  8,
+	IB_RNR_TIMER_000_24 =  9,
+	IB_RNR_TIMER_000_32 = 10,
+	IB_RNR_TIMER_000_48 = 11,
+	IB_RNR_TIMER_000_64 = 12,
+	IB_RNR_TIMER_000_96 = 13,
+	IB_RNR_TIMER_001_28 = 14,
+	IB_RNR_TIMER_001_92 = 15,
+	IB_RNR_TIMER_002_56 = 16,
+	IB_RNR_TIMER_003_84 = 17,
+	IB_RNR_TIMER_005_12 = 18,
+	IB_RNR_TIMER_007_68 = 19,
+	IB_RNR_TIMER_010_24 = 20,
+	IB_RNR_TIMER_015_36 = 21,
+	IB_RNR_TIMER_020_48 = 22,
+	IB_RNR_TIMER_030_72 = 23,
+	IB_RNR_TIMER_040_96 = 24,
+	IB_RNR_TIMER_061_44 = 25,
+	IB_RNR_TIMER_081_92 = 26,
+	IB_RNR_TIMER_122_88 = 27,
+	IB_RNR_TIMER_163_84 = 28,
+	IB_RNR_TIMER_245_76 = 29,
+	IB_RNR_TIMER_327_68 = 30,
+	IB_RNR_TIMER_491_52 = 31
+};
+
+enum ib_qp_attr_mask {	/* one bit per QP attribute (bits 0..20); type of ib_modify_qp_is_ok()'s mask argument */
+	IB_QP_STATE			= 1,	/* bit 0 */
+	IB_QP_CUR_STATE			= (1<<1),
+	IB_QP_EN_SQD_ASYNC_NOTIFY	= (1<<2),
+	IB_QP_ACCESS_FLAGS		= (1<<3),
+	IB_QP_PKEY_INDEX		= (1<<4),
+	IB_QP_PORT			= (1<<5),
+	IB_QP_QKEY			= (1<<6),
+	IB_QP_AV			= (1<<7),
+	IB_QP_PATH_MTU			= (1<<8),
+	IB_QP_TIMEOUT			= (1<<9),
+	IB_QP_RETRY_CNT			= (1<<10),
+	IB_QP_RNR_RETRY			= (1<<11),
+	IB_QP_RQ_PSN			= (1<<12),
+	IB_QP_MAX_QP_RD_ATOMIC		= (1<<13),
+	IB_QP_ALT_PATH			= (1<<14),
+	IB_QP_MIN_RNR_TIMER		= (1<<15),
+	IB_QP_SQ_PSN			= (1<<16),
+	IB_QP_MAX_DEST_RD_ATOMIC	= (1<<17),
+	IB_QP_PATH_MIG_STATE		= (1<<18),
+	IB_QP_CAP			= (1<<19),
+	IB_QP_DEST_QPN			= (1<<20)
+};
+
+enum ib_qp_state {	/* type of ib_qp_attr.qp_state/cur_qp_state and of ib_modify_qp_is_ok()'s state arguments */
+	IB_QPS_RESET,	/* 0; remaining values follow consecutively */
+	IB_QPS_INIT,
+	IB_QPS_RTR,
+	IB_QPS_RTS,
+	IB_QPS_SQD,
+	IB_QPS_SQE,
+	IB_QPS_ERR
+};
+
+enum ib_mig_state {	/* value type of ib_qp_attr.path_mig_state */
+	IB_MIG_MIGRATED,
+	IB_MIG_REARM,
+	IB_MIG_ARMED
+};
+
+struct ib_qp_attr {	/* QP attribute set; passed to modify_qp/query_qp together with an attribute mask */
+	enum ib_qp_state	qp_state;
+	enum ib_qp_state	cur_qp_state;
+	enum ib_mtu		path_mtu;
+	enum ib_mig_state	path_mig_state;
+	u32			qkey;
+	u32			rq_psn;
+	u32			sq_psn;
+	u32			dest_qp_num;
+	int			qp_access_flags;	/* presumably an OR of enum ib_access_flags bits -- confirm */
+	struct ib_qp_cap	cap;
+	struct ib_ah_attr	ah_attr;
+	struct ib_ah_attr	alt_ah_attr;
+	u16			pkey_index;
+	u16			alt_pkey_index;
+	u8			en_sqd_async_notify;
+	u8			sq_draining;
+	u8			max_rd_atomic;
+	u8			max_dest_rd_atomic;
+	u8			min_rnr_timer;	/* presumably an enum ib_rnr_timeout code -- confirm */
+	u8			port_num;
+	u8			timeout;
+	u8			retry_cnt;
+	u8			rnr_retry;
+	u8			alt_port_num;
+	u8			alt_timeout;
+};
+
+enum ib_wr_opcode {	/* value type of ib_send_wr.opcode */
+	IB_WR_RDMA_WRITE,	/* 0; remaining values follow consecutively */
+	IB_WR_RDMA_WRITE_WITH_IMM,
+	IB_WR_SEND,
+	IB_WR_SEND_WITH_IMM,
+	IB_WR_RDMA_READ,
+	IB_WR_ATOMIC_CMP_AND_SWP,
+	IB_WR_ATOMIC_FETCH_AND_ADD,
+	IB_WR_LSO,
+	IB_WR_BIG_LSO,
+	IB_WR_SEND_WITH_INV,
+	IB_WR_RDMA_READ_WITH_INV,
+	IB_WR_LOCAL_INV,
+	IB_WR_FAST_REG_MR,
+	IB_WR_MASKED_ATOMIC_CMP_AND_SWP,
+	IB_WR_MASKED_ATOMIC_FETCH_AND_ADD,
+};
+
+enum ib_send_flags {	/* bit flags; presumably OR-ed into the int send_flags of ib_send_wr / ib_mw_bind -- confirm */
+	IB_SEND_FENCE		= 1,	/* bit 0 */
+	IB_SEND_SIGNALED	= (1<<1),
+	IB_SEND_SOLICITED	= (1<<2),
+	IB_SEND_INLINE		= (1<<3),
+	IB_SEND_IP_CSUM		= (1<<4)
+};
+
+struct ib_sge {		/* element type of the sg_list pointers in ib_send_wr and ib_recv_wr */
+	u64	addr;
+	u32	length;
+	u32	lkey;	/* presumably the lkey of the MR covering addr -- confirm */
+};
+
+struct ib_fast_reg_page_list {	/* returned by the alloc_fast_reg_page_list device method; referenced by ib_send_wr.wr.fast_reg */
+	struct ib_device       *device;
+	u64		       *page_list;
+	unsigned int		max_page_list_len;
+};
+
+struct ib_send_wr {	/* send work request; ib_post_send() takes a list of these */
+	struct ib_send_wr      *next;	/* link to the following request in the list */
+	u64			wr_id;
+	struct ib_sge	       *sg_list;
+	int			num_sge;	/* presumably the number of entries in sg_list -- confirm */
+	enum ib_wr_opcode	opcode;
+	int			send_flags;	/* presumably an OR of enum ib_send_flags bits -- confirm */
+	union {
+		__be32		imm_data;
+		u32		invalidate_rkey;
+	} ex;
+	union {		/* which member is meaningful presumably depends on opcode / QP type -- confirm */
+		struct {
+			u64	remote_addr;
+			u32	rkey;
+		} rdma;
+		struct {
+			u64	remote_addr;
+			u64	compare_add;
+			u64	swap;
+			u64	compare_add_mask;
+			u64	swap_mask;
+			u32	rkey;
+		} atomic;
+		struct {
+			struct ib_ah *ah;
+			void   *header;
+			int     hlen;
+			int     mss;
+			u32	remote_qpn;
+			u32	remote_qkey;
+			u16	pkey_index; /* valid for GSI only */
+			u8	port_num;   /* valid for DR SMPs on switch only */
+		} ud;
+		struct {
+			u64				iova_start;
+			struct ib_fast_reg_page_list   *page_list;
+			unsigned int			page_shift;
+			unsigned int			page_list_len;
+			u32				length;
+			int				access_flags;
+			u32				rkey;
+		} fast_reg;
+		struct {
+			struct ib_unpacked_lrh	*lrh;
+			u32			eth_type;
+			u8			static_rate;
+		} raw_ety;
+	} wr;
+	u32			xrc_remote_srq_num; /* valid for XRC sends only */
+};
+
+struct ib_recv_wr {	/* receive work request; ib_post_recv() and ib_post_srq_recv() take a list of these */
+	struct ib_recv_wr      *next;	/* link to the following request in the list */
+	u64			wr_id;
+	struct ib_sge	       *sg_list;
+	int			num_sge;	/* presumably the number of entries in sg_list -- confirm */
+};
+
+enum ib_access_flags {	/* bit flags; presumably what the int *_access_flags fields and parameters carry -- confirm */
+	IB_ACCESS_LOCAL_WRITE	= 1,	/* bit 0 */
+	IB_ACCESS_REMOTE_WRITE	= (1<<1),
+	IB_ACCESS_REMOTE_READ	= (1<<2),
+	IB_ACCESS_REMOTE_ATOMIC	= (1<<3),
+	IB_ACCESS_MW_BIND	= (1<<4)
+};
+
+struct ib_phys_buf {	/* element of the phys_buf_array given to reg_phys_mr / rereg_phys_mr */
+	u64      addr;
+	u64      size;
+};
+
+struct ib_mr_attr {	/* second argument of the query_mr device method */
+	struct ib_pd	*pd;
+	u64		device_virt_addr;
+	u64		size;
+	int		mr_access_flags;
+	u32		lkey;
+	u32		rkey;
+};
+
+enum ib_mr_rereg_flags {	/* bit flags; presumably OR-ed into rereg_phys_mr's mr_rereg_mask -- confirm */
+	IB_MR_REREG_TRANS	= 1,	/* bit 0 */
+	IB_MR_REREG_PD		= (1<<1),
+	IB_MR_REREG_ACCESS	= (1<<2)
+};
+
+struct ib_mw_bind {	/* third argument of the bind_mw device method */
+	struct ib_mr   *mr;
+	u64		wr_id;
+	u64		addr;
+	u32		length;
+	int		send_flags;
+	int		mw_access_flags;
+};
+
+struct ib_fmr_attr {	/* third argument of the alloc_fmr device method */
+	int	max_pages;
+	int	max_maps;
+	u8	page_shift;
+};
+
+struct ib_ucontext {	/* returned by alloc_ucontext, released by dealloc_ucontext; one list head per resource type */
+	struct ib_device       *device;
+	struct list_head	pd_list;
+	struct list_head	mr_list;
+	struct list_head	mw_list;
+	struct list_head	cq_list;
+	struct list_head	qp_list;
+	struct list_head	srq_list;
+	struct list_head	ah_list;
+	struct list_head	xrc_domain_list;
+	int			closing;
+};
+
+struct ib_uobject {	/* pointed to by the ->uobject member of ib_pd, ib_xrcd, ib_ah, ib_cq, ib_srq, ib_qp, ib_mr and ib_mw */
+	u64			user_handle;	/* handle given to us by userspace */
+	struct ib_ucontext     *context;	/* associated user context */
+	void		       *object;		/* containing object */
+	struct list_head	list;		/* link to context's list */
+	int			id;		/* index into kernel idr */
+	struct kref		ref;
+	struct rw_semaphore	mutex;		/* protects .live */
+	int			live;
+};
+
+struct ib_udata {	/* pair of user-space buffers handed to many device methods */
+	void __user *inbuf;	/* source read by ib_copy_from_udata() */
+	void __user *outbuf;	/* destination written by ib_copy_to_udata() */
+	size_t       inlen;	/* not consulted by ib_copy_from_udata() */
+	size_t       outlen;	/* not consulted by ib_copy_to_udata() */
+};
+
+struct ib_uxrc_rcv_object {	/* presumably tracks one XRC receive QP registered by a user context -- confirm */
+	struct list_head	list;		/* link to context's list */
+	u32			qp_num;
+	u32			domain_handle;
+};
+
+struct ib_pd {		/* protection domain; returned by ib_alloc_pd() and the alloc_pd device method */
+	struct ib_device       *device;
+	struct ib_uobject      *uobject;
+	atomic_t          	usecnt; /* count all resources */
+};
+
+struct ib_xrcd {	/* XRC domain; returned by the alloc_xrcd device method */
+	struct ib_device       *device;
+	struct ib_uobject      *uobject;
+	struct inode	       *inode;
+	struct rb_node		node;	/* presumably links into ib_device.ib_uverbs_xrcd_table -- confirm */
+	atomic_t		usecnt; /* count all resources */
+};
+
+
+struct ib_ah {		/* address handle; returned by ib_create_ah() and the create_ah device method */
+	struct ib_device	*device;
+	struct ib_pd		*pd;
+	struct ib_uobject	*uobject;
+};
+
+typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context);	/* completion callback type; see ib_cq.comp_handler */
+
+struct ib_cq {		/* completion queue; returned by ib_create_cq() and the create_cq device method */
+	struct ib_device       *device;
+	struct ib_uobject      *uobject;
+	ib_comp_handler   	comp_handler;
+	void                  (*event_handler)(struct ib_event *, void *);
+	void                   *cq_context;
+	int               	cqe;
+	atomic_t          	usecnt; /* count number of work queues */
+};
+
+struct ib_srq {		/* shared receive queue; returned by ib_create_srq() / ib_create_xrc_srq() */
+	struct ib_device       *device;
+	struct ib_pd	       *pd;
+	struct ib_cq	       *xrc_cq;
+	struct ib_xrcd	       *xrcd;
+	struct ib_uobject      *uobject;
+	void		      (*event_handler)(struct ib_event *, void *);
+	void		       *srq_context;
+	atomic_t		usecnt;
+	u32			xrc_srq_num;
+};
+
+struct ib_qp {		/* queue pair; returned by ib_create_qp() and the create_qp device method */
+	struct ib_device       *device;	/* ib_post_send()/ib_post_recv() dispatch through this */
+	struct ib_pd	       *pd;
+	struct ib_cq	       *send_cq;
+	struct ib_cq	       *recv_cq;
+	struct ib_srq	       *srq;
+	struct ib_uobject      *uobject;
+	void                  (*event_handler)(struct ib_event *, void *);
+	void		       *qp_context;
+	u32			qp_num;
+	enum ib_qp_type		qp_type;
+	struct ib_xrcd	       *xrcd;  /* XRC QPs only */
+};
+
+struct ib_mr {		/* memory region; returned by the get_dma_mr / reg_phys_mr / reg_user_mr / alloc_fast_reg_mr methods */
+	struct ib_device  *device;
+	struct ib_pd	  *pd;
+	struct ib_uobject *uobject;
+	u32		   lkey;
+	u32		   rkey;
+	atomic_t	   usecnt; /* count number of MWs */
+};
+
+struct ib_mw {		/* memory window; returned by the alloc_mw device method */
+	struct ib_device	*device;
+	struct ib_pd		*pd;
+	struct ib_uobject	*uobject;
+	u32			rkey;
+};
+
+struct ib_fmr {		/* returned by the alloc_fmr device method */
+	struct ib_device	*device;
+	struct ib_pd		*pd;
+	struct list_head	list;	/* presumably the link used by unmap_fmr's fmr_list -- confirm */
+	u32			lkey;
+	u32			rkey;
+};
+
+struct ib_mad;
+struct ib_grh;
+
+enum ib_process_mad_flags {	/* bit flags; presumably for process_mad's process_mad_flags argument -- confirm */
+	IB_MAD_IGNORE_MKEY	= 1,
+	IB_MAD_IGNORE_BKEY	= 2,
+	IB_MAD_IGNORE_ALL	= IB_MAD_IGNORE_MKEY | IB_MAD_IGNORE_BKEY	/* == 3 */
+};
+
+enum ib_mad_result {	/* bit flags; presumably the int returned by the process_mad device method -- confirm */
+	IB_MAD_RESULT_FAILURE  = 0,      /* (!SUCCESS is the important flag) */
+	IB_MAD_RESULT_SUCCESS  = 1 << 0, /* MAD was successfully processed   */
+	IB_MAD_RESULT_REPLY    = 1 << 1, /* Reply packet needs to be sent    */
+	IB_MAD_RESULT_CONSUMED = 1 << 2  /* Packet consumed: stop processing */
+};
+
+#define IB_DEVICE_NAME_MAX 64	/* size in bytes of ib_device.name */
+
+struct ib_cache {	/* embedded in struct ib_device as ->cache */
+	rwlock_t                lock;
+	struct ib_event_handler event_handler;
+	struct ib_pkey_cache  **pkey_cache;	/* presumably one entry per port -- confirm */
+	struct ib_gid_cache   **gid_cache;	/* presumably one entry per port -- confirm */
+	u8                     *lmc_cache;
+};
+
+struct ib_dma_mapping_ops {	/* table of DMA mapping callbacks; ib_device.dma_ops points to one */
+	int		(*mapping_error)(struct ib_device *dev,
+					 u64 dma_addr);
+	u64		(*map_single)(struct ib_device *dev,
+				      void *ptr, size_t size,
+				      enum dma_data_direction direction);
+	void		(*unmap_single)(struct ib_device *dev,
+					u64 addr, size_t size,
+					enum dma_data_direction direction);
+	u64		(*map_page)(struct ib_device *dev,
+				    struct page *page, unsigned long offset,
+				    size_t size,
+				    enum dma_data_direction direction);
+	void		(*unmap_page)(struct ib_device *dev,
+				      u64 addr, size_t size,
+				      enum dma_data_direction direction);
+	int		(*map_sg)(struct ib_device *dev,
+				  struct scatterlist *sg, int nents,
+				  enum dma_data_direction direction);
+	void		(*unmap_sg)(struct ib_device *dev,
+				    struct scatterlist *sg, int nents,
+				    enum dma_data_direction direction);
+	u64		(*dma_address)(struct ib_device *dev,
+				       struct scatterlist *sg);
+	unsigned int	(*dma_len)(struct ib_device *dev,
+				   struct scatterlist *sg);
+	void		(*sync_single_for_cpu)(struct ib_device *dev,
+					       u64 dma_handle,
+					       size_t size,
+					       enum dma_data_direction dir);
+	void		(*sync_single_for_device)(struct ib_device *dev,
+						  u64 dma_handle,
+						  size_t size,
+						  enum dma_data_direction dir);
+	void		*(*alloc_coherent)(struct ib_device *dev,
+					   size_t size,
+					   u64 *dma_handle,
+					   gfp_t flag);
+	void		(*free_coherent)(struct ib_device *dev,
+					 size_t size, void *cpu_addr,
+					 u64 dma_handle);
+};
+
+struct iw_cm_verbs;
+
+struct ib_device {	/* per-device state plus the table of verbs methods (function pointers) for that device */
+	struct device                *dma_device;
+
+	char                          name[IB_DEVICE_NAME_MAX];
+
+	struct list_head              event_handler_list;
+	spinlock_t                    event_handler_lock;
+
+	struct list_head              core_list;
+	struct list_head              client_data_list;
+	spinlock_t                    client_data_lock;
+
+	struct ib_cache               cache;
+	int                          *pkey_tbl_len;
+	int                          *gid_tbl_len;
+
+	int			      num_comp_vectors;
+
+	struct iw_cm_verbs	     *iwcm;
+
+	int		           (*get_protocol_stats)(struct ib_device *device,
+							 union rdma_protocol_stats *stats);
+	int		           (*query_device)(struct ib_device *device,
+						   struct ib_device_attr *device_attr);
+	int		           (*query_port)(struct ib_device *device,
+						 u8 port_num,
+						 struct ib_port_attr *port_attr);
+	enum rdma_link_layer	   (*get_link_layer)(struct ib_device *device,
+						     u8 port_num);
+	int		           (*query_gid)(struct ib_device *device,
+						u8 port_num, int index,
+						union ib_gid *gid);
+	int		           (*query_pkey)(struct ib_device *device,
+						 u8 port_num, u16 index, u16 *pkey);
+	int		           (*modify_device)(struct ib_device *device,
+						    int device_modify_mask,
+						    struct ib_device_modify *device_modify);
+	int		           (*modify_port)(struct ib_device *device,
+						  u8 port_num, int port_modify_mask,
+						  struct ib_port_modify *port_modify);
+	struct ib_ucontext *       (*alloc_ucontext)(struct ib_device *device,
+						     struct ib_udata *udata);
+	int                        (*dealloc_ucontext)(struct ib_ucontext *context);
+	int                        (*mmap)(struct ib_ucontext *context,
+					   struct vm_area_struct *vma);
+	struct ib_pd *             (*alloc_pd)(struct ib_device *device,
+					       struct ib_ucontext *context,
+					       struct ib_udata *udata);
+	int                        (*dealloc_pd)(struct ib_pd *pd);
+	struct ib_ah *             (*create_ah)(struct ib_pd *pd,
+						struct ib_ah_attr *ah_attr);
+	int                        (*modify_ah)(struct ib_ah *ah,
+						struct ib_ah_attr *ah_attr);
+	int                        (*query_ah)(struct ib_ah *ah,
+					       struct ib_ah_attr *ah_attr);
+	int                        (*destroy_ah)(struct ib_ah *ah);
+	struct ib_srq *            (*create_srq)(struct ib_pd *pd,
+						 struct ib_srq_init_attr *srq_init_attr,
+						 struct ib_udata *udata);
+	int                        (*modify_srq)(struct ib_srq *srq,
+						 struct ib_srq_attr *srq_attr,
+						 enum ib_srq_attr_mask srq_attr_mask,
+						 struct ib_udata *udata);
+	int                        (*query_srq)(struct ib_srq *srq,
+						struct ib_srq_attr *srq_attr);
+	int                        (*destroy_srq)(struct ib_srq *srq);
+	int                        (*post_srq_recv)(struct ib_srq *srq,
+						    struct ib_recv_wr *recv_wr,
+						    struct ib_recv_wr **bad_recv_wr);	/* invoked by ib_post_srq_recv() */
+	struct ib_qp *             (*create_qp)(struct ib_pd *pd,
+						struct ib_qp_init_attr *qp_init_attr,
+						struct ib_udata *udata);
+	int                        (*modify_qp)(struct ib_qp *qp,
+						struct ib_qp_attr *qp_attr,
+						int qp_attr_mask,
+						struct ib_udata *udata);
+	int                        (*query_qp)(struct ib_qp *qp,
+					       struct ib_qp_attr *qp_attr,
+					       int qp_attr_mask,
+					       struct ib_qp_init_attr *qp_init_attr);
+	int                        (*destroy_qp)(struct ib_qp *qp);
+	int                        (*post_send)(struct ib_qp *qp,
+						struct ib_send_wr *send_wr,
+						struct ib_send_wr **bad_send_wr);	/* invoked by ib_post_send() */
+	int                        (*post_recv)(struct ib_qp *qp,
+						struct ib_recv_wr *recv_wr,
+						struct ib_recv_wr **bad_recv_wr);	/* invoked by ib_post_recv() */
+	struct ib_cq *             (*create_cq)(struct ib_device *device, int cqe,
+						int comp_vector,
+						struct ib_ucontext *context,
+						struct ib_udata *udata);
+	int                        (*modify_cq)(struct ib_cq *cq, u16 cq_count,
+						u16 cq_period);
+	int                        (*destroy_cq)(struct ib_cq *cq);
+	int                        (*resize_cq)(struct ib_cq *cq, int cqe,
+						struct ib_udata *udata);
+	int                        (*poll_cq)(struct ib_cq *cq, int num_entries,
+					      struct ib_wc *wc);	/* invoked by ib_poll_cq() */
+	int                        (*peek_cq)(struct ib_cq *cq, int wc_cnt);
+	int                        (*req_notify_cq)(struct ib_cq *cq,
+						    enum ib_cq_notify_flags flags);
+	int                        (*req_ncomp_notif)(struct ib_cq *cq,
+						      int wc_cnt);
+	struct ib_mr *             (*get_dma_mr)(struct ib_pd *pd,
+						 int mr_access_flags);
+	struct ib_mr *             (*reg_phys_mr)(struct ib_pd *pd,
+						  struct ib_phys_buf *phys_buf_array,
+						  int num_phys_buf,
+						  int mr_access_flags,
+						  u64 *iova_start);
+	struct ib_mr *             (*reg_user_mr)(struct ib_pd *pd,
+						  u64 start, u64 length,
+						  u64 virt_addr,
+						  int mr_access_flags,
+						  struct ib_udata *udata);
+	int                        (*query_mr)(struct ib_mr *mr,
+					       struct ib_mr_attr *mr_attr);
+	int                        (*dereg_mr)(struct ib_mr *mr);
+	struct ib_mr *		   (*alloc_fast_reg_mr)(struct ib_pd *pd,
+					       int max_page_list_len);
+	struct ib_fast_reg_page_list * (*alloc_fast_reg_page_list)(struct ib_device *device,
+								   int page_list_len);
+	void			   (*free_fast_reg_page_list)(struct ib_fast_reg_page_list *page_list);
+	int                        (*rereg_phys_mr)(struct ib_mr *mr,
+						    int mr_rereg_mask,
+						    struct ib_pd *pd,
+						    struct ib_phys_buf *phys_buf_array,
+						    int num_phys_buf,
+						    int mr_access_flags,
+						    u64 *iova_start);
+	struct ib_mw *             (*alloc_mw)(struct ib_pd *pd);
+	int                        (*bind_mw)(struct ib_qp *qp,
+					      struct ib_mw *mw,
+					      struct ib_mw_bind *mw_bind);
+	int                        (*dealloc_mw)(struct ib_mw *mw);
+	struct ib_fmr *	           (*alloc_fmr)(struct ib_pd *pd,
+						int mr_access_flags,
+						struct ib_fmr_attr *fmr_attr);
+	int		           (*map_phys_fmr)(struct ib_fmr *fmr,
+						   u64 *page_list, int list_len,
+						   u64 iova);
+	int		           (*unmap_fmr)(struct list_head *fmr_list);
+	int		           (*dealloc_fmr)(struct ib_fmr *fmr);
+	int                        (*attach_mcast)(struct ib_qp *qp,
+						   union ib_gid *gid,
+						   u16 lid);
+	int                        (*detach_mcast)(struct ib_qp *qp,
+						   union ib_gid *gid,
+						   u16 lid);
+	int                        (*process_mad)(struct ib_device *device,
+						  int process_mad_flags,
+						  u8 port_num,
+						  struct ib_wc *in_wc,
+						  struct ib_grh *in_grh,
+						  struct ib_mad *in_mad,
+						  struct ib_mad *out_mad);
+	struct ib_srq *		   (*create_xrc_srq)(struct ib_pd *pd,
+						     struct ib_cq *xrc_cq,
+						     struct ib_xrcd *xrcd,
+						     struct ib_srq_init_attr *srq_init_attr,
+						     struct ib_udata *udata);
+	struct ib_xrcd *	   (*alloc_xrcd)(struct ib_device *device,
+						 struct ib_ucontext *context,
+						 struct ib_udata *udata);
+	int			   (*dealloc_xrcd)(struct ib_xrcd *xrcd);
+	int			   (*create_xrc_rcv_qp)(struct ib_qp_init_attr *init_attr,
+							u32 *qp_num);
+	int			   (*modify_xrc_rcv_qp)(struct ib_xrcd *xrcd,
+							u32 qp_num,
+							struct ib_qp_attr *attr,
+							int attr_mask);
+	int			   (*query_xrc_rcv_qp)(struct ib_xrcd *xrcd,
+						       u32 qp_num,
+						       struct ib_qp_attr *attr,
+						       int attr_mask,
+						       struct ib_qp_init_attr *init_attr);
+	int 			   (*reg_xrc_rcv_qp)(struct ib_xrcd *xrcd,
+						     void *context,
+						     u32 qp_num);
+	int 			   (*unreg_xrc_rcv_qp)(struct ib_xrcd *xrcd,
+						       void *context,
+						       u32 qp_num);
+
+	struct ib_dma_mapping_ops   *dma_ops;
+
+	struct module               *owner;
+	struct device                dev;
+	struct kobject               *ports_parent;
+	struct list_head             port_list;
+
+	enum {
+		IB_DEV_UNINITIALIZED,
+		IB_DEV_REGISTERED,
+		IB_DEV_UNREGISTERED
+	}                            reg_state;	/* presumably advanced by ib_register_device()/ib_unregister_device() -- confirm */
+
+	u64			     uverbs_cmd_mask;
+	int			     uverbs_abi_ver;
+
+	char			     node_desc[64];
+	__be64			     node_guid;
+	u32			     local_dma_lkey;
+	u8                           node_type;
+	u8                           phys_port_cnt;
+	struct rb_root		     ib_uverbs_xrcd_table;
+	struct mutex		     xrcd_table_mutex;	/* presumably guards ib_uverbs_xrcd_table -- confirm */
+};
+
+struct ib_client {	/* registered with ib_register_client(), removed with ib_unregister_client() */
+	char  *name;
+	void (*add)   (struct ib_device *);	/* presumably called for each device that appears -- confirm */
+	void (*remove)(struct ib_device *);	/* presumably called for each device that goes away -- confirm */
+
+	struct list_head list;
+};
+
+struct ib_device *ib_alloc_device(size_t size);
+void ib_dealloc_device(struct ib_device *device);
+
+int ib_register_device   (struct ib_device *device);
+void ib_unregister_device(struct ib_device *device);
+
+int ib_register_client   (struct ib_client *client);
+void ib_unregister_client(struct ib_client *client);
+
+void *ib_get_client_data(struct ib_device *device, struct ib_client *client);
+void  ib_set_client_data(struct ib_device *device, struct ib_client *client,
+			 void *data);
+
+static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t len)	/* copy len bytes from udata->inbuf to dest */
+{
+	return copy_from_user(dest, udata->inbuf, len) ? -EFAULT : 0;	/* any nonzero result becomes -EFAULT; len is not checked against udata->inlen here */
+}
+
+static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len)	/* copy len bytes from src to udata->outbuf */
+{
+	return copy_to_user(udata->outbuf, src, len) ? -EFAULT : 0;	/* any nonzero result becomes -EFAULT; len is not checked against udata->outlen here */
+}
+
+/**
+ * ib_sysfs_create_port_files - iterate over port sysfs directories
+ * @device: the IB device
+ * @create: a function to create sysfs files in each port directory
+ */
+int ib_sysfs_create_port_files(struct ib_device *device,
+			       int (*create)(struct ib_device *dev, u8 port_num,
+					     struct kobject *kobj));
+
+/**
+ * ib_modify_qp_is_ok - Check that the supplied attribute mask
+ * contains all required attributes and no attributes not allowed for
+ * the given QP state transition.
+ * @cur_state: Current QP state
+ * @next_state: Next QP state
+ * @type: QP type
+ * @mask: Mask of supplied QP attributes
+ *
+ * This function is a helper function that a low-level driver's
+ * modify_qp method can use to validate the consumer's input.  It
+ * checks that cur_state and next_state are valid QP states, that a
+ * transition from cur_state to next_state is allowed by the IB spec,
+ * and that the attribute mask supplied is allowed for the transition.
+ */
+int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state,
+		       enum ib_qp_type type, enum ib_qp_attr_mask mask);
+
+int ib_register_event_handler  (struct ib_event_handler *event_handler);
+int ib_unregister_event_handler(struct ib_event_handler *event_handler);
+void ib_dispatch_event(struct ib_event *event);
+
+int ib_query_device(struct ib_device *device,
+		    struct ib_device_attr *device_attr);
+
+int ib_query_port(struct ib_device *device,
+		  u8 port_num, struct ib_port_attr *port_attr);
+
+enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device,
+					       u8 port_num);
+
+int ib_query_gid(struct ib_device *device,
+		 u8 port_num, int index, union ib_gid *gid);
+
+int ib_query_pkey(struct ib_device *device,
+		  u8 port_num, u16 index, u16 *pkey);
+
+int ib_modify_device(struct ib_device *device,
+		     int device_modify_mask,
+		     struct ib_device_modify *device_modify);
+
+int ib_modify_port(struct ib_device *device,
+		   u8 port_num, int port_modify_mask,
+		   struct ib_port_modify *port_modify);
+
+int ib_find_gid(struct ib_device *device, union ib_gid *gid,
+		u8 *port_num, u16 *index);
+
+int ib_find_pkey(struct ib_device *device,
+		 u8 port_num, u16 pkey, u16 *index);
+
+/**
+ * ib_alloc_pd - Allocates an unused protection domain.
+ * @device: The device on which to allocate the protection domain.
+ *
+ * A protection domain object provides an association between QPs, shared
+ * receive queues, address handles, memory regions, and memory windows.
+ */
+struct ib_pd *ib_alloc_pd(struct ib_device *device);
+
+/**
+ * ib_dealloc_pd - Deallocates a protection domain.
+ * @pd: The protection domain to deallocate.
+ */
+int ib_dealloc_pd(struct ib_pd *pd);
+
+/**
+ * ib_create_ah - Creates an address handle for the given address vector.
+ * @pd: The protection domain associated with the address handle.
+ * @ah_attr: The attributes of the address vector.
+ *
+ * The address handle is used to reference a local or global destination
+ * in all UD QP post sends.
+ */
+struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr);
+
+/**
+ * ib_init_ah_from_wc - Initializes address handle attributes from a
+ *   work completion.
+ * @device: Device on which the received message arrived.
+ * @port_num: Port on which the received message arrived.
+ * @wc: Work completion associated with the received message.
+ * @grh: References the received global route header.  This parameter is
+ *   ignored unless the work completion indicates that the GRH is valid.
+ * @ah_attr: Returned attributes that can be used when creating an address
+ *   handle for replying to the message.
+ */
+int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc,
+		       struct ib_grh *grh, struct ib_ah_attr *ah_attr);
+
+/**
+ * ib_create_ah_from_wc - Creates an address handle associated with the
+ *   sender of the specified work completion.
+ * @pd: The protection domain associated with the address handle.
+ * @wc: Work completion information associated with a received message.
+ * @grh: References the received global route header.  This parameter is
+ *   ignored unless the work completion indicates that the GRH is valid.
+ * @port_num: The outbound port number to associate with the address.
+ *
+ * The address handle is used to reference a local or global destination
+ * in all UD QP post sends.
+ */
+struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, struct ib_wc *wc,
+				   struct ib_grh *grh, u8 port_num);
+
+/**
+ * ib_modify_ah - Modifies the address vector associated with an address
+ *   handle.
+ * @ah: The address handle to modify.
+ * @ah_attr: The new address vector attributes to associate with the
+ *   address handle.
+ */
+int ib_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr);
+
+/**
+ * ib_query_ah - Queries the address vector associated with an address
+ *   handle.
+ * @ah: The address handle to query.
+ * @ah_attr: The address vector attributes associated with the address
+ *   handle.
+ */
+int ib_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr);
+
+/**
+ * ib_destroy_ah - Destroys an address handle.
+ * @ah: The address handle to destroy.
+ */
+int ib_destroy_ah(struct ib_ah *ah);
+
+/**
+ * ib_create_xrc_srq - Creates an XRC SRQ associated with the specified
+ *   protection domain, cq, and xrc domain.
+ * @pd: The protection domain associated with the SRQ.
+ * @xrc_cq: The cq to be associated with the XRC SRQ.
+ * @xrcd: The XRC domain to be associated with the XRC SRQ.
+ * @srq_init_attr: A list of initial attributes required to create the
+ *   XRC SRQ.  If XRC SRQ creation succeeds, then the attributes are updated
+ *   to the actual capabilities of the created XRC SRQ.
+ *
+ * srq_attr->max_wr and srq_attr->max_sge are read to determine the
+ * requested size of the XRC SRQ, and set to the actual values allocated
+ * on return.  If ib_create_xrc_srq() succeeds, then max_wr and max_sge
+ * will always be at least as large as the requested values.
+ */
+struct ib_srq *ib_create_xrc_srq(struct ib_pd *pd,
+				 struct ib_cq *xrc_cq,
+				 struct ib_xrcd *xrcd,
+				 struct ib_srq_init_attr *srq_init_attr);
+
+/**
+ * ib_create_srq - Creates an SRQ associated with the specified
+ *   protection domain.
+ * @pd: The protection domain associated with the SRQ.
+ * @srq_init_attr: A list of initial attributes required to create the
+ *   SRQ.  If SRQ creation succeeds, then the attributes are updated to
+ *   the actual capabilities of the created SRQ.
+ *
+ * srq_attr->max_wr and srq_attr->max_sge are read to determine the
+ * requested size of the SRQ, and set to the actual values allocated
+ * on return.  If ib_create_srq() succeeds, then max_wr and max_sge
+ * will always be at least as large as the requested values.
+ */
+struct ib_srq *ib_create_srq(struct ib_pd *pd,
+			     struct ib_srq_init_attr *srq_init_attr);
+
+/**
+ * ib_modify_srq - Modifies the attributes for the specified SRQ.
+ * @srq: The SRQ to modify.
+ * @srq_attr: On input, specifies the SRQ attributes to modify.  On output,
+ *   the current values of selected SRQ attributes are returned.
+ * @srq_attr_mask: A bit-mask used to specify which attributes of the SRQ
+ *   are being modified.
+ *
+ * The mask may contain IB_SRQ_MAX_WR to resize the SRQ and/or
+ * IB_SRQ_LIMIT to set the SRQ's limit and request notification when
+ * the number of receives queued drops below the limit.
+ */
+int ib_modify_srq(struct ib_srq *srq,
+		  struct ib_srq_attr *srq_attr,
+		  enum ib_srq_attr_mask srq_attr_mask);
+
+/**
+ * ib_query_srq - Returns the attribute list and current values for the
+ *   specified SRQ.
+ * @srq: The SRQ to query.
+ * @srq_attr: The attributes of the specified SRQ.
+ */
+int ib_query_srq(struct ib_srq *srq,
+		 struct ib_srq_attr *srq_attr);
+
+/**
+ * ib_destroy_srq - Destroys the specified SRQ.
+ * @srq: The SRQ to destroy.
+ */
+int ib_destroy_srq(struct ib_srq *srq);
+
+/**
+ * ib_post_srq_recv - Posts a list of work requests to the specified SRQ.
+ * @srq: The SRQ to post the work request on.
+ * @recv_wr: A list of work requests to post on the receive queue.
+ * @bad_recv_wr: On an immediate failure, this parameter will reference
+ *   the work request that failed to be posted on the QP.
+ */
+static inline int ib_post_srq_recv(struct ib_srq *srq,
+				   struct ib_recv_wr *recv_wr,
+				   struct ib_recv_wr **bad_recv_wr)
+{
+	return srq->device->post_srq_recv(srq, recv_wr, bad_recv_wr);	/* thin dispatch: result of the device's post_srq_recv method is returned unchanged */
+}
+
+/**
+ * ib_create_qp - Creates a QP associated with the specified protection
+ *   domain.
+ * @pd: The protection domain associated with the QP.
+ * @qp_init_attr: A list of initial attributes required to create the
+ *   QP.  If QP creation succeeds, then the attributes are updated to
+ *   the actual capabilities of the created QP.
+ */
+struct ib_qp *ib_create_qp(struct ib_pd *pd,
+			   struct ib_qp_init_attr *qp_init_attr);
+
+/**
+ * ib_modify_qp - Modifies the attributes for the specified QP and then
+ *   transitions the QP to the given state.
+ * @qp: The QP to modify.
+ * @qp_attr: On input, specifies the QP attributes to modify.  On output,
+ *   the current values of selected QP attributes are returned.
+ * @qp_attr_mask: A bit-mask used to specify which attributes of the QP
+ *   are being modified.
+ */
+int ib_modify_qp(struct ib_qp *qp,
+		 struct ib_qp_attr *qp_attr,
+		 int qp_attr_mask);
+
+/**
+ * ib_query_qp - Returns the attribute list and current values for the
+ *   specified QP.
+ * @qp: The QP to query.
+ * @qp_attr: The attributes of the specified QP.
+ * @qp_attr_mask: A bit-mask used to select specific attributes to query.
+ * @qp_init_attr: Additional attributes of the selected QP.
+ *
+ * The qp_attr_mask may be used to limit the query to gathering only the
+ * selected attributes.
+ */
+int ib_query_qp(struct ib_qp *qp,
+		struct ib_qp_attr *qp_attr,
+		int qp_attr_mask,
+		struct ib_qp_init_attr *qp_init_attr);
+
+/**
+ * ib_destroy_qp - Destroys the specified QP.
+ * @qp: The QP to destroy.
+ */
+int ib_destroy_qp(struct ib_qp *qp);
+
+/**
+ * ib_post_send - Posts a list of work requests to the send queue of
+ *   the specified QP.
+ * @qp: The QP to post the work request on.
+ * @send_wr: A list of work requests to post on the send queue.
+ * @bad_send_wr: On an immediate failure, this parameter will reference
+ *   the work request that failed to be posted on the QP.
+ */
+static inline int ib_post_send(struct ib_qp *qp,
+			       struct ib_send_wr *send_wr,
+			       struct ib_send_wr **bad_send_wr)
+{
+	return qp->device->post_send(qp, send_wr, bad_send_wr);	/* thin dispatch: result of the device's post_send method is returned unchanged */
+}
+
+/**
+ * ib_post_recv - Posts a list of work requests to the receive queue of
+ *   the specified QP.
+ * @qp: The QP to post the work request on.
+ * @recv_wr: A list of work requests to post on the receive queue.
+ * @bad_recv_wr: On an immediate failure, this parameter will reference
+ *   the work request that failed to be posted on the QP.
+ */
+static inline int ib_post_recv(struct ib_qp *qp,
+			       struct ib_recv_wr *recv_wr,
+			       struct ib_recv_wr **bad_recv_wr)
+{
+	return qp->device->post_recv(qp, recv_wr, bad_recv_wr);	/* thin dispatch: result of the device's post_recv method is returned unchanged */
+}
+
+/*
+ * IB_CQ_VECTOR_LEAST_ATTACHED: The constant specifies that
+ *	the CQ will be attached to the completion vector that has
+ *	the least number of CQs already attached to it.
+ */
+#define IB_CQ_VECTOR_LEAST_ATTACHED	0xffffffff	/* NOTE(review): unsigned literal, while the comp_vector parameters are int -- confirm the intended conversion */
+
+/**
+ * ib_create_cq - Creates a CQ on the specified device.
+ * @device: The device on which to create the CQ.
+ * @comp_handler: A user-specified callback that is invoked when a
+ *   completion event occurs on the CQ.
+ * @event_handler: A user-specified callback that is invoked when an
+ *   asynchronous event not associated with a completion occurs on the CQ.
+ * @cq_context: Context associated with the CQ returned to the user via
+ *   the associated completion and event handlers.
+ * @cqe: The minimum size of the CQ.
+ * @comp_vector: Completion vector used to signal completion events.
+ *     Must be >= 0 and < device->num_comp_vectors
+ *     or IB_CQ_VECTOR_LEAST_ATTACHED.
+ *
+ * Users can examine the cq structure to determine the actual CQ size.
+ */
+struct ib_cq *ib_create_cq(struct ib_device *device,
+			   ib_comp_handler comp_handler,
+			   void (*event_handler)(struct ib_event *, void *),
+			   void *cq_context, int cqe, int comp_vector);
+
+/**
+ * ib_resize_cq - Modifies the capacity of the CQ.
+ * @cq: The CQ to resize.
+ * @cqe: The minimum size of the CQ.
+ *
+ * Users can examine the cq structure to determine the actual CQ size.
+ */
+int ib_resize_cq(struct ib_cq *cq, int cqe);
+
+/**
+ * ib_modify_cq - Modifies moderation params of the CQ
+ * @cq: The CQ to modify.
+ * @cq_count: number of CQEs that will trigger an event
+ * @cq_period: max period of time in usec before triggering an event
+ *
+ */
+int ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period);
+
+/**
+ * ib_destroy_cq - Destroys the specified CQ.
+ * @cq: The CQ to destroy.
+ */
+int ib_destroy_cq(struct ib_cq *cq);
+
+/**
+ * ib_poll_cq - poll a CQ for completion(s)
+ * @cq: the CQ being polled
+ * @num_entries: maximum number of completions to return
+ * @wc: array of at least @num_entries &struct ib_wc where completions
+ *   will be returned
+ *
+ * Poll a CQ for (possibly multiple) completions.  If the return value
+ * is < 0, an error occurred.  If the return value is >= 0, it is the
+ * number of completions returned.  If the return value is
+ * non-negative and < num_entries, then the CQ was emptied.
+ */
+static inline int ib_poll_cq(struct ib_cq *cq, int num_entries,
+			     struct ib_wc *wc)
+{
+	return cq->device->poll_cq(cq, num_entries, wc);
+}
+
+/**
+ * ib_peek_cq - Returns the number of unreaped completions currently
+ *   on the specified CQ.
+ * @cq: The CQ to peek.
+ * @wc_cnt: A minimum number of unreaped completions to check for.
+ *
+ * If the number of unreaped completions is greater than or equal to wc_cnt,
+ * this function returns wc_cnt, otherwise, it returns the actual number of
+ * unreaped completions.
+ */
+int ib_peek_cq(struct ib_cq *cq, int wc_cnt);
+
+/**
+ * ib_req_notify_cq - Request completion notification on a CQ.
+ * @cq: The CQ to generate an event for.
+ * @flags:
+ *   Must contain exactly one of %IB_CQ_SOLICITED or %IB_CQ_NEXT_COMP
+ *   to request an event on the next solicited event or next work
+ *   completion of any type, respectively. %IB_CQ_REPORT_MISSED_EVENTS
+ *   may also be |ed in to request a hint about missed events, as
+ *   described below.
+ *
+ * Return Value:
+ *    < 0 means an error occurred while requesting notification
+ *   == 0 means notification was requested successfully, and if
+ *        IB_CQ_REPORT_MISSED_EVENTS was passed in, then no events
+ *        were missed and it is safe to wait for another event.  In
+ *        this case it is guaranteed that any work completions added
+ *        to the CQ since the last CQ poll will trigger a completion
+ *        notification event.
+ *    > 0 is only returned if IB_CQ_REPORT_MISSED_EVENTS was passed
+ *        in.  It means that the consumer must poll the CQ again to
+ *        make sure it is empty to avoid missing an event because of a
+ *        race between requesting notification and an entry being
+ *        added to the CQ.  This return value means it is possible
+ *        (but not guaranteed) that a work completion has been added
+ *        to the CQ since the last poll without triggering a
+ *        completion notification event.
+ */
+static inline int
+ib_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags)
+{
+	return cq->device->req_notify_cq(cq, flags);
+}
+
+/**
+ * ib_req_ncomp_notif - Ask for a completion event once enough unreaped
+ *   completions have accumulated on the CQ.
+ * @cq: The CQ to generate an event for.
+ * @wc_cnt: Number of unreaped completions that must be on the CQ
+ *   before an event is generated.
+ */
+static inline int ib_req_ncomp_notif(struct ib_cq *cq, int wc_cnt)
+{
+	if (!cq->device->req_ncomp_notif)
+		return -ENOSYS;	/* device has no req_ncomp_notif method */
+	return cq->device->req_ncomp_notif(cq, wc_cnt);
+}
+
+/**
+ * ib_get_dma_mr - Returns a memory region for system memory that is
+ *   usable for DMA.
+ * @pd: The protection domain associated with the memory region.
+ * @mr_access_flags: Specifies the memory access rights.
+ *
+ * Note that the ib_dma_*() functions defined below must be used
+ * to create/destroy addresses used with the Lkey or Rkey returned
+ * by ib_get_dma_mr().
+ */
+struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags);
+
+/**
+ * ib_dma_mapping_error - check a DMA addr for error
+ * @dev: The device for which the dma_addr was created
+ * @dma_addr: The DMA address to check
+ */
+static inline int ib_dma_mapping_error(struct ib_device *dev, u64 dma_addr)
+{
+	if (!dev->dma_ops)
+		return dma_mapping_error(dev->dma_device, dma_addr);
+	return dev->dma_ops->mapping_error(dev, dma_addr);
+}
+
+/**
+ * ib_dma_map_single - Map a kernel virtual address to DMA address
+ * @dev: The device for which the dma_addr is to be created
+ * @cpu_addr: The kernel virtual address
+ * @size: The size of the region in bytes
+ * @direction: The direction of the DMA
+ */
+static inline u64
+ib_dma_map_single(struct ib_device *dev, void *cpu_addr, size_t size,
+		  enum dma_data_direction direction)
+{
+	if (!dev->dma_ops)
+		return dma_map_single(dev->dma_device, cpu_addr, size, direction);
+	return dev->dma_ops->map_single(dev, cpu_addr, size, direction);
+}
+
+/**
+ * ib_dma_unmap_single - Destroy a mapping created by ib_dma_map_single()
+ * @dev: The device for which the DMA address was created
+ * @addr: The DMA address
+ * @size: The size of the region in bytes
+ * @direction: The direction of the DMA
+ */
+static inline void ib_dma_unmap_single(struct ib_device *dev, u64 addr,
+		size_t size, enum dma_data_direction direction)
+{
+	if (dev->dma_ops) {
+		dev->dma_ops->unmap_single(dev, addr, size, direction);
+		return;
+	}
+	dma_unmap_single(dev->dma_device, addr, size, direction);
+}
+
+/* Map @cpu_addr via dma_map_single_attrs(); dev->dma_ops is not consulted. */
+static inline u64
+ib_dma_map_single_attrs(struct ib_device *dev, void *cpu_addr, size_t size,
+		enum dma_data_direction direction, struct dma_attrs *attrs)
+{
+	return dma_map_single_attrs(dev->dma_device, cpu_addr, size,
+				    direction, attrs);
+}
+
+/* Forward to dma_unmap_single_attrs() on dev->dma_device; dma_ops is bypassed. */
+static inline void ib_dma_unmap_single_attrs(struct ib_device *dev,
+					     u64 addr, size_t size,
+					     enum dma_data_direction direction,
+					     struct dma_attrs *attrs)
+{
+	dma_unmap_single_attrs(dev->dma_device, addr, size, direction, attrs);
+}
+
+/**
+ * ib_dma_map_page - Map a physical page to DMA address
+ * @dev: The device for which the dma_addr is to be created
+ * @page: The page to be mapped
+ * @offset: The offset within the page
+ * @size: The size of the region in bytes
+ * @direction: The direction of the DMA
+ */
+static inline u64 ib_dma_map_page(struct ib_device *dev, struct page *page,
+				  unsigned long offset, size_t size,
+				  enum dma_data_direction direction)
+{
+	/* No device-specific DMA ops: use the generic mapping helper. */
+	if (!dev->dma_ops)
+		return dma_map_page(dev->dma_device, page, offset, size, direction);
+
+	return dev->dma_ops->map_page(dev, page, offset, size, direction);
+}
+
+/**
+ * ib_dma_unmap_page - Destroy a mapping created by ib_dma_map_page()
+ * @dev: The device for which the DMA address was created
+ * @addr: The DMA address
+ * @size: The size of the region in bytes
+ * @direction: The direction of the DMA
+ */
+static inline void ib_dma_unmap_page(struct ib_device *dev, u64 addr,
+		size_t size, enum dma_data_direction direction)
+{
+	if (dev->dma_ops) {
+		dev->dma_ops->unmap_page(dev, addr, size, direction);
+		return;
+	}
+	dma_unmap_page(dev->dma_device, addr, size, direction);
+}
+
+/**
+ * ib_dma_map_sg - Map a scatter/gather list to DMA addresses
+ * @dev: The device for which the DMA addresses are to be created
+ * @sg: The array of scatter/gather entries
+ * @nents: The number of scatter/gather entries
+ * @direction: The direction of the DMA
+ */
+static inline int
+ib_dma_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents,
+	      enum dma_data_direction direction)
+{
+	return dev->dma_ops ?
+		dev->dma_ops->map_sg(dev, sg, nents, direction) :
+		dma_map_sg(dev->dma_device, sg, nents, direction);
+}
+
+/**
+ * ib_dma_unmap_sg - Unmap a scatter/gather list of DMA addresses
+ * @dev: The device for which the DMA addresses were created
+ * @sg: The array of scatter/gather entries
+ * @nents: The number of scatter/gather entries
+ * @direction: The direction of the DMA
+ */
+static inline void
+ib_dma_unmap_sg(struct ib_device *dev, struct scatterlist *sg, int nents,
+		enum dma_data_direction direction)
+{
+	if (!dev->dma_ops)
+		dma_unmap_sg(dev->dma_device, sg, nents, direction);
+	else
+		dev->dma_ops->unmap_sg(dev, sg, nents, direction);
+}
+
+/* Map @sg via dma_map_sg_attrs(); dev->dma_ops is not consulted here. */
+static inline int
+ib_dma_map_sg_attrs(struct ib_device *dev, struct scatterlist *sg, int nents,
+		    enum dma_data_direction direction, struct dma_attrs *attrs)
+{
+	return dma_map_sg_attrs(dev->dma_device, sg, nents, direction, attrs);
+}
+
+/* Unmap @sg via dma_unmap_sg_attrs(); dev->dma_ops is not consulted here. */
+static inline void
+ib_dma_unmap_sg_attrs(struct ib_device *dev, struct scatterlist *sg, int nents,
+		      enum dma_data_direction direction, struct dma_attrs *attrs)
+{
+	dma_unmap_sg_attrs(dev->dma_device, sg, nents, direction, attrs);
+}
+/**
+ * ib_sg_dma_address - Return the DMA address from a scatter/gather entry
+ * @dev: The device for which the DMA addresses were created
+ * @sg: The scatter/gather entry
+ */
+static inline u64
+ib_sg_dma_address(struct ib_device *dev, struct scatterlist *sg)
+{
+	if (!dev->dma_ops)
+		return sg_dma_address(sg);
+	return dev->dma_ops->dma_address(dev, sg);
+}
+
+/**
+ * ib_sg_dma_len - Return the DMA length from a scatter/gather entry
+ * @dev: The device for which the DMA addresses were created
+ * @sg: The scatter/gather entry
+ */
+static inline unsigned int
+ib_sg_dma_len(struct ib_device *dev, struct scatterlist *sg)
+{
+	if (!dev->dma_ops)
+		return sg_dma_len(sg);
+	return dev->dma_ops->dma_len(dev, sg);
+}
+
+/**
+ * ib_dma_sync_single_for_cpu - Prepare DMA region to be accessed by CPU
+ * @dev: The device for which the DMA address was created
+ * @addr: The DMA address
+ * @size: The size of the region in bytes
+ * @dir: The direction of the DMA
+ */
+static inline void ib_dma_sync_single_for_cpu(struct ib_device *dev,
+					      u64 addr, size_t size,
+					      enum dma_data_direction dir)
+{
+	if (dev->dma_ops) {
+		dev->dma_ops->sync_single_for_cpu(dev, addr, size, dir);
+		return;
+	}
+	dma_sync_single_for_cpu(dev->dma_device, addr, size, dir);
+}
+
+/**
+ * ib_dma_sync_single_for_device - Prepare DMA region to be accessed by device
+ * @dev: The device for which the DMA address was created
+ * @addr: The DMA address
+ * @size: The size of the region in bytes
+ * @dir: The direction of the DMA
+ */
+static inline void ib_dma_sync_single_for_device(struct ib_device *dev,
+						 u64 addr, size_t size,
+						 enum dma_data_direction dir)
+{
+	if (dev->dma_ops) {
+		dev->dma_ops->sync_single_for_device(dev, addr, size, dir);
+		return;
+	}
+	dma_sync_single_for_device(dev->dma_device, addr, size, dir);
+}
+
+/**
+ * ib_dma_alloc_coherent - Allocate memory and map it for DMA
+ * @dev: The device for which the DMA address is requested
+ * @size: The size of the region to allocate in bytes
+ * @dma_handle: A pointer for returning the DMA address of the region
+ * @flag: memory allocator flags
+ */
+static inline void *ib_dma_alloc_coherent(struct ib_device *dev,
+					   size_t size,
+					   u64 *dma_handle,
+					   gfp_t flag)
+{
+	/* Zero-init: a failed allocation may leave the handle unwritten. */
+	dma_addr_t handle = 0;
+	void *ret;
+
+	if (dev->dma_ops)
+		return dev->dma_ops->alloc_coherent(dev, size, dma_handle, flag);
+	/* Go through a dma_addr_t temporary because @dma_handle is a u64 *. */
+	ret = dma_alloc_coherent(dev->dma_device, size, &handle, flag);
+	*dma_handle = handle;
+	return ret;
+}
+
+/**
+ * ib_dma_free_coherent - Free memory allocated by ib_dma_alloc_coherent()
+ * @dev: The device for which the DMA addresses were allocated
+ * @size: The size of the region
+ * @cpu_addr: the address returned by ib_dma_alloc_coherent()
+ * @dma_handle: the DMA address returned by ib_dma_alloc_coherent()
+ */
+static inline void ib_dma_free_coherent(struct ib_device *dev, size_t size,
+					void *cpu_addr, u64 dma_handle)
+{
+	if (dev->dma_ops) {
+		dev->dma_ops->free_coherent(dev, size, cpu_addr, dma_handle);
+		return;
+	}
+	dma_free_coherent(dev->dma_device, size, cpu_addr, dma_handle);
+}
+
+/**
+ * ib_reg_phys_mr - Prepares a virtually addressed memory region for use
+ *   by an HCA.
+ * @pd: The protection domain assigned to the registered region.
+ * @phys_buf_array: Specifies a list of physical buffers to use in the
+ *   memory region.
+ * @num_phys_buf: Specifies the size of the phys_buf_array.
+ * @mr_access_flags: Specifies the memory access rights.
+ * @iova_start: The offset of the region's starting I/O virtual address.
+ */
+struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd,
+			     struct ib_phys_buf *phys_buf_array,
+			     int num_phys_buf,
+			     int mr_access_flags,
+			     u64 *iova_start);
+
+/**
+ * ib_rereg_phys_mr - Modifies the attributes of an existing memory region.
+ *   Conceptually, this call performs the functions deregister memory region
+ *   followed by register physical memory region.  Where possible,
+ *   resources are reused instead of deallocated and reallocated.
+ * @mr: The memory region to modify.
+ * @mr_rereg_mask: A bit-mask used to indicate which of the following
+ *   properties of the memory region are being modified.
+ * @pd: If %IB_MR_REREG_PD is set in mr_rereg_mask, this field specifies
+ *   the new protection domain to associate with the memory region,
+ *   otherwise, this parameter is ignored.
+ * @phys_buf_array: If %IB_MR_REREG_TRANS is set in mr_rereg_mask, this
+ *   field specifies a list of physical buffers to use in the new
+ *   translation, otherwise, this parameter is ignored.
+ * @num_phys_buf: If %IB_MR_REREG_TRANS is set in mr_rereg_mask, this
+ *   field specifies the size of the phys_buf_array, otherwise, this
+ *   parameter is ignored.
+ * @mr_access_flags: If %IB_MR_REREG_ACCESS is set in mr_rereg_mask, this
+ *   field specifies the new memory access rights, otherwise, this
+ *   parameter is ignored.
+ * @iova_start: The offset of the region's starting I/O virtual address.
+ */
+int ib_rereg_phys_mr(struct ib_mr *mr,
+		     int mr_rereg_mask,
+		     struct ib_pd *pd,
+		     struct ib_phys_buf *phys_buf_array,
+		     int num_phys_buf,
+		     int mr_access_flags,
+		     u64 *iova_start);
+
+/**
+ * ib_query_mr - Retrieves information about a specific memory region.
+ * @mr: The memory region to retrieve information about.
+ * @mr_attr: The attributes of the specified memory region.
+ */
+int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr);
+
+/**
+ * ib_dereg_mr - Deregisters a memory region and removes it from the
+ *   HCA translation table.
+ * @mr: The memory region to deregister.
+ */
+int ib_dereg_mr(struct ib_mr *mr);
+
+/**
+ * ib_alloc_fast_reg_mr - Allocates memory region usable with the
+ *   IB_WR_FAST_REG_MR send work request.
+ * @pd: The protection domain associated with the region.
+ * @max_page_list_len: requested max physical buffer list length to be
+ *   used with fast register work requests for this MR.
+ */
+struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len);
+
+/**
+ * ib_alloc_fast_reg_page_list - Allocates a page list array
+ * @device: ib device pointer.
+ * @page_list_len: size of the page list array to be allocated.
+ *
+ * This allocates and returns a struct ib_fast_reg_page_list * and a
+ * page_list array that is at least page_list_len in size.  The actual
+ * size is returned in max_page_list_len.  The caller is responsible
+ * for initializing the contents of the page_list array before posting
+ * a send work request with the IB_WR_FAST_REG_MR opcode.
+ *
+ * The page_list array entries must be translated using one of the
+ * ib_dma_*() functions just like the addresses passed to
+ * ib_map_phys_fmr().  Once the ib_post_send() is issued, the struct
+ * ib_fast_reg_page_list must not be modified by the caller until the
+ * IB_WC_FAST_REG_MR work request completes.
+ */
+struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list(
+				struct ib_device *device, int page_list_len);
+
+/**
+ * ib_free_fast_reg_page_list - Deallocates a previously allocated
+ *   page list array.
+ * @page_list: struct ib_fast_reg_page_list pointer to be deallocated.
+ */
+void ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list);
+
+/**
+ * ib_update_fast_reg_key - Replace the low byte of a fast_reg MR's
+ *   L_Key and R_Key.
+ * @mr: struct ib_mr pointer to be updated.
+ * @newkey: New 8-bit key; bits 8..31 of each key are preserved.
+ */
+static inline void ib_update_fast_reg_key(struct ib_mr *mr, u8 newkey)
+{
+	mr->rkey = newkey | (mr->rkey & 0xffffff00);
+	mr->lkey = newkey | (mr->lkey & 0xffffff00);
+}
+
+/**
+ * ib_alloc_mw - Allocates a memory window.
+ * @pd: The protection domain associated with the memory window.
+ */
+struct ib_mw *ib_alloc_mw(struct ib_pd *pd);
+
+/**
+ * ib_bind_mw - Posts a work request to the send queue of the specified
+ *   QP, which binds the memory window to the given address range and
+ *   remote access attributes.
+ * @qp: QP to post the bind work request on.
+ * @mw: The memory window to bind.
+ * @mw_bind: Specifies information about the memory window, including
+ *   its address range, remote access rights, and associated memory region.
+ */
+static inline int ib_bind_mw(struct ib_qp *qp,
+			     struct ib_mw *mw,
+			     struct ib_mw_bind *mw_bind)
+{
+	/* XXX reference counting in corresponding MR? */
+	if (!mw->device->bind_mw)
+		return -ENOSYS;
+	return mw->device->bind_mw(qp, mw, mw_bind);
+}
+
+/**
+ * ib_dealloc_mw - Deallocates a memory window.
+ * @mw: The memory window to deallocate.
+ */
+int ib_dealloc_mw(struct ib_mw *mw);
+
+/**
+ * ib_alloc_fmr - Allocates an unmapped fast memory region.
+ * @pd: The protection domain associated with the unmapped region.
+ * @mr_access_flags: Specifies the memory access rights.
+ * @fmr_attr: Attributes of the unmapped region.
+ *
+ * A fast memory region must be mapped before it can be used as part of
+ * a work request.
+ */
+struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd,
+			    int mr_access_flags,
+			    struct ib_fmr_attr *fmr_attr);
+
+/**
+ * ib_map_phys_fmr - Maps a list of physical pages to a fast memory region.
+ * @fmr: The fast memory region to associate with the pages.
+ * @page_list: An array of physical pages to map to the fast memory region.
+ * @list_len: The number of pages in page_list.
+ * @iova: The I/O virtual address to use with the mapped region.
+ */
+static inline int
+ib_map_phys_fmr(struct ib_fmr *fmr, u64 *page_list, int list_len, u64 iova)
+{
+	/* Dispatch to the map_phys_fmr method of the FMR's device. */
+	return fmr->device->map_phys_fmr(fmr, page_list, list_len, iova);
+}
+
+/**
+ * ib_unmap_fmr - Removes the mapping from a list of fast memory regions.
+ * @fmr_list: A linked list of fast memory regions to unmap.
+ */
+int ib_unmap_fmr(struct list_head *fmr_list);
+
+/**
+ * ib_dealloc_fmr - Deallocates a fast memory region.
+ * @fmr: The fast memory region to deallocate.
+ */
+int ib_dealloc_fmr(struct ib_fmr *fmr);
+
+/**
+ * ib_attach_mcast - Attaches the specified QP to a multicast group.
+ * @qp: QP to attach to the multicast group.  The QP must be type
+ *   IB_QPT_UD.
+ * @gid: Multicast group GID.
+ * @lid: Multicast group LID in host byte order.
+ *
+ * In order to send and receive multicast packets, subnet
+ * administration must have created the multicast group and configured
+ * the fabric appropriately.  The port associated with the specified
+ * QP must also be a member of the multicast group.
+ */
+int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);
+
+/**
+ * ib_detach_mcast - Detaches the specified QP from a multicast group.
+ * @qp: QP to detach from the multicast group.
+ * @gid: Multicast group GID.
+ * @lid: Multicast group LID in host byte order.
+ */
+int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);
+
+
+/**
+ * ib_dealloc_xrcd - Deallocates an extended reliably connected domain.
+ * @xrcd: The xrc domain to deallocate.
+ */
+int ib_dealloc_xrcd(struct ib_xrcd *xrcd);
+
+/**
+ * ib_alloc_xrcd - Allocates an extended reliably connected domain.
+ * @device: The device on which to allocate the xrcd.
+ */
+struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device);
+
+#endif /* IB_VERBS_H */
diff --git a/sys/ofed/include/rdma/iw_cm.h b/sys/ofed/include/rdma/iw_cm.h
new file mode 100644
index 0000000..cbb822e
--- /dev/null
+++ b/sys/ofed/include/rdma/iw_cm.h
@@ -0,0 +1,258 @@
+/*
+ * Copyright (c) 2005 Network Appliance, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef IW_CM_H
+#define IW_CM_H
+
+#include <linux/in.h>
+#include <rdma/ib_cm.h>
+
+struct iw_cm_id;
+
+enum iw_cm_event_type {
+	IW_CM_EVENT_CONNECT_REQUEST = 1, /* connect request received; numbering starts at 1 */
+	IW_CM_EVENT_CONNECT_REPLY,	 /* reply from active connect request */
+	IW_CM_EVENT_ESTABLISHED,	 /* passive side accept successful */
+	IW_CM_EVENT_DISCONNECT,		 /* orderly shutdown */
+	IW_CM_EVENT_CLOSE		 /* close complete */
+};
+
+enum iw_cm_event_status {
+	IW_CM_EVENT_STATUS_OK = 0,	 /* request successful */
+	IW_CM_EVENT_STATUS_ACCEPTED = 0, /* connect request accepted; same value as _OK */
+	IW_CM_EVENT_STATUS_REJECTED,	 /* connect request rejected */
+	IW_CM_EVENT_STATUS_TIMEOUT,	 /* the operation timed out */
+	IW_CM_EVENT_STATUS_RESET,	 /* reset from remote peer */
+	IW_CM_EVENT_STATUS_EINVAL,	 /* asynchronous failure for bad parameter */
+};
+
+struct iw_cm_event {
+	enum iw_cm_event_type event;	/* which event is being reported */
+	enum iw_cm_event_status status;	/* status code accompanying the event */
+	struct sockaddr_in local_addr;	/* IPv4 socket address, local side */
+	struct sockaddr_in remote_addr;	/* IPv4 socket address, remote side */
+	void *private_data;		/* presumably peer-supplied payload; confirm */
+	u8 private_data_len;		/* presumably byte length of private_data; u8 caps it at 255 */
+	void *provider_data;		/* opaque pointer; presumably owned by the provider */
+};
+
+/**
+ * iw_cm_handler - Function to be called by the IW CM when delivering events
+ * to the client.
+ *
+ * @cm_id: The IW CM identifier associated with the event.
+ * @event: Pointer to the event structure.
+ */
+typedef int (*iw_cm_handler)(struct iw_cm_id *cm_id,
+			     struct iw_cm_event *event);
+
+/**
+ * iw_event_handler - Function called by the provider when delivering provider
+ * events to the IW CM.  Returns either 0 indicating the event was processed
+ * or -errno if the event could not be processed.
+ *
+ * @cm_id: The IW CM identifier associated with the event.
+ * @event: Pointer to the event structure.
+ */
+typedef int (*iw_event_handler)(struct iw_cm_id *cm_id,
+				 struct iw_cm_event *event);
+
+struct iw_cm_id {
+	iw_cm_handler		cm_handler;      /* client callback function */
+	void		        *context;	 /* client cb context */
+	struct ib_device	*device;	 /* presumably the device given to iw_create_cm_id() */
+	struct sockaddr_in      local_addr;	 /* IPv4 socket address, local side */
+	struct sockaddr_in	remote_addr;	 /* IPv4 socket address, remote side */
+	void			*provider_data;	 /* provider private data */
+	iw_event_handler        event_handler;   /* cb for provider
+						    events */
+	/* Used by provider to add and remove refs on IW cm_id */
+	void (*add_ref)(struct iw_cm_id *);
+	void (*rem_ref)(struct iw_cm_id *);
+};
+
+struct iw_cm_conn_param {
+	const void *private_data;	/* private data buffer; pointee is const */
+	u16 private_data_len;		/* presumably byte length of private_data */
+	u32 ord;			/* NOTE(review): likely outbound RDMA read depth; confirm */
+	u32 ird;			/* NOTE(review): likely inbound RDMA read depth; confirm */
+	u32 qpn;			/* queue pair number; cf. iw_cm_get_qp() */
+};
+
+struct iw_cm_verbs {	/* table of function pointers operating on QPs and IW CM ids */
+	void		(*add_ref)(struct ib_qp *qp);	/* presumably takes a reference on qp */
+
+	void		(*rem_ref)(struct ib_qp *qp);	/* presumably drops a reference on qp */
+
+	struct ib_qp *	(*get_qp)(struct ib_device *device,	/* QP lookup by number; cf. iw_cm_get_qp() */
+				  int qpn);
+
+	int		(*connect)(struct iw_cm_id *cm_id,	/* presumably backs iw_cm_connect() */
+				   struct iw_cm_conn_param *conn_param);
+
+	int		(*accept)(struct iw_cm_id *cm_id,	/* presumably backs iw_cm_accept() */
+				  struct iw_cm_conn_param *conn_param);
+
+	int		(*reject)(struct iw_cm_id *cm_id,	/* presumably backs iw_cm_reject() */
+				  const void *pdata, u8 pdata_len);
+
+	int		(*create_listen)(struct iw_cm_id *cm_id,	/* presumably backs iw_cm_listen() */
+					 int backlog);
+
+	int		(*destroy_listen)(struct iw_cm_id *cm_id);	/* presumably ends a create_listen(); confirm */
+};
+
+/**
+ * iw_create_cm_id - Create an IW CM identifier.
+ *
+ * @device: The IB device on which to create the IW CM identifier.
+ * @cm_handler: User callback invoked to report events associated with the
+ *   returned IW CM identifier.
+ * @context: User specified context associated with the id.
+ */
+struct iw_cm_id *iw_create_cm_id(struct ib_device *device,
+				 iw_cm_handler cm_handler, void *context);
+
+/**
+ * iw_destroy_cm_id - Destroy an IW CM identifier.
+ *
+ * @cm_id: The previously created IW CM identifier to destroy.
+ *
+ * The client can assume that no events will be delivered for the CM ID after
+ * this function returns.
+ */
+void iw_destroy_cm_id(struct iw_cm_id *cm_id);
+
+/**
+ * iw_cm_unbind_qp - Unbind the specified IW CM identifier and QP
+ *
+ * @cm_id: The IW CM identifier to unbind from the QP.
+ * @qp: The QP
+ *
+ * This is called by the provider when destroying the QP to ensure
+ * that any references held by the IWCM are released. It may also
+ * be called by the IWCM when destroying a CM_ID to ensure that any
+ * references held by the provider are released.
+ */
+void iw_cm_unbind_qp(struct iw_cm_id *cm_id, struct ib_qp *qp);
+
+/**
+ * iw_cm_get_qp - Return the ib_qp associated with a QPN
+ *
+ * @device: The IB device
+ * @qpn: The queue pair number
+ */
+struct ib_qp *iw_cm_get_qp(struct ib_device *device, int qpn);
+
+/**
+ * iw_cm_listen - Listen for incoming connection requests on the
+ * specified IW CM id.
+ *
+ * @cm_id: The IW CM identifier.
+ * @backlog: The maximum number of outstanding un-accepted inbound listen
+ *   requests to queue.
+ *
+ * The source address and port number are specified in the IW CM identifier
+ * structure.
+ */
+int iw_cm_listen(struct iw_cm_id *cm_id, int backlog);
+
+/**
+ * iw_cm_accept - Called to accept an incoming connect request.
+ *
+ * @cm_id: The IW CM identifier associated with the connection request.
+ * @iw_param: Pointer to a structure containing connection establishment
+ *   parameters.
+ *
+ * The specified cm_id will have been provided in the event data for a
+ * CONNECT_REQUEST event. Subsequent events related to this connection will be
+ * delivered to the specified IW CM identifier and may occur prior to
+ * the return of this function. If this function returns a non-zero value, the
+ * client can assume that no events will be delivered to the specified IW CM
+ * identifier.
+ */
+int iw_cm_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param);
+
+/**
+ * iw_cm_reject - Reject an incoming connection request.
+ *
+ * @cm_id: Connection identifier associated with the request.
+ * @private_data: Pointer to data to deliver to the remote peer as part of the
+ *   reject message.
+ * @private_data_len: The number of bytes in the private_data parameter.
+ *
+ * The client can assume that no events will be delivered to the specified IW
+ * CM identifier following the return of this function. The private_data
+ * buffer is available for reuse when this function returns.
+ */
+int iw_cm_reject(struct iw_cm_id *cm_id, const void *private_data,
+		 u8 private_data_len);
+
+/**
+ * iw_cm_connect - Called to request a connection to a remote peer.
+ *
+ * @cm_id: The IW CM identifier for the connection.
+ * @iw_param: Pointer to a structure containing connection establishment
+ *   parameters.
+ *
+ * Events may be delivered to the specified IW CM identifier prior to the
+ * return of this function. If this function returns a non-zero value, the
+ * client can assume that no events will be delivered to the specified IW CM
+ * identifier.
+ */
+int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param);
+
+/**
+ * iw_cm_disconnect - Close the specified connection.
+ *
+ * @cm_id: The IW CM identifier to close.
+ * @abrupt: If 0, the connection will be closed gracefully, otherwise, the
+ *   connection will be reset.
+ *
+ * The IW CM identifier is still active until the IW_CM_EVENT_CLOSE event is
+ * delivered.
+ */
+int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt);
+
+/**
+ * iw_cm_init_qp_attr - Called to initialize the attributes of the QP
+ * associated with an IW CM identifier.
+ *
+ * @cm_id: The IW CM identifier associated with the QP
+ * @qp_attr: Pointer to the QP attributes structure.
+ * @qp_attr_mask: Pointer to a bit vector specifying which QP attributes are
+ *   valid.
+ */
+int iw_cm_init_qp_attr(struct iw_cm_id *cm_id, struct ib_qp_attr *qp_attr,
+		       int *qp_attr_mask);
+
+#endif /* IW_CM_H */
diff --git a/sys/ofed/include/rdma/rdma_cm.h b/sys/ofed/include/rdma/rdma_cm.h
new file mode 100644
index 0000000..c6b2962
--- /dev/null
+++ b/sys/ofed/include/rdma/rdma_cm.h
@@ -0,0 +1,333 @@
+/*
+ * Copyright (c) 2005 Voltaire Inc.  All rights reserved.
+ * Copyright (c) 2005 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(RDMA_CM_H)
+#define RDMA_CM_H
+
+#include <linux/socket.h>
+#include <linux/in6.h>
+#include <rdma/ib_addr.h>
+#include <rdma/ib_sa.h>
+
+/*
+ * Upon receiving a device removal event, users must destroy the associated
+ * RDMA identifier and release all resources allocated with the device.
+ */
+enum rdma_cm_event_type {
+	RDMA_CM_EVENT_ADDR_RESOLVED,
+	RDMA_CM_EVENT_ADDR_ERROR,
+	RDMA_CM_EVENT_ROUTE_RESOLVED,
+	RDMA_CM_EVENT_ROUTE_ERROR,
+	RDMA_CM_EVENT_CONNECT_REQUEST,
+	RDMA_CM_EVENT_CONNECT_RESPONSE,
+	RDMA_CM_EVENT_CONNECT_ERROR,
+	RDMA_CM_EVENT_UNREACHABLE,
+	RDMA_CM_EVENT_REJECTED,
+	RDMA_CM_EVENT_ESTABLISHED,
+	RDMA_CM_EVENT_DISCONNECTED,
+	RDMA_CM_EVENT_DEVICE_REMOVAL,
+	RDMA_CM_EVENT_MULTICAST_JOIN,
+	RDMA_CM_EVENT_MULTICAST_ERROR,
+	RDMA_CM_EVENT_ADDR_CHANGE,
+	RDMA_CM_EVENT_TIMEWAIT_EXIT
+};
+
+enum rdma_port_space {
+	RDMA_PS_SDP   = 0x0001,
+	RDMA_PS_IPOIB = 0x0002,
+	RDMA_PS_TCP   = 0x0106,
+	RDMA_PS_UDP   = 0x0111,
+	RDMA_PS_SCTP  = 0x0183
+};
+
+struct rdma_addr {
+	struct sockaddr_storage src_addr;
+	struct sockaddr_storage dst_addr;
+	struct rdma_dev_addr dev_addr;
+};
+
+struct rdma_route {
+	struct rdma_addr addr;
+	struct ib_sa_path_rec *path_rec;
+	int num_paths;
+};
+
+struct rdma_conn_param {
+	const void *private_data;
+	u8 private_data_len;
+	u8 responder_resources;
+	u8 initiator_depth;
+	u8 flow_control;
+	u8 retry_count;		/* ignored when accepting */
+	u8 rnr_retry_count;
+	/* Fields below ignored if a QP is created on the rdma_cm_id. */
+	u8 srq;
+	u32 qp_num;
+};
+
+struct rdma_ud_param {
+	const void *private_data;
+	u8 private_data_len;
+	struct ib_ah_attr ah_attr;
+	u32 qp_num;
+	u32 qkey;
+};
+
+struct rdma_cm_event {
+	enum rdma_cm_event_type	 event;
+	int			 status;
+	union {
+		struct rdma_conn_param	conn;
+		struct rdma_ud_param	ud;
+	} param;
+};
+
+struct rdma_cm_id;
+
+/**
+ * rdma_cm_event_handler - Callback used to report user events.
+ *
+ * Notes: Users may not call rdma_destroy_id from this callback to destroy
+ *   the passed in id, or a corresponding listen id.  Returning a
+ *   non-zero value from the callback will destroy the passed in id.
+ */
+typedef int (*rdma_cm_event_handler)(struct rdma_cm_id *id,
+				     struct rdma_cm_event *event);
+
+struct rdma_cm_id {
+	struct ib_device	*device;
+	void			*context;
+	struct ib_qp		*qp;
+	rdma_cm_event_handler	 event_handler;
+	struct rdma_route	 route;
+	enum rdma_port_space	 ps;
+	u8			 port_num;
+};
+
+/**
+ * rdma_create_id - Create an RDMA identifier.
+ *
+ * @event_handler: User callback invoked to report events associated with the
+ *   returned rdma_id.
+ * @context: User specified context associated with the id.
+ * @ps: RDMA port space.
+ */
+struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler,
+				  void *context, enum rdma_port_space ps);
+
+/**
+ * rdma_destroy_id - Destroys an RDMA identifier.
+ *
+ * @id: RDMA identifier.
+ *
+ * Note: calling this function has the effect of canceling in-flight
+ * asynchronous operations associated with the id.
+ */
+void rdma_destroy_id(struct rdma_cm_id *id);
+
+/**
+ * rdma_bind_addr - Bind an RDMA identifier to a source address and
+ *   associated RDMA device, if needed.
+ *
+ * @id: RDMA identifier.
+ * @addr: Local address information.  Wildcard values are permitted.
+ *
+ * This associates a source address with the RDMA identifier before calling
+ * rdma_listen.  If a specific local address is given, the RDMA identifier will
+ * be bound to a local RDMA device.
+ */
+int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr);
+
+/**
+ * rdma_resolve_addr - Resolve destination and optional source addresses
+ *   from IP addresses to an RDMA address.  If successful, the specified
+ *   rdma_cm_id will be bound to a local device.
+ *
+ * @id: RDMA identifier.
+ * @src_addr: Source address information.  This parameter may be NULL.
+ * @dst_addr: Destination address information.
+ * @timeout_ms: Time to wait for resolution to complete.
+ */
+int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
+		      struct sockaddr *dst_addr, int timeout_ms);
+
+/**
+ * rdma_resolve_route - Resolve the RDMA address bound to the RDMA identifier
+ *   into route information needed to establish a connection.
+ *
+ * This is called on the client side of a connection.
+ * Users must have first called rdma_resolve_addr to resolve a dst_addr
+ * into an RDMA address before calling this routine.
+ */
+int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms);
+
+/**
+ * rdma_create_qp - Allocate a QP and associate it with the specified RDMA
+ * identifier.
+ *
+ * QPs allocated to an rdma_cm_id will automatically be transitioned by the CMA
+ * through their states.
+ */
+int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd,
+		   struct ib_qp_init_attr *qp_init_attr);
+
+/**
+ * rdma_destroy_qp - Deallocate the QP associated with the specified RDMA
+ * identifier.
+ *
+ * Users must destroy any QP associated with an RDMA identifier before
+ * destroying the RDMA ID.
+ */
+void rdma_destroy_qp(struct rdma_cm_id *id);
+
+/**
+ * rdma_init_qp_attr - Initializes the QP attributes for use in transitioning
+ *   to a specified QP state.
+ * @id: Communication identifier associated with the QP attributes to
+ *   initialize.
+ * @qp_attr: On input, specifies the desired QP state.  On output, the
+ *   mandatory and desired optional attributes will be set in order to
+ *   modify the QP to the specified state.
+ * @qp_attr_mask: The QP attribute mask that may be used to transition the
+ *   QP to the specified state.
+ *
+ * Users must set the @qp_attr->qp_state to the desired QP state.  This call
+ * will set all required attributes for the given transition, along with
+ * known optional attributes.  Users may override the attributes returned from
+ * this call before calling ib_modify_qp.
+ *
+ * Users that wish to have their QP automatically transitioned through its
+ * states can associate a QP with the rdma_cm_id by calling rdma_create_qp().
+ */
+int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr,
+		       int *qp_attr_mask);
+
+/**
+ * rdma_connect - Initiate an active connection request.
+ * @id: Connection identifier to connect.
+ * @conn_param: Connection information used for connected QPs.
+ *
+ * Users must have resolved a route for the rdma_cm_id to connect with
+ * by having called rdma_resolve_route before calling this routine.
+ *
+ * This call will either connect to a remote QP or obtain remote QP
+ * information for unconnected rdma_cm_id's.  The actual operation is
+ * based on the rdma_cm_id's port space.
+ */
+int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param);
+
+/**
+ * rdma_listen - This function is called by the passive side to
+ *   listen for incoming connection requests.
+ *
+ * Users must have bound the rdma_cm_id to a local address by calling
+ * rdma_bind_addr before calling this routine.
+ */
+int rdma_listen(struct rdma_cm_id *id, int backlog);
+
+/**
+ * rdma_accept - Called to accept a connection request or response.
+ * @id: Connection identifier associated with the request.
+ * @conn_param: Information needed to establish the connection.  This must be
+ *   provided if accepting a connection request.  If accepting a connection
+ *   response, this parameter must be NULL.
+ *
+ * Typically, this routine is only called by the listener to accept a connection
+ * request.  It must also be called on the active side of a connection if the
+ * user is performing their own QP transitions.
+ *
+ * In the case of error, a reject message is sent to the remote side and the
+ * state of the qp associated with the id is modified to error, such that any
+ * previously posted receive buffers would be flushed.
+ */
+int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param);
+
+/**
+ * rdma_notify - Notifies the RDMA CM of an asynchronous event that has
+ * occurred on the connection.
+ * @id: Connection identifier to transition to established.
+ * @event: Asynchronous event.
+ *
+ * This routine should be invoked by users to notify the CM of relevant
+ * communication events.  Events that should be reported to the CM and
+ * when to report them are:
+ *
+ * IB_EVENT_COMM_EST - Used when a message is received on a connected
+ *    QP before an RTU has been received.
+ */
+int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event);
+
+/**
+ * rdma_reject - Called to reject a connection request or response.
+ */
+int rdma_reject(struct rdma_cm_id *id, const void *private_data,
+		u8 private_data_len);
+
+/**
+ * rdma_disconnect - This function disconnects the associated QP and
+ *   transitions it into the error state.
+ */
+int rdma_disconnect(struct rdma_cm_id *id);
+
+/**
+ * rdma_join_multicast - Join the multicast group specified by the given
+ *   address.
+ * @id: Communication identifier associated with the request.
+ * @addr: Multicast address identifying the group to join.
+ * @context: User-defined context associated with the join request, returned
+ *   to the user through the private_data pointer in multicast events.
+ */
+int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
+			void *context);
+
+/**
+ * rdma_leave_multicast - Leave the multicast group specified by the given
+ *   address.
+ */
+void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr);
+
+/**
+ * rdma_set_service_type - Set the type of service associated with a
+ *   connection identifier.
+ * @id: Communication identifier to associate with service type.
+ * @tos: Type of service.
+ *
+ * The type of service is interpreted as a differentiated service
+ * field (RFC 2474).  The service type should be specified before
+ * performing route resolution, as existing communication on the
+ * connection identifier may be unaffected.  The type of service
+ * requested may not be supported by the network to all destinations.
+ */
+void rdma_set_service_type(struct rdma_cm_id *id, int tos);
+
+#endif /* RDMA_CM_H */
diff --git a/sys/ofed/include/rdma/rdma_cm_ib.h b/sys/ofed/include/rdma/rdma_cm_ib.h
new file mode 100644
index 0000000..2389c3b
--- /dev/null
+++ b/sys/ofed/include/rdma/rdma_cm_ib.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2006 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(RDMA_CM_IB_H)
+#define RDMA_CM_IB_H
+
+#include <rdma/rdma_cm.h>
+
+/**
+ * rdma_set_ib_paths - Manually sets the path records used to establish a
+ *   connection.
+ * @id: Connection identifier associated with the request.
+ * @path_rec: Reference to the path record
+ *
+ * This call permits a user to specify routing information for rdma_cm_id's
+ * bound to Infiniband devices.  It is called on the client side of a
+ * connection and replaces the call to rdma_resolve_route.
+ */
+int rdma_set_ib_paths(struct rdma_cm_id *id,
+		      struct ib_sa_path_rec *path_rec, int num_paths);
+
+/* Global qkey for UDP QPs and multicast groups. */
+#define RDMA_UDP_QKEY 0x01234567
+
+#endif /* RDMA_CM_IB_H */
diff --git a/sys/ofed/include/rdma/rdma_user_cm.h b/sys/ofed/include/rdma/rdma_user_cm.h
new file mode 100644
index 0000000..1d16502
--- /dev/null
+++ b/sys/ofed/include/rdma/rdma_user_cm.h
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2005-2006 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RDMA_USER_CM_H
+#define RDMA_USER_CM_H
+
+#include <linux/types.h>
+#include <linux/in6.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_user_sa.h>
+
+#define RDMA_USER_CM_ABI_VERSION	4
+
+#define RDMA_MAX_PRIVATE_DATA		256
+
+enum {
+	RDMA_USER_CM_CMD_CREATE_ID,
+	RDMA_USER_CM_CMD_DESTROY_ID,
+	RDMA_USER_CM_CMD_BIND_ADDR,
+	RDMA_USER_CM_CMD_RESOLVE_ADDR,
+	RDMA_USER_CM_CMD_RESOLVE_ROUTE,
+	RDMA_USER_CM_CMD_QUERY_ROUTE,
+	RDMA_USER_CM_CMD_CONNECT,
+	RDMA_USER_CM_CMD_LISTEN,
+	RDMA_USER_CM_CMD_ACCEPT,
+	RDMA_USER_CM_CMD_REJECT,
+	RDMA_USER_CM_CMD_DISCONNECT,
+	RDMA_USER_CM_CMD_INIT_QP_ATTR,
+	RDMA_USER_CM_CMD_GET_EVENT,
+	RDMA_USER_CM_CMD_GET_OPTION,
+	RDMA_USER_CM_CMD_SET_OPTION,
+	RDMA_USER_CM_CMD_NOTIFY,
+	RDMA_USER_CM_CMD_JOIN_MCAST,
+	RDMA_USER_CM_CMD_LEAVE_MCAST,
+	RDMA_USER_CM_CMD_MIGRATE_ID
+};
+
+/*
+ * command ABI structures.
+ */
+struct rdma_ucm_cmd_hdr {
+	__u32 cmd;	/* presumably an RDMA_USER_CM_CMD_* value; confirm */
+	__u16 in;	/* presumably input payload length in bytes; confirm */
+	__u16 out;	/* presumably output buffer length in bytes; confirm */
+};
+
+struct rdma_ucm_create_id {
+	__u64 uid;
+	__u64 response;		/* presumably rdma_ucm_create_id_resp; confirm */
+	__u16 ps;
+	__u8  reserved[6];
+};
+
+struct rdma_ucm_create_id_resp {
+	__u32 id;
+};
+
+struct rdma_ucm_destroy_id {
+	__u64 response;		/* presumably rdma_ucm_destroy_id_resp; confirm */
+	__u32 id;
+	__u32 reserved;
+};
+
+struct rdma_ucm_destroy_id_resp {
+	__u32 events_reported;
+};
+
+struct rdma_ucm_bind_addr {
+	__u64 response;
+	struct sockaddr_in6 addr;
+	__u32 id;
+};
+
+struct rdma_ucm_resolve_addr {
+	struct sockaddr_in6 src_addr;
+	struct sockaddr_in6 dst_addr;
+	__u32 id;
+	__u32 timeout_ms;
+};
+
+struct rdma_ucm_resolve_route {
+	__u32 id;
+	__u32 timeout_ms;
+};
+
+struct rdma_ucm_query_route {
+	__u64 response;		/* presumably rdma_ucm_query_route_resp; confirm */
+	__u32 id;
+	__u32 reserved;
+};
+
+struct rdma_ucm_query_route_resp {
+	__u64 node_guid;
+	struct ib_user_path_rec ib_route[2];
+	struct sockaddr_in6 src_addr;
+	struct sockaddr_in6 dst_addr;
+	__u32 num_paths;
+	__u8 port_num;
+	__u8 reserved[3];
+};
+
+struct rdma_ucm_conn_param {
+	__u32 qp_num;
+	__u32 reserved;
+	__u8  private_data[RDMA_MAX_PRIVATE_DATA];
+	__u8  private_data_len;
+	__u8  srq;
+	__u8  responder_resources;
+	__u8  initiator_depth;
+	__u8  flow_control;
+	__u8  retry_count;
+	__u8  rnr_retry_count;
+	__u8  valid;
+};
+
+struct rdma_ucm_ud_param {
+	__u32 qp_num;
+	__u32 qkey;
+	struct ib_uverbs_ah_attr ah_attr;
+	__u8  private_data[RDMA_MAX_PRIVATE_DATA];
+	__u8  private_data_len;
+	__u8  reserved[7];
+};
+
+struct rdma_ucm_connect {
+	struct rdma_ucm_conn_param conn_param;
+	__u32 id;
+	__u32 reserved;
+};
+
+struct rdma_ucm_listen {
+	__u32 id;
+	__u32 backlog;
+};
+
+struct rdma_ucm_accept {
+	__u64 uid;
+	struct rdma_ucm_conn_param conn_param;
+	__u32 id;
+	__u32 reserved;
+};
+
+struct rdma_ucm_reject {
+	__u32 id;
+	__u8  private_data_len;
+	__u8  reserved[3];
+	__u8  private_data[RDMA_MAX_PRIVATE_DATA];
+};
+
+struct rdma_ucm_disconnect {
+	__u32 id;
+};
+
+struct rdma_ucm_init_qp_attr {
+	__u64 response;
+	__u32 id;
+	__u32 qp_state;
+};
+
+struct rdma_ucm_notify {
+	__u32 id;
+	__u32 event;
+};
+
+struct rdma_ucm_join_mcast {
+	__u64 response;		/* rdma_ucm_create_id_resp */
+	__u64 uid;
+	struct sockaddr_in6 addr;
+	__u32 id;
+};
+
+struct rdma_ucm_get_event {
+	__u64 response;		/* presumably rdma_ucm_event_resp; confirm */
+};
+
+struct rdma_ucm_event_resp {
+	__u64 uid;
+	__u32 id;
+	__u32 event;
+	__u32 status;
+	union {
+		struct rdma_ucm_conn_param conn;
+		struct rdma_ucm_ud_param   ud;
+	} param;
+};
+
+/* Option levels */
+enum {
+	RDMA_OPTION_ID		= 0,
+	RDMA_OPTION_IB		= 1
+};
+
+/* Option details */
+enum {
+	RDMA_OPTION_ID_TOS	= 0,
+	RDMA_OPTION_IB_PATH	= 1
+};
+
+struct rdma_ucm_set_option {
+	__u64 optval;
+	__u32 id;
+	__u32 level;
+	__u32 optname;
+	__u32 optlen;
+};
+
+struct rdma_ucm_migrate_id {
+	__u64 response;		/* presumably rdma_ucm_migrate_resp; confirm */
+	__u32 id;
+	__u32 fd;
+};
+
+struct rdma_ucm_migrate_resp {
+	__u32 events_reported;
+};
+
+#endif /* RDMA_USER_CM_H */
diff --git a/sys/ofed/include/rdma/sdp_socket.h b/sys/ofed/include/rdma/sdp_socket.h
new file mode 100644
index 0000000..902dc97
--- /dev/null
+++ b/sys/ofed/include/rdma/sdp_socket.h
@@ -0,0 +1,21 @@
+/* Stuff that should go into include/linux/socket.h */
+
+#ifndef SDP_SOCKET_H
+#define SDP_SOCKET_H
+
+#ifndef AF_INET_SDP
+#define AF_INET_SDP 27
+#define PF_INET_SDP AF_INET_SDP
+#endif /* AF_INET_SDP */
+
+#ifndef SDP_ZCOPY_THRESH
+#define SDP_ZCOPY_THRESH 80
+#endif /* SDP_ZCOPY_THRESH */
+
+#ifndef SDP_LAST_BIND_ERR
+#define SDP_LAST_BIND_ERR 81
+#endif /* SDP_LAST_BIND_ERR */
+
+/* TODO: AF_INET6_SDP ? */
+
+#endif /* SDP_SOCKET_H */
author	jeff <jeff@FreeBSD.org>	2011-03-21 09:58:24 +0000
committer	jeff <jeff@FreeBSD.org>	2011-03-21 09:58:24 +0000
commit	5115240a6cdc054f7eea804355742f97c74578d8 (patch)
tree	3051c12f4ce44a65c025b72ec5821b35b2ec46be /sys
parent	2d7d8c05e7404fbebf1f0fe24c13bc5bb58d2338 (diff)
download	FreeBSD-src-5115240a6cdc054f7eea804355742f97c74578d8.zip FreeBSD-src-5115240a6cdc054f7eea804355742f97c74578d8.tar.gz