summaryrefslogtreecommitdiffstats
path: root/sys
diff options
context:
space:
mode:
authorjeff <jeff@FreeBSD.org>2011-03-21 09:58:24 +0000
committerjeff <jeff@FreeBSD.org>2011-03-21 09:58:24 +0000
commit5115240a6cdc054f7eea804355742f97c74578d8 (patch)
tree3051c12f4ce44a65c025b72ec5821b35b2ec46be /sys
parent2d7d8c05e7404fbebf1f0fe24c13bc5bb58d2338 (diff)
downloadFreeBSD-src-5115240a6cdc054f7eea804355742f97c74578d8.zip
FreeBSD-src-5115240a6cdc054f7eea804355742f97c74578d8.tar.gz
- Merge in OFED 1.5.3 from projects/ofed/head
Diffstat (limited to 'sys')
-rw-r--r--sys/modules/Makefile4
-rw-r--r--sys/modules/mlx4/Makefile14
-rw-r--r--sys/modules/mlx4ib/Makefile11
-rw-r--r--sys/modules/mlxen/Makefile13
-rw-r--r--sys/modules/mthca/Makefile15
-rw-r--r--sys/ofed/drivers/infiniband/Kconfig66
-rw-r--r--sys/ofed/drivers/infiniband/Makefile17
-rw-r--r--sys/ofed/drivers/infiniband/core/Makefile32
-rw-r--r--sys/ofed/drivers/infiniband/core/addr.c630
-rw-r--r--sys/ofed/drivers/infiniband/core/agent.c216
-rw-r--r--sys/ofed/drivers/infiniband/core/agent.h51
-rw-r--r--sys/ofed/drivers/infiniband/core/cache.c398
-rw-r--r--sys/ofed/drivers/infiniband/core/cm.c3894
-rw-r--r--sys/ofed/drivers/infiniband/core/cm_msgs.h819
-rw-r--r--sys/ofed/drivers/infiniband/core/cma.c3386
-rw-r--r--sys/ofed/drivers/infiniband/core/core_priv.h50
-rw-r--r--sys/ofed/drivers/infiniband/core/device.c754
-rw-r--r--sys/ofed/drivers/infiniband/core/fmr_pool.c544
-rw-r--r--sys/ofed/drivers/infiniband/core/iwcm.c1025
-rw-r--r--sys/ofed/drivers/infiniband/core/iwcm.h62
-rw-r--r--sys/ofed/drivers/infiniband/core/local_sa.c1273
-rw-r--r--sys/ofed/drivers/infiniband/core/mad.c3057
-rw-r--r--sys/ofed/drivers/infiniband/core/mad_priv.h231
-rw-r--r--sys/ofed/drivers/infiniband/core/mad_rmpp.c951
-rw-r--r--sys/ofed/drivers/infiniband/core/mad_rmpp.h58
-rw-r--r--sys/ofed/drivers/infiniband/core/multicast.c868
-rw-r--r--sys/ofed/drivers/infiniband/core/notice.c749
-rw-r--r--sys/ofed/drivers/infiniband/core/packer.c202
-rw-r--r--sys/ofed/drivers/infiniband/core/sa.h105
-rw-r--r--sys/ofed/drivers/infiniband/core/sa_query.c1480
-rw-r--r--sys/ofed/drivers/infiniband/core/smi.c245
-rw-r--r--sys/ofed/drivers/infiniband/core/smi.h90
-rw-r--r--sys/ofed/drivers/infiniband/core/sysfs.c911
-rw-r--r--sys/ofed/drivers/infiniband/core/ucm.c1348
-rw-r--r--sys/ofed/drivers/infiniband/core/ucma.c1337
-rw-r--r--sys/ofed/drivers/infiniband/core/ud_header.c453
-rw-r--r--sys/ofed/drivers/infiniband/core/umem.c532
-rw-r--r--sys/ofed/drivers/infiniband/core/user_mad.c1225
-rw-r--r--sys/ofed/drivers/infiniband/core/uverbs.h220
-rw-r--r--sys/ofed/drivers/infiniband/core/uverbs_cmd.c3022
-rw-r--r--sys/ofed/drivers/infiniband/core/uverbs_main.c1012
-rw-r--r--sys/ofed/drivers/infiniband/core/uverbs_marshall.c139
-rw-r--r--sys/ofed/drivers/infiniband/core/verbs.c1073
-rw-r--r--sys/ofed/drivers/infiniband/debug/Makefile3
-rw-r--r--sys/ofed/drivers/infiniband/debug/memtrack.c600
-rw-r--r--sys/ofed/drivers/infiniband/debug/memtrack.h45
-rw-r--r--sys/ofed/drivers/infiniband/debug/mtrack.h138
-rw-r--r--sys/ofed/drivers/infiniband/hw/mlx4/Kconfig8
-rw-r--r--sys/ofed/drivers/infiniband/hw/mlx4/Makefile4
-rw-r--r--sys/ofed/drivers/infiniband/hw/mlx4/ah.c205
-rw-r--r--sys/ofed/drivers/infiniband/hw/mlx4/cq.c861
-rw-r--r--sys/ofed/drivers/infiniband/hw/mlx4/doorbell.c98
-rw-r--r--sys/ofed/drivers/infiniband/hw/mlx4/mad.c452
-rw-r--r--sys/ofed/drivers/infiniband/hw/mlx4/main.c1580
-rw-r--r--sys/ofed/drivers/infiniband/hw/mlx4/mlx4_ib.h409
-rw-r--r--sys/ofed/drivers/infiniband/hw/mlx4/mr.c424
-rw-r--r--sys/ofed/drivers/infiniband/hw/mlx4/qp.c2770
-rw-r--r--sys/ofed/drivers/infiniband/hw/mlx4/srq.c401
-rw-r--r--sys/ofed/drivers/infiniband/hw/mlx4/user.h97
-rw-r--r--sys/ofed/drivers/infiniband/hw/mlx4/wc.c74
-rw-r--r--sys/ofed/drivers/infiniband/hw/mlx4/wc.h41
-rw-r--r--sys/ofed/drivers/infiniband/hw/mthca/Kconfig17
-rw-r--r--sys/ofed/drivers/infiniband/hw/mthca/Makefile7
-rw-r--r--sys/ofed/drivers/infiniband/hw/mthca/mthca_allocator.c301
-rw-r--r--sys/ofed/drivers/infiniband/hw/mthca/mthca_av.c374
-rw-r--r--sys/ofed/drivers/infiniband/hw/mthca/mthca_catas.c198
-rw-r--r--sys/ofed/drivers/infiniband/hw/mthca/mthca_cmd.c1931
-rw-r--r--sys/ofed/drivers/infiniband/hw/mthca/mthca_cmd.h330
-rw-r--r--sys/ofed/drivers/infiniband/hw/mthca/mthca_config_reg.h50
-rw-r--r--sys/ofed/drivers/infiniband/hw/mthca/mthca_cq.c992
-rw-r--r--sys/ofed/drivers/infiniband/hw/mthca/mthca_dev.h596
-rw-r--r--sys/ofed/drivers/infiniband/hw/mthca/mthca_doorbell.h109
-rw-r--r--sys/ofed/drivers/infiniband/hw/mthca/mthca_eq.c920
-rw-r--r--sys/ofed/drivers/infiniband/hw/mthca/mthca_mad.c346
-rw-r--r--sys/ofed/drivers/infiniband/hw/mthca/mthca_main.c1360
-rw-r--r--sys/ofed/drivers/infiniband/hw/mthca/mthca_mcg.c372
-rw-r--r--sys/ofed/drivers/infiniband/hw/mthca/mthca_memfree.c879
-rw-r--r--sys/ofed/drivers/infiniband/hw/mthca/mthca_memfree.h179
-rw-r--r--sys/ofed/drivers/infiniband/hw/mthca/mthca_mr.c985
-rw-r--r--sys/ofed/drivers/infiniband/hw/mthca/mthca_pd.c81
-rw-r--r--sys/ofed/drivers/infiniband/hw/mthca/mthca_profile.c285
-rw-r--r--sys/ofed/drivers/infiniband/hw/mthca/mthca_profile.h59
-rw-r--r--sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.c1427
-rw-r--r--sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.h343
-rw-r--r--sys/ofed/drivers/infiniband/hw/mthca/mthca_qp.c2332
-rw-r--r--sys/ofed/drivers/infiniband/hw/mthca/mthca_reset.c298
-rw-r--r--sys/ofed/drivers/infiniband/hw/mthca/mthca_srq.c715
-rw-r--r--sys/ofed/drivers/infiniband/hw/mthca/mthca_uar.c78
-rw-r--r--sys/ofed/drivers/infiniband/hw/mthca/mthca_user.h112
-rw-r--r--sys/ofed/drivers/infiniband/hw/mthca/mthca_wqe.h131
-rw-r--r--sys/ofed/drivers/infiniband/ulp/ipoib/Kconfig50
-rw-r--r--sys/ofed/drivers/infiniband/ulp/ipoib/Makefile11
-rw-r--r--sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h757
-rw-r--r--sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_cm.c1445
-rw-r--r--sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c159
-rw-r--r--sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_fs.c298
-rw-r--r--sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c997
-rw-r--r--sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c1537
-rw-r--r--sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c907
-rw-r--r--sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_verbs.c294
-rw-r--r--sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_vlan.c190
-rw-r--r--sys/ofed/drivers/infiniband/ulp/sdp/Kconfig28
-rw-r--r--sys/ofed/drivers/infiniband/ulp/sdp/Makefile6
-rw-r--r--sys/ofed/drivers/infiniband/ulp/sdp/sdp.h721
-rw-r--r--sys/ofed/drivers/infiniband/ulp/sdp/sdp_bcopy.c258
-rw-r--r--sys/ofed/drivers/infiniband/ulp/sdp/sdp_cma.c456
-rw-r--r--sys/ofed/drivers/infiniband/ulp/sdp/sdp_dbg.h167
-rw-r--r--sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c1962
-rw-r--r--sys/ofed/drivers/infiniband/ulp/sdp/sdp_proc.c533
-rw-r--r--sys/ofed/drivers/infiniband/ulp/sdp/sdp_rx.c783
-rw-r--r--sys/ofed/drivers/infiniband/ulp/sdp/sdp_tx.c490
-rw-r--r--sys/ofed/drivers/infiniband/ulp/sdp/sdp_zcopy.c804
-rw-r--r--sys/ofed/drivers/infiniband/util/Kconfig6
-rw-r--r--sys/ofed/drivers/infiniband/util/Makefile3
-rw-r--r--sys/ofed/drivers/infiniband/util/madeye.c593
-rw-r--r--sys/ofed/drivers/net/mlx4/Makefile9
-rw-r--r--sys/ofed/drivers/net/mlx4/alloc.c453
-rw-r--r--sys/ofed/drivers/net/mlx4/catas.c158
-rw-r--r--sys/ofed/drivers/net/mlx4/cmd.c453
-rw-r--r--sys/ofed/drivers/net/mlx4/cq.c338
-rw-r--r--sys/ofed/drivers/net/mlx4/en_cq.c158
-rw-r--r--sys/ofed/drivers/net/mlx4/en_ethtool.c512
-rw-r--r--sys/ofed/drivers/net/mlx4/en_frag.c188
-rw-r--r--sys/ofed/drivers/net/mlx4/en_main.c384
-rw-r--r--sys/ofed/drivers/net/mlx4/en_netdev.c1511
-rw-r--r--sys/ofed/drivers/net/mlx4/en_params.c0
-rw-r--r--sys/ofed/drivers/net/mlx4/en_port.c314
-rw-r--r--sys/ofed/drivers/net/mlx4/en_port.h591
-rw-r--r--sys/ofed/drivers/net/mlx4/en_resources.c103
-rw-r--r--sys/ofed/drivers/net/mlx4/en_rx.c1006
-rw-r--r--sys/ofed/drivers/net/mlx4/en_selftest.c179
-rw-r--r--sys/ofed/drivers/net/mlx4/en_tx.c1035
-rw-r--r--sys/ofed/drivers/net/mlx4/eq.c727
-rw-r--r--sys/ofed/drivers/net/mlx4/fw.c1010
-rw-r--r--sys/ofed/drivers/net/mlx4/fw.h186
-rw-r--r--sys/ofed/drivers/net/mlx4/icm.c455
-rw-r--r--sys/ofed/drivers/net/mlx4/icm.h130
-rw-r--r--sys/ofed/drivers/net/mlx4/intf.c212
-rw-r--r--sys/ofed/drivers/net/mlx4/main.c1704
-rw-r--r--sys/ofed/drivers/net/mlx4/mcg.c366
-rw-r--r--sys/ofed/drivers/net/mlx4/mlx4.h427
-rw-r--r--sys/ofed/drivers/net/mlx4/mlx4_en.h651
-rw-r--r--sys/ofed/drivers/net/mlx4/mr.c773
-rw-r--r--sys/ofed/drivers/net/mlx4/pd.c211
-rw-r--r--sys/ofed/drivers/net/mlx4/port.c354
-rw-r--r--sys/ofed/drivers/net/mlx4/profile.c240
-rw-r--r--sys/ofed/drivers/net/mlx4/qp.c410
-rw-r--r--sys/ofed/drivers/net/mlx4/reset.c186
-rw-r--r--sys/ofed/drivers/net/mlx4/sense.c172
-rw-r--r--sys/ofed/drivers/net/mlx4/srq.c273
-rw-r--r--sys/ofed/drivers/net/mlx4/xrcd.c70
-rw-r--r--sys/ofed/include/asm/atomic-long.h79
-rw-r--r--sys/ofed/include/asm/atomic.h85
-rw-r--r--sys/ofed/include/asm/byteorder.h90
-rw-r--r--sys/ofed/include/asm/current.h32
-rw-r--r--sys/ofed/include/asm/fcntl.h33
-rw-r--r--sys/ofed/include/asm/io.h29
-rw-r--r--sys/ofed/include/asm/page.h29
-rw-r--r--sys/ofed/include/asm/pgtable.h33
-rw-r--r--sys/ofed/include/asm/semaphore.h34
-rw-r--r--sys/ofed/include/asm/system.h27
-rw-r--r--sys/ofed/include/asm/types.h67
-rw-r--r--sys/ofed/include/asm/uaccess.h49
-rw-r--r--sys/ofed/include/linux/bitmap.h34
-rw-r--r--sys/ofed/include/linux/bitops.h312
-rw-r--r--sys/ofed/include/linux/cdev.h129
-rw-r--r--sys/ofed/include/linux/compat.h33
-rw-r--r--sys/ofed/include/linux/compiler.h65
-rw-r--r--sys/ofed/include/linux/completion.h155
-rw-r--r--sys/ofed/include/linux/ctype.h34
-rw-r--r--sys/ofed/include/linux/delay.h43
-rw-r--r--sys/ofed/include/linux/device.h388
-rw-r--r--sys/ofed/include/linux/dma-attrs.h47
-rw-r--r--sys/ofed/include/linux/dma-mapping.h263
-rw-r--r--sys/ofed/include/linux/dmapool.h85
-rw-r--r--sys/ofed/include/linux/err.h60
-rw-r--r--sys/ofed/include/linux/errno.h39
-rw-r--r--sys/ofed/include/linux/ethtool.h31
-rw-r--r--sys/ofed/include/linux/file.h120
-rw-r--r--sys/ofed/include/linux/fs.h182
-rw-r--r--sys/ofed/include/linux/gfp.h122
-rw-r--r--sys/ofed/include/linux/hardirq.h39
-rw-r--r--sys/ofed/include/linux/idr.h70
-rw-r--r--sys/ofed/include/linux/if_arp.h32
-rw-r--r--sys/ofed/include/linux/if_ether.h37
-rw-r--r--sys/ofed/include/linux/if_vlan.h35
-rw-r--r--sys/ofed/include/linux/in.h37
-rw-r--r--sys/ofed/include/linux/in6.h36
-rw-r--r--sys/ofed/include/linux/inet.h31
-rw-r--r--sys/ofed/include/linux/inetdevice.h56
-rw-r--r--sys/ofed/include/linux/init.h31
-rw-r--r--sys/ofed/include/linux/interrupt.h139
-rw-r--r--sys/ofed/include/linux/io-mapping.h77
-rw-r--r--sys/ofed/include/linux/io.h125
-rw-r--r--sys/ofed/include/linux/ioctl.h34
-rw-r--r--sys/ofed/include/linux/jhash.h143
-rw-r--r--sys/ofed/include/linux/jiffies.h56
-rw-r--r--sys/ofed/include/linux/kdev_t.h36
-rw-r--r--sys/ofed/include/linux/kernel.h88
-rw-r--r--sys/ofed/include/linux/kobject.h153
-rw-r--r--sys/ofed/include/linux/kref.h62
-rw-r--r--sys/ofed/include/linux/kthread.h104
-rw-r--r--sys/ofed/include/linux/linux_compat.c695
-rw-r--r--sys/ofed/include/linux/linux_idr.c447
-rw-r--r--sys/ofed/include/linux/linux_radix.c170
-rw-r--r--sys/ofed/include/linux/list.h331
-rw-r--r--sys/ofed/include/linux/lockdep.h37
-rw-r--r--sys/ofed/include/linux/log2.h60
-rw-r--r--sys/ofed/include/linux/miscdevice.h72
-rw-r--r--sys/ofed/include/linux/mlx4/cmd.h196
-rw-r--r--sys/ofed/include/linux/mlx4/cq.h150
-rw-r--r--sys/ofed/include/linux/mlx4/device.h619
-rw-r--r--sys/ofed/include/linux/mlx4/doorbell.h86
-rw-r--r--sys/ofed/include/linux/mlx4/driver.h82
-rw-r--r--sys/ofed/include/linux/mlx4/qp.h346
-rw-r--r--sys/ofed/include/linux/mlx4/srq.h54
-rw-r--r--sys/ofed/include/linux/mm.h84
-rw-r--r--sys/ofed/include/linux/module.h87
-rw-r--r--sys/ofed/include/linux/moduleparam.h226
-rw-r--r--sys/ofed/include/linux/mount.h33
-rw-r--r--sys/ofed/include/linux/mutex.h61
-rw-r--r--sys/ofed/include/linux/net.h73
-rw-r--r--sys/ofed/include/linux/netdevice.h159
-rw-r--r--sys/ofed/include/linux/notifier.h54
-rw-r--r--sys/ofed/include/linux/page.h49
-rw-r--r--sys/ofed/include/linux/pci.h580
-rw-r--r--sys/ofed/include/linux/poll.h44
-rw-r--r--sys/ofed/include/linux/radix-tree.h60
-rw-r--r--sys/ofed/include/linux/random.h40
-rw-r--r--sys/ofed/include/linux/rbtree.h111
-rw-r--r--sys/ofed/include/linux/rtnetlink.h27
-rw-r--r--sys/ofed/include/linux/rwlock.h63
-rw-r--r--sys/ofed/include/linux/rwsem.h56
-rw-r--r--sys/ofed/include/linux/scatterlist.h98
-rw-r--r--sys/ofed/include/linux/sched.h109
-rw-r--r--sys/ofed/include/linux/semaphore.h66
-rw-r--r--sys/ofed/include/linux/slab.h102
-rw-r--r--sys/ofed/include/linux/socket.h66
-rw-r--r--sys/ofed/include/linux/spinlock.h68
-rw-r--r--sys/ofed/include/linux/stddef.h34
-rw-r--r--sys/ofed/include/linux/string.h49
-rw-r--r--sys/ofed/include/linux/sysfs.h182
-rw-r--r--sys/ofed/include/linux/timer.h87
-rw-r--r--sys/ofed/include/linux/types.h55
-rw-r--r--sys/ofed/include/linux/uaccess.h34
-rw-r--r--sys/ofed/include/linux/vmalloc.h41
-rw-r--r--sys/ofed/include/linux/wait.h112
-rw-r--r--sys/ofed/include/linux/workqueue.h191
-rw-r--r--sys/ofed/include/net/addrconf.h27
-rw-r--r--sys/ofed/include/net/arp.h27
-rw-r--r--sys/ofed/include/net/ip.h77
-rw-r--r--sys/ofed/include/net/ip6_route.h27
-rw-r--r--sys/ofed/include/net/ipv6.h62
-rw-r--r--sys/ofed/include/net/neighbour.h27
-rw-r--r--sys/ofed/include/net/netevent.h71
-rw-r--r--sys/ofed/include/net/tcp.h38
-rw-r--r--sys/ofed/include/rdma/Kbuild1
-rw-r--r--sys/ofed/include/rdma/ib_addr.h312
-rw-r--r--sys/ofed/include/rdma/ib_cache.h116
-rw-r--r--sys/ofed/include/rdma/ib_cm.h589
-rw-r--r--sys/ofed/include/rdma/ib_fmr_pool.h93
-rw-r--r--sys/ofed/include/rdma/ib_mad.h655
-rw-r--r--sys/ofed/include/rdma/ib_marshall.h53
-rw-r--r--sys/ofed/include/rdma/ib_pack.h269
-rw-r--r--sys/ofed/include/rdma/ib_sa.h559
-rw-r--r--sys/ofed/include/rdma/ib_smi.h128
-rw-r--r--sys/ofed/include/rdma/ib_umem.h73
-rw-r--r--sys/ofed/include/rdma/ib_user_cm.h324
-rw-r--r--sys/ofed/include/rdma/ib_user_mad.h202
-rw-r--r--sys/ofed/include/rdma/ib_user_sa.h76
-rw-r--r--sys/ofed/include/rdma/ib_user_verbs.h801
-rw-r--r--sys/ofed/include/rdma/ib_verbs.h2170
-rw-r--r--sys/ofed/include/rdma/iw_cm.h258
-rw-r--r--sys/ofed/include/rdma/rdma_cm.h333
-rw-r--r--sys/ofed/include/rdma/rdma_cm_ib.h54
-rw-r--r--sys/ofed/include/rdma/rdma_user_cm.h246
-rw-r--r--sys/ofed/include/rdma/sdp_socket.h21
277 files changed, 104288 insertions, 0 deletions
diff --git a/sys/modules/Makefile b/sys/modules/Makefile
index 41ed4bf..0925a21 100644
--- a/sys/modules/Makefile
+++ b/sys/modules/Makefile
@@ -185,6 +185,9 @@ SUBDIR= ${_3dfx} \
mfi \
mii \
mlx \
+ mlx4 \
+ mlx4ib \
+ mlxen \
${_mly} \
mmc \
mmcsd \
@@ -195,6 +198,7 @@ SUBDIR= ${_3dfx} \
msdosfs_iconv \
${_mse} \
msk \
+ mthca \
mvs \
mwl \
mwlfw \
diff --git a/sys/modules/mlx4/Makefile b/sys/modules/mlx4/Makefile
new file mode 100644
index 0000000..8ea3340
--- /dev/null
+++ b/sys/modules/mlx4/Makefile
@@ -0,0 +1,14 @@
+# $FreeBSD$
+.PATH: ${.CURDIR}/../../ofed/drivers/net/mlx4
+KMOD = mlx4
+SRCS = device_if.h bus_if.h pci_if.h vnode_if.h
+SRCS+= alloc.c catas.c cmd.c cq.c eq.c fw.c icm.c intf.c main.c mcg.c mr.c
+SRCS+= pd.c port.c profile.c qp.c reset.c sense.c srq.c xrcd.c
+
+CFLAGS+= -I${.CURDIR}/../../ofed/drivers/net/mlx4
+CFLAGS+= -I${.CURDIR}/../../ofed/include/
+CFLAGS+= -DINET6
+
+.include <bsd.kmod.mk>
+
+CFLAGS+= -Wno-cast-qual -Wno-pointer-arith -fms-extensions
diff --git a/sys/modules/mlx4ib/Makefile b/sys/modules/mlx4ib/Makefile
new file mode 100644
index 0000000..5fe01e7
--- /dev/null
+++ b/sys/modules/mlx4ib/Makefile
@@ -0,0 +1,11 @@
+# $FreeBSD$
+.PATH: ${.CURDIR}/../../ofed/drivers/infiniband/hw/mlx4
+KMOD = mlx4ib
+SRCS = device_if.h bus_if.h pci_if.h vnode_if.h
+SRCS+= ah.c cq.c doorbell.c mad.c main.c mr.c qp.c srq.c wc.c
+
+CFLAGS+= -I${.CURDIR}/../../ofed/include/ -DINET6
+
+.include <bsd.kmod.mk>
+
+CFLAGS+= -Wno-cast-qual -Wno-pointer-arith -fms-extensions
diff --git a/sys/modules/mlxen/Makefile b/sys/modules/mlxen/Makefile
new file mode 100644
index 0000000..b83b4a5
--- /dev/null
+++ b/sys/modules/mlxen/Makefile
@@ -0,0 +1,13 @@
+# $FreeBSD$
+.PATH: ${.CURDIR}/../../ofed/drivers/net/mlx4
+KMOD = mlxen
+SRCS = device_if.h bus_if.h pci_if.h vnode_if.h
+SRCS += en_cq.c en_frag.c en_main.c en_netdev.c en_port.c en_resources.c
+SRCS += en_rx.c en_tx.c
+CFLAGS+= -I${.CURDIR}/../../ofed/drivers/net/mlx4
+CFLAGS+= -I${.CURDIR}/../../ofed/include/
+CFLAGS+= -DINET6
+
+.include <bsd.kmod.mk>
+
+CFLAGS+= -Wno-cast-qual -Wno-pointer-arith -fms-extensions
diff --git a/sys/modules/mthca/Makefile b/sys/modules/mthca/Makefile
new file mode 100644
index 0000000..de860fe
--- /dev/null
+++ b/sys/modules/mthca/Makefile
@@ -0,0 +1,15 @@
+# $FreeBSD$
+
+.PATH: ${.CURDIR}/../../ofed/drivers/infiniband/hw/mthca
+KMOD = mthca
+SRCS = device_if.h bus_if.h pci_if.h vnode_if.h
+SRCS+= mthca_allocator.c mthca_av.c mthca_catas.c mthca_cmd.c mthca_cq.c
+SRCS+= mthca_eq.c mthca_mad.c mthca_main.c mthca_mcg.c mthca_memfree.c
+SRCS+= mthca_mr.c mthca_pd.c mthca_profile.c mthca_provider.c mthca_qp.c
+SRCS+= mthca_reset.c mthca_srq.c mthca_uar.c
+
+CFLAGS+= -I${.CURDIR}/../../ofed/include/ -DINET6
+
+.include <bsd.kmod.mk>
+
+CFLAGS+= -Wno-cast-qual -Wno-pointer-arith -fms-extensions
diff --git a/sys/ofed/drivers/infiniband/Kconfig b/sys/ofed/drivers/infiniband/Kconfig
new file mode 100644
index 0000000..0a2ef11
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/Kconfig
@@ -0,0 +1,66 @@
+menuconfig INFINIBAND
+ tristate "InfiniBand support"
+ depends on PCI || BROKEN
+ depends on HAS_IOMEM
+ ---help---
+ Core support for InfiniBand (IB). Make sure to also select
+ any protocols you wish to use as well as drivers for your
+ InfiniBand hardware.
+
+if INFINIBAND
+
+config INFINIBAND_USER_MAD
+ tristate "InfiniBand userspace MAD support"
+ depends on INFINIBAND
+ ---help---
+ Userspace InfiniBand Management Datagram (MAD) support. This
+ is the kernel side of the userspace MAD support, which allows
+ userspace processes to send and receive MADs. You will also
+ need libibumad from <http://www.openib.org>.
+
+config INFINIBAND_USER_ACCESS
+ tristate "InfiniBand userspace access (verbs and CM)"
+ ---help---
+ Userspace InfiniBand access support. This enables the
+ kernel side of userspace verbs and the userspace
+ communication manager (CM). This allows userspace processes
+ to set up connections and directly access InfiniBand
+ hardware for fast-path operations. You will also need
+ libibverbs, libibcm and a hardware driver library from
+ <http://www.openib.org>.
+
+config INFINIBAND_USER_MEM
+ bool
+ depends on INFINIBAND_USER_ACCESS != n
+ default y
+
+config INFINIBAND_ADDR_TRANS
+ bool
+ depends on INET
+ depends on !(INFINIBAND = y && IPV6 = m)
+ default y
+
+source "drivers/infiniband/hw/mthca/Kconfig"
+source "drivers/infiniband/hw/ipath/Kconfig"
+source "drivers/infiniband/hw/qib/Kconfig"
+source "drivers/infiniband/hw/ehca/Kconfig"
+source "drivers/infiniband/hw/amso1100/Kconfig"
+source "drivers/infiniband/hw/cxgb3/Kconfig"
+source "drivers/infiniband/hw/mlx4/Kconfig"
+source "drivers/infiniband/hw/nes/Kconfig"
+
+source "drivers/infiniband/ulp/ipoib/Kconfig"
+
+source "drivers/infiniband/ulp/srp/Kconfig"
+
+source "drivers/infiniband/ulp/srpt/Kconfig"
+
+source "drivers/infiniband/ulp/iser/Kconfig"
+
+source "drivers/infiniband/ulp/sdp/Kconfig"
+
+source "drivers/infiniband/ulp/qlgc_vnic/Kconfig"
+
+source "drivers/infiniband/util/Kconfig"
+
+endif # INFINIBAND
diff --git a/sys/ofed/drivers/infiniband/Makefile b/sys/ofed/drivers/infiniband/Makefile
new file mode 100644
index 0000000..ea5dbe0
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/Makefile
@@ -0,0 +1,17 @@
+obj-$(CONFIG_INFINIBAND) += core/
+obj-$(CONFIG_INFINIBAND_MTHCA) += hw/mthca/
+obj-$(CONFIG_INFINIBAND_IPATH) += hw/ipath/
+obj-$(CONFIG_INFINIBAND_QIB) += hw/qib/
+obj-$(CONFIG_INFINIBAND_EHCA) += hw/ehca/
+obj-$(CONFIG_INFINIBAND_AMSO1100) += hw/amso1100/
+obj-$(CONFIG_INFINIBAND_CXGB3) += hw/cxgb3/
+obj-$(CONFIG_INFINIBAND_NES) += hw/nes/
+obj-$(CONFIG_MLX4_INFINIBAND) += hw/mlx4/
+obj-$(CONFIG_INFINIBAND_NES) += hw/nes/
+obj-$(CONFIG_INFINIBAND_IPOIB) += ulp/ipoib/
+obj-$(CONFIG_INFINIBAND_SRP) += ulp/srp/
+obj-$(CONFIG_INFINIBAND_SRPT) += ulp/srpt/
+obj-$(CONFIG_INFINIBAND_ISER) += ulp/iser/
+obj-$(CONFIG_INFINIBAND_SDP) += ulp/sdp/
+obj-$(CONFIG_INFINIBAND_QLGC_VNIC) += ulp/qlgc_vnic/
+obj-$(CONFIG_INFINIBAND_MADEYE) += util/
diff --git a/sys/ofed/drivers/infiniband/core/Makefile b/sys/ofed/drivers/infiniband/core/Makefile
new file mode 100644
index 0000000..f646040
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/Makefile
@@ -0,0 +1,32 @@
+infiniband-$(CONFIG_INFINIBAND_ADDR_TRANS) := ib_addr.o rdma_cm.o
+user_access-$(CONFIG_INFINIBAND_ADDR_TRANS) := rdma_ucm.o
+
+obj-$(CONFIG_INFINIBAND) += ib_core.o ib_mad.o ib_sa.o \
+ ib_cm.o iw_cm.o $(infiniband-y)
+obj-$(CONFIG_INFINIBAND_USER_MAD) += ib_umad.o
+obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \
+ $(user_access-y)
+
+ib_core-y := packer.o ud_header.o verbs.o sysfs.o \
+ device.o fmr_pool.o cache.o
+ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
+
+ib_mad-y := mad.o smi.o agent.o mad_rmpp.o
+
+ib_sa-y := sa_query.o multicast.o notice.o local_sa.o
+
+ib_cm-y := cm.o
+
+iw_cm-y := iwcm.o
+
+rdma_cm-y := cma.o
+
+rdma_ucm-y := ucma.o
+
+ib_addr-y := addr.o
+
+ib_umad-y := user_mad.o
+
+ib_ucm-y := ucm.o
+
+ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o
diff --git a/sys/ofed/drivers/infiniband/core/addr.c b/sys/ofed/drivers/infiniband/core/addr.c
new file mode 100644
index 0000000..cde4037
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/addr.c
@@ -0,0 +1,630 @@
+/*
+ * Copyright (c) 2005 Voltaire Inc. All rights reserved.
+ * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved.
+ * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mutex.h>
+#include <linux/inetdevice.h>
+#include <linux/workqueue.h>
+#include <net/arp.h>
+#include <net/neighbour.h>
+#include <net/route.h>
+#include <net/netevent.h>
+#include <net/addrconf.h>
+#include <net/ip6_route.h>
+#include <rdma/ib_addr.h>
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("IB Address Translation");
+MODULE_LICENSE("Dual BSD/GPL");
+
+struct addr_req {
+ struct list_head list;
+ struct sockaddr_storage src_addr;
+ struct sockaddr_storage dst_addr;
+ struct rdma_dev_addr *addr;
+ struct rdma_addr_client *client;
+ void *context;
+ void (*callback)(int status, struct sockaddr *src_addr,
+ struct rdma_dev_addr *addr, void *context);
+ unsigned long timeout;
+ int status;
+};
+
+static void process_req(struct work_struct *work);
+
+static DEFINE_MUTEX(lock);
+static LIST_HEAD(req_list);
+static struct delayed_work work;
+static struct workqueue_struct *addr_wq;
+
+void rdma_addr_register_client(struct rdma_addr_client *client)
+{
+ atomic_set(&client->refcount, 1);
+ init_completion(&client->comp);
+}
+EXPORT_SYMBOL(rdma_addr_register_client);
+
+static inline void put_client(struct rdma_addr_client *client)
+{
+ if (atomic_dec_and_test(&client->refcount))
+ complete(&client->comp);
+}
+
+void rdma_addr_unregister_client(struct rdma_addr_client *client)
+{
+ put_client(client);
+ wait_for_completion(&client->comp);
+}
+EXPORT_SYMBOL(rdma_addr_unregister_client);
+
+#ifdef __linux__
+int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev,
+ const unsigned char *dst_dev_addr)
+{
+ dev_addr->dev_type = dev->type;
+ memcpy(dev_addr->src_dev_addr, dev->dev_addr, MAX_ADDR_LEN);
+ memcpy(dev_addr->broadcast, dev->broadcast, MAX_ADDR_LEN);
+ if (dst_dev_addr)
+ memcpy(dev_addr->dst_dev_addr, dst_dev_addr, MAX_ADDR_LEN);
+ dev_addr->bound_dev_if = dev->ifindex;
+ return 0;
+}
+#else
+int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct ifnet *dev,
+ const unsigned char *dst_dev_addr)
+{
+ if (dev->if_type == IFT_INFINIBAND)
+ dev_addr->dev_type = ARPHRD_INFINIBAND;
+ else if (dev->if_type == IFT_ETHER)
+ dev_addr->dev_type = ARPHRD_ETHER;
+ else
+ dev_addr->dev_type = 0;
+ memcpy(dev_addr->src_dev_addr, IF_LLADDR(dev), dev->if_addrlen);
+ memcpy(dev_addr->broadcast, __DECONST(char *, dev->if_broadcastaddr),
+ dev->if_addrlen);
+ if (dst_dev_addr)
+ memcpy(dev_addr->dst_dev_addr, dst_dev_addr, dev->if_addrlen);
+ dev_addr->bound_dev_if = dev->if_index;
+ return 0;
+}
+#endif
+EXPORT_SYMBOL(rdma_copy_addr);
+
+int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr)
+{
+ struct net_device *dev;
+ int ret = -EADDRNOTAVAIL;
+
+ if (dev_addr->bound_dev_if) {
+ dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if);
+ if (!dev)
+ return -ENODEV;
+ ret = rdma_copy_addr(dev_addr, dev, NULL);
+ dev_put(dev);
+ return ret;
+ }
+
+ switch (addr->sa_family) {
+ case AF_INET:
+ dev = ip_dev_find(NULL,
+ ((struct sockaddr_in *) addr)->sin_addr.s_addr);
+
+ if (!dev)
+ return ret;
+
+ ret = rdma_copy_addr(dev_addr, dev, NULL);
+ dev_put(dev);
+ break;
+
+#if defined(INET6)
+ case AF_INET6:
+#ifdef __linux__
+ read_lock(&dev_base_lock);
+ for_each_netdev(&init_net, dev) {
+ if (ipv6_chk_addr(&init_net,
+ &((struct sockaddr_in6 *) addr)->sin6_addr,
+ dev, 1)) {
+ ret = rdma_copy_addr(dev_addr, dev, NULL);
+ break;
+ }
+ }
+ read_unlock(&dev_base_lock);
+#else
+ {
+ struct sockaddr_in6 *sin6;
+ struct ifaddr *ifa;
+ in_port_t port;
+
+ sin6 = (struct sockaddr_in6 *)addr;
+ port = sin6->sin6_port;
+ sin6->sin6_port = 0;
+ ifa = ifa_ifwithaddr(addr);
+ sin6->sin6_port = port;
+ if (ifa == NULL) {
+ ret = -ENODEV;
+ break;
+ }
+ ret = rdma_copy_addr(dev_addr, ifa->ifa_ifp, NULL);
+ ifa_free(ifa);
+ break;
+ }
+#endif
+ break;
+#endif
+ }
+ return ret;
+}
+EXPORT_SYMBOL(rdma_translate_ip);
+
+static void set_timeout(unsigned long time)
+{
+ unsigned long delay;
+
+ cancel_delayed_work(&work);
+
+ delay = time - jiffies;
+ if ((long)delay <= 0)
+ delay = 1;
+
+ queue_delayed_work(addr_wq, &work, delay);
+}
+
+static void queue_req(struct addr_req *req)
+{
+ struct addr_req *temp_req;
+
+ mutex_lock(&lock);
+ list_for_each_entry_reverse(temp_req, &req_list, list) {
+ if (time_after_eq(req->timeout, temp_req->timeout))
+ break;
+ }
+
+ list_add(&req->list, &temp_req->list);
+
+ if (req_list.next == &req->list)
+ set_timeout(req->timeout);
+ mutex_unlock(&lock);
+}
+
+#ifdef __linux__
+static int addr4_resolve(struct sockaddr_in *src_in,
+ struct sockaddr_in *dst_in,
+ struct rdma_dev_addr *addr)
+{
+ __be32 src_ip = src_in->sin_addr.s_addr;
+ __be32 dst_ip = dst_in->sin_addr.s_addr;
+ struct flowi fl;
+ struct rtable *rt;
+ struct neighbour *neigh;
+ int ret;
+
+ memset(&fl, 0, sizeof fl);
+ fl.nl_u.ip4_u.daddr = dst_ip;
+ fl.nl_u.ip4_u.saddr = src_ip;
+ fl.oif = addr->bound_dev_if;
+
+ ret = ip_route_output_key(&init_net, &rt, &fl);
+ if (ret)
+ goto out;
+
+ src_in->sin_family = AF_INET;
+ src_in->sin_addr.s_addr = rt->rt_src;
+
+ if (rt->idev->dev->flags & IFF_LOOPBACK) {
+ ret = rdma_translate_ip((struct sockaddr *) dst_in, addr);
+ if (!ret)
+ memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN);
+ goto put;
+ }
+
+ /* If the device does ARP internally, return 'done' */
+ if (rt->idev->dev->flags & IFF_NOARP) {
+ rdma_copy_addr(addr, rt->idev->dev, NULL);
+ goto put;
+ }
+
+ neigh = neigh_lookup(&arp_tbl, &rt->rt_gateway, rt->idev->dev);
+ if (!neigh || !(neigh->nud_state & NUD_VALID)) {
+ neigh_event_send(rt->u.dst.neighbour, NULL);
+ ret = -ENODATA;
+ if (neigh)
+ goto release;
+ goto put;
+ }
+
+ ret = rdma_copy_addr(addr, neigh->dev, neigh->ha);
+release:
+ neigh_release(neigh);
+put:
+ ip_rt_put(rt);
+out:
+ return ret;
+}
+
+#if defined(INET6)
+static int addr6_resolve(struct sockaddr_in6 *src_in,
+ struct sockaddr_in6 *dst_in,
+ struct rdma_dev_addr *addr)
+{
+ struct flowi fl;
+ struct neighbour *neigh;
+ struct dst_entry *dst;
+ int ret;
+
+ memset(&fl, 0, sizeof fl);
+ ipv6_addr_copy(&fl.fl6_dst, &dst_in->sin6_addr);
+ ipv6_addr_copy(&fl.fl6_src, &src_in->sin6_addr);
+ fl.oif = addr->bound_dev_if;
+
+ dst = ip6_route_output(&init_net, NULL, &fl);
+ if ((ret = dst->error))
+ goto put;
+
+ if (ipv6_addr_any(&fl.fl6_src)) {
+ ret = ipv6_dev_get_saddr(&init_net, ip6_dst_idev(dst)->dev,
+ &fl.fl6_dst, 0, &fl.fl6_src);
+ if (ret)
+ goto put;
+
+ src_in->sin6_family = AF_INET6;
+ ipv6_addr_copy(&src_in->sin6_addr, &fl.fl6_src);
+ }
+
+ if (dst->dev->flags & IFF_LOOPBACK) {
+ ret = rdma_translate_ip((struct sockaddr *) dst_in, addr);
+ if (!ret)
+ memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN);
+ goto put;
+ }
+
+ /* If the device does ARP internally, return 'done' */
+ if (dst->dev->flags & IFF_NOARP) {
+ ret = rdma_copy_addr(addr, dst->dev, NULL);
+ goto put;
+ }
+
+ neigh = dst->neighbour;
+ if (!neigh || !(neigh->nud_state & NUD_VALID)) {
+ neigh_event_send(dst->neighbour, NULL);
+ ret = -ENODATA;
+ goto put;
+ }
+
+ ret = rdma_copy_addr(addr, dst->dev, neigh->ha);
+put:
+ dst_release(dst);
+ return ret;
+}
+#else
+static int addr6_resolve(struct sockaddr_in6 *src_in,
+ struct sockaddr_in6 *dst_in,
+ struct rdma_dev_addr *addr)
+{
+ return -EADDRNOTAVAIL;
+}
+#endif
+
+#else
+#include <netinet/if_ether.h>
+
+static int addr_resolve(struct sockaddr *src_in,
+ struct sockaddr *dst_in,
+ struct rdma_dev_addr *addr)
+{
+ struct sockaddr_in *sin;
+ struct sockaddr_in6 *sin6;
+ struct ifaddr *ifa;
+ struct ifnet *ifp;
+ struct llentry *lle;
+ struct rtentry *rte;
+ in_port_t port;
+ u_char edst[MAX_ADDR_LEN];
+ int multi;
+ int bcast;
+ int error;
+
+ /*
+ * Determine whether the address is unicast, multicast, or broadcast
+ * and whether the source interface is valid.
+ */
+ multi = 0;
+ bcast = 0;
+ sin = NULL;
+ sin6 = NULL;
+ ifp = NULL;
+ rte = NULL;
+ switch (dst_in->sa_family) {
+ case AF_INET:
+ sin = (struct sockaddr_in *)dst_in;
+ if (sin->sin_addr.s_addr == INADDR_BROADCAST)
+ bcast = 1;
+ if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
+ multi = 1;
+ sin = (struct sockaddr_in *)src_in;
+ if (sin->sin_addr.s_addr != INADDR_ANY) {
+ /*
+ * Address comparison fails if the port is set
+ * cache it here to be restored later.
+ */
+ port = sin->sin_port;
+ sin->sin_port = 0;
+ memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
+ } else
+ src_in = NULL;
+ break;
+#ifdef INET6
+ case AF_INET6:
+ sin6 = (struct sockaddr_in6 *)dst_in;
+ if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
+ multi = 1;
+ sin6 = (struct sockaddr_in6 *)src_in;
+ if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
+ port = sin6->sin6_port;
+ sin6->sin6_port = 0;
+ } else
+ src_in = NULL;
+ break;
+#endif
+ default:
+ return -EINVAL;
+ }
+ /*
+ * If we have a source address to use look it up first and verify
+ * that it is a local interface.
+ */
+ if (src_in) {
+ ifa = ifa_ifwithaddr(src_in);
+ if (sin)
+ sin->sin_port = port;
+ if (sin6)
+ sin6->sin6_port = port;
+ if (ifa == NULL)
+ return -ENETUNREACH;
+ ifp = ifa->ifa_ifp;
+ ifa_free(ifa);
+ if (bcast || multi)
+ goto mcast;
+ }
+ /*
+ * Make sure the route exists and has a valid link.
+ */
+ rte = rtalloc1(dst_in, 1, 0);
+ if (rte == NULL || rte->rt_ifp == NULL || !RT_LINK_IS_UP(rte->rt_ifp)) {
+ if (rte)
+ RTFREE_LOCKED(rte);
+ return -EHOSTUNREACH;
+ }
+ /*
+ * If it's not multicast or broadcast and the route doesn't match the
+ * requested interface return unreachable. Otherwise fetch the
+ * correct interface pointer and unlock the route.
+ */
+ if (multi || bcast) {
+ if (ifp == NULL)
+ ifp = rte->rt_ifp;
+ RTFREE_LOCKED(rte);
+ } else if (ifp && ifp != rte->rt_ifp) {
+ RTFREE_LOCKED(rte);
+ return -ENETUNREACH;
+ } else {
+ if (ifp == NULL)
+ ifp = rte->rt_ifp;
+ RT_UNLOCK(rte);
+ }
+mcast:
+ if (bcast)
+ return rdma_copy_addr(addr, ifp, ifp->if_broadcastaddr);
+ if (multi) {
+ struct sockaddr *llsa;
+
+ error = ifp->if_resolvemulti(ifp, &llsa, dst_in);
+ if (error)
+ return -error;
+ error = rdma_copy_addr(addr, ifp,
+ LLADDR((struct sockaddr_dl *)llsa));
+ free(llsa, M_IFMADDR);
+ return error;
+ }
+ /*
+ * Resolve the link local address.
+ */
+ if (dst_in->sa_family == AF_INET)
+ error = arpresolve(ifp, rte, NULL, dst_in, edst, &lle);
+#ifdef INET6
+ else
+ error = nd6_storelladdr(ifp, NULL, dst_in, (u_char *)edst, &lle);
+#endif
+ RTFREE(rte);
+ if (error == 0)
+ return rdma_copy_addr(addr, ifp, edst);
+ if (error == EWOULDBLOCK)
+ return -ENODATA;
+ return -error;
+}
+
+#endif
+
+static void process_req(struct work_struct *work)
+{
+ struct addr_req *req, *temp_req;
+ struct sockaddr *src_in, *dst_in;
+ struct list_head done_list;
+
+ INIT_LIST_HEAD(&done_list);
+
+ mutex_lock(&lock);
+ list_for_each_entry_safe(req, temp_req, &req_list, list) {
+ if (req->status == -ENODATA) {
+ src_in = (struct sockaddr *) &req->src_addr;
+ dst_in = (struct sockaddr *) &req->dst_addr;
+ req->status = addr_resolve(src_in, dst_in, req->addr);
+ if (req->status && time_after_eq(jiffies, req->timeout))
+ req->status = -ETIMEDOUT;
+ else if (req->status == -ENODATA)
+ continue;
+ }
+ list_move_tail(&req->list, &done_list);
+ }
+
+ if (!list_empty(&req_list)) {
+ req = list_entry(req_list.next, struct addr_req, list);
+ set_timeout(req->timeout);
+ }
+ mutex_unlock(&lock);
+
+ list_for_each_entry_safe(req, temp_req, &done_list, list) {
+ list_del(&req->list);
+ req->callback(req->status, (struct sockaddr *) &req->src_addr,
+ req->addr, req->context);
+ put_client(req->client);
+ kfree(req);
+ }
+}
+
+int rdma_resolve_ip(struct rdma_addr_client *client,
+ struct sockaddr *src_addr, struct sockaddr *dst_addr,
+ struct rdma_dev_addr *addr, int timeout_ms,
+ void (*callback)(int status, struct sockaddr *src_addr,
+ struct rdma_dev_addr *addr, void *context),
+ void *context)
+{
+ struct sockaddr *src_in, *dst_in;
+ struct addr_req *req;
+ int ret = 0;
+
+ req = kzalloc(sizeof *req, GFP_KERNEL);
+ if (!req)
+ return -ENOMEM;
+
+ src_in = (struct sockaddr *) &req->src_addr;
+ dst_in = (struct sockaddr *) &req->dst_addr;
+
+ if (src_addr) {
+ if (src_addr->sa_family != dst_addr->sa_family) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ memcpy(src_in, src_addr, ip_addr_size(src_addr));
+ } else {
+ src_in->sa_family = dst_addr->sa_family;
+ }
+
+ memcpy(dst_in, dst_addr, ip_addr_size(dst_addr));
+ req->addr = addr;
+ req->callback = callback;
+ req->context = context;
+ req->client = client;
+ atomic_inc(&client->refcount);
+
+ req->status = addr_resolve(src_in, dst_in, addr);
+ switch (req->status) {
+ case 0:
+ req->timeout = jiffies;
+ queue_req(req);
+ break;
+ case -ENODATA:
+ req->timeout = msecs_to_jiffies(timeout_ms) + jiffies;
+ queue_req(req);
+ break;
+ default:
+ ret = req->status;
+ atomic_dec(&client->refcount);
+ goto err;
+ }
+ return ret;
+err:
+ kfree(req);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_resolve_ip);
+
+void rdma_addr_cancel(struct rdma_dev_addr *addr)
+{
+ struct addr_req *req, *temp_req;
+
+ mutex_lock(&lock);
+ list_for_each_entry_safe(req, temp_req, &req_list, list) {
+ if (req->addr == addr) {
+ req->status = -ECANCELED;
+ req->timeout = jiffies;
+ list_move(&req->list, &req_list);
+ set_timeout(req->timeout);
+ break;
+ }
+ }
+ mutex_unlock(&lock);
+}
+EXPORT_SYMBOL(rdma_addr_cancel);
+
+static int netevent_callback(struct notifier_block *self, unsigned long event,
+ void *ctx)
+{
+ if (event == NETEVENT_NEIGH_UPDATE) {
+#ifdef __linux__
+ struct neighbour *neigh = ctx;
+
+ if (neigh->nud_state & NUD_VALID) {
+ set_timeout(jiffies);
+ }
+#else
+ set_timeout(jiffies);
+#endif
+ }
+ return 0;
+}
+
+static struct notifier_block nb = {
+ .notifier_call = netevent_callback
+};
+
+static int addr_init(void)
+{
+ INIT_DELAYED_WORK(&work, process_req);
+ addr_wq = create_singlethread_workqueue("ib_addr");
+ if (!addr_wq)
+ return -ENOMEM;
+
+ register_netevent_notifier(&nb);
+ return 0;
+}
+
+static void addr_cleanup(void)
+{
+ unregister_netevent_notifier(&nb);
+ destroy_workqueue(addr_wq);
+}
+
+module_init(addr_init);
+module_exit(addr_cleanup);
diff --git a/sys/ofed/drivers/infiniband/core/agent.c b/sys/ofed/drivers/infiniband/core/agent.c
new file mode 100644
index 0000000..91916a8
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/agent.c
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2004, 2005 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004, 2005 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004, 2005 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004-2007 Voltaire Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include "agent.h"
+#include "smi.h"
+#include "mad_priv.h"
+
+#define SPFX "ib_agent: "
+
+struct ib_agent_port_private {
+ struct list_head port_list;
+ struct ib_mad_agent *agent[2];
+};
+
+static DEFINE_SPINLOCK(ib_agent_port_list_lock);
+static LIST_HEAD(ib_agent_port_list);
+
+static struct ib_agent_port_private *
+__ib_get_agent_port(struct ib_device *device, int port_num)
+{
+ struct ib_agent_port_private *entry;
+
+ list_for_each_entry(entry, &ib_agent_port_list, port_list) {
+ if (entry->agent[1]->device == device &&
+ entry->agent[1]->port_num == port_num)
+ return entry;
+ }
+ return NULL;
+}
+
+static struct ib_agent_port_private *
+ib_get_agent_port(struct ib_device *device, int port_num)
+{
+ struct ib_agent_port_private *entry;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ib_agent_port_list_lock, flags);
+ entry = __ib_get_agent_port(device, port_num);
+ spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
+ return entry;
+}
+
+void agent_send_response(struct ib_mad *mad, struct ib_grh *grh,
+ struct ib_wc *wc, struct ib_device *device,
+ int port_num, int qpn)
+{
+ struct ib_agent_port_private *port_priv;
+ struct ib_mad_agent *agent;
+ struct ib_mad_send_buf *send_buf;
+ struct ib_ah *ah;
+ struct ib_mad_send_wr_private *mad_send_wr;
+
+ if (device->node_type == RDMA_NODE_IB_SWITCH)
+ port_priv = ib_get_agent_port(device, 0);
+ else
+ port_priv = ib_get_agent_port(device, port_num);
+
+ if (!port_priv) {
+ printk(KERN_ERR SPFX "Unable to find port agent\n");
+ return;
+ }
+
+ agent = port_priv->agent[qpn];
+ ah = ib_create_ah_from_wc(agent->qp->pd, wc, grh, port_num);
+ if (IS_ERR(ah)) {
+ printk(KERN_ERR SPFX "ib_create_ah_from_wc error\n");
+ return;
+ }
+
+ send_buf = ib_create_send_mad(agent, wc->src_qp, wc->pkey_index, 0,
+ IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
+ GFP_KERNEL);
+ if (IS_ERR(send_buf)) {
+ printk(KERN_ERR SPFX "ib_create_send_mad error\n");
+ goto err1;
+ }
+
+ memcpy(send_buf->mad, mad, sizeof *mad);
+ send_buf->ah = ah;
+
+ if (device->node_type == RDMA_NODE_IB_SWITCH) {
+ mad_send_wr = container_of(send_buf,
+ struct ib_mad_send_wr_private,
+ send_buf);
+ mad_send_wr->send_wr.wr.ud.port_num = port_num;
+ }
+
+ if (ib_post_send_mad(send_buf, NULL)) {
+ printk(KERN_ERR SPFX "ib_post_send_mad error\n");
+ goto err2;
+ }
+ return;
+err2:
+ ib_free_send_mad(send_buf);
+err1:
+ ib_destroy_ah(ah);
+}
+
+static void agent_send_handler(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ ib_destroy_ah(mad_send_wc->send_buf->ah);
+ ib_free_send_mad(mad_send_wc->send_buf);
+}
+
+int ib_agent_port_open(struct ib_device *device, int port_num)
+{
+ struct ib_agent_port_private *port_priv;
+ unsigned long flags;
+ int ret;
+
+ /* Create new device info */
+ port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL);
+ if (!port_priv) {
+ printk(KERN_ERR SPFX "No memory for ib_agent_port_private\n");
+ ret = -ENOMEM;
+ goto error1;
+ }
+
+ if (rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_INFINIBAND) {
+ /* Obtain send only MAD agent for SMI QP */
+ port_priv->agent[0] = ib_register_mad_agent(device, port_num,
+ IB_QPT_SMI, NULL, 0,
+ &agent_send_handler,
+ NULL, NULL);
+ if (IS_ERR(port_priv->agent[0])) {
+ ret = PTR_ERR(port_priv->agent[0]);
+ goto error2;
+ }
+ }
+
+ /* Obtain send only MAD agent for GSI QP */
+ port_priv->agent[1] = ib_register_mad_agent(device, port_num,
+ IB_QPT_GSI, NULL, 0,
+ &agent_send_handler,
+ NULL, NULL);
+ if (IS_ERR(port_priv->agent[1])) {
+ ret = PTR_ERR(port_priv->agent[1]);
+ goto error3;
+ }
+
+ spin_lock_irqsave(&ib_agent_port_list_lock, flags);
+ list_add_tail(&port_priv->port_list, &ib_agent_port_list);
+ spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
+
+ return 0;
+
+error3:
+ if (port_priv->agent[0])
+ ib_unregister_mad_agent(port_priv->agent[0]);
+error2:
+ kfree(port_priv);
+error1:
+ return ret;
+}
+
+int ib_agent_port_close(struct ib_device *device, int port_num)
+{
+ struct ib_agent_port_private *port_priv;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ib_agent_port_list_lock, flags);
+ port_priv = __ib_get_agent_port(device, port_num);
+ if (port_priv == NULL) {
+ spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
+ printk(KERN_ERR SPFX "Port %d not found\n", port_num);
+ return -ENODEV;
+ }
+ list_del(&port_priv->port_list);
+ spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
+
+ ib_unregister_mad_agent(port_priv->agent[1]);
+ if (port_priv->agent[0])
+ ib_unregister_mad_agent(port_priv->agent[0]);
+
+ kfree(port_priv);
+ return 0;
+}
diff --git a/sys/ofed/drivers/infiniband/core/agent.h b/sys/ofed/drivers/infiniband/core/agent.h
new file mode 100644
index 0000000..6669287
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/agent.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __AGENT_H_
+#define __AGENT_H_
+
+#include <linux/err.h>
+#include <rdma/ib_mad.h>
+
+extern int ib_agent_port_open(struct ib_device *device, int port_num);
+
+extern int ib_agent_port_close(struct ib_device *device, int port_num);
+
+extern void agent_send_response(struct ib_mad *mad, struct ib_grh *grh,
+ struct ib_wc *wc, struct ib_device *device,
+ int port_num, int qpn);
+
+#endif /* __AGENT_H_ */
diff --git a/sys/ofed/drivers/infiniband/core/cache.c b/sys/ofed/drivers/infiniband/core/cache.c
new file mode 100644
index 0000000..660bff5
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/cache.c
@@ -0,0 +1,398 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+
+#include <rdma/ib_cache.h>
+
+#include "core_priv.h"
+
+struct ib_pkey_cache {
+ int table_len;
+ u16 table[0];
+};
+
+struct ib_gid_cache {
+ int table_len;
+ union ib_gid table[0];
+};
+
+struct ib_update_work {
+ struct work_struct work;
+ struct ib_device *device;
+ u8 port_num;
+};
+
+static inline int start_port(struct ib_device *device)
+{
+ return (device->node_type == RDMA_NODE_IB_SWITCH) ? 0 : 1;
+}
+
+static inline int end_port(struct ib_device *device)
+{
+ return (device->node_type == RDMA_NODE_IB_SWITCH) ?
+ 0 : device->phys_port_cnt;
+}
+
+int ib_get_cached_gid(struct ib_device *device,
+ u8 port_num,
+ int index,
+ union ib_gid *gid)
+{
+ struct ib_gid_cache *cache;
+ unsigned long flags;
+ int ret = 0;
+
+ if (port_num < start_port(device) || port_num > end_port(device))
+ return -EINVAL;
+
+ read_lock_irqsave(&device->cache.lock, flags);
+
+ cache = device->cache.gid_cache[port_num - start_port(device)];
+
+ if (index < 0 || index >= cache->table_len)
+ ret = -EINVAL;
+ else
+ *gid = cache->table[index];
+
+ read_unlock_irqrestore(&device->cache.lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_get_cached_gid);
+
+int ib_find_cached_gid(struct ib_device *device,
+ union ib_gid *gid,
+ u8 *port_num,
+ u16 *index)
+{
+ struct ib_gid_cache *cache;
+ unsigned long flags;
+ int p, i;
+ int ret = -ENOENT;
+
+ *port_num = -1;
+ if (index)
+ *index = -1;
+
+ read_lock_irqsave(&device->cache.lock, flags);
+
+ for (p = 0; p <= end_port(device) - start_port(device); ++p) {
+ cache = device->cache.gid_cache[p];
+ for (i = 0; i < cache->table_len; ++i) {
+ if (!memcmp(gid, &cache->table[i], sizeof *gid)) {
+ *port_num = p + start_port(device);
+ if (index)
+ *index = i;
+ ret = 0;
+ goto found;
+ }
+ }
+ }
+found:
+ read_unlock_irqrestore(&device->cache.lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_find_cached_gid);
+
+int ib_get_cached_pkey(struct ib_device *device,
+ u8 port_num,
+ int index,
+ u16 *pkey)
+{
+ struct ib_pkey_cache *cache;
+ unsigned long flags;
+ int ret = 0;
+
+ if (port_num < start_port(device) || port_num > end_port(device))
+ return -EINVAL;
+
+ read_lock_irqsave(&device->cache.lock, flags);
+
+ cache = device->cache.pkey_cache[port_num - start_port(device)];
+
+ if (index < 0 || index >= cache->table_len)
+ ret = -EINVAL;
+ else
+ *pkey = cache->table[index];
+
+ read_unlock_irqrestore(&device->cache.lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_get_cached_pkey);
+
+int ib_find_cached_pkey(struct ib_device *device,
+ u8 port_num,
+ u16 pkey,
+ u16 *index)
+{
+ struct ib_pkey_cache *cache;
+ unsigned long flags;
+ int i;
+ int ret = -ENOENT;
+
+ if (port_num < start_port(device) || port_num > end_port(device))
+ return -EINVAL;
+
+ read_lock_irqsave(&device->cache.lock, flags);
+
+ cache = device->cache.pkey_cache[port_num - start_port(device)];
+
+ *index = -1;
+
+ for (i = 0; i < cache->table_len; ++i)
+ if ((cache->table[i] & 0x7fff) == (pkey & 0x7fff)) {
+ *index = i;
+ ret = 0;
+ break;
+ }
+
+ read_unlock_irqrestore(&device->cache.lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_find_cached_pkey);
+
+int ib_get_cached_lmc(struct ib_device *device,
+ u8 port_num,
+ u8 *lmc)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ if (port_num < start_port(device) || port_num > end_port(device))
+ return -EINVAL;
+
+ read_lock_irqsave(&device->cache.lock, flags);
+ *lmc = device->cache.lmc_cache[port_num - start_port(device)];
+ read_unlock_irqrestore(&device->cache.lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_get_cached_lmc);
+
+static void ib_cache_update(struct ib_device *device,
+ u8 port)
+{
+ struct ib_port_attr *tprops = NULL;
+ struct ib_pkey_cache *pkey_cache = NULL, *old_pkey_cache;
+ struct ib_gid_cache *gid_cache = NULL, *old_gid_cache;
+ int i;
+ int ret;
+
+ tprops = kmalloc(sizeof *tprops, GFP_KERNEL);
+ if (!tprops)
+ return;
+
+ ret = ib_query_port(device, port, tprops);
+ if (ret) {
+ printk(KERN_WARNING "ib_query_port failed (%d) for %s\n",
+ ret, device->name);
+ goto err;
+ }
+
+ pkey_cache = kmalloc(sizeof *pkey_cache + tprops->pkey_tbl_len *
+ sizeof *pkey_cache->table, GFP_KERNEL);
+ if (!pkey_cache)
+ goto err;
+
+ pkey_cache->table_len = tprops->pkey_tbl_len;
+
+ gid_cache = kmalloc(sizeof *gid_cache + tprops->gid_tbl_len *
+ sizeof *gid_cache->table, GFP_KERNEL);
+ if (!gid_cache)
+ goto err;
+
+ gid_cache->table_len = tprops->gid_tbl_len;
+
+ for (i = 0; i < pkey_cache->table_len; ++i) {
+ ret = ib_query_pkey(device, port, i, pkey_cache->table + i);
+ if (ret) {
+ printk(KERN_WARNING "ib_query_pkey failed (%d) for %s (index %d)\n",
+ ret, device->name, i);
+ goto err;
+ }
+ }
+
+ for (i = 0; i < gid_cache->table_len; ++i) {
+ ret = ib_query_gid(device, port, i, gid_cache->table + i);
+ if (ret) {
+ printk(KERN_WARNING "ib_query_gid failed (%d) for %s (index %d)\n",
+ ret, device->name, i);
+ goto err;
+ }
+ }
+
+ write_lock_irq(&device->cache.lock);
+
+ old_pkey_cache = device->cache.pkey_cache[port - start_port(device)];
+ old_gid_cache = device->cache.gid_cache [port - start_port(device)];
+
+ device->cache.pkey_cache[port - start_port(device)] = pkey_cache;
+ device->cache.gid_cache [port - start_port(device)] = gid_cache;
+
+ device->cache.lmc_cache[port - start_port(device)] = tprops->lmc;
+
+ write_unlock_irq(&device->cache.lock);
+
+ kfree(old_pkey_cache);
+ kfree(old_gid_cache);
+ kfree(tprops);
+ return;
+
+err:
+ kfree(pkey_cache);
+ kfree(gid_cache);
+ kfree(tprops);
+}
+
+static void ib_cache_task(struct work_struct *_work)
+{
+ struct ib_update_work *work =
+ container_of(_work, struct ib_update_work, work);
+
+ ib_cache_update(work->device, work->port_num);
+ kfree(work);
+}
+
+static void ib_cache_event(struct ib_event_handler *handler,
+ struct ib_event *event)
+{
+ struct ib_update_work *work;
+
+ if (event->event == IB_EVENT_PORT_ERR ||
+ event->event == IB_EVENT_PORT_ACTIVE ||
+ event->event == IB_EVENT_LID_CHANGE ||
+ event->event == IB_EVENT_PKEY_CHANGE ||
+ event->event == IB_EVENT_SM_CHANGE ||
+ event->event == IB_EVENT_CLIENT_REREGISTER ||
+ event->event == IB_EVENT_GID_CHANGE) {
+ work = kmalloc(sizeof *work, GFP_ATOMIC);
+ if (work) {
+ INIT_WORK(&work->work, ib_cache_task);
+ work->device = event->device;
+ work->port_num = event->element.port_num;
+ schedule_work(&work->work);
+ }
+ }
+}
+
+static void ib_cache_setup_one(struct ib_device *device)
+{
+ int p;
+
+ rwlock_init(&device->cache.lock);
+
+ device->cache.pkey_cache =
+ kmalloc(sizeof *device->cache.pkey_cache *
+ (end_port(device) - start_port(device) + 1), GFP_KERNEL);
+ device->cache.gid_cache =
+ kmalloc(sizeof *device->cache.gid_cache *
+ (end_port(device) - start_port(device) + 1), GFP_KERNEL);
+
+ device->cache.lmc_cache = kmalloc(sizeof *device->cache.lmc_cache *
+ (end_port(device) -
+ start_port(device) + 1),
+ GFP_KERNEL);
+
+ if (!device->cache.pkey_cache || !device->cache.gid_cache ||
+ !device->cache.lmc_cache) {
+ printk(KERN_WARNING "Couldn't allocate cache "
+ "for %s\n", device->name);
+ goto err;
+ }
+
+ for (p = 0; p <= end_port(device) - start_port(device); ++p) {
+ device->cache.pkey_cache[p] = NULL;
+ device->cache.gid_cache [p] = NULL;
+ ib_cache_update(device, p + start_port(device));
+ }
+
+ INIT_IB_EVENT_HANDLER(&device->cache.event_handler,
+ device, ib_cache_event);
+ if (ib_register_event_handler(&device->cache.event_handler))
+ goto err_cache;
+
+ return;
+
+err_cache:
+ for (p = 0; p <= end_port(device) - start_port(device); ++p) {
+ kfree(device->cache.pkey_cache[p]);
+ kfree(device->cache.gid_cache[p]);
+ }
+
+err:
+ kfree(device->cache.pkey_cache);
+ kfree(device->cache.gid_cache);
+ kfree(device->cache.lmc_cache);
+}
+
+static void ib_cache_cleanup_one(struct ib_device *device)
+{
+ int p;
+
+ ib_unregister_event_handler(&device->cache.event_handler);
+ flush_scheduled_work();
+
+ for (p = 0; p <= end_port(device) - start_port(device); ++p) {
+ kfree(device->cache.pkey_cache[p]);
+ kfree(device->cache.gid_cache[p]);
+ }
+
+ kfree(device->cache.pkey_cache);
+ kfree(device->cache.gid_cache);
+ kfree(device->cache.lmc_cache);
+}
+
+static struct ib_client cache_client = {
+ .name = "cache",
+ .add = ib_cache_setup_one,
+ .remove = ib_cache_cleanup_one
+};
+
+int __init ib_cache_setup(void)
+{
+ return ib_register_client(&cache_client);
+}
+
+void __exit ib_cache_cleanup(void)
+{
+ ib_unregister_client(&cache_client);
+}
diff --git a/sys/ofed/drivers/infiniband/core/cm.c b/sys/ofed/drivers/infiniband/core/cm.c
new file mode 100644
index 0000000..24f8b12
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/cm.c
@@ -0,0 +1,3894 @@
+/*
+ * Copyright (c) 2004-2007 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004, 2005 Voltaire Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/dma-mapping.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/idr.h>
+#include <linux/interrupt.h>
+#include <linux/random.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/sysfs.h>
+#include <linux/workqueue.h>
+#include <linux/kdev_t.h>
+
+#include <rdma/ib_cache.h>
+#include <rdma/ib_cm.h>
+#include "cm_msgs.h"
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("InfiniBand CM");
+MODULE_LICENSE("Dual BSD/GPL");
+
+#define PFX "ib_cm: "
+
+/*
+ * Limit CM message timeouts to something reasonable:
+ * 8 seconds per message, with up to 15 retries
+ */
+static int max_timeout = 21;
+module_param(max_timeout, int, 0644);
+MODULE_PARM_DESC(max_timeout, "Maximum IB CM per message timeout "
+ "(default=21, or ~8 seconds)");
+
+static void cm_add_one(struct ib_device *device);
+static void cm_remove_one(struct ib_device *device);
+
+static struct ib_client cm_client = {
+ .name = "cm",
+ .add = cm_add_one,
+ .remove = cm_remove_one
+};
+
+static struct ib_cm {
+ spinlock_t lock;
+ struct list_head device_list;
+ rwlock_t device_lock;
+ struct rb_root listen_service_table;
+ u64 listen_service_id;
+ /* struct rb_root peer_service_table; todo: fix peer to peer */
+ struct rb_root remote_qp_table;
+ struct rb_root remote_id_table;
+ struct rb_root remote_sidr_table;
+ struct idr local_id_table;
+ __be32 random_id_operand;
+ struct list_head timewait_list;
+ struct workqueue_struct *wq;
+} cm;
+
+/* Counter indexes ordered by attribute ID */
+enum {
+ CM_REQ_COUNTER,
+ CM_MRA_COUNTER,
+ CM_REJ_COUNTER,
+ CM_REP_COUNTER,
+ CM_RTU_COUNTER,
+ CM_DREQ_COUNTER,
+ CM_DREP_COUNTER,
+ CM_SIDR_REQ_COUNTER,
+ CM_SIDR_REP_COUNTER,
+ CM_LAP_COUNTER,
+ CM_APR_COUNTER,
+ CM_ATTR_COUNT,
+ CM_ATTR_ID_OFFSET = 0x0010,
+};
+
+enum {
+ CM_XMIT,
+ CM_XMIT_RETRIES,
+ CM_RECV,
+ CM_RECV_DUPLICATES,
+ CM_COUNTER_GROUPS
+};
+
+static char const counter_group_names[CM_COUNTER_GROUPS]
+ [sizeof("cm_rx_duplicates")] = {
+ "cm_tx_msgs", "cm_tx_retries",
+ "cm_rx_msgs", "cm_rx_duplicates"
+};
+
+struct cm_counter_group {
+ struct kobject obj;
+ atomic_long_t counter[CM_ATTR_COUNT];
+};
+
+struct cm_counter_attribute {
+ struct attribute attr;
+ int index;
+};
+
+#define CM_COUNTER_ATTR(_name, _index) \
+struct cm_counter_attribute cm_##_name##_counter_attr = { \
+ .attr = { .name = __stringify(_name), .mode = 0444 }, \
+ .index = _index \
+}
+
+static CM_COUNTER_ATTR(req, CM_REQ_COUNTER);
+static CM_COUNTER_ATTR(mra, CM_MRA_COUNTER);
+static CM_COUNTER_ATTR(rej, CM_REJ_COUNTER);
+static CM_COUNTER_ATTR(rep, CM_REP_COUNTER);
+static CM_COUNTER_ATTR(rtu, CM_RTU_COUNTER);
+static CM_COUNTER_ATTR(dreq, CM_DREQ_COUNTER);
+static CM_COUNTER_ATTR(drep, CM_DREP_COUNTER);
+static CM_COUNTER_ATTR(sidr_req, CM_SIDR_REQ_COUNTER);
+static CM_COUNTER_ATTR(sidr_rep, CM_SIDR_REP_COUNTER);
+static CM_COUNTER_ATTR(lap, CM_LAP_COUNTER);
+static CM_COUNTER_ATTR(apr, CM_APR_COUNTER);
+
+static struct attribute *cm_counter_default_attrs[] = {
+ &cm_req_counter_attr.attr,
+ &cm_mra_counter_attr.attr,
+ &cm_rej_counter_attr.attr,
+ &cm_rep_counter_attr.attr,
+ &cm_rtu_counter_attr.attr,
+ &cm_dreq_counter_attr.attr,
+ &cm_drep_counter_attr.attr,
+ &cm_sidr_req_counter_attr.attr,
+ &cm_sidr_rep_counter_attr.attr,
+ &cm_lap_counter_attr.attr,
+ &cm_apr_counter_attr.attr,
+ NULL
+};
+
+struct cm_port {
+ struct cm_device *cm_dev;
+ struct ib_mad_agent *mad_agent;
+ struct kobject port_obj;
+ u8 port_num;
+ struct cm_counter_group counter_group[CM_COUNTER_GROUPS];
+};
+
+struct cm_device {
+ struct list_head list;
+ struct ib_device *ib_device;
+ struct device *device;
+ u8 ack_delay;
+ struct cm_port *port[0];
+};
+
+struct cm_av {
+ struct cm_port *port;
+ union ib_gid dgid;
+ struct ib_ah_attr ah_attr;
+ u16 pkey_index;
+ u8 timeout;
+};
+
+struct cm_work {
+ struct delayed_work work;
+ struct list_head list;
+ struct cm_port *port;
+ struct ib_mad_recv_wc *mad_recv_wc; /* Received MADs */
+ __be32 local_id; /* Established / timewait */
+ __be32 remote_id;
+ struct ib_cm_event cm_event;
+ struct ib_sa_path_rec path[0];
+};
+
+struct cm_timewait_info {
+ struct cm_work work; /* Must be first. */
+ struct list_head list;
+ struct rb_node remote_qp_node;
+ struct rb_node remote_id_node;
+ __be64 remote_ca_guid;
+ __be32 remote_qpn;
+ u8 inserted_remote_qp;
+ u8 inserted_remote_id;
+};
+
+struct cm_id_private {
+ struct ib_cm_id id;
+
+ struct rb_node service_node;
+ struct rb_node sidr_id_node;
+ spinlock_t lock; /* Do not acquire inside cm.lock */
+ struct completion comp;
+ atomic_t refcount;
+
+ struct ib_mad_send_buf *msg;
+ struct cm_timewait_info *timewait_info;
+ /* todo: use alternate port on send failure */
+ struct cm_av av;
+ struct cm_av alt_av;
+ struct ib_cm_compare_data *compare_data;
+
+ void *private_data;
+ __be64 tid;
+ __be32 local_qpn;
+ __be32 remote_qpn;
+ enum ib_qp_type qp_type;
+ __be32 sq_psn;
+ __be32 rq_psn;
+ int timeout_ms;
+ enum ib_mtu path_mtu;
+ __be16 pkey;
+ u8 private_data_len;
+ u8 max_cm_retries;
+ u8 peer_to_peer;
+ u8 responder_resources;
+ u8 initiator_depth;
+ u8 retry_count;
+ u8 rnr_retry_count;
+ u8 service_timeout;
+ u8 target_ack_delay;
+
+ struct list_head work_list;
+ atomic_t work_count;
+};
+
+static void cm_work_handler(struct work_struct *work);
+
+static inline void cm_deref_id(struct cm_id_private *cm_id_priv)
+{
+ if (atomic_dec_and_test(&cm_id_priv->refcount))
+ complete(&cm_id_priv->comp);
+}
+
+static int cm_alloc_msg(struct cm_id_private *cm_id_priv,
+ struct ib_mad_send_buf **msg)
+{
+ struct ib_mad_agent *mad_agent;
+ struct ib_mad_send_buf *m;
+ struct ib_ah *ah;
+
+ mad_agent = cm_id_priv->av.port->mad_agent;
+ ah = ib_create_ah(mad_agent->qp->pd, &cm_id_priv->av.ah_attr);
+ if (IS_ERR(ah))
+ return PTR_ERR(ah);
+
+ m = ib_create_send_mad(mad_agent, cm_id_priv->id.remote_cm_qpn,
+ cm_id_priv->av.pkey_index,
+ 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
+ GFP_ATOMIC);
+ if (IS_ERR(m)) {
+ ib_destroy_ah(ah);
+ return PTR_ERR(m);
+ }
+
+ /* Timeout set by caller if response is expected. */
+ m->ah = ah;
+ m->retries = cm_id_priv->max_cm_retries;
+
+ atomic_inc(&cm_id_priv->refcount);
+ m->context[0] = cm_id_priv;
+ *msg = m;
+ return 0;
+}
+
+static int cm_alloc_response_msg(struct cm_port *port,
+ struct ib_mad_recv_wc *mad_recv_wc,
+ struct ib_mad_send_buf **msg)
+{
+ struct ib_mad_send_buf *m;
+ struct ib_ah *ah;
+
+ ah = ib_create_ah_from_wc(port->mad_agent->qp->pd, mad_recv_wc->wc,
+ mad_recv_wc->recv_buf.grh, port->port_num);
+ if (IS_ERR(ah))
+ return PTR_ERR(ah);
+
+ m = ib_create_send_mad(port->mad_agent, 1, mad_recv_wc->wc->pkey_index,
+ 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
+ GFP_ATOMIC);
+ if (IS_ERR(m)) {
+ ib_destroy_ah(ah);
+ return PTR_ERR(m);
+ }
+ m->ah = ah;
+ *msg = m;
+ return 0;
+}
+
+static void cm_free_msg(struct ib_mad_send_buf *msg)
+{
+ ib_destroy_ah(msg->ah);
+ if (msg->context[0])
+ cm_deref_id(msg->context[0]);
+ ib_free_send_mad(msg);
+}
+
+static void * cm_copy_private_data(const void *private_data,
+ u8 private_data_len)
+{
+ void *data;
+
+ if (!private_data || !private_data_len)
+ return NULL;
+
+ data = kmemdup(private_data, private_data_len, GFP_KERNEL);
+ if (!data)
+ return ERR_PTR(-ENOMEM);
+
+ return data;
+}
+
+static void cm_set_private_data(struct cm_id_private *cm_id_priv,
+ void *private_data, u8 private_data_len)
+{
+ if (cm_id_priv->private_data && cm_id_priv->private_data_len)
+ kfree(cm_id_priv->private_data);
+
+ cm_id_priv->private_data = private_data;
+ cm_id_priv->private_data_len = private_data_len;
+}
+
+static void cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc,
+ struct ib_grh *grh, struct cm_av *av)
+{
+ av->port = port;
+ av->pkey_index = wc->pkey_index;
+ ib_init_ah_from_wc(port->cm_dev->ib_device, port->port_num, wc,
+ grh, &av->ah_attr);
+}
+
+static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av)
+{
+ struct cm_device *cm_dev;
+ struct cm_port *port = NULL;
+ unsigned long flags;
+ int ret;
+ u8 p;
+
+ read_lock_irqsave(&cm.device_lock, flags);
+ list_for_each_entry(cm_dev, &cm.device_list, list) {
+ if (!ib_find_cached_gid(cm_dev->ib_device, &path->sgid,
+ &p, NULL)) {
+ port = cm_dev->port[p-1];
+ break;
+ }
+ }
+ read_unlock_irqrestore(&cm.device_lock, flags);
+
+ if (!port)
+ return -EINVAL;
+
+ ret = ib_find_cached_pkey(cm_dev->ib_device, port->port_num,
+ be16_to_cpu(path->pkey), &av->pkey_index);
+ if (ret)
+ return ret;
+
+ av->port = port;
+ ib_init_ah_from_path(cm_dev->ib_device, port->port_num, path,
+ &av->ah_attr);
+ av->timeout = path->packet_life_time + 1;
+ return 0;
+}
+
+static int cm_alloc_id(struct cm_id_private *cm_id_priv)
+{
+ unsigned long flags;
+ int ret, id;
+ static int next_id;
+
+ do {
+ spin_lock_irqsave(&cm.lock, flags);
+ ret = idr_get_new_above(&cm.local_id_table, cm_id_priv,
+ next_id, &id);
+ if (!ret)
+ next_id = ((unsigned) id + 1) & MAX_ID_MASK;
+ spin_unlock_irqrestore(&cm.lock, flags);
+ } while( (ret == -EAGAIN) && idr_pre_get(&cm.local_id_table, GFP_KERNEL) );
+
+ cm_id_priv->id.local_id = (__force __be32)id ^ cm.random_id_operand;
+ return ret;
+}
+
+static void cm_free_id(__be32 local_id)
+{
+ spin_lock_irq(&cm.lock);
+ idr_remove(&cm.local_id_table,
+ (__force int) (local_id ^ cm.random_id_operand));
+ spin_unlock_irq(&cm.lock);
+}
+
+static struct cm_id_private * cm_get_id(__be32 local_id, __be32 remote_id)
+{
+ struct cm_id_private *cm_id_priv;
+
+ cm_id_priv = idr_find(&cm.local_id_table,
+ (__force int) (local_id ^ cm.random_id_operand));
+ if (cm_id_priv) {
+ if (cm_id_priv->id.remote_id == remote_id)
+ atomic_inc(&cm_id_priv->refcount);
+ else
+ cm_id_priv = NULL;
+ }
+
+ return cm_id_priv;
+}
+
+static struct cm_id_private * cm_acquire_id(__be32 local_id, __be32 remote_id)
+{
+ struct cm_id_private *cm_id_priv;
+
+ spin_lock_irq(&cm.lock);
+ cm_id_priv = cm_get_id(local_id, remote_id);
+ spin_unlock_irq(&cm.lock);
+
+ return cm_id_priv;
+}
+
+static void cm_mask_copy(u8 *dst, u8 *src, u8 *mask)
+{
+ int i;
+
+ for (i = 0; i < IB_CM_COMPARE_SIZE / sizeof(unsigned long); i++)
+ ((unsigned long *) dst)[i] = ((unsigned long *) src)[i] &
+ ((unsigned long *) mask)[i];
+}
+
+static int cm_compare_data(struct ib_cm_compare_data *src_data,
+ struct ib_cm_compare_data *dst_data)
+{
+ u8 src[IB_CM_COMPARE_SIZE];
+ u8 dst[IB_CM_COMPARE_SIZE];
+
+ if (!src_data || !dst_data)
+ return 0;
+
+ cm_mask_copy(src, src_data->data, dst_data->mask);
+ cm_mask_copy(dst, dst_data->data, src_data->mask);
+ return memcmp(src, dst, IB_CM_COMPARE_SIZE);
+}
+
+static int cm_compare_private_data(u8 *private_data,
+ struct ib_cm_compare_data *dst_data)
+{
+ u8 src[IB_CM_COMPARE_SIZE];
+
+ if (!dst_data)
+ return 0;
+
+ cm_mask_copy(src, private_data, dst_data->mask);
+ return memcmp(src, dst_data->data, IB_CM_COMPARE_SIZE);
+}
+
+/*
+ * Trivial helpers to strip endian annotation and compare; the
+ * endianness doesn't actually matter since we just need a stable
+ * order for the RB tree.
+ */
+static int be32_lt(__be32 a, __be32 b)
+{
+ return (__force u32) a < (__force u32) b;
+}
+
+static int be32_gt(__be32 a, __be32 b)
+{
+ return (__force u32) a > (__force u32) b;
+}
+
+static int be64_lt(__be64 a, __be64 b)
+{
+ return (__force u64) a < (__force u64) b;
+}
+
+static int be64_gt(__be64 a, __be64 b)
+{
+ return (__force u64) a > (__force u64) b;
+}
+
+static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)
+{
+ struct rb_node **link = &cm.listen_service_table.rb_node;
+ struct rb_node *parent = NULL;
+ struct cm_id_private *cur_cm_id_priv;
+ __be64 service_id = cm_id_priv->id.service_id;
+ __be64 service_mask = cm_id_priv->id.service_mask;
+ int data_cmp;
+
+ while (*link) {
+ parent = *link;
+ cur_cm_id_priv = rb_entry(parent, struct cm_id_private,
+ service_node);
+ data_cmp = cm_compare_data(cm_id_priv->compare_data,
+ cur_cm_id_priv->compare_data);
+ if ((cur_cm_id_priv->id.service_mask & service_id) ==
+ (service_mask & cur_cm_id_priv->id.service_id) &&
+ (cm_id_priv->id.device == cur_cm_id_priv->id.device) &&
+ !data_cmp)
+ return cur_cm_id_priv;
+
+ if (cm_id_priv->id.device < cur_cm_id_priv->id.device)
+ link = &(*link)->rb_left;
+ else if (cm_id_priv->id.device > cur_cm_id_priv->id.device)
+ link = &(*link)->rb_right;
+ else if (be64_lt(service_id, cur_cm_id_priv->id.service_id))
+ link = &(*link)->rb_left;
+ else if (be64_gt(service_id, cur_cm_id_priv->id.service_id))
+ link = &(*link)->rb_right;
+ else if (data_cmp < 0)
+ link = &(*link)->rb_left;
+ else
+ link = &(*link)->rb_right;
+ }
+ rb_link_node(&cm_id_priv->service_node, parent, link);
+ rb_insert_color(&cm_id_priv->service_node, &cm.listen_service_table);
+ return NULL;
+}
+
+static struct cm_id_private * cm_find_listen(struct ib_device *device,
+ __be64 service_id,
+ u8 *private_data)
+{
+ struct rb_node *node = cm.listen_service_table.rb_node;
+ struct cm_id_private *cm_id_priv;
+ int data_cmp;
+
+ while (node) {
+ cm_id_priv = rb_entry(node, struct cm_id_private, service_node);
+ data_cmp = cm_compare_private_data(private_data,
+ cm_id_priv->compare_data);
+ if ((cm_id_priv->id.service_mask & service_id) ==
+ cm_id_priv->id.service_id &&
+ (cm_id_priv->id.device == device) && !data_cmp)
+ return cm_id_priv;
+
+ if (device < cm_id_priv->id.device)
+ node = node->rb_left;
+ else if (device > cm_id_priv->id.device)
+ node = node->rb_right;
+ else if (be64_lt(service_id, cm_id_priv->id.service_id))
+ node = node->rb_left;
+ else if (be64_gt(service_id, cm_id_priv->id.service_id))
+ node = node->rb_right;
+ else if (data_cmp < 0)
+ node = node->rb_left;
+ else
+ node = node->rb_right;
+ }
+ return NULL;
+}
+
+static struct cm_timewait_info * cm_insert_remote_id(struct cm_timewait_info
+ *timewait_info)
+{
+ struct rb_node **link = &cm.remote_id_table.rb_node;
+ struct rb_node *parent = NULL;
+ struct cm_timewait_info *cur_timewait_info;
+ __be64 remote_ca_guid = timewait_info->remote_ca_guid;
+ __be32 remote_id = timewait_info->work.remote_id;
+
+ while (*link) {
+ parent = *link;
+ cur_timewait_info = rb_entry(parent, struct cm_timewait_info,
+ remote_id_node);
+ if (be32_lt(remote_id, cur_timewait_info->work.remote_id))
+ link = &(*link)->rb_left;
+ else if (be32_gt(remote_id, cur_timewait_info->work.remote_id))
+ link = &(*link)->rb_right;
+ else if (be64_lt(remote_ca_guid, cur_timewait_info->remote_ca_guid))
+ link = &(*link)->rb_left;
+ else if (be64_gt(remote_ca_guid, cur_timewait_info->remote_ca_guid))
+ link = &(*link)->rb_right;
+ else
+ return cur_timewait_info;
+ }
+ timewait_info->inserted_remote_id = 1;
+ rb_link_node(&timewait_info->remote_id_node, parent, link);
+ rb_insert_color(&timewait_info->remote_id_node, &cm.remote_id_table);
+ return NULL;
+}
+
+static struct cm_timewait_info * cm_find_remote_id(__be64 remote_ca_guid,
+ __be32 remote_id)
+{
+ struct rb_node *node = cm.remote_id_table.rb_node;
+ struct cm_timewait_info *timewait_info;
+
+ while (node) {
+ timewait_info = rb_entry(node, struct cm_timewait_info,
+ remote_id_node);
+ if (be32_lt(remote_id, timewait_info->work.remote_id))
+ node = node->rb_left;
+ else if (be32_gt(remote_id, timewait_info->work.remote_id))
+ node = node->rb_right;
+ else if (be64_lt(remote_ca_guid, timewait_info->remote_ca_guid))
+ node = node->rb_left;
+ else if (be64_gt(remote_ca_guid, timewait_info->remote_ca_guid))
+ node = node->rb_right;
+ else
+ return timewait_info;
+ }
+ return NULL;
+}
+
+static struct cm_timewait_info * cm_insert_remote_qpn(struct cm_timewait_info
+ *timewait_info)
+{
+ struct rb_node **link = &cm.remote_qp_table.rb_node;
+ struct rb_node *parent = NULL;
+ struct cm_timewait_info *cur_timewait_info;
+ __be64 remote_ca_guid = timewait_info->remote_ca_guid;
+ __be32 remote_qpn = timewait_info->remote_qpn;
+
+ while (*link) {
+ parent = *link;
+ cur_timewait_info = rb_entry(parent, struct cm_timewait_info,
+ remote_qp_node);
+ if (be32_lt(remote_qpn, cur_timewait_info->remote_qpn))
+ link = &(*link)->rb_left;
+ else if (be32_gt(remote_qpn, cur_timewait_info->remote_qpn))
+ link = &(*link)->rb_right;
+ else if (be64_lt(remote_ca_guid, cur_timewait_info->remote_ca_guid))
+ link = &(*link)->rb_left;
+ else if (be64_gt(remote_ca_guid, cur_timewait_info->remote_ca_guid))
+ link = &(*link)->rb_right;
+ else
+ return cur_timewait_info;
+ }
+ timewait_info->inserted_remote_qp = 1;
+ rb_link_node(&timewait_info->remote_qp_node, parent, link);
+ rb_insert_color(&timewait_info->remote_qp_node, &cm.remote_qp_table);
+ return NULL;
+}
+
+static struct cm_id_private * cm_insert_remote_sidr(struct cm_id_private
+ *cm_id_priv)
+{
+ struct rb_node **link = &cm.remote_sidr_table.rb_node;
+ struct rb_node *parent = NULL;
+ struct cm_id_private *cur_cm_id_priv;
+ union ib_gid *port_gid = &cm_id_priv->av.dgid;
+ __be32 remote_id = cm_id_priv->id.remote_id;
+
+ while (*link) {
+ parent = *link;
+ cur_cm_id_priv = rb_entry(parent, struct cm_id_private,
+ sidr_id_node);
+ if (be32_lt(remote_id, cur_cm_id_priv->id.remote_id))
+ link = &(*link)->rb_left;
+ else if (be32_gt(remote_id, cur_cm_id_priv->id.remote_id))
+ link = &(*link)->rb_right;
+ else {
+ int cmp;
+ cmp = memcmp(port_gid, &cur_cm_id_priv->av.dgid,
+ sizeof *port_gid);
+ if (cmp < 0)
+ link = &(*link)->rb_left;
+ else if (cmp > 0)
+ link = &(*link)->rb_right;
+ else
+ return cur_cm_id_priv;
+ }
+ }
+ rb_link_node(&cm_id_priv->sidr_id_node, parent, link);
+ rb_insert_color(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table);
+ return NULL;
+}
+
+static void cm_reject_sidr_req(struct cm_id_private *cm_id_priv,
+ enum ib_cm_sidr_status status)
+{
+ struct ib_cm_sidr_rep_param param;
+
+ memset(&param, 0, sizeof param);
+ param.status = status;
+ ib_send_cm_sidr_rep(&cm_id_priv->id, &param);
+}
+
+struct ib_cm_id *ib_create_cm_id(struct ib_device *device,
+ ib_cm_handler cm_handler,
+ void *context)
+{
+ struct cm_id_private *cm_id_priv;
+ int ret;
+
+ cm_id_priv = kzalloc(sizeof *cm_id_priv, GFP_KERNEL);
+ if (!cm_id_priv)
+ return ERR_PTR(-ENOMEM);
+
+ cm_id_priv->id.state = IB_CM_IDLE;
+ cm_id_priv->id.device = device;
+ cm_id_priv->id.cm_handler = cm_handler;
+ cm_id_priv->id.context = context;
+ cm_id_priv->id.remote_cm_qpn = 1;
+ ret = cm_alloc_id(cm_id_priv);
+ if (ret)
+ goto error;
+
+ spin_lock_init(&cm_id_priv->lock);
+ init_completion(&cm_id_priv->comp);
+ INIT_LIST_HEAD(&cm_id_priv->work_list);
+ atomic_set(&cm_id_priv->work_count, -1);
+ atomic_set(&cm_id_priv->refcount, 1);
+ return &cm_id_priv->id;
+
+error:
+ kfree(cm_id_priv);
+ return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL(ib_create_cm_id);
+
+static struct cm_work * cm_dequeue_work(struct cm_id_private *cm_id_priv)
+{
+ struct cm_work *work;
+
+ if (list_empty(&cm_id_priv->work_list))
+ return NULL;
+
+ work = list_entry(cm_id_priv->work_list.next, struct cm_work, list);
+ list_del(&work->list);
+ return work;
+}
+
+static void cm_free_work(struct cm_work *work)
+{
+ if (work->mad_recv_wc)
+ ib_free_recv_mad(work->mad_recv_wc);
+ kfree(work);
+}
+
+static inline int cm_convert_to_ms(int iba_time)
+{
+ /* approximate conversion to ms from 4.096us x 2^iba_time */
+ return 1 << max(iba_time - 8, 0);
+}
+
+/*
+ * calculate: 4.096x2^ack_timeout = 4.096x2^ack_delay + 2x4.096x2^life_time
+ * Because of how ack_timeout is stored, adding one doubles the timeout.
+ * To avoid large timeouts, select the max(ack_delay, life_time + 1), and
+ * increment it (round up) only if the other is within 50%.
+ */
+static u8 cm_ack_timeout(u8 ca_ack_delay, u8 packet_life_time)
+{
+ int ack_timeout = packet_life_time + 1;
+
+ if (ack_timeout >= ca_ack_delay)
+ ack_timeout += (ca_ack_delay >= (ack_timeout - 1));
+ else
+ ack_timeout = ca_ack_delay +
+ (ack_timeout >= (ca_ack_delay - 1));
+
+ return min(31, ack_timeout);
+}
+
+static void cm_cleanup_timewait(struct cm_timewait_info *timewait_info)
+{
+ if (timewait_info->inserted_remote_id) {
+ rb_erase(&timewait_info->remote_id_node, &cm.remote_id_table);
+ timewait_info->inserted_remote_id = 0;
+ }
+
+ if (timewait_info->inserted_remote_qp) {
+ rb_erase(&timewait_info->remote_qp_node, &cm.remote_qp_table);
+ timewait_info->inserted_remote_qp = 0;
+ }
+}
+
+static struct cm_timewait_info * cm_create_timewait_info(__be32 local_id)
+{
+ struct cm_timewait_info *timewait_info;
+
+ timewait_info = kzalloc(sizeof *timewait_info, GFP_KERNEL);
+ if (!timewait_info)
+ return ERR_PTR(-ENOMEM);
+
+ timewait_info->work.local_id = local_id;
+ INIT_DELAYED_WORK(&timewait_info->work.work, cm_work_handler);
+ timewait_info->work.cm_event.event = IB_CM_TIMEWAIT_EXIT;
+ return timewait_info;
+}
+
+static void cm_enter_timewait(struct cm_id_private *cm_id_priv)
+{
+ int wait_time;
+ unsigned long flags;
+
+ spin_lock_irqsave(&cm.lock, flags);
+ cm_cleanup_timewait(cm_id_priv->timewait_info);
+ list_add_tail(&cm_id_priv->timewait_info->list, &cm.timewait_list);
+ spin_unlock_irqrestore(&cm.lock, flags);
+
+ /*
+ * The cm_id could be destroyed by the user before we exit timewait.
+ * To protect against this, we search for the cm_id after exiting
+ * timewait before notifying the user that we've exited timewait.
+ */
+ cm_id_priv->id.state = IB_CM_TIMEWAIT;
+ wait_time = cm_convert_to_ms(cm_id_priv->av.timeout);
+ queue_delayed_work(cm.wq, &cm_id_priv->timewait_info->work.work,
+ msecs_to_jiffies(wait_time));
+ cm_id_priv->timewait_info = NULL;
+}
+
+static void cm_reset_to_idle(struct cm_id_private *cm_id_priv)
+{
+ unsigned long flags;
+
+ cm_id_priv->id.state = IB_CM_IDLE;
+ if (cm_id_priv->timewait_info) {
+ spin_lock_irqsave(&cm.lock, flags);
+ cm_cleanup_timewait(cm_id_priv->timewait_info);
+ spin_unlock_irqrestore(&cm.lock, flags);
+ kfree(cm_id_priv->timewait_info);
+ cm_id_priv->timewait_info = NULL;
+ }
+}
+
+static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_work *work;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+retest:
+ spin_lock_irq(&cm_id_priv->lock);
+ switch (cm_id->state) {
+ case IB_CM_LISTEN:
+ cm_id->state = IB_CM_IDLE;
+ spin_unlock_irq(&cm_id_priv->lock);
+ spin_lock_irq(&cm.lock);
+ rb_erase(&cm_id_priv->service_node, &cm.listen_service_table);
+ spin_unlock_irq(&cm.lock);
+ break;
+ case IB_CM_SIDR_REQ_SENT:
+ cm_id->state = IB_CM_IDLE;
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ spin_unlock_irq(&cm_id_priv->lock);
+ break;
+ case IB_CM_SIDR_REQ_RCVD:
+ spin_unlock_irq(&cm_id_priv->lock);
+ cm_reject_sidr_req(cm_id_priv, IB_SIDR_REJECT);
+ break;
+ case IB_CM_REQ_SENT:
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ spin_unlock_irq(&cm_id_priv->lock);
+ ib_send_cm_rej(cm_id, IB_CM_REJ_TIMEOUT,
+ &cm_id_priv->id.device->node_guid,
+ sizeof cm_id_priv->id.device->node_guid,
+ NULL, 0);
+ break;
+ case IB_CM_REQ_RCVD:
+ if (err == -ENOMEM) {
+ /* Do not reject to allow future retries. */
+ cm_reset_to_idle(cm_id_priv);
+ spin_unlock_irq(&cm_id_priv->lock);
+ } else {
+ spin_unlock_irq(&cm_id_priv->lock);
+ ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED,
+ NULL, 0, NULL, 0);
+ }
+ break;
+ case IB_CM_MRA_REQ_RCVD:
+ case IB_CM_REP_SENT:
+ case IB_CM_MRA_REP_RCVD:
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ /* Fall through */
+ case IB_CM_MRA_REQ_SENT:
+ case IB_CM_REP_RCVD:
+ case IB_CM_MRA_REP_SENT:
+ spin_unlock_irq(&cm_id_priv->lock);
+ ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED,
+ NULL, 0, NULL, 0);
+ break;
+ case IB_CM_ESTABLISHED:
+ spin_unlock_irq(&cm_id_priv->lock);
+ ib_send_cm_dreq(cm_id, NULL, 0);
+ goto retest;
+ case IB_CM_DREQ_SENT:
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ cm_enter_timewait(cm_id_priv);
+ spin_unlock_irq(&cm_id_priv->lock);
+ break;
+ case IB_CM_DREQ_RCVD:
+ spin_unlock_irq(&cm_id_priv->lock);
+ ib_send_cm_drep(cm_id, NULL, 0);
+ break;
+ default:
+ spin_unlock_irq(&cm_id_priv->lock);
+ break;
+ }
+
+ cm_free_id(cm_id->local_id);
+ cm_deref_id(cm_id_priv);
+ wait_for_completion(&cm_id_priv->comp);
+ while ((work = cm_dequeue_work(cm_id_priv)) != NULL)
+ cm_free_work(work);
+ kfree(cm_id_priv->compare_data);
+ kfree(cm_id_priv->private_data);
+ kfree(cm_id_priv);
+}
+
+void ib_destroy_cm_id(struct ib_cm_id *cm_id)
+{
+ cm_destroy_id(cm_id, 0);
+}
+EXPORT_SYMBOL(ib_destroy_cm_id);
+
+int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
+ struct ib_cm_compare_data *compare_data)
+{
+ struct cm_id_private *cm_id_priv, *cur_cm_id_priv;
+ unsigned long flags;
+ int ret = 0;
+
+ service_mask = service_mask ? service_mask : ~cpu_to_be64(0);
+ service_id &= service_mask;
+ if ((service_id & IB_SERVICE_ID_AGN_MASK) == IB_CM_ASSIGN_SERVICE_ID &&
+ (service_id != IB_CM_ASSIGN_SERVICE_ID))
+ return -EINVAL;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ if (cm_id->state != IB_CM_IDLE)
+ return -EINVAL;
+
+ if (compare_data) {
+ cm_id_priv->compare_data = kzalloc(sizeof *compare_data,
+ GFP_KERNEL);
+ if (!cm_id_priv->compare_data)
+ return -ENOMEM;
+ cm_mask_copy(cm_id_priv->compare_data->data,
+ compare_data->data, compare_data->mask);
+ memcpy(cm_id_priv->compare_data->mask, compare_data->mask,
+ IB_CM_COMPARE_SIZE);
+ }
+
+ cm_id->state = IB_CM_LISTEN;
+
+ spin_lock_irqsave(&cm.lock, flags);
+ if (service_id == IB_CM_ASSIGN_SERVICE_ID) {
+ cm_id->service_id = cpu_to_be64(cm.listen_service_id++);
+ cm_id->service_mask = ~cpu_to_be64(0);
+ } else {
+ cm_id->service_id = service_id;
+ cm_id->service_mask = service_mask;
+ }
+ cur_cm_id_priv = cm_insert_listen(cm_id_priv);
+ spin_unlock_irqrestore(&cm.lock, flags);
+
+ if (cur_cm_id_priv) {
+ cm_id->state = IB_CM_IDLE;
+ kfree(cm_id_priv->compare_data);
+ cm_id_priv->compare_data = NULL;
+ ret = -EBUSY;
+ }
+ return ret;
+}
+EXPORT_SYMBOL(ib_cm_listen);
+
+static __be64 cm_form_tid(struct cm_id_private *cm_id_priv,
+ enum cm_msg_sequence msg_seq)
+{
+ u64 hi_tid, low_tid;
+
+ hi_tid = ((u64) cm_id_priv->av.port->mad_agent->hi_tid) << 32;
+ low_tid = (u64) ((__force u32)cm_id_priv->id.local_id |
+ (msg_seq << 30));
+ return cpu_to_be64(hi_tid | low_tid);
+}
+
+static void cm_format_mad_hdr(struct ib_mad_hdr *hdr,
+ __be16 attr_id, __be64 tid)
+{
+ hdr->base_version = IB_MGMT_BASE_VERSION;
+ hdr->mgmt_class = IB_MGMT_CLASS_CM;
+ hdr->class_version = IB_CM_CLASS_VERSION;
+ hdr->method = IB_MGMT_METHOD_SEND;
+ hdr->attr_id = attr_id;
+ hdr->tid = tid;
+}
+
+static void cm_format_req(struct cm_req_msg *req_msg,
+ struct cm_id_private *cm_id_priv,
+ struct ib_cm_req_param *param)
+{
+ struct ib_sa_path_rec *pri_path = param->primary_path;
+ struct ib_sa_path_rec *alt_path = param->alternate_path;
+
+ cm_format_mad_hdr(&req_msg->hdr, CM_REQ_ATTR_ID,
+ cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_REQ));
+
+ req_msg->local_comm_id = cm_id_priv->id.local_id;
+ req_msg->service_id = param->service_id;
+ req_msg->local_ca_guid = cm_id_priv->id.device->node_guid;
+ cm_req_set_local_qpn(req_msg, cpu_to_be32(param->qp_num));
+ cm_req_set_resp_res(req_msg, param->responder_resources);
+ cm_req_set_init_depth(req_msg, param->initiator_depth);
+ cm_req_set_remote_resp_timeout(req_msg,
+ param->remote_cm_response_timeout);
+ if (param->remote_cm_response_timeout > (u8) max_timeout) {
+ printk(KERN_WARNING PFX "req remote_cm_response_timeout %d > "
+ "%d, decreasing\n", param->remote_cm_response_timeout,
+ max_timeout);
+ cm_req_set_remote_resp_timeout(req_msg, (u8) max_timeout);
+ }
+ cm_req_set_qp_type(req_msg, param->qp_type);
+ cm_req_set_flow_ctrl(req_msg, param->flow_control);
+ cm_req_set_starting_psn(req_msg, cpu_to_be32(param->starting_psn));
+ cm_req_set_local_resp_timeout(req_msg,
+ param->local_cm_response_timeout);
+ if (param->local_cm_response_timeout > (u8) max_timeout) {
+ printk(KERN_WARNING PFX "req local_cm_response_timeout %d > "
+ "%d, decreasing\n", param->local_cm_response_timeout,
+ max_timeout);
+ cm_req_set_local_resp_timeout(req_msg, (u8) max_timeout);
+ }
+ cm_req_set_retry_count(req_msg, param->retry_count);
+ req_msg->pkey = param->primary_path->pkey;
+ cm_req_set_path_mtu(req_msg, param->primary_path->mtu);
+ cm_req_set_rnr_retry_count(req_msg, param->rnr_retry_count);
+ cm_req_set_max_cm_retries(req_msg, param->max_cm_retries);
+ cm_req_set_srq(req_msg, param->srq);
+
+ if (pri_path->hop_limit <= 1) {
+ req_msg->primary_local_lid = pri_path->slid;
+ req_msg->primary_remote_lid = pri_path->dlid;
+ } else {
+ /* Work-around until there's a way to obtain remote LID info */
+ req_msg->primary_local_lid = IB_LID_PERMISSIVE;
+ req_msg->primary_remote_lid = IB_LID_PERMISSIVE;
+ }
+ req_msg->primary_local_gid = pri_path->sgid;
+ req_msg->primary_remote_gid = pri_path->dgid;
+ cm_req_set_primary_flow_label(req_msg, pri_path->flow_label);
+ cm_req_set_primary_packet_rate(req_msg, pri_path->rate);
+ req_msg->primary_traffic_class = pri_path->traffic_class;
+ req_msg->primary_hop_limit = pri_path->hop_limit;
+ cm_req_set_primary_sl(req_msg, pri_path->sl);
+ cm_req_set_primary_subnet_local(req_msg, (pri_path->hop_limit <= 1));
+ cm_req_set_primary_local_ack_timeout(req_msg,
+ cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay,
+ pri_path->packet_life_time));
+
+ if (alt_path) {
+ if (alt_path->hop_limit <= 1) {
+ req_msg->alt_local_lid = alt_path->slid;
+ req_msg->alt_remote_lid = alt_path->dlid;
+ } else {
+ req_msg->alt_local_lid = IB_LID_PERMISSIVE;
+ req_msg->alt_remote_lid = IB_LID_PERMISSIVE;
+ }
+ req_msg->alt_local_gid = alt_path->sgid;
+ req_msg->alt_remote_gid = alt_path->dgid;
+ cm_req_set_alt_flow_label(req_msg,
+ alt_path->flow_label);
+ cm_req_set_alt_packet_rate(req_msg, alt_path->rate);
+ req_msg->alt_traffic_class = alt_path->traffic_class;
+ req_msg->alt_hop_limit = alt_path->hop_limit;
+ cm_req_set_alt_sl(req_msg, alt_path->sl);
+ cm_req_set_alt_subnet_local(req_msg, (alt_path->hop_limit <= 1));
+ cm_req_set_alt_local_ack_timeout(req_msg,
+ cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay,
+ alt_path->packet_life_time));
+ }
+
+ if (param->private_data && param->private_data_len)
+ memcpy(req_msg->private_data, param->private_data,
+ param->private_data_len);
+}
+
+static int cm_validate_req_param(struct ib_cm_req_param *param)
+{
+ /* peer-to-peer not supported */
+ if (param->peer_to_peer)
+ return -EINVAL;
+
+ if (!param->primary_path)
+ return -EINVAL;
+
+ if (param->qp_type != IB_QPT_RC && param->qp_type != IB_QPT_UC)
+ return -EINVAL;
+
+ if (param->private_data &&
+ param->private_data_len > IB_CM_REQ_PRIVATE_DATA_SIZE)
+ return -EINVAL;
+
+ if (param->alternate_path &&
+ (param->alternate_path->pkey != param->primary_path->pkey ||
+ param->alternate_path->mtu != param->primary_path->mtu))
+ return -EINVAL;
+
+ return 0;
+}
+
+int ib_send_cm_req(struct ib_cm_id *cm_id,
+ struct ib_cm_req_param *param)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_req_msg *req_msg;
+ unsigned long flags;
+ int ret;
+
+ ret = cm_validate_req_param(param);
+ if (ret)
+ return ret;
+
+ /* Verify that we're not in timewait. */
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id->state != IB_CM_IDLE) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ ret = -EINVAL;
+ goto out;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv->
+ id.local_id);
+ if (IS_ERR(cm_id_priv->timewait_info)) {
+ ret = PTR_ERR(cm_id_priv->timewait_info);
+ goto out;
+ }
+
+ ret = cm_init_av_by_path(param->primary_path, &cm_id_priv->av);
+ if (ret)
+ goto error1;
+ if (param->alternate_path) {
+ ret = cm_init_av_by_path(param->alternate_path,
+ &cm_id_priv->alt_av);
+ if (ret)
+ goto error1;
+ }
+ cm_id->service_id = param->service_id;
+ cm_id->service_mask = ~cpu_to_be64(0);
+ cm_id_priv->timeout_ms = cm_convert_to_ms(
+ param->primary_path->packet_life_time) * 2 +
+ cm_convert_to_ms(
+ param->remote_cm_response_timeout);
+ if (cm_id_priv->timeout_ms > cm_convert_to_ms(max_timeout)) {
+ printk(KERN_WARNING PFX "req timeout_ms %d > %d, decreasing\n",
+ cm_id_priv->timeout_ms, cm_convert_to_ms(max_timeout));
+ cm_id_priv->timeout_ms = cm_convert_to_ms(max_timeout);
+ }
+ cm_id_priv->max_cm_retries = param->max_cm_retries;
+ cm_id_priv->initiator_depth = param->initiator_depth;
+ cm_id_priv->responder_resources = param->responder_resources;
+ cm_id_priv->retry_count = param->retry_count;
+ cm_id_priv->path_mtu = param->primary_path->mtu;
+ cm_id_priv->pkey = param->primary_path->pkey;
+ cm_id_priv->qp_type = param->qp_type;
+
+ ret = cm_alloc_msg(cm_id_priv, &cm_id_priv->msg);
+ if (ret)
+ goto error1;
+
+ req_msg = (struct cm_req_msg *) cm_id_priv->msg->mad;
+ cm_format_req(req_msg, cm_id_priv, param);
+ cm_id_priv->tid = req_msg->hdr.tid;
+ cm_id_priv->msg->timeout_ms = cm_id_priv->timeout_ms;
+ cm_id_priv->msg->context[1] = (void *) (unsigned long) IB_CM_REQ_SENT;
+
+ cm_id_priv->local_qpn = cm_req_get_local_qpn(req_msg);
+ cm_id_priv->rq_psn = cm_req_get_starting_psn(req_msg);
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ ret = ib_post_send_mad(cm_id_priv->msg, NULL);
+ if (ret) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ goto error2;
+ }
+ BUG_ON(cm_id->state != IB_CM_IDLE);
+ cm_id->state = IB_CM_REQ_SENT;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return 0;
+
+error2: cm_free_msg(cm_id_priv->msg);
+error1: kfree(cm_id_priv->timewait_info);
+out: return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_req);
+
+static int cm_issue_rej(struct cm_port *port,
+ struct ib_mad_recv_wc *mad_recv_wc,
+ enum ib_cm_rej_reason reason,
+ enum cm_msg_response msg_rejected,
+ void *ari, u8 ari_length)
+{
+ struct ib_mad_send_buf *msg = NULL;
+ struct cm_rej_msg *rej_msg, *rcv_msg;
+ int ret;
+
+ ret = cm_alloc_response_msg(port, mad_recv_wc, &msg);
+ if (ret)
+ return ret;
+
+ /* We just need common CM header information. Cast to any message. */
+ rcv_msg = (struct cm_rej_msg *) mad_recv_wc->recv_buf.mad;
+ rej_msg = (struct cm_rej_msg *) msg->mad;
+
+ cm_format_mad_hdr(&rej_msg->hdr, CM_REJ_ATTR_ID, rcv_msg->hdr.tid);
+ rej_msg->remote_comm_id = rcv_msg->local_comm_id;
+ rej_msg->local_comm_id = rcv_msg->remote_comm_id;
+ cm_rej_set_msg_rejected(rej_msg, msg_rejected);
+ rej_msg->reason = cpu_to_be16(reason);
+
+ if (ari && ari_length) {
+ cm_rej_set_reject_info_len(rej_msg, ari_length);
+ memcpy(rej_msg->ari, ari, ari_length);
+ }
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret)
+ cm_free_msg(msg);
+
+ return ret;
+}
+
+static inline int cm_is_active_peer(__be64 local_ca_guid, __be64 remote_ca_guid,
+ __be32 local_qpn, __be32 remote_qpn)
+{
+ return (be64_to_cpu(local_ca_guid) > be64_to_cpu(remote_ca_guid) ||
+ ((local_ca_guid == remote_ca_guid) &&
+ (be32_to_cpu(local_qpn) > be32_to_cpu(remote_qpn))));
+}
+
+static void cm_format_paths_from_req(struct cm_req_msg *req_msg,
+ struct ib_sa_path_rec *primary_path,
+ struct ib_sa_path_rec *alt_path)
+{
+ memset(primary_path, 0, sizeof *primary_path);
+ primary_path->dgid = req_msg->primary_local_gid;
+ primary_path->sgid = req_msg->primary_remote_gid;
+ primary_path->dlid = req_msg->primary_local_lid;
+ primary_path->slid = req_msg->primary_remote_lid;
+ primary_path->flow_label = cm_req_get_primary_flow_label(req_msg);
+ primary_path->hop_limit = req_msg->primary_hop_limit;
+ primary_path->traffic_class = req_msg->primary_traffic_class;
+ primary_path->reversible = 1;
+ primary_path->pkey = req_msg->pkey;
+ primary_path->sl = cm_req_get_primary_sl(req_msg);
+ primary_path->mtu_selector = IB_SA_EQ;
+ primary_path->mtu = cm_req_get_path_mtu(req_msg);
+ primary_path->rate_selector = IB_SA_EQ;
+ primary_path->rate = cm_req_get_primary_packet_rate(req_msg);
+ primary_path->packet_life_time_selector = IB_SA_EQ;
+ primary_path->packet_life_time =
+ cm_req_get_primary_local_ack_timeout(req_msg);
+ primary_path->packet_life_time -= (primary_path->packet_life_time > 0);
+
+ if (req_msg->alt_local_lid) {
+ memset(alt_path, 0, sizeof *alt_path);
+ alt_path->dgid = req_msg->alt_local_gid;
+ alt_path->sgid = req_msg->alt_remote_gid;
+ alt_path->dlid = req_msg->alt_local_lid;
+ alt_path->slid = req_msg->alt_remote_lid;
+ alt_path->flow_label = cm_req_get_alt_flow_label(req_msg);
+ alt_path->hop_limit = req_msg->alt_hop_limit;
+ alt_path->traffic_class = req_msg->alt_traffic_class;
+ alt_path->reversible = 1;
+ alt_path->pkey = req_msg->pkey;
+ alt_path->sl = cm_req_get_alt_sl(req_msg);
+ alt_path->mtu_selector = IB_SA_EQ;
+ alt_path->mtu = cm_req_get_path_mtu(req_msg);
+ alt_path->rate_selector = IB_SA_EQ;
+ alt_path->rate = cm_req_get_alt_packet_rate(req_msg);
+ alt_path->packet_life_time_selector = IB_SA_EQ;
+ alt_path->packet_life_time =
+ cm_req_get_alt_local_ack_timeout(req_msg);
+ alt_path->packet_life_time -= (alt_path->packet_life_time > 0);
+ }
+}
+
+static void cm_format_req_event(struct cm_work *work,
+ struct cm_id_private *cm_id_priv,
+ struct ib_cm_id *listen_id)
+{
+ struct cm_req_msg *req_msg;
+ struct ib_cm_req_event_param *param;
+
+ req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
+ param = &work->cm_event.param.req_rcvd;
+ param->listen_id = listen_id;
+ param->port = cm_id_priv->av.port->port_num;
+ param->primary_path = &work->path[0];
+ if (req_msg->alt_local_lid)
+ param->alternate_path = &work->path[1];
+ else
+ param->alternate_path = NULL;
+ param->remote_ca_guid = req_msg->local_ca_guid;
+ param->remote_qkey = be32_to_cpu(req_msg->local_qkey);
+ param->remote_qpn = be32_to_cpu(cm_req_get_local_qpn(req_msg));
+ param->qp_type = cm_req_get_qp_type(req_msg);
+ param->starting_psn = be32_to_cpu(cm_req_get_starting_psn(req_msg));
+ param->responder_resources = cm_req_get_init_depth(req_msg);
+ param->initiator_depth = cm_req_get_resp_res(req_msg);
+ param->local_cm_response_timeout =
+ cm_req_get_remote_resp_timeout(req_msg);
+ param->flow_control = cm_req_get_flow_ctrl(req_msg);
+ param->remote_cm_response_timeout =
+ cm_req_get_local_resp_timeout(req_msg);
+ param->retry_count = cm_req_get_retry_count(req_msg);
+ param->rnr_retry_count = cm_req_get_rnr_retry_count(req_msg);
+ param->srq = cm_req_get_srq(req_msg);
+ work->cm_event.private_data = &req_msg->private_data;
+}
+
+static void cm_process_work(struct cm_id_private *cm_id_priv,
+ struct cm_work *work)
+{
+ int ret;
+
+ /* We will typically only have the current event to report. */
+ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &work->cm_event);
+ cm_free_work(work);
+
+ while (!ret && !atomic_add_negative(-1, &cm_id_priv->work_count)) {
+ spin_lock_irq(&cm_id_priv->lock);
+ work = cm_dequeue_work(cm_id_priv);
+ spin_unlock_irq(&cm_id_priv->lock);
+ BUG_ON(!work);
+ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id,
+ &work->cm_event);
+ cm_free_work(work);
+ }
+ cm_deref_id(cm_id_priv);
+ if (ret)
+ cm_destroy_id(&cm_id_priv->id, ret);
+}
+
+static void cm_format_mra(struct cm_mra_msg *mra_msg,
+ struct cm_id_private *cm_id_priv,
+ enum cm_msg_response msg_mraed, u8 service_timeout,
+ const void *private_data, u8 private_data_len)
+{
+ cm_format_mad_hdr(&mra_msg->hdr, CM_MRA_ATTR_ID, cm_id_priv->tid);
+ cm_mra_set_msg_mraed(mra_msg, msg_mraed);
+ mra_msg->local_comm_id = cm_id_priv->id.local_id;
+ mra_msg->remote_comm_id = cm_id_priv->id.remote_id;
+ cm_mra_set_service_timeout(mra_msg, service_timeout);
+
+ if (private_data && private_data_len)
+ memcpy(mra_msg->private_data, private_data, private_data_len);
+}
+
+static void cm_format_rej(struct cm_rej_msg *rej_msg,
+ struct cm_id_private *cm_id_priv,
+ enum ib_cm_rej_reason reason,
+ void *ari,
+ u8 ari_length,
+ const void *private_data,
+ u8 private_data_len)
+{
+ cm_format_mad_hdr(&rej_msg->hdr, CM_REJ_ATTR_ID, cm_id_priv->tid);
+ rej_msg->remote_comm_id = cm_id_priv->id.remote_id;
+
+ switch(cm_id_priv->id.state) {
+ case IB_CM_REQ_RCVD:
+ rej_msg->local_comm_id = 0;
+ cm_rej_set_msg_rejected(rej_msg, CM_MSG_RESPONSE_REQ);
+ break;
+ case IB_CM_MRA_REQ_SENT:
+ rej_msg->local_comm_id = cm_id_priv->id.local_id;
+ cm_rej_set_msg_rejected(rej_msg, CM_MSG_RESPONSE_REQ);
+ break;
+ case IB_CM_REP_RCVD:
+ case IB_CM_MRA_REP_SENT:
+ rej_msg->local_comm_id = cm_id_priv->id.local_id;
+ cm_rej_set_msg_rejected(rej_msg, CM_MSG_RESPONSE_REP);
+ break;
+ default:
+ rej_msg->local_comm_id = cm_id_priv->id.local_id;
+ cm_rej_set_msg_rejected(rej_msg, CM_MSG_RESPONSE_OTHER);
+ break;
+ }
+
+ rej_msg->reason = cpu_to_be16(reason);
+ if (ari && ari_length) {
+ cm_rej_set_reject_info_len(rej_msg, ari_length);
+ memcpy(rej_msg->ari, ari, ari_length);
+ }
+
+ if (private_data && private_data_len)
+ memcpy(rej_msg->private_data, private_data, private_data_len);
+}
+
+static void cm_dup_req_handler(struct cm_work *work,
+ struct cm_id_private *cm_id_priv)
+{
+ struct ib_mad_send_buf *msg = NULL;
+ int ret;
+
+ atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+ counter[CM_REQ_COUNTER]);
+
+ /* Quick state check to discard duplicate REQs. */
+ if (cm_id_priv->id.state == IB_CM_REQ_RCVD)
+ return;
+
+ ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg);
+ if (ret)
+ return;
+
+ spin_lock_irq(&cm_id_priv->lock);
+ switch (cm_id_priv->id.state) {
+ case IB_CM_MRA_REQ_SENT:
+ cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
+ CM_MSG_RESPONSE_REQ, cm_id_priv->service_timeout,
+ cm_id_priv->private_data,
+ cm_id_priv->private_data_len);
+ break;
+ case IB_CM_TIMEWAIT:
+ cm_format_rej((struct cm_rej_msg *) msg->mad, cm_id_priv,
+ IB_CM_REJ_STALE_CONN, NULL, 0, NULL, 0);
+ break;
+ default:
+ goto unlock;
+ }
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret)
+ goto free;
+ return;
+
+unlock: spin_unlock_irq(&cm_id_priv->lock);
+free: cm_free_msg(msg);
+}
+
+static struct cm_id_private * cm_match_req(struct cm_work *work,
+ struct cm_id_private *cm_id_priv)
+{
+ struct cm_id_private *listen_cm_id_priv, *cur_cm_id_priv;
+ struct cm_timewait_info *timewait_info;
+ struct cm_req_msg *req_msg;
+
+ req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
+
+ /* Check for possible duplicate REQ. */
+ spin_lock_irq(&cm.lock);
+ timewait_info = cm_insert_remote_id(cm_id_priv->timewait_info);
+ if (timewait_info) {
+ cur_cm_id_priv = cm_get_id(timewait_info->work.local_id,
+ timewait_info->work.remote_id);
+ spin_unlock_irq(&cm.lock);
+ if (cur_cm_id_priv) {
+ cm_dup_req_handler(work, cur_cm_id_priv);
+ cm_deref_id(cur_cm_id_priv);
+ }
+ return NULL;
+ }
+
+ /* Check for stale connections. */
+ timewait_info = cm_insert_remote_qpn(cm_id_priv->timewait_info);
+ if (timewait_info) {
+ cm_cleanup_timewait(cm_id_priv->timewait_info);
+ spin_unlock_irq(&cm.lock);
+ cm_issue_rej(work->port, work->mad_recv_wc,
+ IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REQ,
+ NULL, 0);
+ return NULL;
+ }
+
+ /* Find matching listen request. */
+ listen_cm_id_priv = cm_find_listen(cm_id_priv->id.device,
+ req_msg->service_id,
+ req_msg->private_data);
+ if (!listen_cm_id_priv) {
+ cm_cleanup_timewait(cm_id_priv->timewait_info);
+ spin_unlock_irq(&cm.lock);
+ cm_issue_rej(work->port, work->mad_recv_wc,
+ IB_CM_REJ_INVALID_SERVICE_ID, CM_MSG_RESPONSE_REQ,
+ NULL, 0);
+ goto out;
+ }
+ atomic_inc(&listen_cm_id_priv->refcount);
+ atomic_inc(&cm_id_priv->refcount);
+ cm_id_priv->id.state = IB_CM_REQ_RCVD;
+ atomic_inc(&cm_id_priv->work_count);
+ spin_unlock_irq(&cm.lock);
+out:
+ return listen_cm_id_priv;
+}
+
+/*
+ * Work-around for inter-subnet connections. If the LIDs are permissive,
+ * we need to override the LID/SL data in the REQ with the LID information
+ * in the work completion.
+ */
+static void cm_process_routed_req(struct cm_req_msg *req_msg, struct ib_wc *wc)
+{
+ if (!cm_req_get_primary_subnet_local(req_msg)) {
+ if (req_msg->primary_local_lid == IB_LID_PERMISSIVE) {
+ req_msg->primary_local_lid = cpu_to_be16(wc->slid);
+ cm_req_set_primary_sl(req_msg, wc->sl);
+ }
+
+ if (req_msg->primary_remote_lid == IB_LID_PERMISSIVE)
+ req_msg->primary_remote_lid = cpu_to_be16(wc->dlid_path_bits);
+ }
+
+ if (!cm_req_get_alt_subnet_local(req_msg)) {
+ if (req_msg->alt_local_lid == IB_LID_PERMISSIVE) {
+ req_msg->alt_local_lid = cpu_to_be16(wc->slid);
+ cm_req_set_alt_sl(req_msg, wc->sl);
+ }
+
+ if (req_msg->alt_remote_lid == IB_LID_PERMISSIVE)
+ req_msg->alt_remote_lid = cpu_to_be16(wc->dlid_path_bits);
+ }
+}
+
+static int cm_req_handler(struct cm_work *work)
+{
+ struct ib_cm_id *cm_id;
+ struct cm_id_private *cm_id_priv, *listen_cm_id_priv;
+ struct cm_req_msg *req_msg;
+ int ret;
+
+ req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
+
+ cm_id = ib_create_cm_id(work->port->cm_dev->ib_device, NULL, NULL);
+ if (IS_ERR(cm_id))
+ return PTR_ERR(cm_id);
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ cm_id_priv->id.remote_id = req_msg->local_comm_id;
+ cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
+ work->mad_recv_wc->recv_buf.grh,
+ &cm_id_priv->av);
+ cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv->
+ id.local_id);
+ if (IS_ERR(cm_id_priv->timewait_info)) {
+ ret = PTR_ERR(cm_id_priv->timewait_info);
+ goto destroy;
+ }
+ cm_id_priv->timewait_info->work.remote_id = req_msg->local_comm_id;
+ cm_id_priv->timewait_info->remote_ca_guid = req_msg->local_ca_guid;
+ cm_id_priv->timewait_info->remote_qpn = cm_req_get_local_qpn(req_msg);
+
+ listen_cm_id_priv = cm_match_req(work, cm_id_priv);
+ if (!listen_cm_id_priv) {
+ ret = -EINVAL;
+ kfree(cm_id_priv->timewait_info);
+ goto destroy;
+ }
+
+ cm_id_priv->id.cm_handler = listen_cm_id_priv->id.cm_handler;
+ cm_id_priv->id.context = listen_cm_id_priv->id.context;
+ cm_id_priv->id.service_id = req_msg->service_id;
+ cm_id_priv->id.service_mask = ~cpu_to_be64(0);
+
+ cm_process_routed_req(req_msg, work->mad_recv_wc->wc);
+ cm_format_paths_from_req(req_msg, &work->path[0], &work->path[1]);
+ ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av);
+ if (ret) {
+ ib_get_cached_gid(work->port->cm_dev->ib_device,
+ work->port->port_num, 0, &work->path[0].sgid);
+ ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID,
+ &work->path[0].sgid, sizeof work->path[0].sgid,
+ NULL, 0);
+ goto rejected;
+ }
+ if (req_msg->alt_local_lid) {
+ ret = cm_init_av_by_path(&work->path[1], &cm_id_priv->alt_av);
+ if (ret) {
+ ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_ALT_GID,
+ &work->path[0].sgid,
+ sizeof work->path[0].sgid, NULL, 0);
+ goto rejected;
+ }
+ }
+ cm_id_priv->tid = req_msg->hdr.tid;
+ cm_id_priv->timeout_ms = cm_convert_to_ms(
+ cm_req_get_local_resp_timeout(req_msg));
+ if (cm_req_get_local_resp_timeout(req_msg) > (u8) max_timeout) {
+ printk(KERN_WARNING PFX "rcvd cm_local_resp_timeout %d > %d, "
+ "decreasing used timeout_ms\n",
+ cm_req_get_local_resp_timeout(req_msg), max_timeout);
+ cm_id_priv->timeout_ms = cm_convert_to_ms(max_timeout);
+ }
+
+ cm_id_priv->max_cm_retries = cm_req_get_max_cm_retries(req_msg);
+ cm_id_priv->remote_qpn = cm_req_get_local_qpn(req_msg);
+ cm_id_priv->initiator_depth = cm_req_get_resp_res(req_msg);
+ cm_id_priv->responder_resources = cm_req_get_init_depth(req_msg);
+ cm_id_priv->path_mtu = cm_req_get_path_mtu(req_msg);
+ cm_id_priv->pkey = req_msg->pkey;
+ cm_id_priv->sq_psn = cm_req_get_starting_psn(req_msg);
+ cm_id_priv->retry_count = cm_req_get_retry_count(req_msg);
+ cm_id_priv->rnr_retry_count = cm_req_get_rnr_retry_count(req_msg);
+ cm_id_priv->qp_type = cm_req_get_qp_type(req_msg);
+
+ cm_format_req_event(work, cm_id_priv, &listen_cm_id_priv->id);
+ cm_process_work(cm_id_priv, work);
+ cm_deref_id(listen_cm_id_priv);
+ return 0;
+
+rejected:
+ atomic_dec(&cm_id_priv->refcount);
+ cm_deref_id(listen_cm_id_priv);
+destroy:
+ ib_destroy_cm_id(cm_id);
+ return ret;
+}
+
+static void cm_format_rep(struct cm_rep_msg *rep_msg,
+ struct cm_id_private *cm_id_priv,
+ struct ib_cm_rep_param *param)
+{
+ cm_format_mad_hdr(&rep_msg->hdr, CM_REP_ATTR_ID, cm_id_priv->tid);
+ rep_msg->local_comm_id = cm_id_priv->id.local_id;
+ rep_msg->remote_comm_id = cm_id_priv->id.remote_id;
+ cm_rep_set_local_qpn(rep_msg, cpu_to_be32(param->qp_num));
+ cm_rep_set_starting_psn(rep_msg, cpu_to_be32(param->starting_psn));
+ rep_msg->resp_resources = param->responder_resources;
+ rep_msg->initiator_depth = param->initiator_depth;
+ cm_rep_set_target_ack_delay(rep_msg,
+ cm_id_priv->av.port->cm_dev->ack_delay);
+ cm_rep_set_failover(rep_msg, param->failover_accepted);
+ cm_rep_set_flow_ctrl(rep_msg, param->flow_control);
+ cm_rep_set_rnr_retry_count(rep_msg, param->rnr_retry_count);
+ cm_rep_set_srq(rep_msg, param->srq);
+ rep_msg->local_ca_guid = cm_id_priv->id.device->node_guid;
+
+ if (param->private_data && param->private_data_len)
+ memcpy(rep_msg->private_data, param->private_data,
+ param->private_data_len);
+}
+
+int ib_send_cm_rep(struct ib_cm_id *cm_id,
+ struct ib_cm_rep_param *param)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_mad_send_buf *msg;
+ struct cm_rep_msg *rep_msg;
+ unsigned long flags;
+ int ret;
+
+ if (param->private_data &&
+ param->private_data_len > IB_CM_REP_PRIVATE_DATA_SIZE)
+ return -EINVAL;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id->state != IB_CM_REQ_RCVD &&
+ cm_id->state != IB_CM_MRA_REQ_SENT) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = cm_alloc_msg(cm_id_priv, &msg);
+ if (ret)
+ goto out;
+
+ rep_msg = (struct cm_rep_msg *) msg->mad;
+ cm_format_rep(rep_msg, cm_id_priv, param);
+ msg->timeout_ms = cm_id_priv->timeout_ms;
+ msg->context[1] = (void *) (unsigned long) IB_CM_REP_SENT;
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ cm_free_msg(msg);
+ return ret;
+ }
+
+ cm_id->state = IB_CM_REP_SENT;
+ cm_id_priv->msg = msg;
+ cm_id_priv->initiator_depth = param->initiator_depth;
+ cm_id_priv->responder_resources = param->responder_resources;
+ cm_id_priv->rq_psn = cm_rep_get_starting_psn(rep_msg);
+ cm_id_priv->local_qpn = cm_rep_get_local_qpn(rep_msg);
+
+out: spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_rep);
+
+static void cm_format_rtu(struct cm_rtu_msg *rtu_msg,
+ struct cm_id_private *cm_id_priv,
+ const void *private_data,
+ u8 private_data_len)
+{
+ cm_format_mad_hdr(&rtu_msg->hdr, CM_RTU_ATTR_ID, cm_id_priv->tid);
+ rtu_msg->local_comm_id = cm_id_priv->id.local_id;
+ rtu_msg->remote_comm_id = cm_id_priv->id.remote_id;
+
+ if (private_data && private_data_len)
+ memcpy(rtu_msg->private_data, private_data, private_data_len);
+}
+
+int ib_send_cm_rtu(struct ib_cm_id *cm_id,
+ const void *private_data,
+ u8 private_data_len)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_mad_send_buf *msg;
+ unsigned long flags;
+ void *data;
+ int ret;
+
+ if (private_data && private_data_len > IB_CM_RTU_PRIVATE_DATA_SIZE)
+ return -EINVAL;
+
+ data = cm_copy_private_data(private_data, private_data_len);
+ if (IS_ERR(data))
+ return PTR_ERR(data);
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id->state != IB_CM_REP_RCVD &&
+ cm_id->state != IB_CM_MRA_REP_SENT) {
+ ret = -EINVAL;
+ goto error;
+ }
+
+ ret = cm_alloc_msg(cm_id_priv, &msg);
+ if (ret)
+ goto error;
+
+ cm_format_rtu((struct cm_rtu_msg *) msg->mad, cm_id_priv,
+ private_data, private_data_len);
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ cm_free_msg(msg);
+ kfree(data);
+ return ret;
+ }
+
+ cm_id->state = IB_CM_ESTABLISHED;
+ cm_set_private_data(cm_id_priv, data, private_data_len);
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return 0;
+
+error: spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ kfree(data);
+ return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_rtu);
+
+static void cm_format_rep_event(struct cm_work *work)
+{
+ struct cm_rep_msg *rep_msg;
+ struct ib_cm_rep_event_param *param;
+
+ rep_msg = (struct cm_rep_msg *)work->mad_recv_wc->recv_buf.mad;
+ param = &work->cm_event.param.rep_rcvd;
+ param->remote_ca_guid = rep_msg->local_ca_guid;
+ param->remote_qkey = be32_to_cpu(rep_msg->local_qkey);
+ param->remote_qpn = be32_to_cpu(cm_rep_get_local_qpn(rep_msg));
+ param->starting_psn = be32_to_cpu(cm_rep_get_starting_psn(rep_msg));
+ param->responder_resources = rep_msg->initiator_depth;
+ param->initiator_depth = rep_msg->resp_resources;
+ param->target_ack_delay = cm_rep_get_target_ack_delay(rep_msg);
+ param->failover_accepted = cm_rep_get_failover(rep_msg);
+ param->flow_control = cm_rep_get_flow_ctrl(rep_msg);
+ param->rnr_retry_count = cm_rep_get_rnr_retry_count(rep_msg);
+ param->srq = cm_rep_get_srq(rep_msg);
+ work->cm_event.private_data = &rep_msg->private_data;
+}
+
+static void cm_dup_rep_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_rep_msg *rep_msg;
+ struct ib_mad_send_buf *msg = NULL;
+ int ret;
+
+ rep_msg = (struct cm_rep_msg *) work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_id(rep_msg->remote_comm_id,
+ rep_msg->local_comm_id);
+ if (!cm_id_priv)
+ return;
+
+ atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+ counter[CM_REP_COUNTER]);
+ ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg);
+ if (ret)
+ goto deref;
+
+ spin_lock_irq(&cm_id_priv->lock);
+ if (cm_id_priv->id.state == IB_CM_ESTABLISHED)
+ cm_format_rtu((struct cm_rtu_msg *) msg->mad, cm_id_priv,
+ cm_id_priv->private_data,
+ cm_id_priv->private_data_len);
+ else if (cm_id_priv->id.state == IB_CM_MRA_REP_SENT)
+ cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
+ CM_MSG_RESPONSE_REP, cm_id_priv->service_timeout,
+ cm_id_priv->private_data,
+ cm_id_priv->private_data_len);
+ else
+ goto unlock;
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret)
+ goto free;
+ goto deref;
+
+unlock: spin_unlock_irq(&cm_id_priv->lock);
+free: cm_free_msg(msg);
+deref: cm_deref_id(cm_id_priv);
+}
+
+static int cm_rep_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_rep_msg *rep_msg;
+ int ret;
+
+ rep_msg = (struct cm_rep_msg *)work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_id(rep_msg->remote_comm_id, 0);
+ if (!cm_id_priv) {
+ cm_dup_rep_handler(work);
+ return -EINVAL;
+ }
+
+ cm_format_rep_event(work);
+
+ spin_lock_irq(&cm_id_priv->lock);
+ switch (cm_id_priv->id.state) {
+ case IB_CM_REQ_SENT:
+ case IB_CM_MRA_REQ_RCVD:
+ break;
+ default:
+ spin_unlock_irq(&cm_id_priv->lock);
+ ret = -EINVAL;
+ goto error;
+ }
+
+ cm_id_priv->timewait_info->work.remote_id = rep_msg->local_comm_id;
+ cm_id_priv->timewait_info->remote_ca_guid = rep_msg->local_ca_guid;
+ cm_id_priv->timewait_info->remote_qpn = cm_rep_get_local_qpn(rep_msg);
+
+ spin_lock(&cm.lock);
+ /* Check for duplicate REP. */
+ if (cm_insert_remote_id(cm_id_priv->timewait_info)) {
+ spin_unlock(&cm.lock);
+ spin_unlock_irq(&cm_id_priv->lock);
+ ret = -EINVAL;
+ goto error;
+ }
+ /* Check for a stale connection. */
+ if (cm_insert_remote_qpn(cm_id_priv->timewait_info)) {
+ rb_erase(&cm_id_priv->timewait_info->remote_id_node,
+ &cm.remote_id_table);
+ cm_id_priv->timewait_info->inserted_remote_id = 0;
+ spin_unlock(&cm.lock);
+ spin_unlock_irq(&cm_id_priv->lock);
+ cm_issue_rej(work->port, work->mad_recv_wc,
+ IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REP,
+ NULL, 0);
+ ret = -EINVAL;
+ goto error;
+ }
+ spin_unlock(&cm.lock);
+
+ cm_id_priv->id.state = IB_CM_REP_RCVD;
+ cm_id_priv->id.remote_id = rep_msg->local_comm_id;
+ cm_id_priv->remote_qpn = cm_rep_get_local_qpn(rep_msg);
+ cm_id_priv->initiator_depth = rep_msg->resp_resources;
+ cm_id_priv->responder_resources = rep_msg->initiator_depth;
+ cm_id_priv->sq_psn = cm_rep_get_starting_psn(rep_msg);
+ cm_id_priv->rnr_retry_count = cm_rep_get_rnr_retry_count(rep_msg);
+ cm_id_priv->target_ack_delay = cm_rep_get_target_ack_delay(rep_msg);
+ cm_id_priv->av.timeout =
+ cm_ack_timeout(cm_id_priv->target_ack_delay,
+ cm_id_priv->av.timeout - 1);
+ cm_id_priv->alt_av.timeout =
+ cm_ack_timeout(cm_id_priv->target_ack_delay,
+ cm_id_priv->alt_av.timeout - 1);
+
+ /* todo: handle peer_to_peer */
+
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ ret = atomic_inc_and_test(&cm_id_priv->work_count);
+ if (!ret)
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (ret)
+ cm_process_work(cm_id_priv, work);
+ else
+ cm_deref_id(cm_id_priv);
+ return 0;
+
+error:
+ cm_deref_id(cm_id_priv);
+ return ret;
+}
+
+static int cm_establish_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ int ret;
+
+ /* See comment in cm_establish about lookup. */
+ cm_id_priv = cm_acquire_id(work->local_id, work->remote_id);
+ if (!cm_id_priv)
+ return -EINVAL;
+
+ spin_lock_irq(&cm_id_priv->lock);
+ if (cm_id_priv->id.state != IB_CM_ESTABLISHED) {
+ spin_unlock_irq(&cm_id_priv->lock);
+ goto out;
+ }
+
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ ret = atomic_inc_and_test(&cm_id_priv->work_count);
+ if (!ret)
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (ret)
+ cm_process_work(cm_id_priv, work);
+ else
+ cm_deref_id(cm_id_priv);
+ return 0;
+out:
+ cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+static int cm_rtu_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_rtu_msg *rtu_msg;
+ int ret;
+
+ rtu_msg = (struct cm_rtu_msg *)work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_id(rtu_msg->remote_comm_id,
+ rtu_msg->local_comm_id);
+ if (!cm_id_priv)
+ return -EINVAL;
+
+ work->cm_event.private_data = &rtu_msg->private_data;
+
+ spin_lock_irq(&cm_id_priv->lock);
+ if (cm_id_priv->id.state != IB_CM_REP_SENT &&
+ cm_id_priv->id.state != IB_CM_MRA_REP_RCVD) {
+ spin_unlock_irq(&cm_id_priv->lock);
+ atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+ counter[CM_RTU_COUNTER]);
+ goto out;
+ }
+ cm_id_priv->id.state = IB_CM_ESTABLISHED;
+
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ ret = atomic_inc_and_test(&cm_id_priv->work_count);
+ if (!ret)
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (ret)
+ cm_process_work(cm_id_priv, work);
+ else
+ cm_deref_id(cm_id_priv);
+ return 0;
+out:
+ cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+static void cm_format_dreq(struct cm_dreq_msg *dreq_msg,
+ struct cm_id_private *cm_id_priv,
+ const void *private_data,
+ u8 private_data_len)
+{
+ cm_format_mad_hdr(&dreq_msg->hdr, CM_DREQ_ATTR_ID,
+ cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_DREQ));
+ dreq_msg->local_comm_id = cm_id_priv->id.local_id;
+ dreq_msg->remote_comm_id = cm_id_priv->id.remote_id;
+ cm_dreq_set_remote_qpn(dreq_msg, cm_id_priv->remote_qpn);
+
+ if (private_data && private_data_len)
+ memcpy(dreq_msg->private_data, private_data, private_data_len);
+}
+
+int ib_send_cm_dreq(struct ib_cm_id *cm_id,
+ const void *private_data,
+ u8 private_data_len)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_mad_send_buf *msg;
+ unsigned long flags;
+ int ret;
+
+ if (private_data && private_data_len > IB_CM_DREQ_PRIVATE_DATA_SIZE)
+ return -EINVAL;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id->state != IB_CM_ESTABLISHED) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = cm_alloc_msg(cm_id_priv, &msg);
+ if (ret) {
+ cm_enter_timewait(cm_id_priv);
+ goto out;
+ }
+
+ cm_format_dreq((struct cm_dreq_msg *) msg->mad, cm_id_priv,
+ private_data, private_data_len);
+ msg->timeout_ms = cm_id_priv->timeout_ms;
+ msg->context[1] = (void *) (unsigned long) IB_CM_DREQ_SENT;
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret) {
+ cm_enter_timewait(cm_id_priv);
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ cm_free_msg(msg);
+ return ret;
+ }
+
+ cm_id->state = IB_CM_DREQ_SENT;
+ cm_id_priv->msg = msg;
+out: spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_dreq);
+
+static void cm_format_drep(struct cm_drep_msg *drep_msg,
+ struct cm_id_private *cm_id_priv,
+ const void *private_data,
+ u8 private_data_len)
+{
+ cm_format_mad_hdr(&drep_msg->hdr, CM_DREP_ATTR_ID, cm_id_priv->tid);
+ drep_msg->local_comm_id = cm_id_priv->id.local_id;
+ drep_msg->remote_comm_id = cm_id_priv->id.remote_id;
+
+ if (private_data && private_data_len)
+ memcpy(drep_msg->private_data, private_data, private_data_len);
+}
+
+int ib_send_cm_drep(struct ib_cm_id *cm_id,
+ const void *private_data,
+ u8 private_data_len)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_mad_send_buf *msg;
+ unsigned long flags;
+ void *data;
+ int ret;
+
+ if (private_data && private_data_len > IB_CM_DREP_PRIVATE_DATA_SIZE)
+ return -EINVAL;
+
+ data = cm_copy_private_data(private_data, private_data_len);
+ if (IS_ERR(data))
+ return PTR_ERR(data);
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id->state != IB_CM_DREQ_RCVD) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ kfree(data);
+ return -EINVAL;
+ }
+
+ cm_set_private_data(cm_id_priv, data, private_data_len);
+ cm_enter_timewait(cm_id_priv);
+
+ ret = cm_alloc_msg(cm_id_priv, &msg);
+ if (ret)
+ goto out;
+
+ cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv,
+ private_data, private_data_len);
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ cm_free_msg(msg);
+ return ret;
+ }
+
+out: spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_drep);
+
+static int cm_issue_drep(struct cm_port *port,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_mad_send_buf *msg = NULL;
+ struct cm_dreq_msg *dreq_msg;
+ struct cm_drep_msg *drep_msg;
+ int ret;
+
+ ret = cm_alloc_response_msg(port, mad_recv_wc, &msg);
+ if (ret)
+ return ret;
+
+ dreq_msg = (struct cm_dreq_msg *) mad_recv_wc->recv_buf.mad;
+ drep_msg = (struct cm_drep_msg *) msg->mad;
+
+ cm_format_mad_hdr(&drep_msg->hdr, CM_DREP_ATTR_ID, dreq_msg->hdr.tid);
+ drep_msg->remote_comm_id = dreq_msg->local_comm_id;
+ drep_msg->local_comm_id = dreq_msg->remote_comm_id;
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret)
+ cm_free_msg(msg);
+
+ return ret;
+}
+
+static int cm_dreq_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_dreq_msg *dreq_msg;
+ struct ib_mad_send_buf *msg = NULL;
+ int ret;
+
+ dreq_msg = (struct cm_dreq_msg *)work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_id(dreq_msg->remote_comm_id,
+ dreq_msg->local_comm_id);
+ if (!cm_id_priv) {
+ atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+ counter[CM_DREQ_COUNTER]);
+ cm_issue_drep(work->port, work->mad_recv_wc);
+ return -EINVAL;
+ }
+
+ work->cm_event.private_data = &dreq_msg->private_data;
+
+ spin_lock_irq(&cm_id_priv->lock);
+ if (cm_id_priv->local_qpn != cm_dreq_get_remote_qpn(dreq_msg))
+ goto unlock;
+
+ switch (cm_id_priv->id.state) {
+ case IB_CM_REP_SENT:
+ case IB_CM_DREQ_SENT:
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ break;
+ case IB_CM_ESTABLISHED:
+ case IB_CM_MRA_REP_RCVD:
+ break;
+ case IB_CM_TIMEWAIT:
+ atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+ counter[CM_DREQ_COUNTER]);
+ if (cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg))
+ goto unlock;
+
+ cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv,
+ cm_id_priv->private_data,
+ cm_id_priv->private_data_len);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (ib_post_send_mad(msg, NULL))
+ cm_free_msg(msg);
+ goto deref;
+ case IB_CM_DREQ_RCVD:
+ atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+ counter[CM_DREQ_COUNTER]);
+ goto unlock;
+ default:
+ goto unlock;
+ }
+ cm_id_priv->id.state = IB_CM_DREQ_RCVD;
+ cm_id_priv->tid = dreq_msg->hdr.tid;
+ ret = atomic_inc_and_test(&cm_id_priv->work_count);
+ if (!ret)
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (ret)
+ cm_process_work(cm_id_priv, work);
+ else
+ cm_deref_id(cm_id_priv);
+ return 0;
+
+unlock: spin_unlock_irq(&cm_id_priv->lock);
+deref: cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+static int cm_drep_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_drep_msg *drep_msg;
+ int ret;
+
+ drep_msg = (struct cm_drep_msg *)work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_id(drep_msg->remote_comm_id,
+ drep_msg->local_comm_id);
+ if (!cm_id_priv)
+ return -EINVAL;
+
+ work->cm_event.private_data = &drep_msg->private_data;
+
+ spin_lock_irq(&cm_id_priv->lock);
+ if (cm_id_priv->id.state != IB_CM_DREQ_SENT &&
+ cm_id_priv->id.state != IB_CM_DREQ_RCVD) {
+ spin_unlock_irq(&cm_id_priv->lock);
+ goto out;
+ }
+ cm_enter_timewait(cm_id_priv);
+
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ ret = atomic_inc_and_test(&cm_id_priv->work_count);
+ if (!ret)
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (ret)
+ cm_process_work(cm_id_priv, work);
+ else
+ cm_deref_id(cm_id_priv);
+ return 0;
+out:
+ cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+int ib_send_cm_rej(struct ib_cm_id *cm_id,
+ enum ib_cm_rej_reason reason,
+ void *ari,
+ u8 ari_length,
+ const void *private_data,
+ u8 private_data_len)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_mad_send_buf *msg;
+ unsigned long flags;
+ int ret;
+
+ if ((private_data && private_data_len > IB_CM_REJ_PRIVATE_DATA_SIZE) ||
+ (ari && ari_length > IB_CM_REJ_ARI_LENGTH))
+ return -EINVAL;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id->state) {
+ case IB_CM_REQ_SENT:
+ case IB_CM_MRA_REQ_RCVD:
+ case IB_CM_REQ_RCVD:
+ case IB_CM_MRA_REQ_SENT:
+ case IB_CM_REP_RCVD:
+ case IB_CM_MRA_REP_SENT:
+ ret = cm_alloc_msg(cm_id_priv, &msg);
+ if (!ret)
+ cm_format_rej((struct cm_rej_msg *) msg->mad,
+ cm_id_priv, reason, ari, ari_length,
+ private_data, private_data_len);
+
+ cm_reset_to_idle(cm_id_priv);
+ break;
+ case IB_CM_REP_SENT:
+ case IB_CM_MRA_REP_RCVD:
+ ret = cm_alloc_msg(cm_id_priv, &msg);
+ if (!ret)
+ cm_format_rej((struct cm_rej_msg *) msg->mad,
+ cm_id_priv, reason, ari, ari_length,
+ private_data, private_data_len);
+
+ cm_enter_timewait(cm_id_priv);
+ break;
+ default:
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (ret)
+ goto out;
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret)
+ cm_free_msg(msg);
+
+out: spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_rej);
+
+static void cm_format_rej_event(struct cm_work *work)
+{
+ struct cm_rej_msg *rej_msg;
+ struct ib_cm_rej_event_param *param;
+
+ rej_msg = (struct cm_rej_msg *)work->mad_recv_wc->recv_buf.mad;
+ param = &work->cm_event.param.rej_rcvd;
+ param->ari = rej_msg->ari;
+ param->ari_length = cm_rej_get_reject_info_len(rej_msg);
+ param->reason = __be16_to_cpu(rej_msg->reason);
+ work->cm_event.private_data = &rej_msg->private_data;
+}
+
+static struct cm_id_private * cm_acquire_rejected_id(struct cm_rej_msg *rej_msg)
+{
+ struct cm_timewait_info *timewait_info;
+ struct cm_id_private *cm_id_priv;
+ __be32 remote_id;
+
+ remote_id = rej_msg->local_comm_id;
+
+ if (__be16_to_cpu(rej_msg->reason) == IB_CM_REJ_TIMEOUT) {
+ spin_lock_irq(&cm.lock);
+ timewait_info = cm_find_remote_id( *((__be64 *) rej_msg->ari),
+ remote_id);
+ if (!timewait_info) {
+ spin_unlock_irq(&cm.lock);
+ return NULL;
+ }
+ cm_id_priv = idr_find(&cm.local_id_table, (__force int)
+ (timewait_info->work.local_id ^
+ cm.random_id_operand));
+ if (cm_id_priv) {
+ if (cm_id_priv->id.remote_id == remote_id)
+ atomic_inc(&cm_id_priv->refcount);
+ else
+ cm_id_priv = NULL;
+ }
+ spin_unlock_irq(&cm.lock);
+ } else if (cm_rej_get_msg_rejected(rej_msg) == CM_MSG_RESPONSE_REQ)
+ cm_id_priv = cm_acquire_id(rej_msg->remote_comm_id, 0);
+ else
+ cm_id_priv = cm_acquire_id(rej_msg->remote_comm_id, remote_id);
+
+ return cm_id_priv;
+}
+
+static int cm_rej_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_rej_msg *rej_msg;
+ int ret;
+
+ rej_msg = (struct cm_rej_msg *)work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_rejected_id(rej_msg);
+ if (!cm_id_priv)
+ return -EINVAL;
+
+ cm_format_rej_event(work);
+
+ spin_lock_irq(&cm_id_priv->lock);
+ switch (cm_id_priv->id.state) {
+ case IB_CM_REQ_SENT:
+ case IB_CM_MRA_REQ_RCVD:
+ case IB_CM_REP_SENT:
+ case IB_CM_MRA_REP_RCVD:
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ /* fall through */
+ case IB_CM_REQ_RCVD:
+ case IB_CM_MRA_REQ_SENT:
+ if (__be16_to_cpu(rej_msg->reason) == IB_CM_REJ_STALE_CONN)
+ cm_enter_timewait(cm_id_priv);
+ else
+ cm_reset_to_idle(cm_id_priv);
+ break;
+ case IB_CM_DREQ_SENT:
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ /* fall through */
+ case IB_CM_REP_RCVD:
+ case IB_CM_MRA_REP_SENT:
+ case IB_CM_ESTABLISHED:
+ cm_enter_timewait(cm_id_priv);
+ break;
+ default:
+ spin_unlock_irq(&cm_id_priv->lock);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = atomic_inc_and_test(&cm_id_priv->work_count);
+ if (!ret)
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (ret)
+ cm_process_work(cm_id_priv, work);
+ else
+ cm_deref_id(cm_id_priv);
+ return 0;
+out:
+ cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+int ib_send_cm_mra(struct ib_cm_id *cm_id,
+ u8 service_timeout,
+ const void *private_data,
+ u8 private_data_len)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_mad_send_buf *msg;
+ enum ib_cm_state cm_state;
+ enum ib_cm_lap_state lap_state;
+ enum cm_msg_response msg_response;
+ void *data;
+ unsigned long flags;
+ int ret;
+
+ if (private_data && private_data_len > IB_CM_MRA_PRIVATE_DATA_SIZE)
+ return -EINVAL;
+
+ data = cm_copy_private_data(private_data, private_data_len);
+ if (IS_ERR(data))
+ return PTR_ERR(data);
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch(cm_id_priv->id.state) {
+ case IB_CM_REQ_RCVD:
+ cm_state = IB_CM_MRA_REQ_SENT;
+ lap_state = cm_id->lap_state;
+ msg_response = CM_MSG_RESPONSE_REQ;
+ break;
+ case IB_CM_REP_RCVD:
+ cm_state = IB_CM_MRA_REP_SENT;
+ lap_state = cm_id->lap_state;
+ msg_response = CM_MSG_RESPONSE_REP;
+ break;
+ case IB_CM_ESTABLISHED:
+ if (cm_id->lap_state == IB_CM_LAP_RCVD) {
+ cm_state = cm_id->state;
+ lap_state = IB_CM_MRA_LAP_SENT;
+ msg_response = CM_MSG_RESPONSE_OTHER;
+ break;
+ }
+ default:
+ ret = -EINVAL;
+ goto error1;
+ }
+
+ if (!(service_timeout & IB_CM_MRA_FLAG_DELAY)) {
+ ret = cm_alloc_msg(cm_id_priv, &msg);
+ if (ret)
+ goto error1;
+
+ cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
+ msg_response, service_timeout,
+ private_data, private_data_len);
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret)
+ goto error2;
+ }
+
+ cm_id->state = cm_state;
+ cm_id->lap_state = lap_state;
+ cm_id_priv->service_timeout = service_timeout;
+ cm_set_private_data(cm_id_priv, data, private_data_len);
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return 0;
+
+error1: spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ kfree(data);
+ return ret;
+
+error2: spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ kfree(data);
+ cm_free_msg(msg);
+ return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_mra);
+
+static struct cm_id_private * cm_acquire_mraed_id(struct cm_mra_msg *mra_msg)
+{
+ switch (cm_mra_get_msg_mraed(mra_msg)) {
+ case CM_MSG_RESPONSE_REQ:
+ return cm_acquire_id(mra_msg->remote_comm_id, 0);
+ case CM_MSG_RESPONSE_REP:
+ case CM_MSG_RESPONSE_OTHER:
+ return cm_acquire_id(mra_msg->remote_comm_id,
+ mra_msg->local_comm_id);
+ default:
+ return NULL;
+ }
+}
+
+static int cm_mra_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_mra_msg *mra_msg;
+ int timeout, ret;
+
+ mra_msg = (struct cm_mra_msg *)work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_mraed_id(mra_msg);
+ if (!cm_id_priv)
+ return -EINVAL;
+
+ work->cm_event.private_data = &mra_msg->private_data;
+ work->cm_event.param.mra_rcvd.service_timeout =
+ cm_mra_get_service_timeout(mra_msg);
+ timeout = cm_convert_to_ms(cm_mra_get_service_timeout(mra_msg)) +
+ cm_convert_to_ms(cm_id_priv->av.timeout);
+ if (timeout > cm_convert_to_ms(max_timeout)) {
+ printk(KERN_WARNING PFX "calculated mra timeout %d > %d, "
+ "decreasing used timeout_ms\n", timeout,
+ cm_convert_to_ms(max_timeout));
+ timeout = cm_convert_to_ms(max_timeout);
+ }
+
+ spin_lock_irq(&cm_id_priv->lock);
+ switch (cm_id_priv->id.state) {
+ case IB_CM_REQ_SENT:
+ if (cm_mra_get_msg_mraed(mra_msg) != CM_MSG_RESPONSE_REQ ||
+ ib_modify_mad(cm_id_priv->av.port->mad_agent,
+ cm_id_priv->msg, timeout))
+ goto out;
+ cm_id_priv->id.state = IB_CM_MRA_REQ_RCVD;
+ break;
+ case IB_CM_REP_SENT:
+ if (cm_mra_get_msg_mraed(mra_msg) != CM_MSG_RESPONSE_REP ||
+ ib_modify_mad(cm_id_priv->av.port->mad_agent,
+ cm_id_priv->msg, timeout))
+ goto out;
+ cm_id_priv->id.state = IB_CM_MRA_REP_RCVD;
+ break;
+ case IB_CM_ESTABLISHED:
+ if (cm_mra_get_msg_mraed(mra_msg) != CM_MSG_RESPONSE_OTHER ||
+ cm_id_priv->id.lap_state != IB_CM_LAP_SENT ||
+ ib_modify_mad(cm_id_priv->av.port->mad_agent,
+ cm_id_priv->msg, timeout)) {
+ if (cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD)
+ atomic_long_inc(&work->port->
+ counter_group[CM_RECV_DUPLICATES].
+ counter[CM_MRA_COUNTER]);
+ goto out;
+ }
+ cm_id_priv->id.lap_state = IB_CM_MRA_LAP_RCVD;
+ break;
+ case IB_CM_MRA_REQ_RCVD:
+ case IB_CM_MRA_REP_RCVD:
+ atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+ counter[CM_MRA_COUNTER]);
+ /* fall through */
+ default:
+ goto out;
+ }
+
+ cm_id_priv->msg->context[1] = (void *) (unsigned long)
+ cm_id_priv->id.state;
+ ret = atomic_inc_and_test(&cm_id_priv->work_count);
+ if (!ret)
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (ret)
+ cm_process_work(cm_id_priv, work);
+ else
+ cm_deref_id(cm_id_priv);
+ return 0;
+out:
+ spin_unlock_irq(&cm_id_priv->lock);
+ cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+static void cm_format_lap(struct cm_lap_msg *lap_msg,
+ struct cm_id_private *cm_id_priv,
+ struct ib_sa_path_rec *alternate_path,
+ const void *private_data,
+ u8 private_data_len)
+{
+ cm_format_mad_hdr(&lap_msg->hdr, CM_LAP_ATTR_ID,
+ cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_LAP));
+ lap_msg->local_comm_id = cm_id_priv->id.local_id;
+ lap_msg->remote_comm_id = cm_id_priv->id.remote_id;
+ cm_lap_set_remote_qpn(lap_msg, cm_id_priv->remote_qpn);
+ /* todo: need remote CM response timeout */
+ cm_lap_set_remote_resp_timeout(lap_msg, 0x1F);
+ lap_msg->alt_local_lid = alternate_path->slid;
+ lap_msg->alt_remote_lid = alternate_path->dlid;
+ lap_msg->alt_local_gid = alternate_path->sgid;
+ lap_msg->alt_remote_gid = alternate_path->dgid;
+ cm_lap_set_flow_label(lap_msg, alternate_path->flow_label);
+ cm_lap_set_traffic_class(lap_msg, alternate_path->traffic_class);
+ lap_msg->alt_hop_limit = alternate_path->hop_limit;
+ cm_lap_set_packet_rate(lap_msg, alternate_path->rate);
+ cm_lap_set_sl(lap_msg, alternate_path->sl);
+ cm_lap_set_subnet_local(lap_msg, 1); /* local only... */
+ cm_lap_set_local_ack_timeout(lap_msg,
+ cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay,
+ alternate_path->packet_life_time));
+
+ if (private_data && private_data_len)
+ memcpy(lap_msg->private_data, private_data, private_data_len);
+}
+
+int ib_send_cm_lap(struct ib_cm_id *cm_id,
+ struct ib_sa_path_rec *alternate_path,
+ const void *private_data,
+ u8 private_data_len)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_mad_send_buf *msg;
+ unsigned long flags;
+ int ret;
+
+ if (private_data && private_data_len > IB_CM_LAP_PRIVATE_DATA_SIZE)
+ return -EINVAL;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id->state != IB_CM_ESTABLISHED ||
+ (cm_id->lap_state != IB_CM_LAP_UNINIT &&
+ cm_id->lap_state != IB_CM_LAP_IDLE)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = cm_init_av_by_path(alternate_path, &cm_id_priv->alt_av);
+ if (ret)
+ goto out;
+ cm_id_priv->alt_av.timeout =
+ cm_ack_timeout(cm_id_priv->target_ack_delay,
+ cm_id_priv->alt_av.timeout - 1);
+
+ ret = cm_alloc_msg(cm_id_priv, &msg);
+ if (ret)
+ goto out;
+
+ cm_format_lap((struct cm_lap_msg *) msg->mad, cm_id_priv,
+ alternate_path, private_data, private_data_len);
+ msg->timeout_ms = cm_id_priv->timeout_ms;
+ msg->context[1] = (void *) (unsigned long) IB_CM_ESTABLISHED;
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ cm_free_msg(msg);
+ return ret;
+ }
+
+ cm_id->lap_state = IB_CM_LAP_SENT;
+ cm_id_priv->msg = msg;
+
+out: spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_lap);
+
+static void cm_format_path_from_lap(struct cm_id_private *cm_id_priv,
+ struct ib_sa_path_rec *path,
+ struct cm_lap_msg *lap_msg)
+{
+ memset(path, 0, sizeof *path);
+ path->dgid = lap_msg->alt_local_gid;
+ path->sgid = lap_msg->alt_remote_gid;
+ path->dlid = lap_msg->alt_local_lid;
+ path->slid = lap_msg->alt_remote_lid;
+ path->flow_label = cm_lap_get_flow_label(lap_msg);
+ path->hop_limit = lap_msg->alt_hop_limit;
+ path->traffic_class = cm_lap_get_traffic_class(lap_msg);
+ path->reversible = 1;
+ path->pkey = cm_id_priv->pkey;
+ path->sl = cm_lap_get_sl(lap_msg);
+ path->mtu_selector = IB_SA_EQ;
+ path->mtu = cm_id_priv->path_mtu;
+ path->rate_selector = IB_SA_EQ;
+ path->rate = cm_lap_get_packet_rate(lap_msg);
+ path->packet_life_time_selector = IB_SA_EQ;
+ path->packet_life_time = cm_lap_get_local_ack_timeout(lap_msg);
+ path->packet_life_time -= (path->packet_life_time > 0);
+}
+
+static int cm_lap_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_lap_msg *lap_msg;
+ struct ib_cm_lap_event_param *param;
+ struct ib_mad_send_buf *msg = NULL;
+ int ret;
+
+ /* todo: verify LAP request and send reject APR if invalid. */
+ lap_msg = (struct cm_lap_msg *)work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_id(lap_msg->remote_comm_id,
+ lap_msg->local_comm_id);
+ if (!cm_id_priv)
+ return -EINVAL;
+
+ param = &work->cm_event.param.lap_rcvd;
+ param->alternate_path = &work->path[0];
+ cm_format_path_from_lap(cm_id_priv, param->alternate_path, lap_msg);
+ work->cm_event.private_data = &lap_msg->private_data;
+
+ spin_lock_irq(&cm_id_priv->lock);
+ if (cm_id_priv->id.state != IB_CM_ESTABLISHED)
+ goto unlock;
+
+ switch (cm_id_priv->id.lap_state) {
+ case IB_CM_LAP_UNINIT:
+ case IB_CM_LAP_IDLE:
+ break;
+ case IB_CM_MRA_LAP_SENT:
+ atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+ counter[CM_LAP_COUNTER]);
+ if (cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg))
+ goto unlock;
+
+ cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
+ CM_MSG_RESPONSE_OTHER,
+ cm_id_priv->service_timeout,
+ cm_id_priv->private_data,
+ cm_id_priv->private_data_len);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (ib_post_send_mad(msg, NULL))
+ cm_free_msg(msg);
+ goto deref;
+ case IB_CM_LAP_RCVD:
+ atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+ counter[CM_LAP_COUNTER]);
+ goto unlock;
+ default:
+ goto unlock;
+ }
+
+ cm_id_priv->id.lap_state = IB_CM_LAP_RCVD;
+ cm_id_priv->tid = lap_msg->hdr.tid;
+ cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
+ work->mad_recv_wc->recv_buf.grh,
+ &cm_id_priv->av);
+ cm_init_av_by_path(param->alternate_path, &cm_id_priv->alt_av);
+ ret = atomic_inc_and_test(&cm_id_priv->work_count);
+ if (!ret)
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (ret)
+ cm_process_work(cm_id_priv, work);
+ else
+ cm_deref_id(cm_id_priv);
+ return 0;
+
+unlock: spin_unlock_irq(&cm_id_priv->lock);
+deref: cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+static void cm_format_apr(struct cm_apr_msg *apr_msg,
+ struct cm_id_private *cm_id_priv,
+ enum ib_cm_apr_status status,
+ void *info,
+ u8 info_length,
+ const void *private_data,
+ u8 private_data_len)
+{
+ cm_format_mad_hdr(&apr_msg->hdr, CM_APR_ATTR_ID, cm_id_priv->tid);
+ apr_msg->local_comm_id = cm_id_priv->id.local_id;
+ apr_msg->remote_comm_id = cm_id_priv->id.remote_id;
+ apr_msg->ap_status = (u8) status;
+
+ if (info && info_length) {
+ apr_msg->info_length = info_length;
+ memcpy(apr_msg->info, info, info_length);
+ }
+
+ if (private_data && private_data_len)
+ memcpy(apr_msg->private_data, private_data, private_data_len);
+}
+
+int ib_send_cm_apr(struct ib_cm_id *cm_id,
+ enum ib_cm_apr_status status,
+ void *info,
+ u8 info_length,
+ const void *private_data,
+ u8 private_data_len)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_mad_send_buf *msg;
+ unsigned long flags;
+ int ret;
+
+ if ((private_data && private_data_len > IB_CM_APR_PRIVATE_DATA_SIZE) ||
+ (info && info_length > IB_CM_APR_INFO_LENGTH))
+ return -EINVAL;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id->state != IB_CM_ESTABLISHED ||
+ (cm_id->lap_state != IB_CM_LAP_RCVD &&
+ cm_id->lap_state != IB_CM_MRA_LAP_SENT)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = cm_alloc_msg(cm_id_priv, &msg);
+ if (ret)
+ goto out;
+
+ cm_format_apr((struct cm_apr_msg *) msg->mad, cm_id_priv, status,
+ info, info_length, private_data, private_data_len);
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ cm_free_msg(msg);
+ return ret;
+ }
+
+ cm_id->lap_state = IB_CM_LAP_IDLE;
+out: spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_apr);
+
+static int cm_apr_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_apr_msg *apr_msg;
+ int ret;
+
+ apr_msg = (struct cm_apr_msg *)work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_id(apr_msg->remote_comm_id,
+ apr_msg->local_comm_id);
+ if (!cm_id_priv)
+ return -EINVAL; /* Unmatched reply. */
+
+ work->cm_event.param.apr_rcvd.ap_status = apr_msg->ap_status;
+ work->cm_event.param.apr_rcvd.apr_info = &apr_msg->info;
+ work->cm_event.param.apr_rcvd.info_len = apr_msg->info_length;
+ work->cm_event.private_data = &apr_msg->private_data;
+
+ spin_lock_irq(&cm_id_priv->lock);
+ if (cm_id_priv->id.state != IB_CM_ESTABLISHED ||
+ (cm_id_priv->id.lap_state != IB_CM_LAP_SENT &&
+ cm_id_priv->id.lap_state != IB_CM_MRA_LAP_RCVD)) {
+ spin_unlock_irq(&cm_id_priv->lock);
+ goto out;
+ }
+ cm_id_priv->id.lap_state = IB_CM_LAP_IDLE;
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ cm_id_priv->msg = NULL;
+
+ ret = atomic_inc_and_test(&cm_id_priv->work_count);
+ if (!ret)
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (ret)
+ cm_process_work(cm_id_priv, work);
+ else
+ cm_deref_id(cm_id_priv);
+ return 0;
+out:
+ cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+static int cm_timewait_handler(struct cm_work *work)
+{
+ struct cm_timewait_info *timewait_info;
+ struct cm_id_private *cm_id_priv;
+ int ret;
+
+ timewait_info = (struct cm_timewait_info *)work;
+ spin_lock_irq(&cm.lock);
+ list_del(&timewait_info->list);
+ spin_unlock_irq(&cm.lock);
+
+ cm_id_priv = cm_acquire_id(timewait_info->work.local_id,
+ timewait_info->work.remote_id);
+ if (!cm_id_priv)
+ return -EINVAL;
+
+ spin_lock_irq(&cm_id_priv->lock);
+ if (cm_id_priv->id.state != IB_CM_TIMEWAIT ||
+ cm_id_priv->remote_qpn != timewait_info->remote_qpn) {
+ spin_unlock_irq(&cm_id_priv->lock);
+ goto out;
+ }
+ cm_id_priv->id.state = IB_CM_IDLE;
+ ret = atomic_inc_and_test(&cm_id_priv->work_count);
+ if (!ret)
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (ret)
+ cm_process_work(cm_id_priv, work);
+ else
+ cm_deref_id(cm_id_priv);
+ return 0;
+out:
+ cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+static void cm_format_sidr_req(struct cm_sidr_req_msg *sidr_req_msg,
+ struct cm_id_private *cm_id_priv,
+ struct ib_cm_sidr_req_param *param)
+{
+ cm_format_mad_hdr(&sidr_req_msg->hdr, CM_SIDR_REQ_ATTR_ID,
+ cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_SIDR));
+ sidr_req_msg->request_id = cm_id_priv->id.local_id;
+ sidr_req_msg->pkey = param->path->pkey;
+ sidr_req_msg->service_id = param->service_id;
+
+ if (param->private_data && param->private_data_len)
+ memcpy(sidr_req_msg->private_data, param->private_data,
+ param->private_data_len);
+}
+
+int ib_send_cm_sidr_req(struct ib_cm_id *cm_id,
+ struct ib_cm_sidr_req_param *param)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_mad_send_buf *msg;
+ unsigned long flags;
+ int ret;
+
+ if (!param->path || (param->private_data &&
+ param->private_data_len > IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE))
+ return -EINVAL;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ ret = cm_init_av_by_path(param->path, &cm_id_priv->av);
+ if (ret)
+ goto out;
+
+ cm_id->service_id = param->service_id;
+ cm_id->service_mask = ~cpu_to_be64(0);
+ cm_id_priv->timeout_ms = param->timeout_ms;
+ if (cm_id_priv->timeout_ms > cm_convert_to_ms(max_timeout)) {
+ printk(KERN_WARNING PFX "sidr req timeout_ms %d > %d, "
+ "decreasing used timeout_ms\n", param->timeout_ms,
+ cm_convert_to_ms(max_timeout));
+ cm_id_priv->timeout_ms = cm_convert_to_ms(max_timeout);
+ }
+ cm_id_priv->max_cm_retries = param->max_cm_retries;
+ ret = cm_alloc_msg(cm_id_priv, &msg);
+ if (ret)
+ goto out;
+
+ cm_format_sidr_req((struct cm_sidr_req_msg *) msg->mad, cm_id_priv,
+ param);
+ msg->timeout_ms = cm_id_priv->timeout_ms;
+ msg->context[1] = (void *) (unsigned long) IB_CM_SIDR_REQ_SENT;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id->state == IB_CM_IDLE)
+ ret = ib_post_send_mad(msg, NULL);
+ else
+ ret = -EINVAL;
+
+ if (ret) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ cm_free_msg(msg);
+ goto out;
+ }
+ cm_id->state = IB_CM_SIDR_REQ_SENT;
+ cm_id_priv->msg = msg;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+out:
+ return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_sidr_req);
+
+static void cm_format_sidr_req_event(struct cm_work *work,
+ struct ib_cm_id *listen_id)
+{
+ struct cm_sidr_req_msg *sidr_req_msg;
+ struct ib_cm_sidr_req_event_param *param;
+
+ sidr_req_msg = (struct cm_sidr_req_msg *)
+ work->mad_recv_wc->recv_buf.mad;
+ param = &work->cm_event.param.sidr_req_rcvd;
+ param->pkey = __be16_to_cpu(sidr_req_msg->pkey);
+ param->listen_id = listen_id;
+ param->port = work->port->port_num;
+ work->cm_event.private_data = &sidr_req_msg->private_data;
+}
+
+static int cm_sidr_req_handler(struct cm_work *work)
+{
+ struct ib_cm_id *cm_id;
+ struct cm_id_private *cm_id_priv, *cur_cm_id_priv;
+ struct cm_sidr_req_msg *sidr_req_msg;
+ struct ib_wc *wc;
+
+ cm_id = ib_create_cm_id(work->port->cm_dev->ib_device, NULL, NULL);
+ if (IS_ERR(cm_id))
+ return PTR_ERR(cm_id);
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+
+ /* Record SGID/SLID and request ID for lookup. */
+ sidr_req_msg = (struct cm_sidr_req_msg *)
+ work->mad_recv_wc->recv_buf.mad;
+ wc = work->mad_recv_wc->wc;
+ cm_id_priv->av.dgid.global.subnet_prefix = cpu_to_be64(wc->slid);
+ cm_id_priv->av.dgid.global.interface_id = 0;
+ cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
+ work->mad_recv_wc->recv_buf.grh,
+ &cm_id_priv->av);
+ cm_id_priv->id.remote_id = sidr_req_msg->request_id;
+ cm_id_priv->tid = sidr_req_msg->hdr.tid;
+ atomic_inc(&cm_id_priv->work_count);
+
+ spin_lock_irq(&cm.lock);
+ cur_cm_id_priv = cm_insert_remote_sidr(cm_id_priv);
+ if (cur_cm_id_priv) {
+ spin_unlock_irq(&cm.lock);
+ atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+ counter[CM_SIDR_REQ_COUNTER]);
+ goto out; /* Duplicate message. */
+ }
+ cm_id_priv->id.state = IB_CM_SIDR_REQ_RCVD;
+ cur_cm_id_priv = cm_find_listen(cm_id->device,
+ sidr_req_msg->service_id,
+ sidr_req_msg->private_data);
+ if (!cur_cm_id_priv) {
+ spin_unlock_irq(&cm.lock);
+ cm_reject_sidr_req(cm_id_priv, IB_SIDR_UNSUPPORTED);
+ goto out; /* No match. */
+ }
+ atomic_inc(&cur_cm_id_priv->refcount);
+ spin_unlock_irq(&cm.lock);
+
+ cm_id_priv->id.cm_handler = cur_cm_id_priv->id.cm_handler;
+ cm_id_priv->id.context = cur_cm_id_priv->id.context;
+ cm_id_priv->id.service_id = sidr_req_msg->service_id;
+ cm_id_priv->id.service_mask = ~cpu_to_be64(0);
+
+ cm_format_sidr_req_event(work, &cur_cm_id_priv->id);
+ cm_process_work(cm_id_priv, work);
+ cm_deref_id(cur_cm_id_priv);
+ return 0;
+out:
+ ib_destroy_cm_id(&cm_id_priv->id);
+ return -EINVAL;
+}
+
+static void cm_format_sidr_rep(struct cm_sidr_rep_msg *sidr_rep_msg,
+ struct cm_id_private *cm_id_priv,
+ struct ib_cm_sidr_rep_param *param)
+{
+ cm_format_mad_hdr(&sidr_rep_msg->hdr, CM_SIDR_REP_ATTR_ID,
+ cm_id_priv->tid);
+ sidr_rep_msg->request_id = cm_id_priv->id.remote_id;
+ sidr_rep_msg->status = param->status;
+ cm_sidr_rep_set_qpn(sidr_rep_msg, cpu_to_be32(param->qp_num));
+ sidr_rep_msg->service_id = cm_id_priv->id.service_id;
+ sidr_rep_msg->qkey = cpu_to_be32(param->qkey);
+
+ if (param->info && param->info_length)
+ memcpy(sidr_rep_msg->info, param->info, param->info_length);
+
+ if (param->private_data && param->private_data_len)
+ memcpy(sidr_rep_msg->private_data, param->private_data,
+ param->private_data_len);
+}
+
+int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id,
+ struct ib_cm_sidr_rep_param *param)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_mad_send_buf *msg;
+ unsigned long flags;
+ int ret;
+
+ if ((param->info && param->info_length > IB_CM_SIDR_REP_INFO_LENGTH) ||
+ (param->private_data &&
+ param->private_data_len > IB_CM_SIDR_REP_PRIVATE_DATA_SIZE))
+ return -EINVAL;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id->state != IB_CM_SIDR_REQ_RCVD) {
+ ret = -EINVAL;
+ goto error;
+ }
+
+ ret = cm_alloc_msg(cm_id_priv, &msg);
+ if (ret)
+ goto error;
+
+ cm_format_sidr_rep((struct cm_sidr_rep_msg *) msg->mad, cm_id_priv,
+ param);
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ cm_free_msg(msg);
+ return ret;
+ }
+ cm_id->state = IB_CM_IDLE;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ spin_lock_irqsave(&cm.lock, flags);
+ rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table);
+ spin_unlock_irqrestore(&cm.lock, flags);
+ return 0;
+
+error: spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_sidr_rep);
+
+static void cm_format_sidr_rep_event(struct cm_work *work)
+{
+ struct cm_sidr_rep_msg *sidr_rep_msg;
+ struct ib_cm_sidr_rep_event_param *param;
+
+ sidr_rep_msg = (struct cm_sidr_rep_msg *)
+ work->mad_recv_wc->recv_buf.mad;
+ param = &work->cm_event.param.sidr_rep_rcvd;
+ param->status = sidr_rep_msg->status;
+ param->qkey = be32_to_cpu(sidr_rep_msg->qkey);
+ param->qpn = be32_to_cpu(cm_sidr_rep_get_qpn(sidr_rep_msg));
+ param->info = &sidr_rep_msg->info;
+ param->info_len = sidr_rep_msg->info_length;
+ work->cm_event.private_data = &sidr_rep_msg->private_data;
+}
+
+static int cm_sidr_rep_handler(struct cm_work *work)
+{
+ struct cm_sidr_rep_msg *sidr_rep_msg;
+ struct cm_id_private *cm_id_priv;
+
+ sidr_rep_msg = (struct cm_sidr_rep_msg *)
+ work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_id(sidr_rep_msg->request_id, 0);
+ if (!cm_id_priv)
+ return -EINVAL; /* Unmatched reply. */
+
+ spin_lock_irq(&cm_id_priv->lock);
+ if (cm_id_priv->id.state != IB_CM_SIDR_REQ_SENT) {
+ spin_unlock_irq(&cm_id_priv->lock);
+ goto out;
+ }
+ cm_id_priv->id.state = IB_CM_IDLE;
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ cm_format_sidr_rep_event(work);
+ cm_process_work(cm_id_priv, work);
+ return 0;
+out:
+ cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+static void cm_process_send_error(struct ib_mad_send_buf *msg,
+ enum ib_wc_status wc_status)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_cm_event cm_event;
+ enum ib_cm_state state;
+ int ret;
+
+ memset(&cm_event, 0, sizeof cm_event);
+ cm_id_priv = msg->context[0];
+
+ /* Discard old sends or ones without a response. */
+ spin_lock_irq(&cm_id_priv->lock);
+ state = (enum ib_cm_state) (unsigned long) msg->context[1];
+ if (msg != cm_id_priv->msg || state != cm_id_priv->id.state)
+ goto discard;
+
+ switch (state) {
+ case IB_CM_REQ_SENT:
+ case IB_CM_MRA_REQ_RCVD:
+ cm_reset_to_idle(cm_id_priv);
+ cm_event.event = IB_CM_REQ_ERROR;
+ break;
+ case IB_CM_REP_SENT:
+ case IB_CM_MRA_REP_RCVD:
+ cm_reset_to_idle(cm_id_priv);
+ cm_event.event = IB_CM_REP_ERROR;
+ break;
+ case IB_CM_DREQ_SENT:
+ cm_enter_timewait(cm_id_priv);
+ cm_event.event = IB_CM_DREQ_ERROR;
+ break;
+ case IB_CM_SIDR_REQ_SENT:
+ cm_id_priv->id.state = IB_CM_IDLE;
+ cm_event.event = IB_CM_SIDR_REQ_ERROR;
+ break;
+ default:
+ goto discard;
+ }
+ spin_unlock_irq(&cm_id_priv->lock);
+ cm_event.param.send_status = wc_status;
+
+ /* No other events can occur on the cm_id at this point. */
+ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &cm_event);
+ cm_free_msg(msg);
+ if (ret)
+ ib_destroy_cm_id(&cm_id_priv->id);
+ return;
+discard:
+ spin_unlock_irq(&cm_id_priv->lock);
+ cm_free_msg(msg);
+}
+
+static void cm_send_handler(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ struct ib_mad_send_buf *msg = mad_send_wc->send_buf;
+ struct cm_port *port;
+ u16 attr_index;
+
+ port = mad_agent->context;
+ attr_index = be16_to_cpu(((struct ib_mad_hdr *)
+ msg->mad)->attr_id) - CM_ATTR_ID_OFFSET;
+
+ /*
+ * If the send was in response to a received message (context[0] is not
+ * set to a cm_id), and is not a REJ, then it is a send that was
+ * manually retried.
+ */
+ if (!msg->context[0] && (attr_index != CM_REJ_COUNTER))
+ msg->retries = 1;
+
+ atomic_long_add(1 + msg->retries,
+ &port->counter_group[CM_XMIT].counter[attr_index]);
+ if (msg->retries)
+ atomic_long_add(msg->retries,
+ &port->counter_group[CM_XMIT_RETRIES].
+ counter[attr_index]);
+
+ switch (mad_send_wc->status) {
+ case IB_WC_SUCCESS:
+ case IB_WC_WR_FLUSH_ERR:
+ cm_free_msg(msg);
+ break;
+ default:
+ if (msg->context[0] && msg->context[1])
+ cm_process_send_error(msg, mad_send_wc->status);
+ else
+ cm_free_msg(msg);
+ break;
+ }
+}
+
+static void cm_work_handler(struct work_struct *_work)
+{
+ struct cm_work *work = container_of(_work, struct cm_work, work.work);
+ int ret;
+
+ switch (work->cm_event.event) {
+ case IB_CM_REQ_RECEIVED:
+ ret = cm_req_handler(work);
+ break;
+ case IB_CM_MRA_RECEIVED:
+ ret = cm_mra_handler(work);
+ break;
+ case IB_CM_REJ_RECEIVED:
+ ret = cm_rej_handler(work);
+ break;
+ case IB_CM_REP_RECEIVED:
+ ret = cm_rep_handler(work);
+ break;
+ case IB_CM_RTU_RECEIVED:
+ ret = cm_rtu_handler(work);
+ break;
+ case IB_CM_USER_ESTABLISHED:
+ ret = cm_establish_handler(work);
+ break;
+ case IB_CM_DREQ_RECEIVED:
+ ret = cm_dreq_handler(work);
+ break;
+ case IB_CM_DREP_RECEIVED:
+ ret = cm_drep_handler(work);
+ break;
+ case IB_CM_SIDR_REQ_RECEIVED:
+ ret = cm_sidr_req_handler(work);
+ break;
+ case IB_CM_SIDR_REP_RECEIVED:
+ ret = cm_sidr_rep_handler(work);
+ break;
+ case IB_CM_LAP_RECEIVED:
+ ret = cm_lap_handler(work);
+ break;
+ case IB_CM_APR_RECEIVED:
+ ret = cm_apr_handler(work);
+ break;
+ case IB_CM_TIMEWAIT_EXIT:
+ ret = cm_timewait_handler(work);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ if (ret)
+ cm_free_work(work);
+}
+
+static int cm_establish(struct ib_cm_id *cm_id)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_work *work;
+ unsigned long flags;
+ int ret = 0;
+
+ work = kmalloc(sizeof *work, GFP_ATOMIC);
+ if (!work)
+ return -ENOMEM;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id->state)
+ {
+ case IB_CM_REP_SENT:
+ case IB_CM_MRA_REP_RCVD:
+ cm_id->state = IB_CM_ESTABLISHED;
+ break;
+ case IB_CM_ESTABLISHED:
+ ret = -EISCONN;
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ if (ret) {
+ kfree(work);
+ goto out;
+ }
+
+ /*
+ * The CM worker thread may try to destroy the cm_id before it
+ * can execute this work item. To prevent potential deadlock,
+ * we need to find the cm_id once we're in the context of the
+ * worker thread, rather than holding a reference on it.
+ */
+ INIT_DELAYED_WORK(&work->work, cm_work_handler);
+ work->local_id = cm_id->local_id;
+ work->remote_id = cm_id->remote_id;
+ work->mad_recv_wc = NULL;
+ work->cm_event.event = IB_CM_USER_ESTABLISHED;
+ queue_delayed_work(cm.wq, &work->work, 0);
+out:
+ return ret;
+}
+
+static int cm_migrate(struct ib_cm_id *cm_id)
+{
+ struct cm_id_private *cm_id_priv;
+ unsigned long flags;
+ int ret = 0;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id->state == IB_CM_ESTABLISHED &&
+ (cm_id->lap_state == IB_CM_LAP_UNINIT ||
+ cm_id->lap_state == IB_CM_LAP_IDLE)) {
+ cm_id->lap_state = IB_CM_LAP_IDLE;
+ cm_id_priv->av = cm_id_priv->alt_av;
+ } else
+ ret = -EINVAL;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ return ret;
+}
+
+int ib_cm_notify(struct ib_cm_id *cm_id, enum ib_event_type event)
+{
+ int ret;
+
+ switch (event) {
+ case IB_EVENT_COMM_EST:
+ ret = cm_establish(cm_id);
+ break;
+ case IB_EVENT_PATH_MIG:
+ ret = cm_migrate(cm_id);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+ return ret;
+}
+EXPORT_SYMBOL(ib_cm_notify);
+
+static void cm_recv_handler(struct ib_mad_agent *mad_agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct cm_port *port = mad_agent->context;
+ struct cm_work *work;
+ enum ib_cm_event_type event;
+ u16 attr_id;
+ int paths = 0;
+
+ switch (mad_recv_wc->recv_buf.mad->mad_hdr.attr_id) {
+ case CM_REQ_ATTR_ID:
+ paths = 1 + (((struct cm_req_msg *) mad_recv_wc->recv_buf.mad)->
+ alt_local_lid != 0);
+ event = IB_CM_REQ_RECEIVED;
+ break;
+ case CM_MRA_ATTR_ID:
+ event = IB_CM_MRA_RECEIVED;
+ break;
+ case CM_REJ_ATTR_ID:
+ event = IB_CM_REJ_RECEIVED;
+ break;
+ case CM_REP_ATTR_ID:
+ event = IB_CM_REP_RECEIVED;
+ break;
+ case CM_RTU_ATTR_ID:
+ event = IB_CM_RTU_RECEIVED;
+ break;
+ case CM_DREQ_ATTR_ID:
+ event = IB_CM_DREQ_RECEIVED;
+ break;
+ case CM_DREP_ATTR_ID:
+ event = IB_CM_DREP_RECEIVED;
+ break;
+ case CM_SIDR_REQ_ATTR_ID:
+ event = IB_CM_SIDR_REQ_RECEIVED;
+ break;
+ case CM_SIDR_REP_ATTR_ID:
+ event = IB_CM_SIDR_REP_RECEIVED;
+ break;
+ case CM_LAP_ATTR_ID:
+ paths = 1;
+ event = IB_CM_LAP_RECEIVED;
+ break;
+ case CM_APR_ATTR_ID:
+ event = IB_CM_APR_RECEIVED;
+ break;
+ default:
+ ib_free_recv_mad(mad_recv_wc);
+ return;
+ }
+
+ attr_id = be16_to_cpu(mad_recv_wc->recv_buf.mad->mad_hdr.attr_id);
+ atomic_long_inc(&port->counter_group[CM_RECV].
+ counter[attr_id - CM_ATTR_ID_OFFSET]);
+
+ work = kmalloc(sizeof *work + sizeof(struct ib_sa_path_rec) * paths,
+ GFP_KERNEL);
+ if (!work) {
+ ib_free_recv_mad(mad_recv_wc);
+ return;
+ }
+
+ INIT_DELAYED_WORK(&work->work, cm_work_handler);
+ work->cm_event.event = event;
+ work->mad_recv_wc = mad_recv_wc;
+ work->port = port;
+ queue_delayed_work(cm.wq, &work->work, 0);
+}
+
+static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv,
+ struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id_priv->id.state) {
+ case IB_CM_REQ_SENT:
+ case IB_CM_MRA_REQ_RCVD:
+ case IB_CM_REQ_RCVD:
+ case IB_CM_MRA_REQ_SENT:
+ case IB_CM_REP_RCVD:
+ case IB_CM_MRA_REP_SENT:
+ case IB_CM_REP_SENT:
+ case IB_CM_MRA_REP_RCVD:
+ case IB_CM_ESTABLISHED:
+ *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX | IB_QP_PORT;
+ qp_attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE;
+ if (cm_id_priv->responder_resources)
+ qp_attr->qp_access_flags |= IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_ATOMIC;
+ qp_attr->pkey_index = cm_id_priv->av.pkey_index;
+ qp_attr->port_num = cm_id_priv->av.port->port_num;
+ ret = 0;
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+
+static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv,
+ struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id_priv->id.state) {
+ case IB_CM_REQ_RCVD:
+ case IB_CM_MRA_REQ_SENT:
+ case IB_CM_REP_RCVD:
+ case IB_CM_MRA_REP_SENT:
+ case IB_CM_REP_SENT:
+ case IB_CM_MRA_REP_RCVD:
+ case IB_CM_ESTABLISHED:
+ *qp_attr_mask = IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU |
+ IB_QP_DEST_QPN | IB_QP_RQ_PSN;
+ qp_attr->ah_attr = cm_id_priv->av.ah_attr;
+ qp_attr->path_mtu = cm_id_priv->path_mtu;
+ qp_attr->dest_qp_num = be32_to_cpu(cm_id_priv->remote_qpn);
+ qp_attr->rq_psn = be32_to_cpu(cm_id_priv->rq_psn);
+ if (cm_id_priv->qp_type == IB_QPT_RC) {
+ *qp_attr_mask |= IB_QP_MAX_DEST_RD_ATOMIC |
+ IB_QP_MIN_RNR_TIMER;
+ qp_attr->max_dest_rd_atomic =
+ cm_id_priv->responder_resources;
+ qp_attr->min_rnr_timer = 0;
+ }
+ if (cm_id_priv->alt_av.ah_attr.dlid) {
+ *qp_attr_mask |= IB_QP_ALT_PATH;
+ qp_attr->alt_port_num = cm_id_priv->alt_av.port->port_num;
+ qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index;
+ qp_attr->alt_timeout = cm_id_priv->alt_av.timeout;
+ qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr;
+ }
+ ret = 0;
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+
+static int cm_init_qp_rts_attr(struct cm_id_private *cm_id_priv,
+ struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id_priv->id.state) {
+ /* Allow transition to RTS before sending REP */
+ case IB_CM_REQ_RCVD:
+ case IB_CM_MRA_REQ_SENT:
+
+ case IB_CM_REP_RCVD:
+ case IB_CM_MRA_REP_SENT:
+ case IB_CM_REP_SENT:
+ case IB_CM_MRA_REP_RCVD:
+ case IB_CM_ESTABLISHED:
+ if (cm_id_priv->id.lap_state == IB_CM_LAP_UNINIT) {
+ *qp_attr_mask = IB_QP_STATE | IB_QP_SQ_PSN;
+ qp_attr->sq_psn = be32_to_cpu(cm_id_priv->sq_psn);
+ if (cm_id_priv->qp_type == IB_QPT_RC) {
+ *qp_attr_mask |= IB_QP_TIMEOUT | IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY |
+ IB_QP_MAX_QP_RD_ATOMIC;
+ qp_attr->timeout = cm_id_priv->av.timeout;
+ qp_attr->retry_cnt = cm_id_priv->retry_count;
+ qp_attr->rnr_retry = cm_id_priv->rnr_retry_count;
+ qp_attr->max_rd_atomic =
+ cm_id_priv->initiator_depth;
+ }
+ if (cm_id_priv->alt_av.ah_attr.dlid) {
+ *qp_attr_mask |= IB_QP_PATH_MIG_STATE;
+ qp_attr->path_mig_state = IB_MIG_REARM;
+ }
+ } else {
+ *qp_attr_mask = IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE;
+ qp_attr->alt_port_num = cm_id_priv->alt_av.port->port_num;
+ qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index;
+ qp_attr->alt_timeout = cm_id_priv->alt_av.timeout;
+ qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr;
+ qp_attr->path_mig_state = IB_MIG_REARM;
+ }
+ ret = 0;
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+
+int ib_cm_init_qp_attr(struct ib_cm_id *cm_id,
+ struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask)
+{
+ struct cm_id_private *cm_id_priv;
+ int ret;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ switch (qp_attr->qp_state) {
+ case IB_QPS_INIT:
+ ret = cm_init_qp_init_attr(cm_id_priv, qp_attr, qp_attr_mask);
+ break;
+ case IB_QPS_RTR:
+ ret = cm_init_qp_rtr_attr(cm_id_priv, qp_attr, qp_attr_mask);
+ break;
+ case IB_QPS_RTS:
+ ret = cm_init_qp_rts_attr(cm_id_priv, qp_attr, qp_attr_mask);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ return ret;
+}
+EXPORT_SYMBOL(ib_cm_init_qp_attr);
+
+static void cm_get_ack_delay(struct cm_device *cm_dev)
+{
+ struct ib_device_attr attr;
+
+ if (ib_query_device(cm_dev->ib_device, &attr))
+ cm_dev->ack_delay = 0; /* acks will rely on packet life time */
+ else
+ cm_dev->ack_delay = attr.local_ca_ack_delay;
+}
+
+static ssize_t cm_show_counter(struct kobject *obj, struct attribute *attr,
+ char *buf)
+{
+ struct cm_counter_group *group;
+ struct cm_counter_attribute *cm_attr;
+
+ group = container_of(obj, struct cm_counter_group, obj);
+ cm_attr = container_of(attr, struct cm_counter_attribute, attr);
+
+ return sprintf(buf, "%ld\n",
+ atomic_long_read(&group->counter[cm_attr->index]));
+}
+
+static struct sysfs_ops cm_counter_ops = {
+ .show = cm_show_counter
+};
+
+static struct kobj_type cm_counter_obj_type = {
+ .sysfs_ops = &cm_counter_ops,
+ .default_attrs = cm_counter_default_attrs
+};
+
+static void cm_release_port_obj(struct kobject *obj)
+{
+ struct cm_port *cm_port;
+
+ cm_port = container_of(obj, struct cm_port, port_obj);
+ kfree(cm_port);
+}
+
+static struct kobj_type cm_port_obj_type = {
+ .release = cm_release_port_obj
+};
+
+struct class cm_class = {
+ .name = "infiniband_cm",
+};
+EXPORT_SYMBOL(cm_class);
+
+static int cm_create_port_fs(struct cm_port *port)
+{
+ int i, ret;
+
+ ret = kobject_init_and_add(&port->port_obj, &cm_port_obj_type,
+ &port->cm_dev->device->kobj,
+ "%d", port->port_num);
+ if (ret) {
+ kfree(port);
+ return ret;
+ }
+
+ for (i = 0; i < CM_COUNTER_GROUPS; i++) {
+ ret = kobject_init_and_add(&port->counter_group[i].obj,
+ &cm_counter_obj_type,
+ &port->port_obj,
+ "%s", counter_group_names[i]);
+ if (ret)
+ goto error;
+ }
+
+ return 0;
+
+error:
+ while (i--)
+ kobject_put(&port->counter_group[i].obj);
+ kobject_put(&port->port_obj);
+ return ret;
+
+}
+
+static void cm_remove_port_fs(struct cm_port *port)
+{
+ int i;
+
+ for (i = 0; i < CM_COUNTER_GROUPS; i++)
+ kobject_put(&port->counter_group[i].obj);
+
+ kobject_put(&port->port_obj);
+}
+
+static void cm_add_one(struct ib_device *ib_device)
+{
+ struct cm_device *cm_dev;
+ struct cm_port *port;
+ struct ib_mad_reg_req reg_req = {
+ .mgmt_class = IB_MGMT_CLASS_CM,
+ .mgmt_class_version = IB_CM_CLASS_VERSION
+ };
+ struct ib_port_modify port_modify = {
+ .set_port_cap_mask = IB_PORT_CM_SUP
+ };
+ unsigned long flags;
+ int ret;
+ u8 i;
+
+ if (rdma_node_get_transport(ib_device->node_type) != RDMA_TRANSPORT_IB)
+ return;
+
+ cm_dev = kzalloc(sizeof(*cm_dev) + sizeof(*port) *
+ ib_device->phys_port_cnt, GFP_KERNEL);
+ if (!cm_dev)
+ return;
+
+ cm_dev->ib_device = ib_device;
+ cm_get_ack_delay(cm_dev);
+
+ cm_dev->device = device_create(&cm_class, &ib_device->dev,
+ MKDEV(0, 0), NULL,
+ "%s", ib_device->name);
+ if (!cm_dev->device) {
+ kfree(cm_dev);
+ return;
+ }
+
+ set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask);
+ for (i = 1; i <= ib_device->phys_port_cnt; i++) {
+ port = kzalloc(sizeof *port, GFP_KERNEL);
+ if (!port)
+ goto error1;
+
+ cm_dev->port[i-1] = port;
+ port->cm_dev = cm_dev;
+ port->port_num = i;
+
+ ret = cm_create_port_fs(port);
+ if (ret)
+ goto error1;
+
+ port->mad_agent = ib_register_mad_agent(ib_device, i,
+ IB_QPT_GSI,
+ &reg_req,
+ 0,
+ cm_send_handler,
+ cm_recv_handler,
+ port);
+ if (IS_ERR(port->mad_agent))
+ goto error2;
+
+ ret = ib_modify_port(ib_device, i, 0, &port_modify);
+ if (ret)
+ goto error3;
+ }
+ ib_set_client_data(ib_device, &cm_client, cm_dev);
+
+ write_lock_irqsave(&cm.device_lock, flags);
+ list_add_tail(&cm_dev->list, &cm.device_list);
+ write_unlock_irqrestore(&cm.device_lock, flags);
+ return;
+
+error3:
+ ib_unregister_mad_agent(port->mad_agent);
+error2:
+ cm_remove_port_fs(port);
+error1:
+ port_modify.set_port_cap_mask = 0;
+ port_modify.clr_port_cap_mask = IB_PORT_CM_SUP;
+ while (--i) {
+ port = cm_dev->port[i-1];
+ ib_modify_port(ib_device, port->port_num, 0, &port_modify);
+ ib_unregister_mad_agent(port->mad_agent);
+ cm_remove_port_fs(port);
+ }
+ device_unregister(cm_dev->device);
+ kfree(cm_dev);
+}
+
+static void cm_remove_one(struct ib_device *ib_device)
+{
+ struct cm_device *cm_dev;
+ struct cm_port *port;
+ struct ib_port_modify port_modify = {
+ .clr_port_cap_mask = IB_PORT_CM_SUP
+ };
+ unsigned long flags;
+ int i;
+
+ cm_dev = ib_get_client_data(ib_device, &cm_client);
+ if (!cm_dev)
+ return;
+
+ write_lock_irqsave(&cm.device_lock, flags);
+ list_del(&cm_dev->list);
+ write_unlock_irqrestore(&cm.device_lock, flags);
+
+ for (i = 1; i <= ib_device->phys_port_cnt; i++) {
+ port = cm_dev->port[i-1];
+ ib_modify_port(ib_device, port->port_num, 0, &port_modify);
+ ib_unregister_mad_agent(port->mad_agent);
+ flush_workqueue(cm.wq);
+ cm_remove_port_fs(port);
+ }
+ device_unregister(cm_dev->device);
+ kfree(cm_dev);
+}
+
+static int __init ib_cm_init(void)
+{
+ int ret;
+
+ memset(&cm, 0, sizeof cm);
+ INIT_LIST_HEAD(&cm.device_list);
+ rwlock_init(&cm.device_lock);
+ spin_lock_init(&cm.lock);
+ cm.listen_service_table = RB_ROOT;
+ cm.listen_service_id = be64_to_cpu(IB_CM_ASSIGN_SERVICE_ID);
+ cm.remote_id_table = RB_ROOT;
+ cm.remote_qp_table = RB_ROOT;
+ cm.remote_sidr_table = RB_ROOT;
+ idr_init(&cm.local_id_table);
+ get_random_bytes(&cm.random_id_operand, sizeof cm.random_id_operand);
+ idr_pre_get(&cm.local_id_table, GFP_KERNEL);
+ INIT_LIST_HEAD(&cm.timewait_list);
+
+ ret = class_register(&cm_class);
+ if (ret)
+ return -ENOMEM;
+
+ cm.wq = create_workqueue("ib_cm");
+ if (!cm.wq) {
+ ret = -ENOMEM;
+ goto error1;
+ }
+
+ ret = ib_register_client(&cm_client);
+ if (ret)
+ goto error2;
+
+ return 0;
+error2:
+ destroy_workqueue(cm.wq);
+error1:
+ class_unregister(&cm_class);
+ return ret;
+}
+
+static void __exit ib_cm_cleanup(void)
+{
+ struct cm_timewait_info *timewait_info, *tmp;
+
+ spin_lock_irq(&cm.lock);
+ list_for_each_entry(timewait_info, &cm.timewait_list, list)
+ cancel_delayed_work(&timewait_info->work.work);
+ spin_unlock_irq(&cm.lock);
+
+ ib_unregister_client(&cm_client);
+ destroy_workqueue(cm.wq);
+
+ list_for_each_entry_safe(timewait_info, tmp, &cm.timewait_list, list) {
+ list_del(&timewait_info->list);
+ kfree(timewait_info);
+ }
+
+ class_unregister(&cm_class);
+ idr_destroy(&cm.local_id_table);
+}
+
+module_init_order(ib_cm_init, SI_ORDER_SECOND);
+module_exit(ib_cm_cleanup);
+
diff --git a/sys/ofed/drivers/infiniband/core/cm_msgs.h b/sys/ofed/drivers/infiniband/core/cm_msgs.h
new file mode 100644
index 0000000..7e63c08
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/cm_msgs.h
@@ -0,0 +1,819 @@
+/*
+ * Copyright (c) 2004 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING the madirectory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use source and binary forms, with or
+ * withmodification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retathe above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHWARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS THE
+ * SOFTWARE.
+ */
+#if !defined(CM_MSGS_H)
+#define CM_MSGS_H
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_cm.h>
+
+/*
+ * Parameters to routines below should be in network-byte order, and values
+ * are returned in network-byte order.
+ */
+
+#define IB_CM_CLASS_VERSION 2 /* IB specification 1.2 */
+
+#define CM_REQ_ATTR_ID cpu_to_be16(0x0010)
+#define CM_MRA_ATTR_ID cpu_to_be16(0x0011)
+#define CM_REJ_ATTR_ID cpu_to_be16(0x0012)
+#define CM_REP_ATTR_ID cpu_to_be16(0x0013)
+#define CM_RTU_ATTR_ID cpu_to_be16(0x0014)
+#define CM_DREQ_ATTR_ID cpu_to_be16(0x0015)
+#define CM_DREP_ATTR_ID cpu_to_be16(0x0016)
+#define CM_SIDR_REQ_ATTR_ID cpu_to_be16(0x0017)
+#define CM_SIDR_REP_ATTR_ID cpu_to_be16(0x0018)
+#define CM_LAP_ATTR_ID cpu_to_be16(0x0019)
+#define CM_APR_ATTR_ID cpu_to_be16(0x001A)
+
+enum cm_msg_sequence {
+ CM_MSG_SEQUENCE_REQ,
+ CM_MSG_SEQUENCE_LAP,
+ CM_MSG_SEQUENCE_DREQ,
+ CM_MSG_SEQUENCE_SIDR
+};
+
+struct cm_req_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 local_comm_id;
+ __be32 rsvd4;
+ __be64 service_id;
+ __be64 local_ca_guid;
+ __be32 rsvd24;
+ __be32 local_qkey;
+ /* local QPN:24, responder resources:8 */
+ __be32 offset32;
+ /* local EECN:24, initiator depth:8 */
+ __be32 offset36;
+ /*
+ * remote EECN:24, remote CM response timeout:5,
+ * transport service type:2, end-to-end flow control:1
+ */
+ __be32 offset40;
+ /* starting PSN:24, local CM response timeout:5, retry count:3 */
+ __be32 offset44;
+ __be16 pkey;
+ /* path MTU:4, RDC exists:1, RNR retry count:3. */
+ u8 offset50;
+ /* max CM Retries:4, SRQ:1, rsvd:3 */
+ u8 offset51;
+
+ __be16 primary_local_lid;
+ __be16 primary_remote_lid;
+ union ib_gid primary_local_gid;
+ union ib_gid primary_remote_gid;
+ /* flow label:20, rsvd:6, packet rate:6 */
+ __be32 primary_offset88;
+ u8 primary_traffic_class;
+ u8 primary_hop_limit;
+ /* SL:4, subnet local:1, rsvd:3 */
+ u8 primary_offset94;
+ /* local ACK timeout:5, rsvd:3 */
+ u8 primary_offset95;
+
+ __be16 alt_local_lid;
+ __be16 alt_remote_lid;
+ union ib_gid alt_local_gid;
+ union ib_gid alt_remote_gid;
+ /* flow label:20, rsvd:6, packet rate:6 */
+ __be32 alt_offset132;
+ u8 alt_traffic_class;
+ u8 alt_hop_limit;
+ /* SL:4, subnet local:1, rsvd:3 */
+ u8 alt_offset138;
+ /* local ACK timeout:5, rsvd:3 */
+ u8 alt_offset139;
+
+ u8 private_data[IB_CM_REQ_PRIVATE_DATA_SIZE];
+
+} __attribute__ ((packed));
+
+static inline __be32 cm_req_get_local_qpn(struct cm_req_msg *req_msg)
+{
+ return cpu_to_be32(be32_to_cpu(req_msg->offset32) >> 8);
+}
+
+static inline void cm_req_set_local_qpn(struct cm_req_msg *req_msg, __be32 qpn)
+{
+ req_msg->offset32 = cpu_to_be32((be32_to_cpu(qpn) << 8) |
+ (be32_to_cpu(req_msg->offset32) &
+ 0x000000FF));
+}
+
+static inline u8 cm_req_get_resp_res(struct cm_req_msg *req_msg)
+{
+ return (u8) be32_to_cpu(req_msg->offset32);
+}
+
+static inline void cm_req_set_resp_res(struct cm_req_msg *req_msg, u8 resp_res)
+{
+ req_msg->offset32 = cpu_to_be32(resp_res |
+ (be32_to_cpu(req_msg->offset32) &
+ 0xFFFFFF00));
+}
+
+static inline u8 cm_req_get_init_depth(struct cm_req_msg *req_msg)
+{
+ return (u8) be32_to_cpu(req_msg->offset36);
+}
+
+static inline void cm_req_set_init_depth(struct cm_req_msg *req_msg,
+ u8 init_depth)
+{
+ req_msg->offset36 = cpu_to_be32(init_depth |
+ (be32_to_cpu(req_msg->offset36) &
+ 0xFFFFFF00));
+}
+
+static inline u8 cm_req_get_remote_resp_timeout(struct cm_req_msg *req_msg)
+{
+ return (u8) ((be32_to_cpu(req_msg->offset40) & 0xF8) >> 3);
+}
+
+static inline void cm_req_set_remote_resp_timeout(struct cm_req_msg *req_msg,
+ u8 resp_timeout)
+{
+ req_msg->offset40 = cpu_to_be32((resp_timeout << 3) |
+ (be32_to_cpu(req_msg->offset40) &
+ 0xFFFFFF07));
+}
+
+static inline enum ib_qp_type cm_req_get_qp_type(struct cm_req_msg *req_msg)
+{
+ u8 transport_type = (u8) (be32_to_cpu(req_msg->offset40) & 0x06) >> 1;
+ switch(transport_type) {
+ case 0: return IB_QPT_RC;
+ case 1: return IB_QPT_UC;
+ default: return 0;
+ }
+}
+
+static inline void cm_req_set_qp_type(struct cm_req_msg *req_msg,
+ enum ib_qp_type qp_type)
+{
+ switch(qp_type) {
+ case IB_QPT_UC:
+ req_msg->offset40 = cpu_to_be32((be32_to_cpu(
+ req_msg->offset40) &
+ 0xFFFFFFF9) | 0x2);
+ break;
+ default:
+ req_msg->offset40 = cpu_to_be32(be32_to_cpu(
+ req_msg->offset40) &
+ 0xFFFFFFF9);
+ }
+}
+
+static inline u8 cm_req_get_flow_ctrl(struct cm_req_msg *req_msg)
+{
+ return be32_to_cpu(req_msg->offset40) & 0x1;
+}
+
+static inline void cm_req_set_flow_ctrl(struct cm_req_msg *req_msg,
+ u8 flow_ctrl)
+{
+ req_msg->offset40 = cpu_to_be32((flow_ctrl & 0x1) |
+ (be32_to_cpu(req_msg->offset40) &
+ 0xFFFFFFFE));
+}
+
+static inline __be32 cm_req_get_starting_psn(struct cm_req_msg *req_msg)
+{
+ return cpu_to_be32(be32_to_cpu(req_msg->offset44) >> 8);
+}
+
+static inline void cm_req_set_starting_psn(struct cm_req_msg *req_msg,
+ __be32 starting_psn)
+{
+ req_msg->offset44 = cpu_to_be32((be32_to_cpu(starting_psn) << 8) |
+ (be32_to_cpu(req_msg->offset44) & 0x000000FF));
+}
+
+static inline u8 cm_req_get_local_resp_timeout(struct cm_req_msg *req_msg)
+{
+ return (u8) ((be32_to_cpu(req_msg->offset44) & 0xF8) >> 3);
+}
+
+static inline void cm_req_set_local_resp_timeout(struct cm_req_msg *req_msg,
+ u8 resp_timeout)
+{
+ req_msg->offset44 = cpu_to_be32((resp_timeout << 3) |
+ (be32_to_cpu(req_msg->offset44) & 0xFFFFFF07));
+}
+
+static inline u8 cm_req_get_retry_count(struct cm_req_msg *req_msg)
+{
+ return (u8) (be32_to_cpu(req_msg->offset44) & 0x7);
+}
+
+static inline void cm_req_set_retry_count(struct cm_req_msg *req_msg,
+ u8 retry_count)
+{
+ req_msg->offset44 = cpu_to_be32((retry_count & 0x7) |
+ (be32_to_cpu(req_msg->offset44) & 0xFFFFFFF8));
+}
+
+static inline u8 cm_req_get_path_mtu(struct cm_req_msg *req_msg)
+{
+ return req_msg->offset50 >> 4;
+}
+
+static inline void cm_req_set_path_mtu(struct cm_req_msg *req_msg, u8 path_mtu)
+{
+ req_msg->offset50 = (u8) ((req_msg->offset50 & 0xF) | (path_mtu << 4));
+}
+
+static inline u8 cm_req_get_rnr_retry_count(struct cm_req_msg *req_msg)
+{
+ return req_msg->offset50 & 0x7;
+}
+
+static inline void cm_req_set_rnr_retry_count(struct cm_req_msg *req_msg,
+ u8 rnr_retry_count)
+{
+ req_msg->offset50 = (u8) ((req_msg->offset50 & 0xF8) |
+ (rnr_retry_count & 0x7));
+}
+
+static inline u8 cm_req_get_max_cm_retries(struct cm_req_msg *req_msg)
+{
+ return req_msg->offset51 >> 4;
+}
+
+static inline void cm_req_set_max_cm_retries(struct cm_req_msg *req_msg,
+ u8 retries)
+{
+ req_msg->offset51 = (u8) ((req_msg->offset51 & 0xF) | (retries << 4));
+}
+
+static inline u8 cm_req_get_srq(struct cm_req_msg *req_msg)
+{
+ return (req_msg->offset51 & 0x8) >> 3;
+}
+
+static inline void cm_req_set_srq(struct cm_req_msg *req_msg, u8 srq)
+{
+ req_msg->offset51 = (u8) ((req_msg->offset51 & 0xF7) |
+ ((srq & 0x1) << 3));
+}
+
+static inline __be32 cm_req_get_primary_flow_label(struct cm_req_msg *req_msg)
+{
+ return cpu_to_be32(be32_to_cpu(req_msg->primary_offset88) >> 12);
+}
+
+static inline void cm_req_set_primary_flow_label(struct cm_req_msg *req_msg,
+ __be32 flow_label)
+{
+ req_msg->primary_offset88 = cpu_to_be32(
+ (be32_to_cpu(req_msg->primary_offset88) &
+ 0x00000FFF) |
+ (be32_to_cpu(flow_label) << 12));
+}
+
+static inline u8 cm_req_get_primary_packet_rate(struct cm_req_msg *req_msg)
+{
+ return (u8) (be32_to_cpu(req_msg->primary_offset88) & 0x3F);
+}
+
+static inline void cm_req_set_primary_packet_rate(struct cm_req_msg *req_msg,
+ u8 rate)
+{
+ req_msg->primary_offset88 = cpu_to_be32(
+ (be32_to_cpu(req_msg->primary_offset88) &
+ 0xFFFFFFC0) | (rate & 0x3F));
+}
+
+static inline u8 cm_req_get_primary_sl(struct cm_req_msg *req_msg)
+{
+ return (u8) (req_msg->primary_offset94 >> 4);
+}
+
+static inline void cm_req_set_primary_sl(struct cm_req_msg *req_msg, u8 sl)
+{
+ req_msg->primary_offset94 = (u8) ((req_msg->primary_offset94 & 0x0F) |
+ (sl << 4));
+}
+
+static inline u8 cm_req_get_primary_subnet_local(struct cm_req_msg *req_msg)
+{
+ return (u8) ((req_msg->primary_offset94 & 0x08) >> 3);
+}
+
+static inline void cm_req_set_primary_subnet_local(struct cm_req_msg *req_msg,
+ u8 subnet_local)
+{
+ req_msg->primary_offset94 = (u8) ((req_msg->primary_offset94 & 0xF7) |
+ ((subnet_local & 0x1) << 3));
+}
+
+static inline u8 cm_req_get_primary_local_ack_timeout(struct cm_req_msg *req_msg)
+{
+ return (u8) (req_msg->primary_offset95 >> 3);
+}
+
+static inline void cm_req_set_primary_local_ack_timeout(struct cm_req_msg *req_msg,
+ u8 local_ack_timeout)
+{
+ req_msg->primary_offset95 = (u8) ((req_msg->primary_offset95 & 0x07) |
+ (local_ack_timeout << 3));
+}
+
+static inline __be32 cm_req_get_alt_flow_label(struct cm_req_msg *req_msg)
+{
+ return cpu_to_be32(be32_to_cpu(req_msg->alt_offset132) >> 12);
+}
+
+static inline void cm_req_set_alt_flow_label(struct cm_req_msg *req_msg,
+ __be32 flow_label)
+{
+ req_msg->alt_offset132 = cpu_to_be32(
+ (be32_to_cpu(req_msg->alt_offset132) &
+ 0x00000FFF) |
+ (be32_to_cpu(flow_label) << 12));
+}
+
+static inline u8 cm_req_get_alt_packet_rate(struct cm_req_msg *req_msg)
+{
+ return (u8) (be32_to_cpu(req_msg->alt_offset132) & 0x3F);
+}
+
+static inline void cm_req_set_alt_packet_rate(struct cm_req_msg *req_msg,
+ u8 rate)
+{
+ req_msg->alt_offset132 = cpu_to_be32(
+ (be32_to_cpu(req_msg->alt_offset132) &
+ 0xFFFFFFC0) | (rate & 0x3F));
+}
+
+static inline u8 cm_req_get_alt_sl(struct cm_req_msg *req_msg)
+{
+ return (u8) (req_msg->alt_offset138 >> 4);
+}
+
+static inline void cm_req_set_alt_sl(struct cm_req_msg *req_msg, u8 sl)
+{
+ req_msg->alt_offset138 = (u8) ((req_msg->alt_offset138 & 0x0F) |
+ (sl << 4));
+}
+
+static inline u8 cm_req_get_alt_subnet_local(struct cm_req_msg *req_msg)
+{
+ return (u8) ((req_msg->alt_offset138 & 0x08) >> 3);
+}
+
+static inline void cm_req_set_alt_subnet_local(struct cm_req_msg *req_msg,
+ u8 subnet_local)
+{
+ req_msg->alt_offset138 = (u8) ((req_msg->alt_offset138 & 0xF7) |
+ ((subnet_local & 0x1) << 3));
+}
+
+static inline u8 cm_req_get_alt_local_ack_timeout(struct cm_req_msg *req_msg)
+{
+ return (u8) (req_msg->alt_offset139 >> 3);
+}
+
+static inline void cm_req_set_alt_local_ack_timeout(struct cm_req_msg *req_msg,
+ u8 local_ack_timeout)
+{
+ req_msg->alt_offset139 = (u8) ((req_msg->alt_offset139 & 0x07) |
+ (local_ack_timeout << 3));
+}
+
+/* Message REJected or MRAed */
+enum cm_msg_response {
+ CM_MSG_RESPONSE_REQ = 0x0,
+ CM_MSG_RESPONSE_REP = 0x1,
+ CM_MSG_RESPONSE_OTHER = 0x2
+};
+
+ struct cm_mra_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 local_comm_id;
+ __be32 remote_comm_id;
+ /* message MRAed:2, rsvd:6 */
+ u8 offset8;
+ /* service timeout:5, rsvd:3 */
+ u8 offset9;
+
+ u8 private_data[IB_CM_MRA_PRIVATE_DATA_SIZE];
+
+} __attribute__ ((packed));
+
+static inline u8 cm_mra_get_msg_mraed(struct cm_mra_msg *mra_msg)
+{
+ return (u8) (mra_msg->offset8 >> 6);
+}
+
+static inline void cm_mra_set_msg_mraed(struct cm_mra_msg *mra_msg, u8 msg)
+{
+ mra_msg->offset8 = (u8) ((mra_msg->offset8 & 0x3F) | (msg << 6));
+}
+
+static inline u8 cm_mra_get_service_timeout(struct cm_mra_msg *mra_msg)
+{
+ return (u8) (mra_msg->offset9 >> 3);
+}
+
+static inline void cm_mra_set_service_timeout(struct cm_mra_msg *mra_msg,
+ u8 service_timeout)
+{
+ mra_msg->offset9 = (u8) ((mra_msg->offset9 & 0x07) |
+ (service_timeout << 3));
+}
+
+struct cm_rej_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 local_comm_id;
+ __be32 remote_comm_id;
+ /* message REJected:2, rsvd:6 */
+ u8 offset8;
+ /* reject info length:7, rsvd:1. */
+ u8 offset9;
+ __be16 reason;
+ u8 ari[IB_CM_REJ_ARI_LENGTH];
+
+ u8 private_data[IB_CM_REJ_PRIVATE_DATA_SIZE];
+
+} __attribute__ ((packed));
+
+static inline u8 cm_rej_get_msg_rejected(struct cm_rej_msg *rej_msg)
+{
+ return (u8) (rej_msg->offset8 >> 6);
+}
+
+static inline void cm_rej_set_msg_rejected(struct cm_rej_msg *rej_msg, u8 msg)
+{
+ rej_msg->offset8 = (u8) ((rej_msg->offset8 & 0x3F) | (msg << 6));
+}
+
+static inline u8 cm_rej_get_reject_info_len(struct cm_rej_msg *rej_msg)
+{
+ return (u8) (rej_msg->offset9 >> 1);
+}
+
+static inline void cm_rej_set_reject_info_len(struct cm_rej_msg *rej_msg,
+ u8 len)
+{
+ rej_msg->offset9 = (u8) ((rej_msg->offset9 & 0x1) | (len << 1));
+}
+
+struct cm_rep_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 local_comm_id;
+ __be32 remote_comm_id;
+ __be32 local_qkey;
+ /* local QPN:24, rsvd:8 */
+ __be32 offset12;
+ /* local EECN:24, rsvd:8 */
+ __be32 offset16;
+ /* starting PSN:24 rsvd:8 */
+ __be32 offset20;
+ u8 resp_resources;
+ u8 initiator_depth;
+ /* target ACK delay:5, failover accepted:2, end-to-end flow control:1 */
+ u8 offset26;
+ /* RNR retry count:3, SRQ:1, rsvd:5 */
+ u8 offset27;
+ __be64 local_ca_guid;
+
+ u8 private_data[IB_CM_REP_PRIVATE_DATA_SIZE];
+
+} __attribute__ ((packed));
+
+static inline __be32 cm_rep_get_local_qpn(struct cm_rep_msg *rep_msg)
+{
+ return cpu_to_be32(be32_to_cpu(rep_msg->offset12) >> 8);
+}
+
+static inline void cm_rep_set_local_qpn(struct cm_rep_msg *rep_msg, __be32 qpn)
+{
+ rep_msg->offset12 = cpu_to_be32((be32_to_cpu(qpn) << 8) |
+ (be32_to_cpu(rep_msg->offset12) & 0x000000FF));
+}
+
+static inline __be32 cm_rep_get_starting_psn(struct cm_rep_msg *rep_msg)
+{
+ return cpu_to_be32(be32_to_cpu(rep_msg->offset20) >> 8);
+}
+
+static inline void cm_rep_set_starting_psn(struct cm_rep_msg *rep_msg,
+ __be32 starting_psn)
+{
+ rep_msg->offset20 = cpu_to_be32((be32_to_cpu(starting_psn) << 8) |
+ (be32_to_cpu(rep_msg->offset20) & 0x000000FF));
+}
+
+static inline u8 cm_rep_get_target_ack_delay(struct cm_rep_msg *rep_msg)
+{
+ return (u8) (rep_msg->offset26 >> 3);
+}
+
+static inline void cm_rep_set_target_ack_delay(struct cm_rep_msg *rep_msg,
+ u8 target_ack_delay)
+{
+ rep_msg->offset26 = (u8) ((rep_msg->offset26 & 0x07) |
+ (target_ack_delay << 3));
+}
+
+static inline u8 cm_rep_get_failover(struct cm_rep_msg *rep_msg)
+{
+ return (u8) ((rep_msg->offset26 & 0x06) >> 1);
+}
+
+static inline void cm_rep_set_failover(struct cm_rep_msg *rep_msg, u8 failover)
+{
+ rep_msg->offset26 = (u8) ((rep_msg->offset26 & 0xF9) |
+ ((failover & 0x3) << 1));
+}
+
+static inline u8 cm_rep_get_flow_ctrl(struct cm_rep_msg *rep_msg)
+{
+ return (u8) (rep_msg->offset26 & 0x01);
+}
+
+static inline void cm_rep_set_flow_ctrl(struct cm_rep_msg *rep_msg,
+ u8 flow_ctrl)
+{
+ rep_msg->offset26 = (u8) ((rep_msg->offset26 & 0xFE) |
+ (flow_ctrl & 0x1));
+}
+
+static inline u8 cm_rep_get_rnr_retry_count(struct cm_rep_msg *rep_msg)
+{
+ return (u8) (rep_msg->offset27 >> 5);
+}
+
+static inline void cm_rep_set_rnr_retry_count(struct cm_rep_msg *rep_msg,
+ u8 rnr_retry_count)
+{
+ rep_msg->offset27 = (u8) ((rep_msg->offset27 & 0x1F) |
+ (rnr_retry_count << 5));
+}
+
+static inline u8 cm_rep_get_srq(struct cm_rep_msg *rep_msg)
+{
+ return (u8) ((rep_msg->offset27 >> 4) & 0x1);
+}
+
+static inline void cm_rep_set_srq(struct cm_rep_msg *rep_msg, u8 srq)
+{
+ rep_msg->offset27 = (u8) ((rep_msg->offset27 & 0xEF) |
+ ((srq & 0x1) << 4));
+}
+
+struct cm_rtu_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 local_comm_id;
+ __be32 remote_comm_id;
+
+ u8 private_data[IB_CM_RTU_PRIVATE_DATA_SIZE];
+
+} __attribute__ ((packed));
+
+struct cm_dreq_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 local_comm_id;
+ __be32 remote_comm_id;
+ /* remote QPN/EECN:24, rsvd:8 */
+ __be32 offset8;
+
+ u8 private_data[IB_CM_DREQ_PRIVATE_DATA_SIZE];
+
+} __attribute__ ((packed));
+
+static inline __be32 cm_dreq_get_remote_qpn(struct cm_dreq_msg *dreq_msg)
+{
+ return cpu_to_be32(be32_to_cpu(dreq_msg->offset8) >> 8);
+}
+
+static inline void cm_dreq_set_remote_qpn(struct cm_dreq_msg *dreq_msg, __be32 qpn)
+{
+ dreq_msg->offset8 = cpu_to_be32((be32_to_cpu(qpn) << 8) |
+ (be32_to_cpu(dreq_msg->offset8) & 0x000000FF));
+}
+
+struct cm_drep_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 local_comm_id;
+ __be32 remote_comm_id;
+
+ u8 private_data[IB_CM_DREP_PRIVATE_DATA_SIZE];
+
+} __attribute__ ((packed));
+
+struct cm_lap_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 local_comm_id;
+ __be32 remote_comm_id;
+
+ __be32 rsvd8;
+ /* remote QPN/EECN:24, remote CM response timeout:5, rsvd:3 */
+ __be32 offset12;
+ __be32 rsvd16;
+
+ __be16 alt_local_lid;
+ __be16 alt_remote_lid;
+ union ib_gid alt_local_gid;
+ union ib_gid alt_remote_gid;
+ /* flow label:20, rsvd:4, traffic class:8 */
+ __be32 offset56;
+ u8 alt_hop_limit;
+ /* rsvd:2, packet rate:6 */
+ u8 offset61;
+ /* SL:4, subnet local:1, rsvd:3 */
+ u8 offset62;
+ /* local ACK timeout:5, rsvd:3 */
+ u8 offset63;
+
+ u8 private_data[IB_CM_LAP_PRIVATE_DATA_SIZE];
+} __attribute__ ((packed));
+
+static inline __be32 cm_lap_get_remote_qpn(struct cm_lap_msg *lap_msg)
+{
+ return cpu_to_be32(be32_to_cpu(lap_msg->offset12) >> 8);
+}
+
+static inline void cm_lap_set_remote_qpn(struct cm_lap_msg *lap_msg, __be32 qpn)
+{
+ lap_msg->offset12 = cpu_to_be32((be32_to_cpu(qpn) << 8) |
+ (be32_to_cpu(lap_msg->offset12) &
+ 0x000000FF));
+}
+
+static inline u8 cm_lap_get_remote_resp_timeout(struct cm_lap_msg *lap_msg)
+{
+ return (u8) ((be32_to_cpu(lap_msg->offset12) & 0xF8) >> 3);
+}
+
+static inline void cm_lap_set_remote_resp_timeout(struct cm_lap_msg *lap_msg,
+ u8 resp_timeout)
+{
+ lap_msg->offset12 = cpu_to_be32((resp_timeout << 3) |
+ (be32_to_cpu(lap_msg->offset12) &
+ 0xFFFFFF07));
+}
+
+static inline __be32 cm_lap_get_flow_label(struct cm_lap_msg *lap_msg)
+{
+ return cpu_to_be32(be32_to_cpu(lap_msg->offset56) >> 12);
+}
+
+static inline void cm_lap_set_flow_label(struct cm_lap_msg *lap_msg,
+ __be32 flow_label)
+{
+ lap_msg->offset56 = cpu_to_be32(
+ (be32_to_cpu(lap_msg->offset56) & 0x00000FFF) |
+ (be32_to_cpu(flow_label) << 12));
+}
+
+static inline u8 cm_lap_get_traffic_class(struct cm_lap_msg *lap_msg)
+{
+ return (u8) be32_to_cpu(lap_msg->offset56);
+}
+
+static inline void cm_lap_set_traffic_class(struct cm_lap_msg *lap_msg,
+ u8 traffic_class)
+{
+ lap_msg->offset56 = cpu_to_be32(traffic_class |
+ (be32_to_cpu(lap_msg->offset56) &
+ 0xFFFFFF00));
+}
+
+static inline u8 cm_lap_get_packet_rate(struct cm_lap_msg *lap_msg)
+{
+ return lap_msg->offset61 & 0x3F;
+}
+
+static inline void cm_lap_set_packet_rate(struct cm_lap_msg *lap_msg,
+ u8 packet_rate)
+{
+ lap_msg->offset61 = (packet_rate & 0x3F) | (lap_msg->offset61 & 0xC0);
+}
+
+static inline u8 cm_lap_get_sl(struct cm_lap_msg *lap_msg)
+{
+ return lap_msg->offset62 >> 4;
+}
+
+static inline void cm_lap_set_sl(struct cm_lap_msg *lap_msg, u8 sl)
+{
+ lap_msg->offset62 = (sl << 4) | (lap_msg->offset62 & 0x0F);
+}
+
+static inline u8 cm_lap_get_subnet_local(struct cm_lap_msg *lap_msg)
+{
+ return (lap_msg->offset62 >> 3) & 0x1;
+}
+
+static inline void cm_lap_set_subnet_local(struct cm_lap_msg *lap_msg,
+ u8 subnet_local)
+{
+ lap_msg->offset62 = ((subnet_local & 0x1) << 3) |
+ (lap_msg->offset61 & 0xF7);
+}
+static inline u8 cm_lap_get_local_ack_timeout(struct cm_lap_msg *lap_msg)
+{
+ return lap_msg->offset63 >> 3;
+}
+
+static inline void cm_lap_set_local_ack_timeout(struct cm_lap_msg *lap_msg,
+ u8 local_ack_timeout)
+{
+ lap_msg->offset63 = (local_ack_timeout << 3) |
+ (lap_msg->offset63 & 0x07);
+}
+
+struct cm_apr_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 local_comm_id;
+ __be32 remote_comm_id;
+
+ u8 info_length;
+ u8 ap_status;
+ u8 info[IB_CM_APR_INFO_LENGTH];
+
+ u8 private_data[IB_CM_APR_PRIVATE_DATA_SIZE];
+} __attribute__ ((packed));
+
+struct cm_sidr_req_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 request_id;
+ __be16 pkey;
+ __be16 rsvd;
+ __be64 service_id;
+
+ u8 private_data[IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE];
+} __attribute__ ((packed));
+
+struct cm_sidr_rep_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 request_id;
+ u8 status;
+ u8 info_length;
+ __be16 rsvd;
+ /* QPN:24, rsvd:8 */
+ __be32 offset8;
+ __be64 service_id;
+ __be32 qkey;
+ u8 info[IB_CM_SIDR_REP_INFO_LENGTH];
+
+ u8 private_data[IB_CM_SIDR_REP_PRIVATE_DATA_SIZE];
+} __attribute__ ((packed));
+
+static inline __be32 cm_sidr_rep_get_qpn(struct cm_sidr_rep_msg *sidr_rep_msg)
+{
+ return cpu_to_be32(be32_to_cpu(sidr_rep_msg->offset8) >> 8);
+}
+
+static inline void cm_sidr_rep_set_qpn(struct cm_sidr_rep_msg *sidr_rep_msg,
+ __be32 qpn)
+{
+ sidr_rep_msg->offset8 = cpu_to_be32((be32_to_cpu(qpn) << 8) |
+ (be32_to_cpu(sidr_rep_msg->offset8) &
+ 0x000000FF));
+}
+
+#endif /* CM_MSGS_H */
diff --git a/sys/ofed/drivers/infiniband/core/cma.c b/sys/ofed/drivers/infiniband/core/cma.c
new file mode 100644
index 0000000..c016451
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/cma.c
@@ -0,0 +1,3386 @@
+/*
+ * Copyright (c) 2005 Voltaire Inc. All rights reserved.
+ * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved.
+ * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved.
+ * Copyright (c) 2005-2006 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/mutex.h>
+#include <linux/random.h>
+#include <linux/idr.h>
+#include <linux/inetdevice.h>
+
+#include <net/tcp.h>
+#include <net/ipv6.h>
+
+#include <rdma/rdma_cm.h>
+#include <rdma/rdma_cm_ib.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_cm.h>
+#include <rdma/ib_sa.h>
+#include <rdma/iw_cm.h>
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("Generic RDMA CM Agent");
+MODULE_LICENSE("Dual BSD/GPL");
+
+static int tavor_quirk = 0;
+module_param_named(tavor_quirk, tavor_quirk, int, 0644);
+MODULE_PARM_DESC(tavor_quirk, "Tavor performance quirk: limit MTU to 1K if > 0");
+
+int unify_tcp_port_space = 0;
+module_param(unify_tcp_port_space, int, 0644);
+MODULE_PARM_DESC(unify_tcp_port_space, "Unify the host TCP and RDMA port "
+ "space allocation (default=0)");
+
+#define CMA_CM_RESPONSE_TIMEOUT 20
+#define CMA_MAX_CM_RETRIES 15
+#define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24)
+#define IBOE_PACKET_LIFETIME 18
+
+static int cma_response_timeout = CMA_CM_RESPONSE_TIMEOUT;
+module_param_named(cma_response_timeout, cma_response_timeout, int, 0644);
+MODULE_PARM_DESC(cma_response_timeout, "CMA_CM_RESPONSE_TIMEOUT default=20");
+
+static int def_prec2sl = 3;
+module_param_named(def_prec2sl, def_prec2sl, int, 0644);
+MODULE_PARM_DESC(def_prec2sl, "Default value for SL priority with RoCE. Valid values 0 - 7");
+
+static void cma_add_one(struct ib_device *device);
+static void cma_remove_one(struct ib_device *device);
+
+static struct ib_client cma_client = {
+ .name = "cma",
+ .add = cma_add_one,
+ .remove = cma_remove_one
+};
+
+static struct ib_sa_client sa_client;
+static struct rdma_addr_client addr_client;
+static LIST_HEAD(dev_list);
+static LIST_HEAD(listen_any_list);
+static DEFINE_MUTEX(lock);
+static struct workqueue_struct *cma_wq;
+static DEFINE_IDR(sdp_ps);
+static DEFINE_IDR(tcp_ps);
+static DEFINE_IDR(udp_ps);
+static DEFINE_IDR(ipoib_ps);
+static int next_port;
+
+struct cma_device {
+ struct list_head list;
+ struct ib_device *device;
+ struct completion comp;
+ atomic_t refcount;
+ struct list_head id_list;
+};
+
+enum cma_state {
+ CMA_IDLE,
+ CMA_ADDR_QUERY,
+ CMA_ADDR_RESOLVED,
+ CMA_ROUTE_QUERY,
+ CMA_ROUTE_RESOLVED,
+ CMA_CONNECT,
+ CMA_DISCONNECT,
+ CMA_ADDR_BOUND,
+ CMA_LISTEN,
+ CMA_DEVICE_REMOVAL,
+ CMA_DESTROYING
+};
+
+struct rdma_bind_list {
+ struct idr *ps;
+ struct hlist_head owners;
+ unsigned short port;
+};
+
+/*
+ * Device removal can occur at anytime, so we need extra handling to
+ * serialize notifying the user of device removal with other callbacks.
+ * We do this by disabling removal notification while a callback is in process,
+ * and reporting it after the callback completes.
+ */
+struct rdma_id_private {
+ struct rdma_cm_id id;
+
+ struct rdma_bind_list *bind_list;
+ struct socket *sock;
+ struct hlist_node node;
+ struct list_head list; /* listen_any_list or cma_device.list */
+ struct list_head listen_list; /* per device listens */
+ struct cma_device *cma_dev;
+ struct list_head mc_list;
+
+ int internal_id;
+ enum cma_state state;
+ spinlock_t lock;
+ struct mutex qp_mutex;
+
+ struct completion comp;
+ atomic_t refcount;
+ struct mutex handler_mutex;
+
+ int backlog;
+ int timeout_ms;
+ struct ib_sa_query *query;
+ int query_id;
+ union {
+ struct ib_cm_id *ib;
+ struct iw_cm_id *iw;
+ } cm_id;
+
+ u32 seq_num;
+ u32 qkey;
+ u32 qp_num;
+ u8 srq;
+ u8 tos;
+};
+
+struct cma_multicast {
+ struct rdma_id_private *id_priv;
+ union {
+ struct ib_sa_multicast *ib;
+ } multicast;
+ struct list_head list;
+ void *context;
+ struct sockaddr_storage addr;
+ struct kref mcref;
+};
+
+struct cma_work {
+ struct work_struct work;
+ struct rdma_id_private *id;
+ enum cma_state old_state;
+ enum cma_state new_state;
+ struct rdma_cm_event event;
+};
+
+struct cma_ndev_work {
+ struct work_struct work;
+ struct rdma_id_private *id;
+ struct rdma_cm_event event;
+};
+
+struct iboe_mcast_work {
+ struct work_struct work;
+ struct rdma_id_private *id;
+ struct cma_multicast *mc;
+};
+
+union cma_ip_addr {
+ struct in6_addr ip6;
+ struct {
+ __be32 pad[3];
+ __be32 addr;
+ } ip4;
+};
+
+struct cma_hdr {
+ u8 cma_version;
+ u8 ip_version; /* IP version: 7:4 */
+ __be16 port;
+ union cma_ip_addr src_addr;
+ union cma_ip_addr dst_addr;
+};
+
+struct sdp_hh {
+ u8 bsdh[16];
+ u8 sdp_version; /* Major version: 7:4 */
+ u8 ip_version; /* IP version: 7:4 */
+ u8 sdp_specific1[10];
+ __be16 port;
+ __be16 sdp_specific2;
+ union cma_ip_addr src_addr;
+ union cma_ip_addr dst_addr;
+};
+
+struct sdp_hah {
+ u8 bsdh[16];
+ u8 sdp_version;
+};
+
+#define CMA_VERSION 0x00
+#define SDP_MAJ_VERSION 0x2
+
+static int cma_comp(struct rdma_id_private *id_priv, enum cma_state comp)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&id_priv->lock, flags);
+ ret = (id_priv->state == comp);
+ spin_unlock_irqrestore(&id_priv->lock, flags);
+ return ret;
+}
+
+static int cma_comp_exch(struct rdma_id_private *id_priv,
+ enum cma_state comp, enum cma_state exch)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&id_priv->lock, flags);
+ if ((ret = (id_priv->state == comp)))
+ id_priv->state = exch;
+ spin_unlock_irqrestore(&id_priv->lock, flags);
+ return ret;
+}
+
+static enum cma_state cma_exch(struct rdma_id_private *id_priv,
+ enum cma_state exch)
+{
+ unsigned long flags;
+ enum cma_state old;
+
+ spin_lock_irqsave(&id_priv->lock, flags);
+ old = id_priv->state;
+ id_priv->state = exch;
+ spin_unlock_irqrestore(&id_priv->lock, flags);
+ return old;
+}
+
+static inline u8 cma_get_ip_ver(struct cma_hdr *hdr)
+{
+ return hdr->ip_version >> 4;
+}
+
+static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver)
+{
+ hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF);
+}
+
+static inline u8 sdp_get_majv(u8 sdp_version)
+{
+ return sdp_version >> 4;
+}
+
+static inline u8 sdp_get_ip_ver(struct sdp_hh *hh)
+{
+ return hh->ip_version >> 4;
+}
+
+static inline void sdp_set_ip_ver(struct sdp_hh *hh, u8 ip_ver)
+{
+ hh->ip_version = (ip_ver << 4) | (hh->ip_version & 0xF);
+}
+
+static inline int cma_is_ud_ps(enum rdma_port_space ps)
+{
+ return (ps == RDMA_PS_UDP || ps == RDMA_PS_IPOIB);
+}
+
+static void cma_attach_to_dev(struct rdma_id_private *id_priv,
+ struct cma_device *cma_dev)
+{
+ atomic_inc(&cma_dev->refcount);
+ id_priv->cma_dev = cma_dev;
+ id_priv->id.device = cma_dev->device;
+ id_priv->id.route.addr.dev_addr.transport =
+ rdma_node_get_transport(cma_dev->device->node_type);
+ list_add_tail(&id_priv->list, &cma_dev->id_list);
+}
+
+static inline void cma_deref_dev(struct cma_device *cma_dev)
+{
+ if (atomic_dec_and_test(&cma_dev->refcount))
+ complete(&cma_dev->comp);
+}
+
+static inline void release_mc(struct kref *kref)
+{
+ struct cma_multicast *mc = container_of(kref, struct cma_multicast, mcref);
+
+ kfree(mc->multicast.ib);
+ kfree(mc);
+}
+
+static void cma_detach_from_dev(struct rdma_id_private *id_priv)
+{
+ list_del(&id_priv->list);
+ cma_deref_dev(id_priv->cma_dev);
+ id_priv->cma_dev = NULL;
+}
+
+static int cma_set_qkey(struct rdma_id_private *id_priv)
+{
+ struct ib_sa_mcmember_rec rec;
+ int ret = 0;
+
+ if (id_priv->qkey)
+ return 0;
+
+ switch (id_priv->id.ps) {
+ case RDMA_PS_UDP:
+ id_priv->qkey = RDMA_UDP_QKEY;
+ break;
+ case RDMA_PS_IPOIB:
+ ib_addr_get_mgid(&id_priv->id.route.addr.dev_addr, &rec.mgid);
+ ret = ib_sa_get_mcmember_rec(id_priv->id.device,
+ id_priv->id.port_num, &rec.mgid,
+ &rec);
+ if (!ret)
+ id_priv->qkey = be32_to_cpu(rec.qkey);
+ break;
+ default:
+ break;
+ }
+ return ret;
+}
+
+static int cma_acquire_dev(struct rdma_id_private *id_priv)
+{
+ struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+ struct cma_device *cma_dev;
+ union ib_gid gid;
+ int ret = -ENODEV;
+
+ if (dev_addr->dev_type != ARPHRD_INFINIBAND) {
+ iboe_addr_get_sgid(dev_addr, &gid);
+ list_for_each_entry(cma_dev, &dev_list, list) {
+ ret = ib_find_cached_gid(cma_dev->device, &gid,
+ &id_priv->id.port_num, NULL);
+ if (!ret)
+ goto out;
+ }
+ }
+
+ memcpy(&gid, dev_addr->src_dev_addr +
+ rdma_addr_gid_offset(dev_addr), sizeof gid);
+ list_for_each_entry(cma_dev, &dev_list, list) {
+ ret = ib_find_cached_gid(cma_dev->device, &gid,
+ &id_priv->id.port_num, NULL);
+ if (!ret)
+ break;
+ }
+
+out:
+ if (!ret)
+ cma_attach_to_dev(id_priv, cma_dev);
+
+ return ret;
+}
+
+static void cma_deref_id(struct rdma_id_private *id_priv)
+{
+ if (atomic_dec_and_test(&id_priv->refcount))
+ complete(&id_priv->comp);
+}
+
+static int cma_disable_callback(struct rdma_id_private *id_priv,
+ enum cma_state state)
+{
+ mutex_lock(&id_priv->handler_mutex);
+ if (id_priv->state != state) {
+ mutex_unlock(&id_priv->handler_mutex);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int cma_has_cm_dev(struct rdma_id_private *id_priv)
+{
+ return (id_priv->id.device && id_priv->cm_id.ib);
+}
+
+struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler,
+ void *context, enum rdma_port_space ps)
+{
+ struct rdma_id_private *id_priv;
+
+ id_priv = kzalloc(sizeof *id_priv, GFP_KERNEL);
+ if (!id_priv)
+ return ERR_PTR(-ENOMEM);
+
+ id_priv->state = CMA_IDLE;
+ id_priv->id.context = context;
+ id_priv->id.event_handler = event_handler;
+ id_priv->id.ps = ps;
+ spin_lock_init(&id_priv->lock);
+ mutex_init(&id_priv->qp_mutex);
+ init_completion(&id_priv->comp);
+ atomic_set(&id_priv->refcount, 1);
+ mutex_init(&id_priv->handler_mutex);
+ INIT_LIST_HEAD(&id_priv->listen_list);
+ INIT_LIST_HEAD(&id_priv->mc_list);
+ get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num);
+
+ return &id_priv->id;
+}
+EXPORT_SYMBOL(rdma_create_id);
+
+static int cma_init_ud_qp(struct rdma_id_private *id_priv, struct ib_qp *qp)
+{
+ struct ib_qp_attr qp_attr;
+ int qp_attr_mask, ret;
+
+ qp_attr.qp_state = IB_QPS_INIT;
+ ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
+ if (ret)
+ return ret;
+
+ ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+ if (ret)
+ return ret;
+
+ qp_attr.qp_state = IB_QPS_RTR;
+ ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE);
+ if (ret)
+ return ret;
+
+ qp_attr.qp_state = IB_QPS_RTS;
+ qp_attr.sq_psn = 0;
+ ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE | IB_QP_SQ_PSN);
+
+ return ret;
+}
+
+static int cma_init_conn_qp(struct rdma_id_private *id_priv, struct ib_qp *qp)
+{
+ struct ib_qp_attr qp_attr;
+ int qp_attr_mask, ret;
+
+ qp_attr.qp_state = IB_QPS_INIT;
+ ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
+ if (ret)
+ return ret;
+
+ return ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+}
+
+int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd,
+ struct ib_qp_init_attr *qp_init_attr)
+{
+ struct rdma_id_private *id_priv;
+ struct ib_qp *qp;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (id->device != pd->device)
+ return -EINVAL;
+
+ qp = ib_create_qp(pd, qp_init_attr);
+ if (IS_ERR(qp))
+ return PTR_ERR(qp);
+
+ if (cma_is_ud_ps(id_priv->id.ps))
+ ret = cma_init_ud_qp(id_priv, qp);
+ else
+ ret = cma_init_conn_qp(id_priv, qp);
+ if (ret)
+ goto err;
+
+ id->qp = qp;
+ id_priv->qp_num = qp->qp_num;
+ id_priv->srq = (qp->srq != NULL);
+ return 0;
+err:
+ ib_destroy_qp(qp);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_create_qp);
+
+void rdma_destroy_qp(struct rdma_cm_id *id)
+{
+ struct rdma_id_private *id_priv;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ mutex_lock(&id_priv->qp_mutex);
+ ib_destroy_qp(id_priv->id.qp);
+ id_priv->id.qp = NULL;
+ mutex_unlock(&id_priv->qp_mutex);
+}
+EXPORT_SYMBOL(rdma_destroy_qp);
+
+static int cma_modify_qp_rtr(struct rdma_id_private *id_priv,
+ struct rdma_conn_param *conn_param)
+{
+ struct ib_qp_attr qp_attr;
+ int qp_attr_mask, ret;
+
+ mutex_lock(&id_priv->qp_mutex);
+ if (!id_priv->id.qp) {
+ ret = 0;
+ goto out;
+ }
+
+ /* Need to update QP attributes from default values. */
+ qp_attr.qp_state = IB_QPS_INIT;
+ ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
+ if (ret)
+ goto out;
+
+ ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
+ if (ret)
+ goto out;
+
+ qp_attr.qp_state = IB_QPS_RTR;
+ ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
+ if (ret)
+ goto out;
+
+ if (conn_param)
+ qp_attr.max_dest_rd_atomic = conn_param->responder_resources;
+ ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
+out:
+ mutex_unlock(&id_priv->qp_mutex);
+ return ret;
+}
+
+static int cma_modify_qp_rts(struct rdma_id_private *id_priv,
+ struct rdma_conn_param *conn_param)
+{
+ struct ib_qp_attr qp_attr;
+ int qp_attr_mask, ret;
+
+ mutex_lock(&id_priv->qp_mutex);
+ if (!id_priv->id.qp) {
+ ret = 0;
+ goto out;
+ }
+
+ qp_attr.qp_state = IB_QPS_RTS;
+ ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
+ if (ret)
+ goto out;
+
+ if (conn_param)
+ qp_attr.max_rd_atomic = conn_param->initiator_depth;
+ ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
+out:
+ mutex_unlock(&id_priv->qp_mutex);
+ return ret;
+}
+
+static int cma_modify_qp_err(struct rdma_id_private *id_priv)
+{
+ struct ib_qp_attr qp_attr;
+ int ret;
+
+ mutex_lock(&id_priv->qp_mutex);
+ if (!id_priv->id.qp) {
+ ret = 0;
+ goto out;
+ }
+
+ qp_attr.qp_state = IB_QPS_ERR;
+ ret = ib_modify_qp(id_priv->id.qp, &qp_attr, IB_QP_STATE);
+out:
+ mutex_unlock(&id_priv->qp_mutex);
+ return ret;
+}
+
+static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv,
+ struct ib_qp_attr *qp_attr, int *qp_attr_mask)
+{
+ struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+ int ret;
+ u16 pkey;
+
+ if (rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num) ==
+ IB_LINK_LAYER_INFINIBAND)
+ pkey = ib_addr_get_pkey(dev_addr);
+ else
+ pkey = 0xffff;
+
+ ret = ib_find_cached_pkey(id_priv->id.device, id_priv->id.port_num,
+ pkey, &qp_attr->pkey_index);
+ if (ret)
+ return ret;
+
+ qp_attr->port_num = id_priv->id.port_num;
+ *qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT;
+
+ if (cma_is_ud_ps(id_priv->id.ps)) {
+ ret = cma_set_qkey(id_priv);
+ if (ret)
+ return ret;
+
+ qp_attr->qkey = id_priv->qkey;
+ *qp_attr_mask |= IB_QP_QKEY;
+ } else {
+ qp_attr->qp_access_flags = 0;
+ *qp_attr_mask |= IB_QP_ACCESS_FLAGS;
+ }
+ return 0;
+}
+
+int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask)
+{
+ struct rdma_id_private *id_priv;
+ int ret = 0;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ switch (rdma_node_get_transport(id_priv->id.device->node_type)) {
+ case RDMA_TRANSPORT_IB:
+ if (!id_priv->cm_id.ib || cma_is_ud_ps(id_priv->id.ps))
+ ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask);
+ else
+ ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, qp_attr,
+ qp_attr_mask);
+ if (qp_attr->qp_state == IB_QPS_RTR)
+ qp_attr->rq_psn = id_priv->seq_num;
+ break;
+ case RDMA_TRANSPORT_IWARP:
+ if (!id_priv->cm_id.iw) {
+ qp_attr->qp_access_flags = 0;
+ *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS;
+ } else
+ ret = iw_cm_init_qp_attr(id_priv->cm_id.iw, qp_attr,
+ qp_attr_mask);
+ break;
+ default:
+ ret = -ENOSYS;
+ break;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(rdma_init_qp_attr);
+
+static inline int cma_zero_addr(struct sockaddr *addr)
+{
+ struct in6_addr *ip6;
+
+ if (addr->sa_family == AF_INET)
+ return ipv4_is_zeronet(
+ ((struct sockaddr_in *)addr)->sin_addr.s_addr);
+ else {
+ ip6 = &((struct sockaddr_in6 *) addr)->sin6_addr;
+ return (ip6->s6_addr32[0] | ip6->s6_addr32[1] |
+ ip6->s6_addr32[2] | ip6->s6_addr32[3]) == 0;
+ }
+}
+
+static inline int cma_loopback_addr(struct sockaddr *addr)
+{
+ if (addr->sa_family == AF_INET)
+ return ipv4_is_loopback(
+ ((struct sockaddr_in *) addr)->sin_addr.s_addr);
+ else
+ return ipv6_addr_loopback(
+ &((struct sockaddr_in6 *) addr)->sin6_addr);
+}
+
+static inline int cma_any_addr(struct sockaddr *addr)
+{
+ return cma_zero_addr(addr) || cma_loopback_addr(addr);
+}
+
+static inline __be16 cma_port(struct sockaddr *addr)
+{
+ if (addr->sa_family == AF_INET)
+ return ((struct sockaddr_in *) addr)->sin_port;
+ else
+ return ((struct sockaddr_in6 *) addr)->sin6_port;
+}
+
+static inline int cma_any_port(struct sockaddr *addr)
+{
+ return !cma_port(addr);
+}
+
+static int cma_get_net_info(void *hdr, enum rdma_port_space ps,
+ u8 *ip_ver, __be16 *port,
+ union cma_ip_addr **src, union cma_ip_addr **dst)
+{
+ switch (ps) {
+ case RDMA_PS_SDP:
+ if (sdp_get_majv(((struct sdp_hh *) hdr)->sdp_version) !=
+ SDP_MAJ_VERSION)
+ return -EINVAL;
+
+ *ip_ver = sdp_get_ip_ver(hdr);
+ *port = ((struct sdp_hh *) hdr)->port;
+ *src = &((struct sdp_hh *) hdr)->src_addr;
+ *dst = &((struct sdp_hh *) hdr)->dst_addr;
+ break;
+ default:
+ if (((struct cma_hdr *) hdr)->cma_version != CMA_VERSION)
+ return -EINVAL;
+
+ *ip_ver = cma_get_ip_ver(hdr);
+ *port = ((struct cma_hdr *) hdr)->port;
+ *src = &((struct cma_hdr *) hdr)->src_addr;
+ *dst = &((struct cma_hdr *) hdr)->dst_addr;
+ break;
+ }
+
+ if (*ip_ver != 4 && *ip_ver != 6)
+ return -EINVAL;
+ return 0;
+}
+
+static void cma_save_net_info(struct rdma_addr *addr,
+ struct rdma_addr *listen_addr,
+ u8 ip_ver, __be16 port,
+ union cma_ip_addr *src, union cma_ip_addr *dst)
+{
+ struct sockaddr_in *listen4, *ip4;
+ struct sockaddr_in6 *listen6, *ip6;
+
+ switch (ip_ver) {
+ case 4:
+ listen4 = (struct sockaddr_in *) &listen_addr->src_addr;
+ ip4 = (struct sockaddr_in *) &addr->src_addr;
+ ip4->sin_family = listen4->sin_family;
+ ip4->sin_addr.s_addr = dst->ip4.addr;
+ ip4->sin_port = listen4->sin_port;
+
+ ip4 = (struct sockaddr_in *) &addr->dst_addr;
+ ip4->sin_family = listen4->sin_family;
+ ip4->sin_addr.s_addr = src->ip4.addr;
+ ip4->sin_port = port;
+ break;
+ case 6:
+ listen6 = (struct sockaddr_in6 *) &listen_addr->src_addr;
+ ip6 = (struct sockaddr_in6 *) &addr->src_addr;
+ ip6->sin6_family = listen6->sin6_family;
+ ip6->sin6_addr = dst->ip6;
+ ip6->sin6_port = listen6->sin6_port;
+
+ ip6 = (struct sockaddr_in6 *) &addr->dst_addr;
+ ip6->sin6_family = listen6->sin6_family;
+ ip6->sin6_addr = src->ip6;
+ ip6->sin6_port = port;
+ break;
+ default:
+ break;
+ }
+}
+
+static inline int cma_user_data_offset(enum rdma_port_space ps)
+{
+ switch (ps) {
+ case RDMA_PS_SDP:
+ return 0;
+ default:
+ return sizeof(struct cma_hdr);
+ }
+}
+
+static void cma_cancel_route(struct rdma_id_private *id_priv)
+{
+ switch (rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num)) {
+ case IB_LINK_LAYER_INFINIBAND:
+ if (id_priv->query)
+ ib_sa_cancel_query(id_priv->query_id, id_priv->query);
+ break;
+ default:
+ break;
+ }
+}
+
+static void cma_cancel_listens(struct rdma_id_private *id_priv)
+{
+ struct rdma_id_private *dev_id_priv;
+
+ /*
+ * Remove from listen_any_list to prevent added devices from spawning
+ * additional listen requests.
+ */
+ mutex_lock(&lock);
+ list_del(&id_priv->list);
+
+ while (!list_empty(&id_priv->listen_list)) {
+ dev_id_priv = list_entry(id_priv->listen_list.next,
+ struct rdma_id_private, listen_list);
+ /* sync with device removal to avoid duplicate destruction */
+ list_del_init(&dev_id_priv->list);
+ list_del(&dev_id_priv->listen_list);
+ mutex_unlock(&lock);
+
+ rdma_destroy_id(&dev_id_priv->id);
+ mutex_lock(&lock);
+ }
+ mutex_unlock(&lock);
+}
+
+static void cma_cancel_operation(struct rdma_id_private *id_priv,
+ enum cma_state state)
+{
+ switch (state) {
+ case CMA_ADDR_QUERY:
+ rdma_addr_cancel(&id_priv->id.route.addr.dev_addr);
+ break;
+ case CMA_ROUTE_QUERY:
+ cma_cancel_route(id_priv);
+ break;
+ case CMA_LISTEN:
+ if (cma_any_addr((struct sockaddr *) &id_priv->id.route.addr.src_addr)
+ && !id_priv->cma_dev)
+ cma_cancel_listens(id_priv);
+ break;
+ default:
+ break;
+ }
+}
+
+static void cma_release_port(struct rdma_id_private *id_priv)
+{
+ struct rdma_bind_list *bind_list = id_priv->bind_list;
+
+ if (!bind_list)
+ return;
+
+ mutex_lock(&lock);
+ hlist_del(&id_priv->node);
+ if (hlist_empty(&bind_list->owners)) {
+ idr_remove(bind_list->ps, bind_list->port);
+ kfree(bind_list);
+ }
+ mutex_unlock(&lock);
+ if (id_priv->sock)
+ sock_release(id_priv->sock);
+}
+
+static void cma_leave_mc_groups(struct rdma_id_private *id_priv)
+{
+ struct cma_multicast *mc;
+
+ while (!list_empty(&id_priv->mc_list)) {
+ mc = container_of(id_priv->mc_list.next,
+ struct cma_multicast, list);
+ list_del(&mc->list);
+ switch (rdma_port_get_link_layer(id_priv->cma_dev->device, id_priv->id.port_num)) {
+ case IB_LINK_LAYER_INFINIBAND:
+ ib_sa_free_multicast(mc->multicast.ib);
+ kfree(mc);
+ break;
+ case IB_LINK_LAYER_ETHERNET:
+ kref_put(&mc->mcref, release_mc);
+ break;
+ default:
+ break;
+ }
+ }
+}
+
+void rdma_destroy_id(struct rdma_cm_id *id)
+{
+ struct rdma_id_private *id_priv;
+ enum cma_state state;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ state = cma_exch(id_priv, CMA_DESTROYING);
+ cma_cancel_operation(id_priv, state);
+
+ mutex_lock(&lock);
+ if (id_priv->cma_dev) {
+ mutex_unlock(&lock);
+ switch (rdma_node_get_transport(id_priv->id.device->node_type)) {
+ case RDMA_TRANSPORT_IB:
+ if (id_priv->cm_id.ib && !IS_ERR(id_priv->cm_id.ib))
+ ib_destroy_cm_id(id_priv->cm_id.ib);
+ break;
+ case RDMA_TRANSPORT_IWARP:
+ if (id_priv->cm_id.iw && !IS_ERR(id_priv->cm_id.iw))
+ iw_destroy_cm_id(id_priv->cm_id.iw);
+ break;
+ default:
+ break;
+ }
+ cma_leave_mc_groups(id_priv);
+ mutex_lock(&lock);
+ cma_detach_from_dev(id_priv);
+ }
+ mutex_unlock(&lock);
+
+ cma_release_port(id_priv);
+ cma_deref_id(id_priv);
+ wait_for_completion(&id_priv->comp);
+
+ if (id_priv->internal_id)
+ cma_deref_id(id_priv->id.context);
+
+ kfree(id_priv->id.route.path_rec);
+ kfree(id_priv);
+}
+EXPORT_SYMBOL(rdma_destroy_id);
+
+static int cma_rep_recv(struct rdma_id_private *id_priv)
+{
+ int ret;
+
+ ret = cma_modify_qp_rtr(id_priv, NULL);
+ if (ret)
+ goto reject;
+
+ ret = cma_modify_qp_rts(id_priv, NULL);
+ if (ret)
+ goto reject;
+
+ ret = ib_send_cm_rtu(id_priv->cm_id.ib, NULL, 0);
+ if (ret)
+ goto reject;
+
+ return 0;
+reject:
+ cma_modify_qp_err(id_priv);
+ ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED,
+ NULL, 0, NULL, 0);
+ return ret;
+}
+
+static int cma_verify_rep(struct rdma_id_private *id_priv, void *data)
+{
+ if (id_priv->id.ps == RDMA_PS_SDP &&
+ sdp_get_majv(((struct sdp_hah *) data)->sdp_version) !=
+ SDP_MAJ_VERSION)
+ return -EINVAL;
+
+ return 0;
+}
+
+static void cma_set_rep_event_data(struct rdma_cm_event *event,
+ struct ib_cm_rep_event_param *rep_data,
+ void *private_data)
+{
+ event->param.conn.private_data = private_data;
+ event->param.conn.private_data_len = IB_CM_REP_PRIVATE_DATA_SIZE;
+ event->param.conn.responder_resources = rep_data->responder_resources;
+ event->param.conn.initiator_depth = rep_data->initiator_depth;
+ event->param.conn.flow_control = rep_data->flow_control;
+ event->param.conn.rnr_retry_count = rep_data->rnr_retry_count;
+ event->param.conn.srq = rep_data->srq;
+ event->param.conn.qp_num = rep_data->remote_qpn;
+}
+
+static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
+{
+ struct rdma_id_private *id_priv = cm_id->context;
+ struct rdma_cm_event event;
+ int ret = 0;
+
+ if ((ib_event->event != IB_CM_TIMEWAIT_EXIT &&
+ cma_disable_callback(id_priv, CMA_CONNECT)) ||
+ (ib_event->event == IB_CM_TIMEWAIT_EXIT &&
+ cma_disable_callback(id_priv, CMA_DISCONNECT)))
+ return 0;
+
+ memset(&event, 0, sizeof event);
+ switch (ib_event->event) {
+ case IB_CM_REQ_ERROR:
+ case IB_CM_REP_ERROR:
+ event.event = RDMA_CM_EVENT_UNREACHABLE;
+ event.status = -ETIMEDOUT;
+ break;
+ case IB_CM_REP_RECEIVED:
+ event.status = cma_verify_rep(id_priv, ib_event->private_data);
+ if (event.status)
+ event.event = RDMA_CM_EVENT_CONNECT_ERROR;
+ else if (id_priv->id.qp && id_priv->id.ps != RDMA_PS_SDP) {
+ event.status = cma_rep_recv(id_priv);
+ event.event = event.status ? RDMA_CM_EVENT_CONNECT_ERROR :
+ RDMA_CM_EVENT_ESTABLISHED;
+ } else
+ event.event = RDMA_CM_EVENT_CONNECT_RESPONSE;
+ cma_set_rep_event_data(&event, &ib_event->param.rep_rcvd,
+ ib_event->private_data);
+ break;
+ case IB_CM_RTU_RECEIVED:
+ case IB_CM_USER_ESTABLISHED:
+ event.event = RDMA_CM_EVENT_ESTABLISHED;
+ break;
+ case IB_CM_DREQ_ERROR:
+ event.status = -ETIMEDOUT; /* fall through */
+ case IB_CM_DREQ_RECEIVED:
+ case IB_CM_DREP_RECEIVED:
+ if (!cma_comp_exch(id_priv, CMA_CONNECT, CMA_DISCONNECT))
+ goto out;
+ event.event = RDMA_CM_EVENT_DISCONNECTED;
+ break;
+ case IB_CM_TIMEWAIT_EXIT:
+ event.event = RDMA_CM_EVENT_TIMEWAIT_EXIT;
+ break;
+ case IB_CM_MRA_RECEIVED:
+ /* ignore event */
+ goto out;
+ case IB_CM_REJ_RECEIVED:
+ cma_modify_qp_err(id_priv);
+ event.status = ib_event->param.rej_rcvd.reason;
+ event.event = RDMA_CM_EVENT_REJECTED;
+ event.param.conn.private_data = ib_event->private_data;
+ event.param.conn.private_data_len = IB_CM_REJ_PRIVATE_DATA_SIZE;
+ break;
+ default:
+ printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d\n",
+ ib_event->event);
+ goto out;
+ }
+
+ ret = id_priv->id.event_handler(&id_priv->id, &event);
+ if (ret) {
+ /* Destroy the CM ID by returning a non-zero value. */
+ id_priv->cm_id.ib = NULL;
+ cma_exch(id_priv, CMA_DESTROYING);
+ mutex_unlock(&id_priv->handler_mutex);
+ rdma_destroy_id(&id_priv->id);
+ return ret;
+ }
+out:
+ mutex_unlock(&id_priv->handler_mutex);
+ return ret;
+}
+
+static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
+ struct ib_cm_event *ib_event)
+{
+ struct rdma_id_private *id_priv;
+ struct rdma_cm_id *id;
+ struct rdma_route *rt;
+ union cma_ip_addr *src, *dst;
+ __be16 port;
+ u8 ip_ver;
+ int ret;
+
+ if (cma_get_net_info(ib_event->private_data, listen_id->ps,
+ &ip_ver, &port, &src, &dst))
+ goto err;
+
+ id = rdma_create_id(listen_id->event_handler, listen_id->context,
+ listen_id->ps);
+ if (IS_ERR(id))
+ goto err;
+
+ cma_save_net_info(&id->route.addr, &listen_id->route.addr,
+ ip_ver, port, src, dst);
+
+ rt = &id->route;
+ rt->num_paths = ib_event->param.req_rcvd.alternate_path ? 2 : 1;
+ rt->path_rec = kmalloc(sizeof *rt->path_rec * rt->num_paths,
+ GFP_KERNEL);
+ if (!rt->path_rec)
+ goto destroy_id;
+
+ rt->path_rec[0] = *ib_event->param.req_rcvd.primary_path;
+ if (rt->num_paths == 2)
+ rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path;
+
+ if (cma_any_addr((struct sockaddr *) &rt->addr.src_addr)) {
+ rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND;
+ rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid);
+ ib_addr_set_pkey(&rt->addr.dev_addr, rt->path_rec[0].pkey);
+ } else {
+ ret = rdma_translate_ip((struct sockaddr *) &rt->addr.src_addr,
+ &rt->addr.dev_addr);
+ if (ret)
+ goto destroy_id;
+ }
+ rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid);
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ id_priv->state = CMA_CONNECT;
+ return id_priv;
+
+destroy_id:
+ rdma_destroy_id(id);
+err:
+ return NULL;
+}
+
+static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id,
+ struct ib_cm_event *ib_event)
+{
+ struct rdma_id_private *id_priv;
+ struct rdma_cm_id *id;
+ union cma_ip_addr *src, *dst;
+ __be16 port;
+ u8 ip_ver;
+ int ret;
+
+ id = rdma_create_id(listen_id->event_handler, listen_id->context,
+ listen_id->ps);
+ if (IS_ERR(id))
+ return NULL;
+
+
+ if (cma_get_net_info(ib_event->private_data, listen_id->ps,
+ &ip_ver, &port, &src, &dst))
+ goto err;
+
+ cma_save_net_info(&id->route.addr, &listen_id->route.addr,
+ ip_ver, port, src, dst);
+
+ if (!cma_any_addr((struct sockaddr *) &id->route.addr.src_addr)) {
+ ret = rdma_translate_ip((struct sockaddr *) &id->route.addr.src_addr,
+ &id->route.addr.dev_addr);
+ if (ret)
+ goto err;
+ }
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ id_priv->state = CMA_CONNECT;
+ return id_priv;
+err:
+ rdma_destroy_id(id);
+ return NULL;
+}
+
+static void cma_set_req_event_data(struct rdma_cm_event *event,
+ struct ib_cm_req_event_param *req_data,
+ void *private_data, int offset)
+{
+ event->param.conn.private_data = private_data + offset;
+ event->param.conn.private_data_len = IB_CM_REQ_PRIVATE_DATA_SIZE - offset;
+ event->param.conn.responder_resources = req_data->responder_resources;
+ event->param.conn.initiator_depth = req_data->initiator_depth;
+ event->param.conn.flow_control = req_data->flow_control;
+ event->param.conn.retry_count = req_data->retry_count;
+ event->param.conn.rnr_retry_count = req_data->rnr_retry_count;
+ event->param.conn.srq = req_data->srq;
+ event->param.conn.qp_num = req_data->remote_qpn;
+}
+
+static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
+{
+ struct rdma_id_private *listen_id, *conn_id;
+ struct rdma_cm_event event;
+ int offset, ret;
+
+ listen_id = cm_id->context;
+ if (cma_disable_callback(listen_id, CMA_LISTEN))
+ return -ECONNABORTED;
+
+ memset(&event, 0, sizeof event);
+ offset = cma_user_data_offset(listen_id->id.ps);
+ event.event = RDMA_CM_EVENT_CONNECT_REQUEST;
+ if (cma_is_ud_ps(listen_id->id.ps)) {
+ conn_id = cma_new_udp_id(&listen_id->id, ib_event);
+ event.param.ud.private_data = ib_event->private_data + offset;
+ event.param.ud.private_data_len =
+ IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset;
+ } else {
+ conn_id = cma_new_conn_id(&listen_id->id, ib_event);
+ cma_set_req_event_data(&event, &ib_event->param.req_rcvd,
+ ib_event->private_data, offset);
+ }
+ if (!conn_id) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING);
+ mutex_lock(&lock);
+ ret = cma_acquire_dev(conn_id);
+ mutex_unlock(&lock);
+ if (ret)
+ goto release_conn_id;
+
+ conn_id->cm_id.ib = cm_id;
+ cm_id->context = conn_id;
+ cm_id->cm_handler = cma_ib_handler;
+
+ ret = conn_id->id.event_handler(&conn_id->id, &event);
+ if (!ret) {
+ /*
+ * Acquire mutex to prevent user executing rdma_destroy_id()
+ * while we're accessing the cm_id.
+ */
+ mutex_lock(&lock);
+ if (cma_comp(conn_id, CMA_CONNECT) &&
+ !cma_is_ud_ps(conn_id->id.ps))
+ ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0);
+ mutex_unlock(&lock);
+ mutex_unlock(&conn_id->handler_mutex);
+ goto out;
+ }
+
+ /* Destroy the CM ID by returning a non-zero value. */
+ conn_id->cm_id.ib = NULL;
+
+release_conn_id:
+ cma_exch(conn_id, CMA_DESTROYING);
+ mutex_unlock(&conn_id->handler_mutex);
+ rdma_destroy_id(&conn_id->id);
+
+out:
+ mutex_unlock(&listen_id->handler_mutex);
+ return ret;
+}
+
+static __be64 cma_get_service_id(enum rdma_port_space ps, struct sockaddr *addr)
+{
+ return cpu_to_be64(((u64)ps << 16) + be16_to_cpu(cma_port(addr)));
+}
+
+static void cma_set_compare_data(enum rdma_port_space ps, struct sockaddr *addr,
+ struct ib_cm_compare_data *compare)
+{
+ struct cma_hdr *cma_data, *cma_mask;
+ struct sdp_hh *sdp_data, *sdp_mask;
+ __be32 ip4_addr;
+ struct in6_addr ip6_addr;
+
+ memset(compare, 0, sizeof *compare);
+ cma_data = (void *) compare->data;
+ cma_mask = (void *) compare->mask;
+ sdp_data = (void *) compare->data;
+ sdp_mask = (void *) compare->mask;
+
+ switch (addr->sa_family) {
+ case AF_INET:
+ ip4_addr = ((struct sockaddr_in *) addr)->sin_addr.s_addr;
+ if (ps == RDMA_PS_SDP) {
+ sdp_set_ip_ver(sdp_data, 4);
+ sdp_set_ip_ver(sdp_mask, 0xF);
+ sdp_data->dst_addr.ip4.addr = ip4_addr;
+ sdp_mask->dst_addr.ip4.addr = htonl(~0);
+ } else {
+ cma_set_ip_ver(cma_data, 4);
+ cma_set_ip_ver(cma_mask, 0xF);
+ cma_data->dst_addr.ip4.addr = ip4_addr;
+ cma_mask->dst_addr.ip4.addr = htonl(~0);
+ }
+ break;
+#ifdef INET6
+ case AF_INET6:
+ ip6_addr = ((struct sockaddr_in6 *) addr)->sin6_addr;
+ if (ps == RDMA_PS_SDP) {
+ sdp_set_ip_ver(sdp_data, 6);
+ sdp_set_ip_ver(sdp_mask, 0xF);
+ sdp_data->dst_addr.ip6 = ip6_addr;
+ memset(&sdp_mask->dst_addr.ip6, 0xFF,
+ sizeof sdp_mask->dst_addr.ip6);
+ } else {
+ cma_set_ip_ver(cma_data, 6);
+ cma_set_ip_ver(cma_mask, 0xF);
+ cma_data->dst_addr.ip6 = ip6_addr;
+ memset(&cma_mask->dst_addr.ip6, 0xFF,
+ sizeof cma_mask->dst_addr.ip6);
+ }
+ break;
+#endif
+ default:
+ break;
+ }
+}
+
+static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event)
+{
+ struct rdma_id_private *id_priv = iw_id->context;
+ struct rdma_cm_event event;
+ struct sockaddr_in *sin;
+ int ret = 0;
+
+ if (cma_disable_callback(id_priv, CMA_CONNECT))
+ return 0;
+
+ memset(&event, 0, sizeof event);
+ switch (iw_event->event) {
+ case IW_CM_EVENT_CLOSE:
+ event.event = RDMA_CM_EVENT_DISCONNECTED;
+ break;
+ case IW_CM_EVENT_CONNECT_REPLY:
+ sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr;
+ *sin = iw_event->local_addr;
+ sin = (struct sockaddr_in *) &id_priv->id.route.addr.dst_addr;
+ *sin = iw_event->remote_addr;
+ switch (iw_event->status) {
+ case 0:
+ event.event = RDMA_CM_EVENT_ESTABLISHED;
+ break;
+ case -ECONNRESET:
+ case -ECONNREFUSED:
+ event.event = RDMA_CM_EVENT_REJECTED;
+ break;
+ case -ETIMEDOUT:
+ event.event = RDMA_CM_EVENT_UNREACHABLE;
+ break;
+ default:
+ event.event = RDMA_CM_EVENT_CONNECT_ERROR;
+ break;
+ }
+ break;
+ case IW_CM_EVENT_ESTABLISHED:
+ event.event = RDMA_CM_EVENT_ESTABLISHED;
+ break;
+ default:
+ BUG_ON(1);
+ }
+
+ event.status = iw_event->status;
+ event.param.conn.private_data = iw_event->private_data;
+ event.param.conn.private_data_len = iw_event->private_data_len;
+ ret = id_priv->id.event_handler(&id_priv->id, &event);
+ if (ret) {
+ /* Destroy the CM ID by returning a non-zero value. */
+ id_priv->cm_id.iw = NULL;
+ cma_exch(id_priv, CMA_DESTROYING);
+ mutex_unlock(&id_priv->handler_mutex);
+ rdma_destroy_id(&id_priv->id);
+ return ret;
+ }
+
+ mutex_unlock(&id_priv->handler_mutex);
+ return ret;
+}
+
+static int iw_conn_req_handler(struct iw_cm_id *cm_id,
+ struct iw_cm_event *iw_event)
+{
+ struct rdma_cm_id *new_cm_id;
+ struct rdma_id_private *listen_id, *conn_id;
+ struct sockaddr_in *sin;
+ struct net_device *dev = NULL;
+ struct rdma_cm_event event;
+ int ret;
+ struct ib_device_attr attr;
+
+ listen_id = cm_id->context;
+ if (cma_disable_callback(listen_id, CMA_LISTEN))
+ return -ECONNABORTED;
+
+ /* Create a new RDMA id for the new IW CM ID */
+ new_cm_id = rdma_create_id(listen_id->id.event_handler,
+ listen_id->id.context,
+ RDMA_PS_TCP);
+ if (IS_ERR(new_cm_id)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ conn_id = container_of(new_cm_id, struct rdma_id_private, id);
+ mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING);
+ conn_id->state = CMA_CONNECT;
+
+ dev = ip_dev_find(NULL, iw_event->local_addr.sin_addr.s_addr);
+ if (!dev) {
+ ret = -EADDRNOTAVAIL;
+ mutex_unlock(&conn_id->handler_mutex);
+ rdma_destroy_id(new_cm_id);
+ goto out;
+ }
+ ret = rdma_copy_addr(&conn_id->id.route.addr.dev_addr, dev, NULL);
+ if (ret) {
+ mutex_unlock(&conn_id->handler_mutex);
+ rdma_destroy_id(new_cm_id);
+ goto out;
+ }
+
+ mutex_lock(&lock);
+ ret = cma_acquire_dev(conn_id);
+ mutex_unlock(&lock);
+ if (ret) {
+ mutex_unlock(&conn_id->handler_mutex);
+ rdma_destroy_id(new_cm_id);
+ goto out;
+ }
+
+ conn_id->cm_id.iw = cm_id;
+ cm_id->context = conn_id;
+ cm_id->cm_handler = cma_iw_handler;
+
+ sin = (struct sockaddr_in *) &new_cm_id->route.addr.src_addr;
+ *sin = iw_event->local_addr;
+ sin = (struct sockaddr_in *) &new_cm_id->route.addr.dst_addr;
+ *sin = iw_event->remote_addr;
+
+ ret = ib_query_device(conn_id->id.device, &attr);
+ if (ret) {
+ mutex_unlock(&conn_id->handler_mutex);
+ rdma_destroy_id(new_cm_id);
+ goto out;
+ }
+
+ memset(&event, 0, sizeof event);
+ event.event = RDMA_CM_EVENT_CONNECT_REQUEST;
+ event.param.conn.private_data = iw_event->private_data;
+ event.param.conn.private_data_len = iw_event->private_data_len;
+ event.param.conn.initiator_depth = attr.max_qp_init_rd_atom;
+ event.param.conn.responder_resources = attr.max_qp_rd_atom;
+ ret = conn_id->id.event_handler(&conn_id->id, &event);
+ if (ret) {
+ /* User wants to destroy the CM ID */
+ conn_id->cm_id.iw = NULL;
+ cma_exch(conn_id, CMA_DESTROYING);
+ mutex_unlock(&conn_id->handler_mutex);
+ rdma_destroy_id(&conn_id->id);
+ goto out;
+ }
+
+ mutex_unlock(&conn_id->handler_mutex);
+
+out:
+ if (dev)
+ dev_put(dev);
+ mutex_unlock(&listen_id->handler_mutex);
+ return ret;
+}
+
+static int cma_ib_listen(struct rdma_id_private *id_priv)
+{
+ struct ib_cm_compare_data compare_data;
+ struct sockaddr *addr;
+ __be64 svc_id;
+ int ret;
+
+ id_priv->cm_id.ib = ib_create_cm_id(id_priv->id.device, cma_req_handler,
+ id_priv);
+ if (IS_ERR(id_priv->cm_id.ib))
+ return PTR_ERR(id_priv->cm_id.ib);
+
+ addr = (struct sockaddr *) &id_priv->id.route.addr.src_addr;
+ svc_id = cma_get_service_id(id_priv->id.ps, addr);
+ if (cma_any_addr(addr))
+ ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, NULL);
+ else {
+ cma_set_compare_data(id_priv->id.ps, addr, &compare_data);
+ ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, &compare_data);
+ }
+
+ if (ret) {
+ ib_destroy_cm_id(id_priv->cm_id.ib);
+ id_priv->cm_id.ib = NULL;
+ }
+
+ return ret;
+}
+
+static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog)
+{
+ int ret;
+ struct sockaddr_in *sin;
+
+ id_priv->cm_id.iw = iw_create_cm_id(id_priv->id.device,
+ iw_conn_req_handler,
+ id_priv);
+ if (IS_ERR(id_priv->cm_id.iw))
+ return PTR_ERR(id_priv->cm_id.iw);
+
+ sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr;
+ id_priv->cm_id.iw->local_addr = *sin;
+
+ ret = iw_cm_listen(id_priv->cm_id.iw, backlog);
+
+ if (ret) {
+ iw_destroy_cm_id(id_priv->cm_id.iw);
+ id_priv->cm_id.iw = NULL;
+ }
+
+ return ret;
+}
+
+static int cma_listen_handler(struct rdma_cm_id *id,
+ struct rdma_cm_event *event)
+{
+ struct rdma_id_private *id_priv = id->context;
+
+ id->context = id_priv->id.context;
+ id->event_handler = id_priv->id.event_handler;
+ return id_priv->id.event_handler(id, event);
+}
+
+static void cma_listen_on_dev(struct rdma_id_private *id_priv,
+ struct cma_device *cma_dev)
+{
+ struct rdma_id_private *dev_id_priv;
+ struct rdma_cm_id *id;
+ int ret;
+
+ id = rdma_create_id(cma_listen_handler, id_priv, id_priv->id.ps);
+ if (IS_ERR(id))
+ return;
+
+ dev_id_priv = container_of(id, struct rdma_id_private, id);
+
+ dev_id_priv->state = CMA_ADDR_BOUND;
+ memcpy(&id->route.addr.src_addr, &id_priv->id.route.addr.src_addr,
+ ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr));
+
+ cma_attach_to_dev(dev_id_priv, cma_dev);
+ list_add_tail(&dev_id_priv->listen_list, &id_priv->listen_list);
+ atomic_inc(&id_priv->refcount);
+ dev_id_priv->internal_id = 1;
+
+ ret = rdma_listen(id, id_priv->backlog);
+ if (ret)
+ printk(KERN_WARNING "RDMA CMA: cma_listen_on_dev, error %d, "
+ "listening on device %s\n", ret, cma_dev->device->name);
+}
+
+static void cma_listen_on_all(struct rdma_id_private *id_priv)
+{
+ struct cma_device *cma_dev;
+
+ mutex_lock(&lock);
+ list_add_tail(&id_priv->list, &listen_any_list);
+ list_for_each_entry(cma_dev, &dev_list, list)
+ cma_listen_on_dev(id_priv, cma_dev);
+ mutex_unlock(&lock);
+}
+
+int rdma_listen(struct rdma_cm_id *id, int backlog)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (id_priv->state == CMA_IDLE) {
+ ((struct sockaddr *) &id->route.addr.src_addr)->sa_family = AF_INET;
+ ret = rdma_bind_addr(id, (struct sockaddr *) &id->route.addr.src_addr);
+ if (ret)
+ return ret;
+ }
+
+ if (!cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_LISTEN))
+ return -EINVAL;
+
+ id_priv->backlog = backlog;
+ if (id->device) {
+ switch (rdma_node_get_transport(id->device->node_type)) {
+ case RDMA_TRANSPORT_IB:
+ ret = cma_ib_listen(id_priv);
+ if (ret)
+ goto err;
+ break;
+ case RDMA_TRANSPORT_IWARP:
+ ret = cma_iw_listen(id_priv, backlog);
+ if (ret)
+ goto err;
+ break;
+ default:
+ ret = -ENOSYS;
+ goto err;
+ }
+ } else
+ cma_listen_on_all(id_priv);
+
+ return 0;
+err:
+ id_priv->backlog = 0;
+ cma_comp_exch(id_priv, CMA_LISTEN, CMA_ADDR_BOUND);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_listen);
+
+void rdma_set_service_type(struct rdma_cm_id *id, int tos)
+{
+ struct rdma_id_private *id_priv;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ id_priv->tos = (u8) tos;
+}
+EXPORT_SYMBOL(rdma_set_service_type);
+
+static void cma_query_handler(int status, struct ib_sa_path_rec *path_rec,
+ void *context)
+{
+ struct cma_work *work = context;
+ struct rdma_route *route;
+
+ route = &work->id->id.route;
+
+ if (!status) {
+ route->num_paths = 1;
+ *route->path_rec = *path_rec;
+ } else {
+ work->old_state = CMA_ROUTE_QUERY;
+ work->new_state = CMA_ADDR_RESOLVED;
+ work->event.event = RDMA_CM_EVENT_ROUTE_ERROR;
+ work->event.status = status;
+ }
+
+ queue_work(cma_wq, &work->work);
+}
+
+static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms,
+ struct cma_work *work)
+{
+ struct rdma_addr *addr = &id_priv->id.route.addr;
+ struct ib_sa_path_rec path_rec;
+ ib_sa_comp_mask comp_mask;
+ struct sockaddr_in6 *sin6;
+
+ memset(&path_rec, 0, sizeof path_rec);
+ rdma_addr_get_sgid(&addr->dev_addr, &path_rec.sgid);
+ rdma_addr_get_dgid(&addr->dev_addr, &path_rec.dgid);
+ path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(&addr->dev_addr));
+ path_rec.numb_path = 1;
+ path_rec.reversible = 1;
+ path_rec.service_id = cma_get_service_id(id_priv->id.ps,
+ (struct sockaddr *) &addr->dst_addr);
+
+ comp_mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID |
+ IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH |
+ IB_SA_PATH_REC_REVERSIBLE | IB_SA_PATH_REC_SERVICE_ID;
+
+ if (addr->src_addr.ss_family == AF_INET) {
+ path_rec.qos_class = cpu_to_be16((u16) id_priv->tos);
+ comp_mask |= IB_SA_PATH_REC_QOS_CLASS;
+ } else {
+ sin6 = (struct sockaddr_in6 *) &addr->src_addr;
+ path_rec.traffic_class = (u8) (be32_to_cpu(sin6->sin6_flowinfo) >> 20);
+ comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS;
+ }
+
+ if (tavor_quirk) {
+ path_rec.mtu_selector = IB_SA_LT;
+ path_rec.mtu = IB_MTU_2048;
+ }
+
+ id_priv->query_id = ib_sa_path_rec_get(&sa_client, id_priv->id.device,
+ id_priv->id.port_num, &path_rec,
+ comp_mask, timeout_ms,
+ GFP_KERNEL, cma_query_handler,
+ work, &id_priv->query);
+
+ return (id_priv->query_id < 0) ? id_priv->query_id : 0;
+}
+
+static void cma_work_handler(struct work_struct *_work)
+{
+ struct cma_work *work = container_of(_work, struct cma_work, work);
+ struct rdma_id_private *id_priv = work->id;
+ int destroy = 0;
+
+ mutex_lock(&id_priv->handler_mutex);
+ if (!cma_comp_exch(id_priv, work->old_state, work->new_state))
+ goto out;
+
+ if (id_priv->id.event_handler(&id_priv->id, &work->event)) {
+ cma_exch(id_priv, CMA_DESTROYING);
+ destroy = 1;
+ }
+out:
+ mutex_unlock(&id_priv->handler_mutex);
+ cma_deref_id(id_priv);
+ if (destroy)
+ rdma_destroy_id(&id_priv->id);
+ kfree(work);
+}
+
+static void cma_ndev_work_handler(struct work_struct *_work)
+{
+ struct cma_ndev_work *work = container_of(_work, struct cma_ndev_work, work);
+ struct rdma_id_private *id_priv = work->id;
+ int destroy = 0;
+
+ mutex_lock(&id_priv->handler_mutex);
+ if (id_priv->state == CMA_DESTROYING ||
+ id_priv->state == CMA_DEVICE_REMOVAL)
+ goto out;
+
+ if (id_priv->id.event_handler(&id_priv->id, &work->event)) {
+ cma_exch(id_priv, CMA_DESTROYING);
+ destroy = 1;
+ }
+
+out:
+ mutex_unlock(&id_priv->handler_mutex);
+ cma_deref_id(id_priv);
+ if (destroy)
+ rdma_destroy_id(&id_priv->id);
+ kfree(work);
+}
+
+static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms)
+{
+ struct rdma_route *route = &id_priv->id.route;
+ struct cma_work *work;
+ int ret;
+
+ work = kzalloc(sizeof *work, GFP_KERNEL);
+ if (!work)
+ return -ENOMEM;
+
+ work->id = id_priv;
+ INIT_WORK(&work->work, cma_work_handler);
+ work->old_state = CMA_ROUTE_QUERY;
+ work->new_state = CMA_ROUTE_RESOLVED;
+ work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
+
+ route->path_rec = kmalloc(sizeof *route->path_rec, GFP_KERNEL);
+ if (!route->path_rec) {
+ ret = -ENOMEM;
+ goto err1;
+ }
+
+ ret = cma_query_ib_route(id_priv, timeout_ms, work);
+ if (ret)
+ goto err2;
+
+ return 0;
+err2:
+ kfree(route->path_rec);
+ route->path_rec = NULL;
+err1:
+ kfree(work);
+ return ret;
+}
+
+int rdma_set_ib_paths(struct rdma_cm_id *id,
+ struct ib_sa_path_rec *path_rec, int num_paths)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!cma_comp_exch(id_priv, CMA_ADDR_RESOLVED, CMA_ROUTE_RESOLVED))
+ return -EINVAL;
+
+ id->route.path_rec = kmalloc(sizeof *path_rec * num_paths, GFP_KERNEL);
+ if (!id->route.path_rec) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ memcpy(id->route.path_rec, path_rec, sizeof *path_rec * num_paths);
+ return 0;
+err:
+ cma_comp_exch(id_priv, CMA_ROUTE_RESOLVED, CMA_ADDR_RESOLVED);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_set_ib_paths);
+
+static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms)
+{
+ struct cma_work *work;
+
+ work = kzalloc(sizeof *work, GFP_KERNEL);
+ if (!work)
+ return -ENOMEM;
+
+ work->id = id_priv;
+ INIT_WORK(&work->work, cma_work_handler);
+ work->old_state = CMA_ROUTE_QUERY;
+ work->new_state = CMA_ROUTE_RESOLVED;
+ work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
+ queue_work(cma_wq, &work->work);
+ return 0;
+}
+
+static u8 tos_to_sl(u8 tos)
+{
+ return def_prec2sl & 7;
+}
+
+static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
+{
+ struct rdma_route *route = &id_priv->id.route;
+ struct rdma_addr *addr = &route->addr;
+ struct cma_work *work;
+ int ret;
+ struct sockaddr_in *src_addr = (struct sockaddr_in *)&route->addr.src_addr;
+ struct sockaddr_in *dst_addr = (struct sockaddr_in *)&route->addr.dst_addr;
+ struct net_device *ndev = NULL;
+ u16 vid;
+
+ if (src_addr->sin_family != dst_addr->sin_family)
+ return -EINVAL;
+
+ work = kzalloc(sizeof *work, GFP_KERNEL);
+ if (!work)
+ return -ENOMEM;
+
+ work->id = id_priv;
+ INIT_WORK(&work->work, cma_work_handler);
+
+ route->path_rec = kzalloc(sizeof *route->path_rec, GFP_KERNEL);
+ if (!route->path_rec) {
+ ret = -ENOMEM;
+ goto err1;
+ }
+
+ route->num_paths = 1;
+
+ if (addr->dev_addr.bound_dev_if)
+ ndev = dev_get_by_index(&init_net, addr->dev_addr.bound_dev_if);
+ if (!ndev) {
+ ret = -ENODEV;
+ goto err2;
+ }
+
+ vid = rdma_vlan_dev_vlan_id(ndev);
+
+ iboe_mac_vlan_to_ll(&route->path_rec->sgid, addr->dev_addr.src_dev_addr, vid);
+ iboe_mac_vlan_to_ll(&route->path_rec->dgid, addr->dev_addr.dst_dev_addr, vid);
+
+ route->path_rec->hop_limit = 1;
+ route->path_rec->reversible = 1;
+ route->path_rec->pkey = cpu_to_be16(0xffff);
+ route->path_rec->mtu_selector = IB_SA_EQ;
+ route->path_rec->sl = tos_to_sl(id_priv->tos);
+
+#ifdef __linux__
+ route->path_rec->mtu = iboe_get_mtu(ndev->mtu);
+#else
+ route->path_rec->mtu = iboe_get_mtu(ndev->if_mtu);
+#endif
+ route->path_rec->rate_selector = IB_SA_EQ;
+ route->path_rec->rate = iboe_get_rate(ndev);
+ dev_put(ndev);
+ route->path_rec->packet_life_time_selector = IB_SA_EQ;
+ route->path_rec->packet_life_time = IBOE_PACKET_LIFETIME;
+ if (!route->path_rec->mtu) {
+ ret = -EINVAL;
+ goto err2;
+ }
+
+ work->old_state = CMA_ROUTE_QUERY;
+ work->new_state = CMA_ROUTE_RESOLVED;
+ work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
+ work->event.status = 0;
+
+ queue_work(cma_wq, &work->work);
+
+ return 0;
+
+err2:
+ kfree(route->path_rec);
+ route->path_rec = NULL;
+err1:
+ kfree(work);
+ return ret;
+}
+
+int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!cma_comp_exch(id_priv, CMA_ADDR_RESOLVED, CMA_ROUTE_QUERY))
+ return -EINVAL;
+
+ atomic_inc(&id_priv->refcount);
+ switch (rdma_node_get_transport(id->device->node_type)) {
+ case RDMA_TRANSPORT_IB:
+ switch (rdma_port_get_link_layer(id->device, id->port_num)) {
+ case IB_LINK_LAYER_INFINIBAND:
+ ret = cma_resolve_ib_route(id_priv, timeout_ms);
+ break;
+ case IB_LINK_LAYER_ETHERNET:
+ ret = cma_resolve_iboe_route(id_priv);
+ break;
+ default:
+ ret = -ENOSYS;
+ }
+ break;
+ case RDMA_TRANSPORT_IWARP:
+ ret = cma_resolve_iw_route(id_priv, timeout_ms);
+ break;
+ default:
+ ret = -ENOSYS;
+ break;
+ }
+ if (ret)
+ goto err;
+
+ return 0;
+err:
+ cma_comp_exch(id_priv, CMA_ROUTE_QUERY, CMA_ADDR_RESOLVED);
+ cma_deref_id(id_priv);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_resolve_route);
+
+static int cma_bind_loopback(struct rdma_id_private *id_priv)
+{
+ struct cma_device *cma_dev;
+ struct ib_port_attr port_attr;
+ union ib_gid gid;
+ u16 pkey;
+ int ret;
+ u8 p;
+
+ mutex_lock(&lock);
+ if (list_empty(&dev_list)) {
+ ret = -ENODEV;
+ goto out;
+ }
+ list_for_each_entry(cma_dev, &dev_list, list)
+ for (p = 1; p <= cma_dev->device->phys_port_cnt; ++p)
+ if (!ib_query_port(cma_dev->device, p, &port_attr) &&
+ port_attr.state == IB_PORT_ACTIVE)
+ goto port_found;
+
+ p = 1;
+ cma_dev = list_entry(dev_list.next, struct cma_device, list);
+
+port_found:
+ ret = ib_get_cached_gid(cma_dev->device, p, 0, &gid);
+ if (ret)
+ goto out;
+
+ ret = ib_get_cached_pkey(cma_dev->device, p, 0, &pkey);
+ if (ret)
+ goto out;
+
+ id_priv->id.route.addr.dev_addr.dev_type =
+ (rdma_port_get_link_layer(cma_dev->device, p) == IB_LINK_LAYER_INFINIBAND) ?
+ ARPHRD_INFINIBAND : ARPHRD_ETHER;
+
+ rdma_addr_set_sgid(&id_priv->id.route.addr.dev_addr, &gid);
+ ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, pkey);
+ id_priv->id.port_num = p;
+ cma_attach_to_dev(id_priv, cma_dev);
+out:
+ mutex_unlock(&lock);
+ return ret;
+}
+
+static void addr_handler(int status, struct sockaddr *src_addr,
+ struct rdma_dev_addr *dev_addr, void *context)
+{
+ struct rdma_id_private *id_priv = context;
+ struct rdma_cm_event event;
+
+ memset(&event, 0, sizeof event);
+ mutex_lock(&id_priv->handler_mutex);
+
+ /*
+ * Grab mutex to block rdma_destroy_id() from removing the device while
+ * we're trying to acquire it.
+ */
+ mutex_lock(&lock);
+ if (!cma_comp_exch(id_priv, CMA_ADDR_QUERY, CMA_ADDR_RESOLVED)) {
+ mutex_unlock(&lock);
+ goto out;
+ }
+
+ if (!status && !id_priv->cma_dev)
+ status = cma_acquire_dev(id_priv);
+ mutex_unlock(&lock);
+
+ if (status) {
+ if (!cma_comp_exch(id_priv, CMA_ADDR_RESOLVED, CMA_ADDR_BOUND))
+ goto out;
+ event.event = RDMA_CM_EVENT_ADDR_ERROR;
+ event.status = status;
+ } else {
+ memcpy(&id_priv->id.route.addr.src_addr, src_addr,
+ ip_addr_size(src_addr));
+ event.event = RDMA_CM_EVENT_ADDR_RESOLVED;
+ }
+
+ if (id_priv->id.event_handler(&id_priv->id, &event)) {
+ cma_exch(id_priv, CMA_DESTROYING);
+ mutex_unlock(&id_priv->handler_mutex);
+ cma_deref_id(id_priv);
+ rdma_destroy_id(&id_priv->id);
+ return;
+ }
+out:
+ mutex_unlock(&id_priv->handler_mutex);
+ cma_deref_id(id_priv);
+}
+
+static int cma_resolve_loopback(struct rdma_id_private *id_priv)
+{
+ struct cma_work *work;
+ struct sockaddr *src, *dst;
+ union ib_gid gid;
+ int ret;
+
+ work = kzalloc(sizeof *work, GFP_KERNEL);
+ if (!work)
+ return -ENOMEM;
+
+ if (!id_priv->cma_dev) {
+ ret = cma_bind_loopback(id_priv);
+ if (ret)
+ goto err;
+ }
+
+ rdma_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid);
+ rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid);
+
+ src = (struct sockaddr *) &id_priv->id.route.addr.src_addr;
+ if (cma_zero_addr(src)) {
+ dst = (struct sockaddr *) &id_priv->id.route.addr.dst_addr;
+ if ((src->sa_family = dst->sa_family) == AF_INET) {
+ ((struct sockaddr_in *) src)->sin_addr.s_addr =
+ ((struct sockaddr_in *) dst)->sin_addr.s_addr;
+ } else {
+ ipv6_addr_copy(&((struct sockaddr_in6 *) src)->sin6_addr,
+ &((struct sockaddr_in6 *) dst)->sin6_addr);
+ }
+ }
+
+ work->id = id_priv;
+ INIT_WORK(&work->work, cma_work_handler);
+ work->old_state = CMA_ADDR_QUERY;
+ work->new_state = CMA_ADDR_RESOLVED;
+ work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED;
+ queue_work(cma_wq, &work->work);
+ return 0;
+err:
+ kfree(work);
+ return ret;
+}
+
+static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
+ struct sockaddr *dst_addr)
+{
+ if (!src_addr || !src_addr->sa_family) {
+ src_addr = (struct sockaddr *) &id->route.addr.src_addr;
+ if ((src_addr->sa_family = dst_addr->sa_family) == AF_INET6) {
+ ((struct sockaddr_in6 *) src_addr)->sin6_scope_id =
+ ((struct sockaddr_in6 *) dst_addr)->sin6_scope_id;
+ }
+ }
+ return rdma_bind_addr(id, src_addr);
+}
+
+int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
+ struct sockaddr *dst_addr, int timeout_ms)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (id_priv->state == CMA_IDLE) {
+ ret = cma_bind_addr(id, src_addr, dst_addr);
+ if (ret)
+ return ret;
+ }
+
+ if (!cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_ADDR_QUERY))
+ return -EINVAL;
+
+ atomic_inc(&id_priv->refcount);
+ memcpy(&id->route.addr.dst_addr, dst_addr, ip_addr_size(dst_addr));
+ if (cma_any_addr(dst_addr))
+ ret = cma_resolve_loopback(id_priv);
+ else
+ ret = rdma_resolve_ip(&addr_client, (struct sockaddr *) &id->route.addr.src_addr,
+ dst_addr, &id->route.addr.dev_addr,
+ timeout_ms, addr_handler, id_priv);
+ if (ret)
+ goto err;
+
+ return 0;
+err:
+ cma_comp_exch(id_priv, CMA_ADDR_QUERY, CMA_ADDR_BOUND);
+ cma_deref_id(id_priv);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_resolve_addr);
+
+static void cma_bind_port(struct rdma_bind_list *bind_list,
+ struct rdma_id_private *id_priv)
+{
+ struct sockaddr_in *sin;
+
+ sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr;
+ sin->sin_port = htons(bind_list->port);
+ id_priv->bind_list = bind_list;
+ hlist_add_head(&id_priv->node, &bind_list->owners);
+}
+
+static int cma_alloc_port(struct idr *ps, struct rdma_id_private *id_priv,
+ unsigned short snum)
+{
+ struct rdma_bind_list *bind_list;
+ int port, ret;
+
+ bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL);
+ if (!bind_list)
+ return -ENOMEM;
+
+ do {
+ ret = idr_get_new_above(ps, bind_list, snum, &port);
+ } while ((ret == -EAGAIN) && idr_pre_get(ps, GFP_KERNEL));
+
+ if (ret)
+ goto err1;
+
+ if (port != snum) {
+ ret = -EADDRNOTAVAIL;
+ goto err2;
+ }
+
+ bind_list->ps = ps;
+ bind_list->port = (unsigned short) port;
+ cma_bind_port(bind_list, id_priv);
+ return 0;
+err2:
+ idr_remove(ps, port);
+err1:
+ kfree(bind_list);
+ return ret;
+}
+
+static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv)
+{
+ struct rdma_bind_list *bind_list;
+ int port, ret, low, high;
+
+ bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL);
+ if (!bind_list)
+ return -ENOMEM;
+
+retry:
+ /* FIXME: add proper port randomization per like inet_csk_get_port */
+ do {
+ ret = idr_get_new_above(ps, bind_list, next_port, &port);
+ } while ((ret == -EAGAIN) && idr_pre_get(ps, GFP_KERNEL));
+
+ if (ret)
+ goto err1;
+
+ inet_get_local_port_range(&low, &high);
+ if (port > high) {
+ if (next_port != low) {
+ idr_remove(ps, port);
+ next_port = low;
+ goto retry;
+ }
+ ret = -EADDRNOTAVAIL;
+ goto err2;
+ }
+
+ if (port == high)
+ next_port = low;
+ else
+ next_port = port + 1;
+
+ bind_list->ps = ps;
+ bind_list->port = (unsigned short) port;
+ cma_bind_port(bind_list, id_priv);
+ return 0;
+err2:
+ idr_remove(ps, port);
+err1:
+ kfree(bind_list);
+ return ret;
+}
+
+static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv)
+{
+ struct rdma_id_private *cur_id;
+ struct sockaddr_in *sin, *cur_sin;
+ struct rdma_bind_list *bind_list;
+ struct hlist_node *node;
+ unsigned short snum;
+
+ sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr;
+ snum = ntohs(sin->sin_port);
+#ifdef __linux__
+ if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
+ return -EACCES;
+#endif
+
+ bind_list = idr_find(ps, snum);
+ if (!bind_list)
+ return cma_alloc_port(ps, id_priv, snum);
+
+ /*
+ * We don't support binding to any address if anyone is bound to
+ * a specific address on the same port.
+ */
+ if (cma_any_addr((struct sockaddr *) &id_priv->id.route.addr.src_addr))
+ return -EADDRNOTAVAIL;
+
+ hlist_for_each_entry(cur_id, node, &bind_list->owners, node) {
+ if (cma_any_addr((struct sockaddr *) &cur_id->id.route.addr.src_addr))
+ return -EADDRNOTAVAIL;
+
+ cur_sin = (struct sockaddr_in *) &cur_id->id.route.addr.src_addr;
+ if (sin->sin_addr.s_addr == cur_sin->sin_addr.s_addr)
+ return -EADDRINUSE;
+ }
+
+ cma_bind_port(bind_list, id_priv);
+ return 0;
+}
+
+static int cma_get_tcp_port(struct rdma_id_private *id_priv)
+{
+ int ret;
+ int size;
+ struct socket *sock;
+
+ ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+ if (ret)
+ return ret;
+#ifdef __linux__
+ ret = sock->ops->bind(sock,
+ (struct sockaddr *) &id_priv->id.route.addr.src_addr,
+ ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr));
+#else
+ ret = -sobind(sock,
+ (struct sockaddr *)&id_priv->id.route.addr.src_addr,
+ curthread);
+#endif
+ if (ret) {
+ sock_release(sock);
+ return ret;
+ }
+ size = ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr);
+ ret = sock_getname(sock,
+ (struct sockaddr *) &id_priv->id.route.addr.src_addr,
+ &size, 0);
+ if (ret) {
+ sock_release(sock);
+ return ret;
+ }
+ id_priv->sock = sock;
+ return 0;
+}
+
+static int cma_get_port(struct rdma_id_private *id_priv)
+{
+ struct idr *ps;
+ int ret;
+
+ switch (id_priv->id.ps) {
+ case RDMA_PS_SDP:
+ ps = &sdp_ps;
+ break;
+ case RDMA_PS_TCP:
+ ps = &tcp_ps;
+ if (unify_tcp_port_space) {
+ ret = cma_get_tcp_port(id_priv);
+ if (ret)
+ goto out;
+ }
+ break;
+ case RDMA_PS_UDP:
+ ps = &udp_ps;
+ break;
+ case RDMA_PS_IPOIB:
+ ps = &ipoib_ps;
+ break;
+ default:
+ return -EPROTONOSUPPORT;
+ }
+
+ mutex_lock(&lock);
+ if (cma_any_port((struct sockaddr *) &id_priv->id.route.addr.src_addr))
+ ret = cma_alloc_any_port(ps, id_priv);
+ else
+ ret = cma_use_port(ps, id_priv);
+ mutex_unlock(&lock);
+out:
+ return ret;
+}
+
+static int cma_check_linklocal(struct rdma_dev_addr *dev_addr,
+ struct sockaddr *addr)
+{
+#if defined(INET6)
+ struct sockaddr_in6 *sin6;
+
+ if (addr->sa_family != AF_INET6)
+ return 0;
+
+ sin6 = (struct sockaddr_in6 *) addr;
+#ifdef __linux__
+ if ((ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) &&
+#else
+ if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) &&
+#endif
+ !sin6->sin6_scope_id)
+ return -EINVAL;
+
+ dev_addr->bound_dev_if = sin6->sin6_scope_id;
+#endif
+ return 0;
+}
+
+int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
+ return -EAFNOSUPPORT;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!cma_comp_exch(id_priv, CMA_IDLE, CMA_ADDR_BOUND))
+ return -EINVAL;
+
+ ret = cma_check_linklocal(&id->route.addr.dev_addr, addr);
+ if (ret)
+ goto err1;
+
+ if (!cma_any_addr(addr)) {
+ ret = rdma_translate_ip(addr, &id->route.addr.dev_addr);
+ if (ret)
+ goto err1;
+
+ mutex_lock(&lock);
+ ret = cma_acquire_dev(id_priv);
+ mutex_unlock(&lock);
+ if (ret)
+ goto err1;
+ }
+
+ memcpy(&id->route.addr.src_addr, addr, ip_addr_size(addr));
+ ret = cma_get_port(id_priv);
+ if (ret)
+ goto err2;
+
+ return 0;
+err2:
+ if (id_priv->cma_dev) {
+ mutex_lock(&lock);
+ cma_detach_from_dev(id_priv);
+ mutex_unlock(&lock);
+ }
+err1:
+ cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_IDLE);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_bind_addr);
+
+static int cma_format_hdr(void *hdr, enum rdma_port_space ps,
+ struct rdma_route *route)
+{
+ struct cma_hdr *cma_hdr;
+ struct sdp_hh *sdp_hdr;
+
+ if (route->addr.src_addr.ss_family == AF_INET) {
+ struct sockaddr_in *src4, *dst4;
+
+ src4 = (struct sockaddr_in *) &route->addr.src_addr;
+ dst4 = (struct sockaddr_in *) &route->addr.dst_addr;
+
+ switch (ps) {
+ case RDMA_PS_SDP:
+ sdp_hdr = hdr;
+ if (sdp_get_majv(sdp_hdr->sdp_version) != SDP_MAJ_VERSION)
+ return -EINVAL;
+ sdp_set_ip_ver(sdp_hdr, 4);
+ sdp_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr;
+ sdp_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr;
+ sdp_hdr->port = src4->sin_port;
+ break;
+ default:
+ cma_hdr = hdr;
+ cma_hdr->cma_version = CMA_VERSION;
+ cma_set_ip_ver(cma_hdr, 4);
+ cma_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr;
+ cma_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr;
+ cma_hdr->port = src4->sin_port;
+ break;
+ }
+ } else {
+ struct sockaddr_in6 *src6, *dst6;
+
+ src6 = (struct sockaddr_in6 *) &route->addr.src_addr;
+ dst6 = (struct sockaddr_in6 *) &route->addr.dst_addr;
+
+ switch (ps) {
+ case RDMA_PS_SDP:
+ sdp_hdr = hdr;
+ if (sdp_get_majv(sdp_hdr->sdp_version) != SDP_MAJ_VERSION)
+ return -EINVAL;
+ sdp_set_ip_ver(sdp_hdr, 6);
+ sdp_hdr->src_addr.ip6 = src6->sin6_addr;
+ sdp_hdr->dst_addr.ip6 = dst6->sin6_addr;
+ sdp_hdr->port = src6->sin6_port;
+ break;
+ default:
+ cma_hdr = hdr;
+ cma_hdr->cma_version = CMA_VERSION;
+ cma_set_ip_ver(cma_hdr, 6);
+ cma_hdr->src_addr.ip6 = src6->sin6_addr;
+ cma_hdr->dst_addr.ip6 = dst6->sin6_addr;
+ cma_hdr->port = src6->sin6_port;
+ break;
+ }
+ }
+ return 0;
+}
+
+static int cma_sidr_rep_handler(struct ib_cm_id *cm_id,
+ struct ib_cm_event *ib_event)
+{
+ struct rdma_id_private *id_priv = cm_id->context;
+ struct rdma_cm_event event;
+ struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd;
+ int ret = 0;
+
+ if (cma_disable_callback(id_priv, CMA_CONNECT))
+ return 0;
+
+ memset(&event, 0, sizeof event);
+ switch (ib_event->event) {
+ case IB_CM_SIDR_REQ_ERROR:
+ event.event = RDMA_CM_EVENT_UNREACHABLE;
+ event.status = -ETIMEDOUT;
+ break;
+ case IB_CM_SIDR_REP_RECEIVED:
+ event.param.ud.private_data = ib_event->private_data;
+ event.param.ud.private_data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE;
+ if (rep->status != IB_SIDR_SUCCESS) {
+ event.event = RDMA_CM_EVENT_UNREACHABLE;
+ event.status = ib_event->param.sidr_rep_rcvd.status;
+ break;
+ }
+ ret = cma_set_qkey(id_priv);
+ if (ret) {
+ event.event = RDMA_CM_EVENT_ADDR_ERROR;
+ event.status = -EINVAL;
+ break;
+ }
+ if (id_priv->qkey != rep->qkey) {
+ event.event = RDMA_CM_EVENT_UNREACHABLE;
+ event.status = -EINVAL;
+ break;
+ }
+ ib_init_ah_from_path(id_priv->id.device, id_priv->id.port_num,
+ id_priv->id.route.path_rec,
+ &event.param.ud.ah_attr);
+ event.param.ud.qp_num = rep->qpn;
+ event.param.ud.qkey = rep->qkey;
+ event.event = RDMA_CM_EVENT_ESTABLISHED;
+ event.status = 0;
+ break;
+ default:
+ printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d\n",
+ ib_event->event);
+ goto out;
+ }
+
+ ret = id_priv->id.event_handler(&id_priv->id, &event);
+ if (ret) {
+ /* Destroy the CM ID by returning a non-zero value. */
+ id_priv->cm_id.ib = NULL;
+ cma_exch(id_priv, CMA_DESTROYING);
+ mutex_unlock(&id_priv->handler_mutex);
+ rdma_destroy_id(&id_priv->id);
+ return ret;
+ }
+out:
+ mutex_unlock(&id_priv->handler_mutex);
+ return ret;
+}
+
+static int cma_resolve_ib_udp(struct rdma_id_private *id_priv,
+ struct rdma_conn_param *conn_param)
+{
+ struct ib_cm_sidr_req_param req;
+ struct rdma_route *route;
+ int ret;
+
+ req.private_data_len = sizeof(struct cma_hdr) +
+ conn_param->private_data_len;
+ req.private_data = kzalloc(req.private_data_len, GFP_ATOMIC);
+ if (!req.private_data)
+ return -ENOMEM;
+
+ if (conn_param->private_data && conn_param->private_data_len)
+ memcpy((void *) req.private_data + sizeof(struct cma_hdr),
+ conn_param->private_data, conn_param->private_data_len);
+
+ route = &id_priv->id.route;
+ ret = cma_format_hdr((void *) req.private_data, id_priv->id.ps, route);
+ if (ret)
+ goto out;
+
+ id_priv->cm_id.ib = ib_create_cm_id(id_priv->id.device,
+ cma_sidr_rep_handler, id_priv);
+ if (IS_ERR(id_priv->cm_id.ib)) {
+ ret = PTR_ERR(id_priv->cm_id.ib);
+ goto out;
+ }
+
+ req.path = route->path_rec;
+ req.service_id = cma_get_service_id(id_priv->id.ps,
+ (struct sockaddr *) &route->addr.dst_addr);
+ req.timeout_ms = 1 << (cma_response_timeout - 8);
+ req.max_cm_retries = CMA_MAX_CM_RETRIES;
+
+ ret = ib_send_cm_sidr_req(id_priv->cm_id.ib, &req);
+ if (ret) {
+ ib_destroy_cm_id(id_priv->cm_id.ib);
+ id_priv->cm_id.ib = NULL;
+ }
+out:
+ kfree(req.private_data);
+ return ret;
+}
+
+static int cma_connect_ib(struct rdma_id_private *id_priv,
+ struct rdma_conn_param *conn_param)
+{
+ struct ib_cm_req_param req;
+ struct rdma_route *route;
+ void *private_data;
+ int offset, ret;
+
+ memset(&req, 0, sizeof req);
+ offset = cma_user_data_offset(id_priv->id.ps);
+ req.private_data_len = offset + conn_param->private_data_len;
+ private_data = kzalloc(req.private_data_len, GFP_ATOMIC);
+ if (!private_data)
+ return -ENOMEM;
+
+ if (conn_param->private_data && conn_param->private_data_len)
+ memcpy(private_data + offset, conn_param->private_data,
+ conn_param->private_data_len);
+
+ id_priv->cm_id.ib = ib_create_cm_id(id_priv->id.device, cma_ib_handler,
+ id_priv);
+ if (IS_ERR(id_priv->cm_id.ib)) {
+ ret = PTR_ERR(id_priv->cm_id.ib);
+ goto out;
+ }
+
+ route = &id_priv->id.route;
+ ret = cma_format_hdr(private_data, id_priv->id.ps, route);
+ if (ret)
+ goto out;
+ req.private_data = private_data;
+
+ req.primary_path = &route->path_rec[0];
+ if (route->num_paths == 2)
+ req.alternate_path = &route->path_rec[1];
+
+ req.service_id = cma_get_service_id(id_priv->id.ps,
+ (struct sockaddr *) &route->addr.dst_addr);
+ req.qp_num = id_priv->qp_num;
+ req.qp_type = IB_QPT_RC;
+ req.starting_psn = id_priv->seq_num;
+ req.responder_resources = conn_param->responder_resources;
+ req.initiator_depth = conn_param->initiator_depth;
+ req.flow_control = conn_param->flow_control;
+ req.retry_count = conn_param->retry_count;
+ req.rnr_retry_count = conn_param->rnr_retry_count;
+ req.remote_cm_response_timeout = cma_response_timeout;
+ req.local_cm_response_timeout = cma_response_timeout;
+ req.max_cm_retries = CMA_MAX_CM_RETRIES;
+ req.srq = id_priv->srq ? 1 : 0;
+
+ ret = ib_send_cm_req(id_priv->cm_id.ib, &req);
+out:
+ if (ret && !IS_ERR(id_priv->cm_id.ib)) {
+ ib_destroy_cm_id(id_priv->cm_id.ib);
+ id_priv->cm_id.ib = NULL;
+ }
+
+ kfree(private_data);
+ return ret;
+}
+
+static int cma_connect_iw(struct rdma_id_private *id_priv,
+ struct rdma_conn_param *conn_param)
+{
+ struct iw_cm_id *cm_id;
+ struct sockaddr_in* sin;
+ int ret;
+ struct iw_cm_conn_param iw_param;
+
+ cm_id = iw_create_cm_id(id_priv->id.device, cma_iw_handler, id_priv);
+ if (IS_ERR(cm_id)) {
+ ret = PTR_ERR(cm_id);
+ goto out;
+ }
+
+ id_priv->cm_id.iw = cm_id;
+
+ sin = (struct sockaddr_in*) &id_priv->id.route.addr.src_addr;
+ cm_id->local_addr = *sin;
+
+ sin = (struct sockaddr_in*) &id_priv->id.route.addr.dst_addr;
+ cm_id->remote_addr = *sin;
+
+ ret = cma_modify_qp_rtr(id_priv, conn_param);
+ if (ret)
+ goto out;
+
+ iw_param.ord = conn_param->initiator_depth;
+ iw_param.ird = conn_param->responder_resources;
+ iw_param.private_data = conn_param->private_data;
+ iw_param.private_data_len = conn_param->private_data_len;
+ if (id_priv->id.qp)
+ iw_param.qpn = id_priv->qp_num;
+ else
+ iw_param.qpn = conn_param->qp_num;
+ ret = iw_cm_connect(cm_id, &iw_param);
+out:
+ if (ret && !IS_ERR(cm_id)) {
+ iw_destroy_cm_id(cm_id);
+ id_priv->cm_id.iw = NULL;
+ }
+ return ret;
+}
+
+int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!cma_comp_exch(id_priv, CMA_ROUTE_RESOLVED, CMA_CONNECT))
+ return -EINVAL;
+
+ if (!id->qp) {
+ id_priv->qp_num = conn_param->qp_num;
+ id_priv->srq = conn_param->srq;
+ }
+
+ switch (rdma_node_get_transport(id->device->node_type)) {
+ case RDMA_TRANSPORT_IB:
+ if (cma_is_ud_ps(id->ps))
+ ret = cma_resolve_ib_udp(id_priv, conn_param);
+ else
+ ret = cma_connect_ib(id_priv, conn_param);
+ break;
+ case RDMA_TRANSPORT_IWARP:
+ ret = cma_connect_iw(id_priv, conn_param);
+ break;
+ default:
+ ret = -ENOSYS;
+ break;
+ }
+ if (ret)
+ goto err;
+
+ return 0;
+err:
+ cma_comp_exch(id_priv, CMA_CONNECT, CMA_ROUTE_RESOLVED);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_connect);
+
+static int cma_accept_ib(struct rdma_id_private *id_priv,
+ struct rdma_conn_param *conn_param)
+{
+ struct ib_cm_rep_param rep;
+ int ret;
+
+ ret = cma_modify_qp_rtr(id_priv, conn_param);
+ if (ret)
+ goto out;
+
+ ret = cma_modify_qp_rts(id_priv, conn_param);
+ if (ret)
+ goto out;
+
+ memset(&rep, 0, sizeof rep);
+ rep.qp_num = id_priv->qp_num;
+ rep.starting_psn = id_priv->seq_num;
+ rep.private_data = conn_param->private_data;
+ rep.private_data_len = conn_param->private_data_len;
+ rep.responder_resources = conn_param->responder_resources;
+ rep.initiator_depth = conn_param->initiator_depth;
+ rep.failover_accepted = 0;
+ rep.flow_control = conn_param->flow_control;
+ rep.rnr_retry_count = conn_param->rnr_retry_count;
+ rep.srq = id_priv->srq ? 1 : 0;
+
+ ret = ib_send_cm_rep(id_priv->cm_id.ib, &rep);
+out:
+ return ret;
+}
+
+static int cma_accept_iw(struct rdma_id_private *id_priv,
+ struct rdma_conn_param *conn_param)
+{
+ struct iw_cm_conn_param iw_param;
+ int ret;
+
+ ret = cma_modify_qp_rtr(id_priv, conn_param);
+ if (ret)
+ return ret;
+
+ iw_param.ord = conn_param->initiator_depth;
+ iw_param.ird = conn_param->responder_resources;
+ iw_param.private_data = conn_param->private_data;
+ iw_param.private_data_len = conn_param->private_data_len;
+ if (id_priv->id.qp) {
+ iw_param.qpn = id_priv->qp_num;
+ } else
+ iw_param.qpn = conn_param->qp_num;
+
+ return iw_cm_accept(id_priv->cm_id.iw, &iw_param);
+}
+
+static int cma_send_sidr_rep(struct rdma_id_private *id_priv,
+ enum ib_cm_sidr_status status,
+ const void *private_data, int private_data_len)
+{
+ struct ib_cm_sidr_rep_param rep;
+ int ret;
+
+ memset(&rep, 0, sizeof rep);
+ rep.status = status;
+ if (status == IB_SIDR_SUCCESS) {
+ ret = cma_set_qkey(id_priv);
+ if (ret)
+ return ret;
+ rep.qp_num = id_priv->qp_num;
+ rep.qkey = id_priv->qkey;
+ }
+ rep.private_data = private_data;
+ rep.private_data_len = private_data_len;
+
+ return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep);
+}
+
+int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!cma_comp(id_priv, CMA_CONNECT))
+ return -EINVAL;
+
+ if (!id->qp && conn_param) {
+ id_priv->qp_num = conn_param->qp_num;
+ id_priv->srq = conn_param->srq;
+ }
+
+ switch (rdma_node_get_transport(id->device->node_type)) {
+ case RDMA_TRANSPORT_IB:
+ if (cma_is_ud_ps(id->ps))
+ ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS,
+ conn_param->private_data,
+ conn_param->private_data_len);
+ else if (conn_param)
+ ret = cma_accept_ib(id_priv, conn_param);
+ else
+ ret = cma_rep_recv(id_priv);
+ break;
+ case RDMA_TRANSPORT_IWARP:
+ ret = cma_accept_iw(id_priv, conn_param);
+ break;
+ default:
+ ret = -ENOSYS;
+ break;
+ }
+
+ if (ret)
+ goto reject;
+
+ return 0;
+reject:
+ cma_modify_qp_err(id_priv);
+ rdma_reject(id, NULL, 0);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_accept);
+
+int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!cma_has_cm_dev(id_priv))
+ return -EINVAL;
+
+ switch (id->device->node_type) {
+ case RDMA_NODE_IB_CA:
+ ret = ib_cm_notify(id_priv->cm_id.ib, event);
+ break;
+ default:
+ ret = 0;
+ break;
+ }
+ return ret;
+}
+EXPORT_SYMBOL(rdma_notify);
+
+int rdma_reject(struct rdma_cm_id *id, const void *private_data,
+ u8 private_data_len)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!cma_has_cm_dev(id_priv))
+ return -EINVAL;
+
+ switch (rdma_node_get_transport(id->device->node_type)) {
+ case RDMA_TRANSPORT_IB:
+ if (cma_is_ud_ps(id->ps))
+ ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT,
+ private_data, private_data_len);
+ else
+ ret = ib_send_cm_rej(id_priv->cm_id.ib,
+ IB_CM_REJ_CONSUMER_DEFINED, NULL,
+ 0, private_data, private_data_len);
+ break;
+ case RDMA_TRANSPORT_IWARP:
+ ret = iw_cm_reject(id_priv->cm_id.iw,
+ private_data, private_data_len);
+ break;
+ default:
+ ret = -ENOSYS;
+ break;
+ }
+ return ret;
+}
+EXPORT_SYMBOL(rdma_reject);
+
+int rdma_disconnect(struct rdma_cm_id *id)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!cma_has_cm_dev(id_priv))
+ return -EINVAL;
+
+ switch (rdma_node_get_transport(id->device->node_type)) {
+ case RDMA_TRANSPORT_IB:
+ ret = cma_modify_qp_err(id_priv);
+ if (ret)
+ goto out;
+ /* Initiate or respond to a disconnect. */
+ if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0))
+ ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0);
+ break;
+ case RDMA_TRANSPORT_IWARP:
+ ret = iw_cm_disconnect(id_priv->cm_id.iw, 0);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+out:
+ return ret;
+}
+EXPORT_SYMBOL(rdma_disconnect);
+
+static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast)
+{
+ struct rdma_id_private *id_priv;
+ struct cma_multicast *mc = multicast->context;
+ struct rdma_cm_event event;
+ int ret;
+
+ id_priv = mc->id_priv;
+ if (cma_disable_callback(id_priv, CMA_ADDR_BOUND) &&
+ cma_disable_callback(id_priv, CMA_ADDR_RESOLVED))
+ return 0;
+
+ mutex_lock(&id_priv->qp_mutex);
+ if (!status && id_priv->id.qp)
+ status = ib_attach_mcast(id_priv->id.qp, &multicast->rec.mgid,
+ multicast->rec.mlid);
+ mutex_unlock(&id_priv->qp_mutex);
+
+ memset(&event, 0, sizeof event);
+ event.status = status;
+ event.param.ud.private_data = mc->context;
+ if (!status) {
+ event.event = RDMA_CM_EVENT_MULTICAST_JOIN;
+ ib_init_ah_from_mcmember(id_priv->id.device,
+ id_priv->id.port_num, &multicast->rec,
+ &event.param.ud.ah_attr);
+ event.param.ud.qp_num = 0xFFFFFF;
+ event.param.ud.qkey = be32_to_cpu(multicast->rec.qkey);
+ } else
+ event.event = RDMA_CM_EVENT_MULTICAST_ERROR;
+
+ ret = id_priv->id.event_handler(&id_priv->id, &event);
+ if (ret) {
+ cma_exch(id_priv, CMA_DESTROYING);
+ mutex_unlock(&id_priv->handler_mutex);
+ rdma_destroy_id(&id_priv->id);
+ return 0;
+ }
+
+ mutex_unlock(&id_priv->handler_mutex);
+ return 0;
+}
+
+static void cma_set_mgid(struct rdma_id_private *id_priv,
+ struct sockaddr *addr, union ib_gid *mgid)
+{
+ unsigned char mc_map[MAX_ADDR_LEN];
+ struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+ struct sockaddr_in *sin = (struct sockaddr_in *) addr;
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) addr;
+
+ if (cma_any_addr(addr)) {
+ memset(mgid, 0, sizeof *mgid);
+ } else if ((addr->sa_family == AF_INET6) &&
+ ((be32_to_cpu(sin6->sin6_addr.s6_addr32[0]) & 0xFFF0FFFF) ==
+ 0xFF10A01B)) {
+ /* IPv6 address is an SA assigned MGID. */
+ memcpy(mgid, &sin6->sin6_addr, sizeof *mgid);
+ } else if ((addr->sa_family == AF_INET6)) {
+ ipv6_ib_mc_map(&sin6->sin6_addr, dev_addr->broadcast, mc_map);
+ if (id_priv->id.ps == RDMA_PS_UDP)
+ mc_map[7] = 0x01; /* Use RDMA CM signature */
+ *mgid = *(union ib_gid *) (mc_map + 4);
+ } else {
+ ip_ib_mc_map(sin->sin_addr.s_addr, dev_addr->broadcast, mc_map);
+ if (id_priv->id.ps == RDMA_PS_UDP)
+ mc_map[7] = 0x01; /* Use RDMA CM signature */
+ *mgid = *(union ib_gid *) (mc_map + 4);
+ }
+}
+
+static int cma_join_ib_multicast(struct rdma_id_private *id_priv,
+ struct cma_multicast *mc)
+{
+ struct ib_sa_mcmember_rec rec;
+ struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+ ib_sa_comp_mask comp_mask;
+ int ret;
+
+ ib_addr_get_mgid(dev_addr, &rec.mgid);
+ ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num,
+ &rec.mgid, &rec);
+ if (ret)
+ return ret;
+
+ cma_set_mgid(id_priv, (struct sockaddr *) &mc->addr, &rec.mgid);
+ if (id_priv->id.ps == RDMA_PS_UDP)
+ rec.qkey = cpu_to_be32(RDMA_UDP_QKEY);
+ rdma_addr_get_sgid(dev_addr, &rec.port_gid);
+ rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr));
+ rec.join_state = 1;
+
+ comp_mask = IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID |
+ IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE |
+ IB_SA_MCMEMBER_REC_QKEY | IB_SA_MCMEMBER_REC_SL |
+ IB_SA_MCMEMBER_REC_FLOW_LABEL |
+ IB_SA_MCMEMBER_REC_TRAFFIC_CLASS;
+
+ if (id_priv->id.ps == RDMA_PS_IPOIB)
+ comp_mask |= IB_SA_MCMEMBER_REC_RATE |
+ IB_SA_MCMEMBER_REC_RATE_SELECTOR;
+
+ mc->multicast.ib = ib_sa_join_multicast(&sa_client, id_priv->id.device,
+ id_priv->id.port_num, &rec,
+ comp_mask, GFP_KERNEL,
+ cma_ib_mc_handler, mc);
+ if (IS_ERR(mc->multicast.ib))
+ return PTR_ERR(mc->multicast.ib);
+
+ return 0;
+}
+
+
+static void iboe_mcast_work_handler(struct work_struct *work)
+{
+ struct iboe_mcast_work *mw = container_of(work, struct iboe_mcast_work, work);
+ struct cma_multicast *mc = mw->mc;
+ struct ib_sa_multicast *m = mc->multicast.ib;
+
+ mc->multicast.ib->context = mc;
+ cma_ib_mc_handler(0, m);
+ kref_put(&mc->mcref, release_mc);
+ kfree(mw);
+}
+
+static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid)
+{
+ struct sockaddr_in *sin = (struct sockaddr_in *)addr;
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr;
+
+ if (cma_any_addr(addr)) {
+ memset(mgid, 0, sizeof *mgid);
+ } else if (addr->sa_family == AF_INET6)
+ memcpy(mgid, &sin6->sin6_addr, sizeof *mgid);
+ else {
+ mgid->raw[0] = 0xff;
+ mgid->raw[1] = 0x0e;
+ mgid->raw[2] = 0;
+ mgid->raw[3] = 0;
+ mgid->raw[4] = 0;
+ mgid->raw[5] = 0;
+ mgid->raw[6] = 0;
+ mgid->raw[7] = 0;
+ mgid->raw[8] = 0;
+ mgid->raw[9] = 0;
+ mgid->raw[10] = 0xff;
+ mgid->raw[11] = 0xff;
+ *(__be32 *)(&mgid->raw[12]) = sin->sin_addr.s_addr;
+ }
+}
+
+static int cma_iboe_join_multicast(struct rdma_id_private *id_priv,
+ struct cma_multicast *mc)
+{
+ struct iboe_mcast_work *work;
+ struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+ int err;
+ struct sockaddr *addr = (struct sockaddr *)&mc->addr;
+ struct net_device *ndev = NULL;
+
+ if (cma_zero_addr((struct sockaddr *)&mc->addr))
+ return -EINVAL;
+
+ work = kzalloc(sizeof *work, GFP_KERNEL);
+ if (!work)
+ return -ENOMEM;
+
+ mc->multicast.ib = kzalloc(sizeof(struct ib_sa_multicast), GFP_KERNEL);
+ if (!mc->multicast.ib) {
+ err = -ENOMEM;
+ goto out1;
+ }
+
+ cma_iboe_set_mgid(addr, &mc->multicast.ib->rec.mgid);
+
+ mc->multicast.ib->rec.pkey = cpu_to_be16(0xffff);
+ if (id_priv->id.ps == RDMA_PS_UDP)
+ mc->multicast.ib->rec.qkey = cpu_to_be32(RDMA_UDP_QKEY);
+
+ if (dev_addr->bound_dev_if)
+ ndev = dev_get_by_index(&init_net, dev_addr->bound_dev_if);
+ if (!ndev) {
+ err = -ENODEV;
+ goto out2;
+ }
+
+ mc->multicast.ib->rec.rate = iboe_get_rate(ndev);
+ mc->multicast.ib->rec.hop_limit = 1;
+#ifdef __linux__
+ mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->mtu);
+#else
+ mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->if_mtu);
+#endif
+ dev_put(ndev);
+ if (!mc->multicast.ib->rec.mtu) {
+ err = -EINVAL;
+ goto out2;
+ }
+ iboe_addr_get_sgid(dev_addr, &mc->multicast.ib->rec.port_gid);
+ work->id = id_priv;
+ work->mc = mc;
+ INIT_WORK(&work->work, iboe_mcast_work_handler);
+ kref_get(&mc->mcref);
+ queue_work(cma_wq, &work->work);
+
+ return 0;
+
+out2:
+ kfree(mc->multicast.ib);
+out1:
+ kfree(work);
+ return err;
+}
+
+int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
+ void *context)
+{
+ struct rdma_id_private *id_priv;
+ struct cma_multicast *mc;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!cma_comp(id_priv, CMA_ADDR_BOUND) &&
+ !cma_comp(id_priv, CMA_ADDR_RESOLVED))
+ return -EINVAL;
+
+ mc = kmalloc(sizeof *mc, GFP_KERNEL);
+ if (!mc)
+ return -ENOMEM;
+
+ memcpy(&mc->addr, addr, ip_addr_size(addr));
+ mc->context = context;
+ mc->id_priv = id_priv;
+
+ spin_lock(&id_priv->lock);
+ list_add(&mc->list, &id_priv->mc_list);
+ spin_unlock(&id_priv->lock);
+
+ switch (rdma_node_get_transport(id->device->node_type)) {
+ case RDMA_TRANSPORT_IB:
+ switch (rdma_port_get_link_layer(id->device, id->port_num)) {
+ case IB_LINK_LAYER_INFINIBAND:
+ ret = cma_join_ib_multicast(id_priv, mc);
+ break;
+ case IB_LINK_LAYER_ETHERNET:
+ kref_init(&mc->mcref);
+ ret = cma_iboe_join_multicast(id_priv, mc);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+ break;
+ default:
+ ret = -ENOSYS;
+ break;
+ }
+
+ if (ret) {
+ spin_lock_irq(&id_priv->lock);
+ list_del(&mc->list);
+ spin_unlock_irq(&id_priv->lock);
+ kfree(mc);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(rdma_join_multicast);
+
+void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr)
+{
+ struct rdma_id_private *id_priv;
+ struct cma_multicast *mc;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ spin_lock_irq(&id_priv->lock);
+ list_for_each_entry(mc, &id_priv->mc_list, list) {
+ if (!memcmp(&mc->addr, addr, ip_addr_size(addr))) {
+ list_del(&mc->list);
+ spin_unlock_irq(&id_priv->lock);
+
+ if (id->qp)
+ ib_detach_mcast(id->qp,
+ &mc->multicast.ib->rec.mgid,
+ mc->multicast.ib->rec.mlid);
+ if (rdma_node_get_transport(id_priv->cma_dev->device->node_type) == RDMA_TRANSPORT_IB) {
+ switch (rdma_port_get_link_layer(id->device, id->port_num)) {
+ case IB_LINK_LAYER_INFINIBAND:
+ ib_sa_free_multicast(mc->multicast.ib);
+ kfree(mc);
+ break;
+ case IB_LINK_LAYER_ETHERNET:
+ kref_put(&mc->mcref, release_mc);
+ break;
+ default:
+ break;
+ }
+ }
+ return;
+ }
+ }
+ spin_unlock_irq(&id_priv->lock);
+}
+EXPORT_SYMBOL(rdma_leave_multicast);
+
+static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id_priv)
+{
+ struct rdma_dev_addr *dev_addr;
+ struct cma_ndev_work *work;
+
+ dev_addr = &id_priv->id.route.addr.dev_addr;
+
+#ifdef __linux__
+ if ((dev_addr->bound_dev_if == ndev->ifindex) &&
+ memcmp(dev_addr->src_dev_addr, ndev->dev_addr, ndev->addr_len)) {
+ printk(KERN_INFO "RDMA CM addr change for ndev %s used by id %p\n",
+ ndev->name, &id_priv->id);
+#else
+ if ((dev_addr->bound_dev_if == ndev->if_index) &&
+ memcmp(dev_addr->src_dev_addr, IF_LLADDR(ndev), ndev->if_addrlen)) {
+ printk(KERN_INFO "RDMA CM addr change for ndev %s used by id %p\n",
+ ndev->if_xname, &id_priv->id);
+#endif
+ work = kzalloc(sizeof *work, GFP_KERNEL);
+ if (!work)
+ return -ENOMEM;
+
+ INIT_WORK(&work->work, cma_ndev_work_handler);
+ work->id = id_priv;
+ work->event.event = RDMA_CM_EVENT_ADDR_CHANGE;
+ atomic_inc(&id_priv->refcount);
+ queue_work(cma_wq, &work->work);
+ }
+
+ return 0;
+}
+
+static int cma_netdev_callback(struct notifier_block *self, unsigned long event,
+ void *ctx)
+{
+ struct net_device *ndev = (struct net_device *)ctx;
+ struct cma_device *cma_dev;
+ struct rdma_id_private *id_priv;
+ int ret = NOTIFY_DONE;
+
+#ifdef __linux__
+ if (dev_net(ndev) != &init_net)
+ return NOTIFY_DONE;
+
+ if (event != NETDEV_BONDING_FAILOVER)
+ return NOTIFY_DONE;
+
+ if (!(ndev->flags & IFF_MASTER) || !(ndev->priv_flags & IFF_BONDING))
+ return NOTIFY_DONE;
+#else
+ if (event != NETDEV_DOWN && event != NETDEV_UNREGISTER)
+ return NOTIFY_DONE;
+#endif
+
+ mutex_lock(&lock);
+ list_for_each_entry(cma_dev, &dev_list, list)
+ list_for_each_entry(id_priv, &cma_dev->id_list, list) {
+ ret = cma_netdev_change(ndev, id_priv);
+ if (ret)
+ goto out;
+ }
+
+out:
+ mutex_unlock(&lock);
+ return ret;
+}
+
+static struct notifier_block cma_nb = {
+ .notifier_call = cma_netdev_callback
+};
+
+static void cma_add_one(struct ib_device *device)
+{
+ struct cma_device *cma_dev;
+ struct rdma_id_private *id_priv;
+
+ cma_dev = kmalloc(sizeof *cma_dev, GFP_KERNEL);
+ if (!cma_dev)
+ return;
+
+ cma_dev->device = device;
+
+ init_completion(&cma_dev->comp);
+ atomic_set(&cma_dev->refcount, 1);
+ INIT_LIST_HEAD(&cma_dev->id_list);
+ ib_set_client_data(device, &cma_client, cma_dev);
+
+ mutex_lock(&lock);
+ list_add_tail(&cma_dev->list, &dev_list);
+ list_for_each_entry(id_priv, &listen_any_list, list)
+ cma_listen_on_dev(id_priv, cma_dev);
+ mutex_unlock(&lock);
+}
+
+static int cma_remove_id_dev(struct rdma_id_private *id_priv)
+{
+ struct rdma_cm_event event;
+ enum cma_state state;
+ int ret = 0;
+
+ /* Record that we want to remove the device */
+ state = cma_exch(id_priv, CMA_DEVICE_REMOVAL);
+ if (state == CMA_DESTROYING)
+ return 0;
+
+ cma_cancel_operation(id_priv, state);
+ mutex_lock(&id_priv->handler_mutex);
+
+ /* Check for destruction from another callback. */
+ if (!cma_comp(id_priv, CMA_DEVICE_REMOVAL))
+ goto out;
+
+ memset(&event, 0, sizeof event);
+ event.event = RDMA_CM_EVENT_DEVICE_REMOVAL;
+ ret = id_priv->id.event_handler(&id_priv->id, &event);
+out:
+ mutex_unlock(&id_priv->handler_mutex);
+ return ret;
+}
+
+static void cma_process_remove(struct cma_device *cma_dev)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ mutex_lock(&lock);
+ while (!list_empty(&cma_dev->id_list)) {
+ id_priv = list_entry(cma_dev->id_list.next,
+ struct rdma_id_private, list);
+
+ list_del(&id_priv->listen_list);
+ list_del_init(&id_priv->list);
+ atomic_inc(&id_priv->refcount);
+ mutex_unlock(&lock);
+
+ ret = id_priv->internal_id ? 1 : cma_remove_id_dev(id_priv);
+ cma_deref_id(id_priv);
+ if (ret)
+ rdma_destroy_id(&id_priv->id);
+
+ mutex_lock(&lock);
+ }
+ mutex_unlock(&lock);
+
+ cma_deref_dev(cma_dev);
+ wait_for_completion(&cma_dev->comp);
+}
+
+static void cma_remove_one(struct ib_device *device)
+{
+ struct cma_device *cma_dev;
+
+ cma_dev = ib_get_client_data(device, &cma_client);
+ if (!cma_dev)
+ return;
+
+ mutex_lock(&lock);
+ list_del(&cma_dev->list);
+ mutex_unlock(&lock);
+
+ cma_process_remove(cma_dev);
+ kfree(cma_dev);
+}
+
+static int cma_init(void)
+{
+ int ret, low, high, remaining;
+
+ get_random_bytes(&next_port, sizeof next_port);
+ inet_get_local_port_range(&low, &high);
+ remaining = (high - low) + 1;
+ next_port = ((unsigned int) next_port % remaining) + low;
+
+ cma_wq = create_singlethread_workqueue("rdma_cm");
+ if (!cma_wq)
+ return -ENOMEM;
+
+ ib_sa_register_client(&sa_client);
+ rdma_addr_register_client(&addr_client);
+ register_netdevice_notifier(&cma_nb);
+
+ ret = ib_register_client(&cma_client);
+ if (ret)
+ goto err;
+ return 0;
+
+err:
+ unregister_netdevice_notifier(&cma_nb);
+ rdma_addr_unregister_client(&addr_client);
+ ib_sa_unregister_client(&sa_client);
+ destroy_workqueue(cma_wq);
+ return ret;
+}
+
+static void cma_cleanup(void)
+{
+ ib_unregister_client(&cma_client);
+ unregister_netdevice_notifier(&cma_nb);
+ rdma_addr_unregister_client(&addr_client);
+ ib_sa_unregister_client(&sa_client);
+ destroy_workqueue(cma_wq);
+ idr_destroy(&sdp_ps);
+ idr_destroy(&tcp_ps);
+ idr_destroy(&udp_ps);
+ idr_destroy(&ipoib_ps);
+}
+
+module_init(cma_init);
+module_exit(cma_cleanup);
diff --git a/sys/ofed/drivers/infiniband/core/core_priv.h b/sys/ofed/drivers/infiniband/core/core_priv.h
new file mode 100644
index 0000000..05ac36e
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/core_priv.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _CORE_PRIV_H
+#define _CORE_PRIV_H
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+#include <rdma/ib_verbs.h>
+
+int ib_device_register_sysfs(struct ib_device *device);
+void ib_device_unregister_sysfs(struct ib_device *device);
+
+int ib_sysfs_setup(void);
+void ib_sysfs_cleanup(void);
+
+int ib_cache_setup(void);
+void ib_cache_cleanup(void);
+
+#endif /* _CORE_PRIV_H */
diff --git a/sys/ofed/drivers/infiniband/core/device.c b/sys/ofed/drivers/infiniband/core/device.c
new file mode 100644
index 0000000..9d34bb6
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/device.c
@@ -0,0 +1,754 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/workqueue.h>
+
+#include "core_priv.h"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("core kernel InfiniBand API");
+MODULE_LICENSE("Dual BSD/GPL");
+
+#ifdef __ia64__
+/* workaround for a bug in hp chipset that would cause kernel
+ panic when dma resources are exhaused */
+int dma_map_sg_hp_wa = 0;
+#endif
+
+struct ib_client_data {
+ struct list_head list;
+ struct ib_client *client;
+ void * data;
+};
+
+static LIST_HEAD(device_list);
+static LIST_HEAD(client_list);
+
+/*
+ * device_mutex protects access to both device_list and client_list.
+ * There's no real point to using multiple locks or something fancier
+ * like an rwsem: we always access both lists, and we're always
+ * modifying one list or the other list. In any case this is not a
+ * hot path so there's no point in trying to optimize.
+ */
+static DEFINE_MUTEX(device_mutex);
+
+static int ib_device_check_mandatory(struct ib_device *device)
+{
+#define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device, x), #x }
+ static const struct {
+ size_t offset;
+ char *name;
+ } mandatory_table[] = {
+ IB_MANDATORY_FUNC(query_device),
+ IB_MANDATORY_FUNC(query_port),
+ IB_MANDATORY_FUNC(query_pkey),
+ IB_MANDATORY_FUNC(query_gid),
+ IB_MANDATORY_FUNC(alloc_pd),
+ IB_MANDATORY_FUNC(dealloc_pd),
+ IB_MANDATORY_FUNC(create_ah),
+ IB_MANDATORY_FUNC(destroy_ah),
+ IB_MANDATORY_FUNC(create_qp),
+ IB_MANDATORY_FUNC(modify_qp),
+ IB_MANDATORY_FUNC(destroy_qp),
+ IB_MANDATORY_FUNC(post_send),
+ IB_MANDATORY_FUNC(post_recv),
+ IB_MANDATORY_FUNC(create_cq),
+ IB_MANDATORY_FUNC(destroy_cq),
+ IB_MANDATORY_FUNC(poll_cq),
+ IB_MANDATORY_FUNC(req_notify_cq),
+ IB_MANDATORY_FUNC(get_dma_mr),
+ IB_MANDATORY_FUNC(dereg_mr)
+ };
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) {
+ if (!*(void **) ((u_char *) device + mandatory_table[i].offset)) {
+ printk(KERN_WARNING "Device %s is missing mandatory function %s\n",
+ device->name, mandatory_table[i].name);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static struct ib_device *__ib_device_get_by_name(const char *name)
+{
+ struct ib_device *device;
+
+ list_for_each_entry(device, &device_list, core_list)
+ if (!strncmp(name, device->name, IB_DEVICE_NAME_MAX))
+ return device;
+
+ return NULL;
+}
+
+
+static int alloc_name(char *name)
+{
+ unsigned long *inuse;
+ char buf[IB_DEVICE_NAME_MAX];
+ struct ib_device *device;
+ int i;
+
+ inuse = (unsigned long *) get_zeroed_page(GFP_KERNEL);
+ if (!inuse)
+ return -ENOMEM;
+
+ list_for_each_entry(device, &device_list, core_list) {
+ if (!sscanf(device->name, name, &i))
+ continue;
+ if (i < 0 || i >= PAGE_SIZE * 8)
+ continue;
+ snprintf(buf, sizeof buf, name, i);
+ if (!strncmp(buf, device->name, IB_DEVICE_NAME_MAX))
+ set_bit(i, inuse);
+ }
+
+ i = find_first_zero_bit(inuse, PAGE_SIZE * 8);
+ free_page((unsigned long) inuse);
+ snprintf(buf, sizeof buf, name, i);
+
+ if (__ib_device_get_by_name(buf))
+ return -ENFILE;
+
+ strlcpy(name, buf, IB_DEVICE_NAME_MAX);
+ return 0;
+}
+
+static int start_port(struct ib_device *device)
+{
+ return (device->node_type == RDMA_NODE_IB_SWITCH) ? 0 : 1;
+}
+
+
+static int end_port(struct ib_device *device)
+{
+ return (device->node_type == RDMA_NODE_IB_SWITCH) ?
+ 0 : device->phys_port_cnt;
+}
+
+/**
+ * ib_alloc_device - allocate an IB device struct
+ * @size:size of structure to allocate
+ *
+ * Low-level drivers should use ib_alloc_device() to allocate &struct
+ * ib_device. @size is the size of the structure to be allocated,
+ * including any private data used by the low-level driver.
+ * ib_dealloc_device() must be used to free structures allocated with
+ * ib_alloc_device().
+ */
+struct ib_device *ib_alloc_device(size_t size)
+{
+ BUG_ON(size < sizeof (struct ib_device));
+
+ return kzalloc(size, GFP_KERNEL);
+}
+EXPORT_SYMBOL(ib_alloc_device);
+
+/**
+ * ib_dealloc_device - free an IB device struct
+ * @device:structure to free
+ *
+ * Free a structure allocated with ib_alloc_device().
+ */
+void ib_dealloc_device(struct ib_device *device)
+{
+ if (device->reg_state == IB_DEV_UNINITIALIZED) {
+ kfree(device);
+ return;
+ }
+
+ BUG_ON(device->reg_state != IB_DEV_UNREGISTERED);
+
+ kobject_put(&device->dev.kobj);
+}
+EXPORT_SYMBOL(ib_dealloc_device);
+
+static int add_client_context(struct ib_device *device, struct ib_client *client)
+{
+ struct ib_client_data *context;
+ unsigned long flags;
+
+ context = kmalloc(sizeof *context, GFP_KERNEL);
+ if (!context) {
+ printk(KERN_WARNING "Couldn't allocate client context for %s/%s\n",
+ device->name, client->name);
+ return -ENOMEM;
+ }
+
+ context->client = client;
+ context->data = NULL;
+
+ spin_lock_irqsave(&device->client_data_lock, flags);
+ list_add(&context->list, &device->client_data_list);
+ spin_unlock_irqrestore(&device->client_data_lock, flags);
+
+ return 0;
+}
+
+static int read_port_table_lengths(struct ib_device *device)
+{
+ struct ib_port_attr *tprops = NULL;
+ int num_ports, ret = -ENOMEM;
+ u8 port_index;
+
+ tprops = kmalloc(sizeof *tprops, GFP_KERNEL);
+ if (!tprops)
+ goto out;
+
+ num_ports = end_port(device) - start_port(device) + 1;
+
+ device->pkey_tbl_len = kmalloc(sizeof *device->pkey_tbl_len * num_ports,
+ GFP_KERNEL);
+ device->gid_tbl_len = kmalloc(sizeof *device->gid_tbl_len * num_ports,
+ GFP_KERNEL);
+ if (!device->pkey_tbl_len || !device->gid_tbl_len)
+ goto err;
+
+ for (port_index = 0; port_index < num_ports; ++port_index) {
+ ret = ib_query_port(device, port_index + start_port(device),
+ tprops);
+ if (ret)
+ goto err;
+ device->pkey_tbl_len[port_index] = tprops->pkey_tbl_len;
+ device->gid_tbl_len[port_index] = tprops->gid_tbl_len;
+ }
+
+ ret = 0;
+ goto out;
+
+err:
+ kfree(device->gid_tbl_len);
+ kfree(device->pkey_tbl_len);
+out:
+ kfree(tprops);
+ return ret;
+}
+
+/**
+ * ib_register_device - Register an IB device with IB core
+ * @device:Device to register
+ *
+ * Low-level drivers use ib_register_device() to register their
+ * devices with the IB core. All registered clients will receive a
+ * callback for each device that is added. @device must be allocated
+ * with ib_alloc_device().
+ */
+int ib_register_device(struct ib_device *device)
+{
+ int ret;
+
+ mutex_lock(&device_mutex);
+
+ if (strchr(device->name, '%')) {
+ ret = alloc_name(device->name);
+ if (ret)
+ goto out;
+ }
+
+ if (ib_device_check_mandatory(device)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ INIT_LIST_HEAD(&device->event_handler_list);
+ INIT_LIST_HEAD(&device->client_data_list);
+ spin_lock_init(&device->event_handler_lock);
+ spin_lock_init(&device->client_data_lock);
+ device->ib_uverbs_xrcd_table = RB_ROOT;
+ mutex_init(&device->xrcd_table_mutex);
+
+ ret = read_port_table_lengths(device);
+ if (ret) {
+ printk(KERN_WARNING "Couldn't create table lengths cache for device %s\n",
+ device->name);
+ goto out;
+ }
+
+ ret = ib_device_register_sysfs(device);
+ if (ret) {
+ printk(KERN_WARNING "Couldn't register device %s with driver model\n",
+ device->name);
+ kfree(device->gid_tbl_len);
+ kfree(device->pkey_tbl_len);
+ goto out;
+ }
+
+ list_add_tail(&device->core_list, &device_list);
+
+ device->reg_state = IB_DEV_REGISTERED;
+
+ {
+ struct ib_client *client;
+
+ list_for_each_entry(client, &client_list, list)
+ if (client->add && !add_client_context(device, client))
+ client->add(device);
+ }
+
+ out:
+ mutex_unlock(&device_mutex);
+ return ret;
+}
+EXPORT_SYMBOL(ib_register_device);
+
+/**
+ * ib_unregister_device - Unregister an IB device
+ * @device:Device to unregister
+ *
+ * Unregister an IB device. All clients will receive a remove callback.
+ */
+void ib_unregister_device(struct ib_device *device)
+{
+ struct ib_client *client;
+ struct ib_client_data *context, *tmp;
+ unsigned long flags;
+
+ mutex_lock(&device_mutex);
+
+ list_for_each_entry_reverse(client, &client_list, list)
+ if (client->remove)
+ client->remove(device);
+
+ list_del(&device->core_list);
+
+ kfree(device->gid_tbl_len);
+ kfree(device->pkey_tbl_len);
+
+ mutex_unlock(&device_mutex);
+
+ ib_device_unregister_sysfs(device);
+
+ spin_lock_irqsave(&device->client_data_lock, flags);
+ list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
+ kfree(context);
+ spin_unlock_irqrestore(&device->client_data_lock, flags);
+
+ device->reg_state = IB_DEV_UNREGISTERED;
+}
+EXPORT_SYMBOL(ib_unregister_device);
+
+/**
+ * ib_register_client - Register an IB client
+ * @client:Client to register
+ *
+ * Upper level users of the IB drivers can use ib_register_client() to
+ * register callbacks for IB device addition and removal. When an IB
+ * device is added, each registered client's add method will be called
+ * (in the order the clients were registered), and when a device is
+ * removed, each client's remove method will be called (in the reverse
+ * order that clients were registered). In addition, when
+ * ib_register_client() is called, the client will receive an add
+ * callback for all devices already registered.
+ */
+int ib_register_client(struct ib_client *client)
+{
+ struct ib_device *device;
+
+ mutex_lock(&device_mutex);
+
+ list_add_tail(&client->list, &client_list);
+ list_for_each_entry(device, &device_list, core_list)
+ if (client->add && !add_client_context(device, client))
+ client->add(device);
+
+ mutex_unlock(&device_mutex);
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_register_client);
+
+/**
+ * ib_unregister_client - Unregister an IB client
+ * @client:Client to unregister
+ *
+ * Upper level users use ib_unregister_client() to remove their client
+ * registration. When ib_unregister_client() is called, the client
+ * will receive a remove callback for each IB device still registered.
+ */
+void ib_unregister_client(struct ib_client *client)
+{
+ struct ib_client_data *context, *tmp;
+ struct ib_device *device;
+ unsigned long flags;
+
+ mutex_lock(&device_mutex);
+
+ list_for_each_entry(device, &device_list, core_list) {
+ if (client->remove)
+ client->remove(device);
+
+ spin_lock_irqsave(&device->client_data_lock, flags);
+ list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
+ if (context->client == client) {
+ list_del(&context->list);
+ kfree(context);
+ }
+ spin_unlock_irqrestore(&device->client_data_lock, flags);
+ }
+ list_del(&client->list);
+
+ mutex_unlock(&device_mutex);
+}
+EXPORT_SYMBOL(ib_unregister_client);
+
+/**
+ * ib_get_client_data - Get IB client context
+ * @device:Device to get context for
+ * @client:Client to get context for
+ *
+ * ib_get_client_data() returns client context set with
+ * ib_set_client_data().
+ */
+void *ib_get_client_data(struct ib_device *device, struct ib_client *client)
+{
+ struct ib_client_data *context;
+ void *ret = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&device->client_data_lock, flags);
+ list_for_each_entry(context, &device->client_data_list, list)
+ if (context->client == client) {
+ ret = context->data;
+ break;
+ }
+ spin_unlock_irqrestore(&device->client_data_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_get_client_data);
+
+/**
+ * ib_set_client_data - Set IB client context
+ * @device:Device to set context for
+ * @client:Client to set context for
+ * @data:Context to set
+ *
+ * ib_set_client_data() sets client context that can be retrieved with
+ * ib_get_client_data().
+ */
+void ib_set_client_data(struct ib_device *device, struct ib_client *client,
+ void *data)
+{
+ struct ib_client_data *context;
+ unsigned long flags;
+
+ spin_lock_irqsave(&device->client_data_lock, flags);
+ list_for_each_entry(context, &device->client_data_list, list)
+ if (context->client == client) {
+ context->data = data;
+ goto out;
+ }
+
+ printk(KERN_WARNING "No client context found for %s/%s\n",
+ device->name, client->name);
+
+out:
+ spin_unlock_irqrestore(&device->client_data_lock, flags);
+}
+EXPORT_SYMBOL(ib_set_client_data);
+
+/**
+ * ib_register_event_handler - Register an IB event handler
+ * @event_handler:Handler to register
+ *
+ * ib_register_event_handler() registers an event handler that will be
+ * called back when asynchronous IB events occur (as defined in
+ * chapter 11 of the InfiniBand Architecture Specification). This
+ * callback may occur in interrupt context.
+ */
+int ib_register_event_handler (struct ib_event_handler *event_handler)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&event_handler->device->event_handler_lock, flags);
+ list_add_tail(&event_handler->list,
+ &event_handler->device->event_handler_list);
+ spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags);
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_register_event_handler);
+
+/**
+ * ib_unregister_event_handler - Unregister an event handler
+ * @event_handler:Handler to unregister
+ *
+ * Unregister an event handler registered with
+ * ib_register_event_handler().
+ */
+int ib_unregister_event_handler(struct ib_event_handler *event_handler)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&event_handler->device->event_handler_lock, flags);
+ list_del(&event_handler->list);
+ spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags);
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_unregister_event_handler);
+
+/**
+ * ib_dispatch_event - Dispatch an asynchronous event
+ * @event:Event to dispatch
+ *
+ * Low-level drivers must call ib_dispatch_event() to dispatch the
+ * event to all registered event handlers when an asynchronous event
+ * occurs.
+ */
+void ib_dispatch_event(struct ib_event *event)
+{
+ unsigned long flags;
+ struct ib_event_handler *handler;
+
+ spin_lock_irqsave(&event->device->event_handler_lock, flags);
+
+ list_for_each_entry(handler, &event->device->event_handler_list, list)
+ handler->handler(handler, event);
+
+ spin_unlock_irqrestore(&event->device->event_handler_lock, flags);
+}
+EXPORT_SYMBOL(ib_dispatch_event);
+
+/**
+ * ib_query_device - Query IB device attributes
+ * @device:Device to query
+ * @device_attr:Device attributes
+ *
+ * ib_query_device() returns the attributes of a device through the
+ * @device_attr pointer.
+ */
+int ib_query_device(struct ib_device *device,
+ struct ib_device_attr *device_attr)
+{
+ return device->query_device(device, device_attr);
+}
+EXPORT_SYMBOL(ib_query_device);
+
+/**
+ * ib_query_port - Query IB port attributes
+ * @device:Device to query
+ * @port_num:Port number to query
+ * @port_attr:Port attributes
+ *
+ * ib_query_port() returns the attributes of a port through the
+ * @port_attr pointer.
+ */
+int ib_query_port(struct ib_device *device,
+ u8 port_num,
+ struct ib_port_attr *port_attr)
+{
+ if (port_num < start_port(device) || port_num > end_port(device))
+ return -EINVAL;
+
+ return device->query_port(device, port_num, port_attr);
+}
+EXPORT_SYMBOL(ib_query_port);
+
+/**
+ * ib_query_gid - Get GID table entry
+ * @device:Device to query
+ * @port_num:Port number to query
+ * @index:GID table index to query
+ * @gid:Returned GID
+ *
+ * ib_query_gid() fetches the specified GID table entry.
+ */
+int ib_query_gid(struct ib_device *device,
+ u8 port_num, int index, union ib_gid *gid)
+{
+ return device->query_gid(device, port_num, index, gid);
+}
+EXPORT_SYMBOL(ib_query_gid);
+
+/**
+ * ib_query_pkey - Get P_Key table entry
+ * @device:Device to query
+ * @port_num:Port number to query
+ * @index:P_Key table index to query
+ * @pkey:Returned P_Key
+ *
+ * ib_query_pkey() fetches the specified P_Key table entry.
+ */
+int ib_query_pkey(struct ib_device *device,
+ u8 port_num, u16 index, u16 *pkey)
+{
+ return device->query_pkey(device, port_num, index, pkey);
+}
+EXPORT_SYMBOL(ib_query_pkey);
+
+/**
+ * ib_modify_device - Change IB device attributes
+ * @device:Device to modify
+ * @device_modify_mask:Mask of attributes to change
+ * @device_modify:New attribute values
+ *
+ * ib_modify_device() changes a device's attributes as specified by
+ * the @device_modify_mask and @device_modify structure.
+ */
+int ib_modify_device(struct ib_device *device,
+ int device_modify_mask,
+ struct ib_device_modify *device_modify)
+{
+ return device->modify_device(device, device_modify_mask,
+ device_modify);
+}
+EXPORT_SYMBOL(ib_modify_device);
+
+/**
+ * ib_modify_port - Modifies the attributes for the specified port.
+ * @device: The device to modify.
+ * @port_num: The number of the port to modify.
+ * @port_modify_mask: Mask used to specify which attributes of the port
+ * to change.
+ * @port_modify: New attribute values for the port.
+ *
+ * ib_modify_port() changes a port's attributes as specified by the
+ * @port_modify_mask and @port_modify structure.
+ */
+int ib_modify_port(struct ib_device *device,
+ u8 port_num, int port_modify_mask,
+ struct ib_port_modify *port_modify)
+{
+ if (port_num < start_port(device) || port_num > end_port(device))
+ return -EINVAL;
+
+ return device->modify_port(device, port_num, port_modify_mask,
+ port_modify);
+}
+EXPORT_SYMBOL(ib_modify_port);
+
+/**
+ * ib_find_gid - Returns the port number and GID table index where
+ * a specified GID value occurs.
+ * @device: The device to query.
+ * @gid: The GID value to search for.
+ * @port_num: The port number of the device where the GID value was found.
+ * @index: The index into the GID table where the GID was found. This
+ * parameter may be NULL.
+ */
+int ib_find_gid(struct ib_device *device, union ib_gid *gid,
+ u8 *port_num, u16 *index)
+{
+ union ib_gid tmp_gid;
+ int ret, port, i;
+
+ for (port = start_port(device); port <= end_port(device); ++port) {
+ for (i = 0; i < device->gid_tbl_len[port - start_port(device)]; ++i) {
+ ret = ib_query_gid(device, port, i, &tmp_gid);
+ if (ret)
+ return ret;
+ if (!memcmp(&tmp_gid, gid, sizeof *gid)) {
+ *port_num = port;
+ if (index)
+ *index = i;
+ return 0;
+ }
+ }
+ }
+
+ return -ENOENT;
+}
+EXPORT_SYMBOL(ib_find_gid);
+
+/**
+ * ib_find_pkey - Returns the PKey table index where a specified
+ * PKey value occurs.
+ * @device: The device to query.
+ * @port_num: The port number of the device to search for the PKey.
+ * @pkey: The PKey value to search for.
+ * @index: The index into the PKey table where the PKey was found.
+ */
+int ib_find_pkey(struct ib_device *device,
+ u8 port_num, u16 pkey, u16 *index)
+{
+ int ret, i;
+ u16 tmp_pkey;
+
+ for (i = 0; i < device->pkey_tbl_len[port_num - start_port(device)]; ++i) {
+ ret = ib_query_pkey(device, port_num, i, &tmp_pkey);
+ if (ret)
+ return ret;
+
+ if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) {
+ *index = i;
+ return 0;
+ }
+ }
+
+ return -ENOENT;
+}
+EXPORT_SYMBOL(ib_find_pkey);
+
+static int __init ib_core_init(void)
+{
+ int ret;
+
+#ifdef __ia64__
+ if (ia64_platform_is("hpzx1"))
+ dma_map_sg_hp_wa = 1;
+#endif
+
+ ret = ib_sysfs_setup();
+ if (ret)
+ printk(KERN_WARNING "Couldn't create InfiniBand device class\n");
+
+ ret = ib_cache_setup();
+ if (ret) {
+ printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n");
+ ib_sysfs_cleanup();
+ }
+
+ return ret;
+}
+
+static void __exit ib_core_cleanup(void)
+{
+ ib_cache_cleanup();
+ ib_sysfs_cleanup();
+ /* Make sure that any pending umem accounting work is done. */
+ flush_scheduled_work();
+}
+
+module_init(ib_core_init);
+module_exit(ib_core_cleanup);
diff --git a/sys/ofed/drivers/infiniband/core/fmr_pool.c b/sys/ofed/drivers/infiniband/core/fmr_pool.c
new file mode 100644
index 0000000..4507043
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/fmr_pool.c
@@ -0,0 +1,544 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/jhash.h>
+#include <linux/kthread.h>
+
+#include <rdma/ib_fmr_pool.h>
+
+#include "core_priv.h"
+
+#define PFX "fmr_pool: "
+
+enum {
+ IB_FMR_MAX_REMAPS = 32,
+
+ IB_FMR_HASH_BITS = 8,
+ IB_FMR_HASH_SIZE = 1 << IB_FMR_HASH_BITS,
+ IB_FMR_HASH_MASK = IB_FMR_HASH_SIZE - 1
+};
+
+/*
+ * If an FMR is not in use, then the list member will point to either
+ * its pool's free_list (if the FMR can be mapped again; that is,
+ * remap_count < pool->max_remaps) or its pool's dirty_list (if the
+ * FMR needs to be unmapped before being remapped). In either of
+ * these cases it is a bug if the ref_count is not 0. In other words,
+ * if ref_count is > 0, then the list member must not be linked into
+ * either free_list or dirty_list.
+ *
+ * The cache_node member is used to link the FMR into a cache bucket
+ * (if caching is enabled). This is independent of the reference
+ * count of the FMR. When a valid FMR is released, its ref_count is
+ * decremented, and if ref_count reaches 0, the FMR is placed in
+ * either free_list or dirty_list as appropriate. However, it is not
+ * removed from the cache and may be "revived" if a call to
+ * ib_fmr_register_physical() occurs before the FMR is remapped. In
+ * this case we just increment the ref_count and remove the FMR from
+ * free_list/dirty_list.
+ *
+ * Before we remap an FMR from free_list, we remove it from the cache
+ * (to prevent another user from obtaining a stale FMR). When an FMR
+ * is released, we add it to the tail of the free list, so that our
+ * cache eviction policy is "least recently used."
+ *
+ * All manipulation of ref_count, list and cache_node is protected by
+ * pool_lock to maintain consistency.
+ */
+
+struct ib_fmr_pool {
+ spinlock_t pool_lock;
+
+ int pool_size;
+ int max_pages;
+ int max_remaps;
+ int dirty_watermark;
+ int dirty_len;
+ struct list_head free_list;
+ struct list_head dirty_list;
+ struct hlist_head *cache_bucket;
+
+ void (*flush_function)(struct ib_fmr_pool *pool,
+ void * arg);
+ void *flush_arg;
+
+ struct task_struct *thread;
+
+ atomic_t req_ser;
+ atomic_t flush_ser;
+
+ wait_queue_head_t force_wait;
+};
+
+static inline u32 ib_fmr_hash(u64 first_page)
+{
+ return jhash_2words((u32) first_page, (u32) (first_page >> 32), 0) &
+ (IB_FMR_HASH_SIZE - 1);
+}
+
+/* Caller must hold pool_lock */
+static inline struct ib_pool_fmr *ib_fmr_cache_lookup(struct ib_fmr_pool *pool,
+ u64 *page_list,
+ int page_list_len,
+ u64 io_virtual_address)
+{
+ struct hlist_head *bucket;
+ struct ib_pool_fmr *fmr;
+ struct hlist_node *pos;
+
+ if (!pool->cache_bucket)
+ return NULL;
+
+ bucket = pool->cache_bucket + ib_fmr_hash(*page_list);
+
+ hlist_for_each_entry(fmr, pos, bucket, cache_node)
+ if (io_virtual_address == fmr->io_virtual_address &&
+ page_list_len == fmr->page_list_len &&
+ !memcmp(page_list, fmr->page_list,
+ page_list_len * sizeof *page_list))
+ return fmr;
+
+ return NULL;
+}
+
+static void ib_fmr_batch_release(struct ib_fmr_pool *pool)
+{
+ int ret;
+ struct ib_pool_fmr *fmr;
+ LIST_HEAD(unmap_list);
+ LIST_HEAD(fmr_list);
+
+ spin_lock_irq(&pool->pool_lock);
+
+ list_for_each_entry(fmr, &pool->dirty_list, list) {
+ hlist_del_init(&fmr->cache_node);
+ fmr->remap_count = 0;
+ list_add_tail(&fmr->fmr->list, &fmr_list);
+
+#ifdef DEBUG
+ if (fmr->ref_count !=0) {
+ printk(KERN_WARNING PFX "Unmapping FMR 0x%08x with ref count %d\n",
+ fmr, fmr->ref_count);
+ }
+#endif
+ }
+
+ list_splice_init(&pool->dirty_list, &unmap_list);
+ pool->dirty_len = 0;
+
+ spin_unlock_irq(&pool->pool_lock);
+
+ if (list_empty(&unmap_list)) {
+ return;
+ }
+
+ ret = ib_unmap_fmr(&fmr_list);
+ if (ret)
+ printk(KERN_WARNING PFX "ib_unmap_fmr returned %d\n", ret);
+
+ spin_lock_irq(&pool->pool_lock);
+ list_splice(&unmap_list, &pool->free_list);
+ spin_unlock_irq(&pool->pool_lock);
+}
+
+static int ib_fmr_cleanup_thread(void *pool_ptr)
+{
+ struct ib_fmr_pool *pool = pool_ptr;
+
+ do {
+ if (atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) < 0) {
+ ib_fmr_batch_release(pool);
+
+ atomic_inc(&pool->flush_ser);
+ wake_up_interruptible(&pool->force_wait);
+
+ if (pool->flush_function)
+ pool->flush_function(pool, pool->flush_arg);
+ }
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) >= 0 &&
+ !kthread_should_stop())
+ schedule();
+ __set_current_state(TASK_RUNNING);
+ } while (!kthread_should_stop());
+
+ return 0;
+}
+
+/**
+ * ib_create_fmr_pool - Create an FMR pool
+ * @pd:Protection domain for FMRs
+ * @params:FMR pool parameters
+ *
+ * Create a pool of FMRs. Return value is pointer to new pool or
+ * error code if creation failed.
+ */
+struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd,
+ struct ib_fmr_pool_param *params)
+{
+ struct ib_device *device;
+ struct ib_fmr_pool *pool;
+ struct ib_device_attr *attr;
+ int i;
+ int ret;
+ int max_remaps;
+
+ if (!params)
+ return ERR_PTR(-EINVAL);
+
+ device = pd->device;
+ if (!device->alloc_fmr || !device->dealloc_fmr ||
+ !device->map_phys_fmr || !device->unmap_fmr) {
+ printk(KERN_INFO PFX "Device %s does not support FMRs\n",
+ device->name);
+ return ERR_PTR(-ENOSYS);
+ }
+
+ attr = kmalloc(sizeof *attr, GFP_KERNEL);
+ if (!attr) {
+ printk(KERN_WARNING PFX "couldn't allocate device attr struct\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ ret = ib_query_device(device, attr);
+ if (ret) {
+ printk(KERN_WARNING PFX "couldn't query device: %d\n", ret);
+ kfree(attr);
+ return ERR_PTR(ret);
+ }
+
+ if (!attr->max_map_per_fmr)
+ max_remaps = IB_FMR_MAX_REMAPS;
+ else
+ max_remaps = attr->max_map_per_fmr;
+
+ kfree(attr);
+
+ pool = kmalloc(sizeof *pool, GFP_KERNEL);
+ if (!pool) {
+ printk(KERN_WARNING PFX "couldn't allocate pool struct\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ pool->cache_bucket = NULL;
+
+ pool->flush_function = params->flush_function;
+ pool->flush_arg = params->flush_arg;
+
+ INIT_LIST_HEAD(&pool->free_list);
+ INIT_LIST_HEAD(&pool->dirty_list);
+
+ if (params->cache) {
+ pool->cache_bucket =
+ kmalloc(IB_FMR_HASH_SIZE * sizeof *pool->cache_bucket,
+ GFP_KERNEL);
+ if (!pool->cache_bucket) {
+ printk(KERN_WARNING PFX "Failed to allocate cache in pool\n");
+ ret = -ENOMEM;
+ goto out_free_pool;
+ }
+
+ for (i = 0; i < IB_FMR_HASH_SIZE; ++i)
+ INIT_HLIST_HEAD(pool->cache_bucket + i);
+ }
+
+ pool->pool_size = 0;
+ pool->max_pages = params->max_pages_per_fmr;
+ pool->max_remaps = max_remaps;
+ pool->dirty_watermark = params->dirty_watermark;
+ pool->dirty_len = 0;
+ spin_lock_init(&pool->pool_lock);
+ atomic_set(&pool->req_ser, 0);
+ atomic_set(&pool->flush_ser, 0);
+ init_waitqueue_head(&pool->force_wait);
+
+ pool->thread = kthread_run(ib_fmr_cleanup_thread,
+ pool,
+ "ib_fmr(%s)",
+ device->name);
+ if (IS_ERR(pool->thread)) {
+ printk(KERN_WARNING PFX "couldn't start cleanup thread\n");
+ ret = PTR_ERR(pool->thread);
+ goto out_free_pool;
+ }
+
+ {
+ struct ib_pool_fmr *fmr;
+ struct ib_fmr_attr fmr_attr = {
+ .max_pages = params->max_pages_per_fmr,
+ .max_maps = pool->max_remaps,
+ .page_shift = params->page_shift
+ };
+ int bytes_per_fmr = sizeof *fmr;
+
+ if (pool->cache_bucket)
+ bytes_per_fmr += params->max_pages_per_fmr * sizeof (u64);
+
+ for (i = 0; i < params->pool_size; ++i) {
+ fmr = kmalloc(bytes_per_fmr, GFP_KERNEL);
+ if (!fmr) {
+ printk(KERN_WARNING PFX "failed to allocate fmr "
+ "struct for FMR %d\n", i);
+ goto out_fail;
+ }
+
+ fmr->pool = pool;
+ fmr->remap_count = 0;
+ fmr->ref_count = 0;
+ INIT_HLIST_NODE(&fmr->cache_node);
+
+ fmr->fmr = ib_alloc_fmr(pd, params->access, &fmr_attr);
+ if (IS_ERR(fmr->fmr)) {
+ printk(KERN_WARNING PFX "fmr_create failed "
+ "for FMR %d\n", i);
+ kfree(fmr);
+ goto out_fail;
+ }
+
+ list_add_tail(&fmr->list, &pool->free_list);
+ ++pool->pool_size;
+ }
+ }
+
+ return pool;
+
+ out_free_pool:
+ kfree(pool->cache_bucket);
+ kfree(pool);
+
+ return ERR_PTR(ret);
+
+ out_fail:
+ ib_destroy_fmr_pool(pool);
+
+ return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL(ib_create_fmr_pool);
+
+/**
+ * ib_destroy_fmr_pool - Free FMR pool
+ * @pool:FMR pool to free
+ *
+ * Destroy an FMR pool and free all associated resources.
+ */
+void ib_destroy_fmr_pool(struct ib_fmr_pool *pool)
+{
+ struct ib_pool_fmr *fmr;
+ struct ib_pool_fmr *tmp;
+ LIST_HEAD(fmr_list);
+ int i;
+
+ kthread_stop(pool->thread);
+ ib_fmr_batch_release(pool);
+
+ i = 0;
+ list_for_each_entry_safe(fmr, tmp, &pool->free_list, list) {
+ if (fmr->remap_count) {
+ INIT_LIST_HEAD(&fmr_list);
+ list_add_tail(&fmr->fmr->list, &fmr_list);
+ ib_unmap_fmr(&fmr_list);
+ }
+ ib_dealloc_fmr(fmr->fmr);
+ list_del(&fmr->list);
+ kfree(fmr);
+ ++i;
+ }
+
+ if (i < pool->pool_size)
+ printk(KERN_WARNING PFX "pool still has %d regions registered\n",
+ pool->pool_size - i);
+
+ kfree(pool->cache_bucket);
+ kfree(pool);
+}
+EXPORT_SYMBOL(ib_destroy_fmr_pool);
+
+/**
+ * ib_flush_fmr_pool - Invalidate all unmapped FMRs
+ * @pool:FMR pool to flush
+ *
+ * Ensure that all unmapped FMRs are fully invalidated.
+ */
+int ib_flush_fmr_pool(struct ib_fmr_pool *pool)
+{
+ int serial;
+ struct ib_pool_fmr *fmr, *next;
+
+ /*
+ * The free_list holds FMRs that may have been used
+ * but have not been remapped enough times to be dirty.
+ * Put them on the dirty list now so that the cleanup
+ * thread will reap them too.
+ */
+ spin_lock_irq(&pool->pool_lock);
+ list_for_each_entry_safe(fmr, next, &pool->free_list, list) {
+ if (fmr->remap_count > 0)
+ list_move(&fmr->list, &pool->dirty_list);
+ }
+ spin_unlock_irq(&pool->pool_lock);
+
+ serial = atomic_inc_return(&pool->req_ser);
+ wake_up_process(pool->thread);
+
+ if (wait_event_interruptible(pool->force_wait,
+ atomic_read(&pool->flush_ser) - serial >= 0))
+ return -EINTR;
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_flush_fmr_pool);
+
+/**
+ * ib_fmr_pool_map_phys -
+ * @pool:FMR pool to allocate FMR from
+ * @page_list:List of pages to map
+ * @list_len:Number of pages in @page_list
+ * @io_virtual_address:I/O virtual address for new FMR
+ *
+ * Map an FMR from an FMR pool.
+ */
+struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle,
+ u64 *page_list,
+ int list_len,
+ u64 io_virtual_address)
+{
+ struct ib_fmr_pool *pool = pool_handle;
+ struct ib_pool_fmr *fmr;
+ unsigned long flags;
+ int result;
+
+ if (list_len < 1 || list_len > pool->max_pages)
+ return ERR_PTR(-EINVAL);
+
+ spin_lock_irqsave(&pool->pool_lock, flags);
+ fmr = ib_fmr_cache_lookup(pool,
+ page_list,
+ list_len,
+ io_virtual_address);
+ if (fmr) {
+ /* found in cache */
+ ++fmr->ref_count;
+ if (fmr->ref_count == 1) {
+ list_del(&fmr->list);
+ }
+
+ spin_unlock_irqrestore(&pool->pool_lock, flags);
+
+ return fmr;
+ }
+
+ if (list_empty(&pool->free_list)) {
+ spin_unlock_irqrestore(&pool->pool_lock, flags);
+ return ERR_PTR(-EAGAIN);
+ }
+
+ fmr = list_entry(pool->free_list.next, struct ib_pool_fmr, list);
+ list_del(&fmr->list);
+ hlist_del_init(&fmr->cache_node);
+ spin_unlock_irqrestore(&pool->pool_lock, flags);
+
+ result = ib_map_phys_fmr(fmr->fmr, page_list, list_len,
+ io_virtual_address);
+
+ if (result) {
+ spin_lock_irqsave(&pool->pool_lock, flags);
+ list_add(&fmr->list, &pool->free_list);
+ spin_unlock_irqrestore(&pool->pool_lock, flags);
+
+ printk(KERN_WARNING PFX "fmr_map returns %d\n", result);
+
+ return ERR_PTR(result);
+ }
+
+ ++fmr->remap_count;
+ fmr->ref_count = 1;
+
+ if (pool->cache_bucket) {
+ fmr->io_virtual_address = io_virtual_address;
+ fmr->page_list_len = list_len;
+ memcpy(fmr->page_list, page_list, list_len * sizeof(*page_list));
+
+ spin_lock_irqsave(&pool->pool_lock, flags);
+ hlist_add_head(&fmr->cache_node,
+ pool->cache_bucket + ib_fmr_hash(fmr->page_list[0]));
+ spin_unlock_irqrestore(&pool->pool_lock, flags);
+ }
+
+ return fmr;
+}
+EXPORT_SYMBOL(ib_fmr_pool_map_phys);
+
+/**
+ * ib_fmr_pool_unmap - Unmap FMR
+ * @fmr:FMR to unmap
+ *
+ * Unmap an FMR. The FMR mapping may remain valid until the FMR is
+ * reused (or until ib_flush_fmr_pool() is called).
+ */
+int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr)
+{
+ struct ib_fmr_pool *pool;
+ unsigned long flags;
+
+ pool = fmr->pool;
+
+ spin_lock_irqsave(&pool->pool_lock, flags);
+
+ --fmr->ref_count;
+ if (!fmr->ref_count) {
+ if (fmr->remap_count < pool->max_remaps) {
+ list_add_tail(&fmr->list, &pool->free_list);
+ } else {
+ list_add_tail(&fmr->list, &pool->dirty_list);
+ if (++pool->dirty_len >= pool->dirty_watermark) {
+ atomic_inc(&pool->req_ser);
+ wake_up_process(pool->thread);
+ }
+ }
+ }
+
+#ifdef DEBUG
+ if (fmr->ref_count < 0)
+ printk(KERN_WARNING PFX "FMR %p has ref count %d < 0\n",
+ fmr, fmr->ref_count);
+#endif
+
+ spin_unlock_irqrestore(&pool->pool_lock, flags);
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_fmr_pool_unmap);
diff --git a/sys/ofed/drivers/infiniband/core/iwcm.c b/sys/ofed/drivers/infiniband/core/iwcm.c
new file mode 100644
index 0000000..625fec5
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/iwcm.c
@@ -0,0 +1,1025 @@
+/*
+ * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004, 2005 Voltaire Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ * Copyright (c) 2005 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/idr.h>
+#include <linux/interrupt.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+#include <linux/completion.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_addr.h>
+
+#include "iwcm.h"
+
+MODULE_AUTHOR("Tom Tucker");
+MODULE_DESCRIPTION("iWARP CM");
+MODULE_LICENSE("Dual BSD/GPL");
+
+static struct workqueue_struct *iwcm_wq;
+struct iwcm_work {
+ struct work_struct work;
+ struct iwcm_id_private *cm_id;
+ struct list_head list;
+ struct iw_cm_event event;
+ struct list_head free_list;
+};
+
+/*
+ * The following services provide a mechanism for pre-allocating iwcm_work
+ * elements. The design pre-allocates them based on the cm_id type:
+ * LISTENING IDS: Get enough elements preallocated to handle the
+ * listen backlog.
+ * ACTIVE IDS: 4: CONNECT_REPLY, ESTABLISHED, DISCONNECT, CLOSE
+ * PASSIVE IDS: 3: ESTABLISHED, DISCONNECT, CLOSE
+ *
+ * Allocating them in connect and listen avoids having to deal
+ * with allocation failures on the event upcall from the provider (which
+ * is called in the interrupt context).
+ *
+ * One exception is when creating the cm_id for incoming connection requests.
+ * There are two cases:
+ * 1) in the event upcall, cm_event_handler(), for a listening cm_id. If
+ * the backlog is exceeded, then no more connection request events will
+ * be processed. cm_event_handler() returns -ENOMEM in this case. Its up
+ * to the provider to reject the connection request.
+ * 2) in the connection request workqueue handler, cm_conn_req_handler().
+ * If work elements cannot be allocated for the new connect request cm_id,
+ * then IWCM will call the provider reject method. This is ok since
+ * cm_conn_req_handler() runs in the workqueue thread context.
+ */
+
+static struct iwcm_work *get_work(struct iwcm_id_private *cm_id_priv)
+{
+ struct iwcm_work *work;
+
+ if (list_empty(&cm_id_priv->work_free_list))
+ return NULL;
+ work = list_entry(cm_id_priv->work_free_list.next, struct iwcm_work,
+ free_list);
+ list_del_init(&work->free_list);
+ return work;
+}
+
+static void put_work(struct iwcm_work *work)
+{
+ list_add(&work->free_list, &work->cm_id->work_free_list);
+}
+
+static void dealloc_work_entries(struct iwcm_id_private *cm_id_priv)
+{
+ struct list_head *e, *tmp;
+
+ list_for_each_safe(e, tmp, &cm_id_priv->work_free_list)
+ kfree(list_entry(e, struct iwcm_work, free_list));
+}
+
+static int alloc_work_entries(struct iwcm_id_private *cm_id_priv, int count)
+{
+ struct iwcm_work *work;
+
+ BUG_ON(!list_empty(&cm_id_priv->work_free_list));
+ while (count--) {
+ work = kmalloc(sizeof(struct iwcm_work), GFP_KERNEL);
+ if (!work) {
+ dealloc_work_entries(cm_id_priv);
+ return -ENOMEM;
+ }
+ work->cm_id = cm_id_priv;
+ INIT_LIST_HEAD(&work->list);
+ put_work(work);
+ }
+ return 0;
+}
+
+/*
+ * Save private data from incoming connection requests to
+ * iw_cm_event, so the low level driver doesn't have to. Adjust
+ * the event ptr to point to the local copy.
+ */
+static int copy_private_data(struct iw_cm_event *event)
+{
+ void *p;
+
+ p = kmemdup(event->private_data, event->private_data_len, GFP_ATOMIC);
+ if (!p)
+ return -ENOMEM;
+ event->private_data = p;
+ return 0;
+}
+
+static void free_cm_id(struct iwcm_id_private *cm_id_priv)
+{
+ dealloc_work_entries(cm_id_priv);
+ kfree(cm_id_priv);
+}
+
+/*
+ * Release a reference on cm_id. If the last reference is being
+ * released, enable the waiting thread (in iw_destroy_cm_id) to
+ * get woken up, and return 1 if a thread is already waiting.
+ */
+static int iwcm_deref_id(struct iwcm_id_private *cm_id_priv)
+{
+ BUG_ON(atomic_read(&cm_id_priv->refcount)==0);
+ if (atomic_dec_and_test(&cm_id_priv->refcount)) {
+ BUG_ON(!list_empty(&cm_id_priv->work_list));
+ complete(&cm_id_priv->destroy_comp);
+ return 1;
+ }
+
+ return 0;
+}
+
+static void add_ref(struct iw_cm_id *cm_id)
+{
+ struct iwcm_id_private *cm_id_priv;
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ atomic_inc(&cm_id_priv->refcount);
+}
+
+static void rem_ref(struct iw_cm_id *cm_id)
+{
+ struct iwcm_id_private *cm_id_priv;
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ if (iwcm_deref_id(cm_id_priv) &&
+ test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags)) {
+ BUG_ON(!list_empty(&cm_id_priv->work_list));
+ free_cm_id(cm_id_priv);
+ }
+}
+
+static int cm_event_handler(struct iw_cm_id *cm_id, struct iw_cm_event *event);
+
+struct iw_cm_id *iw_create_cm_id(struct ib_device *device,
+ iw_cm_handler cm_handler,
+ void *context)
+{
+ struct iwcm_id_private *cm_id_priv;
+
+ cm_id_priv = kzalloc(sizeof(*cm_id_priv), GFP_KERNEL);
+ if (!cm_id_priv)
+ return ERR_PTR(-ENOMEM);
+
+ cm_id_priv->state = IW_CM_STATE_IDLE;
+ cm_id_priv->id.device = device;
+ cm_id_priv->id.cm_handler = cm_handler;
+ cm_id_priv->id.context = context;
+ cm_id_priv->id.event_handler = cm_event_handler;
+ cm_id_priv->id.add_ref = add_ref;
+ cm_id_priv->id.rem_ref = rem_ref;
+ spin_lock_init(&cm_id_priv->lock);
+ atomic_set(&cm_id_priv->refcount, 1);
+ init_waitqueue_head(&cm_id_priv->connect_wait);
+ init_completion(&cm_id_priv->destroy_comp);
+ INIT_LIST_HEAD(&cm_id_priv->work_list);
+ INIT_LIST_HEAD(&cm_id_priv->work_free_list);
+
+ return &cm_id_priv->id;
+}
+EXPORT_SYMBOL(iw_create_cm_id);
+
+
+static int iwcm_modify_qp_err(struct ib_qp *qp)
+{
+ struct ib_qp_attr qp_attr;
+
+ if (!qp)
+ return -EINVAL;
+
+ qp_attr.qp_state = IB_QPS_ERR;
+ return ib_modify_qp(qp, &qp_attr, IB_QP_STATE);
+}
+
+/*
+ * This is really the RDMAC CLOSING state. It is most similar to the
+ * IB SQD QP state.
+ */
+static int iwcm_modify_qp_sqd(struct ib_qp *qp)
+{
+ struct ib_qp_attr qp_attr;
+
+ BUG_ON(qp == NULL);
+ qp_attr.qp_state = IB_QPS_SQD;
+ return ib_modify_qp(qp, &qp_attr, IB_QP_STATE);
+}
+
+/*
+ * CM_ID <-- CLOSING
+ *
+ * Block if a passive or active connection is currently being processed. Then
+ * process the event as follows:
+ * - If we are ESTABLISHED, move to CLOSING and modify the QP state
+ * based on the abrupt flag
+ * - If the connection is already in the CLOSING or IDLE state, the peer is
+ * disconnecting concurrently with us and we've already seen the
+ * DISCONNECT event -- ignore the request and return 0
+ * - Disconnect on a listening endpoint returns -EINVAL
+ */
+int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt)
+{
+ struct iwcm_id_private *cm_id_priv;
+ unsigned long flags;
+ int ret = 0;
+ struct ib_qp *qp = NULL;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ /* Wait if we're currently in a connect or accept downcall */
+ wait_event(cm_id_priv->connect_wait,
+ !test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags));
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id_priv->state) {
+ case IW_CM_STATE_ESTABLISHED:
+ cm_id_priv->state = IW_CM_STATE_CLOSING;
+
+ /* QP could be <nul> for user-mode client */
+ if (cm_id_priv->qp)
+ qp = cm_id_priv->qp;
+ else
+ ret = -EINVAL;
+ break;
+ case IW_CM_STATE_LISTEN:
+ ret = -EINVAL;
+ break;
+ case IW_CM_STATE_CLOSING:
+ /* remote peer closed first */
+ case IW_CM_STATE_IDLE:
+ /* accept or connect returned !0 */
+ break;
+ case IW_CM_STATE_CONN_RECV:
+ /*
+ * App called disconnect before/without calling accept after
+ * connect_request event delivered.
+ */
+ break;
+ case IW_CM_STATE_CONN_SENT:
+ /* Can only get here if wait above fails */
+ default:
+ BUG();
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ if (qp) {
+ if (abrupt)
+ ret = iwcm_modify_qp_err(qp);
+ else
+ ret = iwcm_modify_qp_sqd(qp);
+
+ /*
+ * If both sides are disconnecting the QP could
+ * already be in ERR or SQD states
+ */
+ ret = 0;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(iw_cm_disconnect);
+
+/*
+ * CM_ID <-- DESTROYING
+ *
+ * Clean up all resources associated with the connection and release
+ * the initial reference taken by iw_create_cm_id.
+ */
+static void destroy_cm_id(struct iw_cm_id *cm_id)
+{
+ struct iwcm_id_private *cm_id_priv;
+ unsigned long flags;
+ int ret;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ /*
+ * Wait if we're currently in a connect or accept downcall. A
+ * listening endpoint should never block here.
+ */
+ wait_event(cm_id_priv->connect_wait,
+ !test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags));
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id_priv->state) {
+ case IW_CM_STATE_LISTEN:
+ cm_id_priv->state = IW_CM_STATE_DESTROYING;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ /* destroy the listening endpoint */
+ ret = cm_id->device->iwcm->destroy_listen(cm_id);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ break;
+ case IW_CM_STATE_ESTABLISHED:
+ cm_id_priv->state = IW_CM_STATE_DESTROYING;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ /* Abrupt close of the connection */
+ (void)iwcm_modify_qp_err(cm_id_priv->qp);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ break;
+ case IW_CM_STATE_IDLE:
+ case IW_CM_STATE_CLOSING:
+ cm_id_priv->state = IW_CM_STATE_DESTROYING;
+ break;
+ case IW_CM_STATE_CONN_RECV:
+ /*
+ * App called destroy before/without calling accept after
+ * receiving connection request event notification or
+ * returned non zero from the event callback function.
+ * In either case, must tell the provider to reject.
+ */
+ cm_id_priv->state = IW_CM_STATE_DESTROYING;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ cm_id->device->iwcm->reject(cm_id, NULL, 0);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ break;
+ case IW_CM_STATE_CONN_SENT:
+ case IW_CM_STATE_DESTROYING:
+ default:
+ BUG();
+ break;
+ }
+ if (cm_id_priv->qp) {
+ cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp);
+ cm_id_priv->qp = NULL;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ (void)iwcm_deref_id(cm_id_priv);
+}
+
+/*
+ * This function is only called by the application thread and cannot
+ * be called by the event thread. The function will wait for all
+ * references to be released on the cm_id and then kfree the cm_id
+ * object.
+ */
+void iw_destroy_cm_id(struct iw_cm_id *cm_id)
+{
+ struct iwcm_id_private *cm_id_priv;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ BUG_ON(test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags));
+
+ destroy_cm_id(cm_id);
+
+ wait_for_completion(&cm_id_priv->destroy_comp);
+
+ free_cm_id(cm_id_priv);
+}
+EXPORT_SYMBOL(iw_destroy_cm_id);
+
+/*
+ * CM_ID <-- LISTEN
+ *
+ * Start listening for connect requests. Generates one CONNECT_REQUEST
+ * event for each inbound connect request.
+ */
+int iw_cm_listen(struct iw_cm_id *cm_id, int backlog)
+{
+ struct iwcm_id_private *cm_id_priv;
+ unsigned long flags;
+ int ret;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+
+ ret = alloc_work_entries(cm_id_priv, backlog);
+ if (ret)
+ return ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id_priv->state) {
+ case IW_CM_STATE_IDLE:
+ cm_id_priv->state = IW_CM_STATE_LISTEN;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ ret = cm_id->device->iwcm->create_listen(cm_id, backlog);
+ if (ret)
+ cm_id_priv->state = IW_CM_STATE_IDLE;
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(iw_cm_listen);
+
+/*
+ * CM_ID <-- IDLE
+ *
+ * Rejects an inbound connection request. No events are generated.
+ */
+int iw_cm_reject(struct iw_cm_id *cm_id,
+ const void *private_data,
+ u8 private_data_len)
+{
+ struct iwcm_id_private *cm_id_priv;
+ unsigned long flags;
+ int ret;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id_priv->state != IW_CM_STATE_CONN_RECV) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ wake_up_all(&cm_id_priv->connect_wait);
+ return -EINVAL;
+ }
+ cm_id_priv->state = IW_CM_STATE_IDLE;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ ret = cm_id->device->iwcm->reject(cm_id, private_data,
+ private_data_len);
+
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ wake_up_all(&cm_id_priv->connect_wait);
+
+ return ret;
+}
+EXPORT_SYMBOL(iw_cm_reject);
+
+/*
+ * CM_ID <-- ESTABLISHED
+ *
+ * Accepts an inbound connection request and generates an ESTABLISHED
+ * event. Callers of iw_cm_disconnect and iw_destroy_cm_id will block
+ * until the ESTABLISHED event is received from the provider.
+ */
+int iw_cm_accept(struct iw_cm_id *cm_id,
+ struct iw_cm_conn_param *iw_param)
+{
+ struct iwcm_id_private *cm_id_priv;
+ struct ib_qp *qp;
+ unsigned long flags;
+ int ret;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id_priv->state != IW_CM_STATE_CONN_RECV) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ wake_up_all(&cm_id_priv->connect_wait);
+ return -EINVAL;
+ }
+ /* Get the ib_qp given the QPN */
+ qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn);
+ if (!qp) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return -EINVAL;
+ }
+ cm_id->device->iwcm->add_ref(qp);
+ cm_id_priv->qp = qp;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ ret = cm_id->device->iwcm->accept(cm_id, iw_param);
+ if (ret) {
+ /* An error on accept precludes provider events */
+ BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV);
+ cm_id_priv->state = IW_CM_STATE_IDLE;
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id_priv->qp) {
+ cm_id->device->iwcm->rem_ref(qp);
+ cm_id_priv->qp = NULL;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ wake_up_all(&cm_id_priv->connect_wait);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(iw_cm_accept);
+
+/*
+ * Active Side: CM_ID <-- CONN_SENT
+ *
+ * If successful, results in the generation of a CONNECT_REPLY
+ * event. iw_cm_disconnect and iw_cm_destroy will block until the
+ * CONNECT_REPLY event is received from the provider.
+ */
+int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
+{
+ struct iwcm_id_private *cm_id_priv;
+ int ret;
+ unsigned long flags;
+ struct ib_qp *qp;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+
+ ret = alloc_work_entries(cm_id_priv, 4);
+ if (ret)
+ return ret;
+
+ set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+
+ if (cm_id_priv->state != IW_CM_STATE_IDLE) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ wake_up_all(&cm_id_priv->connect_wait);
+ return -EINVAL;
+ }
+
+ /* Get the ib_qp given the QPN */
+ qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn);
+ if (!qp) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return -EINVAL;
+ }
+ cm_id->device->iwcm->add_ref(qp);
+ cm_id_priv->qp = qp;
+ cm_id_priv->state = IW_CM_STATE_CONN_SENT;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ ret = cm_id->device->iwcm->connect(cm_id, iw_param);
+ if (ret) {
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id_priv->qp) {
+ cm_id->device->iwcm->rem_ref(qp);
+ cm_id_priv->qp = NULL;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_SENT);
+ cm_id_priv->state = IW_CM_STATE_IDLE;
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ wake_up_all(&cm_id_priv->connect_wait);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(iw_cm_connect);
+
+/*
+ * Passive Side: new CM_ID <-- CONN_RECV
+ *
+ * Handles an inbound connect request. The function creates a new
+ * iw_cm_id to represent the new connection and inherits the client
+ * callback function and other attributes from the listening parent.
+ *
+ * The work item contains a pointer to the listen_cm_id and the event. The
+ * listen_cm_id contains the client cm_handler, context and
+ * device. These are copied when the device is cloned. The event
+ * contains the new four tuple.
+ *
+ * An error on the child should not affect the parent, so this
+ * function does not return a value.
+ */
+static void cm_conn_req_handler(struct iwcm_id_private *listen_id_priv,
+ struct iw_cm_event *iw_event)
+{
+ unsigned long flags;
+ struct iw_cm_id *cm_id;
+ struct iwcm_id_private *cm_id_priv;
+ int ret;
+
+ /*
+ * The provider should never generate a connection request
+ * event with a bad status.
+ */
+ BUG_ON(iw_event->status);
+
+ /*
+ * We could be destroying the listening id. If so, ignore this
+ * upcall.
+ */
+ spin_lock_irqsave(&listen_id_priv->lock, flags);
+ if (listen_id_priv->state != IW_CM_STATE_LISTEN) {
+ spin_unlock_irqrestore(&listen_id_priv->lock, flags);
+ goto out;
+ }
+ spin_unlock_irqrestore(&listen_id_priv->lock, flags);
+
+ cm_id = iw_create_cm_id(listen_id_priv->id.device,
+ listen_id_priv->id.cm_handler,
+ listen_id_priv->id.context);
+ /* If the cm_id could not be created, ignore the request */
+ if (IS_ERR(cm_id))
+ goto out;
+
+ cm_id->provider_data = iw_event->provider_data;
+ cm_id->local_addr = iw_event->local_addr;
+ cm_id->remote_addr = iw_event->remote_addr;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ cm_id_priv->state = IW_CM_STATE_CONN_RECV;
+
+ ret = alloc_work_entries(cm_id_priv, 3);
+ if (ret) {
+ iw_cm_reject(cm_id, NULL, 0);
+ iw_destroy_cm_id(cm_id);
+ goto out;
+ }
+
+ /* Call the client CM handler */
+ ret = cm_id->cm_handler(cm_id, iw_event);
+ if (ret) {
+ iw_cm_reject(cm_id, NULL, 0);
+ set_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
+ destroy_cm_id(cm_id);
+ if (atomic_read(&cm_id_priv->refcount)==0)
+ free_cm_id(cm_id_priv);
+ }
+
+out:
+ if (iw_event->private_data_len)
+ kfree(iw_event->private_data);
+}
+
+/*
+ * Passive Side: CM_ID <-- ESTABLISHED
+ *
+ * The provider generated an ESTABLISHED event which means that
+ * the MPA negotion has completed successfully and we are now in MPA
+ * FPDU mode.
+ *
+ * This event can only be received in the CONN_RECV state. If the
+ * remote peer closed, the ESTABLISHED event would be received followed
+ * by the CLOSE event. If the app closes, it will block until we wake
+ * it up after processing this event.
+ */
+static int cm_conn_est_handler(struct iwcm_id_private *cm_id_priv,
+ struct iw_cm_event *iw_event)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+
+ /*
+ * We clear the CONNECT_WAIT bit here to allow the callback
+ * function to call iw_cm_disconnect. Calling iw_destroy_cm_id
+ * from a callback handler is not allowed.
+ */
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV);
+ cm_id_priv->state = IW_CM_STATE_ESTABLISHED;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
+ wake_up_all(&cm_id_priv->connect_wait);
+
+ return ret;
+}
+
+/*
+ * Active Side: CM_ID <-- ESTABLISHED
+ *
+ * The app has called connect and is waiting for the established event to
+ * post it's requests to the server. This event will wake up anyone
+ * blocked in iw_cm_disconnect or iw_destroy_id.
+ */
+static int cm_conn_rep_handler(struct iwcm_id_private *cm_id_priv,
+ struct iw_cm_event *iw_event)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ /*
+ * Clear the connect wait bit so a callback function calling
+ * iw_cm_disconnect will not wait and deadlock this thread
+ */
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_SENT);
+ if (iw_event->status == IW_CM_EVENT_STATUS_ACCEPTED) {
+ cm_id_priv->id.local_addr = iw_event->local_addr;
+ cm_id_priv->id.remote_addr = iw_event->remote_addr;
+ cm_id_priv->state = IW_CM_STATE_ESTABLISHED;
+ } else {
+ /* REJECTED or RESET */
+ cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp);
+ cm_id_priv->qp = NULL;
+ cm_id_priv->state = IW_CM_STATE_IDLE;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
+
+ if (iw_event->private_data_len)
+ kfree(iw_event->private_data);
+
+ /* Wake up waiters on connect complete */
+ wake_up_all(&cm_id_priv->connect_wait);
+
+ return ret;
+}
+
+/*
+ * CM_ID <-- CLOSING
+ *
+ * If in the ESTABLISHED state, move to CLOSING.
+ */
+static void cm_disconnect_handler(struct iwcm_id_private *cm_id_priv,
+ struct iw_cm_event *iw_event)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id_priv->state == IW_CM_STATE_ESTABLISHED)
+ cm_id_priv->state = IW_CM_STATE_CLOSING;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+}
+
+/*
+ * CM_ID <-- IDLE
+ *
+ * If in the ESTBLISHED or CLOSING states, the QP will have have been
+ * moved by the provider to the ERR state. Disassociate the CM_ID from
+ * the QP, move to IDLE, and remove the 'connected' reference.
+ *
+ * If in some other state, the cm_id was destroyed asynchronously.
+ * This is the last reference that will result in waking up
+ * the app thread blocked in iw_destroy_cm_id.
+ */
+static int cm_close_handler(struct iwcm_id_private *cm_id_priv,
+ struct iw_cm_event *iw_event)
+{
+ unsigned long flags;
+ int ret = 0;
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+
+ if (cm_id_priv->qp) {
+ cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp);
+ cm_id_priv->qp = NULL;
+ }
+ switch (cm_id_priv->state) {
+ case IW_CM_STATE_ESTABLISHED:
+ case IW_CM_STATE_CLOSING:
+ cm_id_priv->state = IW_CM_STATE_IDLE;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ break;
+ case IW_CM_STATE_DESTROYING:
+ break;
+ default:
+ BUG();
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ return ret;
+}
+
+static int process_event(struct iwcm_id_private *cm_id_priv,
+ struct iw_cm_event *iw_event)
+{
+ int ret = 0;
+
+ switch (iw_event->event) {
+ case IW_CM_EVENT_CONNECT_REQUEST:
+ cm_conn_req_handler(cm_id_priv, iw_event);
+ break;
+ case IW_CM_EVENT_CONNECT_REPLY:
+ ret = cm_conn_rep_handler(cm_id_priv, iw_event);
+ break;
+ case IW_CM_EVENT_ESTABLISHED:
+ ret = cm_conn_est_handler(cm_id_priv, iw_event);
+ break;
+ case IW_CM_EVENT_DISCONNECT:
+ cm_disconnect_handler(cm_id_priv, iw_event);
+ break;
+ case IW_CM_EVENT_CLOSE:
+ ret = cm_close_handler(cm_id_priv, iw_event);
+ break;
+ default:
+ BUG();
+ }
+
+ return ret;
+}
+
+/*
+ * Process events on the work_list for the cm_id. If the callback
+ * function requests that the cm_id be deleted, a flag is set in the
+ * cm_id flags to indicate that when the last reference is
+ * removed, the cm_id is to be destroyed. This is necessary to
+ * distinguish between an object that will be destroyed by the app
+ * thread asleep on the destroy_comp list vs. an object destroyed
+ * here synchronously when the last reference is removed.
+ */
+static void cm_work_handler(struct work_struct *_work)
+{
+ struct iwcm_work *work = container_of(_work, struct iwcm_work, work);
+ struct iw_cm_event levent;
+ struct iwcm_id_private *cm_id_priv = work->cm_id;
+ unsigned long flags;
+ int empty;
+ int ret = 0;
+ int destroy_id;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ empty = list_empty(&cm_id_priv->work_list);
+ while (!empty) {
+ work = list_entry(cm_id_priv->work_list.next,
+ struct iwcm_work, list);
+ list_del_init(&work->list);
+ empty = list_empty(&cm_id_priv->work_list);
+ levent = work->event;
+ put_work(work);
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ ret = process_event(cm_id_priv, &levent);
+ if (ret) {
+ set_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
+ destroy_cm_id(&cm_id_priv->id);
+ }
+ BUG_ON(atomic_read(&cm_id_priv->refcount)==0);
+ destroy_id = test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
+ if (iwcm_deref_id(cm_id_priv)) {
+ if (destroy_id) {
+ BUG_ON(!list_empty(&cm_id_priv->work_list));
+ free_cm_id(cm_id_priv);
+ }
+ return;
+ }
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+}
+
+/*
+ * This function is called on interrupt context. Schedule events on
+ * the iwcm_wq thread to allow callback functions to downcall into
+ * the CM and/or block. Events are queued to a per-CM_ID
+ * work_list. If this is the first event on the work_list, the work
+ * element is also queued on the iwcm_wq thread.
+ *
+ * Each event holds a reference on the cm_id. Until the last posted
+ * event has been delivered and processed, the cm_id cannot be
+ * deleted.
+ *
+ * Returns:
+ * 0 - the event was handled.
+ * -ENOMEM - the event was not handled due to lack of resources.
+ */
+static int cm_event_handler(struct iw_cm_id *cm_id,
+ struct iw_cm_event *iw_event)
+{
+ struct iwcm_work *work;
+ struct iwcm_id_private *cm_id_priv;
+ unsigned long flags;
+ int ret = 0;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ work = get_work(cm_id_priv);
+ if (!work) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ INIT_WORK(&work->work, cm_work_handler);
+ work->cm_id = cm_id_priv;
+ work->event = *iw_event;
+
+ if ((work->event.event == IW_CM_EVENT_CONNECT_REQUEST ||
+ work->event.event == IW_CM_EVENT_CONNECT_REPLY) &&
+ work->event.private_data_len) {
+ ret = copy_private_data(&work->event);
+ if (ret) {
+ put_work(work);
+ goto out;
+ }
+ }
+
+ atomic_inc(&cm_id_priv->refcount);
+ if (list_empty(&cm_id_priv->work_list)) {
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ queue_work(iwcm_wq, &work->work);
+ } else
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+out:
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+
+static int iwcm_init_qp_init_attr(struct iwcm_id_private *cm_id_priv,
+ struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id_priv->state) {
+ case IW_CM_STATE_IDLE:
+ case IW_CM_STATE_CONN_SENT:
+ case IW_CM_STATE_CONN_RECV:
+ case IW_CM_STATE_ESTABLISHED:
+ *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS;
+ qp_attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE|
+ IB_ACCESS_REMOTE_READ;
+ ret = 0;
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+
+static int iwcm_init_qp_rts_attr(struct iwcm_id_private *cm_id_priv,
+ struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id_priv->state) {
+ case IW_CM_STATE_IDLE:
+ case IW_CM_STATE_CONN_SENT:
+ case IW_CM_STATE_CONN_RECV:
+ case IW_CM_STATE_ESTABLISHED:
+ *qp_attr_mask = 0;
+ ret = 0;
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+
+int iw_cm_init_qp_attr(struct iw_cm_id *cm_id,
+ struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask)
+{
+ struct iwcm_id_private *cm_id_priv;
+ int ret;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ switch (qp_attr->qp_state) {
+ case IB_QPS_INIT:
+ case IB_QPS_RTR:
+ ret = iwcm_init_qp_init_attr(cm_id_priv,
+ qp_attr, qp_attr_mask);
+ break;
+ case IB_QPS_RTS:
+ ret = iwcm_init_qp_rts_attr(cm_id_priv,
+ qp_attr, qp_attr_mask);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ return ret;
+}
+EXPORT_SYMBOL(iw_cm_init_qp_attr);
+
+static int __init iw_cm_init(void)
+{
+ iwcm_wq = create_singlethread_workqueue("iw_cm_wq");
+ if (!iwcm_wq)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void __exit iw_cm_cleanup(void)
+{
+ destroy_workqueue(iwcm_wq);
+}
+
+module_init(iw_cm_init);
+module_exit(iw_cm_cleanup);
diff --git a/sys/ofed/drivers/infiniband/core/iwcm.h b/sys/ofed/drivers/infiniband/core/iwcm.h
new file mode 100644
index 0000000..3f6cc82
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/iwcm.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2005 Network Appliance, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef IWCM_H
+#define IWCM_H
+
+enum iw_cm_state {
+ IW_CM_STATE_IDLE, /* unbound, inactive */
+ IW_CM_STATE_LISTEN, /* listen waiting for connect */
+ IW_CM_STATE_CONN_RECV, /* inbound waiting for user accept */
+ IW_CM_STATE_CONN_SENT, /* outbound waiting for peer accept */
+ IW_CM_STATE_ESTABLISHED, /* established */
+ IW_CM_STATE_CLOSING, /* disconnect */
+ IW_CM_STATE_DESTROYING /* object being deleted */
+};
+
+struct iwcm_id_private {
+ struct iw_cm_id id;
+ enum iw_cm_state state;
+ unsigned long flags;
+ struct ib_qp *qp;
+ struct completion destroy_comp;
+ wait_queue_head_t connect_wait;
+ struct list_head work_list;
+ spinlock_t lock;
+ atomic_t refcount;
+ struct list_head work_free_list;
+};
+
+#define IWCM_F_CALLBACK_DESTROY 1
+#define IWCM_F_CONNECT_WAIT 2
+
+#endif /* IWCM_H */
diff --git a/sys/ofed/drivers/infiniband/core/local_sa.c b/sys/ofed/drivers/infiniband/core/local_sa.c
new file mode 100644
index 0000000..eb62c42
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/local_sa.c
@@ -0,0 +1,1273 @@
+/*
+ * Copyright (c) 2006 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/rbtree.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/miscdevice.h>
+#include <linux/random.h>
+
+#include <rdma/ib_cache.h>
+#include <rdma/ib_sa.h>
+#include "sa.h"
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("InfiniBand subnet administration caching");
+MODULE_LICENSE("Dual BSD/GPL");
+
+enum {
+ SA_DB_MAX_PATHS_PER_DEST = 0x7F,
+ SA_DB_MIN_RETRY_TIMER = 4000, /* 4 sec */
+ SA_DB_MAX_RETRY_TIMER = 256000 /* 256 sec */
+};
+
+static int set_paths_per_dest(const char *val, struct kernel_param *kp);
+static unsigned long paths_per_dest = 0;
+module_param_call(paths_per_dest, set_paths_per_dest, param_get_ulong,
+ &paths_per_dest, 0644);
+MODULE_PARM_DESC(paths_per_dest, "Maximum number of paths to retrieve "
+ "to each destination (DGID). Set to 0 "
+ "to disable cache.");
+
+static int set_subscribe_inform_info(const char *val, struct kernel_param *kp);
+static char subscribe_inform_info = 1;
+module_param_call(subscribe_inform_info, set_subscribe_inform_info,
+ param_get_bool, &subscribe_inform_info, 0644);
+MODULE_PARM_DESC(subscribe_inform_info,
+ "Subscribe for SA InformInfo/Notice events.");
+
+static int do_refresh(const char *val, struct kernel_param *kp);
+module_param_call(refresh, do_refresh, NULL, NULL, 0200);
+
+static unsigned long retry_timer = SA_DB_MIN_RETRY_TIMER;
+
+enum sa_db_lookup_method {
+ SA_DB_LOOKUP_LEAST_USED,
+ SA_DB_LOOKUP_RANDOM
+};
+
+static int set_lookup_method(const char *val, struct kernel_param *kp);
+static int get_lookup_method(char *buf, struct kernel_param *kp);
+static unsigned long lookup_method;
+module_param_call(lookup_method, set_lookup_method, get_lookup_method,
+ &lookup_method, 0644);
+MODULE_PARM_DESC(lookup_method, "Method used to return path records when "
+ "multiple paths exist to a given destination.");
+
+static void sa_db_add_dev(struct ib_device *device);
+static void sa_db_remove_dev(struct ib_device *device);
+
+static struct ib_client sa_db_client = {
+ .name = "local_sa",
+ .add = sa_db_add_dev,
+ .remove = sa_db_remove_dev
+};
+
+static LIST_HEAD(dev_list);
+static DEFINE_MUTEX(lock);
+static rwlock_t rwlock;
+static struct workqueue_struct *sa_wq;
+static struct ib_sa_client sa_client;
+
+enum sa_db_state {
+ SA_DB_IDLE,
+ SA_DB_REFRESH,
+ SA_DB_DESTROY
+};
+
+struct sa_db_port {
+ struct sa_db_device *dev;
+ struct ib_mad_agent *agent;
+ /* Limit number of outstanding MADs to SA to reduce SA flooding */
+ struct ib_mad_send_buf *msg;
+ u16 sm_lid;
+ u8 sm_sl;
+ struct ib_inform_info *in_info;
+ struct ib_inform_info *out_info;
+ struct rb_root paths;
+ struct list_head update_list;
+ unsigned long update_id;
+ enum sa_db_state state;
+ struct work_struct work;
+ union ib_gid gid;
+ int port_num;
+};
+
+struct sa_db_device {
+ struct list_head list;
+ struct ib_device *device;
+ struct ib_event_handler event_handler;
+ int start_port;
+ int port_count;
+ struct sa_db_port port[0];
+};
+
+struct ib_sa_iterator {
+ struct ib_sa_iterator *next;
+};
+
+struct ib_sa_attr_iter {
+ struct ib_sa_iterator *iter;
+ unsigned long flags;
+};
+
+struct ib_sa_attr_list {
+ struct ib_sa_iterator iter;
+ struct ib_sa_iterator *tail;
+ int update_id;
+ union ib_gid gid;
+ struct rb_node node;
+};
+
+struct ib_path_rec_info {
+ struct ib_sa_iterator iter; /* keep first */
+ struct ib_sa_path_rec rec;
+ unsigned long lookups;
+};
+
+struct ib_sa_mad_iter {
+ struct ib_mad_recv_wc *recv_wc;
+ struct ib_mad_recv_buf *recv_buf;
+ int attr_size;
+ int attr_offset;
+ int data_offset;
+ int data_left;
+ void *attr;
+ u8 attr_data[0];
+};
+
+enum sa_update_type {
+ SA_UPDATE_FULL,
+ SA_UPDATE_ADD,
+ SA_UPDATE_REMOVE
+};
+
+struct update_info {
+ struct list_head list;
+ union ib_gid gid;
+ enum sa_update_type type;
+};
+
+struct sa_path_request {
+ struct work_struct work;
+ struct ib_sa_client *client;
+ void (*callback)(int, struct ib_sa_path_rec *, void *);
+ void *context;
+ struct ib_sa_path_rec path_rec;
+};
+
+static void process_updates(struct sa_db_port *port);
+
+static void free_attr_list(struct ib_sa_attr_list *attr_list)
+{
+ struct ib_sa_iterator *cur;
+
+ for (cur = attr_list->iter.next; cur; cur = attr_list->iter.next) {
+ attr_list->iter.next = cur->next;
+ kfree(cur);
+ }
+ attr_list->tail = &attr_list->iter;
+}
+
+static void remove_attr(struct rb_root *root, struct ib_sa_attr_list *attr_list)
+{
+ rb_erase(&attr_list->node, root);
+ free_attr_list(attr_list);
+ kfree(attr_list);
+}
+
+static void remove_all_attrs(struct rb_root *root)
+{
+ struct rb_node *node, *next_node;
+ struct ib_sa_attr_list *attr_list;
+
+ write_lock_irq(&rwlock);
+ for (node = rb_first(root); node; node = next_node) {
+ next_node = rb_next(node);
+ attr_list = rb_entry(node, struct ib_sa_attr_list, node);
+ remove_attr(root, attr_list);
+ }
+ write_unlock_irq(&rwlock);
+}
+
+static void remove_old_attrs(struct rb_root *root, unsigned long update_id)
+{
+ struct rb_node *node, *next_node;
+ struct ib_sa_attr_list *attr_list;
+
+ write_lock_irq(&rwlock);
+ for (node = rb_first(root); node; node = next_node) {
+ next_node = rb_next(node);
+ attr_list = rb_entry(node, struct ib_sa_attr_list, node);
+ if (attr_list->update_id != update_id)
+ remove_attr(root, attr_list);
+ }
+ write_unlock_irq(&rwlock);
+}
+
+static struct ib_sa_attr_list *insert_attr_list(struct rb_root *root,
+ struct ib_sa_attr_list *attr_list)
+{
+ struct rb_node **link = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct ib_sa_attr_list *cur_attr_list;
+ int cmp;
+
+ while (*link) {
+ parent = *link;
+ cur_attr_list = rb_entry(parent, struct ib_sa_attr_list, node);
+ cmp = memcmp(&cur_attr_list->gid, &attr_list->gid,
+ sizeof attr_list->gid);
+ if (cmp < 0)
+ link = &(*link)->rb_left;
+ else if (cmp > 0)
+ link = &(*link)->rb_right;
+ else
+ return cur_attr_list;
+ }
+ rb_link_node(&attr_list->node, parent, link);
+ rb_insert_color(&attr_list->node, root);
+ return NULL;
+}
+
+static struct ib_sa_attr_list *find_attr_list(struct rb_root *root, u8 *gid)
+{
+ struct rb_node *node = root->rb_node;
+ struct ib_sa_attr_list *attr_list;
+ int cmp;
+
+ while (node) {
+ attr_list = rb_entry(node, struct ib_sa_attr_list, node);
+ cmp = memcmp(&attr_list->gid, gid, sizeof attr_list->gid);
+ if (cmp < 0)
+ node = node->rb_left;
+ else if (cmp > 0)
+ node = node->rb_right;
+ else
+ return attr_list;
+ }
+ return NULL;
+}
+
+static int insert_attr(struct rb_root *root, unsigned long update_id, void *key,
+ struct ib_sa_iterator *iter)
+{
+ struct ib_sa_attr_list *attr_list;
+ void *err;
+
+ write_lock_irq(&rwlock);
+ attr_list = find_attr_list(root, key);
+ if (!attr_list) {
+ write_unlock_irq(&rwlock);
+ attr_list = kmalloc(sizeof *attr_list, GFP_KERNEL);
+ if (!attr_list)
+ return -ENOMEM;
+
+ attr_list->iter.next = NULL;
+ attr_list->tail = &attr_list->iter;
+ attr_list->update_id = update_id;
+ memcpy(attr_list->gid.raw, key, sizeof attr_list->gid);
+
+ write_lock_irq(&rwlock);
+ err = insert_attr_list(root, attr_list);
+ if (err) {
+ write_unlock_irq(&rwlock);
+ kfree(attr_list);
+ return PTR_ERR(err);
+ }
+ } else if (attr_list->update_id != update_id) {
+ free_attr_list(attr_list);
+ attr_list->update_id = update_id;
+ }
+
+ attr_list->tail->next = iter;
+ iter->next = NULL;
+ attr_list->tail = iter;
+ write_unlock_irq(&rwlock);
+ return 0;
+}
+
+static struct ib_sa_mad_iter *ib_sa_iter_create(struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_sa_mad_iter *iter;
+ struct ib_sa_mad *mad = (struct ib_sa_mad *) mad_recv_wc->recv_buf.mad;
+ int attr_size, attr_offset;
+
+ attr_offset = be16_to_cpu(mad->sa_hdr.attr_offset) * 8;
+ attr_size = 64; /* path record length */
+ if (attr_offset < attr_size)
+ return ERR_PTR(-EINVAL);
+
+ iter = kzalloc(sizeof *iter + attr_size, GFP_KERNEL);
+ if (!iter)
+ return ERR_PTR(-ENOMEM);
+
+ iter->data_left = mad_recv_wc->mad_len - IB_MGMT_SA_HDR;
+ iter->recv_wc = mad_recv_wc;
+ iter->recv_buf = &mad_recv_wc->recv_buf;
+ iter->attr_offset = attr_offset;
+ iter->attr_size = attr_size;
+ return iter;
+}
+
+static void ib_sa_iter_free(struct ib_sa_mad_iter *iter)
+{
+ kfree(iter);
+}
+
+static void *ib_sa_iter_next(struct ib_sa_mad_iter *iter)
+{
+ struct ib_sa_mad *mad;
+ int left, offset = 0;
+
+ while (iter->data_left >= iter->attr_offset) {
+ while (iter->data_offset < IB_MGMT_SA_DATA) {
+ mad = (struct ib_sa_mad *) iter->recv_buf->mad;
+
+ left = IB_MGMT_SA_DATA - iter->data_offset;
+ if (left < iter->attr_size) {
+ /* copy first piece of the attribute */
+ iter->attr = &iter->attr_data;
+ memcpy(iter->attr,
+ &mad->data[iter->data_offset], left);
+ offset = left;
+ break;
+ } else if (offset) {
+ /* copy the second piece of the attribute */
+ memcpy(iter->attr + offset, &mad->data[0],
+ iter->attr_size - offset);
+ iter->data_offset = iter->attr_size - offset;
+ offset = 0;
+ } else {
+ iter->attr = &mad->data[iter->data_offset];
+ iter->data_offset += iter->attr_size;
+ }
+
+ iter->data_left -= iter->attr_offset;
+ goto out;
+ }
+ iter->data_offset = 0;
+ iter->recv_buf = list_entry(iter->recv_buf->list.next,
+ struct ib_mad_recv_buf, list);
+ }
+ iter->attr = NULL;
+out:
+ return iter->attr;
+}
+
+/*
+ * Copy path records from a received response and insert them into our cache.
+ * A path record in the MADs are in network order, packed, and may
+ * span multiple MAD buffers, just to make our life hard.
+ */
+static void update_path_db(struct sa_db_port *port,
+ struct ib_mad_recv_wc *mad_recv_wc,
+ enum sa_update_type type)
+{
+ struct ib_sa_mad_iter *iter;
+ struct ib_path_rec_info *path_info;
+ void *attr;
+ int ret;
+
+ iter = ib_sa_iter_create(mad_recv_wc);
+ if (IS_ERR(iter))
+ return;
+
+ port->update_id += (type == SA_UPDATE_FULL);
+
+ while ((attr = ib_sa_iter_next(iter)) &&
+ (path_info = kmalloc(sizeof *path_info, GFP_KERNEL))) {
+
+ ib_sa_unpack_attr(&path_info->rec, attr, IB_SA_ATTR_PATH_REC);
+
+ ret = insert_attr(&port->paths, port->update_id,
+ path_info->rec.dgid.raw, &path_info->iter);
+ if (ret) {
+ kfree(path_info);
+ break;
+ }
+ }
+ ib_sa_iter_free(iter);
+
+ if (type == SA_UPDATE_FULL)
+ remove_old_attrs(&port->paths, port->update_id);
+}
+
+static struct ib_mad_send_buf *get_sa_msg(struct sa_db_port *port,
+ struct update_info *update)
+{
+ struct ib_ah_attr ah_attr;
+ struct ib_mad_send_buf *msg;
+
+ msg = ib_create_send_mad(port->agent, 1, 0, 0, IB_MGMT_SA_HDR,
+ IB_MGMT_SA_DATA, GFP_KERNEL);
+ if (IS_ERR(msg))
+ return NULL;
+
+ memset(&ah_attr, 0, sizeof ah_attr);
+ ah_attr.dlid = port->sm_lid;
+ ah_attr.sl = port->sm_sl;
+ ah_attr.port_num = port->port_num;
+
+ msg->ah = ib_create_ah(port->agent->qp->pd, &ah_attr);
+ if (IS_ERR(msg->ah)) {
+ ib_free_send_mad(msg);
+ return NULL;
+ }
+
+ msg->timeout_ms = retry_timer;
+ msg->retries = 0;
+ msg->context[0] = port;
+ msg->context[1] = update;
+ return msg;
+}
+
+static __be64 form_tid(u32 hi_tid)
+{
+ static atomic_t tid;
+ return cpu_to_be64((((u64) hi_tid) << 32) |
+ ((u32) atomic_inc_return(&tid)));
+}
+
+static void format_path_req(struct sa_db_port *port,
+ struct update_info *update,
+ struct ib_mad_send_buf *msg)
+{
+ struct ib_sa_mad *mad = msg->mad;
+ struct ib_sa_path_rec path_rec;
+
+ mad->mad_hdr.base_version = IB_MGMT_BASE_VERSION;
+ mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM;
+ mad->mad_hdr.class_version = IB_SA_CLASS_VERSION;
+ mad->mad_hdr.method = IB_SA_METHOD_GET_TABLE;
+ mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_PATH_REC);
+ mad->mad_hdr.tid = form_tid(msg->mad_agent->hi_tid);
+
+ mad->sa_hdr.comp_mask = IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_NUMB_PATH;
+
+ path_rec.sgid = port->gid;
+ path_rec.numb_path = (u8) paths_per_dest;
+
+ if (update->type == SA_UPDATE_ADD) {
+ mad->sa_hdr.comp_mask |= IB_SA_PATH_REC_DGID;
+ memcpy(&path_rec.dgid, &update->gid, sizeof path_rec.dgid);
+ }
+
+ ib_sa_pack_attr(mad->data, &path_rec, IB_SA_ATTR_PATH_REC);
+}
+
+static int send_query(struct sa_db_port *port,
+ struct update_info *update)
+{
+ int ret;
+
+ port->msg = get_sa_msg(port, update);
+ if (!port->msg)
+ return -ENOMEM;
+
+ format_path_req(port, update, port->msg);
+
+ ret = ib_post_send_mad(port->msg, NULL);
+ if (ret)
+ goto err;
+
+ return 0;
+
+err:
+ ib_destroy_ah(port->msg->ah);
+ ib_free_send_mad(port->msg);
+ return ret;
+}
+
+static void add_update(struct sa_db_port *port, u8 *gid,
+ enum sa_update_type type)
+{
+ struct update_info *update;
+
+ update = kmalloc(sizeof *update, GFP_KERNEL);
+ if (update) {
+ if (gid)
+ memcpy(&update->gid, gid, sizeof update->gid);
+ update->type = type;
+ list_add(&update->list, &port->update_list);
+ }
+
+ if (port->state == SA_DB_IDLE) {
+ port->state = SA_DB_REFRESH;
+ process_updates(port);
+ }
+}
+
+static void clean_update_list(struct sa_db_port *port)
+{
+ struct update_info *update;
+
+ while (!list_empty(&port->update_list)) {
+ update = list_entry(port->update_list.next,
+ struct update_info, list);
+ list_del(&update->list);
+ kfree(update);
+ }
+}
+
+static int notice_handler(int status, struct ib_inform_info *info,
+ struct ib_sa_notice *notice)
+{
+ struct sa_db_port *port = info->context;
+ struct ib_sa_notice_data_gid *gid_data;
+ struct ib_inform_info **pinfo;
+ enum sa_update_type type;
+
+ if (info->trap_number == IB_SA_SM_TRAP_GID_IN_SERVICE) {
+ pinfo = &port->in_info;
+ type = SA_UPDATE_ADD;
+ } else {
+ pinfo = &port->out_info;
+ type = SA_UPDATE_REMOVE;
+ }
+
+ mutex_lock(&lock);
+ if (port->state == SA_DB_DESTROY || !*pinfo) {
+ mutex_unlock(&lock);
+ return 0;
+ }
+
+ if (notice) {
+ gid_data = (struct ib_sa_notice_data_gid *)
+ &notice->data_details;
+ add_update(port, gid_data->gid, type);
+ mutex_unlock(&lock);
+ } else if (status == -ENETRESET) {
+ *pinfo = NULL;
+ mutex_unlock(&lock);
+ } else {
+ if (status)
+ *pinfo = ERR_PTR(-EINVAL);
+ port->state = SA_DB_IDLE;
+ clean_update_list(port);
+ mutex_unlock(&lock);
+ queue_work(sa_wq, &port->work);
+ }
+
+ return status;
+}
+
+static int reg_in_info(struct sa_db_port *port)
+{
+ int ret = 0;
+
+ port->in_info = ib_sa_register_inform_info(&sa_client,
+ port->dev->device,
+ port->port_num,
+ IB_SA_SM_TRAP_GID_IN_SERVICE,
+ GFP_KERNEL, notice_handler,
+ port);
+ if (IS_ERR(port->in_info))
+ ret = PTR_ERR(port->in_info);
+
+ return ret;
+}
+
+static int reg_out_info(struct sa_db_port *port)
+{
+ int ret = 0;
+
+ port->out_info = ib_sa_register_inform_info(&sa_client,
+ port->dev->device,
+ port->port_num,
+ IB_SA_SM_TRAP_GID_OUT_OF_SERVICE,
+ GFP_KERNEL, notice_handler,
+ port);
+ if (IS_ERR(port->out_info))
+ ret = PTR_ERR(port->out_info);
+
+ return ret;
+}
+
+static void unsubscribe_port(struct sa_db_port *port)
+{
+ if (port->in_info && !IS_ERR(port->in_info))
+ ib_sa_unregister_inform_info(port->in_info);
+
+ if (port->out_info && !IS_ERR(port->out_info))
+ ib_sa_unregister_inform_info(port->out_info);
+
+ port->out_info = NULL;
+ port->in_info = NULL;
+
+}
+
+static void cleanup_port(struct sa_db_port *port)
+{
+ unsubscribe_port(port);
+
+ clean_update_list(port);
+ remove_all_attrs(&port->paths);
+}
+
+static int update_port_info(struct sa_db_port *port)
+{
+ struct ib_port_attr port_attr;
+ int ret;
+
+ ret = ib_query_port(port->dev->device, port->port_num, &port_attr);
+ if (ret)
+ return ret;
+
+ if (port_attr.state != IB_PORT_ACTIVE)
+ return -ENODATA;
+
+ port->sm_lid = port_attr.sm_lid;
+ port->sm_sl = port_attr.sm_sl;
+ return 0;
+}
+
+static void process_updates(struct sa_db_port *port)
+{
+ struct update_info *update;
+ struct ib_sa_attr_list *attr_list;
+ int ret;
+
+ if (!paths_per_dest || update_port_info(port)) {
+ cleanup_port(port);
+ goto out;
+ }
+
+ /* Event registration is an optimization, so ignore failures. */
+ if (subscribe_inform_info) {
+ if (!port->out_info) {
+ ret = reg_out_info(port);
+ if (!ret)
+ return;
+ }
+
+ if (!port->in_info) {
+ ret = reg_in_info(port);
+ if (!ret)
+ return;
+ }
+ } else
+ unsubscribe_port(port);
+
+ while (!list_empty(&port->update_list)) {
+ update = list_entry(port->update_list.next,
+ struct update_info, list);
+
+ if (update->type == SA_UPDATE_REMOVE) {
+ write_lock_irq(&rwlock);
+ attr_list = find_attr_list(&port->paths,
+ update->gid.raw);
+ if (attr_list)
+ remove_attr(&port->paths, attr_list);
+ write_unlock_irq(&rwlock);
+ } else {
+ ret = send_query(port, update);
+ if (!ret)
+ return;
+
+ }
+ list_del(&update->list);
+ kfree(update);
+ }
+out:
+ port->state = SA_DB_IDLE;
+}
+
+static void refresh_port_db(struct sa_db_port *port)
+{
+ if (port->state == SA_DB_DESTROY)
+ return;
+
+ if (port->state == SA_DB_REFRESH) {
+ clean_update_list(port);
+ ib_cancel_mad(port->agent, port->msg);
+ }
+
+ add_update(port, NULL, SA_UPDATE_FULL);
+}
+
+static void refresh_dev_db(struct sa_db_device *dev)
+{
+ int i;
+
+ for (i = 0; i < dev->port_count; i++)
+ refresh_port_db(&dev->port[i]);
+}
+
+static void refresh_db(void)
+{
+ struct sa_db_device *dev;
+
+ list_for_each_entry(dev, &dev_list, list)
+ refresh_dev_db(dev);
+}
+
+static int do_refresh(const char *val, struct kernel_param *kp)
+{
+ mutex_lock(&lock);
+ refresh_db();
+ mutex_unlock(&lock);
+ return 0;
+}
+
+static int get_lookup_method(char *buf, struct kernel_param *kp)
+{
+ return sprintf(buf,
+ "%c %d round robin\n"
+ "%c %d random",
+ (lookup_method == SA_DB_LOOKUP_LEAST_USED) ? '*' : ' ',
+ SA_DB_LOOKUP_LEAST_USED,
+ (lookup_method == SA_DB_LOOKUP_RANDOM) ? '*' : ' ',
+ SA_DB_LOOKUP_RANDOM);
+}
+
+static int set_lookup_method(const char *val, struct kernel_param *kp)
+{
+ unsigned long method;
+ int ret = 0;
+
+ method = simple_strtoul(val, NULL, 0);
+
+ switch (method) {
+ case SA_DB_LOOKUP_LEAST_USED:
+ case SA_DB_LOOKUP_RANDOM:
+ lookup_method = method;
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+static int set_paths_per_dest(const char *val, struct kernel_param *kp)
+{
+ int ret;
+
+ mutex_lock(&lock);
+ ret = param_set_ulong(val, kp);
+ if (ret)
+ goto out;
+
+ if (paths_per_dest > SA_DB_MAX_PATHS_PER_DEST)
+ paths_per_dest = SA_DB_MAX_PATHS_PER_DEST;
+ refresh_db();
+out:
+ mutex_unlock(&lock);
+ return ret;
+}
+
+static int set_subscribe_inform_info(const char *val, struct kernel_param *kp)
+{
+ int ret;
+
+ ret = param_set_bool(val, kp);
+ if (ret)
+ return ret;
+
+ return do_refresh(val, kp);
+}
+
+static void port_work_handler(struct work_struct *work)
+{
+ struct sa_db_port *port;
+
+ port = container_of(work, typeof(*port), work);
+ mutex_lock(&lock);
+ refresh_port_db(port);
+ mutex_unlock(&lock);
+}
+
+static void handle_event(struct ib_event_handler *event_handler,
+ struct ib_event *event)
+{
+ struct sa_db_device *dev;
+ struct sa_db_port *port;
+
+ dev = container_of(event_handler, typeof(*dev), event_handler);
+ port = &dev->port[event->element.port_num - dev->start_port];
+
+ switch (event->event) {
+ case IB_EVENT_PORT_ERR:
+ case IB_EVENT_LID_CHANGE:
+ case IB_EVENT_SM_CHANGE:
+ case IB_EVENT_CLIENT_REREGISTER:
+ case IB_EVENT_PKEY_CHANGE:
+ case IB_EVENT_PORT_ACTIVE:
+ queue_work(sa_wq, &port->work);
+ break;
+ default:
+ break;
+ }
+}
+
+static void ib_free_path_iter(struct ib_sa_attr_iter *iter)
+{
+ read_unlock_irqrestore(&rwlock, iter->flags);
+}
+
+static int ib_create_path_iter(struct ib_device *device, u8 port_num,
+ union ib_gid *dgid, struct ib_sa_attr_iter *iter)
+{
+ struct sa_db_device *dev;
+ struct sa_db_port *port;
+ struct ib_sa_attr_list *list;
+
+ dev = ib_get_client_data(device, &sa_db_client);
+ if (!dev)
+ return -ENODEV;
+
+ port = &dev->port[port_num - dev->start_port];
+
+ read_lock_irqsave(&rwlock, iter->flags);
+ list = find_attr_list(&port->paths, dgid->raw);
+ if (!list) {
+ ib_free_path_iter(iter);
+ return -ENODATA;
+ }
+
+ iter->iter = &list->iter;
+ return 0;
+}
+
+static struct ib_sa_path_rec *ib_get_next_path(struct ib_sa_attr_iter *iter)
+{
+ struct ib_path_rec_info *next_path;
+
+ iter->iter = iter->iter->next;
+ if (iter->iter) {
+ next_path = container_of(iter->iter, struct ib_path_rec_info, iter);
+ return &next_path->rec;
+ } else
+ return NULL;
+}
+
+static int cmp_rec(struct ib_sa_path_rec *src,
+ struct ib_sa_path_rec *dst, ib_sa_comp_mask comp_mask)
+{
+ /* DGID check already done */
+ if (comp_mask & IB_SA_PATH_REC_SGID &&
+ memcmp(&src->sgid, &dst->sgid, sizeof src->sgid))
+ return -EINVAL;
+ if (comp_mask & IB_SA_PATH_REC_DLID && src->dlid != dst->dlid)
+ return -EINVAL;
+ if (comp_mask & IB_SA_PATH_REC_SLID && src->slid != dst->slid)
+ return -EINVAL;
+ if (comp_mask & IB_SA_PATH_REC_RAW_TRAFFIC &&
+ src->raw_traffic != dst->raw_traffic)
+ return -EINVAL;
+
+ if (comp_mask & IB_SA_PATH_REC_FLOW_LABEL &&
+ src->flow_label != dst->flow_label)
+ return -EINVAL;
+ if (comp_mask & IB_SA_PATH_REC_HOP_LIMIT &&
+ src->hop_limit != dst->hop_limit)
+ return -EINVAL;
+ if (comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS &&
+ src->traffic_class != dst->traffic_class)
+ return -EINVAL;
+ if (comp_mask & IB_SA_PATH_REC_REVERSIBLE &&
+ dst->reversible && !src->reversible)
+ return -EINVAL;
+ /* Numb path check already done */
+ if (comp_mask & IB_SA_PATH_REC_PKEY && src->pkey != dst->pkey)
+ return -EINVAL;
+
+ if (comp_mask & IB_SA_PATH_REC_SL && src->sl != dst->sl)
+ return -EINVAL;
+
+ if (ib_sa_check_selector(comp_mask, IB_SA_PATH_REC_MTU_SELECTOR,
+ IB_SA_PATH_REC_MTU, dst->mtu_selector,
+ src->mtu, dst->mtu))
+ return -EINVAL;
+ if (ib_sa_check_selector(comp_mask, IB_SA_PATH_REC_RATE_SELECTOR,
+ IB_SA_PATH_REC_RATE, dst->rate_selector,
+ src->rate, dst->rate))
+ return -EINVAL;
+ if (ib_sa_check_selector(comp_mask,
+ IB_SA_PATH_REC_PACKET_LIFE_TIME_SELECTOR,
+ IB_SA_PATH_REC_PACKET_LIFE_TIME,
+ dst->packet_life_time_selector,
+ src->packet_life_time, dst->packet_life_time))
+ return -EINVAL;
+
+ return 0;
+}
+
+static struct ib_sa_path_rec *get_random_path(struct ib_sa_attr_iter *iter,
+ struct ib_sa_path_rec *req_path,
+ ib_sa_comp_mask comp_mask)
+{
+ struct ib_sa_path_rec *path, *rand_path = NULL;
+ int num, count = 0;
+
+ for (path = ib_get_next_path(iter); path;
+ path = ib_get_next_path(iter)) {
+ if (!cmp_rec(path, req_path, comp_mask)) {
+ get_random_bytes(&num, sizeof num);
+ if ((num % ++count) == 0)
+ rand_path = path;
+ }
+ }
+
+ return rand_path;
+}
+
+static struct ib_sa_path_rec *get_next_path(struct ib_sa_attr_iter *iter,
+ struct ib_sa_path_rec *req_path,
+ ib_sa_comp_mask comp_mask)
+{
+ struct ib_path_rec_info *cur_path, *next_path = NULL;
+ struct ib_sa_path_rec *path;
+ unsigned long lookups = ~0;
+
+ for (path = ib_get_next_path(iter); path;
+ path = ib_get_next_path(iter)) {
+ if (!cmp_rec(path, req_path, comp_mask)) {
+
+ cur_path = container_of(iter->iter, struct ib_path_rec_info,
+ iter);
+ if (cur_path->lookups < lookups) {
+ lookups = cur_path->lookups;
+ next_path = cur_path;
+ }
+ }
+ }
+
+ if (next_path) {
+ next_path->lookups++;
+ return &next_path->rec;
+ } else
+ return NULL;
+}
+
+static void report_path(struct work_struct *work)
+{
+ struct sa_path_request *req;
+
+ req = container_of(work, struct sa_path_request, work);
+ req->callback(0, &req->path_rec, req->context);
+ ib_sa_client_put(req->client);
+ kfree(req);
+}
+
+/**
+ * ib_sa_path_rec_get - Start a Path get query
+ * @client:SA client
+ * @device:device to send query on
+ * @port_num: port number to send query on
+ * @rec:Path Record to send in query
+ * @comp_mask:component mask to send in query
+ * @timeout_ms:time to wait for response
+ * @gfp_mask:GFP mask to use for internal allocations
+ * @callback:function called when query completes, times out or is
+ * canceled
+ * @context:opaque user context passed to callback
+ * @sa_query:query context, used to cancel query
+ *
+ * Send a Path Record Get query to the SA to look up a path. The
+ * callback function will be called when the query completes (or
+ * fails); status is 0 for a successful response, -EINTR if the query
+ * is canceled, -ETIMEDOUT is the query timed out, or -EIO if an error
+ * occurred sending the query. The resp parameter of the callback is
+ * only valid if status is 0.
+ *
+ * If the return value of ib_sa_path_rec_get() is negative, it is an
+ * error code. Otherwise it is a query ID that can be used to cancel
+ * the query.
+ */
+int ib_sa_path_rec_get(struct ib_sa_client *client,
+ struct ib_device *device, u8 port_num,
+ struct ib_sa_path_rec *rec,
+ ib_sa_comp_mask comp_mask,
+ int timeout_ms, gfp_t gfp_mask,
+ void (*callback)(int status,
+ struct ib_sa_path_rec *resp,
+ void *context),
+ void *context,
+ struct ib_sa_query **sa_query)
+{
+ struct sa_path_request *req;
+ struct ib_sa_attr_iter iter;
+ struct ib_sa_path_rec *path_rec;
+ int ret;
+
+ if (!paths_per_dest)
+ goto query_sa;
+
+ if (!(comp_mask & IB_SA_PATH_REC_DGID) ||
+ !(comp_mask & IB_SA_PATH_REC_NUMB_PATH) || rec->numb_path != 1)
+ goto query_sa;
+
+ req = kmalloc(sizeof *req, gfp_mask);
+ if (!req)
+ goto query_sa;
+
+ ret = ib_create_path_iter(device, port_num, &rec->dgid, &iter);
+ if (ret)
+ goto free_req;
+
+ if (lookup_method == SA_DB_LOOKUP_RANDOM)
+ path_rec = get_random_path(&iter, rec, comp_mask);
+ else
+ path_rec = get_next_path(&iter, rec, comp_mask);
+
+ if (!path_rec)
+ goto free_iter;
+
+ memcpy(&req->path_rec, path_rec, sizeof *path_rec);
+ ib_free_path_iter(&iter);
+
+ INIT_WORK(&req->work, report_path);
+ req->client = client;
+ req->callback = callback;
+ req->context = context;
+
+ ib_sa_client_get(client);
+ queue_work(sa_wq, &req->work);
+ *sa_query = ERR_PTR(-EEXIST);
+ return 0;
+
+free_iter:
+ ib_free_path_iter(&iter);
+free_req:
+ kfree(req);
+query_sa:
+ return ib_sa_path_rec_query(client, device, port_num, rec, comp_mask,
+ timeout_ms, gfp_mask, callback, context,
+ sa_query);
+}
+EXPORT_SYMBOL(ib_sa_path_rec_get);
+
+static void recv_handler(struct ib_mad_agent *mad_agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct sa_db_port *port;
+ struct update_info *update;
+ struct ib_mad_send_buf *msg;
+ enum sa_update_type type;
+
+ msg = (struct ib_mad_send_buf *) (unsigned long) mad_recv_wc->wc->wr_id;
+ port = msg->context[0];
+ update = msg->context[1];
+
+ mutex_lock(&lock);
+ if (port->state == SA_DB_DESTROY ||
+ update != list_entry(port->update_list.next,
+ struct update_info, list)) {
+ mutex_unlock(&lock);
+ } else {
+ type = update->type;
+ mutex_unlock(&lock);
+ update_path_db(mad_agent->context, mad_recv_wc, type);
+ }
+
+ ib_free_recv_mad(mad_recv_wc);
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ struct ib_mad_send_buf *msg;
+ struct sa_db_port *port;
+ struct update_info *update;
+ int ret;
+
+ msg = mad_send_wc->send_buf;
+ port = msg->context[0];
+ update = msg->context[1];
+
+ mutex_lock(&lock);
+ if (port->state == SA_DB_DESTROY)
+ goto unlock;
+
+ if (update == list_entry(port->update_list.next,
+ struct update_info, list)) {
+
+ if (mad_send_wc->status == IB_WC_RESP_TIMEOUT_ERR &&
+ msg->timeout_ms < SA_DB_MAX_RETRY_TIMER) {
+
+ msg->timeout_ms <<= 1;
+ ret = ib_post_send_mad(msg, NULL);
+ if (!ret) {
+ mutex_unlock(&lock);
+ return;
+ }
+ }
+ list_del(&update->list);
+ kfree(update);
+ }
+ process_updates(port);
+unlock:
+ mutex_unlock(&lock);
+
+ ib_destroy_ah(msg->ah);
+ ib_free_send_mad(msg);
+}
+
+static int init_port(struct sa_db_device *dev, int port_num)
+{
+ struct sa_db_port *port;
+ int ret;
+
+ port = &dev->port[port_num - dev->start_port];
+ port->dev = dev;
+ port->port_num = port_num;
+ INIT_WORK(&port->work, port_work_handler);
+ port->paths = RB_ROOT;
+ INIT_LIST_HEAD(&port->update_list);
+
+ ret = ib_get_cached_gid(dev->device, port_num, 0, &port->gid);
+ if (ret)
+ return ret;
+
+ port->agent = ib_register_mad_agent(dev->device, port_num, IB_QPT_GSI,
+ NULL, IB_MGMT_RMPP_VERSION,
+ send_handler, recv_handler, port);
+ if (IS_ERR(port->agent))
+ ret = PTR_ERR(port->agent);
+
+ return ret;
+}
+
+static void destroy_port(struct sa_db_port *port)
+{
+ mutex_lock(&lock);
+ port->state = SA_DB_DESTROY;
+ mutex_unlock(&lock);
+
+ ib_unregister_mad_agent(port->agent);
+ cleanup_port(port);
+ flush_workqueue(sa_wq);
+}
+
+static void sa_db_add_dev(struct ib_device *device)
+{
+ struct sa_db_device *dev;
+ struct sa_db_port *port;
+ int s, e, i, ret;
+
+ if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+ return;
+
+ if (device->node_type == RDMA_NODE_IB_SWITCH) {
+ s = e = 0;
+ } else {
+ s = 1;
+ e = device->phys_port_cnt;
+ }
+
+ dev = kzalloc(sizeof *dev + (e - s + 1) * sizeof *port, GFP_KERNEL);
+ if (!dev)
+ return;
+
+ dev->start_port = s;
+ dev->port_count = e - s + 1;
+ dev->device = device;
+ for (i = 0; i < dev->port_count; i++) {
+ ret = init_port(dev, s + i);
+ if (ret)
+ goto err;
+ }
+
+ ib_set_client_data(device, &sa_db_client, dev);
+
+ INIT_IB_EVENT_HANDLER(&dev->event_handler, device, handle_event);
+
+ mutex_lock(&lock);
+ list_add_tail(&dev->list, &dev_list);
+ refresh_dev_db(dev);
+ mutex_unlock(&lock);
+
+ ib_register_event_handler(&dev->event_handler);
+ return;
+err:
+ while (i--)
+ destroy_port(&dev->port[i]);
+ kfree(dev);
+}
+
+static void sa_db_remove_dev(struct ib_device *device)
+{
+ struct sa_db_device *dev;
+ int i;
+
+ dev = ib_get_client_data(device, &sa_db_client);
+ if (!dev)
+ return;
+
+ ib_unregister_event_handler(&dev->event_handler);
+ flush_workqueue(sa_wq);
+
+ for (i = 0; i < dev->port_count; i++)
+ destroy_port(&dev->port[i]);
+
+ mutex_lock(&lock);
+ list_del(&dev->list);
+ mutex_unlock(&lock);
+
+ kfree(dev);
+}
+
+int sa_db_init(void)
+{
+ int ret;
+
+ rwlock_init(&rwlock);
+ sa_wq = create_singlethread_workqueue("local_sa");
+ if (!sa_wq)
+ return -ENOMEM;
+
+ ib_sa_register_client(&sa_client);
+ ret = ib_register_client(&sa_db_client);
+ if (ret)
+ goto err;
+
+ return 0;
+
+err:
+ ib_sa_unregister_client(&sa_client);
+ destroy_workqueue(sa_wq);
+ return ret;
+}
+
+void sa_db_cleanup(void)
+{
+ ib_unregister_client(&sa_db_client);
+ ib_sa_unregister_client(&sa_client);
+ destroy_workqueue(sa_wq);
+}
diff --git a/sys/ofed/drivers/infiniband/core/mad.c b/sys/ofed/drivers/infiniband/core/mad.c
new file mode 100644
index 0000000..64e660c
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/mad.c
@@ -0,0 +1,3057 @@
+/*
+ * Copyright (c) 2004-2007 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2009 HNR Consulting. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/dma-mapping.h>
+#include <rdma/ib_cache.h>
+
+#include "mad_priv.h"
+#include "mad_rmpp.h"
+#include "smi.h"
+#include "agent.h"
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("kernel IB MAD API");
+MODULE_AUTHOR("Hal Rosenstock");
+MODULE_AUTHOR("Sean Hefty");
+
+int mad_sendq_size = IB_MAD_QP_SEND_SIZE;
+int mad_recvq_size = IB_MAD_QP_RECV_SIZE;
+
+module_param_named(send_queue_size, mad_sendq_size, int, 0444);
+MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests");
+module_param_named(recv_queue_size, mad_recvq_size, int, 0444);
+MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests");
+
+static struct kmem_cache *ib_mad_cache;
+
+static struct list_head ib_mad_port_list;
+static u32 ib_mad_client_id = 0;
+
+/* Port list lock */
+static spinlock_t ib_mad_port_list_lock;
+
+
+/* Forward declarations */
+static int method_in_use(struct ib_mad_mgmt_method_table **method,
+ struct ib_mad_reg_req *mad_reg_req);
+static void remove_mad_reg_req(struct ib_mad_agent_private *priv);
+static struct ib_mad_agent_private *find_mad_agent(
+ struct ib_mad_port_private *port_priv,
+ struct ib_mad *mad);
+static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
+ struct ib_mad_private *mad);
+static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv);
+static void timeout_sends(struct work_struct *work);
+static void local_completions(struct work_struct *work);
+static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req,
+ struct ib_mad_agent_private *agent_priv,
+ u8 mgmt_class);
+static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req,
+ struct ib_mad_agent_private *agent_priv);
+
+/*
+ * Returns a ib_mad_port_private structure or NULL for a device/port
+ * Assumes ib_mad_port_list_lock is being held
+ */
+static inline struct ib_mad_port_private *
+__ib_get_mad_port(struct ib_device *device, int port_num)
+{
+ struct ib_mad_port_private *entry;
+
+ list_for_each_entry(entry, &ib_mad_port_list, port_list) {
+ if (entry->device == device && entry->port_num == port_num)
+ return entry;
+ }
+ return NULL;
+}
+
+/*
+ * Wrapper function to return a ib_mad_port_private structure or NULL
+ * for a device/port
+ */
+static inline struct ib_mad_port_private *
+ib_get_mad_port(struct ib_device *device, int port_num)
+{
+ struct ib_mad_port_private *entry;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ib_mad_port_list_lock, flags);
+ entry = __ib_get_mad_port(device, port_num);
+ spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+
+ return entry;
+}
+
+static inline u8 convert_mgmt_class(u8 mgmt_class)
+{
+ /* Alias IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE to 0 */
+ return mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE ?
+ 0 : mgmt_class;
+}
+
+static int get_spl_qp_index(enum ib_qp_type qp_type)
+{
+ switch (qp_type)
+ {
+ case IB_QPT_SMI:
+ return 0;
+ case IB_QPT_GSI:
+ return 1;
+ default:
+ return -1;
+ }
+}
+
+static int vendor_class_index(u8 mgmt_class)
+{
+ return mgmt_class - IB_MGMT_CLASS_VENDOR_RANGE2_START;
+}
+
+static int is_vendor_class(u8 mgmt_class)
+{
+ if ((mgmt_class < IB_MGMT_CLASS_VENDOR_RANGE2_START) ||
+ (mgmt_class > IB_MGMT_CLASS_VENDOR_RANGE2_END))
+ return 0;
+ return 1;
+}
+
+static int is_vendor_oui(char *oui)
+{
+ if (oui[0] || oui[1] || oui[2])
+ return 1;
+ return 0;
+}
+
+static int is_vendor_method_in_use(
+ struct ib_mad_mgmt_vendor_class *vendor_class,
+ struct ib_mad_reg_req *mad_reg_req)
+{
+ struct ib_mad_mgmt_method_table *method;
+ int i;
+
+ for (i = 0; i < MAX_MGMT_OUI; i++) {
+ if (!memcmp(vendor_class->oui[i], mad_reg_req->oui, 3)) {
+ method = vendor_class->method_table[i];
+ if (method) {
+ if (method_in_use(&method, mad_reg_req))
+ return 1;
+ else
+ break;
+ }
+ }
+ }
+ return 0;
+}
+
+int ib_response_mad(struct ib_mad *mad)
+{
+ return ((mad->mad_hdr.method & IB_MGMT_METHOD_RESP) ||
+ (mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS) ||
+ ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_BM) &&
+ (mad->mad_hdr.attr_mod & IB_BM_ATTR_MOD_RESP)));
+}
+EXPORT_SYMBOL(ib_response_mad);
+
+static void timeout_callback(unsigned long data)
+{
+ struct ib_mad_agent_private *mad_agent_priv =
+ (struct ib_mad_agent_private *) data;
+
+ queue_work(mad_agent_priv->qp_info->port_priv->wq,
+ &mad_agent_priv->timeout_work);
+}
+
+/*
+ * ib_register_mad_agent - Register to send/receive MADs
+ */
+struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
+ u8 port_num,
+ enum ib_qp_type qp_type,
+ struct ib_mad_reg_req *mad_reg_req,
+ u8 rmpp_version,
+ ib_mad_send_handler send_handler,
+ ib_mad_recv_handler recv_handler,
+ void *context)
+{
+ struct ib_mad_port_private *port_priv;
+ struct ib_mad_agent *ret = ERR_PTR(-EINVAL);
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_reg_req *reg_req = NULL;
+ struct ib_mad_mgmt_class_table *class;
+ struct ib_mad_mgmt_vendor_class_table *vendor;
+ struct ib_mad_mgmt_vendor_class *vendor_class;
+ struct ib_mad_mgmt_method_table *method;
+ int ret2, qpn;
+ unsigned long flags;
+ u8 mgmt_class, vclass;
+
+ /* Validate parameters */
+ qpn = get_spl_qp_index(qp_type);
+ if (qpn == -1)
+ goto error1;
+
+ if (rmpp_version && rmpp_version != IB_MGMT_RMPP_VERSION)
+ goto error1;
+
+ /* Validate MAD registration request if supplied */
+ if (mad_reg_req) {
+ if (mad_reg_req->mgmt_class_version >= MAX_MGMT_VERSION)
+ goto error1;
+ if (!recv_handler)
+ goto error1;
+ if (mad_reg_req->mgmt_class >= MAX_MGMT_CLASS) {
+ /*
+ * IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE is the only
+ * one in this range currently allowed
+ */
+ if (mad_reg_req->mgmt_class !=
+ IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ goto error1;
+ } else if (mad_reg_req->mgmt_class == 0) {
+ /*
+ * Class 0 is reserved in IBA and is used for
+ * aliasing of IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
+ */
+ goto error1;
+ } else if (is_vendor_class(mad_reg_req->mgmt_class)) {
+ /*
+ * If class is in "new" vendor range,
+ * ensure supplied OUI is not zero
+ */
+ if (!is_vendor_oui(mad_reg_req->oui))
+ goto error1;
+ }
+ /* Make sure class supplied is consistent with RMPP */
+ if (!ib_is_mad_class_rmpp(mad_reg_req->mgmt_class)) {
+ if (rmpp_version)
+ goto error1;
+ }
+ /* Make sure class supplied is consistent with QP type */
+ if (qp_type == IB_QPT_SMI) {
+ if ((mad_reg_req->mgmt_class !=
+ IB_MGMT_CLASS_SUBN_LID_ROUTED) &&
+ (mad_reg_req->mgmt_class !=
+ IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE))
+ goto error1;
+ } else {
+ if ((mad_reg_req->mgmt_class ==
+ IB_MGMT_CLASS_SUBN_LID_ROUTED) ||
+ (mad_reg_req->mgmt_class ==
+ IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE))
+ goto error1;
+ }
+ } else {
+ /* No registration request supplied */
+ if (!send_handler)
+ goto error1;
+ }
+
+ /* Validate device and port */
+ port_priv = ib_get_mad_port(device, port_num);
+ if (!port_priv) {
+ ret = ERR_PTR(-ENODEV);
+ goto error1;
+ }
+
+ /* Allocate structures */
+ mad_agent_priv = kzalloc(sizeof *mad_agent_priv, GFP_KERNEL);
+ if (!mad_agent_priv) {
+ ret = ERR_PTR(-ENOMEM);
+ goto error1;
+ }
+
+ mad_agent_priv->agent.mr = ib_get_dma_mr(port_priv->qp_info[qpn].qp->pd,
+ IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(mad_agent_priv->agent.mr)) {
+ ret = ERR_PTR(-ENOMEM);
+ goto error2;
+ }
+
+ if (mad_reg_req) {
+ reg_req = kmalloc(sizeof *reg_req, GFP_KERNEL);
+ if (!reg_req) {
+ ret = ERR_PTR(-ENOMEM);
+ goto error3;
+ }
+ /* Make a copy of the MAD registration request */
+ memcpy(reg_req, mad_reg_req, sizeof *reg_req);
+ }
+
+ /* Now, fill in the various structures */
+ mad_agent_priv->qp_info = &port_priv->qp_info[qpn];
+ mad_agent_priv->reg_req = reg_req;
+ mad_agent_priv->agent.rmpp_version = rmpp_version;
+ mad_agent_priv->agent.device = device;
+ mad_agent_priv->agent.recv_handler = recv_handler;
+ mad_agent_priv->agent.send_handler = send_handler;
+ mad_agent_priv->agent.context = context;
+ mad_agent_priv->agent.qp = port_priv->qp_info[qpn].qp;
+ mad_agent_priv->agent.port_num = port_num;
+ spin_lock_init(&mad_agent_priv->lock);
+ INIT_LIST_HEAD(&mad_agent_priv->send_list);
+ INIT_LIST_HEAD(&mad_agent_priv->wait_list);
+ INIT_LIST_HEAD(&mad_agent_priv->done_list);
+ INIT_LIST_HEAD(&mad_agent_priv->rmpp_list);
+ INIT_WORK(&mad_agent_priv->timeout_work, timeout_sends);
+ setup_timer(&mad_agent_priv->timeout_timer, timeout_callback,
+ (unsigned long) mad_agent_priv);
+ INIT_LIST_HEAD(&mad_agent_priv->local_list);
+ INIT_WORK(&mad_agent_priv->local_work, local_completions);
+ atomic_set(&mad_agent_priv->refcount, 1);
+ init_completion(&mad_agent_priv->comp);
+
+ spin_lock_irqsave(&port_priv->reg_lock, flags);
+ mad_agent_priv->agent.hi_tid = ++ib_mad_client_id;
+
+ /*
+ * Make sure MAD registration (if supplied)
+ * is non overlapping with any existing ones
+ */
+ if (mad_reg_req) {
+ mgmt_class = convert_mgmt_class(mad_reg_req->mgmt_class);
+ if (!is_vendor_class(mgmt_class)) {
+ class = port_priv->version[mad_reg_req->
+ mgmt_class_version].class;
+ if (class) {
+ method = class->method_table[mgmt_class];
+ if (method) {
+ if (method_in_use(&method,
+ mad_reg_req))
+ goto error4;
+ }
+ }
+ ret2 = add_nonoui_reg_req(mad_reg_req, mad_agent_priv,
+ mgmt_class);
+ } else {
+ /* "New" vendor class range */
+ vendor = port_priv->version[mad_reg_req->
+ mgmt_class_version].vendor;
+ if (vendor) {
+ vclass = vendor_class_index(mgmt_class);
+ vendor_class = vendor->vendor_class[vclass];
+ if (vendor_class) {
+ if (is_vendor_method_in_use(
+ vendor_class,
+ mad_reg_req))
+ goto error4;
+ }
+ }
+ ret2 = add_oui_reg_req(mad_reg_req, mad_agent_priv);
+ }
+ if (ret2) {
+ ret = ERR_PTR(ret2);
+ goto error4;
+ }
+ }
+
+ /* Add mad agent into port's agent list */
+ list_add_tail(&mad_agent_priv->agent_list, &port_priv->agent_list);
+ spin_unlock_irqrestore(&port_priv->reg_lock, flags);
+
+ return &mad_agent_priv->agent;
+
+error4:
+ spin_unlock_irqrestore(&port_priv->reg_lock, flags);
+ kfree(reg_req);
+error3:
+ ib_dereg_mr(mad_agent_priv->agent.mr);
+error2:
+ kfree(mad_agent_priv);
+error1:
+ return ret;
+}
+EXPORT_SYMBOL(ib_register_mad_agent);
+
+static inline int is_snooping_sends(int mad_snoop_flags)
+{
+ return (mad_snoop_flags &
+ (/*IB_MAD_SNOOP_POSTED_SENDS |
+ IB_MAD_SNOOP_RMPP_SENDS |*/
+ IB_MAD_SNOOP_SEND_COMPLETIONS /*|
+ IB_MAD_SNOOP_RMPP_SEND_COMPLETIONS*/));
+}
+
+static inline int is_snooping_recvs(int mad_snoop_flags)
+{
+ return (mad_snoop_flags &
+ (IB_MAD_SNOOP_RECVS /*|
+ IB_MAD_SNOOP_RMPP_RECVS*/));
+}
+
+static int register_snoop_agent(struct ib_mad_qp_info *qp_info,
+ struct ib_mad_snoop_private *mad_snoop_priv)
+{
+ struct ib_mad_snoop_private **new_snoop_table;
+ unsigned long flags;
+ int i;
+
+ spin_lock_irqsave(&qp_info->snoop_lock, flags);
+ /* Check for empty slot in array. */
+ for (i = 0; i < qp_info->snoop_table_size; i++)
+ if (!qp_info->snoop_table[i])
+ break;
+
+ if (i == qp_info->snoop_table_size) {
+ /* Grow table. */
+ new_snoop_table = krealloc(qp_info->snoop_table,
+ sizeof mad_snoop_priv *
+ (qp_info->snoop_table_size + 1),
+ GFP_ATOMIC);
+ if (!new_snoop_table) {
+ i = -ENOMEM;
+ goto out;
+ }
+
+ qp_info->snoop_table = new_snoop_table;
+ qp_info->snoop_table_size++;
+ }
+ qp_info->snoop_table[i] = mad_snoop_priv;
+ atomic_inc(&qp_info->snoop_count);
+out:
+ spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+ return i;
+}
+
+struct ib_mad_agent *ib_register_mad_snoop(struct ib_device *device,
+ u8 port_num,
+ enum ib_qp_type qp_type,
+ int mad_snoop_flags,
+ ib_mad_snoop_handler snoop_handler,
+ ib_mad_recv_handler recv_handler,
+ void *context)
+{
+ struct ib_mad_port_private *port_priv;
+ struct ib_mad_agent *ret;
+ struct ib_mad_snoop_private *mad_snoop_priv;
+ int qpn;
+
+ /* Validate parameters */
+ if ((is_snooping_sends(mad_snoop_flags) && !snoop_handler) ||
+ (is_snooping_recvs(mad_snoop_flags) && !recv_handler)) {
+ ret = ERR_PTR(-EINVAL);
+ goto error1;
+ }
+ qpn = get_spl_qp_index(qp_type);
+ if (qpn == -1) {
+ ret = ERR_PTR(-EINVAL);
+ goto error1;
+ }
+ port_priv = ib_get_mad_port(device, port_num);
+ if (!port_priv) {
+ ret = ERR_PTR(-ENODEV);
+ goto error1;
+ }
+ /* Allocate structures */
+ mad_snoop_priv = kzalloc(sizeof *mad_snoop_priv, GFP_KERNEL);
+ if (!mad_snoop_priv) {
+ ret = ERR_PTR(-ENOMEM);
+ goto error1;
+ }
+
+ /* Now, fill in the various structures */
+ mad_snoop_priv->qp_info = &port_priv->qp_info[qpn];
+ mad_snoop_priv->agent.device = device;
+ mad_snoop_priv->agent.recv_handler = recv_handler;
+ mad_snoop_priv->agent.snoop_handler = snoop_handler;
+ mad_snoop_priv->agent.context = context;
+ mad_snoop_priv->agent.qp = port_priv->qp_info[qpn].qp;
+ mad_snoop_priv->agent.port_num = port_num;
+ mad_snoop_priv->mad_snoop_flags = mad_snoop_flags;
+ init_completion(&mad_snoop_priv->comp);
+ mad_snoop_priv->snoop_index = register_snoop_agent(
+ &port_priv->qp_info[qpn],
+ mad_snoop_priv);
+ if (mad_snoop_priv->snoop_index < 0) {
+ ret = ERR_PTR(mad_snoop_priv->snoop_index);
+ goto error2;
+ }
+
+ atomic_set(&mad_snoop_priv->refcount, 1);
+ return &mad_snoop_priv->agent;
+
+error2:
+ kfree(mad_snoop_priv);
+error1:
+ return ret;
+}
+EXPORT_SYMBOL(ib_register_mad_snoop);
+
+static inline void deref_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
+{
+ if (atomic_dec_and_test(&mad_agent_priv->refcount))
+ complete(&mad_agent_priv->comp);
+}
+
+static inline void deref_snoop_agent(struct ib_mad_snoop_private *mad_snoop_priv)
+{
+ if (atomic_dec_and_test(&mad_snoop_priv->refcount))
+ complete(&mad_snoop_priv->comp);
+}
+
+static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
+{
+ struct ib_mad_port_private *port_priv;
+ unsigned long flags;
+
+ /* Note that we could still be handling received MADs */
+
+ /*
+ * Canceling all sends results in dropping received response
+ * MADs, preventing us from queuing additional work
+ */
+ cancel_mads(mad_agent_priv);
+ port_priv = mad_agent_priv->qp_info->port_priv;
+ del_timer_sync(&mad_agent_priv->timeout_timer);
+ cancel_work_sync(&mad_agent_priv->timeout_work);
+
+ spin_lock_irqsave(&port_priv->reg_lock, flags);
+ remove_mad_reg_req(mad_agent_priv);
+ list_del(&mad_agent_priv->agent_list);
+ spin_unlock_irqrestore(&port_priv->reg_lock, flags);
+
+ flush_workqueue(port_priv->wq);
+ ib_cancel_rmpp_recvs(mad_agent_priv);
+
+ deref_mad_agent(mad_agent_priv);
+ wait_for_completion(&mad_agent_priv->comp);
+
+ kfree(mad_agent_priv->reg_req);
+ ib_dereg_mr(mad_agent_priv->agent.mr);
+ kfree(mad_agent_priv);
+}
+
+static void unregister_mad_snoop(struct ib_mad_snoop_private *mad_snoop_priv)
+{
+ struct ib_mad_qp_info *qp_info;
+ unsigned long flags;
+
+ qp_info = mad_snoop_priv->qp_info;
+ spin_lock_irqsave(&qp_info->snoop_lock, flags);
+ qp_info->snoop_table[mad_snoop_priv->snoop_index] = NULL;
+ atomic_dec(&qp_info->snoop_count);
+ spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+
+ deref_snoop_agent(mad_snoop_priv);
+ wait_for_completion(&mad_snoop_priv->comp);
+
+ kfree(mad_snoop_priv);
+}
+
+/*
+ * ib_unregister_mad_agent - Unregisters a client from using MAD services
+ */
+int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_snoop_private *mad_snoop_priv;
+
+ /* If the TID is zero, the agent can only snoop. */
+ if (mad_agent->hi_tid) {
+ mad_agent_priv = container_of(mad_agent,
+ struct ib_mad_agent_private,
+ agent);
+ unregister_mad_agent(mad_agent_priv);
+ } else {
+ mad_snoop_priv = container_of(mad_agent,
+ struct ib_mad_snoop_private,
+ agent);
+ unregister_mad_snoop(mad_snoop_priv);
+ }
+ return 0;
+}
+EXPORT_SYMBOL(ib_unregister_mad_agent);
+
+static void dequeue_mad(struct ib_mad_list_head *mad_list)
+{
+ struct ib_mad_queue *mad_queue;
+ unsigned long flags;
+
+ BUG_ON(!mad_list->mad_queue);
+ mad_queue = mad_list->mad_queue;
+ spin_lock_irqsave(&mad_queue->lock, flags);
+ list_del(&mad_list->list);
+ mad_queue->count--;
+ spin_unlock_irqrestore(&mad_queue->lock, flags);
+}
+
+static void snoop_send(struct ib_mad_qp_info *qp_info,
+ struct ib_mad_send_buf *send_buf,
+ struct ib_mad_send_wc *mad_send_wc,
+ int mad_snoop_flags)
+{
+ struct ib_mad_snoop_private *mad_snoop_priv;
+ unsigned long flags;
+ int i;
+
+ spin_lock_irqsave(&qp_info->snoop_lock, flags);
+ for (i = 0; i < qp_info->snoop_table_size; i++) {
+ mad_snoop_priv = qp_info->snoop_table[i];
+ if (!mad_snoop_priv ||
+ !(mad_snoop_priv->mad_snoop_flags & mad_snoop_flags))
+ continue;
+
+ atomic_inc(&mad_snoop_priv->refcount);
+ spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+ mad_snoop_priv->agent.snoop_handler(&mad_snoop_priv->agent,
+ send_buf, mad_send_wc);
+ deref_snoop_agent(mad_snoop_priv);
+ spin_lock_irqsave(&qp_info->snoop_lock, flags);
+ }
+ spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+}
+
+static void snoop_recv(struct ib_mad_qp_info *qp_info,
+ struct ib_mad_recv_wc *mad_recv_wc,
+ int mad_snoop_flags)
+{
+ struct ib_mad_snoop_private *mad_snoop_priv;
+ unsigned long flags;
+ int i;
+
+ spin_lock_irqsave(&qp_info->snoop_lock, flags);
+ for (i = 0; i < qp_info->snoop_table_size; i++) {
+ mad_snoop_priv = qp_info->snoop_table[i];
+ if (!mad_snoop_priv ||
+ !(mad_snoop_priv->mad_snoop_flags & mad_snoop_flags))
+ continue;
+
+ atomic_inc(&mad_snoop_priv->refcount);
+ spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+ mad_snoop_priv->agent.recv_handler(&mad_snoop_priv->agent,
+ mad_recv_wc);
+ deref_snoop_agent(mad_snoop_priv);
+ spin_lock_irqsave(&qp_info->snoop_lock, flags);
+ }
+ spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+}
+
+static void build_smp_wc(struct ib_qp *qp,
+ u64 wr_id, u16 slid, u16 pkey_index, u8 port_num,
+ struct ib_wc *wc)
+{
+ memset(wc, 0, sizeof *wc);
+ wc->wr_id = wr_id;
+ wc->status = IB_WC_SUCCESS;
+ wc->opcode = IB_WC_RECV;
+ wc->pkey_index = pkey_index;
+ wc->byte_len = sizeof(struct ib_mad) + sizeof(struct ib_grh);
+ wc->src_qp = IB_QP0;
+ wc->qp = qp;
+ wc->slid = slid;
+ wc->sl = 0;
+ wc->dlid_path_bits = 0;
+ wc->port_num = port_num;
+}
+
+/*
+ * Return 0 if SMP is to be sent
+ * Return 1 if SMP was consumed locally (whether or not solicited)
+ * Return < 0 if error
+ */
+static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
+ struct ib_mad_send_wr_private *mad_send_wr)
+{
+ int ret = 0;
+ struct ib_smp *smp = mad_send_wr->send_buf.mad;
+ unsigned long flags;
+ struct ib_mad_local_private *local;
+ struct ib_mad_private *mad_priv;
+ struct ib_mad_port_private *port_priv;
+ struct ib_mad_agent_private *recv_mad_agent = NULL;
+ struct ib_device *device = mad_agent_priv->agent.device;
+ u8 port_num;
+ struct ib_wc mad_wc;
+ struct ib_send_wr *send_wr = &mad_send_wr->send_wr;
+
+ if (device->node_type == RDMA_NODE_IB_SWITCH)
+ port_num = send_wr->wr.ud.port_num;
+ else
+ port_num = mad_agent_priv->agent.port_num;
+
+ /*
+ * Directed route handling starts if the initial LID routed part of
+ * a request or the ending LID routed part of a response is empty.
+ * If we are at the start of the LID routed part, don't update the
+ * hop_ptr or hop_cnt. See section 14.2.2, Vol 1 IB spec.
+ */
+ if ((ib_get_smp_direction(smp) ? smp->dr_dlid : smp->dr_slid) !=
+ IB_LID_PERMISSIVE)
+ goto out;
+ if (smi_handle_dr_smp_send(smp, device->node_type, port_num) ==
+ IB_SMI_DISCARD) {
+ ret = -EINVAL;
+ printk(KERN_ERR PFX "Invalid directed route\n");
+ goto out;
+ }
+
+ /* Check to post send on QP or process locally */
+ if (smi_check_local_smp(smp, device) == IB_SMI_DISCARD &&
+ smi_check_local_returning_smp(smp, device) == IB_SMI_DISCARD)
+ goto out;
+
+ local = kmalloc(sizeof *local, GFP_ATOMIC);
+ if (!local) {
+ ret = -ENOMEM;
+ printk(KERN_ERR PFX "No memory for ib_mad_local_private\n");
+ goto out;
+ }
+ local->mad_priv = NULL;
+ local->recv_mad_agent = NULL;
+ mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_ATOMIC);
+ if (!mad_priv) {
+ ret = -ENOMEM;
+ printk(KERN_ERR PFX "No memory for local response MAD\n");
+ kfree(local);
+ goto out;
+ }
+
+ build_smp_wc(mad_agent_priv->agent.qp,
+ send_wr->wr_id, be16_to_cpu(smp->dr_slid),
+ send_wr->wr.ud.pkey_index,
+ send_wr->wr.ud.port_num, &mad_wc);
+
+ /* No GRH for DR SMP */
+ ret = device->process_mad(device, 0, port_num, &mad_wc, NULL,
+ (struct ib_mad *)smp,
+ (struct ib_mad *)&mad_priv->mad);
+ switch (ret)
+ {
+ case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY:
+ if (ib_response_mad(&mad_priv->mad.mad) &&
+ mad_agent_priv->agent.recv_handler) {
+ local->mad_priv = mad_priv;
+ local->recv_mad_agent = mad_agent_priv;
+ /*
+ * Reference MAD agent until receive
+ * side of local completion handled
+ */
+ atomic_inc(&mad_agent_priv->refcount);
+ } else
+ kmem_cache_free(ib_mad_cache, mad_priv);
+ break;
+ case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED:
+ kmem_cache_free(ib_mad_cache, mad_priv);
+ break;
+ case IB_MAD_RESULT_SUCCESS:
+ /* Treat like an incoming receive MAD */
+ port_priv = ib_get_mad_port(mad_agent_priv->agent.device,
+ mad_agent_priv->agent.port_num);
+ if (port_priv) {
+ memcpy(&mad_priv->mad.mad, smp, sizeof(struct ib_mad));
+ recv_mad_agent = find_mad_agent(port_priv,
+ &mad_priv->mad.mad);
+ }
+ if (!port_priv || !recv_mad_agent) {
+ /*
+ * No receiving agent so drop packet and
+ * generate send completion.
+ */
+ kmem_cache_free(ib_mad_cache, mad_priv);
+ break;
+ }
+ local->mad_priv = mad_priv;
+ local->recv_mad_agent = recv_mad_agent;
+ break;
+ default:
+ kmem_cache_free(ib_mad_cache, mad_priv);
+ kfree(local);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ local->mad_send_wr = mad_send_wr;
+ /* Reference MAD agent until send side of local completion handled */
+ atomic_inc(&mad_agent_priv->refcount);
+ /* Queue local completion to local list */
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ list_add_tail(&local->completion_list, &mad_agent_priv->local_list);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ queue_work(mad_agent_priv->qp_info->port_priv->wq,
+ &mad_agent_priv->local_work);
+ ret = 1;
+out:
+ return ret;
+}
+
+static int get_pad_size(int hdr_len, int data_len)
+{
+ int seg_size, pad;
+
+ seg_size = sizeof(struct ib_mad) - hdr_len;
+ if (data_len && seg_size) {
+ pad = seg_size - data_len % seg_size;
+ return pad == seg_size ? 0 : pad;
+ } else
+ return seg_size;
+}
+
+static void free_send_rmpp_list(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_rmpp_segment *s, *t;
+
+ list_for_each_entry_safe(s, t, &mad_send_wr->rmpp_list, list) {
+ list_del(&s->list);
+ kfree(s);
+ }
+}
+
+static int alloc_send_rmpp_list(struct ib_mad_send_wr_private *send_wr,
+ gfp_t gfp_mask)
+{
+ struct ib_mad_send_buf *send_buf = &send_wr->send_buf;
+ struct ib_rmpp_mad *rmpp_mad = send_buf->mad;
+ struct ib_rmpp_segment *seg = NULL;
+ int left, seg_size, pad;
+
+ send_buf->seg_size = sizeof (struct ib_mad) - send_buf->hdr_len;
+ seg_size = send_buf->seg_size;
+ pad = send_wr->pad;
+
+ /* Allocate data segments. */
+ for (left = send_buf->data_len + pad; left > 0; left -= seg_size) {
+ seg = kmalloc(sizeof (*seg) + seg_size, gfp_mask);
+ if (!seg) {
+ printk(KERN_ERR "alloc_send_rmpp_segs: RMPP mem "
+ "alloc failed for len %zd, gfp %#x\n",
+ sizeof (*seg) + seg_size, gfp_mask);
+ free_send_rmpp_list(send_wr);
+ return -ENOMEM;
+ }
+ seg->num = ++send_buf->seg_count;
+ list_add_tail(&seg->list, &send_wr->rmpp_list);
+ }
+
+ /* Zero any padding */
+ if (pad)
+ memset(seg->data + seg_size - pad, 0, pad);
+
+ rmpp_mad->rmpp_hdr.rmpp_version = send_wr->mad_agent_priv->
+ agent.rmpp_version;
+ rmpp_mad->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_DATA;
+ ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+
+ send_wr->cur_seg = container_of(send_wr->rmpp_list.next,
+ struct ib_rmpp_segment, list);
+ send_wr->last_ack_seg = send_wr->cur_seg;
+ return 0;
+}
+
+struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent,
+ u32 remote_qpn, u16 pkey_index,
+ int rmpp_active,
+ int hdr_len, int data_len,
+ gfp_t gfp_mask)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_send_wr_private *mad_send_wr;
+ int pad, message_size, ret, size;
+ void *buf;
+
+ mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private,
+ agent);
+ pad = get_pad_size(hdr_len, data_len);
+ message_size = hdr_len + data_len + pad;
+
+ if ((!mad_agent->rmpp_version &&
+ (rmpp_active || message_size > sizeof(struct ib_mad))) ||
+ (!rmpp_active && message_size > sizeof(struct ib_mad)))
+ return ERR_PTR(-EINVAL);
+
+ size = rmpp_active ? hdr_len : sizeof(struct ib_mad);
+ buf = kzalloc(sizeof *mad_send_wr + size, gfp_mask);
+ if (!buf)
+ return ERR_PTR(-ENOMEM);
+
+ mad_send_wr = buf + size;
+ INIT_LIST_HEAD(&mad_send_wr->rmpp_list);
+ mad_send_wr->send_buf.mad = buf;
+ mad_send_wr->send_buf.hdr_len = hdr_len;
+ mad_send_wr->send_buf.data_len = data_len;
+ mad_send_wr->pad = pad;
+
+ mad_send_wr->mad_agent_priv = mad_agent_priv;
+ mad_send_wr->sg_list[0].length = hdr_len;
+ mad_send_wr->sg_list[0].lkey = mad_agent->mr->lkey;
+ mad_send_wr->sg_list[1].length = sizeof(struct ib_mad) - hdr_len;
+ mad_send_wr->sg_list[1].lkey = mad_agent->mr->lkey;
+
+ mad_send_wr->send_wr.wr_id = (unsigned long) mad_send_wr;
+ mad_send_wr->send_wr.sg_list = mad_send_wr->sg_list;
+ mad_send_wr->send_wr.num_sge = 2;
+ mad_send_wr->send_wr.opcode = IB_WR_SEND;
+ mad_send_wr->send_wr.send_flags = IB_SEND_SIGNALED;
+ mad_send_wr->send_wr.wr.ud.remote_qpn = remote_qpn;
+ mad_send_wr->send_wr.wr.ud.remote_qkey = IB_QP_SET_QKEY;
+ mad_send_wr->send_wr.wr.ud.pkey_index = pkey_index;
+
+ if (rmpp_active) {
+ ret = alloc_send_rmpp_list(mad_send_wr, gfp_mask);
+ if (ret) {
+ kfree(buf);
+ return ERR_PTR(ret);
+ }
+ }
+
+ mad_send_wr->send_buf.mad_agent = mad_agent;
+ atomic_inc(&mad_agent_priv->refcount);
+ return &mad_send_wr->send_buf;
+}
+EXPORT_SYMBOL(ib_create_send_mad);
+
+int ib_get_mad_data_offset(u8 mgmt_class)
+{
+ if (mgmt_class == IB_MGMT_CLASS_SUBN_ADM)
+ return IB_MGMT_SA_HDR;
+ else if ((mgmt_class == IB_MGMT_CLASS_DEVICE_MGMT) ||
+ (mgmt_class == IB_MGMT_CLASS_DEVICE_ADM) ||
+ (mgmt_class == IB_MGMT_CLASS_BIS))
+ return IB_MGMT_DEVICE_HDR;
+ else if ((mgmt_class >= IB_MGMT_CLASS_VENDOR_RANGE2_START) &&
+ (mgmt_class <= IB_MGMT_CLASS_VENDOR_RANGE2_END))
+ return IB_MGMT_VENDOR_HDR;
+ else
+ return IB_MGMT_MAD_HDR;
+}
+EXPORT_SYMBOL(ib_get_mad_data_offset);
+
+int ib_is_mad_class_rmpp(u8 mgmt_class)
+{
+ if ((mgmt_class == IB_MGMT_CLASS_SUBN_ADM) ||
+ (mgmt_class == IB_MGMT_CLASS_DEVICE_MGMT) ||
+ (mgmt_class == IB_MGMT_CLASS_DEVICE_ADM) ||
+ (mgmt_class == IB_MGMT_CLASS_BIS) ||
+ ((mgmt_class >= IB_MGMT_CLASS_VENDOR_RANGE2_START) &&
+ (mgmt_class <= IB_MGMT_CLASS_VENDOR_RANGE2_END)))
+ return 1;
+ return 0;
+}
+EXPORT_SYMBOL(ib_is_mad_class_rmpp);
+
+void *ib_get_rmpp_segment(struct ib_mad_send_buf *send_buf, int seg_num)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+ struct list_head *list;
+
+ mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private,
+ send_buf);
+ list = &mad_send_wr->cur_seg->list;
+
+ if (mad_send_wr->cur_seg->num < seg_num) {
+ list_for_each_entry(mad_send_wr->cur_seg, list, list)
+ if (mad_send_wr->cur_seg->num == seg_num)
+ break;
+ } else if (mad_send_wr->cur_seg->num > seg_num) {
+ list_for_each_entry_reverse(mad_send_wr->cur_seg, list, list)
+ if (mad_send_wr->cur_seg->num == seg_num)
+ break;
+ }
+ return mad_send_wr->cur_seg->data;
+}
+EXPORT_SYMBOL(ib_get_rmpp_segment);
+
+static inline void *ib_get_payload(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ if (mad_send_wr->send_buf.seg_count)
+ return ib_get_rmpp_segment(&mad_send_wr->send_buf,
+ mad_send_wr->seg_num);
+ else
+ return mad_send_wr->send_buf.mad +
+ mad_send_wr->send_buf.hdr_len;
+}
+
+void ib_free_send_mad(struct ib_mad_send_buf *send_buf)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_send_wr_private *mad_send_wr;
+
+ mad_agent_priv = container_of(send_buf->mad_agent,
+ struct ib_mad_agent_private, agent);
+ mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private,
+ send_buf);
+
+ free_send_rmpp_list(mad_send_wr);
+ kfree(send_buf->mad);
+ deref_mad_agent(mad_agent_priv);
+}
+EXPORT_SYMBOL(ib_free_send_mad);
+
+int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_mad_qp_info *qp_info;
+ struct list_head *list;
+ struct ib_send_wr *bad_send_wr;
+ struct ib_mad_agent *mad_agent;
+ struct ib_sge *sge;
+ unsigned long flags;
+ int ret;
+
+ /* Set WR ID to find mad_send_wr upon completion */
+ qp_info = mad_send_wr->mad_agent_priv->qp_info;
+ mad_send_wr->send_wr.wr_id = (unsigned long)&mad_send_wr->mad_list;
+ mad_send_wr->mad_list.mad_queue = &qp_info->send_queue;
+
+ mad_agent = mad_send_wr->send_buf.mad_agent;
+ sge = mad_send_wr->sg_list;
+ sge[0].addr = ib_dma_map_single(mad_agent->device,
+ mad_send_wr->send_buf.mad,
+ sge[0].length,
+ DMA_TO_DEVICE);
+ mad_send_wr->header_mapping = sge[0].addr;
+
+ sge[1].addr = ib_dma_map_single(mad_agent->device,
+ ib_get_payload(mad_send_wr),
+ sge[1].length,
+ DMA_TO_DEVICE);
+ mad_send_wr->payload_mapping = sge[1].addr;
+
+ spin_lock_irqsave(&qp_info->send_queue.lock, flags);
+ if (qp_info->send_queue.count < qp_info->send_queue.max_active) {
+ ret = ib_post_send(mad_agent->qp, &mad_send_wr->send_wr,
+ &bad_send_wr);
+ list = &qp_info->send_queue.list;
+ } else {
+ ret = 0;
+ list = &qp_info->overflow_list;
+ }
+
+ if (!ret) {
+ qp_info->send_queue.count++;
+ list_add_tail(&mad_send_wr->mad_list.list, list);
+ }
+ spin_unlock_irqrestore(&qp_info->send_queue.lock, flags);
+ if (ret) {
+ ib_dma_unmap_single(mad_agent->device,
+ mad_send_wr->header_mapping,
+ sge[0].length, DMA_TO_DEVICE);
+ ib_dma_unmap_single(mad_agent->device,
+ mad_send_wr->payload_mapping,
+ sge[1].length, DMA_TO_DEVICE);
+ }
+ return ret;
+}
+
+/*
+ * ib_post_send_mad - Posts MAD(s) to the send queue of the QP associated
+ * with the registered client
+ */
+int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
+ struct ib_mad_send_buf **bad_send_buf)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_send_buf *next_send_buf;
+ struct ib_mad_send_wr_private *mad_send_wr;
+ unsigned long flags;
+ int ret = -EINVAL;
+
+ /* Walk list of send WRs and post each on send list */
+ for (; send_buf; send_buf = next_send_buf) {
+
+ mad_send_wr = container_of(send_buf,
+ struct ib_mad_send_wr_private,
+ send_buf);
+ mad_agent_priv = mad_send_wr->mad_agent_priv;
+
+ if (!send_buf->mad_agent->send_handler ||
+ (send_buf->timeout_ms &&
+ !send_buf->mad_agent->recv_handler)) {
+ ret = -EINVAL;
+ goto error;
+ }
+
+ if (!ib_is_mad_class_rmpp(((struct ib_mad_hdr *) send_buf->mad)->mgmt_class)) {
+ if (mad_agent_priv->agent.rmpp_version) {
+ ret = -EINVAL;
+ goto error;
+ }
+ }
+
+ /*
+ * Save pointer to next work request to post in case the
+ * current one completes, and the user modifies the work
+ * request associated with the completion
+ */
+ next_send_buf = send_buf->next;
+ mad_send_wr->send_wr.wr.ud.ah = send_buf->ah;
+
+ if (((struct ib_mad_hdr *) send_buf->mad)->mgmt_class ==
+ IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+ ret = handle_outgoing_dr_smp(mad_agent_priv,
+ mad_send_wr);
+ if (ret < 0) /* error */
+ goto error;
+ else if (ret == 1) /* locally consumed */
+ continue;
+ }
+
+ mad_send_wr->tid = ((struct ib_mad_hdr *) send_buf->mad)->tid;
+ /* Timeout will be updated after send completes */
+ mad_send_wr->timeout = msecs_to_jiffies(send_buf->timeout_ms);
+ mad_send_wr->max_retries = send_buf->retries;
+ mad_send_wr->retries_left = send_buf->retries;
+ send_buf->retries = 0;
+ /* Reference for work request to QP + response */
+ mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0);
+ mad_send_wr->status = IB_WC_SUCCESS;
+
+ /* Reference MAD agent until send completes */
+ atomic_inc(&mad_agent_priv->refcount);
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ list_add_tail(&mad_send_wr->agent_list,
+ &mad_agent_priv->send_list);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+ if (mad_agent_priv->agent.rmpp_version) {
+ ret = ib_send_rmpp_mad(mad_send_wr);
+ if (ret >= 0 && ret != IB_RMPP_RESULT_CONSUMED)
+ ret = ib_send_mad(mad_send_wr);
+ } else
+ ret = ib_send_mad(mad_send_wr);
+ if (ret < 0) {
+ /* Fail send request */
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ list_del(&mad_send_wr->agent_list);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ atomic_dec(&mad_agent_priv->refcount);
+ goto error;
+ }
+ }
+ return 0;
+error:
+ if (bad_send_buf)
+ *bad_send_buf = send_buf;
+ return ret;
+}
+EXPORT_SYMBOL(ib_post_send_mad);
+
+/*
+ * ib_free_recv_mad - Returns data buffers used to receive
+ * a MAD to the access layer
+ */
+void ib_free_recv_mad(struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_mad_recv_buf *mad_recv_buf, *temp_recv_buf;
+ struct ib_mad_private_header *mad_priv_hdr;
+ struct ib_mad_private *priv;
+ struct list_head free_list;
+
+ INIT_LIST_HEAD(&free_list);
+ list_splice_init(&mad_recv_wc->rmpp_list, &free_list);
+
+ list_for_each_entry_safe(mad_recv_buf, temp_recv_buf,
+ &free_list, list) {
+ mad_recv_wc = container_of(mad_recv_buf, struct ib_mad_recv_wc,
+ recv_buf);
+ mad_priv_hdr = container_of(mad_recv_wc,
+ struct ib_mad_private_header,
+ recv_wc);
+ priv = container_of(mad_priv_hdr, struct ib_mad_private,
+ header);
+ kmem_cache_free(ib_mad_cache, priv);
+ }
+}
+EXPORT_SYMBOL(ib_free_recv_mad);
+
+struct ib_mad_agent *ib_redirect_mad_qp(struct ib_qp *qp,
+ u8 rmpp_version,
+ ib_mad_send_handler send_handler,
+ ib_mad_recv_handler recv_handler,
+ void *context)
+{
+ return ERR_PTR(-EINVAL); /* XXX: for now */
+}
+EXPORT_SYMBOL(ib_redirect_mad_qp);
+
+int ib_process_mad_wc(struct ib_mad_agent *mad_agent,
+ struct ib_wc *wc)
+{
+ printk(KERN_ERR PFX "ib_process_mad_wc() not implemented yet\n");
+ return 0;
+}
+EXPORT_SYMBOL(ib_process_mad_wc);
+
+static int method_in_use(struct ib_mad_mgmt_method_table **method,
+ struct ib_mad_reg_req *mad_reg_req)
+{
+ int i;
+
+ for (i = find_first_bit(mad_reg_req->method_mask, IB_MGMT_MAX_METHODS);
+ i < IB_MGMT_MAX_METHODS;
+ i = find_next_bit(mad_reg_req->method_mask, IB_MGMT_MAX_METHODS,
+ 1+i)) {
+ if ((*method)->agent[i]) {
+ printk(KERN_ERR PFX "Method %d already in use\n", i);
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+static int allocate_method_table(struct ib_mad_mgmt_method_table **method)
+{
+ /* Allocate management method table */
+ *method = kzalloc(sizeof **method, GFP_ATOMIC);
+ if (!*method) {
+ printk(KERN_ERR PFX "No memory for "
+ "ib_mad_mgmt_method_table\n");
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+/*
+ * Check to see if there are any methods still in use
+ */
+static int check_method_table(struct ib_mad_mgmt_method_table *method)
+{
+ int i;
+
+ for (i = 0; i < IB_MGMT_MAX_METHODS; i++)
+ if (method->agent[i])
+ return 1;
+ return 0;
+}
+
+/*
+ * Check to see if there are any method tables for this class still in use
+ */
+static int check_class_table(struct ib_mad_mgmt_class_table *class)
+{
+ int i;
+
+ for (i = 0; i < MAX_MGMT_CLASS; i++)
+ if (class->method_table[i])
+ return 1;
+ return 0;
+}
+
+static int check_vendor_class(struct ib_mad_mgmt_vendor_class *vendor_class)
+{
+ int i;
+
+ for (i = 0; i < MAX_MGMT_OUI; i++)
+ if (vendor_class->method_table[i])
+ return 1;
+ return 0;
+}
+
+static int find_vendor_oui(struct ib_mad_mgmt_vendor_class *vendor_class,
+ char *oui)
+{
+ int i;
+
+ for (i = 0; i < MAX_MGMT_OUI; i++)
+ /* Is there matching OUI for this vendor class ? */
+ if (!memcmp(vendor_class->oui[i], oui, 3))
+ return i;
+
+ return -1;
+}
+
+static int check_vendor_table(struct ib_mad_mgmt_vendor_class_table *vendor)
+{
+ int i;
+
+ for (i = 0; i < MAX_MGMT_VENDOR_RANGE2; i++)
+ if (vendor->vendor_class[i])
+ return 1;
+
+ return 0;
+}
+
+static void remove_methods_mad_agent(struct ib_mad_mgmt_method_table *method,
+ struct ib_mad_agent_private *agent)
+{
+ int i;
+
+ /* Remove any methods for this mad agent */
+ for (i = 0; i < IB_MGMT_MAX_METHODS; i++) {
+ if (method->agent[i] == agent) {
+ method->agent[i] = NULL;
+ }
+ }
+}
+
+static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req,
+ struct ib_mad_agent_private *agent_priv,
+ u8 mgmt_class)
+{
+ struct ib_mad_port_private *port_priv;
+ struct ib_mad_mgmt_class_table **class;
+ struct ib_mad_mgmt_method_table **method;
+ int i, ret;
+
+ port_priv = agent_priv->qp_info->port_priv;
+ class = &port_priv->version[mad_reg_req->mgmt_class_version].class;
+ if (!*class) {
+ /* Allocate management class table for "new" class version */
+ *class = kzalloc(sizeof **class, GFP_ATOMIC);
+ if (!*class) {
+ printk(KERN_ERR PFX "No memory for "
+ "ib_mad_mgmt_class_table\n");
+ ret = -ENOMEM;
+ goto error1;
+ }
+
+ /* Allocate method table for this management class */
+ method = &(*class)->method_table[mgmt_class];
+ if ((ret = allocate_method_table(method)))
+ goto error2;
+ } else {
+ method = &(*class)->method_table[mgmt_class];
+ if (!*method) {
+ /* Allocate method table for this management class */
+ if ((ret = allocate_method_table(method)))
+ goto error1;
+ }
+ }
+
+ /* Now, make sure methods are not already in use */
+ if (method_in_use(method, mad_reg_req))
+ goto error3;
+
+ /* Finally, add in methods being registered */
+ for (i = find_first_bit(mad_reg_req->method_mask,
+ IB_MGMT_MAX_METHODS);
+ i < IB_MGMT_MAX_METHODS;
+ i = find_next_bit(mad_reg_req->method_mask, IB_MGMT_MAX_METHODS,
+ 1+i)) {
+ (*method)->agent[i] = agent_priv;
+ }
+ return 0;
+
+error3:
+ /* Remove any methods for this mad agent */
+ remove_methods_mad_agent(*method, agent_priv);
+ /* Now, check to see if there are any methods in use */
+ if (!check_method_table(*method)) {
+ /* If not, release management method table */
+ kfree(*method);
+ *method = NULL;
+ }
+ ret = -EINVAL;
+ goto error1;
+error2:
+ kfree(*class);
+ *class = NULL;
+error1:
+ return ret;
+}
+
+static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req,
+ struct ib_mad_agent_private *agent_priv)
+{
+ struct ib_mad_port_private *port_priv;
+ struct ib_mad_mgmt_vendor_class_table **vendor_table;
+ struct ib_mad_mgmt_vendor_class_table *vendor = NULL;
+ struct ib_mad_mgmt_vendor_class *vendor_class = NULL;
+ struct ib_mad_mgmt_method_table **method;
+ int i, ret = -ENOMEM;
+ u8 vclass;
+
+ /* "New" vendor (with OUI) class */
+ vclass = vendor_class_index(mad_reg_req->mgmt_class);
+ port_priv = agent_priv->qp_info->port_priv;
+ vendor_table = &port_priv->version[
+ mad_reg_req->mgmt_class_version].vendor;
+ if (!*vendor_table) {
+ /* Allocate mgmt vendor class table for "new" class version */
+ vendor = kzalloc(sizeof *vendor, GFP_ATOMIC);
+ if (!vendor) {
+ printk(KERN_ERR PFX "No memory for "
+ "ib_mad_mgmt_vendor_class_table\n");
+ goto error1;
+ }
+
+ *vendor_table = vendor;
+ }
+ if (!(*vendor_table)->vendor_class[vclass]) {
+ /* Allocate table for this management vendor class */
+ vendor_class = kzalloc(sizeof *vendor_class, GFP_ATOMIC);
+ if (!vendor_class) {
+ printk(KERN_ERR PFX "No memory for "
+ "ib_mad_mgmt_vendor_class\n");
+ goto error2;
+ }
+
+ (*vendor_table)->vendor_class[vclass] = vendor_class;
+ }
+ for (i = 0; i < MAX_MGMT_OUI; i++) {
+ /* Is there matching OUI for this vendor class ? */
+ if (!memcmp((*vendor_table)->vendor_class[vclass]->oui[i],
+ mad_reg_req->oui, 3)) {
+ method = &(*vendor_table)->vendor_class[
+ vclass]->method_table[i];
+ BUG_ON(!*method);
+ goto check_in_use;
+ }
+ }
+ for (i = 0; i < MAX_MGMT_OUI; i++) {
+ /* OUI slot available ? */
+ if (!is_vendor_oui((*vendor_table)->vendor_class[
+ vclass]->oui[i])) {
+ method = &(*vendor_table)->vendor_class[
+ vclass]->method_table[i];
+ BUG_ON(*method);
+ /* Allocate method table for this OUI */
+ if ((ret = allocate_method_table(method)))
+ goto error3;
+ memcpy((*vendor_table)->vendor_class[vclass]->oui[i],
+ mad_reg_req->oui, 3);
+ goto check_in_use;
+ }
+ }
+ printk(KERN_ERR PFX "All OUI slots in use\n");
+ goto error3;
+
+check_in_use:
+ /* Now, make sure methods are not already in use */
+ if (method_in_use(method, mad_reg_req))
+ goto error4;
+
+ /* Finally, add in methods being registered */
+ for (i = find_first_bit(mad_reg_req->method_mask,
+ IB_MGMT_MAX_METHODS);
+ i < IB_MGMT_MAX_METHODS;
+ i = find_next_bit(mad_reg_req->method_mask, IB_MGMT_MAX_METHODS,
+ 1+i)) {
+ (*method)->agent[i] = agent_priv;
+ }
+ return 0;
+
+error4:
+ /* Remove any methods for this mad agent */
+ remove_methods_mad_agent(*method, agent_priv);
+ /* Now, check to see if there are any methods in use */
+ if (!check_method_table(*method)) {
+ /* If not, release management method table */
+ kfree(*method);
+ *method = NULL;
+ }
+ ret = -EINVAL;
+error3:
+ if (vendor_class) {
+ (*vendor_table)->vendor_class[vclass] = NULL;
+ kfree(vendor_class);
+ }
+error2:
+ if (vendor) {
+ *vendor_table = NULL;
+ kfree(vendor);
+ }
+error1:
+ return ret;
+}
+
+static void remove_mad_reg_req(struct ib_mad_agent_private *agent_priv)
+{
+ struct ib_mad_port_private *port_priv;
+ struct ib_mad_mgmt_class_table *class;
+ struct ib_mad_mgmt_method_table *method;
+ struct ib_mad_mgmt_vendor_class_table *vendor;
+ struct ib_mad_mgmt_vendor_class *vendor_class;
+ int index;
+ u8 mgmt_class;
+
+ /*
+ * Was MAD registration request supplied
+ * with original registration ?
+ */
+ if (!agent_priv->reg_req) {
+ goto out;
+ }
+
+ port_priv = agent_priv->qp_info->port_priv;
+ mgmt_class = convert_mgmt_class(agent_priv->reg_req->mgmt_class);
+ class = port_priv->version[
+ agent_priv->reg_req->mgmt_class_version].class;
+ if (!class)
+ goto vendor_check;
+
+ method = class->method_table[mgmt_class];
+ if (method) {
+ /* Remove any methods for this mad agent */
+ remove_methods_mad_agent(method, agent_priv);
+ /* Now, check to see if there are any methods still in use */
+ if (!check_method_table(method)) {
+ /* If not, release management method table */
+ kfree(method);
+ class->method_table[mgmt_class] = NULL;
+ /* Any management classes left ? */
+ if (!check_class_table(class)) {
+ /* If not, release management class table */
+ kfree(class);
+ port_priv->version[
+ agent_priv->reg_req->
+ mgmt_class_version].class = NULL;
+ }
+ }
+ }
+
+vendor_check:
+ if (!is_vendor_class(mgmt_class))
+ goto out;
+
+ /* normalize mgmt_class to vendor range 2 */
+ mgmt_class = vendor_class_index(agent_priv->reg_req->mgmt_class);
+ vendor = port_priv->version[
+ agent_priv->reg_req->mgmt_class_version].vendor;
+
+ if (!vendor)
+ goto out;
+
+ vendor_class = vendor->vendor_class[mgmt_class];
+ if (vendor_class) {
+ index = find_vendor_oui(vendor_class, agent_priv->reg_req->oui);
+ if (index < 0)
+ goto out;
+ method = vendor_class->method_table[index];
+ if (method) {
+ /* Remove any methods for this mad agent */
+ remove_methods_mad_agent(method, agent_priv);
+ /*
+ * Now, check to see if there are
+ * any methods still in use
+ */
+ if (!check_method_table(method)) {
+ /* If not, release management method table */
+ kfree(method);
+ vendor_class->method_table[index] = NULL;
+ memset(vendor_class->oui[index], 0, 3);
+ /* Any OUIs left ? */
+ if (!check_vendor_class(vendor_class)) {
+ /* If not, release vendor class table */
+ kfree(vendor_class);
+ vendor->vendor_class[mgmt_class] = NULL;
+ /* Any other vendor classes left ? */
+ if (!check_vendor_table(vendor)) {
+ kfree(vendor);
+ port_priv->version[
+ agent_priv->reg_req->
+ mgmt_class_version].
+ vendor = NULL;
+ }
+ }
+ }
+ }
+ }
+
+out:
+ return;
+}
+
+static struct ib_mad_agent_private *
+find_mad_agent(struct ib_mad_port_private *port_priv,
+ struct ib_mad *mad)
+{
+ struct ib_mad_agent_private *mad_agent = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&port_priv->reg_lock, flags);
+ if (ib_response_mad(mad)) {
+ u32 hi_tid;
+ struct ib_mad_agent_private *entry;
+
+ /*
+ * Routing is based on high 32 bits of transaction ID
+ * of MAD.
+ */
+ hi_tid = be64_to_cpu(mad->mad_hdr.tid) >> 32;
+ list_for_each_entry(entry, &port_priv->agent_list, agent_list) {
+ if (entry->agent.hi_tid == hi_tid) {
+ mad_agent = entry;
+ break;
+ }
+ }
+ } else {
+ struct ib_mad_mgmt_class_table *class;
+ struct ib_mad_mgmt_method_table *method;
+ struct ib_mad_mgmt_vendor_class_table *vendor;
+ struct ib_mad_mgmt_vendor_class *vendor_class;
+ struct ib_vendor_mad *vendor_mad;
+ int index;
+
+ /*
+ * Routing is based on version, class, and method
+ * For "newer" vendor MADs, also based on OUI
+ */
+ if (mad->mad_hdr.class_version >= MAX_MGMT_VERSION)
+ goto out;
+ if (!is_vendor_class(mad->mad_hdr.mgmt_class)) {
+ class = port_priv->version[
+ mad->mad_hdr.class_version].class;
+ if (!class)
+ goto out;
+ method = class->method_table[convert_mgmt_class(
+ mad->mad_hdr.mgmt_class)];
+ if (method)
+ mad_agent = method->agent[mad->mad_hdr.method &
+ ~IB_MGMT_METHOD_RESP];
+ } else {
+ vendor = port_priv->version[
+ mad->mad_hdr.class_version].vendor;
+ if (!vendor)
+ goto out;
+ vendor_class = vendor->vendor_class[vendor_class_index(
+ mad->mad_hdr.mgmt_class)];
+ if (!vendor_class)
+ goto out;
+ /* Find matching OUI */
+ vendor_mad = (struct ib_vendor_mad *)mad;
+ index = find_vendor_oui(vendor_class, vendor_mad->oui);
+ if (index == -1)
+ goto out;
+ method = vendor_class->method_table[index];
+ if (method) {
+ mad_agent = method->agent[mad->mad_hdr.method &
+ ~IB_MGMT_METHOD_RESP];
+ }
+ }
+ }
+
+ if (mad_agent) {
+ if (mad_agent->agent.recv_handler)
+ atomic_inc(&mad_agent->refcount);
+ else {
+ printk(KERN_NOTICE PFX "No receive handler for client "
+ "%p on port %d\n",
+ &mad_agent->agent, port_priv->port_num);
+ mad_agent = NULL;
+ }
+ }
+out:
+ spin_unlock_irqrestore(&port_priv->reg_lock, flags);
+
+ return mad_agent;
+}
+
+static int validate_mad(struct ib_mad *mad, u32 qp_num)
+{
+ int valid = 0;
+
+ /* Make sure MAD base version is understood */
+ if (mad->mad_hdr.base_version != IB_MGMT_BASE_VERSION) {
+ printk(KERN_ERR PFX "MAD received with unsupported base "
+ "version %d\n", mad->mad_hdr.base_version);
+ goto out;
+ }
+
+ /* Filter SMI packets sent to other than QP0 */
+ if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED) ||
+ (mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) {
+ if (qp_num == 0)
+ valid = 1;
+ } else {
+ /* Filter GSI packets sent to QP0 */
+ if (qp_num != 0)
+ valid = 1;
+ }
+
+out:
+ return valid;
+}
+
+static int is_data_mad(struct ib_mad_agent_private *mad_agent_priv,
+ struct ib_mad_hdr *mad_hdr)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+
+ rmpp_mad = (struct ib_rmpp_mad *)mad_hdr;
+ return !mad_agent_priv->agent.rmpp_version ||
+ !(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
+ IB_MGMT_RMPP_FLAG_ACTIVE) ||
+ (rmpp_mad->rmpp_hdr.rmpp_type == IB_MGMT_RMPP_TYPE_DATA);
+}
+
+static inline int rcv_has_same_class(struct ib_mad_send_wr_private *wr,
+ struct ib_mad_recv_wc *rwc)
+{
+ return ((struct ib_mad *)(wr->send_buf.mad))->mad_hdr.mgmt_class ==
+ rwc->recv_buf.mad->mad_hdr.mgmt_class;
+}
+
+static inline int rcv_has_same_gid(struct ib_mad_agent_private *mad_agent_priv,
+ struct ib_mad_send_wr_private *wr,
+ struct ib_mad_recv_wc *rwc )
+{
+ struct ib_ah_attr attr;
+ u8 send_resp, rcv_resp;
+ union ib_gid sgid;
+ struct ib_device *device = mad_agent_priv->agent.device;
+ u8 port_num = mad_agent_priv->agent.port_num;
+ u8 lmc;
+
+ send_resp = ib_response_mad((struct ib_mad *)wr->send_buf.mad);
+ rcv_resp = ib_response_mad(rwc->recv_buf.mad);
+
+ if (send_resp == rcv_resp)
+ /* both requests, or both responses. GIDs different */
+ return 0;
+
+ if (ib_query_ah(wr->send_buf.ah, &attr))
+ /* Assume not equal, to avoid false positives. */
+ return 0;
+
+ if (!!(attr.ah_flags & IB_AH_GRH) !=
+ !!(rwc->wc->wc_flags & IB_WC_GRH))
+ /* one has GID, other does not. Assume different */
+ return 0;
+
+ if (!send_resp && rcv_resp) {
+ /* is request/response. */
+ if (!(attr.ah_flags & IB_AH_GRH)) {
+ if (ib_get_cached_lmc(device, port_num, &lmc))
+ return 0;
+ return (!lmc || !((attr.src_path_bits ^
+ rwc->wc->dlid_path_bits) &
+ ((1 << lmc) - 1)));
+ } else {
+ if (ib_get_cached_gid(device, port_num,
+ attr.grh.sgid_index, &sgid))
+ return 0;
+ return !memcmp(sgid.raw, rwc->recv_buf.grh->dgid.raw,
+ 16);
+ }
+ }
+
+ if (!(attr.ah_flags & IB_AH_GRH))
+ return attr.dlid == rwc->wc->slid;
+ else
+ return !memcmp(attr.grh.dgid.raw, rwc->recv_buf.grh->sgid.raw,
+ 16);
+}
+
+static inline int is_direct(u8 class)
+{
+ return (class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE);
+}
+
+struct ib_mad_send_wr_private*
+ib_find_send_mad(struct ib_mad_agent_private *mad_agent_priv,
+ struct ib_mad_recv_wc *wc)
+{
+ struct ib_mad_send_wr_private *wr;
+ struct ib_mad *mad;
+
+ mad = (struct ib_mad *)wc->recv_buf.mad;
+
+ list_for_each_entry(wr, &mad_agent_priv->wait_list, agent_list) {
+ if ((wr->tid == mad->mad_hdr.tid) &&
+ rcv_has_same_class(wr, wc) &&
+ /*
+ * Don't check GID for direct routed MADs.
+ * These might have permissive LIDs.
+ */
+ (is_direct(wc->recv_buf.mad->mad_hdr.mgmt_class) ||
+ rcv_has_same_gid(mad_agent_priv, wr, wc)))
+ return (wr->status == IB_WC_SUCCESS) ? wr : NULL;
+ }
+
+ /*
+ * It's possible to receive the response before we've
+ * been notified that the send has completed
+ */
+ list_for_each_entry(wr, &mad_agent_priv->send_list, agent_list) {
+ if (is_data_mad(mad_agent_priv, wr->send_buf.mad) &&
+ wr->tid == mad->mad_hdr.tid &&
+ wr->timeout &&
+ rcv_has_same_class(wr, wc) &&
+ /*
+ * Don't check GID for direct routed MADs.
+ * These might have permissive LIDs.
+ */
+ (is_direct(wc->recv_buf.mad->mad_hdr.mgmt_class) ||
+ rcv_has_same_gid(mad_agent_priv, wr, wc)))
+ /* Verify request has not been canceled */
+ return (wr->status == IB_WC_SUCCESS) ? wr : NULL;
+ }
+ return NULL;
+}
+
+void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ mad_send_wr->timeout = 0;
+ if (mad_send_wr->refcount == 1)
+ list_move_tail(&mad_send_wr->agent_list,
+ &mad_send_wr->mad_agent_priv->done_list);
+}
+
+static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+ struct ib_mad_send_wc mad_send_wc;
+ unsigned long flags;
+
+ INIT_LIST_HEAD(&mad_recv_wc->rmpp_list);
+ list_add(&mad_recv_wc->recv_buf.list, &mad_recv_wc->rmpp_list);
+ if (mad_agent_priv->agent.rmpp_version) {
+ mad_recv_wc = ib_process_rmpp_recv_wc(mad_agent_priv,
+ mad_recv_wc);
+ if (!mad_recv_wc) {
+ deref_mad_agent(mad_agent_priv);
+ return;
+ }
+ }
+
+ /* Complete corresponding request */
+ if (ib_response_mad(mad_recv_wc->recv_buf.mad)) {
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ mad_send_wr = ib_find_send_mad(mad_agent_priv, mad_recv_wc);
+ if (!mad_send_wr) {
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ ib_free_recv_mad(mad_recv_wc);
+ deref_mad_agent(mad_agent_priv);
+ return;
+ }
+ ib_mark_mad_done(mad_send_wr);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+ /* Defined behavior is to complete response before request */
+ mad_recv_wc->wc->wr_id = (unsigned long) &mad_send_wr->send_buf;
+ mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent,
+ mad_recv_wc);
+ atomic_dec(&mad_agent_priv->refcount);
+
+ mad_send_wc.status = IB_WC_SUCCESS;
+ mad_send_wc.vendor_err = 0;
+ mad_send_wc.send_buf = &mad_send_wr->send_buf;
+ ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc);
+ } else {
+ mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent,
+ mad_recv_wc);
+ deref_mad_agent(mad_agent_priv);
+ }
+}
+
+static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv,
+ struct ib_wc *wc)
+{
+ struct ib_mad_qp_info *qp_info;
+ struct ib_mad_private_header *mad_priv_hdr;
+ struct ib_mad_private *recv, *response = NULL;
+ struct ib_mad_list_head *mad_list;
+ struct ib_mad_agent_private *mad_agent;
+ int port_num;
+
+ mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id;
+ qp_info = mad_list->mad_queue->qp_info;
+ dequeue_mad(mad_list);
+
+ mad_priv_hdr = container_of(mad_list, struct ib_mad_private_header,
+ mad_list);
+ recv = container_of(mad_priv_hdr, struct ib_mad_private, header);
+ ib_dma_unmap_single(port_priv->device,
+ recv->header.mapping,
+ sizeof(struct ib_mad_private) -
+ sizeof(struct ib_mad_private_header),
+ DMA_FROM_DEVICE);
+
+ /* Setup MAD receive work completion from "normal" work completion */
+ recv->header.wc = *wc;
+ recv->header.recv_wc.wc = &recv->header.wc;
+ recv->header.recv_wc.mad_len = sizeof(struct ib_mad);
+ recv->header.recv_wc.recv_buf.mad = &recv->mad.mad;
+ recv->header.recv_wc.recv_buf.grh = &recv->grh;
+
+ if (atomic_read(&qp_info->snoop_count))
+ snoop_recv(qp_info, &recv->header.recv_wc, IB_MAD_SNOOP_RECVS);
+
+ /* Validate MAD */
+ if (!validate_mad(&recv->mad.mad, qp_info->qp->qp_num))
+ goto out;
+
+ response = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL);
+ if (!response) {
+ printk(KERN_ERR PFX "ib_mad_recv_done_handler no memory "
+ "for response buffer\n");
+ goto out;
+ }
+
+ if (port_priv->device->node_type == RDMA_NODE_IB_SWITCH)
+ port_num = wc->port_num;
+ else
+ port_num = port_priv->port_num;
+
+ if (recv->mad.mad.mad_hdr.mgmt_class ==
+ IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+ enum smi_forward_action retsmi;
+
+ if (smi_handle_dr_smp_recv(&recv->mad.smp,
+ port_priv->device->node_type,
+ port_num,
+ port_priv->device->phys_port_cnt) ==
+ IB_SMI_DISCARD)
+ goto out;
+
+ retsmi = smi_check_forward_dr_smp(&recv->mad.smp);
+ if (retsmi == IB_SMI_LOCAL)
+ goto local;
+
+ if (retsmi == IB_SMI_SEND) { /* don't forward */
+ if (smi_handle_dr_smp_send(&recv->mad.smp,
+ port_priv->device->node_type,
+ port_num) == IB_SMI_DISCARD)
+ goto out;
+
+ if (smi_check_local_smp(&recv->mad.smp, port_priv->device) == IB_SMI_DISCARD)
+ goto out;
+ } else if (port_priv->device->node_type == RDMA_NODE_IB_SWITCH) {
+ /* forward case for switches */
+ memcpy(response, recv, sizeof(*response));
+ response->header.recv_wc.wc = &response->header.wc;
+ response->header.recv_wc.recv_buf.mad = &response->mad.mad;
+ response->header.recv_wc.recv_buf.grh = &response->grh;
+
+ agent_send_response(&response->mad.mad,
+ &response->grh, wc,
+ port_priv->device,
+ smi_get_fwd_port(&recv->mad.smp),
+ qp_info->qp->qp_num);
+
+ goto out;
+ }
+ }
+
+local:
+ /* Give driver "right of first refusal" on incoming MAD */
+ if (port_priv->device->process_mad) {
+ int ret;
+
+ ret = port_priv->device->process_mad(port_priv->device, 0,
+ port_priv->port_num,
+ wc, &recv->grh,
+ &recv->mad.mad,
+ &response->mad.mad);
+ if (ret & IB_MAD_RESULT_SUCCESS) {
+ if (ret & IB_MAD_RESULT_CONSUMED)
+ goto out;
+ if (ret & IB_MAD_RESULT_REPLY) {
+ agent_send_response(&response->mad.mad,
+ &recv->grh, wc,
+ port_priv->device,
+ port_num,
+ qp_info->qp->qp_num);
+ goto out;
+ }
+ }
+ }
+
+ mad_agent = find_mad_agent(port_priv, &recv->mad.mad);
+ if (mad_agent) {
+ ib_mad_complete_recv(mad_agent, &recv->header.recv_wc);
+ /*
+ * recv is freed up in error cases in ib_mad_complete_recv
+ * or via recv_handler in ib_mad_complete_recv()
+ */
+ recv = NULL;
+ }
+
+out:
+ /* Post another receive request for this QP */
+ if (response) {
+ ib_mad_post_receive_mads(qp_info, response);
+ if (recv)
+ kmem_cache_free(ib_mad_cache, recv);
+ } else
+ ib_mad_post_receive_mads(qp_info, recv);
+}
+
+static void adjust_timeout(struct ib_mad_agent_private *mad_agent_priv)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+
+ if (list_empty(&mad_agent_priv->wait_list)) {
+ del_timer(&mad_agent_priv->timeout_timer);
+ } else {
+ mad_send_wr = list_entry(mad_agent_priv->wait_list.next,
+ struct ib_mad_send_wr_private,
+ agent_list);
+
+ if (time_after(mad_agent_priv->timeout,
+ mad_send_wr->timeout)) {
+ mad_agent_priv->timeout = mad_send_wr->timeout;
+ mod_timer(&mad_agent_priv->timeout_timer,
+ mad_send_wr->timeout);
+ }
+ }
+}
+
+static void wait_for_response(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_send_wr_private *temp_mad_send_wr;
+ struct list_head *list_item;
+ unsigned long delay;
+
+ mad_agent_priv = mad_send_wr->mad_agent_priv;
+ list_del(&mad_send_wr->agent_list);
+
+ delay = mad_send_wr->timeout;
+ mad_send_wr->timeout += jiffies;
+
+ if (delay) {
+ list_for_each_prev(list_item, &mad_agent_priv->wait_list) {
+ temp_mad_send_wr = list_entry(list_item,
+ struct ib_mad_send_wr_private,
+ agent_list);
+ if (time_after(mad_send_wr->timeout,
+ temp_mad_send_wr->timeout))
+ break;
+ }
+ } else
+ list_item = &mad_agent_priv->wait_list;
+ list_add(&mad_send_wr->agent_list, list_item);
+
+ /* Reschedule a work item if we have a shorter timeout */
+ if (mad_agent_priv->wait_list.next == &mad_send_wr->agent_list)
+ mod_timer(&mad_agent_priv->timeout_timer,
+ mad_send_wr->timeout);
+}
+
+void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr,
+ int timeout_ms)
+{
+ mad_send_wr->timeout = msecs_to_jiffies(timeout_ms);
+ wait_for_response(mad_send_wr);
+}
+
+/*
+ * Process a send work completion
+ */
+void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ unsigned long flags;
+ int ret;
+
+ mad_agent_priv = mad_send_wr->mad_agent_priv;
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ if (mad_agent_priv->agent.rmpp_version) {
+ ret = ib_process_rmpp_send_wc(mad_send_wr, mad_send_wc);
+ if (ret == IB_RMPP_RESULT_CONSUMED)
+ goto done;
+ } else
+ ret = IB_RMPP_RESULT_UNHANDLED;
+
+ if (mad_send_wc->status != IB_WC_SUCCESS &&
+ mad_send_wr->status == IB_WC_SUCCESS) {
+ mad_send_wr->status = mad_send_wc->status;
+ mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
+ }
+
+ if (--mad_send_wr->refcount > 0) {
+ if (mad_send_wr->refcount == 1 && mad_send_wr->timeout &&
+ mad_send_wr->status == IB_WC_SUCCESS) {
+ wait_for_response(mad_send_wr);
+ }
+ goto done;
+ }
+
+ /* Remove send from MAD agent and notify client of completion */
+ list_del(&mad_send_wr->agent_list);
+ adjust_timeout(mad_agent_priv);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+ if (mad_send_wr->status != IB_WC_SUCCESS )
+ mad_send_wc->status = mad_send_wr->status;
+ if (ret == IB_RMPP_RESULT_INTERNAL)
+ ib_rmpp_send_handler(mad_send_wc);
+ else
+ mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+ mad_send_wc);
+
+ /* Release reference on agent taken when sending */
+ deref_mad_agent(mad_agent_priv);
+ return;
+done:
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+}
+
+static void ib_mad_send_done_handler(struct ib_mad_port_private *port_priv,
+ struct ib_wc *wc)
+{
+ struct ib_mad_send_wr_private *mad_send_wr, *queued_send_wr;
+ struct ib_mad_list_head *mad_list;
+ struct ib_mad_qp_info *qp_info;
+ struct ib_mad_queue *send_queue;
+ struct ib_send_wr *bad_send_wr;
+ struct ib_mad_send_wc mad_send_wc;
+ unsigned long flags;
+ int ret;
+
+ mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id;
+ mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private,
+ mad_list);
+ send_queue = mad_list->mad_queue;
+ qp_info = send_queue->qp_info;
+
+retry:
+ ib_dma_unmap_single(mad_send_wr->send_buf.mad_agent->device,
+ mad_send_wr->header_mapping,
+ mad_send_wr->sg_list[0].length, DMA_TO_DEVICE);
+ ib_dma_unmap_single(mad_send_wr->send_buf.mad_agent->device,
+ mad_send_wr->payload_mapping,
+ mad_send_wr->sg_list[1].length, DMA_TO_DEVICE);
+ queued_send_wr = NULL;
+ spin_lock_irqsave(&send_queue->lock, flags);
+ list_del(&mad_list->list);
+
+ /* Move queued send to the send queue */
+ if (send_queue->count-- > send_queue->max_active) {
+ mad_list = container_of(qp_info->overflow_list.next,
+ struct ib_mad_list_head, list);
+ queued_send_wr = container_of(mad_list,
+ struct ib_mad_send_wr_private,
+ mad_list);
+ list_move_tail(&mad_list->list, &send_queue->list);
+ }
+ spin_unlock_irqrestore(&send_queue->lock, flags);
+
+ mad_send_wc.send_buf = &mad_send_wr->send_buf;
+ mad_send_wc.status = wc->status;
+ mad_send_wc.vendor_err = wc->vendor_err;
+ if (atomic_read(&qp_info->snoop_count))
+ snoop_send(qp_info, &mad_send_wr->send_buf, &mad_send_wc,
+ IB_MAD_SNOOP_SEND_COMPLETIONS);
+ ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc);
+
+ if (queued_send_wr) {
+ ret = ib_post_send(qp_info->qp, &queued_send_wr->send_wr,
+ &bad_send_wr);
+ if (ret) {
+ printk(KERN_ERR PFX "ib_post_send failed: %d\n", ret);
+ mad_send_wr = queued_send_wr;
+ wc->status = IB_WC_LOC_QP_OP_ERR;
+ goto retry;
+ }
+ }
+}
+
+static void mark_sends_for_retry(struct ib_mad_qp_info *qp_info)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+ struct ib_mad_list_head *mad_list;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp_info->send_queue.lock, flags);
+ list_for_each_entry(mad_list, &qp_info->send_queue.list, list) {
+ mad_send_wr = container_of(mad_list,
+ struct ib_mad_send_wr_private,
+ mad_list);
+ mad_send_wr->retry = 1;
+ }
+ spin_unlock_irqrestore(&qp_info->send_queue.lock, flags);
+}
+
+static void mad_error_handler(struct ib_mad_port_private *port_priv,
+ struct ib_wc *wc)
+{
+ struct ib_mad_list_head *mad_list;
+ struct ib_mad_qp_info *qp_info;
+ struct ib_mad_send_wr_private *mad_send_wr;
+ int ret;
+
+ /* Determine if failure was a send or receive */
+ mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id;
+ qp_info = mad_list->mad_queue->qp_info;
+ if (mad_list->mad_queue == &qp_info->recv_queue)
+ /*
+ * Receive errors indicate that the QP has entered the error
+ * state - error handling/shutdown code will cleanup
+ */
+ return;
+
+ /*
+ * Send errors will transition the QP to SQE - move
+ * QP to RTS and repost flushed work requests
+ */
+ mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private,
+ mad_list);
+ if (wc->status == IB_WC_WR_FLUSH_ERR) {
+ if (mad_send_wr->retry) {
+ /* Repost send */
+ struct ib_send_wr *bad_send_wr;
+
+ mad_send_wr->retry = 0;
+ ret = ib_post_send(qp_info->qp, &mad_send_wr->send_wr,
+ &bad_send_wr);
+ if (ret)
+ ib_mad_send_done_handler(port_priv, wc);
+ } else
+ ib_mad_send_done_handler(port_priv, wc);
+ } else {
+ struct ib_qp_attr *attr;
+
+ /* Transition QP to RTS and fail offending send */
+ attr = kmalloc(sizeof *attr, GFP_KERNEL);
+ if (attr) {
+ attr->qp_state = IB_QPS_RTS;
+ attr->cur_qp_state = IB_QPS_SQE;
+ ret = ib_modify_qp(qp_info->qp, attr,
+ IB_QP_STATE | IB_QP_CUR_STATE);
+ kfree(attr);
+ if (ret)
+ printk(KERN_ERR PFX "mad_error_handler - "
+ "ib_modify_qp to RTS : %d\n", ret);
+ else
+ mark_sends_for_retry(qp_info);
+ }
+ ib_mad_send_done_handler(port_priv, wc);
+ }
+}
+
+/*
+ * IB MAD completion callback
+ */
+static void ib_mad_completion_handler(struct work_struct *work)
+{
+ struct ib_mad_port_private *port_priv;
+ struct ib_wc wc;
+
+ port_priv = container_of(work, struct ib_mad_port_private, work);
+ ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP);
+
+ while (ib_poll_cq(port_priv->cq, 1, &wc) == 1) {
+ if (wc.status == IB_WC_SUCCESS) {
+ switch (wc.opcode) {
+ case IB_WC_SEND:
+ ib_mad_send_done_handler(port_priv, &wc);
+ break;
+ case IB_WC_RECV:
+ ib_mad_recv_done_handler(port_priv, &wc);
+ break;
+ default:
+ BUG_ON(1);
+ break;
+ }
+ } else
+ mad_error_handler(port_priv, &wc);
+ }
+}
+
+static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv)
+{
+ unsigned long flags;
+ struct ib_mad_send_wr_private *mad_send_wr, *temp_mad_send_wr;
+ struct ib_mad_send_wc mad_send_wc;
+ struct list_head cancel_list;
+
+ INIT_LIST_HEAD(&cancel_list);
+
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr,
+ &mad_agent_priv->send_list, agent_list) {
+ if (mad_send_wr->status == IB_WC_SUCCESS) {
+ mad_send_wr->status = IB_WC_WR_FLUSH_ERR;
+ mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
+ }
+ }
+
+ /* Empty wait list to prevent receives from finding a request */
+ list_splice_init(&mad_agent_priv->wait_list, &cancel_list);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+ /* Report all cancelled requests */
+ mad_send_wc.status = IB_WC_WR_FLUSH_ERR;
+ mad_send_wc.vendor_err = 0;
+
+ list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr,
+ &cancel_list, agent_list) {
+ mad_send_wc.send_buf = &mad_send_wr->send_buf;
+ list_del(&mad_send_wr->agent_list);
+ mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+ &mad_send_wc);
+ atomic_dec(&mad_agent_priv->refcount);
+ }
+}
+
+static struct ib_mad_send_wr_private*
+find_send_wr(struct ib_mad_agent_private *mad_agent_priv,
+ struct ib_mad_send_buf *send_buf)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+
+ list_for_each_entry(mad_send_wr, &mad_agent_priv->wait_list,
+ agent_list) {
+ if (&mad_send_wr->send_buf == send_buf)
+ return mad_send_wr;
+ }
+
+ list_for_each_entry(mad_send_wr, &mad_agent_priv->send_list,
+ agent_list) {
+ if (is_data_mad(mad_agent_priv, mad_send_wr->send_buf.mad) &&
+ &mad_send_wr->send_buf == send_buf)
+ return mad_send_wr;
+ }
+ return NULL;
+}
+
+int ib_modify_mad(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_buf *send_buf, u32 timeout_ms)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_send_wr_private *mad_send_wr;
+ unsigned long flags;
+ int active;
+
+ mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private,
+ agent);
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ mad_send_wr = find_send_wr(mad_agent_priv, send_buf);
+ if (!mad_send_wr || mad_send_wr->status != IB_WC_SUCCESS) {
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ return -EINVAL;
+ }
+
+ active = (!mad_send_wr->timeout || mad_send_wr->refcount > 1);
+ if (!timeout_ms) {
+ mad_send_wr->status = IB_WC_WR_FLUSH_ERR;
+ mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
+ }
+
+ mad_send_wr->send_buf.timeout_ms = timeout_ms;
+ if (active)
+ mad_send_wr->timeout = msecs_to_jiffies(timeout_ms);
+ else
+ ib_reset_mad_timeout(mad_send_wr, timeout_ms);
+
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ return 0;
+}
+EXPORT_SYMBOL(ib_modify_mad);
+
+void ib_cancel_mad(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_buf *send_buf)
+{
+ ib_modify_mad(mad_agent, send_buf, 0);
+}
+EXPORT_SYMBOL(ib_cancel_mad);
+
+static void local_completions(struct work_struct *work)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_local_private *local;
+ struct ib_mad_agent_private *recv_mad_agent;
+ unsigned long flags;
+ int free_mad;
+ struct ib_wc wc;
+ struct ib_mad_send_wc mad_send_wc;
+
+ mad_agent_priv =
+ container_of(work, struct ib_mad_agent_private, local_work);
+
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ while (!list_empty(&mad_agent_priv->local_list)) {
+ local = list_entry(mad_agent_priv->local_list.next,
+ struct ib_mad_local_private,
+ completion_list);
+ list_del(&local->completion_list);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ free_mad = 0;
+ if (local->mad_priv) {
+ recv_mad_agent = local->recv_mad_agent;
+ if (!recv_mad_agent) {
+ printk(KERN_ERR PFX "No receive MAD agent for local completion\n");
+ free_mad = 1;
+ goto local_send_completion;
+ }
+
+ /*
+ * Defined behavior is to complete response
+ * before request
+ */
+ build_smp_wc(recv_mad_agent->agent.qp,
+ (unsigned long) local->mad_send_wr,
+ be16_to_cpu(IB_LID_PERMISSIVE),
+ 0, recv_mad_agent->agent.port_num, &wc);
+
+ local->mad_priv->header.recv_wc.wc = &wc;
+ local->mad_priv->header.recv_wc.mad_len =
+ sizeof(struct ib_mad);
+ INIT_LIST_HEAD(&local->mad_priv->header.recv_wc.rmpp_list);
+ list_add(&local->mad_priv->header.recv_wc.recv_buf.list,
+ &local->mad_priv->header.recv_wc.rmpp_list);
+ local->mad_priv->header.recv_wc.recv_buf.grh = NULL;
+ local->mad_priv->header.recv_wc.recv_buf.mad =
+ &local->mad_priv->mad.mad;
+ if (atomic_read(&recv_mad_agent->qp_info->snoop_count))
+ snoop_recv(recv_mad_agent->qp_info,
+ &local->mad_priv->header.recv_wc,
+ IB_MAD_SNOOP_RECVS);
+ recv_mad_agent->agent.recv_handler(
+ &recv_mad_agent->agent,
+ &local->mad_priv->header.recv_wc);
+ spin_lock_irqsave(&recv_mad_agent->lock, flags);
+ atomic_dec(&recv_mad_agent->refcount);
+ spin_unlock_irqrestore(&recv_mad_agent->lock, flags);
+ }
+
+local_send_completion:
+ /* Complete send */
+ mad_send_wc.status = IB_WC_SUCCESS;
+ mad_send_wc.vendor_err = 0;
+ mad_send_wc.send_buf = &local->mad_send_wr->send_buf;
+ if (atomic_read(&mad_agent_priv->qp_info->snoop_count))
+ snoop_send(mad_agent_priv->qp_info,
+ &local->mad_send_wr->send_buf,
+ &mad_send_wc, IB_MAD_SNOOP_SEND_COMPLETIONS);
+ mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+ &mad_send_wc);
+
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ atomic_dec(&mad_agent_priv->refcount);
+ if (free_mad)
+ kmem_cache_free(ib_mad_cache, local->mad_priv);
+ kfree(local);
+ }
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+}
+
+static int retry_send(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ int ret;
+
+ if (!mad_send_wr->retries_left)
+ return -ETIMEDOUT;
+
+ mad_send_wr->retries_left--;
+ mad_send_wr->send_buf.retries++;
+
+ mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms);
+
+ if (mad_send_wr->mad_agent_priv->agent.rmpp_version) {
+ ret = ib_retry_rmpp(mad_send_wr);
+ switch (ret) {
+ case IB_RMPP_RESULT_UNHANDLED:
+ ret = ib_send_mad(mad_send_wr);
+ break;
+ case IB_RMPP_RESULT_CONSUMED:
+ ret = 0;
+ break;
+ default:
+ ret = -ECOMM;
+ break;
+ }
+ } else
+ ret = ib_send_mad(mad_send_wr);
+
+ if (!ret) {
+ mad_send_wr->refcount++;
+ list_add_tail(&mad_send_wr->agent_list,
+ &mad_send_wr->mad_agent_priv->send_list);
+ }
+ return ret;
+}
+
+static void timeout_sends(struct work_struct *work)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_send_wr_private *mad_send_wr;
+ struct ib_mad_send_wc mad_send_wc;
+ unsigned long flags;
+
+ mad_agent_priv = container_of(work, struct ib_mad_agent_private,
+ timeout_work);
+ mad_send_wc.vendor_err = 0;
+
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ while (!list_empty(&mad_agent_priv->wait_list)) {
+ mad_send_wr = list_entry(mad_agent_priv->wait_list.next,
+ struct ib_mad_send_wr_private,
+ agent_list);
+
+ if (time_after(mad_send_wr->timeout, jiffies)) {
+ mod_timer(&mad_agent_priv->timeout_timer,
+ mad_send_wr->timeout);
+ break;
+ }
+
+ list_del(&mad_send_wr->agent_list);
+ if (mad_send_wr->status == IB_WC_SUCCESS &&
+ !retry_send(mad_send_wr))
+ continue;
+
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+ if (mad_send_wr->status == IB_WC_SUCCESS)
+ mad_send_wc.status = IB_WC_RESP_TIMEOUT_ERR;
+ else
+ mad_send_wc.status = mad_send_wr->status;
+ mad_send_wc.send_buf = &mad_send_wr->send_buf;
+ mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+ &mad_send_wc);
+
+ atomic_dec(&mad_agent_priv->refcount);
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ }
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+}
+
+static void ib_mad_thread_completion_handler(struct ib_cq *cq, void *arg)
+{
+ struct ib_mad_port_private *port_priv = cq->cq_context;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ib_mad_port_list_lock, flags);
+ if (!list_empty(&port_priv->port_list))
+ queue_work(port_priv->wq, &port_priv->work);
+ spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+}
+
+/*
+ * Allocate receive MADs and post receive WRs for them
+ */
+static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
+ struct ib_mad_private *mad)
+{
+ unsigned long flags;
+ int post, ret;
+ struct ib_mad_private *mad_priv;
+ struct ib_sge sg_list;
+ struct ib_recv_wr recv_wr, *bad_recv_wr;
+ struct ib_mad_queue *recv_queue = &qp_info->recv_queue;
+
+ /* Initialize common scatter list fields */
+ sg_list.length = sizeof *mad_priv - sizeof mad_priv->header;
+ sg_list.lkey = (*qp_info->port_priv->mr).lkey;
+
+ /* Initialize common receive WR fields */
+ recv_wr.next = NULL;
+ recv_wr.sg_list = &sg_list;
+ recv_wr.num_sge = 1;
+
+ do {
+ /* Allocate and map receive buffer */
+ if (mad) {
+ mad_priv = mad;
+ mad = NULL;
+ } else {
+ mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL);
+ if (!mad_priv) {
+ printk(KERN_ERR PFX "No memory for receive buffer\n");
+ ret = -ENOMEM;
+ break;
+ }
+ }
+ sg_list.addr = ib_dma_map_single(qp_info->port_priv->device,
+ &mad_priv->grh,
+ sizeof *mad_priv -
+ sizeof mad_priv->header,
+ DMA_FROM_DEVICE);
+ mad_priv->header.mapping = sg_list.addr;
+ recv_wr.wr_id = (unsigned long)&mad_priv->header.mad_list;
+ mad_priv->header.mad_list.mad_queue = recv_queue;
+
+ /* Post receive WR */
+ spin_lock_irqsave(&recv_queue->lock, flags);
+ post = (++recv_queue->count < recv_queue->max_active);
+ list_add_tail(&mad_priv->header.mad_list.list, &recv_queue->list);
+ spin_unlock_irqrestore(&recv_queue->lock, flags);
+ ret = ib_post_recv(qp_info->qp, &recv_wr, &bad_recv_wr);
+ if (ret) {
+ spin_lock_irqsave(&recv_queue->lock, flags);
+ list_del(&mad_priv->header.mad_list.list);
+ recv_queue->count--;
+ spin_unlock_irqrestore(&recv_queue->lock, flags);
+ ib_dma_unmap_single(qp_info->port_priv->device,
+ mad_priv->header.mapping,
+ sizeof *mad_priv -
+ sizeof mad_priv->header,
+ DMA_FROM_DEVICE);
+ kmem_cache_free(ib_mad_cache, mad_priv);
+ printk(KERN_ERR PFX "ib_post_recv failed: %d\n", ret);
+ break;
+ }
+ } while (post);
+
+ return ret;
+}
+
+/*
+ * Return all the posted receive MADs
+ */
+static void cleanup_recv_queue(struct ib_mad_qp_info *qp_info)
+{
+ struct ib_mad_private_header *mad_priv_hdr;
+ struct ib_mad_private *recv;
+ struct ib_mad_list_head *mad_list;
+
+ if (!qp_info->qp)
+ return;
+
+ while (!list_empty(&qp_info->recv_queue.list)) {
+
+ mad_list = list_entry(qp_info->recv_queue.list.next,
+ struct ib_mad_list_head, list);
+ mad_priv_hdr = container_of(mad_list,
+ struct ib_mad_private_header,
+ mad_list);
+ recv = container_of(mad_priv_hdr, struct ib_mad_private,
+ header);
+
+ /* Remove from posted receive MAD list */
+ list_del(&mad_list->list);
+
+ ib_dma_unmap_single(qp_info->port_priv->device,
+ recv->header.mapping,
+ sizeof(struct ib_mad_private) -
+ sizeof(struct ib_mad_private_header),
+ DMA_FROM_DEVICE);
+ kmem_cache_free(ib_mad_cache, recv);
+ }
+
+ qp_info->recv_queue.count = 0;
+}
+
+/*
+ * Start the port
+ */
+static int ib_mad_port_start(struct ib_mad_port_private *port_priv)
+{
+ int ret, i;
+ struct ib_qp_attr *attr;
+ struct ib_qp *qp;
+
+ attr = kmalloc(sizeof *attr, GFP_KERNEL);
+ if (!attr) {
+ printk(KERN_ERR PFX "Couldn't kmalloc ib_qp_attr\n");
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < IB_MAD_QPS_CORE; i++) {
+ qp = port_priv->qp_info[i].qp;
+ if (!qp)
+ continue;
+
+ /*
+ * PKey index for QP1 is irrelevant but
+ * one is needed for the Reset to Init transition
+ */
+ attr->qp_state = IB_QPS_INIT;
+ attr->pkey_index = 0;
+ attr->qkey = (qp->qp_num == 0) ? 0 : IB_QP1_QKEY;
+ ret = ib_modify_qp(qp, attr, IB_QP_STATE |
+ IB_QP_PKEY_INDEX | IB_QP_QKEY);
+ if (ret) {
+ printk(KERN_ERR PFX "Couldn't change QP%d state to "
+ "INIT: %d\n", i, ret);
+ goto out;
+ }
+
+ attr->qp_state = IB_QPS_RTR;
+ ret = ib_modify_qp(qp, attr, IB_QP_STATE);
+ if (ret) {
+ printk(KERN_ERR PFX "Couldn't change QP%d state to "
+ "RTR: %d\n", i, ret);
+ goto out;
+ }
+
+ attr->qp_state = IB_QPS_RTS;
+ attr->sq_psn = IB_MAD_SEND_Q_PSN;
+ ret = ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_SQ_PSN);
+ if (ret) {
+ printk(KERN_ERR PFX "Couldn't change QP%d state to "
+ "RTS: %d\n", i, ret);
+ goto out;
+ }
+ }
+
+ ret = ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP);
+ if (ret) {
+ printk(KERN_ERR PFX "Failed to request completion "
+ "notification: %d\n", ret);
+ goto out;
+ }
+
+ for (i = 0; i < IB_MAD_QPS_CORE; i++) {
+ if (!port_priv->qp_info[i].qp)
+ continue;
+
+ ret = ib_mad_post_receive_mads(&port_priv->qp_info[i], NULL);
+ if (ret) {
+ printk(KERN_ERR PFX "Couldn't post receive WRs\n");
+ goto out;
+ }
+ }
+out:
+ kfree(attr);
+ return ret;
+}
+
+static void qp_event_handler(struct ib_event *event, void *qp_context)
+{
+ struct ib_mad_qp_info *qp_info = qp_context;
+
+ /* It's worse than that! He's dead, Jim! */
+ printk(KERN_ERR PFX "Fatal error (%d) on MAD QP (%d)\n",
+ event->event, qp_info->qp->qp_num);
+}
+
+static void init_mad_queue(struct ib_mad_qp_info *qp_info,
+ struct ib_mad_queue *mad_queue)
+{
+ mad_queue->qp_info = qp_info;
+ mad_queue->count = 0;
+ spin_lock_init(&mad_queue->lock);
+ INIT_LIST_HEAD(&mad_queue->list);
+}
+
+static void init_mad_qp(struct ib_mad_port_private *port_priv,
+ struct ib_mad_qp_info *qp_info)
+{
+ qp_info->port_priv = port_priv;
+ init_mad_queue(qp_info, &qp_info->send_queue);
+ init_mad_queue(qp_info, &qp_info->recv_queue);
+ INIT_LIST_HEAD(&qp_info->overflow_list);
+ spin_lock_init(&qp_info->snoop_lock);
+ qp_info->snoop_table = NULL;
+ qp_info->snoop_table_size = 0;
+ atomic_set(&qp_info->snoop_count, 0);
+}
+
+static int create_mad_qp(struct ib_mad_qp_info *qp_info,
+ enum ib_qp_type qp_type)
+{
+ struct ib_qp_init_attr qp_init_attr;
+ int ret;
+
+ memset(&qp_init_attr, 0, sizeof qp_init_attr);
+ qp_init_attr.send_cq = qp_info->port_priv->cq;
+ qp_init_attr.recv_cq = qp_info->port_priv->cq;
+ qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
+ qp_init_attr.cap.max_send_wr = mad_sendq_size;
+ qp_init_attr.cap.max_recv_wr = mad_recvq_size;
+ qp_init_attr.cap.max_send_sge = IB_MAD_SEND_REQ_MAX_SG;
+ qp_init_attr.cap.max_recv_sge = IB_MAD_RECV_REQ_MAX_SG;
+ qp_init_attr.qp_type = qp_type;
+ qp_init_attr.port_num = qp_info->port_priv->port_num;
+ qp_init_attr.qp_context = qp_info;
+ qp_init_attr.event_handler = qp_event_handler;
+ qp_info->qp = ib_create_qp(qp_info->port_priv->pd, &qp_init_attr);
+ if (IS_ERR(qp_info->qp)) {
+ printk(KERN_ERR PFX "Couldn't create ib_mad QP%d\n",
+ get_spl_qp_index(qp_type));
+ ret = PTR_ERR(qp_info->qp);
+ goto error;
+ }
+ /* Use minimum queue sizes unless the CQ is resized */
+ qp_info->send_queue.max_active = mad_sendq_size;
+ qp_info->recv_queue.max_active = mad_recvq_size;
+ return 0;
+
+error:
+ return ret;
+}
+
+static void destroy_mad_qp(struct ib_mad_qp_info *qp_info)
+{
+ if (!qp_info->qp)
+ return;
+
+ ib_destroy_qp(qp_info->qp);
+ kfree(qp_info->snoop_table);
+}
+
+/*
+ * Open the port
+ * Create the QP, PD, MR, and CQ if needed
+ */
+static int ib_mad_port_open(struct ib_device *device,
+ int port_num)
+{
+ int ret, cq_size;
+ struct ib_mad_port_private *port_priv;
+ unsigned long flags;
+ char name[sizeof "ib_mad123"];
+ int has_smi;
+
+ /* Create new device info */
+ port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL);
+ if (!port_priv) {
+ printk(KERN_ERR PFX "No memory for ib_mad_port_private\n");
+ return -ENOMEM;
+ }
+
+ port_priv->device = device;
+ port_priv->port_num = port_num;
+ spin_lock_init(&port_priv->reg_lock);
+ INIT_LIST_HEAD(&port_priv->agent_list);
+ init_mad_qp(port_priv, &port_priv->qp_info[0]);
+ init_mad_qp(port_priv, &port_priv->qp_info[1]);
+
+ cq_size = mad_sendq_size + mad_recvq_size;
+ has_smi = rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_INFINIBAND;
+ if (has_smi)
+ cq_size *= 2;
+
+ port_priv->cq = ib_create_cq(port_priv->device,
+ ib_mad_thread_completion_handler,
+ NULL, port_priv, cq_size, 0);
+ if (IS_ERR(port_priv->cq)) {
+ printk(KERN_ERR PFX "Couldn't create ib_mad CQ\n");
+ ret = PTR_ERR(port_priv->cq);
+ goto error3;
+ }
+
+ port_priv->pd = ib_alloc_pd(device);
+ if (IS_ERR(port_priv->pd)) {
+ printk(KERN_ERR PFX "Couldn't create ib_mad PD\n");
+ ret = PTR_ERR(port_priv->pd);
+ goto error4;
+ }
+
+ port_priv->mr = ib_get_dma_mr(port_priv->pd, IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(port_priv->mr)) {
+ printk(KERN_ERR PFX "Couldn't get ib_mad DMA MR\n");
+ ret = PTR_ERR(port_priv->mr);
+ goto error5;
+ }
+
+ if (has_smi) {
+ ret = create_mad_qp(&port_priv->qp_info[0], IB_QPT_SMI);
+ if (ret)
+ goto error6;
+ }
+ ret = create_mad_qp(&port_priv->qp_info[1], IB_QPT_GSI);
+ if (ret)
+ goto error7;
+
+ snprintf(name, sizeof name, "ib_mad%d", port_num);
+ port_priv->wq = create_singlethread_workqueue(name);
+ if (!port_priv->wq) {
+ ret = -ENOMEM;
+ goto error8;
+ }
+ INIT_WORK(&port_priv->work, ib_mad_completion_handler);
+
+ spin_lock_irqsave(&ib_mad_port_list_lock, flags);
+ list_add_tail(&port_priv->port_list, &ib_mad_port_list);
+ spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+
+ ret = ib_mad_port_start(port_priv);
+ if (ret) {
+ printk(KERN_ERR PFX "Couldn't start port\n");
+ goto error9;
+ }
+
+ return 0;
+
+error9:
+ spin_lock_irqsave(&ib_mad_port_list_lock, flags);
+ list_del_init(&port_priv->port_list);
+ spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+
+ destroy_workqueue(port_priv->wq);
+error8:
+ destroy_mad_qp(&port_priv->qp_info[1]);
+error7:
+ destroy_mad_qp(&port_priv->qp_info[0]);
+error6:
+ ib_dereg_mr(port_priv->mr);
+error5:
+ ib_dealloc_pd(port_priv->pd);
+error4:
+ ib_destroy_cq(port_priv->cq);
+ cleanup_recv_queue(&port_priv->qp_info[1]);
+ cleanup_recv_queue(&port_priv->qp_info[0]);
+error3:
+ kfree(port_priv);
+
+ return ret;
+}
+
+/*
+ * Close the port
+ * If there are no classes using the port, free the port
+ * resources (CQ, MR, PD, QP) and remove the port's info structure
+ */
+static int ib_mad_port_close(struct ib_device *device, int port_num)
+{
+ struct ib_mad_port_private *port_priv;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ib_mad_port_list_lock, flags);
+ port_priv = __ib_get_mad_port(device, port_num);
+ if (port_priv == NULL) {
+ spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+ printk(KERN_ERR PFX "Port %d not found\n", port_num);
+ return -ENODEV;
+ }
+ list_del_init(&port_priv->port_list);
+ spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+
+ destroy_workqueue(port_priv->wq);
+ destroy_mad_qp(&port_priv->qp_info[1]);
+ destroy_mad_qp(&port_priv->qp_info[0]);
+ ib_dereg_mr(port_priv->mr);
+ ib_dealloc_pd(port_priv->pd);
+ ib_destroy_cq(port_priv->cq);
+ cleanup_recv_queue(&port_priv->qp_info[1]);
+ cleanup_recv_queue(&port_priv->qp_info[0]);
+ /* XXX: Handle deallocation of MAD registration tables */
+
+ kfree(port_priv);
+
+ return 0;
+}
+
+static void ib_mad_init_device(struct ib_device *device)
+{
+ int start, end, i;
+
+ if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+ return;
+
+ if (device->node_type == RDMA_NODE_IB_SWITCH) {
+ start = 0;
+ end = 0;
+ } else {
+ start = 1;
+ end = device->phys_port_cnt;
+ }
+
+ for (i = start; i <= end; i++) {
+ if (ib_mad_port_open(device, i)) {
+ printk(KERN_ERR PFX "Couldn't open %s port %d\n",
+ device->name, i);
+ goto error;
+ }
+ if (ib_agent_port_open(device, i)) {
+ printk(KERN_ERR PFX "Couldn't open %s port %d "
+ "for agents\n",
+ device->name, i);
+ goto error_agent;
+ }
+ }
+ return;
+
+error_agent:
+ if (ib_mad_port_close(device, i))
+ printk(KERN_ERR PFX "Couldn't close %s port %d\n",
+ device->name, i);
+
+error:
+ i--;
+
+ while (i >= start) {
+ if (ib_agent_port_close(device, i))
+ printk(KERN_ERR PFX "Couldn't close %s port %d "
+ "for agents\n",
+ device->name, i);
+ if (ib_mad_port_close(device, i))
+ printk(KERN_ERR PFX "Couldn't close %s port %d\n",
+ device->name, i);
+ i--;
+ }
+}
+
+static void ib_mad_remove_device(struct ib_device *device)
+{
+ int i, num_ports, cur_port;
+
+ if (device->node_type == RDMA_NODE_IB_SWITCH) {
+ num_ports = 1;
+ cur_port = 0;
+ } else {
+ num_ports = device->phys_port_cnt;
+ cur_port = 1;
+ }
+ for (i = 0; i < num_ports; i++, cur_port++) {
+ if (ib_agent_port_close(device, cur_port))
+ printk(KERN_ERR PFX "Couldn't close %s port %d "
+ "for agents\n",
+ device->name, cur_port);
+ if (ib_mad_port_close(device, cur_port))
+ printk(KERN_ERR PFX "Couldn't close %s port %d\n",
+ device->name, cur_port);
+ }
+}
+
+static struct ib_client mad_client = {
+ .name = "mad",
+ .add = ib_mad_init_device,
+ .remove = ib_mad_remove_device
+};
+
+static int __init ib_mad_init_module(void)
+{
+ int ret;
+
+ mad_recvq_size = min(mad_recvq_size, IB_MAD_QP_MAX_SIZE);
+ mad_recvq_size = max(mad_recvq_size, IB_MAD_QP_MIN_SIZE);
+
+ mad_sendq_size = min(mad_sendq_size, IB_MAD_QP_MAX_SIZE);
+ mad_sendq_size = max(mad_sendq_size, IB_MAD_QP_MIN_SIZE);
+
+ spin_lock_init(&ib_mad_port_list_lock);
+
+ ib_mad_cache = kmem_cache_create("ib_mad",
+ sizeof(struct ib_mad_private),
+ 0,
+ SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (!ib_mad_cache) {
+ printk(KERN_ERR PFX "Couldn't create ib_mad cache\n");
+ ret = -ENOMEM;
+ goto error1;
+ }
+
+ INIT_LIST_HEAD(&ib_mad_port_list);
+
+ if (ib_register_client(&mad_client)) {
+ printk(KERN_ERR PFX "Couldn't register ib_mad client\n");
+ ret = -EINVAL;
+ goto error2;
+ }
+
+ return 0;
+
+error2:
+ kmem_cache_destroy(ib_mad_cache);
+error1:
+ return ret;
+}
+
+static void __exit ib_mad_cleanup_module(void)
+{
+ ib_unregister_client(&mad_client);
+ kmem_cache_destroy(ib_mad_cache);
+}
+
+module_init(ib_mad_init_module);
+module_exit(ib_mad_cleanup_module);
+
diff --git a/sys/ofed/drivers/infiniband/core/mad_priv.h b/sys/ofed/drivers/infiniband/core/mad_priv.h
new file mode 100644
index 0000000..8b4df0a
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/mad_priv.h
@@ -0,0 +1,231 @@
+/*
+ * Copyright (c) 2004, 2005, Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2009 HNR Consulting. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __IB_MAD_PRIV_H__
+#define __IB_MAD_PRIV_H__
+
+#include <linux/completion.h>
+#include <linux/err.h>
+#include <linux/workqueue.h>
+#include <rdma/ib_mad.h>
+#include <rdma/ib_smi.h>
+
+
+#define PFX "ib_mad: "
+
+#define IB_MAD_QPS_CORE 2 /* Always QP0 and QP1 as a minimum */
+
+/* QP and CQ parameters */
+#define IB_MAD_QP_SEND_SIZE 128
+#define IB_MAD_QP_RECV_SIZE 512
+#define IB_MAD_QP_MIN_SIZE 64
+#define IB_MAD_QP_MAX_SIZE 8192
+#define IB_MAD_SEND_REQ_MAX_SG 2
+#define IB_MAD_RECV_REQ_MAX_SG 1
+
+#define IB_MAD_SEND_Q_PSN 0
+
+/* Registration table sizes */
+#define MAX_MGMT_CLASS 80
+#define MAX_MGMT_VERSION 8
+#define MAX_MGMT_OUI 8
+#define MAX_MGMT_VENDOR_RANGE2 (IB_MGMT_CLASS_VENDOR_RANGE2_END - \
+ IB_MGMT_CLASS_VENDOR_RANGE2_START + 1)
+
+struct ib_mad_list_head {
+ struct list_head list;
+ struct ib_mad_queue *mad_queue;
+};
+
+struct ib_mad_private_header {
+ struct ib_mad_list_head mad_list;
+ struct ib_mad_recv_wc recv_wc;
+ struct ib_wc wc;
+ u64 mapping;
+} __attribute__ ((packed));
+
+struct ib_mad_private {
+ struct ib_mad_private_header header;
+ struct ib_grh grh;
+ union {
+ struct ib_mad mad;
+ struct ib_rmpp_mad rmpp_mad;
+ struct ib_smp smp;
+ } mad;
+} __attribute__ ((packed));
+
+struct ib_rmpp_segment {
+ struct list_head list;
+ u32 num;
+ u8 data[0];
+};
+
+struct ib_mad_agent_private {
+ struct list_head agent_list;
+ struct ib_mad_agent agent;
+ struct ib_mad_reg_req *reg_req;
+ struct ib_mad_qp_info *qp_info;
+
+ spinlock_t lock;
+ struct list_head send_list;
+ struct list_head wait_list;
+ struct list_head done_list;
+ struct work_struct timeout_work;
+ struct timer_list timeout_timer;
+ unsigned long timeout;
+ struct list_head local_list;
+ struct work_struct local_work;
+ struct list_head rmpp_list;
+
+ atomic_t refcount;
+ struct completion comp;
+};
+
+struct ib_mad_snoop_private {
+ struct ib_mad_agent agent;
+ struct ib_mad_qp_info *qp_info;
+ int snoop_index;
+ int mad_snoop_flags;
+ atomic_t refcount;
+ struct completion comp;
+};
+
+struct ib_mad_send_wr_private {
+ struct ib_mad_list_head mad_list;
+ struct list_head agent_list;
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_send_buf send_buf;
+ u64 header_mapping;
+ u64 payload_mapping;
+ struct ib_send_wr send_wr;
+ struct ib_sge sg_list[IB_MAD_SEND_REQ_MAX_SG];
+ __be64 tid;
+ unsigned long timeout;
+ int max_retries;
+ int retries_left;
+ int retry;
+ int refcount;
+ enum ib_wc_status status;
+
+ /* RMPP control */
+ struct list_head rmpp_list;
+ struct ib_rmpp_segment *last_ack_seg;
+ struct ib_rmpp_segment *cur_seg;
+ int last_ack;
+ int seg_num;
+ int newwin;
+ int pad;
+};
+
+struct ib_mad_local_private {
+ struct list_head completion_list;
+ struct ib_mad_private *mad_priv;
+ struct ib_mad_agent_private *recv_mad_agent;
+ struct ib_mad_send_wr_private *mad_send_wr;
+};
+
+struct ib_mad_mgmt_method_table {
+ struct ib_mad_agent_private *agent[IB_MGMT_MAX_METHODS];
+};
+
+struct ib_mad_mgmt_class_table {
+ struct ib_mad_mgmt_method_table *method_table[MAX_MGMT_CLASS];
+};
+
+struct ib_mad_mgmt_vendor_class {
+ u8 oui[MAX_MGMT_OUI][3];
+ struct ib_mad_mgmt_method_table *method_table[MAX_MGMT_OUI];
+};
+
+struct ib_mad_mgmt_vendor_class_table {
+ struct ib_mad_mgmt_vendor_class *vendor_class[MAX_MGMT_VENDOR_RANGE2];
+};
+
+struct ib_mad_mgmt_version_table {
+ struct ib_mad_mgmt_class_table *class;
+ struct ib_mad_mgmt_vendor_class_table *vendor;
+};
+
+struct ib_mad_queue {
+ spinlock_t lock;
+ struct list_head list;
+ int count;
+ int max_active;
+ struct ib_mad_qp_info *qp_info;
+};
+
+struct ib_mad_qp_info {
+ struct ib_mad_port_private *port_priv;
+ struct ib_qp *qp;
+ struct ib_mad_queue send_queue;
+ struct ib_mad_queue recv_queue;
+ struct list_head overflow_list;
+ spinlock_t snoop_lock;
+ struct ib_mad_snoop_private **snoop_table;
+ int snoop_table_size;
+ atomic_t snoop_count;
+};
+
+struct ib_mad_port_private {
+ struct list_head port_list;
+ struct ib_device *device;
+ int port_num;
+ struct ib_cq *cq;
+ struct ib_pd *pd;
+ struct ib_mr *mr;
+
+ spinlock_t reg_lock;
+ struct ib_mad_mgmt_version_table version[MAX_MGMT_VERSION];
+ struct list_head agent_list;
+ struct workqueue_struct *wq;
+ struct work_struct work;
+ struct ib_mad_qp_info qp_info[IB_MAD_QPS_CORE];
+};
+
+int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr);
+
+struct ib_mad_send_wr_private *
+ib_find_send_mad(struct ib_mad_agent_private *mad_agent_priv,
+ struct ib_mad_recv_wc *mad_recv_wc);
+
+void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_send_wc *mad_send_wc);
+
+void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr);
+
+void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr,
+ int timeout_ms);
+
+#endif /* __IB_MAD_PRIV_H__ */
diff --git a/sys/ofed/drivers/infiniband/core/mad_rmpp.c b/sys/ofed/drivers/infiniband/core/mad_rmpp.c
new file mode 100644
index 0000000..4e0f282
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/mad_rmpp.c
@@ -0,0 +1,951 @@
+/*
+ * Copyright (c) 2005 Intel Inc. All rights reserved.
+ * Copyright (c) 2005-2006 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "mad_priv.h"
+#include "mad_rmpp.h"
+
+enum rmpp_state {
+ RMPP_STATE_ACTIVE,
+ RMPP_STATE_TIMEOUT,
+ RMPP_STATE_COMPLETE,
+ RMPP_STATE_CANCELING
+};
+
+struct mad_rmpp_recv {
+ struct ib_mad_agent_private *agent;
+ struct list_head list;
+ struct delayed_work timeout_work;
+ struct delayed_work cleanup_work;
+ struct completion comp;
+ enum rmpp_state state;
+ spinlock_t lock;
+ atomic_t refcount;
+
+ struct ib_ah *ah;
+ struct ib_mad_recv_wc *rmpp_wc;
+ struct ib_mad_recv_buf *cur_seg_buf;
+ int last_ack;
+ int seg_num;
+ int newwin;
+ int repwin;
+
+ __be64 tid;
+ u32 src_qp;
+ u16 slid;
+ u8 mgmt_class;
+ u8 class_version;
+ u8 method;
+};
+
+static inline void deref_rmpp_recv(struct mad_rmpp_recv *rmpp_recv)
+{
+ if (atomic_dec_and_test(&rmpp_recv->refcount))
+ complete(&rmpp_recv->comp);
+}
+
+static void destroy_rmpp_recv(struct mad_rmpp_recv *rmpp_recv)
+{
+ deref_rmpp_recv(rmpp_recv);
+ wait_for_completion(&rmpp_recv->comp);
+ ib_destroy_ah(rmpp_recv->ah);
+ kfree(rmpp_recv);
+}
+
+void ib_cancel_rmpp_recvs(struct ib_mad_agent_private *agent)
+{
+ struct mad_rmpp_recv *rmpp_recv, *temp_rmpp_recv;
+ unsigned long flags;
+
+ spin_lock_irqsave(&agent->lock, flags);
+ list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) {
+ if (rmpp_recv->state != RMPP_STATE_COMPLETE)
+ ib_free_recv_mad(rmpp_recv->rmpp_wc);
+ rmpp_recv->state = RMPP_STATE_CANCELING;
+ }
+ spin_unlock_irqrestore(&agent->lock, flags);
+
+ list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) {
+ cancel_delayed_work(&rmpp_recv->timeout_work);
+ cancel_delayed_work(&rmpp_recv->cleanup_work);
+ }
+
+ flush_workqueue(agent->qp_info->port_priv->wq);
+
+ list_for_each_entry_safe(rmpp_recv, temp_rmpp_recv,
+ &agent->rmpp_list, list) {
+ list_del(&rmpp_recv->list);
+ destroy_rmpp_recv(rmpp_recv);
+ }
+}
+
+static void format_ack(struct ib_mad_send_buf *msg,
+ struct ib_rmpp_mad *data,
+ struct mad_rmpp_recv *rmpp_recv)
+{
+ struct ib_rmpp_mad *ack = msg->mad;
+ unsigned long flags;
+
+ memcpy(ack, &data->mad_hdr, msg->hdr_len);
+
+ ack->mad_hdr.method ^= IB_MGMT_METHOD_RESP;
+ ack->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_ACK;
+ ib_set_rmpp_flags(&ack->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+
+ spin_lock_irqsave(&rmpp_recv->lock, flags);
+ rmpp_recv->last_ack = rmpp_recv->seg_num;
+ ack->rmpp_hdr.seg_num = cpu_to_be32(rmpp_recv->seg_num);
+ ack->rmpp_hdr.paylen_newwin = cpu_to_be32(rmpp_recv->newwin);
+ spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+}
+
+static void ack_recv(struct mad_rmpp_recv *rmpp_recv,
+ struct ib_mad_recv_wc *recv_wc)
+{
+ struct ib_mad_send_buf *msg;
+ int ret, hdr_len;
+
+ hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class);
+ msg = ib_create_send_mad(&rmpp_recv->agent->agent, recv_wc->wc->src_qp,
+ recv_wc->wc->pkey_index, 1, hdr_len,
+ 0, GFP_KERNEL);
+ if (IS_ERR(msg))
+ return;
+
+ format_ack(msg, (struct ib_rmpp_mad *) recv_wc->recv_buf.mad, rmpp_recv);
+ msg->ah = rmpp_recv->ah;
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret)
+ ib_free_send_mad(msg);
+}
+
+static struct ib_mad_send_buf *alloc_response_msg(struct ib_mad_agent *agent,
+ struct ib_mad_recv_wc *recv_wc)
+{
+ struct ib_mad_send_buf *msg;
+ struct ib_ah *ah;
+ int hdr_len;
+
+ ah = ib_create_ah_from_wc(agent->qp->pd, recv_wc->wc,
+ recv_wc->recv_buf.grh, agent->port_num);
+ if (IS_ERR(ah))
+ return (void *) ah;
+
+ hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class);
+ msg = ib_create_send_mad(agent, recv_wc->wc->src_qp,
+ recv_wc->wc->pkey_index, 1,
+ hdr_len, 0, GFP_KERNEL);
+ if (IS_ERR(msg))
+ ib_destroy_ah(ah);
+ else {
+ msg->ah = ah;
+ msg->context[0] = ah;
+ }
+
+ return msg;
+}
+
+static void ack_ds_ack(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *recv_wc)
+{
+ struct ib_mad_send_buf *msg;
+ struct ib_rmpp_mad *rmpp_mad;
+ int ret;
+
+ msg = alloc_response_msg(&agent->agent, recv_wc);
+ if (IS_ERR(msg))
+ return;
+
+ rmpp_mad = msg->mad;
+ memcpy(rmpp_mad, recv_wc->recv_buf.mad, msg->hdr_len);
+
+ rmpp_mad->mad_hdr.method ^= IB_MGMT_METHOD_RESP;
+ ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+ rmpp_mad->rmpp_hdr.seg_num = 0;
+ rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(1);
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret) {
+ ib_destroy_ah(msg->ah);
+ ib_free_send_mad(msg);
+ }
+}
+
+void ib_rmpp_send_handler(struct ib_mad_send_wc *mad_send_wc)
+{
+ if (mad_send_wc->send_buf->context[0] == mad_send_wc->send_buf->ah)
+ ib_destroy_ah(mad_send_wc->send_buf->ah);
+ ib_free_send_mad(mad_send_wc->send_buf);
+}
+
+static void nack_recv(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *recv_wc, u8 rmpp_status)
+{
+ struct ib_mad_send_buf *msg;
+ struct ib_rmpp_mad *rmpp_mad;
+ int ret;
+
+ msg = alloc_response_msg(&agent->agent, recv_wc);
+ if (IS_ERR(msg))
+ return;
+
+ rmpp_mad = msg->mad;
+ memcpy(rmpp_mad, recv_wc->recv_buf.mad, msg->hdr_len);
+
+ rmpp_mad->mad_hdr.method ^= IB_MGMT_METHOD_RESP;
+ rmpp_mad->rmpp_hdr.rmpp_version = IB_MGMT_RMPP_VERSION;
+ rmpp_mad->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_ABORT;
+ ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+ rmpp_mad->rmpp_hdr.rmpp_status = rmpp_status;
+ rmpp_mad->rmpp_hdr.seg_num = 0;
+ rmpp_mad->rmpp_hdr.paylen_newwin = 0;
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret) {
+ ib_destroy_ah(msg->ah);
+ ib_free_send_mad(msg);
+ }
+}
+
+static void recv_timeout_handler(struct work_struct *work)
+{
+ struct mad_rmpp_recv *rmpp_recv =
+ container_of(work, struct mad_rmpp_recv, timeout_work.work);
+ struct ib_mad_recv_wc *rmpp_wc;
+ unsigned long flags;
+
+ spin_lock_irqsave(&rmpp_recv->agent->lock, flags);
+ if (rmpp_recv->state != RMPP_STATE_ACTIVE) {
+ spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags);
+ return;
+ }
+ rmpp_recv->state = RMPP_STATE_TIMEOUT;
+ list_del(&rmpp_recv->list);
+ spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags);
+
+ rmpp_wc = rmpp_recv->rmpp_wc;
+ nack_recv(rmpp_recv->agent, rmpp_wc, IB_MGMT_RMPP_STATUS_T2L);
+ destroy_rmpp_recv(rmpp_recv);
+ ib_free_recv_mad(rmpp_wc);
+}
+
+static void recv_cleanup_handler(struct work_struct *work)
+{
+ struct mad_rmpp_recv *rmpp_recv =
+ container_of(work, struct mad_rmpp_recv, cleanup_work.work);
+ unsigned long flags;
+
+ spin_lock_irqsave(&rmpp_recv->agent->lock, flags);
+ if (rmpp_recv->state == RMPP_STATE_CANCELING) {
+ spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags);
+ return;
+ }
+ list_del(&rmpp_recv->list);
+ spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags);
+ destroy_rmpp_recv(rmpp_recv);
+}
+
+static struct mad_rmpp_recv *
+create_rmpp_recv(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct mad_rmpp_recv *rmpp_recv;
+ struct ib_mad_hdr *mad_hdr;
+
+ rmpp_recv = kmalloc(sizeof *rmpp_recv, GFP_KERNEL);
+ if (!rmpp_recv)
+ return NULL;
+
+ rmpp_recv->ah = ib_create_ah_from_wc(agent->agent.qp->pd,
+ mad_recv_wc->wc,
+ mad_recv_wc->recv_buf.grh,
+ agent->agent.port_num);
+ if (IS_ERR(rmpp_recv->ah))
+ goto error;
+
+ rmpp_recv->agent = agent;
+ init_completion(&rmpp_recv->comp);
+ INIT_DELAYED_WORK(&rmpp_recv->timeout_work, recv_timeout_handler);
+ INIT_DELAYED_WORK(&rmpp_recv->cleanup_work, recv_cleanup_handler);
+ spin_lock_init(&rmpp_recv->lock);
+ rmpp_recv->state = RMPP_STATE_ACTIVE;
+ atomic_set(&rmpp_recv->refcount, 1);
+
+ rmpp_recv->rmpp_wc = mad_recv_wc;
+ rmpp_recv->cur_seg_buf = &mad_recv_wc->recv_buf;
+ rmpp_recv->newwin = 1;
+ rmpp_recv->seg_num = 1;
+ rmpp_recv->last_ack = 0;
+ rmpp_recv->repwin = 1;
+
+ mad_hdr = &mad_recv_wc->recv_buf.mad->mad_hdr;
+ rmpp_recv->tid = mad_hdr->tid;
+ rmpp_recv->src_qp = mad_recv_wc->wc->src_qp;
+ rmpp_recv->slid = mad_recv_wc->wc->slid;
+ rmpp_recv->mgmt_class = mad_hdr->mgmt_class;
+ rmpp_recv->class_version = mad_hdr->class_version;
+ rmpp_recv->method = mad_hdr->method;
+ return rmpp_recv;
+
+error: kfree(rmpp_recv);
+ return NULL;
+}
+
+static struct mad_rmpp_recv *
+find_rmpp_recv(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct mad_rmpp_recv *rmpp_recv;
+ struct ib_mad_hdr *mad_hdr = &mad_recv_wc->recv_buf.mad->mad_hdr;
+
+ list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) {
+ if (rmpp_recv->tid == mad_hdr->tid &&
+ rmpp_recv->src_qp == mad_recv_wc->wc->src_qp &&
+ rmpp_recv->slid == mad_recv_wc->wc->slid &&
+ rmpp_recv->mgmt_class == mad_hdr->mgmt_class &&
+ rmpp_recv->class_version == mad_hdr->class_version &&
+ rmpp_recv->method == mad_hdr->method)
+ return rmpp_recv;
+ }
+ return NULL;
+}
+
+static struct mad_rmpp_recv *
+acquire_rmpp_recv(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct mad_rmpp_recv *rmpp_recv;
+ unsigned long flags;
+
+ spin_lock_irqsave(&agent->lock, flags);
+ rmpp_recv = find_rmpp_recv(agent, mad_recv_wc);
+ if (rmpp_recv)
+ atomic_inc(&rmpp_recv->refcount);
+ spin_unlock_irqrestore(&agent->lock, flags);
+ return rmpp_recv;
+}
+
+static struct mad_rmpp_recv *
+insert_rmpp_recv(struct ib_mad_agent_private *agent,
+ struct mad_rmpp_recv *rmpp_recv)
+{
+ struct mad_rmpp_recv *cur_rmpp_recv;
+
+ cur_rmpp_recv = find_rmpp_recv(agent, rmpp_recv->rmpp_wc);
+ if (!cur_rmpp_recv)
+ list_add_tail(&rmpp_recv->list, &agent->rmpp_list);
+
+ return cur_rmpp_recv;
+}
+
+static inline int get_last_flag(struct ib_mad_recv_buf *seg)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+
+ rmpp_mad = (struct ib_rmpp_mad *) seg->mad;
+ return ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_LAST;
+}
+
+static inline int get_seg_num(struct ib_mad_recv_buf *seg)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+
+ rmpp_mad = (struct ib_rmpp_mad *) seg->mad;
+ return be32_to_cpu(rmpp_mad->rmpp_hdr.seg_num);
+}
+
+static inline struct ib_mad_recv_buf * get_next_seg(struct list_head *rmpp_list,
+ struct ib_mad_recv_buf *seg)
+{
+ if (seg->list.next == rmpp_list)
+ return NULL;
+
+ return container_of(seg->list.next, struct ib_mad_recv_buf, list);
+}
+
+static inline int window_size(struct ib_mad_agent_private *agent)
+{
+ return max(agent->qp_info->recv_queue.max_active >> 3, 1);
+}
+
+static struct ib_mad_recv_buf * find_seg_location(struct list_head *rmpp_list,
+ int seg_num)
+{
+ struct ib_mad_recv_buf *seg_buf;
+ int cur_seg_num;
+
+ list_for_each_entry_reverse(seg_buf, rmpp_list, list) {
+ cur_seg_num = get_seg_num(seg_buf);
+ if (seg_num > cur_seg_num)
+ return seg_buf;
+ if (seg_num == cur_seg_num)
+ break;
+ }
+ return NULL;
+}
+
+static void update_seg_num(struct mad_rmpp_recv *rmpp_recv,
+ struct ib_mad_recv_buf *new_buf)
+{
+ struct list_head *rmpp_list = &rmpp_recv->rmpp_wc->rmpp_list;
+
+ while (new_buf && (get_seg_num(new_buf) == rmpp_recv->seg_num + 1)) {
+ rmpp_recv->cur_seg_buf = new_buf;
+ rmpp_recv->seg_num++;
+ new_buf = get_next_seg(rmpp_list, new_buf);
+ }
+}
+
+static inline int get_mad_len(struct mad_rmpp_recv *rmpp_recv)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+ int hdr_size, data_size, pad;
+
+ rmpp_mad = (struct ib_rmpp_mad *)rmpp_recv->cur_seg_buf->mad;
+
+ hdr_size = ib_get_mad_data_offset(rmpp_mad->mad_hdr.mgmt_class);
+ data_size = sizeof(struct ib_rmpp_mad) - hdr_size;
+ pad = IB_MGMT_RMPP_DATA - be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin);
+ if (pad > IB_MGMT_RMPP_DATA || pad < 0)
+ pad = 0;
+
+ return hdr_size + rmpp_recv->seg_num * data_size - pad;
+}
+
+static struct ib_mad_recv_wc * complete_rmpp(struct mad_rmpp_recv *rmpp_recv)
+{
+ struct ib_mad_recv_wc *rmpp_wc;
+
+ ack_recv(rmpp_recv, rmpp_recv->rmpp_wc);
+ if (rmpp_recv->seg_num > 1)
+ cancel_delayed_work(&rmpp_recv->timeout_work);
+
+ rmpp_wc = rmpp_recv->rmpp_wc;
+ rmpp_wc->mad_len = get_mad_len(rmpp_recv);
+ /* 10 seconds until we can find the packet lifetime */
+ queue_delayed_work(rmpp_recv->agent->qp_info->port_priv->wq,
+ &rmpp_recv->cleanup_work, msecs_to_jiffies(10000));
+ return rmpp_wc;
+}
+
+static struct ib_mad_recv_wc *
+continue_rmpp(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct mad_rmpp_recv *rmpp_recv;
+ struct ib_mad_recv_buf *prev_buf;
+ struct ib_mad_recv_wc *done_wc;
+ int seg_num;
+ unsigned long flags;
+
+ rmpp_recv = acquire_rmpp_recv(agent, mad_recv_wc);
+ if (!rmpp_recv)
+ goto drop1;
+
+ seg_num = get_seg_num(&mad_recv_wc->recv_buf);
+
+ spin_lock_irqsave(&rmpp_recv->lock, flags);
+ if ((rmpp_recv->state == RMPP_STATE_TIMEOUT) ||
+ (seg_num > rmpp_recv->newwin))
+ goto drop3;
+
+ if ((seg_num <= rmpp_recv->last_ack) ||
+ (rmpp_recv->state == RMPP_STATE_COMPLETE)) {
+ spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+ ack_recv(rmpp_recv, mad_recv_wc);
+ goto drop2;
+ }
+
+ prev_buf = find_seg_location(&rmpp_recv->rmpp_wc->rmpp_list, seg_num);
+ if (!prev_buf)
+ goto drop3;
+
+ done_wc = NULL;
+ list_add(&mad_recv_wc->recv_buf.list, &prev_buf->list);
+ if (rmpp_recv->cur_seg_buf == prev_buf) {
+ update_seg_num(rmpp_recv, &mad_recv_wc->recv_buf);
+ if (get_last_flag(rmpp_recv->cur_seg_buf)) {
+ rmpp_recv->state = RMPP_STATE_COMPLETE;
+ spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+ done_wc = complete_rmpp(rmpp_recv);
+ goto out;
+ } else if (rmpp_recv->seg_num == rmpp_recv->newwin) {
+ rmpp_recv->newwin += window_size(agent);
+ spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+ ack_recv(rmpp_recv, mad_recv_wc);
+ goto out;
+ }
+ }
+ spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+out:
+ deref_rmpp_recv(rmpp_recv);
+ return done_wc;
+
+drop3: spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+drop2: deref_rmpp_recv(rmpp_recv);
+drop1: ib_free_recv_mad(mad_recv_wc);
+ return NULL;
+}
+
+static struct ib_mad_recv_wc *
+start_rmpp(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct mad_rmpp_recv *rmpp_recv;
+ unsigned long flags;
+
+ rmpp_recv = create_rmpp_recv(agent, mad_recv_wc);
+ if (!rmpp_recv) {
+ ib_free_recv_mad(mad_recv_wc);
+ return NULL;
+ }
+
+ spin_lock_irqsave(&agent->lock, flags);
+ if (insert_rmpp_recv(agent, rmpp_recv)) {
+ spin_unlock_irqrestore(&agent->lock, flags);
+ /* duplicate first MAD */
+ destroy_rmpp_recv(rmpp_recv);
+ return continue_rmpp(agent, mad_recv_wc);
+ }
+ atomic_inc(&rmpp_recv->refcount);
+
+ if (get_last_flag(&mad_recv_wc->recv_buf)) {
+ rmpp_recv->state = RMPP_STATE_COMPLETE;
+ spin_unlock_irqrestore(&agent->lock, flags);
+ complete_rmpp(rmpp_recv);
+ } else {
+ spin_unlock_irqrestore(&agent->lock, flags);
+ /* 40 seconds until we can find the packet lifetimes */
+ queue_delayed_work(agent->qp_info->port_priv->wq,
+ &rmpp_recv->timeout_work,
+ msecs_to_jiffies(40000));
+ rmpp_recv->newwin += window_size(agent);
+ ack_recv(rmpp_recv, mad_recv_wc);
+ mad_recv_wc = NULL;
+ }
+ deref_rmpp_recv(rmpp_recv);
+ return mad_recv_wc;
+}
+
+static int send_next_seg(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+ int timeout;
+ u32 paylen = 0;
+
+ rmpp_mad = mad_send_wr->send_buf.mad;
+ ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+ rmpp_mad->rmpp_hdr.seg_num = cpu_to_be32(++mad_send_wr->seg_num);
+
+ if (mad_send_wr->seg_num == 1) {
+ rmpp_mad->rmpp_hdr.rmpp_rtime_flags |= IB_MGMT_RMPP_FLAG_FIRST;
+ paylen = mad_send_wr->send_buf.seg_count * IB_MGMT_RMPP_DATA -
+ mad_send_wr->pad;
+ }
+
+ if (mad_send_wr->seg_num == mad_send_wr->send_buf.seg_count) {
+ rmpp_mad->rmpp_hdr.rmpp_rtime_flags |= IB_MGMT_RMPP_FLAG_LAST;
+ paylen = IB_MGMT_RMPP_DATA - mad_send_wr->pad;
+ }
+ rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(paylen);
+
+ /* 2 seconds for an ACK until we can find the packet lifetime */
+ timeout = mad_send_wr->send_buf.timeout_ms;
+ if (!timeout || timeout > 2000)
+ mad_send_wr->timeout = msecs_to_jiffies(2000);
+
+ return ib_send_mad(mad_send_wr);
+}
+
+static void abort_send(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc, u8 rmpp_status)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+ struct ib_mad_send_wc wc;
+ unsigned long flags;
+
+ spin_lock_irqsave(&agent->lock, flags);
+ mad_send_wr = ib_find_send_mad(agent, mad_recv_wc);
+ if (!mad_send_wr)
+ goto out; /* Unmatched send */
+
+ if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) ||
+ (!mad_send_wr->timeout) || (mad_send_wr->status != IB_WC_SUCCESS))
+ goto out; /* Send is already done */
+
+ ib_mark_mad_done(mad_send_wr);
+ spin_unlock_irqrestore(&agent->lock, flags);
+
+ wc.status = IB_WC_REM_ABORT_ERR;
+ wc.vendor_err = rmpp_status;
+ wc.send_buf = &mad_send_wr->send_buf;
+ ib_mad_complete_send_wr(mad_send_wr, &wc);
+ return;
+out:
+ spin_unlock_irqrestore(&agent->lock, flags);
+}
+
+static inline void adjust_last_ack(struct ib_mad_send_wr_private *wr,
+ int seg_num)
+{
+ struct list_head *list;
+
+ wr->last_ack = seg_num;
+ list = &wr->last_ack_seg->list;
+ list_for_each_entry(wr->last_ack_seg, list, list)
+ if (wr->last_ack_seg->num == seg_num)
+ break;
+}
+
+static void process_ds_ack(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc, int newwin)
+{
+ struct mad_rmpp_recv *rmpp_recv;
+
+ rmpp_recv = find_rmpp_recv(agent, mad_recv_wc);
+ if (rmpp_recv && rmpp_recv->state == RMPP_STATE_COMPLETE)
+ rmpp_recv->repwin = newwin;
+}
+
+static void process_rmpp_ack(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+ struct ib_rmpp_mad *rmpp_mad;
+ unsigned long flags;
+ int seg_num, newwin, ret;
+
+ rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad;
+ if (rmpp_mad->rmpp_hdr.rmpp_status) {
+ abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+ nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+ return;
+ }
+
+ seg_num = be32_to_cpu(rmpp_mad->rmpp_hdr.seg_num);
+ newwin = be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin);
+ if (newwin < seg_num) {
+ abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_W2S);
+ nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_W2S);
+ return;
+ }
+
+ spin_lock_irqsave(&agent->lock, flags);
+ mad_send_wr = ib_find_send_mad(agent, mad_recv_wc);
+ if (!mad_send_wr) {
+ if (!seg_num)
+ process_ds_ack(agent, mad_recv_wc, newwin);
+ goto out; /* Unmatched or DS RMPP ACK */
+ }
+
+ if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) &&
+ (mad_send_wr->timeout)) {
+ spin_unlock_irqrestore(&agent->lock, flags);
+ ack_ds_ack(agent, mad_recv_wc);
+ return; /* Repeated ACK for DS RMPP transaction */
+ }
+
+ if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) ||
+ (!mad_send_wr->timeout) || (mad_send_wr->status != IB_WC_SUCCESS))
+ goto out; /* Send is already done */
+
+ if (seg_num > mad_send_wr->send_buf.seg_count ||
+ seg_num > mad_send_wr->newwin) {
+ spin_unlock_irqrestore(&agent->lock, flags);
+ abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_S2B);
+ nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_S2B);
+ return;
+ }
+
+ if (newwin < mad_send_wr->newwin || seg_num < mad_send_wr->last_ack)
+ goto out; /* Old ACK */
+
+ if (seg_num > mad_send_wr->last_ack) {
+ adjust_last_ack(mad_send_wr, seg_num);
+ mad_send_wr->retries_left = mad_send_wr->max_retries;
+ }
+ mad_send_wr->newwin = newwin;
+ if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) {
+ /* If no response is expected, the ACK completes the send */
+ if (!mad_send_wr->send_buf.timeout_ms) {
+ struct ib_mad_send_wc wc;
+
+ ib_mark_mad_done(mad_send_wr);
+ spin_unlock_irqrestore(&agent->lock, flags);
+
+ wc.status = IB_WC_SUCCESS;
+ wc.vendor_err = 0;
+ wc.send_buf = &mad_send_wr->send_buf;
+ ib_mad_complete_send_wr(mad_send_wr, &wc);
+ return;
+ }
+ if (mad_send_wr->refcount == 1)
+ ib_reset_mad_timeout(mad_send_wr,
+ mad_send_wr->send_buf.timeout_ms);
+ spin_unlock_irqrestore(&agent->lock, flags);
+ ack_ds_ack(agent, mad_recv_wc);
+ return;
+ } else if (mad_send_wr->refcount == 1 &&
+ mad_send_wr->seg_num < mad_send_wr->newwin &&
+ mad_send_wr->seg_num < mad_send_wr->send_buf.seg_count) {
+ /* Send failure will just result in a timeout/retry */
+ ret = send_next_seg(mad_send_wr);
+ if (ret)
+ goto out;
+
+ mad_send_wr->refcount++;
+ list_move_tail(&mad_send_wr->agent_list,
+ &mad_send_wr->mad_agent_priv->send_list);
+ }
+out:
+ spin_unlock_irqrestore(&agent->lock, flags);
+}
+
+static struct ib_mad_recv_wc *
+process_rmpp_data(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_rmpp_hdr *rmpp_hdr;
+ u8 rmpp_status;
+
+ rmpp_hdr = &((struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad)->rmpp_hdr;
+
+ if (rmpp_hdr->rmpp_status) {
+ rmpp_status = IB_MGMT_RMPP_STATUS_BAD_STATUS;
+ goto bad;
+ }
+
+ if (rmpp_hdr->seg_num == cpu_to_be32(1)) {
+ if (!(ib_get_rmpp_flags(rmpp_hdr) & IB_MGMT_RMPP_FLAG_FIRST)) {
+ rmpp_status = IB_MGMT_RMPP_STATUS_BAD_SEG;
+ goto bad;
+ }
+ return start_rmpp(agent, mad_recv_wc);
+ } else {
+ if (ib_get_rmpp_flags(rmpp_hdr) & IB_MGMT_RMPP_FLAG_FIRST) {
+ rmpp_status = IB_MGMT_RMPP_STATUS_BAD_SEG;
+ goto bad;
+ }
+ return continue_rmpp(agent, mad_recv_wc);
+ }
+bad:
+ nack_recv(agent, mad_recv_wc, rmpp_status);
+ ib_free_recv_mad(mad_recv_wc);
+ return NULL;
+}
+
+static void process_rmpp_stop(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+
+ rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad;
+
+ if (rmpp_mad->rmpp_hdr.rmpp_status != IB_MGMT_RMPP_STATUS_RESX) {
+ abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+ nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+ } else
+ abort_send(agent, mad_recv_wc, rmpp_mad->rmpp_hdr.rmpp_status);
+}
+
+static void process_rmpp_abort(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+
+ rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad;
+
+ if (rmpp_mad->rmpp_hdr.rmpp_status < IB_MGMT_RMPP_STATUS_ABORT_MIN ||
+ rmpp_mad->rmpp_hdr.rmpp_status > IB_MGMT_RMPP_STATUS_ABORT_MAX) {
+ abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+ nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+ } else
+ abort_send(agent, mad_recv_wc, rmpp_mad->rmpp_hdr.rmpp_status);
+}
+
+struct ib_mad_recv_wc *
+ib_process_rmpp_recv_wc(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+
+ rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad;
+ if (!(rmpp_mad->rmpp_hdr.rmpp_rtime_flags & IB_MGMT_RMPP_FLAG_ACTIVE))
+ return mad_recv_wc;
+
+ if (rmpp_mad->rmpp_hdr.rmpp_version != IB_MGMT_RMPP_VERSION) {
+ abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_UNV);
+ nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_UNV);
+ goto out;
+ }
+
+ switch (rmpp_mad->rmpp_hdr.rmpp_type) {
+ case IB_MGMT_RMPP_TYPE_DATA:
+ return process_rmpp_data(agent, mad_recv_wc);
+ case IB_MGMT_RMPP_TYPE_ACK:
+ process_rmpp_ack(agent, mad_recv_wc);
+ break;
+ case IB_MGMT_RMPP_TYPE_STOP:
+ process_rmpp_stop(agent, mad_recv_wc);
+ break;
+ case IB_MGMT_RMPP_TYPE_ABORT:
+ process_rmpp_abort(agent, mad_recv_wc);
+ break;
+ default:
+ abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BADT);
+ nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BADT);
+ break;
+ }
+out:
+ ib_free_recv_mad(mad_recv_wc);
+ return NULL;
+}
+
+static int init_newwin(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_mad_agent_private *agent = mad_send_wr->mad_agent_priv;
+ struct ib_mad_hdr *mad_hdr = mad_send_wr->send_buf.mad;
+ struct mad_rmpp_recv *rmpp_recv;
+ struct ib_ah_attr ah_attr;
+ unsigned long flags;
+ int newwin = 1;
+
+ if (!(mad_hdr->method & IB_MGMT_METHOD_RESP))
+ goto out;
+
+ spin_lock_irqsave(&agent->lock, flags);
+ list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) {
+ if (rmpp_recv->tid != mad_hdr->tid ||
+ rmpp_recv->mgmt_class != mad_hdr->mgmt_class ||
+ rmpp_recv->class_version != mad_hdr->class_version ||
+ (rmpp_recv->method & IB_MGMT_METHOD_RESP))
+ continue;
+
+ if (ib_query_ah(mad_send_wr->send_buf.ah, &ah_attr))
+ continue;
+
+ if (rmpp_recv->slid == ah_attr.dlid) {
+ newwin = rmpp_recv->repwin;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&agent->lock, flags);
+out:
+ return newwin;
+}
+
+int ib_send_rmpp_mad(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+ int ret;
+
+ rmpp_mad = mad_send_wr->send_buf.mad;
+ if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
+ IB_MGMT_RMPP_FLAG_ACTIVE))
+ return IB_RMPP_RESULT_UNHANDLED;
+
+ if (rmpp_mad->rmpp_hdr.rmpp_type != IB_MGMT_RMPP_TYPE_DATA) {
+ mad_send_wr->seg_num = 1;
+ return IB_RMPP_RESULT_INTERNAL;
+ }
+
+ mad_send_wr->newwin = init_newwin(mad_send_wr);
+
+ /* We need to wait for the final ACK even if there isn't a response */
+ mad_send_wr->refcount += (mad_send_wr->timeout == 0);
+ ret = send_next_seg(mad_send_wr);
+ if (!ret)
+ return IB_RMPP_RESULT_CONSUMED;
+ return ret;
+}
+
+int ib_process_rmpp_send_wc(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+ int ret;
+
+ rmpp_mad = mad_send_wr->send_buf.mad;
+ if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
+ IB_MGMT_RMPP_FLAG_ACTIVE))
+ return IB_RMPP_RESULT_UNHANDLED; /* RMPP not active */
+
+ if (rmpp_mad->rmpp_hdr.rmpp_type != IB_MGMT_RMPP_TYPE_DATA)
+ return IB_RMPP_RESULT_INTERNAL; /* ACK, STOP, or ABORT */
+
+ if (mad_send_wc->status != IB_WC_SUCCESS ||
+ mad_send_wr->status != IB_WC_SUCCESS)
+ return IB_RMPP_RESULT_PROCESSED; /* Canceled or send error */
+
+ if (!mad_send_wr->timeout)
+ return IB_RMPP_RESULT_PROCESSED; /* Response received */
+
+ if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) {
+ mad_send_wr->timeout =
+ msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms);
+ return IB_RMPP_RESULT_PROCESSED; /* Send done */
+ }
+
+ if (mad_send_wr->seg_num == mad_send_wr->newwin ||
+ mad_send_wr->seg_num == mad_send_wr->send_buf.seg_count)
+ return IB_RMPP_RESULT_PROCESSED; /* Wait for ACK */
+
+ ret = send_next_seg(mad_send_wr);
+ if (ret) {
+ mad_send_wc->status = IB_WC_GENERAL_ERR;
+ return IB_RMPP_RESULT_PROCESSED;
+ }
+ return IB_RMPP_RESULT_CONSUMED;
+}
+
+int ib_retry_rmpp(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+ int ret;
+
+ rmpp_mad = mad_send_wr->send_buf.mad;
+ if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
+ IB_MGMT_RMPP_FLAG_ACTIVE))
+ return IB_RMPP_RESULT_UNHANDLED; /* RMPP not active */
+
+ if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count)
+ return IB_RMPP_RESULT_PROCESSED;
+
+ mad_send_wr->seg_num = mad_send_wr->last_ack;
+ mad_send_wr->cur_seg = mad_send_wr->last_ack_seg;
+
+ ret = send_next_seg(mad_send_wr);
+ if (ret)
+ return IB_RMPP_RESULT_PROCESSED;
+
+ return IB_RMPP_RESULT_CONSUMED;
+}
diff --git a/sys/ofed/drivers/infiniband/core/mad_rmpp.h b/sys/ofed/drivers/infiniband/core/mad_rmpp.h
new file mode 100644
index 0000000..3d336bf
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/mad_rmpp.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2005 Intel Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __MAD_RMPP_H__
+#define __MAD_RMPP_H__
+
+enum {
+ IB_RMPP_RESULT_PROCESSED,
+ IB_RMPP_RESULT_CONSUMED,
+ IB_RMPP_RESULT_INTERNAL,
+ IB_RMPP_RESULT_UNHANDLED
+};
+
+int ib_send_rmpp_mad(struct ib_mad_send_wr_private *mad_send_wr);
+
+struct ib_mad_recv_wc *
+ib_process_rmpp_recv_wc(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc);
+
+int ib_process_rmpp_send_wc(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_send_wc *mad_send_wc);
+
+void ib_rmpp_send_handler(struct ib_mad_send_wc *mad_send_wc);
+
+void ib_cancel_rmpp_recvs(struct ib_mad_agent_private *agent);
+
+int ib_retry_rmpp(struct ib_mad_send_wr_private *mad_send_wr);
+
+#endif /* __MAD_RMPP_H__ */
diff --git a/sys/ofed/drivers/infiniband/core/multicast.c b/sys/ofed/drivers/infiniband/core/multicast.c
new file mode 100644
index 0000000..f8d7ef8
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/multicast.c
@@ -0,0 +1,868 @@
+/*
+ * Copyright (c) 2006 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/bitops.h>
+#include <linux/random.h>
+
+#include <rdma/ib_cache.h>
+#include "sa.h"
+
+static void mcast_add_one(struct ib_device *device);
+static void mcast_remove_one(struct ib_device *device);
+
+static struct ib_client mcast_client = {
+ .name = "ib_multicast",
+ .add = mcast_add_one,
+ .remove = mcast_remove_one
+};
+
+static struct ib_sa_client sa_client;
+static struct workqueue_struct *mcast_wq;
+static union ib_gid mgid0;
+
+struct mcast_device;
+
+struct mcast_port {
+ struct mcast_device *dev;
+ spinlock_t lock;
+ struct rb_root table;
+ atomic_t refcount;
+ struct completion comp;
+ u8 port_num;
+};
+
+struct mcast_device {
+ struct ib_device *device;
+ struct ib_event_handler event_handler;
+ int start_port;
+ int end_port;
+ struct mcast_port port[0];
+};
+
+enum mcast_state {
+ MCAST_JOINING,
+ MCAST_MEMBER,
+ MCAST_ERROR,
+};
+
+enum mcast_group_state {
+ MCAST_IDLE,
+ MCAST_BUSY,
+ MCAST_GROUP_ERROR,
+ MCAST_PKEY_EVENT
+};
+
+enum {
+ MCAST_INVALID_PKEY_INDEX = 0xFFFF
+};
+
+struct mcast_member;
+
+struct mcast_group {
+ struct ib_sa_mcmember_rec rec;
+ struct rb_node node;
+ struct mcast_port *port;
+ spinlock_t lock;
+ struct work_struct work;
+ struct list_head pending_list;
+ struct list_head active_list;
+ struct mcast_member *last_join;
+ int members[3];
+ atomic_t refcount;
+ enum mcast_group_state state;
+ struct ib_sa_query *query;
+ int query_id;
+ u16 pkey_index;
+ u8 leave_state;
+ int retries;
+};
+
+struct mcast_member {
+ struct ib_sa_multicast multicast;
+ struct ib_sa_client *client;
+ struct mcast_group *group;
+ struct list_head list;
+ enum mcast_state state;
+ atomic_t refcount;
+ struct completion comp;
+};
+
+static void join_handler(int status, struct ib_sa_mcmember_rec *rec,
+ void *context);
+static void leave_handler(int status, struct ib_sa_mcmember_rec *rec,
+ void *context);
+
+static struct mcast_group *mcast_find(struct mcast_port *port,
+ union ib_gid *mgid)
+{
+ struct rb_node *node = port->table.rb_node;
+ struct mcast_group *group;
+ int ret;
+
+ while (node) {
+ group = rb_entry(node, struct mcast_group, node);
+ ret = memcmp(mgid->raw, group->rec.mgid.raw, sizeof *mgid);
+ if (!ret)
+ return group;
+
+ if (ret < 0)
+ node = node->rb_left;
+ else
+ node = node->rb_right;
+ }
+ return NULL;
+}
+
+static struct mcast_group *mcast_insert(struct mcast_port *port,
+ struct mcast_group *group,
+ int allow_duplicates)
+{
+ struct rb_node **link = &port->table.rb_node;
+ struct rb_node *parent = NULL;
+ struct mcast_group *cur_group;
+ int ret;
+
+ while (*link) {
+ parent = *link;
+ cur_group = rb_entry(parent, struct mcast_group, node);
+
+ ret = memcmp(group->rec.mgid.raw, cur_group->rec.mgid.raw,
+ sizeof group->rec.mgid);
+ if (ret < 0)
+ link = &(*link)->rb_left;
+ else if (ret > 0)
+ link = &(*link)->rb_right;
+ else if (allow_duplicates)
+ link = &(*link)->rb_left;
+ else
+ return cur_group;
+ }
+ rb_link_node(&group->node, parent, link);
+ rb_insert_color(&group->node, &port->table);
+ return NULL;
+}
+
+static void deref_port(struct mcast_port *port)
+{
+ if (atomic_dec_and_test(&port->refcount))
+ complete(&port->comp);
+}
+
+static void release_group(struct mcast_group *group)
+{
+ struct mcast_port *port = group->port;
+ unsigned long flags;
+
+ spin_lock_irqsave(&port->lock, flags);
+ if (atomic_dec_and_test(&group->refcount)) {
+ rb_erase(&group->node, &port->table);
+ spin_unlock_irqrestore(&port->lock, flags);
+ kfree(group);
+ deref_port(port);
+ } else
+ spin_unlock_irqrestore(&port->lock, flags);
+}
+
+static void deref_member(struct mcast_member *member)
+{
+ if (atomic_dec_and_test(&member->refcount))
+ complete(&member->comp);
+}
+
+static void queue_join(struct mcast_member *member)
+{
+ struct mcast_group *group = member->group;
+ unsigned long flags;
+
+ spin_lock_irqsave(&group->lock, flags);
+ list_add_tail(&member->list, &group->pending_list);
+ if (group->state == MCAST_IDLE) {
+ group->state = MCAST_BUSY;
+ atomic_inc(&group->refcount);
+ queue_work(mcast_wq, &group->work);
+ }
+ spin_unlock_irqrestore(&group->lock, flags);
+}
+
+/*
+ * A multicast group has three types of members: full member, non member, and
+ * send only member. We need to keep track of the number of members of each
+ * type based on their join state. Adjust the number of members the belong to
+ * the specified join states.
+ */
+static void adjust_membership(struct mcast_group *group, u8 join_state, int inc)
+{
+ int i;
+
+ for (i = 0; i < 3; i++, join_state >>= 1)
+ if (join_state & 0x1)
+ group->members[i] += inc;
+}
+
+/*
+ * If a multicast group has zero members left for a particular join state, but
+ * the group is still a member with the SA, we need to leave that join state.
+ * Determine which join states we still belong to, but that do not have any
+ * active members.
+ */
+static u8 get_leave_state(struct mcast_group *group)
+{
+ u8 leave_state = 0;
+ int i;
+
+ for (i = 0; i < 3; i++)
+ if (!group->members[i])
+ leave_state |= (0x1 << i);
+
+ return leave_state & group->rec.join_state;
+}
+
+static int cmp_rec(struct ib_sa_mcmember_rec *src,
+ struct ib_sa_mcmember_rec *dst, ib_sa_comp_mask comp_mask)
+{
+ /* MGID must already match */
+
+ if (comp_mask & IB_SA_MCMEMBER_REC_PORT_GID &&
+ memcmp(&src->port_gid, &dst->port_gid, sizeof src->port_gid))
+ return -EINVAL;
+ if (comp_mask & IB_SA_MCMEMBER_REC_QKEY && src->qkey != dst->qkey)
+ return -EINVAL;
+ if (comp_mask & IB_SA_MCMEMBER_REC_MLID && src->mlid != dst->mlid)
+ return -EINVAL;
+ if (ib_sa_check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR,
+ IB_SA_MCMEMBER_REC_MTU, dst->mtu_selector,
+ src->mtu, dst->mtu))
+ return -EINVAL;
+ if (comp_mask & IB_SA_MCMEMBER_REC_TRAFFIC_CLASS &&
+ src->traffic_class != dst->traffic_class)
+ return -EINVAL;
+ if (comp_mask & IB_SA_MCMEMBER_REC_PKEY && src->pkey != dst->pkey)
+ return -EINVAL;
+ if (ib_sa_check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR,
+ IB_SA_MCMEMBER_REC_RATE, dst->rate_selector,
+ src->rate, dst->rate))
+ return -EINVAL;
+ if (ib_sa_check_selector(comp_mask,
+ IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR,
+ IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME,
+ dst->packet_life_time_selector,
+ src->packet_life_time, dst->packet_life_time))
+ return -EINVAL;
+ if (comp_mask & IB_SA_MCMEMBER_REC_SL && src->sl != dst->sl)
+ return -EINVAL;
+ if (comp_mask & IB_SA_MCMEMBER_REC_FLOW_LABEL &&
+ src->flow_label != dst->flow_label)
+ return -EINVAL;
+ if (comp_mask & IB_SA_MCMEMBER_REC_HOP_LIMIT &&
+ src->hop_limit != dst->hop_limit)
+ return -EINVAL;
+ if (comp_mask & IB_SA_MCMEMBER_REC_SCOPE && src->scope != dst->scope)
+ return -EINVAL;
+
+ /* join_state checked separately, proxy_join ignored */
+
+ return 0;
+}
+
+static int send_join(struct mcast_group *group, struct mcast_member *member)
+{
+ struct mcast_port *port = group->port;
+ int ret;
+
+ group->last_join = member;
+ ret = ib_sa_mcmember_rec_query(&sa_client, port->dev->device,
+ port->port_num, IB_MGMT_METHOD_SET,
+ &member->multicast.rec,
+ member->multicast.comp_mask,
+ 3000, GFP_KERNEL, join_handler, group,
+ &group->query);
+ if (ret >= 0) {
+ group->query_id = ret;
+ ret = 0;
+ }
+ return ret;
+}
+
+static int send_leave(struct mcast_group *group, u8 leave_state)
+{
+ struct mcast_port *port = group->port;
+ struct ib_sa_mcmember_rec rec;
+ int ret;
+
+ rec = group->rec;
+ rec.join_state = leave_state;
+ group->leave_state = leave_state;
+
+ ret = ib_sa_mcmember_rec_query(&sa_client, port->dev->device,
+ port->port_num, IB_SA_METHOD_DELETE, &rec,
+ IB_SA_MCMEMBER_REC_MGID |
+ IB_SA_MCMEMBER_REC_PORT_GID |
+ IB_SA_MCMEMBER_REC_JOIN_STATE,
+ 3000, GFP_KERNEL, leave_handler,
+ group, &group->query);
+ if (ret >= 0) {
+ group->query_id = ret;
+ ret = 0;
+ }
+ return ret;
+}
+
+static void join_group(struct mcast_group *group, struct mcast_member *member,
+ u8 join_state)
+{
+ member->state = MCAST_MEMBER;
+ adjust_membership(group, join_state, 1);
+ group->rec.join_state |= join_state;
+ member->multicast.rec = group->rec;
+ member->multicast.rec.join_state = join_state;
+ list_move(&member->list, &group->active_list);
+}
+
+static int fail_join(struct mcast_group *group, struct mcast_member *member,
+ int status)
+{
+ spin_lock_irq(&group->lock);
+ list_del_init(&member->list);
+ spin_unlock_irq(&group->lock);
+ return member->multicast.callback(status, &member->multicast);
+}
+
+static void process_group_error(struct mcast_group *group)
+{
+ struct mcast_member *member;
+ int ret = 0;
+ u16 pkey_index;
+
+ if (group->state == MCAST_PKEY_EVENT)
+ ret = ib_find_pkey(group->port->dev->device,
+ group->port->port_num,
+ be16_to_cpu(group->rec.pkey), &pkey_index);
+
+ spin_lock_irq(&group->lock);
+ if (group->state == MCAST_PKEY_EVENT && !ret &&
+ group->pkey_index == pkey_index)
+ goto out;
+
+ while (!list_empty(&group->active_list)) {
+ member = list_entry(group->active_list.next,
+ struct mcast_member, list);
+ atomic_inc(&member->refcount);
+ list_del_init(&member->list);
+ adjust_membership(group, member->multicast.rec.join_state, -1);
+ member->state = MCAST_ERROR;
+ spin_unlock_irq(&group->lock);
+
+ ret = member->multicast.callback(-ENETRESET,
+ &member->multicast);
+ deref_member(member);
+ if (ret)
+ ib_sa_free_multicast(&member->multicast);
+ spin_lock_irq(&group->lock);
+ }
+
+ group->rec.join_state = 0;
+out:
+ group->state = MCAST_BUSY;
+ spin_unlock_irq(&group->lock);
+}
+
+static void mcast_work_handler(struct work_struct *work)
+{
+ struct mcast_group *group;
+ struct mcast_member *member;
+ struct ib_sa_multicast *multicast;
+ int status, ret;
+ u8 join_state;
+
+ group = container_of(work, typeof(*group), work);
+retest:
+ spin_lock_irq(&group->lock);
+ while (!list_empty(&group->pending_list) ||
+ (group->state != MCAST_BUSY)) {
+
+ if (group->state != MCAST_BUSY) {
+ spin_unlock_irq(&group->lock);
+ process_group_error(group);
+ goto retest;
+ }
+
+ member = list_entry(group->pending_list.next,
+ struct mcast_member, list);
+ multicast = &member->multicast;
+ join_state = multicast->rec.join_state;
+ atomic_inc(&member->refcount);
+
+ if (join_state == (group->rec.join_state & join_state)) {
+ status = cmp_rec(&group->rec, &multicast->rec,
+ multicast->comp_mask);
+ if (!status)
+ join_group(group, member, join_state);
+ else
+ list_del_init(&member->list);
+ spin_unlock_irq(&group->lock);
+ ret = multicast->callback(status, multicast);
+ } else {
+ spin_unlock_irq(&group->lock);
+ status = send_join(group, member);
+ if (!status) {
+ deref_member(member);
+ return;
+ }
+ ret = fail_join(group, member, status);
+ }
+
+ deref_member(member);
+ if (ret)
+ ib_sa_free_multicast(&member->multicast);
+ spin_lock_irq(&group->lock);
+ }
+
+ join_state = get_leave_state(group);
+ if (join_state) {
+ group->rec.join_state &= ~join_state;
+ spin_unlock_irq(&group->lock);
+ if (send_leave(group, join_state))
+ goto retest;
+ } else {
+ group->state = MCAST_IDLE;
+ spin_unlock_irq(&group->lock);
+ release_group(group);
+ }
+}
+
+/*
+ * Fail a join request if it is still active - at the head of the pending queue.
+ */
+static void process_join_error(struct mcast_group *group, int status)
+{
+ struct mcast_member *member;
+ int ret;
+
+ spin_lock_irq(&group->lock);
+ member = list_entry(group->pending_list.next,
+ struct mcast_member, list);
+ if (group->last_join == member) {
+ atomic_inc(&member->refcount);
+ list_del_init(&member->list);
+ spin_unlock_irq(&group->lock);
+ ret = member->multicast.callback(status, &member->multicast);
+ deref_member(member);
+ if (ret)
+ ib_sa_free_multicast(&member->multicast);
+ } else
+ spin_unlock_irq(&group->lock);
+}
+
+static void join_handler(int status, struct ib_sa_mcmember_rec *rec,
+ void *context)
+{
+ struct mcast_group *group = context;
+ u16 pkey_index = MCAST_INVALID_PKEY_INDEX;
+
+ if (status)
+ process_join_error(group, status);
+ else {
+ ib_find_pkey(group->port->dev->device, group->port->port_num,
+ be16_to_cpu(rec->pkey), &pkey_index);
+
+ spin_lock_irq(&group->port->lock);
+ group->rec = *rec;
+ if (group->state == MCAST_BUSY &&
+ group->pkey_index == MCAST_INVALID_PKEY_INDEX)
+ group->pkey_index = pkey_index;
+ if (!memcmp(&mgid0, &group->rec.mgid, sizeof mgid0)) {
+ rb_erase(&group->node, &group->port->table);
+ mcast_insert(group->port, group, 1);
+ }
+ spin_unlock_irq(&group->port->lock);
+ }
+ mcast_work_handler(&group->work);
+}
+
+static void leave_handler(int status, struct ib_sa_mcmember_rec *rec,
+ void *context)
+{
+ struct mcast_group *group = context;
+
+ if (status && (group->retries > 0) &&
+ !send_leave(group, group->leave_state))
+ group->retries--;
+ else
+ mcast_work_handler(&group->work);
+}
+
+static struct mcast_group *acquire_group(struct mcast_port *port,
+ union ib_gid *mgid, gfp_t gfp_mask)
+{
+ struct mcast_group *group, *cur_group;
+ unsigned long flags;
+ int is_mgid0;
+
+ is_mgid0 = !memcmp(&mgid0, mgid, sizeof mgid0);
+ if (!is_mgid0) {
+ spin_lock_irqsave(&port->lock, flags);
+ group = mcast_find(port, mgid);
+ if (group)
+ goto found;
+ spin_unlock_irqrestore(&port->lock, flags);
+ }
+
+ group = kzalloc(sizeof *group, gfp_mask);
+ if (!group)
+ return NULL;
+
+ group->retries = 3;
+ group->port = port;
+ group->rec.mgid = *mgid;
+ group->pkey_index = MCAST_INVALID_PKEY_INDEX;
+ INIT_LIST_HEAD(&group->pending_list);
+ INIT_LIST_HEAD(&group->active_list);
+ INIT_WORK(&group->work, mcast_work_handler);
+ spin_lock_init(&group->lock);
+
+ spin_lock_irqsave(&port->lock, flags);
+ cur_group = mcast_insert(port, group, is_mgid0);
+ if (cur_group) {
+ kfree(group);
+ group = cur_group;
+ } else
+ atomic_inc(&port->refcount);
+found:
+ atomic_inc(&group->refcount);
+ spin_unlock_irqrestore(&port->lock, flags);
+ return group;
+}
+
+/*
+ * We serialize all join requests to a single group to make our lives much
+ * easier. Otherwise, two users could try to join the same group
+ * simultaneously, with different configurations, one could leave while the
+ * join is in progress, etc., which makes locking around error recovery
+ * difficult.
+ */
+struct ib_sa_multicast *
+ib_sa_join_multicast(struct ib_sa_client *client,
+ struct ib_device *device, u8 port_num,
+ struct ib_sa_mcmember_rec *rec,
+ ib_sa_comp_mask comp_mask, gfp_t gfp_mask,
+ int (*callback)(int status,
+ struct ib_sa_multicast *multicast),
+ void *context)
+{
+ struct mcast_device *dev;
+ struct mcast_member *member;
+ struct ib_sa_multicast *multicast;
+ int ret;
+
+ dev = ib_get_client_data(device, &mcast_client);
+ if (!dev)
+ return ERR_PTR(-ENODEV);
+
+ member = kmalloc(sizeof *member, gfp_mask);
+ if (!member)
+ return ERR_PTR(-ENOMEM);
+
+ ib_sa_client_get(client);
+ member->client = client;
+ member->multicast.rec = *rec;
+ member->multicast.comp_mask = comp_mask;
+ member->multicast.callback = callback;
+ member->multicast.context = context;
+ init_completion(&member->comp);
+ atomic_set(&member->refcount, 1);
+ member->state = MCAST_JOINING;
+
+ member->group = acquire_group(&dev->port[port_num - dev->start_port],
+ &rec->mgid, gfp_mask);
+ if (!member->group) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ /*
+ * The user will get the multicast structure in their callback. They
+ * could then free the multicast structure before we can return from
+ * this routine. So we save the pointer to return before queuing
+ * any callback.
+ */
+ multicast = &member->multicast;
+ queue_join(member);
+ return multicast;
+
+err:
+ ib_sa_client_put(client);
+ kfree(member);
+ return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(ib_sa_join_multicast);
+
+void ib_sa_free_multicast(struct ib_sa_multicast *multicast)
+{
+ struct mcast_member *member;
+ struct mcast_group *group;
+
+ member = container_of(multicast, struct mcast_member, multicast);
+ group = member->group;
+
+ spin_lock_irq(&group->lock);
+ if (member->state == MCAST_MEMBER)
+ adjust_membership(group, multicast->rec.join_state, -1);
+
+ list_del_init(&member->list);
+
+ if (group->state == MCAST_IDLE) {
+ group->state = MCAST_BUSY;
+ spin_unlock_irq(&group->lock);
+ /* Continue to hold reference on group until callback */
+ queue_work(mcast_wq, &group->work);
+ } else {
+ spin_unlock_irq(&group->lock);
+ release_group(group);
+ }
+
+ deref_member(member);
+ wait_for_completion(&member->comp);
+ ib_sa_client_put(member->client);
+ kfree(member);
+}
+EXPORT_SYMBOL(ib_sa_free_multicast);
+
+int ib_sa_get_mcmember_rec(struct ib_device *device, u8 port_num,
+ union ib_gid *mgid, struct ib_sa_mcmember_rec *rec)
+{
+ struct mcast_device *dev;
+ struct mcast_port *port;
+ struct mcast_group *group;
+ unsigned long flags;
+ int ret = 0;
+
+ dev = ib_get_client_data(device, &mcast_client);
+ if (!dev)
+ return -ENODEV;
+
+ port = &dev->port[port_num - dev->start_port];
+ spin_lock_irqsave(&port->lock, flags);
+ group = mcast_find(port, mgid);
+ if (group)
+ *rec = group->rec;
+ else
+ ret = -EADDRNOTAVAIL;
+ spin_unlock_irqrestore(&port->lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_sa_get_mcmember_rec);
+
+int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num,
+ struct ib_sa_mcmember_rec *rec,
+ struct ib_ah_attr *ah_attr)
+{
+ int ret;
+ u16 gid_index;
+ u8 p;
+
+ ret = ib_find_cached_gid(device, &rec->port_gid, &p, &gid_index);
+ if (ret)
+ return ret;
+
+ memset(ah_attr, 0, sizeof *ah_attr);
+ ah_attr->dlid = be16_to_cpu(rec->mlid);
+ ah_attr->sl = rec->sl;
+ ah_attr->port_num = port_num;
+ ah_attr->static_rate = rec->rate;
+
+ ah_attr->ah_flags = IB_AH_GRH;
+ ah_attr->grh.dgid = rec->mgid;
+
+ ah_attr->grh.sgid_index = (u8) gid_index;
+ ah_attr->grh.flow_label = be32_to_cpu(rec->flow_label);
+ ah_attr->grh.hop_limit = rec->hop_limit;
+ ah_attr->grh.traffic_class = rec->traffic_class;
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_init_ah_from_mcmember);
+
+static void mcast_groups_event(struct mcast_port *port,
+ enum mcast_group_state state)
+{
+ struct mcast_group *group;
+ struct rb_node *node;
+ unsigned long flags;
+
+ spin_lock_irqsave(&port->lock, flags);
+ for (node = rb_first(&port->table); node; node = rb_next(node)) {
+ group = rb_entry(node, struct mcast_group, node);
+ spin_lock(&group->lock);
+ if (group->state == MCAST_IDLE) {
+ atomic_inc(&group->refcount);
+ queue_work(mcast_wq, &group->work);
+ }
+ if (group->state != MCAST_GROUP_ERROR)
+ group->state = state;
+ spin_unlock(&group->lock);
+ }
+ spin_unlock_irqrestore(&port->lock, flags);
+}
+
+static void mcast_event_handler(struct ib_event_handler *handler,
+ struct ib_event *event)
+{
+ struct mcast_device *dev;
+ int index;
+
+ dev = container_of(handler, struct mcast_device, event_handler);
+ if (rdma_port_get_link_layer(dev->device, event->element.port_num) !=
+ IB_LINK_LAYER_INFINIBAND)
+ return;
+
+ index = event->element.port_num - dev->start_port;
+
+ switch (event->event) {
+ case IB_EVENT_PORT_ERR:
+ case IB_EVENT_LID_CHANGE:
+ case IB_EVENT_SM_CHANGE:
+ case IB_EVENT_CLIENT_REREGISTER:
+ mcast_groups_event(&dev->port[index], MCAST_GROUP_ERROR);
+ break;
+ case IB_EVENT_PKEY_CHANGE:
+ mcast_groups_event(&dev->port[index], MCAST_PKEY_EVENT);
+ break;
+ default:
+ break;
+ }
+}
+
+static void mcast_add_one(struct ib_device *device)
+{
+ struct mcast_device *dev;
+ struct mcast_port *port;
+ int i;
+ int count = 0;
+
+ if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+ return;
+
+ dev = kmalloc(sizeof *dev + device->phys_port_cnt * sizeof *port,
+ GFP_KERNEL);
+ if (!dev)
+ return;
+
+ if (device->node_type == RDMA_NODE_IB_SWITCH)
+ dev->start_port = dev->end_port = 0;
+ else {
+ dev->start_port = 1;
+ dev->end_port = device->phys_port_cnt;
+ }
+
+ for (i = 0; i <= dev->end_port - dev->start_port; i++) {
+ if (rdma_port_get_link_layer(device, dev->start_port + i) !=
+ IB_LINK_LAYER_INFINIBAND)
+ continue;
+ port = &dev->port[i];
+ port->dev = dev;
+ port->port_num = dev->start_port + i;
+ spin_lock_init(&port->lock);
+ port->table = RB_ROOT;
+ init_completion(&port->comp);
+ atomic_set(&port->refcount, 1);
+ ++count;
+ }
+
+ if (!count) {
+ kfree(dev);
+ return;
+ }
+
+ dev->device = device;
+ ib_set_client_data(device, &mcast_client, dev);
+
+ INIT_IB_EVENT_HANDLER(&dev->event_handler, device, mcast_event_handler);
+ ib_register_event_handler(&dev->event_handler);
+}
+
+static void mcast_remove_one(struct ib_device *device)
+{
+ struct mcast_device *dev;
+ struct mcast_port *port;
+ int i;
+
+ dev = ib_get_client_data(device, &mcast_client);
+ if (!dev)
+ return;
+
+ ib_unregister_event_handler(&dev->event_handler);
+ flush_workqueue(mcast_wq);
+
+ for (i = 0; i <= dev->end_port - dev->start_port; i++) {
+ if (rdma_port_get_link_layer(device, dev->start_port + i) ==
+ IB_LINK_LAYER_INFINIBAND) {
+ port = &dev->port[i];
+ deref_port(port);
+ wait_for_completion(&port->comp);
+ }
+ }
+
+ kfree(dev);
+}
+
+int mcast_init(void)
+{
+ int ret;
+
+ mcast_wq = create_singlethread_workqueue("ib_mcast");
+ if (!mcast_wq)
+ return -ENOMEM;
+
+ ib_sa_register_client(&sa_client);
+
+ ret = ib_register_client(&mcast_client);
+ if (ret)
+ goto err;
+ return 0;
+
+err:
+ ib_sa_unregister_client(&sa_client);
+ destroy_workqueue(mcast_wq);
+ return ret;
+}
+
+void mcast_cleanup(void)
+{
+ ib_unregister_client(&mcast_client);
+ ib_sa_unregister_client(&sa_client);
+ destroy_workqueue(mcast_wq);
+}
diff --git a/sys/ofed/drivers/infiniband/core/notice.c b/sys/ofed/drivers/infiniband/core/notice.c
new file mode 100644
index 0000000..4a8d98f
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/notice.c
@@ -0,0 +1,749 @@
+/*
+ * Copyright (c) 2006 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/bitops.h>
+#include <linux/random.h>
+
+#include "sa.h"
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("InfiniBand InformInfo & Notice event handling");
+MODULE_LICENSE("Dual BSD/GPL");
+
+static void inform_add_one(struct ib_device *device);
+static void inform_remove_one(struct ib_device *device);
+
+static struct ib_client inform_client = {
+ .name = "ib_notice",
+ .add = inform_add_one,
+ .remove = inform_remove_one
+};
+
+static struct ib_sa_client sa_client;
+static struct workqueue_struct *inform_wq;
+
+struct inform_device;
+
+struct inform_port {
+ struct inform_device *dev;
+ spinlock_t lock;
+ struct rb_root table;
+ atomic_t refcount;
+ struct completion comp;
+ u8 port_num;
+};
+
+struct inform_device {
+ struct ib_device *device;
+ struct ib_event_handler event_handler;
+ int start_port;
+ int end_port;
+ struct inform_port port[0];
+};
+
+enum inform_state {
+ INFORM_IDLE,
+ INFORM_REGISTERING,
+ INFORM_MEMBER,
+ INFORM_BUSY,
+ INFORM_ERROR
+};
+
+struct inform_member;
+
+struct inform_group {
+ u16 trap_number;
+ struct rb_node node;
+ struct inform_port *port;
+ spinlock_t lock;
+ struct work_struct work;
+ struct list_head pending_list;
+ struct list_head active_list;
+ struct list_head notice_list;
+ struct inform_member *last_join;
+ int members;
+ enum inform_state join_state; /* State relative to SA */
+ atomic_t refcount;
+ enum inform_state state;
+ struct ib_sa_query *query;
+ int query_id;
+};
+
+struct inform_member {
+ struct ib_inform_info info;
+ struct ib_sa_client *client;
+ struct inform_group *group;
+ struct list_head list;
+ enum inform_state state;
+ atomic_t refcount;
+ struct completion comp;
+};
+
+struct inform_notice {
+ struct list_head list;
+ struct ib_sa_notice notice;
+};
+
+static void reg_handler(int status, struct ib_sa_inform *inform,
+ void *context);
+static void unreg_handler(int status, struct ib_sa_inform *inform,
+ void *context);
+
+static struct inform_group *inform_find(struct inform_port *port,
+ u16 trap_number)
+{
+ struct rb_node *node = port->table.rb_node;
+ struct inform_group *group;
+
+ while (node) {
+ group = rb_entry(node, struct inform_group, node);
+ if (trap_number < group->trap_number)
+ node = node->rb_left;
+ else if (trap_number > group->trap_number)
+ node = node->rb_right;
+ else
+ return group;
+ }
+ return NULL;
+}
+
+static struct inform_group *inform_insert(struct inform_port *port,
+ struct inform_group *group)
+{
+ struct rb_node **link = &port->table.rb_node;
+ struct rb_node *parent = NULL;
+ struct inform_group *cur_group;
+
+ while (*link) {
+ parent = *link;
+ cur_group = rb_entry(parent, struct inform_group, node);
+ if (group->trap_number < cur_group->trap_number)
+ link = &(*link)->rb_left;
+ else if (group->trap_number > cur_group->trap_number)
+ link = &(*link)->rb_right;
+ else
+ return cur_group;
+ }
+ rb_link_node(&group->node, parent, link);
+ rb_insert_color(&group->node, &port->table);
+ return NULL;
+}
+
+static void deref_port(struct inform_port *port)
+{
+ if (atomic_dec_and_test(&port->refcount))
+ complete(&port->comp);
+}
+
+static void release_group(struct inform_group *group)
+{
+ struct inform_port *port = group->port;
+ unsigned long flags;
+
+ spin_lock_irqsave(&port->lock, flags);
+ if (atomic_dec_and_test(&group->refcount)) {
+ rb_erase(&group->node, &port->table);
+ spin_unlock_irqrestore(&port->lock, flags);
+ kfree(group);
+ deref_port(port);
+ } else
+ spin_unlock_irqrestore(&port->lock, flags);
+}
+
+static void deref_member(struct inform_member *member)
+{
+ if (atomic_dec_and_test(&member->refcount))
+ complete(&member->comp);
+}
+
+static void queue_reg(struct inform_member *member)
+{
+ struct inform_group *group = member->group;
+ unsigned long flags;
+
+ spin_lock_irqsave(&group->lock, flags);
+ list_add(&member->list, &group->pending_list);
+ if (group->state == INFORM_IDLE) {
+ group->state = INFORM_BUSY;
+ atomic_inc(&group->refcount);
+ queue_work(inform_wq, &group->work);
+ }
+ spin_unlock_irqrestore(&group->lock, flags);
+}
+
+static int send_reg(struct inform_group *group, struct inform_member *member)
+{
+ struct inform_port *port = group->port;
+ struct ib_sa_inform inform;
+ int ret;
+
+ memset(&inform, 0, sizeof inform);
+ inform.lid_range_begin = cpu_to_be16(0xFFFF);
+ inform.is_generic = 1;
+ inform.subscribe = 1;
+ inform.type = cpu_to_be16(IB_SA_EVENT_TYPE_ALL);
+ inform.trap.generic.trap_num = cpu_to_be16(member->info.trap_number);
+ inform.trap.generic.resp_time = 19;
+ inform.trap.generic.producer_type =
+ cpu_to_be32(IB_SA_EVENT_PRODUCER_TYPE_ALL);
+
+ group->last_join = member;
+ ret = ib_sa_informinfo_query(&sa_client, port->dev->device,
+ port->port_num, &inform, 3000, GFP_KERNEL,
+ reg_handler, group,&group->query);
+ if (ret >= 0) {
+ group->query_id = ret;
+ ret = 0;
+ }
+ return ret;
+}
+
+static int send_unreg(struct inform_group *group)
+{
+ struct inform_port *port = group->port;
+ struct ib_sa_inform inform;
+ int ret;
+
+ memset(&inform, 0, sizeof inform);
+ inform.lid_range_begin = cpu_to_be16(0xFFFF);
+ inform.is_generic = 1;
+ inform.type = cpu_to_be16(IB_SA_EVENT_TYPE_ALL);
+ inform.trap.generic.trap_num = cpu_to_be16(group->trap_number);
+ inform.trap.generic.qpn = IB_QP1;
+ inform.trap.generic.resp_time = 19;
+ inform.trap.generic.producer_type =
+ cpu_to_be32(IB_SA_EVENT_PRODUCER_TYPE_ALL);
+
+ ret = ib_sa_informinfo_query(&sa_client, port->dev->device,
+ port->port_num, &inform, 3000, GFP_KERNEL,
+ unreg_handler, group, &group->query);
+ if (ret >= 0) {
+ group->query_id = ret;
+ ret = 0;
+ }
+ return ret;
+}
+
+static void join_group(struct inform_group *group, struct inform_member *member)
+{
+ member->state = INFORM_MEMBER;
+ group->members++;
+ list_move(&member->list, &group->active_list);
+}
+
+static int fail_join(struct inform_group *group, struct inform_member *member,
+ int status)
+{
+ spin_lock_irq(&group->lock);
+ list_del_init(&member->list);
+ spin_unlock_irq(&group->lock);
+ return member->info.callback(status, &member->info, NULL);
+}
+
+static void process_group_error(struct inform_group *group)
+{
+ struct inform_member *member;
+ int ret;
+
+ spin_lock_irq(&group->lock);
+ while (!list_empty(&group->active_list)) {
+ member = list_entry(group->active_list.next,
+ struct inform_member, list);
+ atomic_inc(&member->refcount);
+ list_del_init(&member->list);
+ group->members--;
+ member->state = INFORM_ERROR;
+ spin_unlock_irq(&group->lock);
+
+ ret = member->info.callback(-ENETRESET, &member->info, NULL);
+ deref_member(member);
+ if (ret)
+ ib_sa_unregister_inform_info(&member->info);
+ spin_lock_irq(&group->lock);
+ }
+
+ group->join_state = INFORM_IDLE;
+ group->state = INFORM_BUSY;
+ spin_unlock_irq(&group->lock);
+}
+
+/*
+ * Report a notice to all active subscribers. We use a temporary list to
+ * handle unsubscription requests while the notice is being reported, which
+ * avoids holding the group lock while in the user's callback.
+ */
+static void process_notice(struct inform_group *group,
+ struct inform_notice *info_notice)
+{
+ struct inform_member *member;
+ struct list_head list;
+ int ret;
+
+ INIT_LIST_HEAD(&list);
+
+ spin_lock_irq(&group->lock);
+ list_splice_init(&group->active_list, &list);
+ while (!list_empty(&list)) {
+
+ member = list_entry(list.next, struct inform_member, list);
+ atomic_inc(&member->refcount);
+ list_move(&member->list, &group->active_list);
+ spin_unlock_irq(&group->lock);
+
+ ret = member->info.callback(0, &member->info,
+ &info_notice->notice);
+ deref_member(member);
+ if (ret)
+ ib_sa_unregister_inform_info(&member->info);
+ spin_lock_irq(&group->lock);
+ }
+ spin_unlock_irq(&group->lock);
+}
+
+static void inform_work_handler(struct work_struct *work)
+{
+ struct inform_group *group;
+ struct inform_member *member;
+ struct ib_inform_info *info;
+ struct inform_notice *info_notice;
+ int status, ret;
+
+ group = container_of(work, typeof(*group), work);
+retest:
+ spin_lock_irq(&group->lock);
+ while (!list_empty(&group->pending_list) ||
+ !list_empty(&group->notice_list) ||
+ (group->state == INFORM_ERROR)) {
+
+ if (group->state == INFORM_ERROR) {
+ spin_unlock_irq(&group->lock);
+ process_group_error(group);
+ goto retest;
+ }
+
+ if (!list_empty(&group->notice_list)) {
+ info_notice = list_entry(group->notice_list.next,
+ struct inform_notice, list);
+ list_del(&info_notice->list);
+ spin_unlock_irq(&group->lock);
+ process_notice(group, info_notice);
+ kfree(info_notice);
+ goto retest;
+ }
+
+ member = list_entry(group->pending_list.next,
+ struct inform_member, list);
+ info = &member->info;
+ atomic_inc(&member->refcount);
+
+ if (group->join_state == INFORM_MEMBER) {
+ join_group(group, member);
+ spin_unlock_irq(&group->lock);
+ ret = info->callback(0, info, NULL);
+ } else {
+ spin_unlock_irq(&group->lock);
+ status = send_reg(group, member);
+ if (!status) {
+ deref_member(member);
+ return;
+ }
+ ret = fail_join(group, member, status);
+ }
+
+ deref_member(member);
+ if (ret)
+ ib_sa_unregister_inform_info(&member->info);
+ spin_lock_irq(&group->lock);
+ }
+
+ if (!group->members && (group->join_state == INFORM_MEMBER)) {
+ group->join_state = INFORM_IDLE;
+ spin_unlock_irq(&group->lock);
+ if (send_unreg(group))
+ goto retest;
+ } else {
+ group->state = INFORM_IDLE;
+ spin_unlock_irq(&group->lock);
+ release_group(group);
+ }
+}
+
+/*
+ * Fail a join request if it is still active - at the head of the pending queue.
+ */
+static void process_join_error(struct inform_group *group, int status)
+{
+ struct inform_member *member;
+ int ret;
+
+ spin_lock_irq(&group->lock);
+ member = list_entry(group->pending_list.next,
+ struct inform_member, list);
+ if (group->last_join == member) {
+ atomic_inc(&member->refcount);
+ list_del_init(&member->list);
+ spin_unlock_irq(&group->lock);
+ ret = member->info.callback(status, &member->info, NULL);
+ deref_member(member);
+ if (ret)
+ ib_sa_unregister_inform_info(&member->info);
+ } else
+ spin_unlock_irq(&group->lock);
+}
+
+static void reg_handler(int status, struct ib_sa_inform *inform, void *context)
+{
+ struct inform_group *group = context;
+
+ if (status)
+ process_join_error(group, status);
+ else
+ group->join_state = INFORM_MEMBER;
+
+ inform_work_handler(&group->work);
+}
+
+static void unreg_handler(int status, struct ib_sa_inform *rec, void *context)
+{
+ struct inform_group *group = context;
+
+ inform_work_handler(&group->work);
+}
+
+int notice_dispatch(struct ib_device *device, u8 port_num,
+ struct ib_sa_notice *notice)
+{
+ struct inform_device *dev;
+ struct inform_port *port;
+ struct inform_group *group;
+ struct inform_notice *info_notice;
+
+ dev = ib_get_client_data(device, &inform_client);
+ if (!dev)
+ return 0; /* No one to give notice to. */
+
+ port = &dev->port[port_num - dev->start_port];
+ spin_lock_irq(&port->lock);
+ group = inform_find(port, __be16_to_cpu(notice->trap.
+ generic.trap_num));
+ if (!group) {
+ spin_unlock_irq(&port->lock);
+ return 0;
+ }
+
+ atomic_inc(&group->refcount);
+ spin_unlock_irq(&port->lock);
+
+ info_notice = kmalloc(sizeof *info_notice, GFP_KERNEL);
+ if (!info_notice) {
+ release_group(group);
+ return -ENOMEM;
+ }
+
+ info_notice->notice = *notice;
+
+ spin_lock_irq(&group->lock);
+ list_add(&info_notice->list, &group->notice_list);
+ if (group->state == INFORM_IDLE) {
+ group->state = INFORM_BUSY;
+ spin_unlock_irq(&group->lock);
+ inform_work_handler(&group->work);
+ } else {
+ spin_unlock_irq(&group->lock);
+ release_group(group);
+ }
+
+ return 0;
+}
+
+static struct inform_group *acquire_group(struct inform_port *port,
+ u16 trap_number, gfp_t gfp_mask)
+{
+ struct inform_group *group, *cur_group;
+ unsigned long flags;
+
+ spin_lock_irqsave(&port->lock, flags);
+ group = inform_find(port, trap_number);
+ if (group)
+ goto found;
+ spin_unlock_irqrestore(&port->lock, flags);
+
+ group = kzalloc(sizeof *group, gfp_mask);
+ if (!group)
+ return NULL;
+
+ group->port = port;
+ group->trap_number = trap_number;
+ INIT_LIST_HEAD(&group->pending_list);
+ INIT_LIST_HEAD(&group->active_list);
+ INIT_LIST_HEAD(&group->notice_list);
+ INIT_WORK(&group->work, inform_work_handler);
+ spin_lock_init(&group->lock);
+
+ spin_lock_irqsave(&port->lock, flags);
+ cur_group = inform_insert(port, group);
+ if (cur_group) {
+ kfree(group);
+ group = cur_group;
+ } else
+ atomic_inc(&port->refcount);
+found:
+ atomic_inc(&group->refcount);
+ spin_unlock_irqrestore(&port->lock, flags);
+ return group;
+}
+
+/*
+ * We serialize all join requests to a single group to make our lives much
+ * easier. Otherwise, two users could try to join the same group
+ * simultaneously, with different configurations, one could leave while the
+ * join is in progress, etc., which makes locking around error recovery
+ * difficult.
+ */
+struct ib_inform_info *
+ib_sa_register_inform_info(struct ib_sa_client *client,
+ struct ib_device *device, u8 port_num,
+ u16 trap_number, gfp_t gfp_mask,
+ int (*callback)(int status,
+ struct ib_inform_info *info,
+ struct ib_sa_notice *notice),
+ void *context)
+{
+ struct inform_device *dev;
+ struct inform_member *member;
+ struct ib_inform_info *info;
+ int ret;
+
+ dev = ib_get_client_data(device, &inform_client);
+ if (!dev)
+ return ERR_PTR(-ENODEV);
+
+ member = kzalloc(sizeof *member, gfp_mask);
+ if (!member)
+ return ERR_PTR(-ENOMEM);
+
+ ib_sa_client_get(client);
+ member->client = client;
+ member->info.trap_number = trap_number;
+ member->info.callback = callback;
+ member->info.context = context;
+ init_completion(&member->comp);
+ atomic_set(&member->refcount, 1);
+ member->state = INFORM_REGISTERING;
+
+ member->group = acquire_group(&dev->port[port_num - dev->start_port],
+ trap_number, gfp_mask);
+ if (!member->group) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ /*
+ * The user will get the info structure in their callback. They
+ * could then free the info structure before we can return from
+ * this routine. So we save the pointer to return before queuing
+ * any callback.
+ */
+ info = &member->info;
+ queue_reg(member);
+ return info;
+
+err:
+ ib_sa_client_put(member->client);
+ kfree(member);
+ return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(ib_sa_register_inform_info);
+
+void ib_sa_unregister_inform_info(struct ib_inform_info *info)
+{
+ struct inform_member *member;
+ struct inform_group *group;
+
+ member = container_of(info, struct inform_member, info);
+ group = member->group;
+
+ spin_lock_irq(&group->lock);
+ if (member->state == INFORM_MEMBER)
+ group->members--;
+
+ list_del_init(&member->list);
+
+ if (group->state == INFORM_IDLE) {
+ group->state = INFORM_BUSY;
+ spin_unlock_irq(&group->lock);
+ /* Continue to hold reference on group until callback */
+ queue_work(inform_wq, &group->work);
+ } else {
+ spin_unlock_irq(&group->lock);
+ release_group(group);
+ }
+
+ deref_member(member);
+ wait_for_completion(&member->comp);
+ ib_sa_client_put(member->client);
+ kfree(member);
+}
+EXPORT_SYMBOL(ib_sa_unregister_inform_info);
+
+static void inform_groups_lost(struct inform_port *port)
+{
+ struct inform_group *group;
+ struct rb_node *node;
+ unsigned long flags;
+
+ spin_lock_irqsave(&port->lock, flags);
+ for (node = rb_first(&port->table); node; node = rb_next(node)) {
+ group = rb_entry(node, struct inform_group, node);
+ spin_lock(&group->lock);
+ if (group->state == INFORM_IDLE) {
+ atomic_inc(&group->refcount);
+ queue_work(inform_wq, &group->work);
+ }
+ group->state = INFORM_ERROR;
+ spin_unlock(&group->lock);
+ }
+ spin_unlock_irqrestore(&port->lock, flags);
+}
+
+static void inform_event_handler(struct ib_event_handler *handler,
+ struct ib_event *event)
+{
+ struct inform_device *dev;
+
+ dev = container_of(handler, struct inform_device, event_handler);
+
+ switch (event->event) {
+ case IB_EVENT_PORT_ERR:
+ case IB_EVENT_LID_CHANGE:
+ case IB_EVENT_SM_CHANGE:
+ case IB_EVENT_CLIENT_REREGISTER:
+ inform_groups_lost(&dev->port[event->element.port_num -
+ dev->start_port]);
+ break;
+ default:
+ break;
+ }
+}
+
+static void inform_add_one(struct ib_device *device)
+{
+ struct inform_device *dev;
+ struct inform_port *port;
+ int i;
+
+ if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+ return;
+
+ dev = kmalloc(sizeof *dev + device->phys_port_cnt * sizeof *port,
+ GFP_KERNEL);
+ if (!dev)
+ return;
+
+ if (device->node_type == RDMA_NODE_IB_SWITCH)
+ dev->start_port = dev->end_port = 0;
+ else {
+ dev->start_port = 1;
+ dev->end_port = device->phys_port_cnt;
+ }
+
+ for (i = 0; i <= dev->end_port - dev->start_port; i++) {
+ port = &dev->port[i];
+ port->dev = dev;
+ port->port_num = dev->start_port + i;
+ spin_lock_init(&port->lock);
+ port->table = RB_ROOT;
+ init_completion(&port->comp);
+ atomic_set(&port->refcount, 1);
+ }
+
+ dev->device = device;
+ ib_set_client_data(device, &inform_client, dev);
+
+ INIT_IB_EVENT_HANDLER(&dev->event_handler, device, inform_event_handler);
+ ib_register_event_handler(&dev->event_handler);
+}
+
+static void inform_remove_one(struct ib_device *device)
+{
+ struct inform_device *dev;
+ struct inform_port *port;
+ int i;
+
+ dev = ib_get_client_data(device, &inform_client);
+ if (!dev)
+ return;
+
+ ib_unregister_event_handler(&dev->event_handler);
+ flush_workqueue(inform_wq);
+
+ for (i = 0; i <= dev->end_port - dev->start_port; i++) {
+ port = &dev->port[i];
+ deref_port(port);
+ wait_for_completion(&port->comp);
+ }
+
+ kfree(dev);
+}
+
+int notice_init(void)
+{
+ int ret;
+
+ inform_wq = create_singlethread_workqueue("ib_inform");
+ if (!inform_wq)
+ return -ENOMEM;
+
+ ib_sa_register_client(&sa_client);
+
+ ret = ib_register_client(&inform_client);
+ if (ret)
+ goto err;
+ return 0;
+
+err:
+ ib_sa_unregister_client(&sa_client);
+ destroy_workqueue(inform_wq);
+ return ret;
+}
+
+void notice_cleanup(void)
+{
+ ib_unregister_client(&inform_client);
+ ib_sa_unregister_client(&sa_client);
+ destroy_workqueue(inform_wq);
+}
diff --git a/sys/ofed/drivers/infiniband/core/packer.c b/sys/ofed/drivers/infiniband/core/packer.c
new file mode 100644
index 0000000..019bd4b
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/packer.c
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/string.h>
+
+#include <rdma/ib_pack.h>
+
+static u64 value_read(int offset, int size, void *structure)
+{
+ switch (size) {
+ case 1: return *(u8 *) (structure + offset);
+ case 2: return be16_to_cpup((__be16 *) (structure + offset));
+ case 4: return be32_to_cpup((__be32 *) (structure + offset));
+ case 8: return be64_to_cpup((__be64 *) (structure + offset));
+ default:
+ printk(KERN_WARNING "Field size %d bits not handled\n", size * 8);
+ return 0;
+ }
+}
+
+/**
+ * ib_pack - Pack a structure into a buffer
+ * @desc:Array of structure field descriptions
+ * @desc_len:Number of entries in @desc
+ * @structure:Structure to pack from
+ * @buf:Buffer to pack into
+ *
+ * ib_pack() packs a list of structure fields into a buffer,
+ * controlled by the array of fields in @desc.
+ */
+void ib_pack(const struct ib_field *desc,
+ int desc_len,
+ void *structure,
+ void *buf)
+{
+ int i;
+
+ for (i = 0; i < desc_len; ++i) {
+ if (desc[i].size_bits <= 32) {
+ int shift;
+ u32 val;
+ __be32 mask;
+ __be32 *addr;
+
+ shift = 32 - desc[i].offset_bits - desc[i].size_bits;
+ if (desc[i].struct_size_bytes)
+ val = value_read(desc[i].struct_offset_bytes,
+ desc[i].struct_size_bytes,
+ structure) << shift;
+ else
+ val = 0;
+
+ mask = cpu_to_be32(((1ull << desc[i].size_bits) - 1) << shift);
+ addr = (__be32 *) buf + desc[i].offset_words;
+ *addr = (*addr & ~mask) | (cpu_to_be32(val) & mask);
+ } else if (desc[i].size_bits <= 64) {
+ int shift;
+ u64 val;
+ __be64 mask;
+ __be64 *addr;
+
+ shift = 64 - desc[i].offset_bits - desc[i].size_bits;
+ if (desc[i].struct_size_bytes)
+ val = value_read(desc[i].struct_offset_bytes,
+ desc[i].struct_size_bytes,
+ structure) << shift;
+ else
+ val = 0;
+
+ mask = cpu_to_be64((~0ull >> (64 - desc[i].size_bits)) << shift);
+ addr = (__be64 *) ((__be32 *) buf + desc[i].offset_words);
+ *addr = (*addr & ~mask) | (cpu_to_be64(val) & mask);
+ } else {
+ if (desc[i].offset_bits % 8 ||
+ desc[i].size_bits % 8) {
+ printk(KERN_WARNING "Structure field %s of size %d "
+ "bits is not byte-aligned\n",
+ desc[i].field_name, desc[i].size_bits);
+ }
+
+ if (desc[i].struct_size_bytes)
+ memcpy(buf + desc[i].offset_words * 4 +
+ desc[i].offset_bits / 8,
+ structure + desc[i].struct_offset_bytes,
+ desc[i].size_bits / 8);
+ else
+ memset(buf + desc[i].offset_words * 4 +
+ desc[i].offset_bits / 8,
+ 0,
+ desc[i].size_bits / 8);
+ }
+ }
+}
+EXPORT_SYMBOL(ib_pack);
+
+static void value_write(int offset, int size, u64 val, void *structure)
+{
+ switch (size * 8) {
+ case 8: *( u8 *) (structure + offset) = val; break;
+ case 16: *(__be16 *) (structure + offset) = cpu_to_be16(val); break;
+ case 32: *(__be32 *) (structure + offset) = cpu_to_be32(val); break;
+ case 64: *(__be64 *) (structure + offset) = cpu_to_be64(val); break;
+ default:
+ printk(KERN_WARNING "Field size %d bits not handled\n", size * 8);
+ }
+}
+
+/**
+ * ib_unpack - Unpack a buffer into a structure
+ * @desc:Array of structure field descriptions
+ * @desc_len:Number of entries in @desc
+ * @buf:Buffer to unpack from
+ * @structure:Structure to unpack into
+ *
+ * ib_pack() unpacks a list of structure fields from a buffer,
+ * controlled by the array of fields in @desc.
+ */
+void ib_unpack(const struct ib_field *desc,
+ int desc_len,
+ void *buf,
+ void *structure)
+{
+ int i;
+
+ for (i = 0; i < desc_len; ++i) {
+ if (!desc[i].struct_size_bytes)
+ continue;
+
+ if (desc[i].size_bits <= 32) {
+ int shift;
+ u32 val;
+ u32 mask;
+ __be32 *addr;
+
+ shift = 32 - desc[i].offset_bits - desc[i].size_bits;
+ mask = ((1ull << desc[i].size_bits) - 1) << shift;
+ addr = (__be32 *) buf + desc[i].offset_words;
+ val = (be32_to_cpup(addr) & mask) >> shift;
+ value_write(desc[i].struct_offset_bytes,
+ desc[i].struct_size_bytes,
+ val,
+ structure);
+ } else if (desc[i].size_bits <= 64) {
+ int shift;
+ u64 val;
+ u64 mask;
+ __be64 *addr;
+
+ shift = 64 - desc[i].offset_bits - desc[i].size_bits;
+ mask = (~0ull >> (64 - desc[i].size_bits)) << shift;
+ addr = (__be64 *) buf + desc[i].offset_words;
+ val = (be64_to_cpup(addr) & mask) >> shift;
+ value_write(desc[i].struct_offset_bytes,
+ desc[i].struct_size_bytes,
+ val,
+ structure);
+ } else {
+ if (desc[i].offset_bits % 8 ||
+ desc[i].size_bits % 8) {
+ printk(KERN_WARNING "Structure field %s of size %d "
+ "bits is not byte-aligned\n",
+ desc[i].field_name, desc[i].size_bits);
+ }
+
+ memcpy(structure + desc[i].struct_offset_bytes,
+ buf + desc[i].offset_words * 4 +
+ desc[i].offset_bits / 8,
+ desc[i].size_bits / 8);
+ }
+ }
+}
+EXPORT_SYMBOL(ib_unpack);
diff --git a/sys/ofed/drivers/infiniband/core/sa.h b/sys/ofed/drivers/infiniband/core/sa.h
new file mode 100644
index 0000000..b8abdd7
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/sa.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2006 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef SA_H
+#define SA_H
+
+#include <rdma/ib_sa.h>
+
+static inline void ib_sa_client_get(struct ib_sa_client *client)
+{
+ atomic_inc(&client->users);
+}
+
+static inline void ib_sa_client_put(struct ib_sa_client *client)
+{
+ if (atomic_dec_and_test(&client->users))
+ complete(&client->comp);
+}
+
+int ib_sa_check_selector(ib_sa_comp_mask comp_mask,
+ ib_sa_comp_mask selector_mask,
+ ib_sa_comp_mask value_mask,
+ u8 selector, u8 src_value, u8 dst_value);
+
+int ib_sa_pack_attr(void *dst, void *src, int attr_id);
+
+int ib_sa_unpack_attr(void *dst, void *src, int attr_id);
+
+int ib_sa_path_rec_query(struct ib_sa_client *client,
+ struct ib_device *device, u8 port_num,
+ struct ib_sa_path_rec *rec,
+ ib_sa_comp_mask comp_mask,
+ int timeout_ms, gfp_t gfp_mask,
+ void (*callback)(int status,
+ struct ib_sa_path_rec *resp,
+ void *context),
+ void *context,
+ struct ib_sa_query **sa_query);
+
+int sa_db_init(void);
+void sa_db_cleanup(void);
+
+int ib_sa_mcmember_rec_query(struct ib_sa_client *client,
+ struct ib_device *device, u8 port_num,
+ u8 method,
+ struct ib_sa_mcmember_rec *rec,
+ ib_sa_comp_mask comp_mask,
+ int timeout_ms, gfp_t gfp_mask,
+ void (*callback)(int status,
+ struct ib_sa_mcmember_rec *resp,
+ void *context),
+ void *context,
+ struct ib_sa_query **sa_query);
+
+int mcast_init(void);
+void mcast_cleanup(void);
+
+int ib_sa_informinfo_query(struct ib_sa_client *client,
+ struct ib_device *device, u8 port_num,
+ struct ib_sa_inform *rec,
+ int timeout_ms, gfp_t gfp_mask,
+ void (*callback)(int status,
+ struct ib_sa_inform *resp,
+ void *context),
+ void *context,
+ struct ib_sa_query **sa_query);
+
+int notice_dispatch(struct ib_device *device, u8 port_num,
+ struct ib_sa_notice *notice);
+
+int notice_init(void);
+void notice_cleanup(void);
+
+#endif /* SA_H */
diff --git a/sys/ofed/drivers/infiniband/core/sa_query.c b/sys/ofed/drivers/infiniband/core/sa_query.c
new file mode 100644
index 0000000..0fc1c0e
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/sa_query.c
@@ -0,0 +1,1480 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2006 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/random.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/dma-mapping.h>
+#include <linux/kref.h>
+#include <linux/idr.h>
+#include <linux/workqueue.h>
+
+#include <rdma/ib_pack.h>
+#include <rdma/ib_cache.h>
+#include "sa.h"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("InfiniBand subnet administration query support");
+MODULE_LICENSE("Dual BSD/GPL");
+
+struct ib_sa_sm_ah {
+ struct ib_ah *ah;
+ struct kref ref;
+ u16 pkey_index;
+ u8 src_path_mask;
+};
+
+struct ib_sa_port {
+ struct ib_mad_agent *agent;
+ struct ib_mad_agent *notice_agent;
+ struct ib_sa_sm_ah *sm_ah;
+ struct work_struct update_task;
+ spinlock_t ah_lock;
+ u8 port_num;
+ struct ib_device *device;
+};
+
+struct ib_sa_device {
+ int start_port, end_port;
+ struct ib_event_handler event_handler;
+ struct ib_sa_port port[0];
+};
+
+struct ib_sa_query {
+ void (*callback)(struct ib_sa_query *, int, struct ib_sa_mad *);
+ void (*release)(struct ib_sa_query *);
+ struct ib_sa_client *client;
+ struct ib_sa_port *port;
+ struct ib_mad_send_buf *mad_buf;
+ struct ib_sa_sm_ah *sm_ah;
+ int id;
+};
+
+struct ib_sa_service_query {
+ void (*callback)(int, struct ib_sa_service_rec *, void *);
+ void *context;
+ struct ib_sa_query sa_query;
+};
+
+struct ib_sa_path_query {
+ void (*callback)(int, struct ib_sa_path_rec *, void *);
+ void *context;
+ struct ib_sa_query sa_query;
+};
+
+struct ib_sa_mcmember_query {
+ void (*callback)(int, struct ib_sa_mcmember_rec *, void *);
+ void *context;
+ struct ib_sa_query sa_query;
+};
+
+struct ib_sa_inform_query {
+ void (*callback)(int, struct ib_sa_inform *, void *);
+ void *context;
+ struct ib_sa_query sa_query;
+};
+
+static void ib_sa_add_one(struct ib_device *device);
+static void ib_sa_remove_one(struct ib_device *device);
+
+static struct ib_client sa_client = {
+ .name = "sa",
+ .add = ib_sa_add_one,
+ .remove = ib_sa_remove_one
+};
+
+static spinlock_t idr_lock;
+static DEFINE_IDR(query_idr);
+
+static spinlock_t tid_lock;
+static u32 tid;
+
+#define PATH_REC_FIELD(field) \
+ .struct_offset_bytes = offsetof(struct ib_sa_path_rec, field), \
+ .struct_size_bytes = sizeof ((struct ib_sa_path_rec *) 0)->field, \
+ .field_name = "sa_path_rec:" #field
+
+static const struct ib_field path_rec_table[] = {
+ { PATH_REC_FIELD(service_id),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 64 },
+ { PATH_REC_FIELD(dgid),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { PATH_REC_FIELD(sgid),
+ .offset_words = 6,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { PATH_REC_FIELD(dlid),
+ .offset_words = 10,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { PATH_REC_FIELD(slid),
+ .offset_words = 10,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { PATH_REC_FIELD(raw_traffic),
+ .offset_words = 11,
+ .offset_bits = 0,
+ .size_bits = 1 },
+ { RESERVED,
+ .offset_words = 11,
+ .offset_bits = 1,
+ .size_bits = 3 },
+ { PATH_REC_FIELD(flow_label),
+ .offset_words = 11,
+ .offset_bits = 4,
+ .size_bits = 20 },
+ { PATH_REC_FIELD(hop_limit),
+ .offset_words = 11,
+ .offset_bits = 24,
+ .size_bits = 8 },
+ { PATH_REC_FIELD(traffic_class),
+ .offset_words = 12,
+ .offset_bits = 0,
+ .size_bits = 8 },
+ { PATH_REC_FIELD(reversible),
+ .offset_words = 12,
+ .offset_bits = 8,
+ .size_bits = 1 },
+ { PATH_REC_FIELD(numb_path),
+ .offset_words = 12,
+ .offset_bits = 9,
+ .size_bits = 7 },
+ { PATH_REC_FIELD(pkey),
+ .offset_words = 12,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { PATH_REC_FIELD(qos_class),
+ .offset_words = 13,
+ .offset_bits = 0,
+ .size_bits = 12 },
+ { PATH_REC_FIELD(sl),
+ .offset_words = 13,
+ .offset_bits = 12,
+ .size_bits = 4 },
+ { PATH_REC_FIELD(mtu_selector),
+ .offset_words = 13,
+ .offset_bits = 16,
+ .size_bits = 2 },
+ { PATH_REC_FIELD(mtu),
+ .offset_words = 13,
+ .offset_bits = 18,
+ .size_bits = 6 },
+ { PATH_REC_FIELD(rate_selector),
+ .offset_words = 13,
+ .offset_bits = 24,
+ .size_bits = 2 },
+ { PATH_REC_FIELD(rate),
+ .offset_words = 13,
+ .offset_bits = 26,
+ .size_bits = 6 },
+ { PATH_REC_FIELD(packet_life_time_selector),
+ .offset_words = 14,
+ .offset_bits = 0,
+ .size_bits = 2 },
+ { PATH_REC_FIELD(packet_life_time),
+ .offset_words = 14,
+ .offset_bits = 2,
+ .size_bits = 6 },
+ { PATH_REC_FIELD(preference),
+ .offset_words = 14,
+ .offset_bits = 8,
+ .size_bits = 8 },
+ { RESERVED,
+ .offset_words = 14,
+ .offset_bits = 16,
+ .size_bits = 48 },
+};
+
+#define MCMEMBER_REC_FIELD(field) \
+ .struct_offset_bytes = offsetof(struct ib_sa_mcmember_rec, field), \
+ .struct_size_bytes = sizeof ((struct ib_sa_mcmember_rec *) 0)->field, \
+ .field_name = "sa_mcmember_rec:" #field
+
+static const struct ib_field mcmember_rec_table[] = {
+ { MCMEMBER_REC_FIELD(mgid),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { MCMEMBER_REC_FIELD(port_gid),
+ .offset_words = 4,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { MCMEMBER_REC_FIELD(qkey),
+ .offset_words = 8,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { MCMEMBER_REC_FIELD(mlid),
+ .offset_words = 9,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { MCMEMBER_REC_FIELD(mtu_selector),
+ .offset_words = 9,
+ .offset_bits = 16,
+ .size_bits = 2 },
+ { MCMEMBER_REC_FIELD(mtu),
+ .offset_words = 9,
+ .offset_bits = 18,
+ .size_bits = 6 },
+ { MCMEMBER_REC_FIELD(traffic_class),
+ .offset_words = 9,
+ .offset_bits = 24,
+ .size_bits = 8 },
+ { MCMEMBER_REC_FIELD(pkey),
+ .offset_words = 10,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { MCMEMBER_REC_FIELD(rate_selector),
+ .offset_words = 10,
+ .offset_bits = 16,
+ .size_bits = 2 },
+ { MCMEMBER_REC_FIELD(rate),
+ .offset_words = 10,
+ .offset_bits = 18,
+ .size_bits = 6 },
+ { MCMEMBER_REC_FIELD(packet_life_time_selector),
+ .offset_words = 10,
+ .offset_bits = 24,
+ .size_bits = 2 },
+ { MCMEMBER_REC_FIELD(packet_life_time),
+ .offset_words = 10,
+ .offset_bits = 26,
+ .size_bits = 6 },
+ { MCMEMBER_REC_FIELD(sl),
+ .offset_words = 11,
+ .offset_bits = 0,
+ .size_bits = 4 },
+ { MCMEMBER_REC_FIELD(flow_label),
+ .offset_words = 11,
+ .offset_bits = 4,
+ .size_bits = 20 },
+ { MCMEMBER_REC_FIELD(hop_limit),
+ .offset_words = 11,
+ .offset_bits = 24,
+ .size_bits = 8 },
+ { MCMEMBER_REC_FIELD(scope),
+ .offset_words = 12,
+ .offset_bits = 0,
+ .size_bits = 4 },
+ { MCMEMBER_REC_FIELD(join_state),
+ .offset_words = 12,
+ .offset_bits = 4,
+ .size_bits = 4 },
+ { MCMEMBER_REC_FIELD(proxy_join),
+ .offset_words = 12,
+ .offset_bits = 8,
+ .size_bits = 1 },
+ { RESERVED,
+ .offset_words = 12,
+ .offset_bits = 9,
+ .size_bits = 23 },
+};
+
+#define SERVICE_REC_FIELD(field) \
+ .struct_offset_bytes = offsetof(struct ib_sa_service_rec, field), \
+ .struct_size_bytes = sizeof ((struct ib_sa_service_rec *) 0)->field, \
+ .field_name = "sa_service_rec:" #field
+
+static const struct ib_field service_rec_table[] = {
+ { SERVICE_REC_FIELD(id),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 64 },
+ { SERVICE_REC_FIELD(gid),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { SERVICE_REC_FIELD(pkey),
+ .offset_words = 6,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { SERVICE_REC_FIELD(lease),
+ .offset_words = 7,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { SERVICE_REC_FIELD(key),
+ .offset_words = 8,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { SERVICE_REC_FIELD(name),
+ .offset_words = 12,
+ .offset_bits = 0,
+ .size_bits = 64*8 },
+ { SERVICE_REC_FIELD(data8),
+ .offset_words = 28,
+ .offset_bits = 0,
+ .size_bits = 16*8 },
+ { SERVICE_REC_FIELD(data16),
+ .offset_words = 32,
+ .offset_bits = 0,
+ .size_bits = 8*16 },
+ { SERVICE_REC_FIELD(data32),
+ .offset_words = 36,
+ .offset_bits = 0,
+ .size_bits = 4*32 },
+ { SERVICE_REC_FIELD(data64),
+ .offset_words = 40,
+ .offset_bits = 0,
+ .size_bits = 2*64 },
+};
+
+#define INFORM_FIELD(field) \
+ .struct_offset_bytes = offsetof(struct ib_sa_inform, field), \
+ .struct_size_bytes = sizeof ((struct ib_sa_inform *) 0)->field, \
+ .field_name = "sa_inform:" #field
+
+static const struct ib_field inform_table[] = {
+ { INFORM_FIELD(gid),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { INFORM_FIELD(lid_range_begin),
+ .offset_words = 4,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { INFORM_FIELD(lid_range_end),
+ .offset_words = 4,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { RESERVED,
+ .offset_words = 5,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { INFORM_FIELD(is_generic),
+ .offset_words = 5,
+ .offset_bits = 16,
+ .size_bits = 8 },
+ { INFORM_FIELD(subscribe),
+ .offset_words = 5,
+ .offset_bits = 24,
+ .size_bits = 8 },
+ { INFORM_FIELD(type),
+ .offset_words = 6,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { INFORM_FIELD(trap.generic.trap_num),
+ .offset_words = 6,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { INFORM_FIELD(trap.generic.qpn),
+ .offset_words = 7,
+ .offset_bits = 0,
+ .size_bits = 24 },
+ { RESERVED,
+ .offset_words = 7,
+ .offset_bits = 24,
+ .size_bits = 3 },
+ { INFORM_FIELD(trap.generic.resp_time),
+ .offset_words = 7,
+ .offset_bits = 27,
+ .size_bits = 5 },
+ { RESERVED,
+ .offset_words = 8,
+ .offset_bits = 0,
+ .size_bits = 8 },
+ { INFORM_FIELD(trap.generic.producer_type),
+ .offset_words = 8,
+ .offset_bits = 8,
+ .size_bits = 24 },
+};
+
+#define NOTICE_FIELD(field) \
+ .struct_offset_bytes = offsetof(struct ib_sa_notice, field), \
+ .struct_size_bytes = sizeof ((struct ib_sa_notice *) 0)->field, \
+ .field_name = "sa_notice:" #field
+
+static const struct ib_field notice_table[] = {
+ { NOTICE_FIELD(is_generic),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 1 },
+ { NOTICE_FIELD(type),
+ .offset_words = 0,
+ .offset_bits = 1,
+ .size_bits = 7 },
+ { NOTICE_FIELD(trap.generic.producer_type),
+ .offset_words = 0,
+ .offset_bits = 8,
+ .size_bits = 24 },
+ { NOTICE_FIELD(trap.generic.trap_num),
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { NOTICE_FIELD(issuer_lid),
+ .offset_words = 1,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { NOTICE_FIELD(notice_toggle),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 1 },
+ { NOTICE_FIELD(notice_count),
+ .offset_words = 2,
+ .offset_bits = 1,
+ .size_bits = 15 },
+ { NOTICE_FIELD(data_details),
+ .offset_words = 2,
+ .offset_bits = 16,
+ .size_bits = 432 },
+ { NOTICE_FIELD(issuer_gid),
+ .offset_words = 16,
+ .offset_bits = 0,
+ .size_bits = 128 },
+};
+
+int ib_sa_check_selector(ib_sa_comp_mask comp_mask,
+ ib_sa_comp_mask selector_mask,
+ ib_sa_comp_mask value_mask,
+ u8 selector, u8 src_value, u8 dst_value)
+{
+ int err;
+
+ if (!(comp_mask & selector_mask) || !(comp_mask & value_mask))
+ return 0;
+
+ switch (selector) {
+ case IB_SA_GT:
+ err = (src_value <= dst_value);
+ break;
+ case IB_SA_LT:
+ err = (src_value >= dst_value);
+ break;
+ case IB_SA_EQ:
+ err = (src_value != dst_value);
+ break;
+ default:
+ err = 0;
+ break;
+ }
+
+ return err;
+}
+
+int ib_sa_pack_attr(void *dst, void *src, int attr_id)
+{
+ switch (attr_id) {
+ case IB_SA_ATTR_PATH_REC:
+ ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), src, dst);
+ break;
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
+int ib_sa_unpack_attr(void *dst, void *src, int attr_id)
+{
+ switch (attr_id) {
+ case IB_SA_ATTR_PATH_REC:
+ ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), src, dst);
+ break;
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static void free_sm_ah(struct kref *kref)
+{
+ struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref);
+
+ ib_destroy_ah(sm_ah->ah);
+ kfree(sm_ah);
+}
+
+static void update_sm_ah(struct work_struct *work)
+{
+ struct ib_sa_port *port =
+ container_of(work, struct ib_sa_port, update_task);
+ struct ib_sa_sm_ah *new_ah;
+ struct ib_port_attr port_attr;
+ struct ib_ah_attr ah_attr;
+
+ if (ib_query_port(port->agent->device, port->port_num, &port_attr)) {
+ printk(KERN_WARNING "Couldn't query port\n");
+ return;
+ }
+
+ new_ah = kmalloc(sizeof *new_ah, GFP_KERNEL);
+ if (!new_ah) {
+ printk(KERN_WARNING "Couldn't allocate new SM AH\n");
+ return;
+ }
+
+ kref_init(&new_ah->ref);
+ new_ah->src_path_mask = (1 << port_attr.lmc) - 1;
+
+ new_ah->pkey_index = 0;
+ if (ib_find_pkey(port->agent->device, port->port_num,
+ IB_DEFAULT_PKEY_FULL, &new_ah->pkey_index))
+ printk(KERN_ERR "Couldn't find index for default PKey\n");
+
+ memset(&ah_attr, 0, sizeof ah_attr);
+ ah_attr.dlid = port_attr.sm_lid;
+ ah_attr.sl = port_attr.sm_sl;
+ ah_attr.port_num = port->port_num;
+
+ new_ah->ah = ib_create_ah(port->agent->qp->pd, &ah_attr);
+ if (IS_ERR(new_ah->ah)) {
+ printk(KERN_WARNING "Couldn't create new SM AH\n");
+ kfree(new_ah);
+ return;
+ }
+
+ spin_lock_irq(&port->ah_lock);
+ if (port->sm_ah)
+ kref_put(&port->sm_ah->ref, free_sm_ah);
+ port->sm_ah = new_ah;
+ spin_unlock_irq(&port->ah_lock);
+
+}
+
+static void ib_sa_event(struct ib_event_handler *handler, struct ib_event *event)
+{
+ if (event->event == IB_EVENT_PORT_ERR ||
+ event->event == IB_EVENT_PORT_ACTIVE ||
+ event->event == IB_EVENT_LID_CHANGE ||
+ event->event == IB_EVENT_PKEY_CHANGE ||
+ event->event == IB_EVENT_SM_CHANGE ||
+ event->event == IB_EVENT_CLIENT_REREGISTER) {
+ unsigned long flags;
+ struct ib_sa_device *sa_dev =
+ container_of(handler, typeof(*sa_dev), event_handler);
+ struct ib_sa_port *port =
+ &sa_dev->port[event->element.port_num - sa_dev->start_port];
+
+ if (rdma_port_get_link_layer(handler->device, port->port_num) != IB_LINK_LAYER_INFINIBAND)
+ return;
+
+ spin_lock_irqsave(&port->ah_lock, flags);
+ if (port->sm_ah)
+ kref_put(&port->sm_ah->ref, free_sm_ah);
+ port->sm_ah = NULL;
+ spin_unlock_irqrestore(&port->ah_lock, flags);
+
+ schedule_work(&sa_dev->port[event->element.port_num -
+ sa_dev->start_port].update_task);
+ }
+}
+
+void ib_sa_register_client(struct ib_sa_client *client)
+{
+ atomic_set(&client->users, 1);
+ init_completion(&client->comp);
+}
+EXPORT_SYMBOL(ib_sa_register_client);
+
+void ib_sa_unregister_client(struct ib_sa_client *client)
+{
+ ib_sa_client_put(client);
+ wait_for_completion(&client->comp);
+}
+EXPORT_SYMBOL(ib_sa_unregister_client);
+
+/**
+ * ib_sa_cancel_query - try to cancel an SA query
+ * @id:ID of query to cancel
+ * @query:query pointer to cancel
+ *
+ * Try to cancel an SA query. If the id and query don't match up or
+ * the query has already completed, nothing is done. Otherwise the
+ * query is canceled and will complete with a status of -EINTR.
+ */
+void ib_sa_cancel_query(int id, struct ib_sa_query *query)
+{
+ unsigned long flags;
+ struct ib_mad_agent *agent;
+ struct ib_mad_send_buf *mad_buf;
+
+ spin_lock_irqsave(&idr_lock, flags);
+ if (idr_find(&query_idr, id) != query) {
+ spin_unlock_irqrestore(&idr_lock, flags);
+ return;
+ }
+ agent = query->port->agent;
+ mad_buf = query->mad_buf;
+ spin_unlock_irqrestore(&idr_lock, flags);
+
+ ib_cancel_mad(agent, mad_buf);
+}
+EXPORT_SYMBOL(ib_sa_cancel_query);
+
+static u8 get_src_path_mask(struct ib_device *device, u8 port_num)
+{
+ struct ib_sa_device *sa_dev;
+ struct ib_sa_port *port;
+ unsigned long flags;
+ u8 src_path_mask;
+
+ sa_dev = ib_get_client_data(device, &sa_client);
+ if (!sa_dev)
+ return 0x7f;
+
+ port = &sa_dev->port[port_num - sa_dev->start_port];
+ spin_lock_irqsave(&port->ah_lock, flags);
+ src_path_mask = port->sm_ah ? port->sm_ah->src_path_mask : 0x7f;
+ spin_unlock_irqrestore(&port->ah_lock, flags);
+
+ return src_path_mask;
+}
+
+int ib_init_ah_from_path(struct ib_device *device, u8 port_num,
+ struct ib_sa_path_rec *rec, struct ib_ah_attr *ah_attr)
+{
+ int ret;
+ u16 gid_index;
+ int force_grh;
+
+ memset(ah_attr, 0, sizeof *ah_attr);
+ ah_attr->dlid = be16_to_cpu(rec->dlid);
+ ah_attr->sl = rec->sl;
+ ah_attr->src_path_bits = be16_to_cpu(rec->slid) &
+ get_src_path_mask(device, port_num);
+ ah_attr->port_num = port_num;
+ ah_attr->static_rate = rec->rate;
+
+ force_grh = rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_ETHERNET;
+
+ if (rec->hop_limit > 1 || force_grh) {
+ ah_attr->ah_flags = IB_AH_GRH;
+ ah_attr->grh.dgid = rec->dgid;
+
+ ret = ib_find_cached_gid(device, &rec->sgid, &port_num,
+ &gid_index);
+ if (ret)
+ return ret;
+
+ ah_attr->grh.sgid_index = gid_index;
+ ah_attr->grh.flow_label = be32_to_cpu(rec->flow_label);
+ ah_attr->grh.hop_limit = rec->hop_limit;
+ ah_attr->grh.traffic_class = rec->traffic_class;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(ib_init_ah_from_path);
+
+static int alloc_mad(struct ib_sa_query *query, gfp_t gfp_mask)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&query->port->ah_lock, flags);
+ if (!query->port->sm_ah) {
+ spin_unlock_irqrestore(&query->port->ah_lock, flags);
+ return -EAGAIN;
+ }
+ kref_get(&query->port->sm_ah->ref);
+ query->sm_ah = query->port->sm_ah;
+ spin_unlock_irqrestore(&query->port->ah_lock, flags);
+
+ query->mad_buf = ib_create_send_mad(query->port->agent, 1,
+ query->sm_ah->pkey_index,
+ 0, IB_MGMT_SA_HDR, IB_MGMT_SA_DATA,
+ gfp_mask);
+ if (IS_ERR(query->mad_buf)) {
+ kref_put(&query->sm_ah->ref, free_sm_ah);
+ return -ENOMEM;
+ }
+
+ query->mad_buf->ah = query->sm_ah->ah;
+
+ return 0;
+}
+
+static void free_mad(struct ib_sa_query *query)
+{
+ ib_free_send_mad(query->mad_buf);
+ kref_put(&query->sm_ah->ref, free_sm_ah);
+}
+
+static void init_mad(struct ib_sa_mad *mad, struct ib_mad_agent *agent)
+{
+ unsigned long flags;
+
+ memset(mad, 0, sizeof *mad);
+
+ mad->mad_hdr.base_version = IB_MGMT_BASE_VERSION;
+ mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM;
+ mad->mad_hdr.class_version = IB_SA_CLASS_VERSION;
+
+ spin_lock_irqsave(&tid_lock, flags);
+ mad->mad_hdr.tid =
+ cpu_to_be64(((u64) agent->hi_tid) << 32 | tid++);
+ spin_unlock_irqrestore(&tid_lock, flags);
+}
+
+static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask)
+{
+ unsigned long flags;
+ int ret, id;
+
+retry:
+ if (!idr_pre_get(&query_idr, gfp_mask))
+ return -ENOMEM;
+ spin_lock_irqsave(&idr_lock, flags);
+ ret = idr_get_new(&query_idr, query, &id);
+ spin_unlock_irqrestore(&idr_lock, flags);
+ if (ret == -EAGAIN)
+ goto retry;
+ if (ret)
+ return ret;
+
+ query->mad_buf->timeout_ms = timeout_ms;
+ query->mad_buf->context[0] = query;
+ query->id = id;
+
+ ret = ib_post_send_mad(query->mad_buf, NULL);
+ if (ret) {
+ spin_lock_irqsave(&idr_lock, flags);
+ idr_remove(&query_idr, id);
+ spin_unlock_irqrestore(&idr_lock, flags);
+ }
+
+ /*
+ * It's not safe to dereference query any more, because the
+ * send may already have completed and freed the query in
+ * another context.
+ */
+ return ret ? ret : id;
+}
+
+void ib_sa_unpack_path(void *attribute, struct ib_sa_path_rec *rec)
+{
+ ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), attribute, rec);
+}
+EXPORT_SYMBOL(ib_sa_unpack_path);
+
+static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query,
+ int status,
+ struct ib_sa_mad *mad)
+{
+ struct ib_sa_path_query *query =
+ container_of(sa_query, struct ib_sa_path_query, sa_query);
+
+ if (mad) {
+ struct ib_sa_path_rec rec;
+
+ ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table),
+ mad->data, &rec);
+ query->callback(status, &rec, query->context);
+ } else
+ query->callback(status, NULL, query->context);
+}
+
+static void ib_sa_path_rec_release(struct ib_sa_query *sa_query)
+{
+ kfree(container_of(sa_query, struct ib_sa_path_query, sa_query));
+}
+
+int ib_sa_path_rec_query(struct ib_sa_client *client,
+ struct ib_device *device, u8 port_num,
+ struct ib_sa_path_rec *rec,
+ ib_sa_comp_mask comp_mask,
+ int timeout_ms, gfp_t gfp_mask,
+ void (*callback)(int status,
+ struct ib_sa_path_rec *resp,
+ void *context),
+ void *context,
+ struct ib_sa_query **sa_query)
+{
+ struct ib_sa_path_query *query;
+ struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+ struct ib_sa_port *port;
+ struct ib_mad_agent *agent;
+ struct ib_sa_mad *mad;
+ int ret;
+
+ if (!sa_dev)
+ return -ENODEV;
+
+ port = &sa_dev->port[port_num - sa_dev->start_port];
+ agent = port->agent;
+
+ query = kmalloc(sizeof *query, gfp_mask);
+ if (!query)
+ return -ENOMEM;
+
+ query->sa_query.port = port;
+ ret = alloc_mad(&query->sa_query, gfp_mask);
+ if (ret)
+ goto err1;
+
+ ib_sa_client_get(client);
+ query->sa_query.client = client;
+ query->callback = callback;
+ query->context = context;
+
+ mad = query->sa_query.mad_buf->mad;
+ init_mad(mad, agent);
+
+ query->sa_query.callback = callback ? ib_sa_path_rec_callback : NULL;
+ query->sa_query.release = ib_sa_path_rec_release;
+ mad->mad_hdr.method = IB_MGMT_METHOD_GET;
+ mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_PATH_REC);
+ mad->sa_hdr.comp_mask = comp_mask;
+
+ ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), rec, mad->data);
+
+ *sa_query = &query->sa_query;
+
+ ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
+ if (ret < 0)
+ goto err2;
+
+ return ret;
+
+err2:
+ *sa_query = NULL;
+ ib_sa_client_put(query->sa_query.client);
+ free_mad(&query->sa_query);
+
+err1:
+ kfree(query);
+ return ret;
+}
+
+static void ib_sa_service_rec_callback(struct ib_sa_query *sa_query,
+ int status,
+ struct ib_sa_mad *mad)
+{
+ struct ib_sa_service_query *query =
+ container_of(sa_query, struct ib_sa_service_query, sa_query);
+
+ if (mad) {
+ struct ib_sa_service_rec rec;
+
+ ib_unpack(service_rec_table, ARRAY_SIZE(service_rec_table),
+ mad->data, &rec);
+ query->callback(status, &rec, query->context);
+ } else
+ query->callback(status, NULL, query->context);
+}
+
+static void ib_sa_service_rec_release(struct ib_sa_query *sa_query)
+{
+ kfree(container_of(sa_query, struct ib_sa_service_query, sa_query));
+}
+
+/**
+ * ib_sa_service_rec_query - Start Service Record operation
+ * @client:SA client
+ * @device:device to send request on
+ * @port_num: port number to send request on
+ * @method:SA method - should be get, set, or delete
+ * @rec:Service Record to send in request
+ * @comp_mask:component mask to send in request
+ * @timeout_ms:time to wait for response
+ * @gfp_mask:GFP mask to use for internal allocations
+ * @callback:function called when request completes, times out or is
+ * canceled
+ * @context:opaque user context passed to callback
+ * @sa_query:request context, used to cancel request
+ *
+ * Send a Service Record set/get/delete to the SA to register,
+ * unregister or query a service record.
+ * The callback function will be called when the request completes (or
+ * fails); status is 0 for a successful response, -EINTR if the query
+ * is canceled, -ETIMEDOUT is the query timed out, or -EIO if an error
+ * occurred sending the query. The resp parameter of the callback is
+ * only valid if status is 0.
+ *
+ * If the return value of ib_sa_service_rec_query() is negative, it is an
+ * error code. Otherwise it is a request ID that can be used to cancel
+ * the query.
+ */
+int ib_sa_service_rec_query(struct ib_sa_client *client,
+ struct ib_device *device, u8 port_num, u8 method,
+ struct ib_sa_service_rec *rec,
+ ib_sa_comp_mask comp_mask,
+ int timeout_ms, gfp_t gfp_mask,
+ void (*callback)(int status,
+ struct ib_sa_service_rec *resp,
+ void *context),
+ void *context,
+ struct ib_sa_query **sa_query)
+{
+ struct ib_sa_service_query *query;
+ struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+ struct ib_sa_port *port;
+ struct ib_mad_agent *agent;
+ struct ib_sa_mad *mad;
+ int ret;
+
+ if (!sa_dev)
+ return -ENODEV;
+
+ port = &sa_dev->port[port_num - sa_dev->start_port];
+ agent = port->agent;
+
+ if (method != IB_MGMT_METHOD_GET &&
+ method != IB_MGMT_METHOD_SET &&
+ method != IB_SA_METHOD_DELETE)
+ return -EINVAL;
+
+ query = kmalloc(sizeof *query, gfp_mask);
+ if (!query)
+ return -ENOMEM;
+
+ query->sa_query.port = port;
+ ret = alloc_mad(&query->sa_query, gfp_mask);
+ if (ret)
+ goto err1;
+
+ ib_sa_client_get(client);
+ query->sa_query.client = client;
+ query->callback = callback;
+ query->context = context;
+
+ mad = query->sa_query.mad_buf->mad;
+ init_mad(mad, agent);
+
+ query->sa_query.callback = callback ? ib_sa_service_rec_callback : NULL;
+ query->sa_query.release = ib_sa_service_rec_release;
+ mad->mad_hdr.method = method;
+ mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_SERVICE_REC);
+ mad->sa_hdr.comp_mask = comp_mask;
+
+ ib_pack(service_rec_table, ARRAY_SIZE(service_rec_table),
+ rec, mad->data);
+
+ *sa_query = &query->sa_query;
+
+ ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
+ if (ret < 0)
+ goto err2;
+
+ return ret;
+
+err2:
+ *sa_query = NULL;
+ ib_sa_client_put(query->sa_query.client);
+ free_mad(&query->sa_query);
+
+err1:
+ kfree(query);
+ return ret;
+}
+EXPORT_SYMBOL(ib_sa_service_rec_query);
+
+static void ib_sa_mcmember_rec_callback(struct ib_sa_query *sa_query,
+ int status,
+ struct ib_sa_mad *mad)
+{
+ struct ib_sa_mcmember_query *query =
+ container_of(sa_query, struct ib_sa_mcmember_query, sa_query);
+
+ if (mad) {
+ struct ib_sa_mcmember_rec rec;
+
+ ib_unpack(mcmember_rec_table, ARRAY_SIZE(mcmember_rec_table),
+ mad->data, &rec);
+ query->callback(status, &rec, query->context);
+ } else
+ query->callback(status, NULL, query->context);
+}
+
+static void ib_sa_mcmember_rec_release(struct ib_sa_query *sa_query)
+{
+ kfree(container_of(sa_query, struct ib_sa_mcmember_query, sa_query));
+}
+
+int ib_sa_mcmember_rec_query(struct ib_sa_client *client,
+ struct ib_device *device, u8 port_num,
+ u8 method,
+ struct ib_sa_mcmember_rec *rec,
+ ib_sa_comp_mask comp_mask,
+ int timeout_ms, gfp_t gfp_mask,
+ void (*callback)(int status,
+ struct ib_sa_mcmember_rec *resp,
+ void *context),
+ void *context,
+ struct ib_sa_query **sa_query)
+{
+ struct ib_sa_mcmember_query *query;
+ struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+ struct ib_sa_port *port;
+ struct ib_mad_agent *agent;
+ struct ib_sa_mad *mad;
+ int ret;
+
+ if (!sa_dev)
+ return -ENODEV;
+
+ port = &sa_dev->port[port_num - sa_dev->start_port];
+ agent = port->agent;
+
+ query = kmalloc(sizeof *query, gfp_mask);
+ if (!query)
+ return -ENOMEM;
+
+ query->sa_query.port = port;
+ ret = alloc_mad(&query->sa_query, gfp_mask);
+ if (ret)
+ goto err1;
+
+ ib_sa_client_get(client);
+ query->sa_query.client = client;
+ query->callback = callback;
+ query->context = context;
+
+ mad = query->sa_query.mad_buf->mad;
+ init_mad(mad, agent);
+
+ query->sa_query.callback = callback ? ib_sa_mcmember_rec_callback : NULL;
+ query->sa_query.release = ib_sa_mcmember_rec_release;
+ mad->mad_hdr.method = method;
+ mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC);
+ mad->sa_hdr.comp_mask = comp_mask;
+
+ ib_pack(mcmember_rec_table, ARRAY_SIZE(mcmember_rec_table),
+ rec, mad->data);
+
+ *sa_query = &query->sa_query;
+
+ ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
+ if (ret < 0)
+ goto err2;
+
+ return ret;
+
+err2:
+ *sa_query = NULL;
+ ib_sa_client_put(query->sa_query.client);
+ free_mad(&query->sa_query);
+
+err1:
+ kfree(query);
+ return ret;
+}
+
+static void ib_sa_inform_callback(struct ib_sa_query *sa_query,
+ int status,
+ struct ib_sa_mad *mad)
+{
+ struct ib_sa_inform_query *query =
+ container_of(sa_query, struct ib_sa_inform_query, sa_query);
+
+ if (mad) {
+ struct ib_sa_inform rec;
+
+ ib_unpack(inform_table, ARRAY_SIZE(inform_table),
+ mad->data, &rec);
+ query->callback(status, &rec, query->context);
+ } else
+ query->callback(status, NULL, query->context);
+}
+
+static void ib_sa_inform_release(struct ib_sa_query *sa_query)
+{
+ kfree(container_of(sa_query, struct ib_sa_inform_query, sa_query));
+}
+
+/**
+ * ib_sa_informinfo_query - Start an InformInfo registration.
+ * @client:SA client
+ * @device:device to send query on
+ * @port_num: port number to send query on
+ * @rec:Inform record to send in query
+ * @timeout_ms:time to wait for response
+ * @gfp_mask:GFP mask to use for internal allocations
+ * @callback:function called when notice handler registration completes,
+ * times out or is canceled
+ * @context:opaque user context passed to callback
+ * @sa_query:query context, used to cancel query
+ *
+ * This function sends inform info to register with SA to receive
+ * in-service notice.
+ * The callback function will be called when the query completes (or
+ * fails); status is 0 for a successful response, -EINTR if the query
+ * is canceled, -ETIMEDOUT is the query timed out, or -EIO if an error
+ * occurred sending the query. The resp parameter of the callback is
+ * only valid if status is 0.
+ *
+ * If the return value of ib_sa_inform_query() is negative, it is an
+ * error code. Otherwise it is a query ID that can be used to cancel
+ * the query.
+ */
+int ib_sa_informinfo_query(struct ib_sa_client *client,
+ struct ib_device *device, u8 port_num,
+ struct ib_sa_inform *rec,
+ int timeout_ms, gfp_t gfp_mask,
+ void (*callback)(int status,
+ struct ib_sa_inform *resp,
+ void *context),
+ void *context,
+ struct ib_sa_query **sa_query)
+{
+ struct ib_sa_inform_query *query;
+ struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+ struct ib_sa_port *port;
+ struct ib_mad_agent *agent;
+ struct ib_sa_mad *mad;
+ int ret;
+
+ if (!sa_dev)
+ return -ENODEV;
+
+ port = &sa_dev->port[port_num - sa_dev->start_port];
+ agent = port->agent;
+
+ query = kmalloc(sizeof *query, gfp_mask);
+ if (!query)
+ return -ENOMEM;
+
+ query->sa_query.port = port;
+ ret = alloc_mad(&query->sa_query, gfp_mask);
+ if (ret)
+ goto err1;
+
+ ib_sa_client_get(client);
+ query->sa_query.client = client;
+ query->callback = callback;
+ query->context = context;
+
+ mad = query->sa_query.mad_buf->mad;
+ init_mad(mad, agent);
+
+ query->sa_query.callback = callback ? ib_sa_inform_callback : NULL;
+ query->sa_query.release = ib_sa_inform_release;
+ query->sa_query.port = port;
+ mad->mad_hdr.method = IB_MGMT_METHOD_SET;
+ mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_INFORM_INFO);
+
+ ib_pack(inform_table, ARRAY_SIZE(inform_table), rec, mad->data);
+
+ *sa_query = &query->sa_query;
+ ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
+ if (ret < 0)
+ goto err2;
+
+ return ret;
+
+err2:
+ *sa_query = NULL;
+ ib_sa_client_put(query->sa_query.client);
+ free_mad(&query->sa_query);
+err1:
+ kfree(query);
+ return ret;
+}
+
+static void ib_sa_notice_resp(struct ib_sa_port *port,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_mad_send_buf *mad_buf;
+ struct ib_sa_mad *mad;
+ int ret;
+ unsigned long flags;
+
+ mad_buf = ib_create_send_mad(port->notice_agent, 1, 0, 0,
+ IB_MGMT_SA_HDR, IB_MGMT_SA_DATA,
+ GFP_KERNEL);
+ if (IS_ERR(mad_buf))
+ return;
+
+ mad = mad_buf->mad;
+ memcpy(mad, mad_recv_wc->recv_buf.mad, sizeof *mad);
+ mad->mad_hdr.method = IB_MGMT_METHOD_REPORT_RESP;
+
+ spin_lock_irqsave(&port->ah_lock, flags);
+ if (!port->sm_ah) {
+ spin_unlock_irqrestore(&port->ah_lock, flags);
+ ib_free_send_mad(mad_buf);
+ return;
+ }
+ kref_get(&port->sm_ah->ref);
+ mad_buf->context[0] = &port->sm_ah->ref;
+ mad_buf->ah = port->sm_ah->ah;
+ spin_unlock_irqrestore(&port->ah_lock, flags);
+
+ ret = ib_post_send_mad(mad_buf, NULL);
+ if (ret)
+ goto err;
+
+ return;
+err:
+ kref_put(mad_buf->context[0], free_sm_ah);
+ ib_free_send_mad(mad_buf);
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ struct ib_sa_query *query = mad_send_wc->send_buf->context[0];
+ unsigned long flags;
+
+ if (query->callback)
+ switch (mad_send_wc->status) {
+ case IB_WC_SUCCESS:
+ /* No callback -- already got recv */
+ break;
+ case IB_WC_RESP_TIMEOUT_ERR:
+ query->callback(query, -ETIMEDOUT, NULL);
+ break;
+ case IB_WC_WR_FLUSH_ERR:
+ query->callback(query, -EINTR, NULL);
+ break;
+ default:
+ query->callback(query, -EIO, NULL);
+ break;
+ }
+
+ spin_lock_irqsave(&idr_lock, flags);
+ idr_remove(&query_idr, query->id);
+ spin_unlock_irqrestore(&idr_lock, flags);
+
+ free_mad(query);
+ ib_sa_client_put(query->client);
+ query->release(query);
+}
+
+static void recv_handler(struct ib_mad_agent *mad_agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_sa_query *query;
+ struct ib_mad_send_buf *mad_buf;
+
+ mad_buf = (void *) (unsigned long) mad_recv_wc->wc->wr_id;
+ query = mad_buf->context[0];
+
+ if (query->callback) {
+ if (mad_recv_wc->wc->status == IB_WC_SUCCESS)
+ query->callback(query,
+ mad_recv_wc->recv_buf.mad->mad_hdr.status ?
+ -EINVAL : 0,
+ (struct ib_sa_mad *) mad_recv_wc->recv_buf.mad);
+ else
+ query->callback(query, -EIO, NULL);
+ }
+
+ ib_free_recv_mad(mad_recv_wc);
+}
+
+static void notice_resp_handler(struct ib_mad_agent *agent,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ kref_put(mad_send_wc->send_buf->context[0], free_sm_ah);
+ ib_free_send_mad(mad_send_wc->send_buf);
+}
+
+static void notice_handler(struct ib_mad_agent *mad_agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_sa_port *port;
+ struct ib_sa_mad *mad;
+ struct ib_sa_notice notice;
+
+ port = mad_agent->context;
+ mad = (struct ib_sa_mad *) mad_recv_wc->recv_buf.mad;
+ ib_unpack(notice_table, ARRAY_SIZE(notice_table), mad->data, &notice);
+
+ if (!notice_dispatch(port->device, port->port_num, &notice))
+ ib_sa_notice_resp(port, mad_recv_wc);
+ ib_free_recv_mad(mad_recv_wc);
+}
+
+static void ib_sa_add_one(struct ib_device *device)
+{
+ struct ib_sa_device *sa_dev;
+ struct ib_mad_reg_req reg_req = {
+ .mgmt_class = IB_MGMT_CLASS_SUBN_ADM,
+ .mgmt_class_version = 2
+ };
+ int s, e, i;
+
+ if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+ return;
+
+ if (device->node_type == RDMA_NODE_IB_SWITCH)
+ s = e = 0;
+ else {
+ s = 1;
+ e = device->phys_port_cnt;
+ }
+
+ sa_dev = kzalloc(sizeof *sa_dev +
+ (e - s + 1) * sizeof (struct ib_sa_port),
+ GFP_KERNEL);
+ if (!sa_dev)
+ return;
+
+ sa_dev->start_port = s;
+ sa_dev->end_port = e;
+
+ for (i = 0; i <= e - s; ++i) {
+ spin_lock_init(&sa_dev->port[i].ah_lock);
+ if (rdma_port_get_link_layer(device, i + 1) != IB_LINK_LAYER_INFINIBAND)
+ continue;
+
+ sa_dev->port[i].sm_ah = NULL;
+ sa_dev->port[i].port_num = i + s;
+
+ sa_dev->port[i].agent =
+ ib_register_mad_agent(device, i + s, IB_QPT_GSI,
+ NULL, 0, send_handler,
+ recv_handler, sa_dev);
+ if (IS_ERR(sa_dev->port[i].agent))
+ goto err;
+
+ sa_dev->port[i].device = device;
+ set_bit(IB_MGMT_METHOD_REPORT, reg_req.method_mask);
+ sa_dev->port[i].notice_agent =
+ ib_register_mad_agent(device, i + s, IB_QPT_GSI,
+ &reg_req, 0, notice_resp_handler,
+ notice_handler, &sa_dev->port[i]);
+
+ if (IS_ERR(sa_dev->port[i].notice_agent))
+ goto err;
+
+ INIT_WORK(&sa_dev->port[i].update_task, update_sm_ah);
+ }
+
+ ib_set_client_data(device, &sa_client, sa_dev);
+
+ /*
+ * We register our event handler after everything is set up,
+ * and then update our cached info after the event handler is
+ * registered to avoid any problems if a port changes state
+ * during our initialization.
+ */
+
+ INIT_IB_EVENT_HANDLER(&sa_dev->event_handler, device, ib_sa_event);
+ if (ib_register_event_handler(&sa_dev->event_handler))
+ goto err;
+
+ for (i = 0; i <= e - s; ++i)
+ if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND)
+ update_sm_ah(&sa_dev->port[i].update_task);
+
+ return;
+
+err:
+ while (--i >= 0)
+ if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND) {
+ if (!IS_ERR(sa_dev->port[i].notice_agent))
+ ib_unregister_mad_agent(sa_dev->port[i].notice_agent);
+ if (!IS_ERR(sa_dev->port[i].agent))
+ ib_unregister_mad_agent(sa_dev->port[i].agent);
+ }
+
+ kfree(sa_dev);
+
+ return;
+}
+
+static void ib_sa_remove_one(struct ib_device *device)
+{
+ struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+ int i;
+
+ if (!sa_dev)
+ return;
+
+ ib_unregister_event_handler(&sa_dev->event_handler);
+
+ flush_scheduled_work();
+
+ for (i = 0; i <= sa_dev->end_port - sa_dev->start_port; ++i) {
+ if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND) {
+ ib_unregister_mad_agent(sa_dev->port[i].notice_agent);
+ ib_unregister_mad_agent(sa_dev->port[i].agent);
+ if (sa_dev->port[i].sm_ah)
+ kref_put(&sa_dev->port[i].sm_ah->ref, free_sm_ah);
+ }
+
+ }
+
+ kfree(sa_dev);
+}
+
+static int __init ib_sa_init(void)
+{
+ int ret;
+
+ spin_lock_init(&idr_lock);
+ spin_lock_init(&tid_lock);
+
+ get_random_bytes(&tid, sizeof tid);
+
+ ret = ib_register_client(&sa_client);
+ if (ret) {
+ printk(KERN_ERR "Couldn't register ib_sa client\n");
+ goto err1;
+ }
+
+ ret = mcast_init();
+ if (ret) {
+ printk(KERN_ERR "Couldn't initialize multicast handling\n");
+ goto err2;
+ }
+
+ ret = notice_init();
+ if (ret) {
+ printk(KERN_ERR "Couldn't initialize notice handling\n");
+ goto err3;
+ }
+
+ ret = sa_db_init();
+ if (ret) {
+ printk(KERN_ERR "Couldn't initialize local SA\n");
+ goto err4;
+ }
+
+ return 0;
+err4:
+ notice_cleanup();
+err3:
+ mcast_cleanup();
+err2:
+ ib_unregister_client(&sa_client);
+err1:
+ return ret;
+}
+
+static void __exit ib_sa_cleanup(void)
+{
+ sa_db_cleanup();
+ mcast_cleanup();
+ notice_cleanup();
+ ib_unregister_client(&sa_client);
+ idr_destroy(&query_idr);
+}
+
+module_init_order(ib_sa_init, SI_ORDER_SECOND);
+module_exit(ib_sa_cleanup);
diff --git a/sys/ofed/drivers/infiniband/core/smi.c b/sys/ofed/drivers/infiniband/core/smi.c
new file mode 100644
index 0000000..8723675
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/smi.c
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2004, 2005 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004, 2005 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004, 2005 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004-2007 Voltaire Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <rdma/ib_smi.h>
+#include "smi.h"
+
+/*
+ * Fixup a directed route SMP for sending
+ * Return 0 if the SMP should be discarded
+ */
+enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp,
+ u8 node_type, int port_num)
+{
+ u8 hop_ptr, hop_cnt;
+
+ hop_ptr = smp->hop_ptr;
+ hop_cnt = smp->hop_cnt;
+
+ /* See section 14.2.2.2, Vol 1 IB spec */
+ if (!ib_get_smp_direction(smp)) {
+ /* C14-9:1 */
+ if (hop_cnt && hop_ptr == 0) {
+ smp->hop_ptr++;
+ return (smp->initial_path[smp->hop_ptr] ==
+ port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-9:2 */
+ if (hop_ptr && hop_ptr < hop_cnt) {
+ if (node_type != RDMA_NODE_IB_SWITCH)
+ return IB_SMI_DISCARD;
+
+ /* smp->return_path set when received */
+ smp->hop_ptr++;
+ return (smp->initial_path[smp->hop_ptr] ==
+ port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-9:3 -- We're at the end of the DR segment of path */
+ if (hop_ptr == hop_cnt) {
+ /* smp->return_path set when received */
+ smp->hop_ptr++;
+ return (node_type == RDMA_NODE_IB_SWITCH ||
+ smp->dr_dlid == IB_LID_PERMISSIVE ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */
+ /* C14-9:5 -- Fail unreasonable hop pointer */
+ return (hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+
+ } else {
+ /* C14-13:1 */
+ if (hop_cnt && hop_ptr == hop_cnt + 1) {
+ smp->hop_ptr--;
+ return (smp->return_path[smp->hop_ptr] ==
+ port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-13:2 */
+ if (2 <= hop_ptr && hop_ptr <= hop_cnt) {
+ if (node_type != RDMA_NODE_IB_SWITCH)
+ return IB_SMI_DISCARD;
+
+ smp->hop_ptr--;
+ return (smp->return_path[smp->hop_ptr] ==
+ port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-13:3 -- at the end of the DR segment of path */
+ if (hop_ptr == 1) {
+ smp->hop_ptr--;
+ /* C14-13:3 -- SMPs destined for SM shouldn't be here */
+ return (node_type == RDMA_NODE_IB_SWITCH ||
+ smp->dr_slid == IB_LID_PERMISSIVE ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-13:4 -- hop_ptr = 0 -> should have gone to SM */
+ if (hop_ptr == 0)
+ return IB_SMI_HANDLE;
+
+ /* C14-13:5 -- Check for unreasonable hop pointer */
+ return IB_SMI_DISCARD;
+ }
+}
+
+/*
+ * Adjust information for a received SMP
+ * Return 0 if the SMP should be dropped
+ */
+enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, u8 node_type,
+ int port_num, int phys_port_cnt)
+{
+ u8 hop_ptr, hop_cnt;
+
+ hop_ptr = smp->hop_ptr;
+ hop_cnt = smp->hop_cnt;
+
+ /* See section 14.2.2.2, Vol 1 IB spec */
+ if (!ib_get_smp_direction(smp)) {
+ /* C14-9:1 -- sender should have incremented hop_ptr */
+ if (hop_cnt && hop_ptr == 0)
+ return IB_SMI_DISCARD;
+
+ /* C14-9:2 -- intermediate hop */
+ if (hop_ptr && hop_ptr < hop_cnt) {
+ if (node_type != RDMA_NODE_IB_SWITCH)
+ return IB_SMI_DISCARD;
+
+ smp->return_path[hop_ptr] = port_num;
+ /* smp->hop_ptr updated when sending */
+ return (smp->initial_path[hop_ptr+1] <= phys_port_cnt ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-9:3 -- We're at the end of the DR segment of path */
+ if (hop_ptr == hop_cnt) {
+ if (hop_cnt)
+ smp->return_path[hop_ptr] = port_num;
+ /* smp->hop_ptr updated when sending */
+
+ return (node_type == RDMA_NODE_IB_SWITCH ||
+ smp->dr_dlid == IB_LID_PERMISSIVE ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */
+ /* C14-9:5 -- fail unreasonable hop pointer */
+ return (hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+
+ } else {
+
+ /* C14-13:1 */
+ if (hop_cnt && hop_ptr == hop_cnt + 1) {
+ smp->hop_ptr--;
+ return (smp->return_path[smp->hop_ptr] ==
+ port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-13:2 */
+ if (2 <= hop_ptr && hop_ptr <= hop_cnt) {
+ if (node_type != RDMA_NODE_IB_SWITCH)
+ return IB_SMI_DISCARD;
+
+ /* smp->hop_ptr updated when sending */
+ return (smp->return_path[hop_ptr-1] <= phys_port_cnt ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-13:3 -- We're at the end of the DR segment of path */
+ if (hop_ptr == 1) {
+ if (smp->dr_slid == IB_LID_PERMISSIVE) {
+ /* giving SMP to SM - update hop_ptr */
+ smp->hop_ptr--;
+ return IB_SMI_HANDLE;
+ }
+ /* smp->hop_ptr updated when sending */
+ return (node_type == RDMA_NODE_IB_SWITCH ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-13:4 -- hop_ptr = 0 -> give to SM */
+ /* C14-13:5 -- Check for unreasonable hop pointer */
+ return (hop_ptr == 0 ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+}
+
+enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp)
+{
+ u8 hop_ptr, hop_cnt;
+
+ hop_ptr = smp->hop_ptr;
+ hop_cnt = smp->hop_cnt;
+
+ if (!ib_get_smp_direction(smp)) {
+ /* C14-9:2 -- intermediate hop */
+ if (hop_ptr && hop_ptr < hop_cnt)
+ return IB_SMI_FORWARD;
+
+ /* C14-9:3 -- at the end of the DR segment of path */
+ if (hop_ptr == hop_cnt)
+ return (smp->dr_dlid == IB_LID_PERMISSIVE ?
+ IB_SMI_SEND : IB_SMI_LOCAL);
+
+ /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */
+ if (hop_ptr == hop_cnt + 1)
+ return IB_SMI_SEND;
+ } else {
+ /* C14-13:2 -- intermediate hop */
+ if (2 <= hop_ptr && hop_ptr <= hop_cnt)
+ return IB_SMI_FORWARD;
+
+ /* C14-13:3 -- at the end of the DR segment of path */
+ if (hop_ptr == 1)
+ return (smp->dr_slid != IB_LID_PERMISSIVE ?
+ IB_SMI_SEND : IB_SMI_LOCAL);
+ }
+ return IB_SMI_LOCAL;
+}
+
+/*
+ * Return the forwarding port number from initial_path for outgoing SMP and
+ * from return_path for returning SMP
+ */
+int smi_get_fwd_port(struct ib_smp *smp)
+{
+ return (!ib_get_smp_direction(smp) ? smp->initial_path[smp->hop_ptr+1] :
+ smp->return_path[smp->hop_ptr-1]);
+}
diff --git a/sys/ofed/drivers/infiniband/core/smi.h b/sys/ofed/drivers/infiniband/core/smi.h
new file mode 100644
index 0000000..aff96ba
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/smi.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004-2007 Voltaire Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef __SMI_H_
+#define __SMI_H_
+
+#include <rdma/ib_smi.h>
+
+enum smi_action {
+ IB_SMI_DISCARD,
+ IB_SMI_HANDLE
+};
+
+enum smi_forward_action {
+ IB_SMI_LOCAL, /* SMP should be completed up the stack */
+ IB_SMI_SEND, /* received DR SMP should be forwarded to the send queue */
+ IB_SMI_FORWARD /* SMP should be forwarded (for switches only) */
+};
+
+enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, u8 node_type,
+ int port_num, int phys_port_cnt);
+int smi_get_fwd_port(struct ib_smp *smp);
+extern enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp);
+extern enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp,
+ u8 node_type, int port_num);
+
+/*
+ * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM
+ * via process_mad
+ */
+static inline enum smi_action smi_check_local_smp(struct ib_smp *smp,
+ struct ib_device *device)
+{
+ /* C14-9:3 -- We're at the end of the DR segment of path */
+ /* C14-9:4 -- Hop Pointer = Hop Count + 1 -> give to SMA/SM */
+ return ((device->process_mad &&
+ !ib_get_smp_direction(smp) &&
+ (smp->hop_ptr == smp->hop_cnt + 1)) ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD);
+}
+
+/*
+ * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM
+ * via process_mad
+ */
+static inline enum smi_action smi_check_local_returning_smp(struct ib_smp *smp,
+ struct ib_device *device)
+{
+ /* C14-13:3 -- We're at the end of the DR segment of path */
+ /* C14-13:4 -- Hop Pointer == 0 -> give to SM */
+ return ((device->process_mad &&
+ ib_get_smp_direction(smp) &&
+ !smp->hop_ptr) ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+}
+
+#endif /* __SMI_H_ */
diff --git a/sys/ofed/drivers/infiniband/core/sysfs.c b/sys/ofed/drivers/infiniband/core/sysfs.c
new file mode 100644
index 0000000..a406406
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/sysfs.c
@@ -0,0 +1,911 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "core_priv.h"
+
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include <rdma/ib_mad.h>
+
+struct ib_port {
+ struct kobject kobj;
+ struct ib_device *ibdev;
+ struct attribute_group gid_group;
+ struct attribute_group pkey_group;
+ u8 port_num;
+};
+
+struct port_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct ib_port *, struct port_attribute *, char *buf);
+ ssize_t (*store)(struct ib_port *, struct port_attribute *,
+ const char *buf, size_t count);
+};
+
+#define PORT_ATTR(_name, _mode, _show, _store) \
+struct port_attribute port_attr_##_name = __ATTR(_name, _mode, _show, _store)
+
+#define PORT_ATTR_RO(_name) \
+struct port_attribute port_attr_##_name = __ATTR_RO(_name)
+
+struct port_table_attribute {
+ struct port_attribute attr;
+ char name[8];
+ int index;
+};
+
+static ssize_t port_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct port_attribute *port_attr =
+ container_of(attr, struct port_attribute, attr);
+ struct ib_port *p = container_of(kobj, struct ib_port, kobj);
+
+ if (!port_attr->show)
+ return -EIO;
+
+ return port_attr->show(p, port_attr, buf);
+}
+
+static const struct sysfs_ops port_sysfs_ops = {
+ .show = port_attr_show
+};
+
+static ssize_t state_show(struct ib_port *p, struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+ ssize_t ret;
+
+ static const char *state_name[] = {
+ [IB_PORT_NOP] = "NOP",
+ [IB_PORT_DOWN] = "DOWN",
+ [IB_PORT_INIT] = "INIT",
+ [IB_PORT_ARMED] = "ARMED",
+ [IB_PORT_ACTIVE] = "ACTIVE",
+ [IB_PORT_ACTIVE_DEFER] = "ACTIVE_DEFER"
+ };
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "%d: %s\n", attr.state,
+ attr.state >= 0 && attr.state < ARRAY_SIZE(state_name) ?
+ state_name[attr.state] : "UNKNOWN");
+}
+
+static ssize_t lid_show(struct ib_port *p, struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+ ssize_t ret;
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "0x%x\n", attr.lid);
+}
+
+static ssize_t lid_mask_count_show(struct ib_port *p,
+ struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+ ssize_t ret;
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "%d\n", attr.lmc);
+}
+
+static ssize_t sm_lid_show(struct ib_port *p, struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+ ssize_t ret;
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "0x%x\n", attr.sm_lid);
+}
+
+static ssize_t sm_sl_show(struct ib_port *p, struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+ ssize_t ret;
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "%d\n", attr.sm_sl);
+}
+
+static ssize_t cap_mask_show(struct ib_port *p, struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+ ssize_t ret;
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "0x%08x\n", attr.port_cap_flags);
+}
+
+static ssize_t rate_show(struct ib_port *p, struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+ char *speed = "";
+ int rate;
+ ssize_t ret;
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ switch (attr.active_speed) {
+ case 2: speed = " DDR"; break;
+ case 4: speed = " QDR"; break;
+ }
+
+ rate = 25 * ib_width_enum_to_int(attr.active_width) * attr.active_speed;
+ if (rate < 0)
+ return -EINVAL;
+
+ return sprintf(buf, "%d%s Gb/sec (%dX%s)\n",
+ rate / 10, rate % 10 ? ".5" : "",
+ ib_width_enum_to_int(attr.active_width), speed);
+}
+
+static ssize_t phys_state_show(struct ib_port *p, struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+
+ ssize_t ret;
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ switch (attr.phys_state) {
+ case 1: return sprintf(buf, "1: Sleep\n");
+ case 2: return sprintf(buf, "2: Polling\n");
+ case 3: return sprintf(buf, "3: Disabled\n");
+ case 4: return sprintf(buf, "4: PortConfigurationTraining\n");
+ case 5: return sprintf(buf, "5: LinkUp\n");
+ case 6: return sprintf(buf, "6: LinkErrorRecovery\n");
+ case 7: return sprintf(buf, "7: Phy Test\n");
+ default: return sprintf(buf, "%d: <unknown>\n", attr.phys_state);
+ }
+}
+
+static ssize_t link_layer_show(struct ib_port *p, struct port_attribute *unused,
+ char *buf)
+{
+ switch (rdma_port_get_link_layer(p->ibdev, p->port_num)) {
+ case IB_LINK_LAYER_INFINIBAND:
+ return sprintf(buf, "%s\n", "IB");
+ case IB_LINK_LAYER_ETHERNET:
+ return sprintf(buf, "%s\n", "Ethernet");
+ default:
+ return sprintf(buf, "%s\n", "Unknown");
+ }
+}
+
+static PORT_ATTR_RO(state);
+static PORT_ATTR_RO(lid);
+static PORT_ATTR_RO(lid_mask_count);
+static PORT_ATTR_RO(sm_lid);
+static PORT_ATTR_RO(sm_sl);
+static PORT_ATTR_RO(cap_mask);
+static PORT_ATTR_RO(rate);
+static PORT_ATTR_RO(phys_state);
+static PORT_ATTR_RO(link_layer);
+
+static struct attribute *port_default_attrs[] = {
+ &port_attr_state.attr,
+ &port_attr_lid.attr,
+ &port_attr_lid_mask_count.attr,
+ &port_attr_sm_lid.attr,
+ &port_attr_sm_sl.attr,
+ &port_attr_cap_mask.attr,
+ &port_attr_rate.attr,
+ &port_attr_phys_state.attr,
+ &port_attr_link_layer.attr,
+ NULL
+};
+
+static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr,
+ char *buf)
+{
+ struct port_table_attribute *tab_attr =
+ container_of(attr, struct port_table_attribute, attr);
+ union ib_gid gid;
+ ssize_t ret;
+ u16 *raw;
+
+ ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid);
+ if (ret)
+ return ret;
+
+ raw = (u16 *)gid.raw;
+ return sprintf(buf, "%.4x:%.4x:%.4x:%.4x:%.4x:%.4x:%.4x:%.4x\n",
+ htons(raw[0]), htons(raw[1]), htons(raw[2]), htons(raw[3]),
+ htons(raw[4]), htons(raw[5]), htons(raw[6]), htons(raw[7]));
+}
+
+static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr,
+ char *buf)
+{
+ struct port_table_attribute *tab_attr =
+ container_of(attr, struct port_table_attribute, attr);
+ u16 pkey;
+ ssize_t ret;
+
+ ret = ib_query_pkey(p->ibdev, p->port_num, tab_attr->index, &pkey);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "0x%04x\n", pkey);
+}
+
+#define PORT_PMA_ATTR(_name, _counter, _width, _offset) \
+struct port_table_attribute port_pma_attr_##_name = { \
+ .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \
+ .index = (_offset) | ((_width) << 16) | ((_counter) << 24) \
+}
+
+static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr,
+ char *buf)
+{
+ struct port_table_attribute *tab_attr =
+ container_of(attr, struct port_table_attribute, attr);
+ int offset = tab_attr->index & 0xffff;
+ int width = (tab_attr->index >> 16) & 0xff;
+ struct ib_mad *in_mad = NULL;
+ struct ib_mad *out_mad = NULL;
+ ssize_t ret;
+
+ if (!p->ibdev->process_mad)
+ return sprintf(buf, "N/A (no PMA)\n");
+
+ in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ in_mad->mad_hdr.base_version = 1;
+ in_mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_PERF_MGMT;
+ in_mad->mad_hdr.class_version = 1;
+ in_mad->mad_hdr.method = IB_MGMT_METHOD_GET;
+ in_mad->mad_hdr.attr_id = cpu_to_be16(0x12); /* PortCounters */
+
+ in_mad->data[41] = p->port_num; /* PortSelect field */
+
+ if ((p->ibdev->process_mad(p->ibdev, IB_MAD_IGNORE_MKEY,
+ p->port_num, NULL, NULL, in_mad, out_mad) &
+ (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) !=
+ (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ switch (width) {
+ case 4:
+ ret = sprintf(buf, "%u\n", (out_mad->data[40 + offset / 8] >>
+ (4 - (offset % 8))) & 0xf);
+ break;
+ case 8:
+ ret = sprintf(buf, "%u\n", out_mad->data[40 + offset / 8]);
+ break;
+ case 16:
+ ret = sprintf(buf, "%u\n",
+ be16_to_cpup((__be16 *)(out_mad->data + 40 + offset / 8)));
+ break;
+ case 32:
+ ret = sprintf(buf, "%u\n",
+ be32_to_cpup((__be32 *)(out_mad->data + 40 + offset / 8)));
+ break;
+ default:
+ ret = 0;
+ }
+
+out:
+ kfree(in_mad);
+ kfree(out_mad);
+
+ return ret;
+}
+
+static PORT_PMA_ATTR(symbol_error , 0, 16, 32);
+static PORT_PMA_ATTR(link_error_recovery , 1, 8, 48);
+static PORT_PMA_ATTR(link_downed , 2, 8, 56);
+static PORT_PMA_ATTR(port_rcv_errors , 3, 16, 64);
+static PORT_PMA_ATTR(port_rcv_remote_physical_errors, 4, 16, 80);
+static PORT_PMA_ATTR(port_rcv_switch_relay_errors , 5, 16, 96);
+static PORT_PMA_ATTR(port_xmit_discards , 6, 16, 112);
+static PORT_PMA_ATTR(port_xmit_constraint_errors , 7, 8, 128);
+static PORT_PMA_ATTR(port_rcv_constraint_errors , 8, 8, 136);
+static PORT_PMA_ATTR(local_link_integrity_errors , 9, 4, 152);
+static PORT_PMA_ATTR(excessive_buffer_overrun_errors, 10, 4, 156);
+static PORT_PMA_ATTR(VL15_dropped , 11, 16, 176);
+static PORT_PMA_ATTR(port_xmit_data , 12, 32, 192);
+static PORT_PMA_ATTR(port_rcv_data , 13, 32, 224);
+static PORT_PMA_ATTR(port_xmit_packets , 14, 32, 256);
+static PORT_PMA_ATTR(port_rcv_packets , 15, 32, 288);
+/*
+ * There is no bit allocated for port_xmit_wait in the CounterSelect field
+ * (IB spec). However, since this bit is ignored when reading
+ * (show_pma_counter), the _counter field of port_xmit_wait can be set to zero.
+ */
+static PORT_PMA_ATTR(port_xmit_wait , 0, 32, 320);
+
+static struct attribute *pma_attrs[] = {
+ &port_pma_attr_symbol_error.attr.attr,
+ &port_pma_attr_link_error_recovery.attr.attr,
+ &port_pma_attr_link_downed.attr.attr,
+ &port_pma_attr_port_rcv_errors.attr.attr,
+ &port_pma_attr_port_rcv_remote_physical_errors.attr.attr,
+ &port_pma_attr_port_rcv_switch_relay_errors.attr.attr,
+ &port_pma_attr_port_xmit_discards.attr.attr,
+ &port_pma_attr_port_xmit_constraint_errors.attr.attr,
+ &port_pma_attr_port_rcv_constraint_errors.attr.attr,
+ &port_pma_attr_local_link_integrity_errors.attr.attr,
+ &port_pma_attr_excessive_buffer_overrun_errors.attr.attr,
+ &port_pma_attr_VL15_dropped.attr.attr,
+ &port_pma_attr_port_xmit_data.attr.attr,
+ &port_pma_attr_port_rcv_data.attr.attr,
+ &port_pma_attr_port_xmit_packets.attr.attr,
+ &port_pma_attr_port_rcv_packets.attr.attr,
+ &port_pma_attr_port_xmit_wait.attr.attr,
+ NULL
+};
+
+static struct attribute_group pma_group = {
+ .name = "counters",
+ .attrs = pma_attrs
+};
+
+static void ib_port_release(struct kobject *kobj)
+{
+ struct ib_port *p = container_of(kobj, struct ib_port, kobj);
+ struct attribute *a;
+ int i;
+
+ for (i = 0; (a = p->gid_group.attrs[i]); ++i)
+ kfree(a);
+
+ kfree(p->gid_group.attrs);
+
+ for (i = 0; (a = p->pkey_group.attrs[i]); ++i)
+ kfree(a);
+
+ kfree(p->pkey_group.attrs);
+
+ kfree(p);
+}
+
+static struct kobj_type port_type = {
+ .release = ib_port_release,
+ .sysfs_ops = &port_sysfs_ops,
+ .default_attrs = port_default_attrs
+};
+
+static void ib_device_release(struct device *device)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+ kfree(dev);
+}
+
+#ifdef __linux__
+/* BSD supports this through devfs(5) and devd(8). */
+static int ib_device_uevent(struct device *device,
+ struct kobj_uevent_env *env)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+ if (add_uevent_var(env, "NAME=%s", dev->name))
+ return -ENOMEM;
+
+ /*
+ * It would be nice to pass the node GUID with the event...
+ */
+
+ return 0;
+}
+#endif
+
+static struct attribute **
+alloc_group_attrs(ssize_t (*show)(struct ib_port *,
+ struct port_attribute *, char *buf),
+ int len)
+{
+ struct attribute **tab_attr;
+ struct port_table_attribute *element;
+ int i;
+
+ tab_attr = kcalloc(1 + len, sizeof(struct attribute *), GFP_KERNEL);
+ if (!tab_attr)
+ return NULL;
+
+ for (i = 0; i < len; i++) {
+ element = kzalloc(sizeof(struct port_table_attribute),
+ GFP_KERNEL);
+ if (!element)
+ goto err;
+
+ if (snprintf(element->name, sizeof(element->name),
+ "%d", i) >= sizeof(element->name)) {
+ kfree(element);
+ goto err;
+ }
+
+ element->attr.attr.name = element->name;
+ element->attr.attr.mode = S_IRUGO;
+ element->attr.show = show;
+ element->index = i;
+
+ tab_attr[i] = &element->attr.attr;
+ }
+
+ return tab_attr;
+
+err:
+ while (--i >= 0)
+ kfree(tab_attr[i]);
+ kfree(tab_attr);
+ return NULL;
+}
+
+static int add_port(struct ib_device *device, int port_num)
+{
+ struct ib_port *p;
+ struct ib_port_attr attr;
+ int i;
+ int ret;
+
+ ret = ib_query_port(device, port_num, &attr);
+ if (ret)
+ return ret;
+
+ p = kzalloc(sizeof *p, GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ p->ibdev = device;
+ p->port_num = port_num;
+
+ ret = kobject_init_and_add(&p->kobj, &port_type,
+ device->ports_parent,
+ "%d", port_num);
+ if (ret)
+ goto err_put;
+
+ ret = sysfs_create_group(&p->kobj, &pma_group);
+ if (ret)
+ goto err_put;
+
+ p->gid_group.name = "gids";
+ p->gid_group.attrs = alloc_group_attrs(show_port_gid, attr.gid_tbl_len);
+ if (!p->gid_group.attrs)
+ goto err_remove_pma;
+
+ ret = sysfs_create_group(&p->kobj, &p->gid_group);
+ if (ret)
+ goto err_free_gid;
+
+ p->pkey_group.name = "pkeys";
+ p->pkey_group.attrs = alloc_group_attrs(show_port_pkey,
+ attr.pkey_tbl_len);
+ if (!p->pkey_group.attrs)
+ goto err_remove_gid;
+
+ ret = sysfs_create_group(&p->kobj, &p->pkey_group);
+ if (ret)
+ goto err_free_pkey;
+
+ list_add_tail(&p->kobj.entry, &device->port_list);
+
+#ifdef __linux__
+ kobject_uevent(&p->kobj, KOBJ_ADD);
+#endif
+ return 0;
+
+err_free_pkey:
+ for (i = 0; i < attr.pkey_tbl_len; ++i)
+ kfree(p->pkey_group.attrs[i]);
+
+ kfree(p->pkey_group.attrs);
+
+err_remove_gid:
+ sysfs_remove_group(&p->kobj, &p->gid_group);
+
+err_free_gid:
+ for (i = 0; i < attr.gid_tbl_len; ++i)
+ kfree(p->gid_group.attrs[i]);
+
+ kfree(p->gid_group.attrs);
+
+err_remove_pma:
+ sysfs_remove_group(&p->kobj, &pma_group);
+
+err_put:
+ kobject_put(device->ports_parent);
+ kfree(p);
+ return ret;
+}
+
+static ssize_t show_node_type(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+ switch (dev->node_type) {
+ case RDMA_NODE_IB_CA: return sprintf(buf, "%d: CA\n", dev->node_type);
+ case RDMA_NODE_RNIC: return sprintf(buf, "%d: RNIC\n", dev->node_type);
+ case RDMA_NODE_IB_SWITCH: return sprintf(buf, "%d: switch\n", dev->node_type);
+ case RDMA_NODE_IB_ROUTER: return sprintf(buf, "%d: router\n", dev->node_type);
+ default: return sprintf(buf, "%d: <unknown>\n", dev->node_type);
+ }
+}
+
+static ssize_t show_sys_image_guid(struct device *device,
+ struct device_attribute *dev_attr, char *buf)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+ struct ib_device_attr attr;
+ ssize_t ret;
+
+ ret = ib_query_device(dev, &attr);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "%04x:%04x:%04x:%04x\n",
+ be16_to_cpu(((__be16 *) &attr.sys_image_guid)[0]),
+ be16_to_cpu(((__be16 *) &attr.sys_image_guid)[1]),
+ be16_to_cpu(((__be16 *) &attr.sys_image_guid)[2]),
+ be16_to_cpu(((__be16 *) &attr.sys_image_guid)[3]));
+}
+
+static ssize_t show_node_guid(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+ return sprintf(buf, "%04x:%04x:%04x:%04x\n",
+ be16_to_cpu(((__be16 *) &dev->node_guid)[0]),
+ be16_to_cpu(((__be16 *) &dev->node_guid)[1]),
+ be16_to_cpu(((__be16 *) &dev->node_guid)[2]),
+ be16_to_cpu(((__be16 *) &dev->node_guid)[3]));
+}
+
+static ssize_t show_node_desc(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+ return sprintf(buf, "%.64s\n", dev->node_desc);
+}
+
+static ssize_t set_node_desc(struct device *device,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+ struct ib_device_modify desc = {};
+ int ret;
+
+ if (!dev->modify_device)
+ return -EIO;
+
+ memcpy(desc.node_desc, buf, min_t(int, count, 64));
+ ret = ib_modify_device(dev, IB_DEVICE_MODIFY_NODE_DESC, &desc);
+ if (ret)
+ return ret;
+
+ return count;
+}
+
+static DEVICE_ATTR(node_type, S_IRUGO, show_node_type, NULL);
+static DEVICE_ATTR(sys_image_guid, S_IRUGO, show_sys_image_guid, NULL);
+static DEVICE_ATTR(node_guid, S_IRUGO, show_node_guid, NULL);
+static DEVICE_ATTR(node_desc, S_IRUGO | S_IWUSR, show_node_desc, set_node_desc);
+
+static struct device_attribute *ib_class_attributes[] = {
+ &dev_attr_node_type,
+ &dev_attr_sys_image_guid,
+ &dev_attr_node_guid,
+ &dev_attr_node_desc
+};
+
+static struct class ib_class = {
+ .name = "infiniband",
+ .dev_release = ib_device_release,
+#ifdef __linux__
+ .dev_uevent = ib_device_uevent,
+#endif
+};
+
+/* Show a given an attribute in the statistics group */
+static ssize_t show_protocol_stat(const struct device *device,
+ struct device_attribute *attr, char *buf,
+ unsigned offset)
+{
+ struct ib_device *dev = container_of(__DECONST(struct device *, device), struct ib_device, dev);
+ union rdma_protocol_stats stats;
+ ssize_t ret;
+
+ ret = dev->get_protocol_stats(dev, &stats);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "%llu\n",
+ (unsigned long long) ((u64 *) &stats)[offset]);
+}
+
+/* generate a read-only iwarp statistics attribute */
+#define IW_STATS_ENTRY(name) \
+static ssize_t show_##name(struct device *device, \
+ struct device_attribute *attr, char *buf) \
+{ \
+ return show_protocol_stat(device, attr, buf, \
+ offsetof(struct iw_protocol_stats, name) / \
+ sizeof (u64)); \
+} \
+static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
+
+IW_STATS_ENTRY(ipInReceives);
+IW_STATS_ENTRY(ipInHdrErrors);
+IW_STATS_ENTRY(ipInTooBigErrors);
+IW_STATS_ENTRY(ipInNoRoutes);
+IW_STATS_ENTRY(ipInAddrErrors);
+IW_STATS_ENTRY(ipInUnknownProtos);
+IW_STATS_ENTRY(ipInTruncatedPkts);
+IW_STATS_ENTRY(ipInDiscards);
+IW_STATS_ENTRY(ipInDelivers);
+IW_STATS_ENTRY(ipOutForwDatagrams);
+IW_STATS_ENTRY(ipOutRequests);
+IW_STATS_ENTRY(ipOutDiscards);
+IW_STATS_ENTRY(ipOutNoRoutes);
+IW_STATS_ENTRY(ipReasmTimeout);
+IW_STATS_ENTRY(ipReasmReqds);
+IW_STATS_ENTRY(ipReasmOKs);
+IW_STATS_ENTRY(ipReasmFails);
+IW_STATS_ENTRY(ipFragOKs);
+IW_STATS_ENTRY(ipFragFails);
+IW_STATS_ENTRY(ipFragCreates);
+IW_STATS_ENTRY(ipInMcastPkts);
+IW_STATS_ENTRY(ipOutMcastPkts);
+IW_STATS_ENTRY(ipInBcastPkts);
+IW_STATS_ENTRY(ipOutBcastPkts);
+IW_STATS_ENTRY(tcpRtoAlgorithm);
+IW_STATS_ENTRY(tcpRtoMin);
+IW_STATS_ENTRY(tcpRtoMax);
+IW_STATS_ENTRY(tcpMaxConn);
+IW_STATS_ENTRY(tcpActiveOpens);
+IW_STATS_ENTRY(tcpPassiveOpens);
+IW_STATS_ENTRY(tcpAttemptFails);
+IW_STATS_ENTRY(tcpEstabResets);
+IW_STATS_ENTRY(tcpCurrEstab);
+IW_STATS_ENTRY(tcpInSegs);
+IW_STATS_ENTRY(tcpOutSegs);
+IW_STATS_ENTRY(tcpRetransSegs);
+IW_STATS_ENTRY(tcpInErrs);
+IW_STATS_ENTRY(tcpOutRsts);
+
+static struct attribute *iw_proto_stats_attrs[] = {
+ &dev_attr_ipInReceives.attr,
+ &dev_attr_ipInHdrErrors.attr,
+ &dev_attr_ipInTooBigErrors.attr,
+ &dev_attr_ipInNoRoutes.attr,
+ &dev_attr_ipInAddrErrors.attr,
+ &dev_attr_ipInUnknownProtos.attr,
+ &dev_attr_ipInTruncatedPkts.attr,
+ &dev_attr_ipInDiscards.attr,
+ &dev_attr_ipInDelivers.attr,
+ &dev_attr_ipOutForwDatagrams.attr,
+ &dev_attr_ipOutRequests.attr,
+ &dev_attr_ipOutDiscards.attr,
+ &dev_attr_ipOutNoRoutes.attr,
+ &dev_attr_ipReasmTimeout.attr,
+ &dev_attr_ipReasmReqds.attr,
+ &dev_attr_ipReasmOKs.attr,
+ &dev_attr_ipReasmFails.attr,
+ &dev_attr_ipFragOKs.attr,
+ &dev_attr_ipFragFails.attr,
+ &dev_attr_ipFragCreates.attr,
+ &dev_attr_ipInMcastPkts.attr,
+ &dev_attr_ipOutMcastPkts.attr,
+ &dev_attr_ipInBcastPkts.attr,
+ &dev_attr_ipOutBcastPkts.attr,
+ &dev_attr_tcpRtoAlgorithm.attr,
+ &dev_attr_tcpRtoMin.attr,
+ &dev_attr_tcpRtoMax.attr,
+ &dev_attr_tcpMaxConn.attr,
+ &dev_attr_tcpActiveOpens.attr,
+ &dev_attr_tcpPassiveOpens.attr,
+ &dev_attr_tcpAttemptFails.attr,
+ &dev_attr_tcpEstabResets.attr,
+ &dev_attr_tcpCurrEstab.attr,
+ &dev_attr_tcpInSegs.attr,
+ &dev_attr_tcpOutSegs.attr,
+ &dev_attr_tcpRetransSegs.attr,
+ &dev_attr_tcpInErrs.attr,
+ &dev_attr_tcpOutRsts.attr,
+ NULL
+};
+
+static struct attribute_group iw_stats_group = {
+ .name = "proto_stats",
+ .attrs = iw_proto_stats_attrs,
+};
+
+int ib_device_register_sysfs(struct ib_device *device)
+{
+ struct device *class_dev = &device->dev;
+ int ret;
+ int i;
+
+ class_dev->class = &ib_class;
+ class_dev->driver_data = device;
+ class_dev->parent = device->dma_device;
+ dev_set_name(class_dev, device->name);
+
+ INIT_LIST_HEAD(&device->port_list);
+
+ ret = device_register(class_dev);
+ if (ret)
+ goto err;
+
+ for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) {
+ ret = device_create_file(class_dev, ib_class_attributes[i]);
+ if (ret)
+ goto err_unregister;
+ }
+
+ device->ports_parent = kobject_create_and_add("ports",
+ &class_dev->kobj);
+ if (!device->ports_parent) {
+ ret = -ENOMEM;
+ goto err_put;
+ }
+
+ if (device->node_type == RDMA_NODE_IB_SWITCH) {
+ ret = add_port(device, 0);
+ if (ret)
+ goto err_put;
+ } else {
+ for (i = 1; i <= device->phys_port_cnt; ++i) {
+ ret = add_port(device, i);
+ if (ret)
+ goto err_put;
+ }
+ }
+
+ if (device->node_type == RDMA_NODE_RNIC && device->get_protocol_stats) {
+ ret = sysfs_create_group(&class_dev->kobj, &iw_stats_group);
+ if (ret)
+ goto err_put;
+ }
+
+ return 0;
+
+err_put:
+ {
+ struct kobject *p, *t;
+ struct ib_port *port;
+
+ list_for_each_entry_safe(p, t, &device->port_list, entry) {
+ list_del(&p->entry);
+ port = container_of(p, struct ib_port, kobj);
+ sysfs_remove_group(p, &pma_group);
+ sysfs_remove_group(p, &port->pkey_group);
+ sysfs_remove_group(p, &port->gid_group);
+ kobject_put(p);
+ }
+ }
+
+ kobject_put(&class_dev->kobj);
+
+err_unregister:
+ device_unregister(class_dev);
+
+err:
+ return ret;
+}
+
+void ib_device_unregister_sysfs(struct ib_device *device)
+{
+ struct kobject *p, *t;
+ struct ib_port *port;
+
+ /* Hold kobject until ib_dealloc_device() */
+ kobject_get(&device->dev.kobj);
+
+ list_for_each_entry_safe(p, t, &device->port_list, entry) {
+ list_del(&p->entry);
+ port = container_of(p, struct ib_port, kobj);
+ sysfs_remove_group(p, &pma_group);
+ sysfs_remove_group(p, &port->pkey_group);
+ sysfs_remove_group(p, &port->gid_group);
+ kobject_put(p);
+ }
+
+ kobject_put(device->ports_parent);
+ device_unregister(&device->dev);
+}
+
+int ib_sysfs_setup(void)
+{
+ return class_register(&ib_class);
+}
+
+void ib_sysfs_cleanup(void)
+{
+ class_unregister(&ib_class);
+}
+
+int ib_sysfs_create_port_files(struct ib_device *device,
+ int (*create)(struct ib_device *dev, u8 port_num,
+ struct kobject *kobj))
+{
+ struct kobject *p;
+ struct ib_port *port;
+ int ret = 0;
+
+ list_for_each_entry(p, &device->port_list, entry) {
+ port = container_of(p, struct ib_port, kobj);
+ ret = create(device, port->port_num, &port->kobj);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_sysfs_create_port_files);
diff --git a/sys/ofed/drivers/infiniband/core/ucm.c b/sys/ofed/drivers/infiniband/core/ucm.c
new file mode 100644
index 0000000..90e0b31
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/ucm.c
@@ -0,0 +1,1348 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/poll.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/cdev.h>
+#include <linux/idr.h>
+#include <linux/mutex.h>
+
+#include <asm/uaccess.h>
+
+#include <rdma/ib_cm.h>
+#include <rdma/ib_user_cm.h>
+#include <rdma/ib_marshall.h>
+
+MODULE_AUTHOR("Libor Michalek");
+MODULE_DESCRIPTION("InfiniBand userspace Connection Manager access");
+MODULE_LICENSE("Dual BSD/GPL");
+
+struct ib_ucm_device {
+ int devnum;
+ struct cdev cdev;
+ struct device dev;
+ struct ib_device *ib_dev;
+};
+
+struct ib_ucm_file {
+ struct mutex file_mutex;
+ struct file *filp;
+ struct ib_ucm_device *device;
+
+ struct list_head ctxs;
+ struct list_head events;
+ wait_queue_head_t poll_wait;
+};
+
+struct ib_ucm_context {
+ int id;
+ struct completion comp;
+ atomic_t ref;
+ int events_reported;
+
+ struct ib_ucm_file *file;
+ struct ib_cm_id *cm_id;
+ __u64 uid;
+
+ struct list_head events; /* list of pending events. */
+ struct list_head file_list; /* member in file ctx list */
+};
+
+struct ib_ucm_event {
+ struct ib_ucm_context *ctx;
+ struct list_head file_list; /* member in file event list */
+ struct list_head ctx_list; /* member in ctx event list */
+
+ struct ib_cm_id *cm_id;
+ struct ib_ucm_event_resp resp;
+ void *data;
+ void *info;
+ int data_len;
+ int info_len;
+};
+
+enum {
+ IB_UCM_MAJOR = 231,
+ IB_UCM_BASE_MINOR = 224,
+ IB_UCM_MAX_DEVICES = 32
+};
+
+/* ib_cm and ib_user_cm modules share /sys/class/infiniband_cm */
+extern struct class cm_class;
+
+#define IB_UCM_BASE_DEV MKDEV(IB_UCM_MAJOR, IB_UCM_BASE_MINOR)
+
+static void ib_ucm_add_one(struct ib_device *device);
+static void ib_ucm_remove_one(struct ib_device *device);
+
+static struct ib_client ucm_client = {
+ .name = "ucm",
+ .add = ib_ucm_add_one,
+ .remove = ib_ucm_remove_one
+};
+
+static DEFINE_MUTEX(ctx_id_mutex);
+static DEFINE_IDR(ctx_id_table);
+static DECLARE_BITMAP(dev_map, IB_UCM_MAX_DEVICES);
+
+static struct ib_ucm_context *ib_ucm_ctx_get(struct ib_ucm_file *file, int id)
+{
+ struct ib_ucm_context *ctx;
+
+ mutex_lock(&ctx_id_mutex);
+ ctx = idr_find(&ctx_id_table, id);
+ if (!ctx)
+ ctx = ERR_PTR(-ENOENT);
+ else if (ctx->file != file)
+ ctx = ERR_PTR(-EINVAL);
+ else
+ atomic_inc(&ctx->ref);
+ mutex_unlock(&ctx_id_mutex);
+
+ return ctx;
+}
+
+static void ib_ucm_ctx_put(struct ib_ucm_context *ctx)
+{
+ if (atomic_dec_and_test(&ctx->ref))
+ complete(&ctx->comp);
+}
+
+static inline int ib_ucm_new_cm_id(int event)
+{
+ return event == IB_CM_REQ_RECEIVED || event == IB_CM_SIDR_REQ_RECEIVED;
+}
+
+static void ib_ucm_cleanup_events(struct ib_ucm_context *ctx)
+{
+ struct ib_ucm_event *uevent;
+
+ mutex_lock(&ctx->file->file_mutex);
+ list_del(&ctx->file_list);
+ while (!list_empty(&ctx->events)) {
+
+ uevent = list_entry(ctx->events.next,
+ struct ib_ucm_event, ctx_list);
+ list_del(&uevent->file_list);
+ list_del(&uevent->ctx_list);
+ mutex_unlock(&ctx->file->file_mutex);
+
+ /* clear incoming connections. */
+ if (ib_ucm_new_cm_id(uevent->resp.event))
+ ib_destroy_cm_id(uevent->cm_id);
+
+ kfree(uevent);
+ mutex_lock(&ctx->file->file_mutex);
+ }
+ mutex_unlock(&ctx->file->file_mutex);
+}
+
+static struct ib_ucm_context *ib_ucm_ctx_alloc(struct ib_ucm_file *file)
+{
+ struct ib_ucm_context *ctx;
+ int result;
+
+ ctx = kzalloc(sizeof *ctx, GFP_KERNEL);
+ if (!ctx)
+ return NULL;
+
+ atomic_set(&ctx->ref, 1);
+ init_completion(&ctx->comp);
+ ctx->file = file;
+ INIT_LIST_HEAD(&ctx->events);
+
+ do {
+ result = idr_pre_get(&ctx_id_table, GFP_KERNEL);
+ if (!result)
+ goto error;
+
+ mutex_lock(&ctx_id_mutex);
+ result = idr_get_new(&ctx_id_table, ctx, &ctx->id);
+ mutex_unlock(&ctx_id_mutex);
+ } while (result == -EAGAIN);
+
+ if (result)
+ goto error;
+
+ list_add_tail(&ctx->file_list, &file->ctxs);
+ return ctx;
+
+error:
+ kfree(ctx);
+ return NULL;
+}
+
+static void ib_ucm_event_req_get(struct ib_ucm_req_event_resp *ureq,
+ struct ib_cm_req_event_param *kreq)
+{
+ ureq->remote_ca_guid = kreq->remote_ca_guid;
+ ureq->remote_qkey = kreq->remote_qkey;
+ ureq->remote_qpn = kreq->remote_qpn;
+ ureq->qp_type = kreq->qp_type;
+ ureq->starting_psn = kreq->starting_psn;
+ ureq->responder_resources = kreq->responder_resources;
+ ureq->initiator_depth = kreq->initiator_depth;
+ ureq->local_cm_response_timeout = kreq->local_cm_response_timeout;
+ ureq->flow_control = kreq->flow_control;
+ ureq->remote_cm_response_timeout = kreq->remote_cm_response_timeout;
+ ureq->retry_count = kreq->retry_count;
+ ureq->rnr_retry_count = kreq->rnr_retry_count;
+ ureq->srq = kreq->srq;
+ ureq->port = kreq->port;
+
+ ib_copy_path_rec_to_user(&ureq->primary_path, kreq->primary_path);
+ if (kreq->alternate_path)
+ ib_copy_path_rec_to_user(&ureq->alternate_path,
+ kreq->alternate_path);
+}
+
+static void ib_ucm_event_rep_get(struct ib_ucm_rep_event_resp *urep,
+ struct ib_cm_rep_event_param *krep)
+{
+ urep->remote_ca_guid = krep->remote_ca_guid;
+ urep->remote_qkey = krep->remote_qkey;
+ urep->remote_qpn = krep->remote_qpn;
+ urep->starting_psn = krep->starting_psn;
+ urep->responder_resources = krep->responder_resources;
+ urep->initiator_depth = krep->initiator_depth;
+ urep->target_ack_delay = krep->target_ack_delay;
+ urep->failover_accepted = krep->failover_accepted;
+ urep->flow_control = krep->flow_control;
+ urep->rnr_retry_count = krep->rnr_retry_count;
+ urep->srq = krep->srq;
+}
+
+static void ib_ucm_event_sidr_rep_get(struct ib_ucm_sidr_rep_event_resp *urep,
+ struct ib_cm_sidr_rep_event_param *krep)
+{
+ urep->status = krep->status;
+ urep->qkey = krep->qkey;
+ urep->qpn = krep->qpn;
+};
+
+static int ib_ucm_event_process(struct ib_cm_event *evt,
+ struct ib_ucm_event *uvt)
+{
+ void *info = NULL;
+
+ switch (evt->event) {
+ case IB_CM_REQ_RECEIVED:
+ ib_ucm_event_req_get(&uvt->resp.u.req_resp,
+ &evt->param.req_rcvd);
+ uvt->data_len = IB_CM_REQ_PRIVATE_DATA_SIZE;
+ uvt->resp.present = IB_UCM_PRES_PRIMARY;
+ uvt->resp.present |= (evt->param.req_rcvd.alternate_path ?
+ IB_UCM_PRES_ALTERNATE : 0);
+ break;
+ case IB_CM_REP_RECEIVED:
+ ib_ucm_event_rep_get(&uvt->resp.u.rep_resp,
+ &evt->param.rep_rcvd);
+ uvt->data_len = IB_CM_REP_PRIVATE_DATA_SIZE;
+ break;
+ case IB_CM_RTU_RECEIVED:
+ uvt->data_len = IB_CM_RTU_PRIVATE_DATA_SIZE;
+ uvt->resp.u.send_status = evt->param.send_status;
+ break;
+ case IB_CM_DREQ_RECEIVED:
+ uvt->data_len = IB_CM_DREQ_PRIVATE_DATA_SIZE;
+ uvt->resp.u.send_status = evt->param.send_status;
+ break;
+ case IB_CM_DREP_RECEIVED:
+ uvt->data_len = IB_CM_DREP_PRIVATE_DATA_SIZE;
+ uvt->resp.u.send_status = evt->param.send_status;
+ break;
+ case IB_CM_MRA_RECEIVED:
+ uvt->resp.u.mra_resp.timeout =
+ evt->param.mra_rcvd.service_timeout;
+ uvt->data_len = IB_CM_MRA_PRIVATE_DATA_SIZE;
+ break;
+ case IB_CM_REJ_RECEIVED:
+ uvt->resp.u.rej_resp.reason = evt->param.rej_rcvd.reason;
+ uvt->data_len = IB_CM_REJ_PRIVATE_DATA_SIZE;
+ uvt->info_len = evt->param.rej_rcvd.ari_length;
+ info = evt->param.rej_rcvd.ari;
+ break;
+ case IB_CM_LAP_RECEIVED:
+ ib_copy_path_rec_to_user(&uvt->resp.u.lap_resp.path,
+ evt->param.lap_rcvd.alternate_path);
+ uvt->data_len = IB_CM_LAP_PRIVATE_DATA_SIZE;
+ uvt->resp.present = IB_UCM_PRES_ALTERNATE;
+ break;
+ case IB_CM_APR_RECEIVED:
+ uvt->resp.u.apr_resp.status = evt->param.apr_rcvd.ap_status;
+ uvt->data_len = IB_CM_APR_PRIVATE_DATA_SIZE;
+ uvt->info_len = evt->param.apr_rcvd.info_len;
+ info = evt->param.apr_rcvd.apr_info;
+ break;
+ case IB_CM_SIDR_REQ_RECEIVED:
+ uvt->resp.u.sidr_req_resp.pkey =
+ evt->param.sidr_req_rcvd.pkey;
+ uvt->resp.u.sidr_req_resp.port =
+ evt->param.sidr_req_rcvd.port;
+ uvt->data_len = IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE;
+ break;
+ case IB_CM_SIDR_REP_RECEIVED:
+ ib_ucm_event_sidr_rep_get(&uvt->resp.u.sidr_rep_resp,
+ &evt->param.sidr_rep_rcvd);
+ uvt->data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE;
+ uvt->info_len = evt->param.sidr_rep_rcvd.info_len;
+ info = evt->param.sidr_rep_rcvd.info;
+ break;
+ default:
+ uvt->resp.u.send_status = evt->param.send_status;
+ break;
+ }
+
+ if (uvt->data_len) {
+ uvt->data = kmemdup(evt->private_data, uvt->data_len, GFP_KERNEL);
+ if (!uvt->data)
+ goto err1;
+
+ uvt->resp.present |= IB_UCM_PRES_DATA;
+ }
+
+ if (uvt->info_len) {
+ uvt->info = kmemdup(info, uvt->info_len, GFP_KERNEL);
+ if (!uvt->info)
+ goto err2;
+
+ uvt->resp.present |= IB_UCM_PRES_INFO;
+ }
+ return 0;
+
+err2:
+ kfree(uvt->data);
+err1:
+ return -ENOMEM;
+}
+
+static int ib_ucm_event_handler(struct ib_cm_id *cm_id,
+ struct ib_cm_event *event)
+{
+ struct ib_ucm_event *uevent;
+ struct ib_ucm_context *ctx;
+ int result = 0;
+
+ ctx = cm_id->context;
+
+ uevent = kzalloc(sizeof *uevent, GFP_KERNEL);
+ if (!uevent)
+ goto err1;
+
+ uevent->ctx = ctx;
+ uevent->cm_id = cm_id;
+ uevent->resp.uid = ctx->uid;
+ uevent->resp.id = ctx->id;
+ uevent->resp.event = event->event;
+
+ result = ib_ucm_event_process(event, uevent);
+ if (result)
+ goto err2;
+
+ mutex_lock(&ctx->file->file_mutex);
+ list_add_tail(&uevent->file_list, &ctx->file->events);
+ list_add_tail(&uevent->ctx_list, &ctx->events);
+ wake_up_interruptible(&ctx->file->poll_wait);
+ if (ctx->file->filp)
+ selwakeup(&ctx->file->filp->f_selinfo);
+ mutex_unlock(&ctx->file->file_mutex);
+ return 0;
+
+err2:
+ kfree(uevent);
+err1:
+ /* Destroy new cm_id's */
+ return ib_ucm_new_cm_id(event->event);
+}
+
+static ssize_t ib_ucm_event(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ib_ucm_context *ctx;
+ struct ib_ucm_event_get cmd;
+ struct ib_ucm_event *uevent;
+ int result = 0;
+ DEFINE_WAIT(wait);
+
+ if (out_len < sizeof(struct ib_ucm_event_resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ mutex_lock(&file->file_mutex);
+ while (list_empty(&file->events)) {
+ mutex_unlock(&file->file_mutex);
+
+ if (file->filp->f_flags & O_NONBLOCK)
+ return -EAGAIN;
+
+ if (wait_event_interruptible(file->poll_wait,
+ !list_empty(&file->events)))
+ return -ERESTARTSYS;
+
+ mutex_lock(&file->file_mutex);
+ }
+
+ uevent = list_entry(file->events.next, struct ib_ucm_event, file_list);
+
+ if (ib_ucm_new_cm_id(uevent->resp.event)) {
+ ctx = ib_ucm_ctx_alloc(file);
+ if (!ctx) {
+ result = -ENOMEM;
+ goto done;
+ }
+
+ ctx->cm_id = uevent->cm_id;
+ ctx->cm_id->context = ctx;
+ uevent->resp.id = ctx->id;
+ }
+
+ if (copy_to_user((void __user *)(unsigned long)cmd.response,
+ &uevent->resp, sizeof(uevent->resp))) {
+ result = -EFAULT;
+ goto done;
+ }
+
+ if (uevent->data) {
+ if (cmd.data_len < uevent->data_len) {
+ result = -ENOMEM;
+ goto done;
+ }
+ if (copy_to_user((void __user *)(unsigned long)cmd.data,
+ uevent->data, uevent->data_len)) {
+ result = -EFAULT;
+ goto done;
+ }
+ }
+
+ if (uevent->info) {
+ if (cmd.info_len < uevent->info_len) {
+ result = -ENOMEM;
+ goto done;
+ }
+ if (copy_to_user((void __user *)(unsigned long)cmd.info,
+ uevent->info, uevent->info_len)) {
+ result = -EFAULT;
+ goto done;
+ }
+ }
+
+ list_del(&uevent->file_list);
+ list_del(&uevent->ctx_list);
+ uevent->ctx->events_reported++;
+
+ kfree(uevent->data);
+ kfree(uevent->info);
+ kfree(uevent);
+done:
+ mutex_unlock(&file->file_mutex);
+ return result;
+}
+
+static ssize_t ib_ucm_create_id(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ib_ucm_create_id cmd;
+ struct ib_ucm_create_id_resp resp;
+ struct ib_ucm_context *ctx;
+ int result;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ mutex_lock(&file->file_mutex);
+ ctx = ib_ucm_ctx_alloc(file);
+ mutex_unlock(&file->file_mutex);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->uid = cmd.uid;
+ ctx->cm_id = ib_create_cm_id(file->device->ib_dev,
+ ib_ucm_event_handler, ctx);
+ if (IS_ERR(ctx->cm_id)) {
+ result = PTR_ERR(ctx->cm_id);
+ goto err1;
+ }
+
+ resp.id = ctx->id;
+ if (copy_to_user((void __user *)(unsigned long)cmd.response,
+ &resp, sizeof(resp))) {
+ result = -EFAULT;
+ goto err2;
+ }
+ return 0;
+
+err2:
+ ib_destroy_cm_id(ctx->cm_id);
+err1:
+ mutex_lock(&ctx_id_mutex);
+ idr_remove(&ctx_id_table, ctx->id);
+ mutex_unlock(&ctx_id_mutex);
+ kfree(ctx);
+ return result;
+}
+
+static ssize_t ib_ucm_destroy_id(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ib_ucm_destroy_id cmd;
+ struct ib_ucm_destroy_id_resp resp;
+ struct ib_ucm_context *ctx;
+ int result = 0;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ mutex_lock(&ctx_id_mutex);
+ ctx = idr_find(&ctx_id_table, cmd.id);
+ if (!ctx)
+ ctx = ERR_PTR(-ENOENT);
+ else if (ctx->file != file)
+ ctx = ERR_PTR(-EINVAL);
+ else
+ idr_remove(&ctx_id_table, ctx->id);
+ mutex_unlock(&ctx_id_mutex);
+
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ ib_ucm_ctx_put(ctx);
+ wait_for_completion(&ctx->comp);
+
+ /* No new events will be generated after destroying the cm_id. */
+ ib_destroy_cm_id(ctx->cm_id);
+ /* Cleanup events not yet reported to the user. */
+ ib_ucm_cleanup_events(ctx);
+
+ resp.events_reported = ctx->events_reported;
+ if (copy_to_user((void __user *)(unsigned long)cmd.response,
+ &resp, sizeof(resp)))
+ result = -EFAULT;
+
+ kfree(ctx);
+ return result;
+}
+
+static ssize_t ib_ucm_attr_id(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ib_ucm_attr_id_resp resp;
+ struct ib_ucm_attr_id cmd;
+ struct ib_ucm_context *ctx;
+ int result = 0;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ib_ucm_ctx_get(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ resp.service_id = ctx->cm_id->service_id;
+ resp.service_mask = ctx->cm_id->service_mask;
+ resp.local_id = ctx->cm_id->local_id;
+ resp.remote_id = ctx->cm_id->remote_id;
+
+ if (copy_to_user((void __user *)(unsigned long)cmd.response,
+ &resp, sizeof(resp)))
+ result = -EFAULT;
+
+ ib_ucm_ctx_put(ctx);
+ return result;
+}
+
+static ssize_t ib_ucm_init_qp_attr(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ib_uverbs_qp_attr resp;
+ struct ib_ucm_init_qp_attr cmd;
+ struct ib_ucm_context *ctx;
+ struct ib_qp_attr qp_attr;
+ int result = 0;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ib_ucm_ctx_get(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ resp.qp_attr_mask = 0;
+ memset(&qp_attr, 0, sizeof qp_attr);
+ qp_attr.qp_state = cmd.qp_state;
+ result = ib_cm_init_qp_attr(ctx->cm_id, &qp_attr, &resp.qp_attr_mask);
+ if (result)
+ goto out;
+
+ ib_copy_qp_attr_to_user(&resp, &qp_attr);
+
+ if (copy_to_user((void __user *)(unsigned long)cmd.response,
+ &resp, sizeof(resp)))
+ result = -EFAULT;
+
+out:
+ ib_ucm_ctx_put(ctx);
+ return result;
+}
+
+static int ucm_validate_listen(__be64 service_id, __be64 service_mask)
+{
+ service_id &= service_mask;
+
+ if (((service_id & IB_CMA_SERVICE_ID_MASK) == IB_CMA_SERVICE_ID) ||
+ ((service_id & IB_SDP_SERVICE_ID_MASK) == IB_SDP_SERVICE_ID))
+ return -EINVAL;
+
+ return 0;
+}
+
+static ssize_t ib_ucm_listen(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ib_ucm_listen cmd;
+ struct ib_ucm_context *ctx;
+ int result;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ib_ucm_ctx_get(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ result = ucm_validate_listen(cmd.service_id, cmd.service_mask);
+ if (result)
+ goto out;
+
+ result = ib_cm_listen(ctx->cm_id, cmd.service_id, cmd.service_mask,
+ NULL);
+out:
+ ib_ucm_ctx_put(ctx);
+ return result;
+}
+
+static ssize_t ib_ucm_notify(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ib_ucm_notify cmd;
+ struct ib_ucm_context *ctx;
+ int result;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ib_ucm_ctx_get(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ result = ib_cm_notify(ctx->cm_id, (enum ib_event_type) cmd.event);
+ ib_ucm_ctx_put(ctx);
+ return result;
+}
+
+static int ib_ucm_alloc_data(const void **dest, u64 src, u32 len)
+{
+ void *data;
+
+ *dest = NULL;
+
+ if (!len)
+ return 0;
+
+ data = kmalloc(len, GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ if (copy_from_user(data, (void __user *)(unsigned long)src, len)) {
+ kfree(data);
+ return -EFAULT;
+ }
+
+ *dest = data;
+ return 0;
+}
+
+static int ib_ucm_path_get(struct ib_sa_path_rec **path, u64 src)
+{
+ struct ib_user_path_rec upath;
+ struct ib_sa_path_rec *sa_path;
+
+ *path = NULL;
+
+ if (!src)
+ return 0;
+
+ sa_path = kmalloc(sizeof(*sa_path), GFP_KERNEL);
+ if (!sa_path)
+ return -ENOMEM;
+
+ if (copy_from_user(&upath, (void __user *)(unsigned long)src,
+ sizeof(upath))) {
+
+ kfree(sa_path);
+ return -EFAULT;
+ }
+
+ ib_copy_path_rec_from_user(sa_path, &upath);
+ *path = sa_path;
+ return 0;
+}
+
+static ssize_t ib_ucm_send_req(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ib_cm_req_param param;
+ struct ib_ucm_context *ctx;
+ struct ib_ucm_req cmd;
+ int result;
+
+ param.private_data = NULL;
+ param.primary_path = NULL;
+ param.alternate_path = NULL;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ result = ib_ucm_alloc_data(&param.private_data, cmd.data, cmd.len);
+ if (result)
+ goto done;
+
+ result = ib_ucm_path_get(&param.primary_path, cmd.primary_path);
+ if (result)
+ goto done;
+
+ result = ib_ucm_path_get(&param.alternate_path, cmd.alternate_path);
+ if (result)
+ goto done;
+
+ param.private_data_len = cmd.len;
+ param.service_id = cmd.sid;
+ param.qp_num = cmd.qpn;
+ param.qp_type = cmd.qp_type;
+ param.starting_psn = cmd.psn;
+ param.peer_to_peer = cmd.peer_to_peer;
+ param.responder_resources = cmd.responder_resources;
+ param.initiator_depth = cmd.initiator_depth;
+ param.remote_cm_response_timeout = cmd.remote_cm_response_timeout;
+ param.flow_control = cmd.flow_control;
+ param.local_cm_response_timeout = cmd.local_cm_response_timeout;
+ param.retry_count = cmd.retry_count;
+ param.rnr_retry_count = cmd.rnr_retry_count;
+ param.max_cm_retries = cmd.max_cm_retries;
+ param.srq = cmd.srq;
+
+ ctx = ib_ucm_ctx_get(file, cmd.id);
+ if (!IS_ERR(ctx)) {
+ result = ib_send_cm_req(ctx->cm_id, &param);
+ ib_ucm_ctx_put(ctx);
+ } else
+ result = PTR_ERR(ctx);
+
+done:
+ kfree(param.private_data);
+ kfree(param.primary_path);
+ kfree(param.alternate_path);
+ return result;
+}
+
+static ssize_t ib_ucm_send_rep(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ib_cm_rep_param param;
+ struct ib_ucm_context *ctx;
+ struct ib_ucm_rep cmd;
+ int result;
+
+ param.private_data = NULL;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ result = ib_ucm_alloc_data(&param.private_data, cmd.data, cmd.len);
+ if (result)
+ return result;
+
+ param.qp_num = cmd.qpn;
+ param.starting_psn = cmd.psn;
+ param.private_data_len = cmd.len;
+ param.responder_resources = cmd.responder_resources;
+ param.initiator_depth = cmd.initiator_depth;
+ param.failover_accepted = cmd.failover_accepted;
+ param.flow_control = cmd.flow_control;
+ param.rnr_retry_count = cmd.rnr_retry_count;
+ param.srq = cmd.srq;
+
+ ctx = ib_ucm_ctx_get(file, cmd.id);
+ if (!IS_ERR(ctx)) {
+ ctx->uid = cmd.uid;
+ result = ib_send_cm_rep(ctx->cm_id, &param);
+ ib_ucm_ctx_put(ctx);
+ } else
+ result = PTR_ERR(ctx);
+
+ kfree(param.private_data);
+ return result;
+}
+
+static ssize_t ib_ucm_send_private_data(struct ib_ucm_file *file,
+ const char __user *inbuf, int in_len,
+ int (*func)(struct ib_cm_id *cm_id,
+ const void *private_data,
+ u8 private_data_len))
+{
+ struct ib_ucm_private_data cmd;
+ struct ib_ucm_context *ctx;
+ const void *private_data = NULL;
+ int result;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ result = ib_ucm_alloc_data(&private_data, cmd.data, cmd.len);
+ if (result)
+ return result;
+
+ ctx = ib_ucm_ctx_get(file, cmd.id);
+ if (!IS_ERR(ctx)) {
+ result = func(ctx->cm_id, private_data, cmd.len);
+ ib_ucm_ctx_put(ctx);
+ } else
+ result = PTR_ERR(ctx);
+
+ kfree(private_data);
+ return result;
+}
+
+static ssize_t ib_ucm_send_rtu(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_rtu);
+}
+
+static ssize_t ib_ucm_send_dreq(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_dreq);
+}
+
+static ssize_t ib_ucm_send_drep(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_drep);
+}
+
+static ssize_t ib_ucm_send_info(struct ib_ucm_file *file,
+ const char __user *inbuf, int in_len,
+ int (*func)(struct ib_cm_id *cm_id,
+ int status,
+ const void *info,
+ u8 info_len,
+ const void *data,
+ u8 data_len))
+{
+ struct ib_ucm_context *ctx;
+ struct ib_ucm_info cmd;
+ const void *data = NULL;
+ const void *info = NULL;
+ int result;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ result = ib_ucm_alloc_data(&data, cmd.data, cmd.data_len);
+ if (result)
+ goto done;
+
+ result = ib_ucm_alloc_data(&info, cmd.info, cmd.info_len);
+ if (result)
+ goto done;
+
+ ctx = ib_ucm_ctx_get(file, cmd.id);
+ if (!IS_ERR(ctx)) {
+ result = func(ctx->cm_id, cmd.status, info, cmd.info_len,
+ data, cmd.data_len);
+ ib_ucm_ctx_put(ctx);
+ } else
+ result = PTR_ERR(ctx);
+
+done:
+ kfree(data);
+ kfree(info);
+ return result;
+}
+
+static ssize_t ib_ucm_send_rej(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ return ib_ucm_send_info(file, inbuf, in_len, (void *)ib_send_cm_rej);
+}
+
+static ssize_t ib_ucm_send_apr(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ return ib_ucm_send_info(file, inbuf, in_len, (void *)ib_send_cm_apr);
+}
+
+static ssize_t ib_ucm_send_mra(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ib_ucm_context *ctx;
+ struct ib_ucm_mra cmd;
+ const void *data = NULL;
+ int result;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ result = ib_ucm_alloc_data(&data, cmd.data, cmd.len);
+ if (result)
+ return result;
+
+ ctx = ib_ucm_ctx_get(file, cmd.id);
+ if (!IS_ERR(ctx)) {
+ result = ib_send_cm_mra(ctx->cm_id, cmd.timeout, data, cmd.len);
+ ib_ucm_ctx_put(ctx);
+ } else
+ result = PTR_ERR(ctx);
+
+ kfree(data);
+ return result;
+}
+
+static ssize_t ib_ucm_send_lap(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ib_ucm_context *ctx;
+ struct ib_sa_path_rec *path = NULL;
+ struct ib_ucm_lap cmd;
+ const void *data = NULL;
+ int result;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ result = ib_ucm_alloc_data(&data, cmd.data, cmd.len);
+ if (result)
+ goto done;
+
+ result = ib_ucm_path_get(&path, cmd.path);
+ if (result)
+ goto done;
+
+ ctx = ib_ucm_ctx_get(file, cmd.id);
+ if (!IS_ERR(ctx)) {
+ result = ib_send_cm_lap(ctx->cm_id, path, data, cmd.len);
+ ib_ucm_ctx_put(ctx);
+ } else
+ result = PTR_ERR(ctx);
+
+done:
+ kfree(data);
+ kfree(path);
+ return result;
+}
+
+static ssize_t ib_ucm_send_sidr_req(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ib_cm_sidr_req_param param;
+ struct ib_ucm_context *ctx;
+ struct ib_ucm_sidr_req cmd;
+ int result;
+
+ param.private_data = NULL;
+ param.path = NULL;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ result = ib_ucm_alloc_data(&param.private_data, cmd.data, cmd.len);
+ if (result)
+ goto done;
+
+ result = ib_ucm_path_get(&param.path, cmd.path);
+ if (result)
+ goto done;
+
+ param.private_data_len = cmd.len;
+ param.service_id = cmd.sid;
+ param.timeout_ms = cmd.timeout;
+ param.max_cm_retries = cmd.max_cm_retries;
+
+ ctx = ib_ucm_ctx_get(file, cmd.id);
+ if (!IS_ERR(ctx)) {
+ result = ib_send_cm_sidr_req(ctx->cm_id, &param);
+ ib_ucm_ctx_put(ctx);
+ } else
+ result = PTR_ERR(ctx);
+
+done:
+ kfree(param.private_data);
+ kfree(param.path);
+ return result;
+}
+
+static ssize_t ib_ucm_send_sidr_rep(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ib_cm_sidr_rep_param param;
+ struct ib_ucm_sidr_rep cmd;
+ struct ib_ucm_context *ctx;
+ int result;
+
+ param.info = NULL;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ result = ib_ucm_alloc_data(&param.private_data,
+ cmd.data, cmd.data_len);
+ if (result)
+ goto done;
+
+ result = ib_ucm_alloc_data(&param.info, cmd.info, cmd.info_len);
+ if (result)
+ goto done;
+
+ param.qp_num = cmd.qpn;
+ param.qkey = cmd.qkey;
+ param.status = cmd.status;
+ param.info_length = cmd.info_len;
+ param.private_data_len = cmd.data_len;
+
+ ctx = ib_ucm_ctx_get(file, cmd.id);
+ if (!IS_ERR(ctx)) {
+ result = ib_send_cm_sidr_rep(ctx->cm_id, &param);
+ ib_ucm_ctx_put(ctx);
+ } else
+ result = PTR_ERR(ctx);
+
+done:
+ kfree(param.private_data);
+ kfree(param.info);
+ return result;
+}
+
+static ssize_t (*ucm_cmd_table[])(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len) = {
+ [IB_USER_CM_CMD_CREATE_ID] = ib_ucm_create_id,
+ [IB_USER_CM_CMD_DESTROY_ID] = ib_ucm_destroy_id,
+ [IB_USER_CM_CMD_ATTR_ID] = ib_ucm_attr_id,
+ [IB_USER_CM_CMD_LISTEN] = ib_ucm_listen,
+ [IB_USER_CM_CMD_NOTIFY] = ib_ucm_notify,
+ [IB_USER_CM_CMD_SEND_REQ] = ib_ucm_send_req,
+ [IB_USER_CM_CMD_SEND_REP] = ib_ucm_send_rep,
+ [IB_USER_CM_CMD_SEND_RTU] = ib_ucm_send_rtu,
+ [IB_USER_CM_CMD_SEND_DREQ] = ib_ucm_send_dreq,
+ [IB_USER_CM_CMD_SEND_DREP] = ib_ucm_send_drep,
+ [IB_USER_CM_CMD_SEND_REJ] = ib_ucm_send_rej,
+ [IB_USER_CM_CMD_SEND_MRA] = ib_ucm_send_mra,
+ [IB_USER_CM_CMD_SEND_LAP] = ib_ucm_send_lap,
+ [IB_USER_CM_CMD_SEND_APR] = ib_ucm_send_apr,
+ [IB_USER_CM_CMD_SEND_SIDR_REQ] = ib_ucm_send_sidr_req,
+ [IB_USER_CM_CMD_SEND_SIDR_REP] = ib_ucm_send_sidr_rep,
+ [IB_USER_CM_CMD_EVENT] = ib_ucm_event,
+ [IB_USER_CM_CMD_INIT_QP_ATTR] = ib_ucm_init_qp_attr,
+};
+
+static ssize_t ib_ucm_write(struct file *filp, const char __user *buf,
+ size_t len, loff_t *pos)
+{
+ struct ib_ucm_file *file = filp->private_data;
+ struct ib_ucm_cmd_hdr hdr;
+ ssize_t result;
+
+ if (len < sizeof(hdr))
+ return -EINVAL;
+
+ if (copy_from_user(&hdr, buf, sizeof(hdr)))
+ return -EFAULT;
+
+ if (hdr.cmd < 0 || hdr.cmd >= ARRAY_SIZE(ucm_cmd_table))
+ return -EINVAL;
+
+ if (hdr.in + sizeof(hdr) > len)
+ return -EINVAL;
+
+ result = ucm_cmd_table[hdr.cmd](file, buf + sizeof(hdr),
+ hdr.in, hdr.out);
+ if (!result)
+ result = len;
+
+ return result;
+}
+
+static unsigned int ib_ucm_poll(struct file *filp,
+ struct poll_table_struct *wait)
+{
+ struct ib_ucm_file *file = filp->private_data;
+ unsigned int mask = 0;
+
+ poll_wait(filp, &file->poll_wait, wait);
+
+ if (!list_empty(&file->events))
+ mask = POLLIN | POLLRDNORM;
+
+ return mask;
+}
+
+/*
+ * ib_ucm_open() does not need the BKL:
+ *
+ * - no global state is referred to;
+ * - there is no ioctl method to race against;
+ * - no further module initialization is required for open to work
+ * after the device is registered.
+ */
+static int ib_ucm_open(struct inode *inode, struct file *filp)
+{
+ struct ib_ucm_file *file;
+
+ file = kzalloc(sizeof(*file), GFP_KERNEL);
+ if (!file)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&file->events);
+ INIT_LIST_HEAD(&file->ctxs);
+ init_waitqueue_head(&file->poll_wait);
+
+ mutex_init(&file->file_mutex);
+
+ filp->private_data = file;
+ file->filp = filp;
+ file->device = container_of(inode->i_cdev->si_drv1, struct ib_ucm_device, cdev);
+
+ return 0;
+}
+
+static int ib_ucm_close(struct inode *inode, struct file *filp)
+{
+ struct ib_ucm_file *file = filp->private_data;
+ struct ib_ucm_context *ctx;
+
+ mutex_lock(&file->file_mutex);
+ while (!list_empty(&file->ctxs)) {
+ ctx = list_entry(file->ctxs.next,
+ struct ib_ucm_context, file_list);
+ mutex_unlock(&file->file_mutex);
+
+ mutex_lock(&ctx_id_mutex);
+ idr_remove(&ctx_id_table, ctx->id);
+ mutex_unlock(&ctx_id_mutex);
+
+ ib_destroy_cm_id(ctx->cm_id);
+ ib_ucm_cleanup_events(ctx);
+ kfree(ctx);
+
+ mutex_lock(&file->file_mutex);
+ }
+ mutex_unlock(&file->file_mutex);
+ kfree(file);
+ return 0;
+}
+
+static void ib_ucm_release_dev(struct device *dev)
+{
+ struct ib_ucm_device *ucm_dev;
+
+ ucm_dev = container_of(dev, struct ib_ucm_device, dev);
+ cdev_del(&ucm_dev->cdev);
+ clear_bit(ucm_dev->devnum, dev_map);
+ kfree(ucm_dev);
+}
+
+static const struct file_operations ucm_fops = {
+ .owner = THIS_MODULE,
+ .open = ib_ucm_open,
+ .release = ib_ucm_close,
+ .write = ib_ucm_write,
+ .poll = ib_ucm_poll,
+};
+
+static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct ib_ucm_device *ucm_dev;
+
+ ucm_dev = container_of(dev, struct ib_ucm_device, dev);
+ return sprintf(buf, "%s\n", ucm_dev->ib_dev->name);
+}
+static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
+
+static void ib_ucm_add_one(struct ib_device *device)
+{
+ struct ib_ucm_device *ucm_dev;
+
+ if (!device->alloc_ucontext ||
+ rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+ return;
+
+ ucm_dev = kzalloc(sizeof *ucm_dev, GFP_KERNEL);
+ if (!ucm_dev)
+ return;
+
+ ucm_dev->ib_dev = device;
+
+ ucm_dev->devnum = find_first_zero_bit(dev_map, IB_UCM_MAX_DEVICES);
+ if (ucm_dev->devnum >= IB_UCM_MAX_DEVICES)
+ goto err;
+
+ set_bit(ucm_dev->devnum, dev_map);
+
+ cdev_init(&ucm_dev->cdev, &ucm_fops);
+ ucm_dev->cdev.owner = THIS_MODULE;
+ kobject_set_name(&ucm_dev->cdev.kobj, "ucm%d", ucm_dev->devnum);
+ if (cdev_add(&ucm_dev->cdev, IB_UCM_BASE_DEV + ucm_dev->devnum, 1))
+ goto err;
+
+ ucm_dev->dev.class = &cm_class;
+ ucm_dev->dev.parent = device->dma_device;
+ ucm_dev->dev.devt = ucm_dev->cdev.dev;
+ ucm_dev->dev.release = ib_ucm_release_dev;
+ dev_set_name(&ucm_dev->dev, "ucm%d", ucm_dev->devnum);
+ if (device_register(&ucm_dev->dev))
+ goto err_cdev;
+
+ if (device_create_file(&ucm_dev->dev, &dev_attr_ibdev))
+ goto err_dev;
+
+ ib_set_client_data(device, &ucm_client, ucm_dev);
+ return;
+
+err_dev:
+ device_unregister(&ucm_dev->dev);
+err_cdev:
+ cdev_del(&ucm_dev->cdev);
+ clear_bit(ucm_dev->devnum, dev_map);
+err:
+ kfree(ucm_dev);
+ return;
+}
+
+static void ib_ucm_remove_one(struct ib_device *device)
+{
+ struct ib_ucm_device *ucm_dev = ib_get_client_data(device, &ucm_client);
+
+ if (!ucm_dev)
+ return;
+
+ device_unregister(&ucm_dev->dev);
+}
+
+static ssize_t show_abi_version(struct class *class, char *buf)
+{
+ return sprintf(buf, "%d\n", IB_USER_CM_ABI_VERSION);
+}
+static CLASS_ATTR(abi_version, S_IRUGO, show_abi_version, NULL);
+
+static int __init ib_ucm_init(void)
+{
+ int ret;
+
+ ret = register_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES,
+ "infiniband_cm");
+ if (ret) {
+ printk(KERN_ERR "ucm: couldn't register device number\n");
+ goto error1;
+ }
+
+ ret = class_create_file(&cm_class, &class_attr_abi_version);
+ if (ret) {
+ printk(KERN_ERR "ucm: couldn't create abi_version attribute\n");
+ goto error2;
+ }
+
+ ret = ib_register_client(&ucm_client);
+ if (ret) {
+ printk(KERN_ERR "ucm: couldn't register client\n");
+ goto error3;
+ }
+ return 0;
+
+error3:
+ class_remove_file(&cm_class, &class_attr_abi_version);
+error2:
+ unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES);
+error1:
+ return ret;
+}
+
+static void __exit ib_ucm_cleanup(void)
+{
+ ib_unregister_client(&ucm_client);
+ class_remove_file(&cm_class, &class_attr_abi_version);
+ unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES);
+ idr_destroy(&ctx_id_table);
+}
+
+module_init_order(ib_ucm_init, SI_ORDER_THIRD);
+module_exit(ib_ucm_cleanup);
diff --git a/sys/ofed/drivers/infiniband/core/ucma.c b/sys/ofed/drivers/infiniband/core/ucma.c
new file mode 100644
index 0000000..23cbf7b
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/ucma.c
@@ -0,0 +1,1337 @@
+/*
+ * Copyright (c) 2005-2006 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/file.h>
+#include <linux/mutex.h>
+#include <linux/poll.h>
+#include <linux/idr.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/miscdevice.h>
+
+#include <rdma/rdma_user_cm.h>
+#include <rdma/ib_marshall.h>
+#include <rdma/rdma_cm.h>
+#include <rdma/rdma_cm_ib.h>
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("RDMA Userspace Connection Manager Access");
+MODULE_LICENSE("Dual BSD/GPL");
+
+enum {
+ UCMA_MAX_BACKLOG = 1024
+};
+
+struct ucma_file {
+ struct mutex mut;
+ struct file *filp;
+ struct list_head ctx_list;
+ struct list_head event_list;
+ wait_queue_head_t poll_wait;
+};
+
+struct ucma_context {
+ int id;
+ struct completion comp;
+ atomic_t ref;
+ int events_reported;
+ int backlog;
+
+ struct ucma_file *file;
+ struct rdma_cm_id *cm_id;
+ u64 uid;
+
+ struct list_head list;
+ struct list_head mc_list;
+};
+
+struct ucma_multicast {
+ struct ucma_context *ctx;
+ int id;
+ int events_reported;
+
+ u64 uid;
+ struct list_head list;
+ struct sockaddr_storage addr;
+};
+
+struct ucma_event {
+ struct ucma_context *ctx;
+ struct ucma_multicast *mc;
+ struct list_head list;
+ struct rdma_cm_id *cm_id;
+ struct rdma_ucm_event_resp resp;
+};
+
+static DEFINE_MUTEX(mut);
+static DEFINE_IDR(ctx_idr);
+static DEFINE_IDR(multicast_idr);
+
+static inline struct ucma_context *_ucma_find_context(int id,
+ struct ucma_file *file)
+{
+ struct ucma_context *ctx;
+
+ ctx = idr_find(&ctx_idr, id);
+ if (!ctx)
+ ctx = ERR_PTR(-ENOENT);
+ else if (ctx->file != file)
+ ctx = ERR_PTR(-EINVAL);
+ return ctx;
+}
+
+static struct ucma_context *ucma_get_ctx(struct ucma_file *file, int id)
+{
+ struct ucma_context *ctx;
+
+ mutex_lock(&mut);
+ ctx = _ucma_find_context(id, file);
+ if (!IS_ERR(ctx))
+ atomic_inc(&ctx->ref);
+ mutex_unlock(&mut);
+ return ctx;
+}
+
+static void ucma_put_ctx(struct ucma_context *ctx)
+{
+ if (atomic_dec_and_test(&ctx->ref))
+ complete(&ctx->comp);
+}
+
+static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file)
+{
+ struct ucma_context *ctx;
+ int ret;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return NULL;
+
+ atomic_set(&ctx->ref, 1);
+ init_completion(&ctx->comp);
+ INIT_LIST_HEAD(&ctx->mc_list);
+ ctx->file = file;
+
+ do {
+ ret = idr_pre_get(&ctx_idr, GFP_KERNEL);
+ if (!ret)
+ goto error;
+
+ mutex_lock(&mut);
+ ret = idr_get_new(&ctx_idr, ctx, &ctx->id);
+ mutex_unlock(&mut);
+ } while (ret == -EAGAIN);
+
+ if (ret)
+ goto error;
+
+ list_add_tail(&ctx->list, &file->ctx_list);
+ return ctx;
+
+error:
+ kfree(ctx);
+ return NULL;
+}
+
+static struct ucma_multicast* ucma_alloc_multicast(struct ucma_context *ctx)
+{
+ struct ucma_multicast *mc;
+ int ret;
+
+ mc = kzalloc(sizeof(*mc), GFP_KERNEL);
+ if (!mc)
+ return NULL;
+
+ do {
+ ret = idr_pre_get(&multicast_idr, GFP_KERNEL);
+ if (!ret)
+ goto error;
+
+ mutex_lock(&mut);
+ ret = idr_get_new(&multicast_idr, mc, &mc->id);
+ mutex_unlock(&mut);
+ } while (ret == -EAGAIN);
+
+ if (ret)
+ goto error;
+
+ mc->ctx = ctx;
+ list_add_tail(&mc->list, &ctx->mc_list);
+ return mc;
+
+error:
+ kfree(mc);
+ return NULL;
+}
+
+static void ucma_copy_conn_event(struct rdma_ucm_conn_param *dst,
+ struct rdma_conn_param *src)
+{
+ if (src->private_data_len)
+ memcpy(dst->private_data, src->private_data,
+ src->private_data_len);
+ dst->private_data_len = src->private_data_len;
+ dst->responder_resources =src->responder_resources;
+ dst->initiator_depth = src->initiator_depth;
+ dst->flow_control = src->flow_control;
+ dst->retry_count = src->retry_count;
+ dst->rnr_retry_count = src->rnr_retry_count;
+ dst->srq = src->srq;
+ dst->qp_num = src->qp_num;
+}
+
+static void ucma_copy_ud_event(struct rdma_ucm_ud_param *dst,
+ struct rdma_ud_param *src)
+{
+ if (src->private_data_len)
+ memcpy(dst->private_data, src->private_data,
+ src->private_data_len);
+ dst->private_data_len = src->private_data_len;
+ ib_copy_ah_attr_to_user(&dst->ah_attr, &src->ah_attr);
+ dst->qp_num = src->qp_num;
+ dst->qkey = src->qkey;
+}
+
+static void ucma_set_event_context(struct ucma_context *ctx,
+ struct rdma_cm_event *event,
+ struct ucma_event *uevent)
+{
+ uevent->ctx = ctx;
+ switch (event->event) {
+ case RDMA_CM_EVENT_MULTICAST_JOIN:
+ case RDMA_CM_EVENT_MULTICAST_ERROR:
+ uevent->mc = (struct ucma_multicast *)
+ event->param.ud.private_data;
+ uevent->resp.uid = uevent->mc->uid;
+ uevent->resp.id = uevent->mc->id;
+ break;
+ default:
+ uevent->resp.uid = ctx->uid;
+ uevent->resp.id = ctx->id;
+ break;
+ }
+}
+
+static int ucma_event_handler(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event)
+{
+ struct ucma_event *uevent;
+ struct ucma_context *ctx = cm_id->context;
+ int ret = 0;
+
+ uevent = kzalloc(sizeof(*uevent), GFP_KERNEL);
+ if (!uevent)
+ return event->event == RDMA_CM_EVENT_CONNECT_REQUEST;
+
+ uevent->cm_id = cm_id;
+ ucma_set_event_context(ctx, event, uevent);
+ uevent->resp.event = event->event;
+ uevent->resp.status = event->status;
+ if (cm_id->ps == RDMA_PS_UDP || cm_id->ps == RDMA_PS_IPOIB)
+ ucma_copy_ud_event(&uevent->resp.param.ud, &event->param.ud);
+ else
+ ucma_copy_conn_event(&uevent->resp.param.conn,
+ &event->param.conn);
+
+ mutex_lock(&ctx->file->mut);
+ if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST) {
+ if (!ctx->backlog) {
+ ret = -ENOMEM;
+ kfree(uevent);
+ goto out;
+ }
+ ctx->backlog--;
+ } else if (!ctx->uid) {
+ /*
+ * We ignore events for new connections until userspace has set
+ * their context. This can only happen if an error occurs on a
+ * new connection before the user accepts it. This is okay,
+ * since the accept will just fail later.
+ */
+ kfree(uevent);
+ goto out;
+ }
+
+ list_add_tail(&uevent->list, &ctx->file->event_list);
+ wake_up_interruptible(&ctx->file->poll_wait);
+ if (ctx->file->filp)
+ selwakeup(&ctx->file->filp->f_selinfo);
+out:
+ mutex_unlock(&ctx->file->mut);
+ return ret;
+}
+
+static ssize_t ucma_get_event(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ucma_context *ctx;
+ struct rdma_ucm_get_event cmd;
+ struct ucma_event *uevent;
+ int ret = 0;
+ DEFINE_WAIT(wait);
+
+ if (out_len < sizeof uevent->resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ mutex_lock(&file->mut);
+ while (list_empty(&file->event_list)) {
+ mutex_unlock(&file->mut);
+
+ if (file->filp->f_flags & O_NONBLOCK)
+ return -EAGAIN;
+
+ if (wait_event_interruptible(file->poll_wait,
+ !list_empty(&file->event_list)))
+ return -ERESTARTSYS;
+
+ mutex_lock(&file->mut);
+ }
+
+ uevent = list_entry(file->event_list.next, struct ucma_event, list);
+
+ if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) {
+ ctx = ucma_alloc_ctx(file);
+ if (!ctx) {
+ ret = -ENOMEM;
+ goto done;
+ }
+ uevent->ctx->backlog++;
+ ctx->cm_id = uevent->cm_id;
+ ctx->cm_id->context = ctx;
+ uevent->resp.id = ctx->id;
+ }
+
+ if (copy_to_user((void __user *)(unsigned long)cmd.response,
+ &uevent->resp, sizeof uevent->resp)) {
+ ret = -EFAULT;
+ goto done;
+ }
+
+ list_del(&uevent->list);
+ uevent->ctx->events_reported++;
+ if (uevent->mc)
+ uevent->mc->events_reported++;
+ kfree(uevent);
+done:
+ mutex_unlock(&file->mut);
+ return ret;
+}
+
+static ssize_t ucma_create_id(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_create_id cmd;
+ struct rdma_ucm_create_id_resp resp;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ mutex_lock(&file->mut);
+ ctx = ucma_alloc_ctx(file);
+ mutex_unlock(&file->mut);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->uid = cmd.uid;
+ ctx->cm_id = rdma_create_id(ucma_event_handler, ctx, cmd.ps);
+ if (IS_ERR(ctx->cm_id)) {
+ ret = PTR_ERR(ctx->cm_id);
+ goto err1;
+ }
+
+ resp.id = ctx->id;
+ if (copy_to_user((void __user *)(unsigned long)cmd.response,
+ &resp, sizeof(resp))) {
+ ret = -EFAULT;
+ goto err2;
+ }
+ return 0;
+
+err2:
+ rdma_destroy_id(ctx->cm_id);
+err1:
+ mutex_lock(&mut);
+ idr_remove(&ctx_idr, ctx->id);
+ mutex_unlock(&mut);
+ kfree(ctx);
+ return ret;
+}
+
+static void ucma_cleanup_multicast(struct ucma_context *ctx)
+{
+ struct ucma_multicast *mc, *tmp;
+
+ mutex_lock(&mut);
+ list_for_each_entry_safe(mc, tmp, &ctx->mc_list, list) {
+ list_del(&mc->list);
+ idr_remove(&multicast_idr, mc->id);
+ kfree(mc);
+ }
+ mutex_unlock(&mut);
+}
+
+static void ucma_cleanup_events(struct ucma_context *ctx)
+{
+ struct ucma_event *uevent, *tmp;
+
+ list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list) {
+ if (uevent->ctx != ctx)
+ continue;
+
+ list_del(&uevent->list);
+
+ /* clear incoming connections. */
+ if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST)
+ rdma_destroy_id(uevent->cm_id);
+
+ kfree(uevent);
+ }
+}
+
+static void ucma_cleanup_mc_events(struct ucma_multicast *mc)
+{
+ struct ucma_event *uevent, *tmp;
+
+ list_for_each_entry_safe(uevent, tmp, &mc->ctx->file->event_list, list) {
+ if (uevent->mc != mc)
+ continue;
+
+ list_del(&uevent->list);
+ kfree(uevent);
+ }
+}
+
+static int ucma_free_ctx(struct ucma_context *ctx)
+{
+ int events_reported;
+
+ /* No new events will be generated after destroying the id. */
+ rdma_destroy_id(ctx->cm_id);
+
+ ucma_cleanup_multicast(ctx);
+
+ /* Cleanup events not yet reported to the user. */
+ mutex_lock(&ctx->file->mut);
+ ucma_cleanup_events(ctx);
+ list_del(&ctx->list);
+ mutex_unlock(&ctx->file->mut);
+
+ events_reported = ctx->events_reported;
+ kfree(ctx);
+ return events_reported;
+}
+
+static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_destroy_id cmd;
+ struct rdma_ucm_destroy_id_resp resp;
+ struct ucma_context *ctx;
+ int ret = 0;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ mutex_lock(&mut);
+ ctx = _ucma_find_context(cmd.id, file);
+ if (!IS_ERR(ctx))
+ idr_remove(&ctx_idr, ctx->id);
+ mutex_unlock(&mut);
+
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ ucma_put_ctx(ctx);
+ wait_for_completion(&ctx->comp);
+ resp.events_reported = ucma_free_ctx(ctx);
+
+ if (copy_to_user((void __user *)(unsigned long)cmd.response,
+ &resp, sizeof(resp)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static ssize_t ucma_bind_addr(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_bind_addr cmd;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ ret = rdma_bind_addr(ctx->cm_id, (struct sockaddr *) &cmd.addr);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_resolve_addr(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_resolve_addr cmd;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr,
+ (struct sockaddr *) &cmd.dst_addr,
+ cmd.timeout_ms);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_resolve_route(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_resolve_route cmd;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ ret = rdma_resolve_route(ctx->cm_id, cmd.timeout_ms);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static void ucma_copy_ib_route(struct rdma_ucm_query_route_resp *resp,
+ struct rdma_route *route)
+{
+ struct rdma_dev_addr *dev_addr;
+
+ resp->num_paths = route->num_paths;
+ switch (route->num_paths) {
+ case 0:
+ dev_addr = &route->addr.dev_addr;
+ rdma_addr_get_dgid(dev_addr,
+ (union ib_gid *) &resp->ib_route[0].dgid);
+ rdma_addr_get_sgid(dev_addr,
+ (union ib_gid *) &resp->ib_route[0].sgid);
+ resp->ib_route[0].pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr));
+ break;
+ case 2:
+ ib_copy_path_rec_to_user(&resp->ib_route[1],
+ &route->path_rec[1]);
+ /* fall through */
+ case 1:
+ ib_copy_path_rec_to_user(&resp->ib_route[0],
+ &route->path_rec[0]);
+ break;
+ default:
+ break;
+ }
+}
+
+static void ucma_copy_iboe_route(struct rdma_ucm_query_route_resp *resp,
+ struct rdma_route *route)
+{
+ struct rdma_dev_addr *dev_addr;
+ struct net_device *dev;
+ u16 vid = 0;
+
+ resp->num_paths = route->num_paths;
+ switch (route->num_paths) {
+ case 0:
+ dev_addr = &route->addr.dev_addr;
+ dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if);
+ if (dev) {
+ vid = rdma_vlan_dev_vlan_id(dev);
+ dev_put(dev);
+ }
+
+ iboe_mac_vlan_to_ll((union ib_gid *) &resp->ib_route[0].dgid,
+ dev_addr->dst_dev_addr, vid);
+ iboe_addr_get_sgid(dev_addr,
+ (union ib_gid *) &resp->ib_route[0].sgid);
+ resp->ib_route[0].pkey = cpu_to_be16(0xffff);
+ break;
+ case 2:
+ ib_copy_path_rec_to_user(&resp->ib_route[1],
+ &route->path_rec[1]);
+ /* fall through */
+ case 1:
+ ib_copy_path_rec_to_user(&resp->ib_route[0],
+ &route->path_rec[0]);
+ break;
+ default:
+ break;
+ }
+}
+
+static ssize_t ucma_query_route(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_query_route cmd;
+ struct rdma_ucm_query_route_resp resp;
+ struct ucma_context *ctx;
+ struct sockaddr *addr;
+ int ret = 0;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ memset(&resp, 0, sizeof resp);
+ addr = (struct sockaddr *) &ctx->cm_id->route.addr.src_addr;
+ memcpy(&resp.src_addr, addr, addr->sa_family == AF_INET ?
+ sizeof(struct sockaddr_in) :
+ sizeof(struct sockaddr_in6));
+ addr = (struct sockaddr *) &ctx->cm_id->route.addr.dst_addr;
+ memcpy(&resp.dst_addr, addr, addr->sa_family == AF_INET ?
+ sizeof(struct sockaddr_in) :
+ sizeof(struct sockaddr_in6));
+ if (!ctx->cm_id->device)
+ goto out;
+
+ resp.node_guid = (__force __u64) ctx->cm_id->device->node_guid;
+ resp.port_num = ctx->cm_id->port_num;
+ if (rdma_node_get_transport(ctx->cm_id->device->node_type) == RDMA_TRANSPORT_IB) {
+ switch (rdma_port_get_link_layer(ctx->cm_id->device, ctx->cm_id->port_num)) {
+ case IB_LINK_LAYER_INFINIBAND:
+ ucma_copy_ib_route(&resp, &ctx->cm_id->route);
+ break;
+ case IB_LINK_LAYER_ETHERNET:
+ ucma_copy_iboe_route(&resp, &ctx->cm_id->route);
+ break;
+ default:
+ break;
+ }
+ }
+
+out:
+ if (copy_to_user((void __user *)(unsigned long)cmd.response,
+ &resp, sizeof(resp)))
+ ret = -EFAULT;
+
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static void ucma_copy_conn_param(struct rdma_conn_param *dst,
+ struct rdma_ucm_conn_param *src)
+{
+ dst->private_data = src->private_data;
+ dst->private_data_len = src->private_data_len;
+ dst->responder_resources =src->responder_resources;
+ dst->initiator_depth = src->initiator_depth;
+ dst->flow_control = src->flow_control;
+ dst->retry_count = src->retry_count;
+ dst->rnr_retry_count = src->rnr_retry_count;
+ dst->srq = src->srq;
+ dst->qp_num = src->qp_num;
+}
+
+static ssize_t ucma_connect(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_connect cmd;
+ struct rdma_conn_param conn_param;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ if (!cmd.conn_param.valid)
+ return -EINVAL;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ ucma_copy_conn_param(&conn_param, &cmd.conn_param);
+ ret = rdma_connect(ctx->cm_id, &conn_param);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_listen(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_listen cmd;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ ctx->backlog = cmd.backlog > 0 && cmd.backlog < UCMA_MAX_BACKLOG ?
+ cmd.backlog : UCMA_MAX_BACKLOG;
+ ret = rdma_listen(ctx->cm_id, ctx->backlog);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_accept(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_accept cmd;
+ struct rdma_conn_param conn_param;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ if (cmd.conn_param.valid) {
+ ctx->uid = cmd.uid;
+ ucma_copy_conn_param(&conn_param, &cmd.conn_param);
+ ret = rdma_accept(ctx->cm_id, &conn_param);
+ } else
+ ret = rdma_accept(ctx->cm_id, NULL);
+
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_reject(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_reject cmd;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ ret = rdma_reject(ctx->cm_id, cmd.private_data, cmd.private_data_len);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_disconnect(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_disconnect cmd;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ ret = rdma_disconnect(ctx->cm_id);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_init_qp_attr(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_init_qp_attr cmd;
+ struct ib_uverbs_qp_attr resp;
+ struct ucma_context *ctx;
+ struct ib_qp_attr qp_attr;
+ int ret;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ resp.qp_attr_mask = 0;
+ memset(&qp_attr, 0, sizeof qp_attr);
+ qp_attr.qp_state = cmd.qp_state;
+ ret = rdma_init_qp_attr(ctx->cm_id, &qp_attr, &resp.qp_attr_mask);
+ if (ret)
+ goto out;
+
+ ib_copy_qp_attr_to_user(&resp, &qp_attr);
+ if (copy_to_user((void __user *)(unsigned long)cmd.response,
+ &resp, sizeof(resp)))
+ ret = -EFAULT;
+
+out:
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static int ucma_set_option_id(struct ucma_context *ctx, int optname,
+ void *optval, size_t optlen)
+{
+ int ret = 0;
+
+ switch (optname) {
+ case RDMA_OPTION_ID_TOS:
+ if (optlen != sizeof(u8)) {
+ ret = -EINVAL;
+ break;
+ }
+ rdma_set_service_type(ctx->cm_id, *((u8 *) optval));
+ break;
+ default:
+ ret = -ENOSYS;
+ }
+
+ return ret;
+}
+
+static int ucma_set_ib_path(struct ucma_context *ctx,
+ struct ib_path_rec_data *path_data, size_t optlen)
+{
+ struct ib_sa_path_rec sa_path;
+ struct rdma_cm_event event;
+ int ret;
+
+ if (optlen % sizeof(*path_data))
+ return -EINVAL;
+
+ for (; optlen; optlen -= sizeof(*path_data), path_data++) {
+ if (path_data->flags == (IB_PATH_GMP | IB_PATH_PRIMARY |
+ IB_PATH_BIDIRECTIONAL))
+ break;
+ }
+
+ if (!optlen)
+ return -EINVAL;
+
+ ib_sa_unpack_path(path_data->path_rec, &sa_path);
+ ret = rdma_set_ib_paths(ctx->cm_id, &sa_path, 1);
+ if (ret)
+ return ret;
+
+ memset(&event, 0, sizeof event);
+ event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
+ return ucma_event_handler(ctx->cm_id, &event);
+}
+
+static int ucma_set_option_ib(struct ucma_context *ctx, int optname,
+ void *optval, size_t optlen)
+{
+ int ret;
+
+ switch (optname) {
+ case RDMA_OPTION_IB_PATH:
+ ret = ucma_set_ib_path(ctx, optval, optlen);
+ break;
+ default:
+ ret = -ENOSYS;
+ }
+
+ return ret;
+}
+
+static int ucma_set_option_level(struct ucma_context *ctx, int level,
+ int optname, void *optval, size_t optlen)
+{
+ int ret;
+
+ switch (level) {
+ case RDMA_OPTION_ID:
+ ret = ucma_set_option_id(ctx, optname, optval, optlen);
+ break;
+ case RDMA_OPTION_IB:
+ ret = ucma_set_option_ib(ctx, optname, optval, optlen);
+ break;
+ default:
+ ret = -ENOSYS;
+ }
+
+ return ret;
+}
+
+static ssize_t ucma_set_option(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_set_option cmd;
+ struct ucma_context *ctx;
+ void *optval;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ optval = kmalloc(cmd.optlen, GFP_KERNEL);
+ if (!optval) {
+ ret = -ENOMEM;
+ goto out1;
+ }
+
+ if (copy_from_user(optval, (void __user *) (unsigned long) cmd.optval,
+ cmd.optlen)) {
+ ret = -EFAULT;
+ goto out2;
+ }
+
+ ret = ucma_set_option_level(ctx, cmd.level, cmd.optname, optval,
+ cmd.optlen);
+out2:
+ kfree(optval);
+out1:
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_notify(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_notify cmd;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ ret = rdma_notify(ctx->cm_id, (enum ib_event_type) cmd.event);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_join_multicast(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_join_mcast cmd;
+ struct rdma_ucm_create_id_resp resp;
+ struct ucma_context *ctx;
+ struct ucma_multicast *mc;
+ int ret;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ mutex_lock(&file->mut);
+ mc = ucma_alloc_multicast(ctx);
+ if (!mc) {
+ ret = -ENOMEM;
+ goto err1;
+ }
+
+ mc->uid = cmd.uid;
+ memcpy(&mc->addr, &cmd.addr, sizeof cmd.addr);
+ ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr, mc);
+ if (ret)
+ goto err2;
+
+ resp.id = mc->id;
+ if (copy_to_user((void __user *)(unsigned long)cmd.response,
+ &resp, sizeof(resp))) {
+ ret = -EFAULT;
+ goto err3;
+ }
+
+ mutex_unlock(&file->mut);
+ ucma_put_ctx(ctx);
+ return 0;
+
+err3:
+ rdma_leave_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr);
+ ucma_cleanup_mc_events(mc);
+err2:
+ mutex_lock(&mut);
+ idr_remove(&multicast_idr, mc->id);
+ mutex_unlock(&mut);
+ list_del(&mc->list);
+ kfree(mc);
+err1:
+ mutex_unlock(&file->mut);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_leave_multicast(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_destroy_id cmd;
+ struct rdma_ucm_destroy_id_resp resp;
+ struct ucma_multicast *mc;
+ int ret = 0;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ mutex_lock(&mut);
+ mc = idr_find(&multicast_idr, cmd.id);
+ if (!mc)
+ mc = ERR_PTR(-ENOENT);
+ else if (mc->ctx->file != file)
+ mc = ERR_PTR(-EINVAL);
+ else {
+ idr_remove(&multicast_idr, mc->id);
+ atomic_inc(&mc->ctx->ref);
+ }
+ mutex_unlock(&mut);
+
+ if (IS_ERR(mc)) {
+ ret = PTR_ERR(mc);
+ goto out;
+ }
+
+ rdma_leave_multicast(mc->ctx->cm_id, (struct sockaddr *) &mc->addr);
+ mutex_lock(&mc->ctx->file->mut);
+ ucma_cleanup_mc_events(mc);
+ list_del(&mc->list);
+ mutex_unlock(&mc->ctx->file->mut);
+
+ ucma_put_ctx(mc->ctx);
+ resp.events_reported = mc->events_reported;
+ kfree(mc);
+
+ if (copy_to_user((void __user *)(unsigned long)cmd.response,
+ &resp, sizeof(resp)))
+ ret = -EFAULT;
+out:
+ return ret;
+}
+
+static void ucma_lock_files(struct ucma_file *file1, struct ucma_file *file2)
+{
+ /* Acquire mutex's based on pointer comparison to prevent deadlock. */
+ if (file1 < file2) {
+ mutex_lock(&file1->mut);
+ mutex_lock(&file2->mut);
+ } else {
+ mutex_lock(&file2->mut);
+ mutex_lock(&file1->mut);
+ }
+}
+
+static void ucma_unlock_files(struct ucma_file *file1, struct ucma_file *file2)
+{
+ if (file1 < file2) {
+ mutex_unlock(&file2->mut);
+ mutex_unlock(&file1->mut);
+ } else {
+ mutex_unlock(&file1->mut);
+ mutex_unlock(&file2->mut);
+ }
+}
+
+static void ucma_move_events(struct ucma_context *ctx, struct ucma_file *file)
+{
+ struct ucma_event *uevent, *tmp;
+
+ list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list)
+ if (uevent->ctx == ctx)
+ list_move_tail(&uevent->list, &file->event_list);
+}
+
+static ssize_t ucma_migrate_id(struct ucma_file *new_file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_migrate_id cmd;
+ struct rdma_ucm_migrate_resp resp;
+ struct ucma_context *ctx;
+ struct file *filp;
+ struct ucma_file *cur_file;
+ int ret = 0;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ /* Get current fd to protect against it being closed */
+ filp = fget(cmd.fd);
+ if (!filp)
+ return -ENOENT;
+
+ /* Validate current fd and prevent destruction of id. */
+ ctx = ucma_get_ctx(filp->private_data, cmd.id);
+ if (IS_ERR(ctx)) {
+ ret = PTR_ERR(ctx);
+ goto file_put;
+ }
+
+ cur_file = ctx->file;
+ if (cur_file == new_file) {
+ resp.events_reported = ctx->events_reported;
+ goto response;
+ }
+
+ /*
+ * Migrate events between fd's, maintaining order, and avoiding new
+ * events being added before existing events.
+ */
+ ucma_lock_files(cur_file, new_file);
+ mutex_lock(&mut);
+
+ list_move_tail(&ctx->list, &new_file->ctx_list);
+ ucma_move_events(ctx, new_file);
+ ctx->file = new_file;
+ resp.events_reported = ctx->events_reported;
+
+ mutex_unlock(&mut);
+ ucma_unlock_files(cur_file, new_file);
+
+response:
+ if (copy_to_user((void __user *)(unsigned long)cmd.response,
+ &resp, sizeof(resp)))
+ ret = -EFAULT;
+
+ ucma_put_ctx(ctx);
+file_put:
+ fput(filp);
+ return ret;
+}
+
+static ssize_t (*ucma_cmd_table[])(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len) = {
+ [RDMA_USER_CM_CMD_CREATE_ID] = ucma_create_id,
+ [RDMA_USER_CM_CMD_DESTROY_ID] = ucma_destroy_id,
+ [RDMA_USER_CM_CMD_BIND_ADDR] = ucma_bind_addr,
+ [RDMA_USER_CM_CMD_RESOLVE_ADDR] = ucma_resolve_addr,
+ [RDMA_USER_CM_CMD_RESOLVE_ROUTE]= ucma_resolve_route,
+ [RDMA_USER_CM_CMD_QUERY_ROUTE] = ucma_query_route,
+ [RDMA_USER_CM_CMD_CONNECT] = ucma_connect,
+ [RDMA_USER_CM_CMD_LISTEN] = ucma_listen,
+ [RDMA_USER_CM_CMD_ACCEPT] = ucma_accept,
+ [RDMA_USER_CM_CMD_REJECT] = ucma_reject,
+ [RDMA_USER_CM_CMD_DISCONNECT] = ucma_disconnect,
+ [RDMA_USER_CM_CMD_INIT_QP_ATTR] = ucma_init_qp_attr,
+ [RDMA_USER_CM_CMD_GET_EVENT] = ucma_get_event,
+ [RDMA_USER_CM_CMD_GET_OPTION] = NULL,
+ [RDMA_USER_CM_CMD_SET_OPTION] = ucma_set_option,
+ [RDMA_USER_CM_CMD_NOTIFY] = ucma_notify,
+ [RDMA_USER_CM_CMD_JOIN_MCAST] = ucma_join_multicast,
+ [RDMA_USER_CM_CMD_LEAVE_MCAST] = ucma_leave_multicast,
+ [RDMA_USER_CM_CMD_MIGRATE_ID] = ucma_migrate_id
+};
+
+static ssize_t ucma_write(struct file *filp, const char __user *buf,
+ size_t len, loff_t *pos)
+{
+ struct ucma_file *file = filp->private_data;
+ struct rdma_ucm_cmd_hdr hdr;
+ ssize_t ret;
+
+ if (len < sizeof(hdr))
+ return -EINVAL;
+
+ if (copy_from_user(&hdr, buf, sizeof(hdr)))
+ return -EFAULT;
+
+ if (hdr.cmd < 0 || hdr.cmd >= ARRAY_SIZE(ucma_cmd_table))
+ return -EINVAL;
+
+ if (hdr.in + sizeof(hdr) > len)
+ return -EINVAL;
+
+ if (!ucma_cmd_table[hdr.cmd])
+ return -ENOSYS;
+
+ ret = ucma_cmd_table[hdr.cmd](file, buf + sizeof(hdr), hdr.in, hdr.out);
+ if (!ret)
+ ret = len;
+
+ return ret;
+}
+
+static unsigned int ucma_poll(struct file *filp, struct poll_table_struct *wait)
+{
+ struct ucma_file *file = filp->private_data;
+ unsigned int mask = 0;
+
+ poll_wait(filp, &file->poll_wait, wait);
+
+ if (!list_empty(&file->event_list))
+ mask = POLLIN | POLLRDNORM;
+
+ return mask;
+}
+
+/*
+ * ucma_open() does not need the BKL:
+ *
+ * - no global state is referred to;
+ * - there is no ioctl method to race against;
+ * - no further module initialization is required for open to work
+ * after the device is registered.
+ */
+static int ucma_open(struct inode *inode, struct file *filp)
+{
+ struct ucma_file *file;
+
+ file = kmalloc(sizeof *file, GFP_KERNEL);
+ if (!file)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&file->event_list);
+ INIT_LIST_HEAD(&file->ctx_list);
+ init_waitqueue_head(&file->poll_wait);
+ mutex_init(&file->mut);
+
+ filp->private_data = file;
+ file->filp = filp;
+ return 0;
+}
+
+static int ucma_close(struct inode *inode, struct file *filp)
+{
+ struct ucma_file *file = filp->private_data;
+ struct ucma_context *ctx, *tmp;
+
+ mutex_lock(&file->mut);
+ list_for_each_entry_safe(ctx, tmp, &file->ctx_list, list) {
+ mutex_unlock(&file->mut);
+
+ mutex_lock(&mut);
+ idr_remove(&ctx_idr, ctx->id);
+ mutex_unlock(&mut);
+
+ ucma_free_ctx(ctx);
+ mutex_lock(&file->mut);
+ }
+ mutex_unlock(&file->mut);
+ kfree(file);
+ return 0;
+}
+
+static const struct file_operations ucma_fops = {
+ .owner = THIS_MODULE,
+ .open = ucma_open,
+ .release = ucma_close,
+ .write = ucma_write,
+ .poll = ucma_poll,
+};
+
+static struct miscdevice ucma_misc = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "rdma_cm",
+ .fops = &ucma_fops,
+};
+
+static ssize_t show_abi_version(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%d\n", RDMA_USER_CM_ABI_VERSION);
+}
+static DEVICE_ATTR(abi_version, S_IRUGO, show_abi_version, NULL);
+
+static int __init ucma_init(void)
+{
+ int ret;
+
+ ret = misc_register(&ucma_misc);
+ if (ret)
+ return ret;
+
+ ret = device_create_file(ucma_misc.this_device, &dev_attr_abi_version);
+ if (ret) {
+ printk(KERN_ERR "rdma_ucm: couldn't create abi_version attr\n");
+ goto err;
+ }
+ return 0;
+err:
+ misc_deregister(&ucma_misc);
+ return ret;
+}
+
+static void __exit ucma_cleanup(void)
+{
+ device_remove_file(ucma_misc.this_device, &dev_attr_abi_version);
+ misc_deregister(&ucma_misc);
+ idr_destroy(&ctx_idr);
+}
+
+module_init(ucma_init);
+module_exit(ucma_cleanup);
diff --git a/sys/ofed/drivers/infiniband/core/ud_header.c b/sys/ofed/drivers/infiniband/core/ud_header.c
new file mode 100644
index 0000000..e095a12
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/ud_header.c
@@ -0,0 +1,453 @@
+/*
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/if_ether.h>
+
+#include <rdma/ib_pack.h>
+
+#define STRUCT_FIELD(header, field) \
+ .struct_offset_bytes = offsetof(struct ib_unpacked_ ## header, field), \
+ .struct_size_bytes = sizeof ((struct ib_unpacked_ ## header *) 0)->field, \
+ .field_name = #header ":" #field
+
+static const struct ib_field lrh_table[] = {
+ { STRUCT_FIELD(lrh, virtual_lane),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 4 },
+ { STRUCT_FIELD(lrh, link_version),
+ .offset_words = 0,
+ .offset_bits = 4,
+ .size_bits = 4 },
+ { STRUCT_FIELD(lrh, service_level),
+ .offset_words = 0,
+ .offset_bits = 8,
+ .size_bits = 4 },
+ { RESERVED,
+ .offset_words = 0,
+ .offset_bits = 12,
+ .size_bits = 2 },
+ { STRUCT_FIELD(lrh, link_next_header),
+ .offset_words = 0,
+ .offset_bits = 14,
+ .size_bits = 2 },
+ { STRUCT_FIELD(lrh, destination_lid),
+ .offset_words = 0,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { RESERVED,
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 5 },
+ { STRUCT_FIELD(lrh, packet_length),
+ .offset_words = 1,
+ .offset_bits = 5,
+ .size_bits = 11 },
+ { STRUCT_FIELD(lrh, source_lid),
+ .offset_words = 1,
+ .offset_bits = 16,
+ .size_bits = 16 }
+};
+
+static const struct ib_field eth_table[] = {
+ { STRUCT_FIELD(eth, dmac_h),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { STRUCT_FIELD(eth, dmac_l),
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { STRUCT_FIELD(eth, smac_h),
+ .offset_words = 1,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { STRUCT_FIELD(eth, smac_l),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { STRUCT_FIELD(eth, type),
+ .offset_words = 3,
+ .offset_bits = 0,
+ .size_bits = 16 }
+};
+
+static const struct ib_field vlan_table[] = {
+ { STRUCT_FIELD(vlan, tag),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { STRUCT_FIELD(vlan, type),
+ .offset_words = 0,
+ .offset_bits = 16,
+ .size_bits = 16 }
+};
+
+static const struct ib_field grh_table[] = {
+ { STRUCT_FIELD(grh, ip_version),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 4 },
+ { STRUCT_FIELD(grh, traffic_class),
+ .offset_words = 0,
+ .offset_bits = 4,
+ .size_bits = 8 },
+ { STRUCT_FIELD(grh, flow_label),
+ .offset_words = 0,
+ .offset_bits = 12,
+ .size_bits = 20 },
+ { STRUCT_FIELD(grh, payload_length),
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { STRUCT_FIELD(grh, next_header),
+ .offset_words = 1,
+ .offset_bits = 16,
+ .size_bits = 8 },
+ { STRUCT_FIELD(grh, hop_limit),
+ .offset_words = 1,
+ .offset_bits = 24,
+ .size_bits = 8 },
+ { STRUCT_FIELD(grh, source_gid),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { STRUCT_FIELD(grh, destination_gid),
+ .offset_words = 6,
+ .offset_bits = 0,
+ .size_bits = 128 }
+};
+
+static const struct ib_field bth_table[] = {
+ { STRUCT_FIELD(bth, opcode),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 8 },
+ { STRUCT_FIELD(bth, solicited_event),
+ .offset_words = 0,
+ .offset_bits = 8,
+ .size_bits = 1 },
+ { STRUCT_FIELD(bth, mig_req),
+ .offset_words = 0,
+ .offset_bits = 9,
+ .size_bits = 1 },
+ { STRUCT_FIELD(bth, pad_count),
+ .offset_words = 0,
+ .offset_bits = 10,
+ .size_bits = 2 },
+ { STRUCT_FIELD(bth, transport_header_version),
+ .offset_words = 0,
+ .offset_bits = 12,
+ .size_bits = 4 },
+ { STRUCT_FIELD(bth, pkey),
+ .offset_words = 0,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { RESERVED,
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 8 },
+ { STRUCT_FIELD(bth, destination_qpn),
+ .offset_words = 1,
+ .offset_bits = 8,
+ .size_bits = 24 },
+ { STRUCT_FIELD(bth, ack_req),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 1 },
+ { RESERVED,
+ .offset_words = 2,
+ .offset_bits = 1,
+ .size_bits = 7 },
+ { STRUCT_FIELD(bth, psn),
+ .offset_words = 2,
+ .offset_bits = 8,
+ .size_bits = 24 }
+};
+
+static const struct ib_field deth_table[] = {
+ { STRUCT_FIELD(deth, qkey),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { RESERVED,
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 8 },
+ { STRUCT_FIELD(deth, source_qpn),
+ .offset_words = 1,
+ .offset_bits = 8,
+ .size_bits = 24 }
+};
+
+/**
+ * ib_ud_header_init - Initialize UD header structure
+ * @payload_bytes:Length of packet payload
+ * @lrh_present: specify if LRH is present
+ * @eth_present: specify if Eth header is present
+ * @vlan_present: packet is tagged vlan
+ * @grh_present:GRH flag (if non-zero, GRH will be included)
+ * @immediate_present: specify if immediate data is present
+ * @header:Structure to initialize
+ */
+void ib_ud_header_init(int payload_bytes,
+ int lrh_present,
+ int eth_present,
+ int vlan_present,
+ int grh_present,
+ int immediate_present,
+ struct ib_ud_header *header)
+{
+ u16 packet_length;
+
+ memset(header, 0, sizeof *header);
+
+ if (lrh_present) {
+ header->lrh.link_version = 0;
+ header->lrh.link_next_header =
+ grh_present ? IB_LNH_IBA_GLOBAL : IB_LNH_IBA_LOCAL;
+ packet_length = IB_LRH_BYTES;
+ }
+
+ if (eth_present) {
+ if (vlan_present) {
+ header->eth.type = cpu_to_be16(ETH_P_8021Q);
+ packet_length += IB_VLAN_BYTES;
+
+ }
+ packet_length += IB_ETH_BYTES;
+ }
+
+ packet_length += IB_BTH_BYTES + IB_DETH_BYTES + payload_bytes +
+ 4 + /* ICRC */
+ 3; /* round up */
+ packet_length /= 4;
+ if (grh_present) {
+ packet_length += IB_GRH_BYTES / 4;
+ header->grh.ip_version = 6;
+ header->grh.payload_length =
+ cpu_to_be16((IB_BTH_BYTES +
+ IB_DETH_BYTES +
+ payload_bytes +
+ 4 + /* ICRC */
+ 3) & ~3); /* round up */
+ header->grh.next_header = 0x1b;
+ }
+
+ if (lrh_present)
+ header->lrh.packet_length = cpu_to_be16(packet_length);
+
+ if (immediate_present)
+ header->bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
+ else
+ header->bth.opcode = IB_OPCODE_UD_SEND_ONLY;
+ header->bth.pad_count = (4 - payload_bytes) & 3;
+ header->bth.transport_header_version = 0;
+
+ header->lrh_present = lrh_present;
+ header->eth_present = eth_present;
+ header->vlan_present = vlan_present;
+ header->grh_present = grh_present;
+ header->immediate_present = immediate_present;
+}
+EXPORT_SYMBOL(ib_ud_header_init);
+
+/**
+ * ib_lrh_header_pack - Pack LRH header struct into wire format
+ * @lrh:unpacked LRH header struct
+ * @buf:Buffer to pack into
+ *
+ * ib_lrh_header_pack() packs the LRH header structure @lrh into
+ * wire format in the buffer @buf.
+ */
+int ib_lrh_header_pack(struct ib_unpacked_lrh *lrh, void *buf)
+{
+ ib_pack(lrh_table, ARRAY_SIZE(lrh_table), lrh, buf);
+ return 0;
+}
+EXPORT_SYMBOL(ib_lrh_header_pack);
+
+/**
+ * ib_lrh_header_unpack - Unpack LRH structure from wire format
+ * @lrh:unpacked LRH header struct
+ * @buf:Buffer to pack into
+ *
+ * ib_lrh_header_unpack() unpacks the LRH header structure from
+ * wire format (in buf) into @lrh.
+ */
+int ib_lrh_header_unpack(void *buf, struct ib_unpacked_lrh *lrh)
+{
+ ib_unpack(lrh_table, ARRAY_SIZE(lrh_table), buf, lrh);
+ return 0;
+}
+EXPORT_SYMBOL(ib_lrh_header_unpack);
+
+/**
+ * ib_ud_header_pack - Pack UD header struct into wire format
+ * @header:UD header struct
+ * @buf:Buffer to pack into
+ *
+ * ib_ud_header_pack() packs the UD header structure @header into wire
+ * format in the buffer @buf.
+ */
+int ib_ud_header_pack(struct ib_ud_header *header,
+ void *buf)
+{
+ int len = 0;
+
+ if (header->lrh_present) {
+ ib_pack(lrh_table, ARRAY_SIZE(lrh_table),
+ &header->lrh, buf + len);
+ len += IB_LRH_BYTES;
+ }
+ if (header->eth_present) {
+ ib_pack(eth_table, ARRAY_SIZE(eth_table),
+ &header->eth, buf + len);
+ len += IB_ETH_BYTES;
+ }
+
+
+ if (header->vlan_present) {
+ ib_pack(vlan_table, ARRAY_SIZE(vlan_table),
+ &header->vlan, buf + len);
+ len += IB_VLAN_BYTES;
+ }
+
+ if (header->grh_present) {
+ ib_pack(grh_table, ARRAY_SIZE(grh_table),
+ &header->grh, buf + len);
+ len += IB_GRH_BYTES;
+ }
+
+ ib_pack(bth_table, ARRAY_SIZE(bth_table),
+ &header->bth, buf + len);
+ len += IB_BTH_BYTES;
+
+ ib_pack(deth_table, ARRAY_SIZE(deth_table),
+ &header->deth, buf + len);
+ len += IB_DETH_BYTES;
+
+ if (header->immediate_present) {
+ memcpy(buf + len, &header->immediate_data, sizeof header->immediate_data);
+ len += sizeof header->immediate_data;
+ }
+
+ return len;
+}
+EXPORT_SYMBOL(ib_ud_header_pack);
+
+/**
+ * ib_ud_header_unpack - Unpack UD header struct from wire format
+ * @header:UD header struct
+ * @buf:Buffer to pack into
+ *
+ * ib_ud_header_pack() unpacks the UD header structure @header from wire
+ * format in the buffer @buf.
+ */
+int ib_ud_header_unpack(void *buf,
+ struct ib_ud_header *header)
+{
+ ib_unpack(lrh_table, ARRAY_SIZE(lrh_table),
+ buf, &header->lrh);
+ buf += IB_LRH_BYTES;
+
+ if (header->lrh.link_version != 0) {
+ printk(KERN_WARNING "Invalid LRH.link_version %d\n",
+ header->lrh.link_version);
+ return -EINVAL;
+ }
+
+ switch (header->lrh.link_next_header) {
+ case IB_LNH_IBA_LOCAL:
+ header->grh_present = 0;
+ break;
+
+ case IB_LNH_IBA_GLOBAL:
+ header->grh_present = 1;
+ ib_unpack(grh_table, ARRAY_SIZE(grh_table),
+ buf, &header->grh);
+ buf += IB_GRH_BYTES;
+
+ if (header->grh.ip_version != 6) {
+ printk(KERN_WARNING "Invalid GRH.ip_version %d\n",
+ header->grh.ip_version);
+ return -EINVAL;
+ }
+ if (header->grh.next_header != 0x1b) {
+ printk(KERN_WARNING "Invalid GRH.next_header 0x%02x\n",
+ header->grh.next_header);
+ return -EINVAL;
+ }
+ break;
+
+ default:
+ printk(KERN_WARNING "Invalid LRH.link_next_header %d\n",
+ header->lrh.link_next_header);
+ return -EINVAL;
+ }
+
+ ib_unpack(bth_table, ARRAY_SIZE(bth_table),
+ buf, &header->bth);
+ buf += IB_BTH_BYTES;
+
+ switch (header->bth.opcode) {
+ case IB_OPCODE_UD_SEND_ONLY:
+ header->immediate_present = 0;
+ break;
+ case IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE:
+ header->immediate_present = 1;
+ break;
+ default:
+ printk(KERN_WARNING "Invalid BTH.opcode 0x%02x\n",
+ header->bth.opcode);
+ return -EINVAL;
+ }
+
+ if (header->bth.transport_header_version != 0) {
+ printk(KERN_WARNING "Invalid BTH.transport_header_version %d\n",
+ header->bth.transport_header_version);
+ return -EINVAL;
+ }
+
+ ib_unpack(deth_table, ARRAY_SIZE(deth_table),
+ buf, &header->deth);
+ buf += IB_DETH_BYTES;
+
+ if (header->immediate_present)
+ memcpy(&header->immediate_data, buf, sizeof header->immediate_data);
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_ud_header_unpack);
diff --git a/sys/ofed/drivers/infiniband/core/umem.c b/sys/ofed/drivers/infiniband/core/umem.c
new file mode 100644
index 0000000..0c6fed2
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/umem.c
@@ -0,0 +1,532 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+#include <linux/sched.h>
+#ifdef __linux__
+#include <linux/hugetlb.h>
+#endif
+#include <linux/dma-attrs.h>
+
+#include <sys/priv.h>
+#include <sys/resource.h>
+#include <sys/resourcevar.h>
+
+#include <vm/vm.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_pageout.h>
+
+#include "uverbs.h"
+
+static int allow_weak_ordering;
+module_param(allow_weak_ordering, bool, 0444);
+MODULE_PARM_DESC(allow_weak_ordering, "Allow weak ordering for data registered memory");
+
+#define IB_UMEM_MAX_PAGE_CHUNK \
+ ((PAGE_SIZE - offsetof(struct ib_umem_chunk, page_list)) / \
+ ((void *) &((struct ib_umem_chunk *) 0)->page_list[1] - \
+ (void *) &((struct ib_umem_chunk *) 0)->page_list[0]))
+
+#ifdef __ia64__
+extern int dma_map_sg_hp_wa;
+
+static int dma_map_sg_ia64(struct ib_device *ibdev,
+ struct scatterlist *sg,
+ int nents,
+ enum dma_data_direction dir)
+{
+ int i, rc, j, lents = 0;
+ struct device *dev;
+
+ if (!dma_map_sg_hp_wa)
+ return ib_dma_map_sg(ibdev, sg, nents, dir);
+
+ dev = ibdev->dma_device;
+ for (i = 0; i < nents; ++i) {
+ rc = dma_map_sg(dev, sg + i, 1, dir);
+ if (rc <= 0) {
+ for (j = 0; j < i; ++j)
+ dma_unmap_sg(dev, sg + j, 1, dir);
+
+ return 0;
+ }
+ lents += rc;
+ }
+
+ return lents;
+}
+
+static void dma_unmap_sg_ia64(struct ib_device *ibdev,
+ struct scatterlist *sg,
+ int nents,
+ enum dma_data_direction dir)
+{
+ int i;
+ struct device *dev;
+
+ if (!dma_map_sg_hp_wa)
+ return ib_dma_unmap_sg(ibdev, sg, nents, dir);
+
+ dev = ibdev->dma_device;
+ for (i = 0; i < nents; ++i)
+ dma_unmap_sg(dev, sg + i, 1, dir);
+}
+
+#define ib_dma_map_sg(dev, sg, nents, dir) dma_map_sg_ia64(dev, sg, nents, dir)
+#define ib_dma_unmap_sg(dev, sg, nents, dir) dma_unmap_sg_ia64(dev, sg, nents, dir)
+
+#endif
+
+static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
+{
+#ifdef __linux__
+ struct ib_umem_chunk *chunk, *tmp;
+ int i;
+
+ list_for_each_entry_safe(chunk, tmp, &umem->chunk_list, list) {
+ ib_dma_unmap_sg_attrs(dev, chunk->page_list,
+ chunk->nents, DMA_BIDIRECTIONAL, &chunk->attrs);
+ for (i = 0; i < chunk->nents; ++i) {
+ struct page *page = sg_page(&chunk->page_list[i]);
+ if (umem->writable && dirty)
+ set_page_dirty_lock(page);
+ put_page(page);
+ }
+ kfree(chunk);
+ }
+#else
+ struct ib_umem_chunk *chunk, *tmp;
+ vm_object_t object;
+ int i;
+
+ object = NULL;
+ list_for_each_entry_safe(chunk, tmp, &umem->chunk_list, list) {
+ ib_dma_unmap_sg_attrs(dev, chunk->page_list,
+ chunk->nents, DMA_BIDIRECTIONAL, &chunk->attrs);
+ for (i = 0; i < chunk->nents; ++i) {
+ struct page *page = sg_page(&chunk->page_list[i]);
+ if (umem->writable && dirty) {
+ if (object && object != page->object)
+ VM_OBJECT_UNLOCK(object);
+ if (object != page->object) {
+ object = page->object;
+ VM_OBJECT_LOCK(object);
+ }
+ vm_page_dirty(page);
+ }
+ }
+ kfree(chunk);
+ }
+ if (object)
+ VM_OBJECT_UNLOCK(object);
+
+#endif
+}
+
+/**
+ * ib_umem_get - Pin and DMA map userspace memory.
+ * @context: userspace context to pin memory for
+ * @addr: userspace virtual address to start at
+ * @size: length of region to pin
+ * @access: IB_ACCESS_xxx flags for memory being pinned
+ * @dmasync: flush in-flight DMA when the memory region is written
+ */
+struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
+ size_t size, int access, int dmasync)
+{
+#ifdef __linux__
+ struct ib_umem *umem;
+ struct page **page_list;
+ struct vm_area_struct **vma_list;
+ struct ib_umem_chunk *chunk;
+ unsigned long locked;
+ unsigned long lock_limit;
+ unsigned long cur_base;
+ unsigned long npages;
+ int ret;
+ int off;
+ int i;
+ DEFINE_DMA_ATTRS(attrs);
+
+ if (dmasync)
+ dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs);
+ else if (allow_weak_ordering)
+ dma_set_attr(DMA_ATTR_WEAK_ORDERING, &attrs);
+
+ if (!can_do_mlock())
+ return ERR_PTR(-EPERM);
+
+ umem = kmalloc(sizeof *umem, GFP_KERNEL);
+ if (!umem)
+ return ERR_PTR(-ENOMEM);
+
+ umem->context = context;
+ umem->length = size;
+ umem->offset = addr & ~PAGE_MASK;
+ umem->page_size = PAGE_SIZE;
+ /*
+ * We ask for writable memory if any access flags other than
+ * "remote read" are set. "Local write" and "remote write"
+ * obviously require write access. "Remote atomic" can do
+ * things like fetch and add, which will modify memory, and
+ * "MW bind" can change permissions by binding a window.
+ */
+ umem->writable = !!(access & ~IB_ACCESS_REMOTE_READ);
+
+ /* We assume the memory is from hugetlb until proved otherwise */
+ umem->hugetlb = 1;
+
+ INIT_LIST_HEAD(&umem->chunk_list);
+
+ page_list = (struct page **) __get_free_page(GFP_KERNEL);
+ if (!page_list) {
+ kfree(umem);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /*
+ * if we can't alloc the vma_list, it's not so bad;
+ * just assume the memory is not hugetlb memory
+ */
+ vma_list = (struct vm_area_struct **) __get_free_page(GFP_KERNEL);
+ if (!vma_list)
+ umem->hugetlb = 0;
+
+ npages = PAGE_ALIGN(size + umem->offset) >> PAGE_SHIFT;
+
+ down_write(&current->mm->mmap_sem);
+
+ locked = npages + current->mm->locked_vm;
+ lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
+
+ if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ cur_base = addr & PAGE_MASK;
+
+ ret = 0;
+
+ while (npages) {
+ ret = get_user_pages(current, current->mm, cur_base,
+ min_t(unsigned long, npages,
+ PAGE_SIZE / sizeof (struct page *)),
+ 1, !umem->writable, page_list, vma_list);
+
+ if (ret < 0)
+ goto out;
+
+ cur_base += ret * PAGE_SIZE;
+ npages -= ret;
+
+ off = 0;
+
+ while (ret) {
+ chunk = kmalloc(sizeof *chunk + sizeof (struct scatterlist) *
+ min_t(int, ret, IB_UMEM_MAX_PAGE_CHUNK),
+ GFP_KERNEL);
+ if (!chunk) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ chunk->attrs = attrs;
+ chunk->nents = min_t(int, ret, IB_UMEM_MAX_PAGE_CHUNK);
+ sg_init_table(chunk->page_list, chunk->nents);
+ for (i = 0; i < chunk->nents; ++i) {
+ if (vma_list &&
+ !is_vm_hugetlb_page(vma_list[i + off]))
+ umem->hugetlb = 0;
+ sg_set_page(&chunk->page_list[i], page_list[i + off], PAGE_SIZE, 0);
+ }
+
+ chunk->nmap = ib_dma_map_sg_attrs(context->device,
+ &chunk->page_list[0],
+ chunk->nents,
+ DMA_BIDIRECTIONAL,
+ &attrs);
+ if (chunk->nmap <= 0) {
+ for (i = 0; i < chunk->nents; ++i)
+ put_page(sg_page(&chunk->page_list[i]));
+ kfree(chunk);
+
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret -= chunk->nents;
+ off += chunk->nents;
+ list_add_tail(&chunk->list, &umem->chunk_list);
+ }
+
+ ret = 0;
+ }
+
+out:
+ if (ret < 0) {
+ __ib_umem_release(context->device, umem, 0);
+ kfree(umem);
+ } else
+ current->mm->locked_vm = locked;
+
+ up_write(&current->mm->mmap_sem);
+ if (vma_list)
+ free_page((unsigned long) vma_list);
+ free_page((unsigned long) page_list);
+
+ return ret < 0 ? ERR_PTR(ret) : umem;
+#else
+ struct ib_umem *umem;
+ struct ib_umem_chunk *chunk;
+ struct proc *proc;
+ pmap_t pmap;
+ vm_offset_t end, last, start;
+ vm_size_t npages;
+ int error;
+ int ents;
+ int ret;
+ int i;
+ DEFINE_DMA_ATTRS(attrs);
+
+ error = priv_check(curthread, PRIV_VM_MLOCK);
+ if (error)
+ return ERR_PTR(-error);
+
+ last = addr + size;
+ start = addr & PAGE_MASK; /* Use the linux PAGE_MASK definition. */
+ end = roundup2(last, PAGE_SIZE); /* Use PAGE_MASK safe operation. */
+ if (last < addr || end < addr)
+ return ERR_PTR(-EINVAL);
+ npages = atop(end - start);
+ if (npages > vm_page_max_wired)
+ return ERR_PTR(-ENOMEM);
+ umem = kzalloc(sizeof *umem, GFP_KERNEL);
+ if (!umem)
+ return ERR_PTR(-ENOMEM);
+ proc = curthread->td_proc;
+ PROC_LOCK(proc);
+ if (ptoa(npages +
+ pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map))) >
+ lim_cur(proc, RLIMIT_MEMLOCK)) {
+ PROC_UNLOCK(proc);
+ kfree(umem);
+ return ERR_PTR(-ENOMEM);
+ }
+ PROC_UNLOCK(proc);
+ if (npages + cnt.v_wire_count > vm_page_max_wired) {
+ kfree(umem);
+ return ERR_PTR(-EAGAIN);
+ }
+ error = vm_map_wire(&proc->p_vmspace->vm_map, start, end,
+ VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES |
+ (umem->writable ? VM_MAP_WIRE_WRITE : 0));
+ if (error != KERN_SUCCESS) {
+ kfree(umem);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ umem->context = context;
+ umem->length = size;
+ umem->offset = addr & ~PAGE_MASK;
+ umem->page_size = PAGE_SIZE;
+ umem->start = addr;
+ /*
+ * We ask for writable memory if any access flags other than
+ * "remote read" are set. "Local write" and "remote write"
+ * obviously require write access. "Remote atomic" can do
+ * things like fetch and add, which will modify memory, and
+ * "MW bind" can change permissions by binding a window.
+ */
+ umem->writable = !!(access & ~IB_ACCESS_REMOTE_READ);
+ umem->hugetlb = 0;
+ INIT_LIST_HEAD(&umem->chunk_list);
+
+ pmap = vm_map_pmap(&proc->p_vmspace->vm_map);
+ ret = 0;
+ while (npages) {
+ ents = min_t(int, npages, IB_UMEM_MAX_PAGE_CHUNK);
+ chunk = kmalloc(sizeof(*chunk) +
+ (sizeof(struct scatterlist) * ents),
+ GFP_KERNEL);
+ if (!chunk) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ chunk->attrs = attrs;
+ chunk->nents = ents;
+ sg_init_table(&chunk->page_list[0], ents);
+ for (i = 0; i < chunk->nents; ++i) {
+ vm_paddr_t pa;
+
+ pa = pmap_extract(pmap, start);
+ if (pa == 0) {
+ ret = -ENOMEM;
+ kfree(chunk);
+ goto out;
+ }
+ sg_set_page(&chunk->page_list[i], PHYS_TO_VM_PAGE(pa),
+ PAGE_SIZE, 0);
+ npages--;
+ start += PAGE_SIZE;
+ }
+
+ chunk->nmap = ib_dma_map_sg_attrs(context->device,
+ &chunk->page_list[0],
+ chunk->nents,
+ DMA_BIDIRECTIONAL,
+ &attrs);
+ if (chunk->nmap != chunk->nents) {
+ kfree(chunk);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ list_add_tail(&chunk->list, &umem->chunk_list);
+ }
+
+out:
+ if (ret < 0) {
+ __ib_umem_release(context->device, umem, 0);
+ kfree(umem);
+ }
+
+ return ret < 0 ? ERR_PTR(ret) : umem;
+#endif
+}
+EXPORT_SYMBOL(ib_umem_get);
+
+#ifdef __linux__
+static void ib_umem_account(struct work_struct *work)
+{
+ struct ib_umem *umem = container_of(work, struct ib_umem, work);
+
+ down_write(&umem->mm->mmap_sem);
+ umem->mm->locked_vm -= umem->diff;
+ up_write(&umem->mm->mmap_sem);
+ mmput(umem->mm);
+ kfree(umem);
+}
+#endif
+
+/**
+ * ib_umem_release - release memory pinned with ib_umem_get
+ * @umem: umem struct to release
+ */
+void ib_umem_release(struct ib_umem *umem)
+{
+#ifdef __linux__
+ struct ib_ucontext *context = umem->context;
+ struct mm_struct *mm;
+ unsigned long diff;
+
+ __ib_umem_release(umem->context->device, umem, 1);
+
+ mm = get_task_mm(current);
+ if (!mm) {
+ kfree(umem);
+ return;
+ }
+
+ diff = PAGE_ALIGN(umem->length + umem->offset) >> PAGE_SHIFT;
+
+ /*
+ * We may be called with the mm's mmap_sem already held. This
+ * can happen when a userspace munmap() is the call that drops
+ * the last reference to our file and calls our release
+ * method. If there are memory regions to destroy, we'll end
+ * up here and not be able to take the mmap_sem. In that case
+ * we defer the vm_locked accounting to the system workqueue.
+ */
+ if (context->closing) {
+ if (!down_write_trylock(&mm->mmap_sem)) {
+ INIT_WORK(&umem->work, ib_umem_account);
+ umem->mm = mm;
+ umem->diff = diff;
+
+ schedule_work(&umem->work);
+ return;
+ }
+ } else
+ down_write(&mm->mmap_sem);
+
+ current->mm->locked_vm -= diff;
+ up_write(&mm->mmap_sem);
+ mmput(mm);
+#else
+ vm_offset_t addr, end, last, start;
+ vm_size_t size;
+ int error;
+
+ __ib_umem_release(umem->context->device, umem, 1);
+ if (umem->context->closing) {
+ kfree(umem);
+ return;
+ }
+ error = priv_check(curthread, PRIV_VM_MUNLOCK);
+ if (error)
+ return;
+ addr = umem->start;
+ size = umem->length;
+ last = addr + size;
+ start = addr & PAGE_MASK; /* Use the linux PAGE_MASK definition. */
+ end = roundup2(last, PAGE_SIZE); /* Use PAGE_MASK safe operation. */
+ vm_map_unwire(&curthread->td_proc->p_vmspace->vm_map, start, end,
+ VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
+
+#endif
+ kfree(umem);
+}
+EXPORT_SYMBOL(ib_umem_release);
+
+int ib_umem_page_count(struct ib_umem *umem)
+{
+ struct ib_umem_chunk *chunk;
+ int shift;
+ int i;
+ int n;
+
+ shift = ilog2(umem->page_size);
+
+ n = 0;
+ list_for_each_entry(chunk, &umem->chunk_list, list)
+ for (i = 0; i < chunk->nmap; ++i)
+ n += sg_dma_len(&chunk->page_list[i]) >> shift;
+
+ return n;
+}
+EXPORT_SYMBOL(ib_umem_page_count);
diff --git a/sys/ofed/drivers/infiniband/core/user_mad.c b/sys/ofed/drivers/infiniband/core/user_mad.c
new file mode 100644
index 0000000..3dae9ce
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/user_mad.c
@@ -0,0 +1,1225 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2008 Cisco. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/fs.h>
+#include <linux/cdev.h>
+#include <linux/dma-mapping.h>
+#include <linux/poll.h>
+#include <linux/mutex.h>
+#include <linux/kref.h>
+#include <linux/compat.h>
+#include <linux/semaphore.h>
+
+#include <asm/uaccess.h>
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_user_mad.h>
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("InfiniBand userspace MAD packet access");
+MODULE_LICENSE("Dual BSD/GPL");
+
+enum {
+ IB_UMAD_MAX_PORTS = 64,
+ IB_UMAD_MAX_AGENTS = 32,
+
+ IB_UMAD_MAJOR = 231,
+ IB_UMAD_MINOR_BASE = 0
+};
+
+/*
+ * Our lifetime rules for these structs are the following: each time a
+ * device special file is opened, we look up the corresponding struct
+ * ib_umad_port by minor in the umad_port[] table while holding the
+ * port_lock. If this lookup succeeds, we take a reference on the
+ * ib_umad_port's struct ib_umad_device while still holding the
+ * port_lock; if the lookup fails, we fail the open(). We drop these
+ * references in the corresponding close().
+ *
+ * In addition to references coming from open character devices, there
+ * is one more reference to each ib_umad_device representing the
+ * module's reference taken when allocating the ib_umad_device in
+ * ib_umad_add_one().
+ *
+ * When destroying an ib_umad_device, we clear all of its
+ * ib_umad_ports from umad_port[] while holding port_lock before
+ * dropping the module's reference to the ib_umad_device. This is
+ * always safe because any open() calls will either succeed and obtain
+ * a reference before we clear the umad_port[] entries, or fail after
+ * we clear the umad_port[] entries.
+ */
+
+struct ib_umad_port {
+ struct cdev *cdev;
+ struct device *dev;
+
+ struct cdev *sm_cdev;
+ struct device *sm_dev;
+ struct semaphore sm_sem;
+
+ struct mutex file_mutex;
+ struct list_head file_list;
+
+ struct ib_device *ib_dev;
+ struct ib_umad_device *umad_dev;
+ int dev_num;
+ u8 port_num;
+};
+
+struct ib_umad_device {
+ int start_port, end_port;
+ struct kref ref;
+ struct ib_umad_port port[0];
+};
+
+struct ib_umad_file {
+ struct mutex mutex;
+ struct ib_umad_port *port;
+ struct file *filp;
+ struct list_head recv_list;
+ struct list_head send_list;
+ struct list_head port_list;
+ spinlock_t send_lock;
+ wait_queue_head_t recv_wait;
+ struct ib_mad_agent *agent[IB_UMAD_MAX_AGENTS];
+ int agents_dead;
+ u8 use_pkey_index;
+ u8 already_used;
+};
+
+struct ib_umad_packet {
+ struct ib_mad_send_buf *msg;
+ struct ib_mad_recv_wc *recv_wc;
+ struct list_head list;
+ int length;
+ struct ib_user_mad mad;
+};
+
+static struct class *umad_class;
+
+static const dev_t base_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE);
+
+static DEFINE_SPINLOCK(port_lock);
+static struct ib_umad_port *umad_port[IB_UMAD_MAX_PORTS];
+static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS);
+
+static void ib_umad_add_one(struct ib_device *device);
+static void ib_umad_remove_one(struct ib_device *device);
+
+static void ib_umad_release_dev(struct kref *ref)
+{
+ struct ib_umad_device *dev =
+ container_of(ref, struct ib_umad_device, ref);
+
+ kfree(dev);
+}
+
+static int hdr_size(struct ib_umad_file *file)
+{
+ return file->use_pkey_index ? sizeof (struct ib_user_mad_hdr) :
+ sizeof (struct ib_user_mad_hdr_old);
+}
+
+/* caller must hold file->mutex */
+static struct ib_mad_agent *__get_agent(struct ib_umad_file *file, int id)
+{
+ return file->agents_dead ? NULL : file->agent[id];
+}
+
+static int queue_packet(struct ib_umad_file *file,
+ struct ib_mad_agent *agent,
+ struct ib_umad_packet *packet)
+{
+ int ret = 1;
+
+ mutex_lock(&file->mutex);
+
+ for (packet->mad.hdr.id = 0;
+ packet->mad.hdr.id < IB_UMAD_MAX_AGENTS;
+ packet->mad.hdr.id++)
+ if (agent == __get_agent(file, packet->mad.hdr.id)) {
+ list_add_tail(&packet->list, &file->recv_list);
+ selwakeup(&file->filp->f_selinfo);
+ wake_up_interruptible(&file->recv_wait);
+ ret = 0;
+ break;
+ }
+
+ mutex_unlock(&file->mutex);
+
+ return ret;
+}
+
+static void dequeue_send(struct ib_umad_file *file,
+ struct ib_umad_packet *packet)
+{
+ spin_lock_irq(&file->send_lock);
+ list_del(&packet->list);
+ spin_unlock_irq(&file->send_lock);
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+ struct ib_mad_send_wc *send_wc)
+{
+ struct ib_umad_file *file = agent->context;
+ struct ib_umad_packet *packet = send_wc->send_buf->context[0];
+
+ dequeue_send(file, packet);
+ ib_destroy_ah(packet->msg->ah);
+ ib_free_send_mad(packet->msg);
+
+ if (send_wc->status == IB_WC_RESP_TIMEOUT_ERR) {
+ packet->length = IB_MGMT_MAD_HDR;
+ packet->mad.hdr.status = ETIMEDOUT;
+ if (!queue_packet(file, agent, packet))
+ return;
+ }
+ kfree(packet);
+}
+
+static void recv_handler(struct ib_mad_agent *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_umad_file *file = agent->context;
+ struct ib_umad_packet *packet;
+
+ if (mad_recv_wc->wc->status != IB_WC_SUCCESS)
+ goto err1;
+
+ packet = kzalloc(sizeof *packet, GFP_KERNEL);
+ if (!packet)
+ goto err1;
+
+ packet->length = mad_recv_wc->mad_len;
+ packet->recv_wc = mad_recv_wc;
+
+ packet->mad.hdr.status = 0;
+ packet->mad.hdr.length = hdr_size(file) + mad_recv_wc->mad_len;
+ packet->mad.hdr.qpn = cpu_to_be32(mad_recv_wc->wc->src_qp);
+ packet->mad.hdr.lid = cpu_to_be16(mad_recv_wc->wc->slid);
+ packet->mad.hdr.sl = mad_recv_wc->wc->sl;
+ packet->mad.hdr.path_bits = mad_recv_wc->wc->dlid_path_bits;
+ packet->mad.hdr.pkey_index = mad_recv_wc->wc->pkey_index;
+ packet->mad.hdr.grh_present = !!(mad_recv_wc->wc->wc_flags & IB_WC_GRH);
+ if (packet->mad.hdr.grh_present) {
+ struct ib_ah_attr ah_attr;
+
+ ib_init_ah_from_wc(agent->device, agent->port_num,
+ mad_recv_wc->wc, mad_recv_wc->recv_buf.grh,
+ &ah_attr);
+
+ packet->mad.hdr.gid_index = ah_attr.grh.sgid_index;
+ packet->mad.hdr.hop_limit = ah_attr.grh.hop_limit;
+ packet->mad.hdr.traffic_class = ah_attr.grh.traffic_class;
+ memcpy(packet->mad.hdr.gid, &ah_attr.grh.dgid, 16);
+ packet->mad.hdr.flow_label = cpu_to_be32(ah_attr.grh.flow_label);
+ }
+
+ if (queue_packet(file, agent, packet))
+ goto err2;
+ return;
+
+err2:
+ kfree(packet);
+err1:
+ ib_free_recv_mad(mad_recv_wc);
+}
+
+static ssize_t copy_recv_mad(struct ib_umad_file *file, char __user *buf,
+ struct ib_umad_packet *packet, size_t count)
+{
+ struct ib_mad_recv_buf *recv_buf;
+ int left, seg_payload, offset, max_seg_payload;
+
+ /* We need enough room to copy the first (or only) MAD segment. */
+ recv_buf = &packet->recv_wc->recv_buf;
+ if ((packet->length <= sizeof (*recv_buf->mad) &&
+ count < hdr_size(file) + packet->length) ||
+ (packet->length > sizeof (*recv_buf->mad) &&
+ count < hdr_size(file) + sizeof (*recv_buf->mad)))
+ return -EINVAL;
+
+ if (copy_to_user(buf, &packet->mad, hdr_size(file)))
+ return -EFAULT;
+
+ buf += hdr_size(file);
+ seg_payload = min_t(int, packet->length, sizeof (*recv_buf->mad));
+ if (copy_to_user(buf, recv_buf->mad, seg_payload))
+ return -EFAULT;
+
+ if (seg_payload < packet->length) {
+ /*
+ * Multipacket RMPP MAD message. Copy remainder of message.
+ * Note that last segment may have a shorter payload.
+ */
+ if (count < hdr_size(file) + packet->length) {
+ /*
+ * The buffer is too small, return the first RMPP segment,
+ * which includes the RMPP message length.
+ */
+ return -ENOSPC;
+ }
+ offset = ib_get_mad_data_offset(recv_buf->mad->mad_hdr.mgmt_class);
+ max_seg_payload = sizeof (struct ib_mad) - offset;
+
+ for (left = packet->length - seg_payload, buf += seg_payload;
+ left; left -= seg_payload, buf += seg_payload) {
+ recv_buf = container_of(recv_buf->list.next,
+ struct ib_mad_recv_buf, list);
+ seg_payload = min(left, max_seg_payload);
+ if (copy_to_user(buf, ((void *) recv_buf->mad) + offset,
+ seg_payload))
+ return -EFAULT;
+ }
+ }
+ return hdr_size(file) + packet->length;
+}
+
+static ssize_t copy_send_mad(struct ib_umad_file *file, char __user *buf,
+ struct ib_umad_packet *packet, size_t count)
+{
+ ssize_t size = hdr_size(file) + packet->length;
+
+ if (count < size)
+ return -EINVAL;
+
+ if (copy_to_user(buf, &packet->mad, hdr_size(file)))
+ return -EFAULT;
+
+ buf += hdr_size(file);
+
+ if (copy_to_user(buf, packet->mad.data, packet->length))
+ return -EFAULT;
+
+ return size;
+}
+
+static ssize_t ib_umad_read(struct file *filp, char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct ib_umad_file *file = filp->private_data;
+ struct ib_umad_packet *packet;
+ ssize_t ret;
+
+ if (count < hdr_size(file))
+ return -EINVAL;
+
+ mutex_lock(&file->mutex);
+
+ while (list_empty(&file->recv_list)) {
+ mutex_unlock(&file->mutex);
+
+ if (filp->f_flags & O_NONBLOCK)
+ return -EAGAIN;
+
+ if (wait_event_interruptible(file->recv_wait,
+ !list_empty(&file->recv_list)))
+ return -ERESTARTSYS;
+
+ mutex_lock(&file->mutex);
+ }
+
+ packet = list_entry(file->recv_list.next, struct ib_umad_packet, list);
+ list_del(&packet->list);
+
+ mutex_unlock(&file->mutex);
+
+ if (packet->recv_wc)
+ ret = copy_recv_mad(file, buf, packet, count);
+ else
+ ret = copy_send_mad(file, buf, packet, count);
+
+ if (ret < 0) {
+ /* Requeue packet */
+ mutex_lock(&file->mutex);
+ list_add(&packet->list, &file->recv_list);
+ mutex_unlock(&file->mutex);
+ } else {
+ if (packet->recv_wc)
+ ib_free_recv_mad(packet->recv_wc);
+ kfree(packet);
+ }
+ return ret;
+}
+
+static int copy_rmpp_mad(struct ib_mad_send_buf *msg, const char __user *buf)
+{
+ int left, seg;
+
+ /* Copy class specific header */
+ if ((msg->hdr_len > IB_MGMT_RMPP_HDR) &&
+ copy_from_user(msg->mad + IB_MGMT_RMPP_HDR, buf + IB_MGMT_RMPP_HDR,
+ msg->hdr_len - IB_MGMT_RMPP_HDR))
+ return -EFAULT;
+
+ /* All headers are in place. Copy data segments. */
+ for (seg = 1, left = msg->data_len, buf += msg->hdr_len; left > 0;
+ seg++, left -= msg->seg_size, buf += msg->seg_size) {
+ if (copy_from_user(ib_get_rmpp_segment(msg, seg), buf,
+ min(left, msg->seg_size)))
+ return -EFAULT;
+ }
+ return 0;
+}
+
+static int same_destination(struct ib_user_mad_hdr *hdr1,
+ struct ib_user_mad_hdr *hdr2)
+{
+ if (!hdr1->grh_present && !hdr2->grh_present)
+ return (hdr1->lid == hdr2->lid);
+
+ if (hdr1->grh_present && hdr2->grh_present)
+ return !memcmp(hdr1->gid, hdr2->gid, 16);
+
+ return 0;
+}
+
+static int is_duplicate(struct ib_umad_file *file,
+ struct ib_umad_packet *packet)
+{
+ struct ib_umad_packet *sent_packet;
+ struct ib_mad_hdr *sent_hdr, *hdr;
+
+ hdr = (struct ib_mad_hdr *) packet->mad.data;
+ list_for_each_entry(sent_packet, &file->send_list, list) {
+ sent_hdr = (struct ib_mad_hdr *) sent_packet->mad.data;
+
+ if ((hdr->tid != sent_hdr->tid) ||
+ (hdr->mgmt_class != sent_hdr->mgmt_class))
+ continue;
+
+ /*
+ * No need to be overly clever here. If two new operations have
+ * the same TID, reject the second as a duplicate. This is more
+ * restrictive than required by the spec.
+ */
+ if (!ib_response_mad((struct ib_mad *) hdr)) {
+ if (!ib_response_mad((struct ib_mad *) sent_hdr))
+ return 1;
+ continue;
+ } else if (!ib_response_mad((struct ib_mad *) sent_hdr))
+ continue;
+
+ if (same_destination(&packet->mad.hdr, &sent_packet->mad.hdr))
+ return 1;
+ }
+
+ return 0;
+}
+
+static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct ib_umad_file *file = filp->private_data;
+ struct ib_umad_packet *packet;
+ struct ib_mad_agent *agent;
+ struct ib_ah_attr ah_attr;
+ struct ib_ah *ah;
+ struct ib_rmpp_mad *rmpp_mad;
+ __be64 *tid;
+ int ret, data_len, hdr_len, copy_offset, rmpp_active;
+
+ if (count < hdr_size(file) + IB_MGMT_RMPP_HDR)
+ return -EINVAL;
+
+ packet = kzalloc(sizeof *packet + IB_MGMT_RMPP_HDR, GFP_KERNEL);
+ if (!packet)
+ return -ENOMEM;
+
+ if (copy_from_user(&packet->mad, buf, hdr_size(file))) {
+ ret = -EFAULT;
+ goto err;
+ }
+
+ if (packet->mad.hdr.id < 0 ||
+ packet->mad.hdr.id >= IB_UMAD_MAX_AGENTS) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ buf += hdr_size(file);
+
+ if (copy_from_user(packet->mad.data, buf, IB_MGMT_RMPP_HDR)) {
+ ret = -EFAULT;
+ goto err;
+ }
+
+ mutex_lock(&file->mutex);
+
+ agent = __get_agent(file, packet->mad.hdr.id);
+ if (!agent) {
+ ret = -EINVAL;
+ goto err_up;
+ }
+
+ memset(&ah_attr, 0, sizeof ah_attr);
+ ah_attr.dlid = be16_to_cpu(packet->mad.hdr.lid);
+ ah_attr.sl = packet->mad.hdr.sl;
+ ah_attr.src_path_bits = packet->mad.hdr.path_bits;
+ ah_attr.port_num = file->port->port_num;
+ if (packet->mad.hdr.grh_present) {
+ ah_attr.ah_flags = IB_AH_GRH;
+ memcpy(ah_attr.grh.dgid.raw, packet->mad.hdr.gid, 16);
+ ah_attr.grh.sgid_index = packet->mad.hdr.gid_index;
+ ah_attr.grh.flow_label = be32_to_cpu(packet->mad.hdr.flow_label);
+ ah_attr.grh.hop_limit = packet->mad.hdr.hop_limit;
+ ah_attr.grh.traffic_class = packet->mad.hdr.traffic_class;
+ }
+
+ ah = ib_create_ah(agent->qp->pd, &ah_attr);
+ if (IS_ERR(ah)) {
+ ret = PTR_ERR(ah);
+ goto err_up;
+ }
+
+ rmpp_mad = (struct ib_rmpp_mad *) packet->mad.data;
+ hdr_len = ib_get_mad_data_offset(rmpp_mad->mad_hdr.mgmt_class);
+ if (!ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class)) {
+ copy_offset = IB_MGMT_MAD_HDR;
+ rmpp_active = 0;
+ } else {
+ copy_offset = IB_MGMT_RMPP_HDR;
+ rmpp_active = ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
+ IB_MGMT_RMPP_FLAG_ACTIVE;
+ }
+
+ data_len = count - hdr_size(file) - hdr_len;
+ packet->msg = ib_create_send_mad(agent,
+ be32_to_cpu(packet->mad.hdr.qpn),
+ packet->mad.hdr.pkey_index, rmpp_active,
+ hdr_len, data_len, GFP_KERNEL);
+ if (IS_ERR(packet->msg)) {
+ ret = PTR_ERR(packet->msg);
+ goto err_ah;
+ }
+
+ packet->msg->ah = ah;
+ packet->msg->timeout_ms = packet->mad.hdr.timeout_ms;
+ packet->msg->retries = packet->mad.hdr.retries;
+ packet->msg->context[0] = packet;
+
+ /* Copy MAD header. Any RMPP header is already in place. */
+ memcpy(packet->msg->mad, packet->mad.data, IB_MGMT_MAD_HDR);
+
+ if (!rmpp_active) {
+ if (copy_from_user(packet->msg->mad + copy_offset,
+ buf + copy_offset,
+ hdr_len + data_len - copy_offset)) {
+ ret = -EFAULT;
+ goto err_msg;
+ }
+ } else {
+ ret = copy_rmpp_mad(packet->msg, buf);
+ if (ret)
+ goto err_msg;
+ }
+
+ /*
+ * Set the high-order part of the transaction ID to make MADs from
+ * different agents unique, and allow routing responses back to the
+ * original requestor.
+ */
+ if (!ib_response_mad(packet->msg->mad)) {
+ tid = &((struct ib_mad_hdr *) packet->msg->mad)->tid;
+ *tid = cpu_to_be64(((u64) agent->hi_tid) << 32 |
+ (be64_to_cpup(tid) & 0xffffffff));
+ rmpp_mad->mad_hdr.tid = *tid;
+ }
+
+ spin_lock_irq(&file->send_lock);
+ ret = is_duplicate(file, packet);
+ if (!ret)
+ list_add_tail(&packet->list, &file->send_list);
+ spin_unlock_irq(&file->send_lock);
+ if (ret) {
+ ret = -EINVAL;
+ goto err_msg;
+ }
+
+ ret = ib_post_send_mad(packet->msg, NULL);
+ if (ret)
+ goto err_send;
+
+ mutex_unlock(&file->mutex);
+ return count;
+
+err_send:
+ dequeue_send(file, packet);
+err_msg:
+ ib_free_send_mad(packet->msg);
+err_ah:
+ ib_destroy_ah(ah);
+err_up:
+ mutex_unlock(&file->mutex);
+err:
+ kfree(packet);
+ return ret;
+}
+
+static unsigned int ib_umad_poll(struct file *filp, struct poll_table_struct *wait)
+{
+ struct ib_umad_file *file = filp->private_data;
+
+ /* we will always be able to post a MAD send */
+ unsigned int mask = POLLOUT | POLLWRNORM;
+
+ poll_wait(filp, &file->recv_wait, wait);
+
+ if (!list_empty(&file->recv_list))
+ mask |= POLLIN | POLLRDNORM;
+
+ return mask;
+}
+
+static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg,
+ int compat_method_mask)
+{
+ struct ib_user_mad_reg_req ureq;
+ struct ib_mad_reg_req req;
+ struct ib_mad_agent *agent = NULL;
+ int agent_id;
+ int ret;
+
+ mutex_lock(&file->port->file_mutex);
+ mutex_lock(&file->mutex);
+
+ if (!file->port->ib_dev) {
+ ret = -EPIPE;
+ goto out;
+ }
+
+ if (copy_from_user(&ureq, arg, sizeof ureq)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (ureq.qpn != 0 && ureq.qpn != 1) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ for (agent_id = 0; agent_id < IB_UMAD_MAX_AGENTS; ++agent_id)
+ if (!__get_agent(file, agent_id))
+ goto found;
+
+ ret = -ENOMEM;
+ goto out;
+
+found:
+ if (ureq.mgmt_class) {
+ req.mgmt_class = ureq.mgmt_class;
+ req.mgmt_class_version = ureq.mgmt_class_version;
+ memcpy(req.oui, ureq.oui, sizeof req.oui);
+
+ if (compat_method_mask) {
+ u32 *umm = (u32 *) ureq.method_mask;
+ int i;
+
+ for (i = 0; i < BITS_TO_LONGS(IB_MGMT_MAX_METHODS); ++i)
+ req.method_mask[i] =
+ umm[i * 2] | ((u64) umm[i * 2 + 1] << 32);
+ } else
+ memcpy(req.method_mask, ureq.method_mask,
+ sizeof req.method_mask);
+ }
+
+ agent = ib_register_mad_agent(file->port->ib_dev, file->port->port_num,
+ ureq.qpn ? IB_QPT_GSI : IB_QPT_SMI,
+ ureq.mgmt_class ? &req : NULL,
+ ureq.rmpp_version,
+ send_handler, recv_handler, file);
+ if (IS_ERR(agent)) {
+ ret = PTR_ERR(agent);
+ agent = NULL;
+ goto out;
+ }
+
+ if (put_user(agent_id,
+ (u32 __user *) (arg + offsetof(struct ib_user_mad_reg_req, id)))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (!file->already_used) {
+ file->already_used = 1;
+ if (!file->use_pkey_index) {
+ printk(KERN_WARNING "user_mad: process %s did not enable "
+ "P_Key index support.\n", curproc->p_comm);
+ printk(KERN_WARNING "user_mad: Documentation/infiniband/user_mad.txt "
+ "has info on the new ABI.\n");
+ }
+ }
+
+ file->agent[agent_id] = agent;
+ ret = 0;
+
+out:
+ mutex_unlock(&file->mutex);
+
+ if (ret && agent)
+ ib_unregister_mad_agent(agent);
+
+ mutex_unlock(&file->port->file_mutex);
+
+ return ret;
+}
+
+static int ib_umad_unreg_agent(struct ib_umad_file *file, u32 __user *arg)
+{
+ struct ib_mad_agent *agent = NULL;
+ u32 id;
+ int ret = 0;
+
+ if (get_user(id, arg))
+ return -EFAULT;
+
+ mutex_lock(&file->port->file_mutex);
+ mutex_lock(&file->mutex);
+
+ if (id < 0 || id >= IB_UMAD_MAX_AGENTS || !__get_agent(file, id)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ agent = file->agent[id];
+ file->agent[id] = NULL;
+
+out:
+ mutex_unlock(&file->mutex);
+
+ if (agent)
+ ib_unregister_mad_agent(agent);
+
+ mutex_unlock(&file->port->file_mutex);
+
+ return ret;
+}
+
+static long ib_umad_enable_pkey(struct ib_umad_file *file)
+{
+ int ret = 0;
+
+ mutex_lock(&file->mutex);
+ if (file->already_used)
+ ret = -EINVAL;
+ else
+ file->use_pkey_index = 1;
+ mutex_unlock(&file->mutex);
+
+ return ret;
+}
+
+static long ib_umad_ioctl(struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ switch (cmd) {
+ case IB_USER_MAD_REGISTER_AGENT:
+ return ib_umad_reg_agent(filp->private_data, (void __user *) arg, 0);
+ case IB_USER_MAD_UNREGISTER_AGENT:
+ return ib_umad_unreg_agent(filp->private_data, (__u32 __user *) arg);
+ case IB_USER_MAD_ENABLE_PKEY:
+ return ib_umad_enable_pkey(filp->private_data);
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+
+#ifdef CONFIG_COMPAT
+static long ib_umad_compat_ioctl(struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ switch (cmd) {
+ case IB_USER_MAD_REGISTER_AGENT:
+ return ib_umad_reg_agent(filp->private_data, compat_ptr(arg), 1);
+ case IB_USER_MAD_UNREGISTER_AGENT:
+ return ib_umad_unreg_agent(filp->private_data, compat_ptr(arg));
+ case IB_USER_MAD_ENABLE_PKEY:
+ return ib_umad_enable_pkey(filp->private_data);
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+#endif
+
+/*
+ * ib_umad_open() does not need the BKL:
+ *
+ * - umad_port[] accesses are protected by port_lock, the
+ * ib_umad_port structures are properly reference counted, and
+ * everything else is purely local to the file being created, so
+ * races against other open calls are not a problem;
+ * - the ioctl method does not affect any global state outside of the
+ * file structure being operated on;
+ * - the port is added to umad_port[] as the last part of module
+ * initialization so the open method will either immediately run
+ * -ENXIO, or all required initialization will be done.
+ */
+static int ib_umad_open(struct inode *inode, struct file *filp)
+{
+ struct ib_umad_port *port;
+ struct ib_umad_file *file;
+ int ret = 0;
+
+ spin_lock(&port_lock);
+ port = umad_port[iminor(inode) - IB_UMAD_MINOR_BASE];
+ if (port)
+ kref_get(&port->umad_dev->ref);
+ spin_unlock(&port_lock);
+
+ if (!port)
+ return -ENXIO;
+
+ mutex_lock(&port->file_mutex);
+
+ if (!port->ib_dev) {
+ ret = -ENXIO;
+ goto out;
+ }
+
+ file = kzalloc(sizeof *file, GFP_KERNEL);
+ if (!file) {
+ kref_put(&port->umad_dev->ref, ib_umad_release_dev);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ mutex_init(&file->mutex);
+ spin_lock_init(&file->send_lock);
+ INIT_LIST_HEAD(&file->recv_list);
+ INIT_LIST_HEAD(&file->send_list);
+ init_waitqueue_head(&file->recv_wait);
+
+ file->port = port;
+ file->filp = filp;
+ filp->private_data = file;
+
+ list_add_tail(&file->port_list, &port->file_list);
+
+out:
+ mutex_unlock(&port->file_mutex);
+ return ret;
+}
+
+static int ib_umad_close(struct inode *inode, struct file *filp)
+{
+ struct ib_umad_file *file = filp->private_data;
+ struct ib_umad_device *dev = file->port->umad_dev;
+ struct ib_umad_packet *packet, *tmp;
+ int already_dead;
+ int i;
+
+ mutex_lock(&file->port->file_mutex);
+ mutex_lock(&file->mutex);
+
+ already_dead = file->agents_dead;
+ file->agents_dead = 1;
+
+ list_for_each_entry_safe(packet, tmp, &file->recv_list, list) {
+ if (packet->recv_wc)
+ ib_free_recv_mad(packet->recv_wc);
+ kfree(packet);
+ }
+
+ list_del(&file->port_list);
+
+ mutex_unlock(&file->mutex);
+
+ if (!already_dead)
+ for (i = 0; i < IB_UMAD_MAX_AGENTS; ++i)
+ if (file->agent[i])
+ ib_unregister_mad_agent(file->agent[i]);
+
+ mutex_unlock(&file->port->file_mutex);
+
+ kfree(file);
+ kref_put(&dev->ref, ib_umad_release_dev);
+
+ return 0;
+}
+
+static const struct file_operations umad_fops = {
+ .owner = THIS_MODULE,
+ .read = ib_umad_read,
+ .write = ib_umad_write,
+ .poll = ib_umad_poll,
+ .unlocked_ioctl = ib_umad_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = ib_umad_compat_ioctl,
+#endif
+ .open = ib_umad_open,
+ .release = ib_umad_close
+};
+
+static int ib_umad_sm_open(struct inode *inode, struct file *filp)
+{
+ struct ib_umad_port *port;
+ struct ib_port_modify props = {
+ .set_port_cap_mask = IB_PORT_SM
+ };
+ int ret;
+
+ spin_lock(&port_lock);
+ port = umad_port[iminor(inode) - IB_UMAD_MINOR_BASE - IB_UMAD_MAX_PORTS];
+ if (port)
+ kref_get(&port->umad_dev->ref);
+ spin_unlock(&port_lock);
+
+ if (!port)
+ return -ENXIO;
+
+ if (filp->f_flags & O_NONBLOCK) {
+ if (down_trylock(&port->sm_sem)) {
+ ret = -EAGAIN;
+ goto fail;
+ }
+ } else {
+ if (down_interruptible(&port->sm_sem)) {
+ ret = -ERESTARTSYS;
+ goto fail;
+ }
+ }
+
+ ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props);
+ if (ret) {
+ up(&port->sm_sem);
+ goto fail;
+ }
+
+ filp->private_data = port;
+
+ return 0;
+
+fail:
+ kref_put(&port->umad_dev->ref, ib_umad_release_dev);
+ return ret;
+}
+
+static int ib_umad_sm_close(struct inode *inode, struct file *filp)
+{
+ struct ib_umad_port *port = filp->private_data;
+ struct ib_port_modify props = {
+ .clr_port_cap_mask = IB_PORT_SM
+ };
+ int ret = 0;
+
+ mutex_lock(&port->file_mutex);
+ if (port->ib_dev)
+ ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props);
+ mutex_unlock(&port->file_mutex);
+
+ up(&port->sm_sem);
+
+ kref_put(&port->umad_dev->ref, ib_umad_release_dev);
+
+ return ret;
+}
+
+static const struct file_operations umad_sm_fops = {
+ .owner = THIS_MODULE,
+ .open = ib_umad_sm_open,
+ .release = ib_umad_sm_close
+};
+
+static struct ib_client umad_client = {
+ .name = "umad",
+ .add = ib_umad_add_one,
+ .remove = ib_umad_remove_one
+};
+
+static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct ib_umad_port *port = dev_get_drvdata(dev);
+
+ if (!port)
+ return -ENODEV;
+
+ return sprintf(buf, "%s\n", port->ib_dev->name);
+}
+static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
+
+static ssize_t show_port(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct ib_umad_port *port = dev_get_drvdata(dev);
+
+ if (!port)
+ return -ENODEV;
+
+ return sprintf(buf, "%d\n", port->port_num);
+}
+static DEVICE_ATTR(port, S_IRUGO, show_port, NULL);
+
+static ssize_t show_abi_version(struct class *class, char *buf)
+{
+ return sprintf(buf, "%d\n", IB_USER_MAD_ABI_VERSION);
+}
+static CLASS_ATTR(abi_version, S_IRUGO, show_abi_version, NULL);
+
+static int ib_umad_init_port(struct ib_device *device, int port_num,
+ struct ib_umad_port *port)
+{
+ spin_lock(&port_lock);
+ port->dev_num = find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS);
+ if (port->dev_num >= IB_UMAD_MAX_PORTS) {
+ spin_unlock(&port_lock);
+ return -1;
+ }
+ set_bit(port->dev_num, dev_map);
+ spin_unlock(&port_lock);
+
+ port->ib_dev = device;
+ port->port_num = port_num;
+ init_MUTEX(&port->sm_sem);
+ mutex_init(&port->file_mutex);
+ INIT_LIST_HEAD(&port->file_list);
+
+ port->cdev = cdev_alloc();
+ if (!port->cdev)
+ return -1;
+ port->cdev->owner = THIS_MODULE;
+ port->cdev->ops = &umad_fops;
+ kobject_set_name(&port->cdev->kobj, "umad%d", port->dev_num);
+ if (cdev_add(port->cdev, base_dev + port->dev_num, 1))
+ goto err_cdev;
+
+ port->dev = device_create(umad_class, device->dma_device,
+ port->cdev->dev, port,
+ "umad%d", port->dev_num);
+ if (IS_ERR(port->dev))
+ goto err_cdev;
+
+ if (device_create_file(port->dev, &dev_attr_ibdev))
+ goto err_dev;
+ if (device_create_file(port->dev, &dev_attr_port))
+ goto err_dev;
+
+ port->sm_cdev = cdev_alloc();
+ if (!port->sm_cdev)
+ goto err_dev;
+ port->sm_cdev->owner = THIS_MODULE;
+ port->sm_cdev->ops = &umad_sm_fops;
+ kobject_set_name(&port->sm_cdev->kobj, "issm%d", port->dev_num);
+ if (cdev_add(port->sm_cdev, base_dev + port->dev_num + IB_UMAD_MAX_PORTS, 1))
+ goto err_sm_cdev;
+
+ port->sm_dev = device_create(umad_class, device->dma_device,
+ port->sm_cdev->dev, port,
+ "issm%d", port->dev_num);
+ if (IS_ERR(port->sm_dev))
+ goto err_sm_cdev;
+
+ if (device_create_file(port->sm_dev, &dev_attr_ibdev))
+ goto err_sm_dev;
+ if (device_create_file(port->sm_dev, &dev_attr_port))
+ goto err_sm_dev;
+
+ spin_lock(&port_lock);
+ umad_port[port->dev_num] = port;
+ spin_unlock(&port_lock);
+
+ return 0;
+
+err_sm_dev:
+ device_destroy(umad_class, port->sm_cdev->dev);
+
+err_sm_cdev:
+ cdev_del(port->sm_cdev);
+
+err_dev:
+ device_destroy(umad_class, port->cdev->dev);
+
+err_cdev:
+ cdev_del(port->cdev);
+ clear_bit(port->dev_num, dev_map);
+
+ return -1;
+}
+
+static void ib_umad_kill_port(struct ib_umad_port *port)
+{
+ struct ib_umad_file *file;
+ int already_dead;
+ int id;
+
+ dev_set_drvdata(port->dev, NULL);
+ dev_set_drvdata(port->sm_dev, NULL);
+
+ device_destroy(umad_class, port->cdev->dev);
+ device_destroy(umad_class, port->sm_cdev->dev);
+
+ cdev_del(port->cdev);
+ cdev_del(port->sm_cdev);
+
+ spin_lock(&port_lock);
+ umad_port[port->dev_num] = NULL;
+ spin_unlock(&port_lock);
+
+ mutex_lock(&port->file_mutex);
+
+ port->ib_dev = NULL;
+
+ list_for_each_entry(file, &port->file_list, port_list) {
+ mutex_lock(&file->mutex);
+ already_dead = file->agents_dead;
+ file->agents_dead = 1;
+ mutex_unlock(&file->mutex);
+
+ for (id = 0; id < IB_UMAD_MAX_AGENTS; ++id)
+ if (file->agent[id])
+ ib_unregister_mad_agent(file->agent[id]);
+ }
+
+ mutex_unlock(&port->file_mutex);
+
+ clear_bit(port->dev_num, dev_map);
+}
+
+static void ib_umad_add_one(struct ib_device *device)
+{
+ struct ib_umad_device *umad_dev;
+ int s, e, i;
+
+ if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+ return;
+
+ if (device->node_type == RDMA_NODE_IB_SWITCH)
+ s = e = 0;
+ else {
+ s = 1;
+ e = device->phys_port_cnt;
+ }
+
+ umad_dev = kzalloc(sizeof *umad_dev +
+ (e - s + 1) * sizeof (struct ib_umad_port),
+ GFP_KERNEL);
+ if (!umad_dev)
+ return;
+
+ kref_init(&umad_dev->ref);
+
+ umad_dev->start_port = s;
+ umad_dev->end_port = e;
+
+ for (i = s; i <= e; ++i) {
+ umad_dev->port[i - s].umad_dev = umad_dev;
+
+ if (rdma_port_get_link_layer(device, i) == IB_LINK_LAYER_INFINIBAND)
+ if (ib_umad_init_port(device, i, &umad_dev->port[i - s]))
+ goto err;
+ }
+
+ ib_set_client_data(device, &umad_client, umad_dev);
+
+ return;
+
+err:
+ while (--i >= s)
+ if (rdma_port_get_link_layer(device, i) == IB_LINK_LAYER_INFINIBAND)
+ ib_umad_kill_port(&umad_dev->port[i - s]);
+
+ kref_put(&umad_dev->ref, ib_umad_release_dev);
+}
+
+static void ib_umad_remove_one(struct ib_device *device)
+{
+ struct ib_umad_device *umad_dev = ib_get_client_data(device, &umad_client);
+ int i;
+
+ if (!umad_dev)
+ return;
+
+ for (i = 0; i <= umad_dev->end_port - umad_dev->start_port; ++i)
+ if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND)
+ ib_umad_kill_port(&umad_dev->port[i]);
+
+ kref_put(&umad_dev->ref, ib_umad_release_dev);
+}
+
+static int __init ib_umad_init(void)
+{
+ int ret;
+
+ ret = register_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2,
+ "infiniband_mad");
+ if (ret) {
+ printk(KERN_ERR "user_mad: couldn't register device number\n");
+ goto out;
+ }
+
+ umad_class = class_create(THIS_MODULE, "infiniband_mad");
+ if (IS_ERR(umad_class)) {
+ ret = PTR_ERR(umad_class);
+ printk(KERN_ERR "user_mad: couldn't create class infiniband_mad\n");
+ goto out_chrdev;
+ }
+
+ ret = class_create_file(umad_class, &class_attr_abi_version);
+ if (ret) {
+ printk(KERN_ERR "user_mad: couldn't create abi_version attribute\n");
+ goto out_class;
+ }
+
+ ret = ib_register_client(&umad_client);
+ if (ret) {
+ printk(KERN_ERR "user_mad: couldn't register ib_umad client\n");
+ goto out_class;
+ }
+
+ return 0;
+
+out_class:
+ class_destroy(umad_class);
+
+out_chrdev:
+ unregister_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2);
+
+out:
+ return ret;
+}
+
+static void __exit ib_umad_cleanup(void)
+{
+ ib_unregister_client(&umad_client);
+ class_destroy(umad_class);
+ unregister_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2);
+}
+
+module_init(ib_umad_init);
+module_exit(ib_umad_cleanup);
diff --git a/sys/ofed/drivers/infiniband/core/uverbs.h b/sys/ofed/drivers/infiniband/core/uverbs.h
new file mode 100644
index 0000000..fa64da5
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/uverbs.h
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef UVERBS_H
+#define UVERBS_H
+
+#include <linux/kref.h>
+#include <linux/idr.h>
+#include <linux/mutex.h>
+#include <linux/completion.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_user_verbs.h>
+
+/*
+ * Our lifetime rules for these structs are the following:
+ *
+ * struct ib_uverbs_device: One reference is held by the module and
+ * released in ib_uverbs_remove_one(). Another reference is taken by
+ * ib_uverbs_open() each time the character special file is opened,
+ * and released in ib_uverbs_release_file() when the file is released.
+ *
+ * struct ib_uverbs_file: One reference is held by the VFS and
+ * released when the file is closed. Another reference is taken when
+ * an asynchronous event queue file is created and released when the
+ * event file is closed.
+ *
+ * struct ib_uverbs_event_file: One reference is held by the VFS and
+ * released when the file is closed. For asynchronous event files,
+ * another reference is held by the corresponding main context file
+ * and released when that file is closed. For completion event files,
+ * a reference is taken when a CQ is created that uses the file, and
+ * released when the CQ is destroyed.
+ */
+
+struct ib_uverbs_device {
+ struct kref ref;
+ struct completion comp;
+ int devnum;
+ struct cdev *cdev;
+ struct device *dev;
+ struct ib_device *ib_dev;
+ int num_comp_vectors;
+};
+
+struct ib_uverbs_event_file {
+ struct kref ref;
+ struct file *filp;
+ struct ib_uverbs_file *uverbs_file;
+ spinlock_t lock;
+ wait_queue_head_t poll_wait;
+ struct fasync_struct *async_queue;
+ struct list_head event_list;
+ int is_async;
+ int is_closed;
+};
+
+struct ib_uverbs_file {
+ struct kref ref;
+ struct mutex mutex;
+ struct ib_uverbs_device *device;
+ struct ib_ucontext *ucontext;
+ struct ib_event_handler event_handler;
+ struct ib_uverbs_event_file *async_file;
+};
+
+struct ib_uverbs_event {
+ union {
+ struct ib_uverbs_async_event_desc async;
+ struct ib_uverbs_comp_event_desc comp;
+ } desc;
+ struct list_head list;
+ struct list_head obj_list;
+ u32 *counter;
+};
+
+struct ib_uverbs_mcast_entry {
+ struct list_head list;
+ union ib_gid gid;
+ u16 lid;
+};
+
+struct ib_uevent_object {
+ struct ib_uobject uobject;
+ struct list_head event_list;
+ u32 events_reported;
+};
+
+struct ib_uqp_object {
+ struct ib_uevent_object uevent;
+ struct list_head mcast_list;
+};
+
+struct ib_ucq_object {
+ struct ib_uobject uobject;
+ struct ib_uverbs_file *uverbs_file;
+ struct list_head comp_list;
+ struct list_head async_list;
+ u32 comp_events_reported;
+ u32 async_events_reported;
+};
+
+struct ib_uxrcd_object {
+ struct ib_uobject uobject;
+ struct list_head xrc_reg_qp_list;
+};
+
+extern spinlock_t ib_uverbs_idr_lock;
+extern struct idr ib_uverbs_pd_idr;
+extern struct idr ib_uverbs_mr_idr;
+extern struct idr ib_uverbs_mw_idr;
+extern struct idr ib_uverbs_ah_idr;
+extern struct idr ib_uverbs_cq_idr;
+extern struct idr ib_uverbs_qp_idr;
+extern struct idr ib_uverbs_srq_idr;
+extern struct idr ib_uverbs_xrc_domain_idr;
+
+void idr_remove_uobj(struct idr *idp, struct ib_uobject *uobj);
+
+struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
+ int is_async, int *fd);
+struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd);
+
+void ib_uverbs_release_ucq(struct ib_uverbs_file *file,
+ struct ib_uverbs_event_file *ev_file,
+ struct ib_ucq_object *uobj);
+void ib_uverbs_release_uevent(struct ib_uverbs_file *file,
+ struct ib_uevent_object *uobj);
+
+void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context);
+void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr);
+void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr);
+void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr);
+void ib_uverbs_event_handler(struct ib_event_handler *handler,
+ struct ib_event *event);
+void ib_uverbs_xrc_rcv_qp_event_handler(struct ib_event *event,
+ void *context_ptr);
+void ib_uverbs_dealloc_xrcd(struct ib_device *ib_dev,
+ struct ib_xrcd *xrcd);
+int ib_uverbs_cleanup_xrc_rcv_qp(struct ib_uverbs_file *file,
+ struct ib_xrcd *xrcd, u32 qp_num);
+
+#define IB_UVERBS_DECLARE_CMD(name) \
+ ssize_t ib_uverbs_##name(struct ib_uverbs_file *file, \
+ const char __user *buf, int in_len, \
+ int out_len)
+
+IB_UVERBS_DECLARE_CMD(get_context);
+IB_UVERBS_DECLARE_CMD(query_device);
+IB_UVERBS_DECLARE_CMD(query_port);
+IB_UVERBS_DECLARE_CMD(alloc_pd);
+IB_UVERBS_DECLARE_CMD(dealloc_pd);
+IB_UVERBS_DECLARE_CMD(reg_mr);
+IB_UVERBS_DECLARE_CMD(dereg_mr);
+IB_UVERBS_DECLARE_CMD(create_comp_channel);
+IB_UVERBS_DECLARE_CMD(create_cq);
+IB_UVERBS_DECLARE_CMD(resize_cq);
+IB_UVERBS_DECLARE_CMD(poll_cq);
+IB_UVERBS_DECLARE_CMD(req_notify_cq);
+IB_UVERBS_DECLARE_CMD(destroy_cq);
+IB_UVERBS_DECLARE_CMD(create_qp);
+IB_UVERBS_DECLARE_CMD(query_qp);
+IB_UVERBS_DECLARE_CMD(modify_qp);
+IB_UVERBS_DECLARE_CMD(destroy_qp);
+IB_UVERBS_DECLARE_CMD(post_send);
+IB_UVERBS_DECLARE_CMD(post_recv);
+IB_UVERBS_DECLARE_CMD(post_srq_recv);
+IB_UVERBS_DECLARE_CMD(create_ah);
+IB_UVERBS_DECLARE_CMD(destroy_ah);
+IB_UVERBS_DECLARE_CMD(attach_mcast);
+IB_UVERBS_DECLARE_CMD(detach_mcast);
+IB_UVERBS_DECLARE_CMD(create_srq);
+IB_UVERBS_DECLARE_CMD(modify_srq);
+IB_UVERBS_DECLARE_CMD(query_srq);
+IB_UVERBS_DECLARE_CMD(destroy_srq);
+IB_UVERBS_DECLARE_CMD(create_xrc_srq);
+IB_UVERBS_DECLARE_CMD(open_xrc_domain);
+IB_UVERBS_DECLARE_CMD(close_xrc_domain);
+IB_UVERBS_DECLARE_CMD(create_xrc_rcv_qp);
+IB_UVERBS_DECLARE_CMD(modify_xrc_rcv_qp);
+IB_UVERBS_DECLARE_CMD(query_xrc_rcv_qp);
+IB_UVERBS_DECLARE_CMD(reg_xrc_rcv_qp);
+IB_UVERBS_DECLARE_CMD(unreg_xrc_rcv_qp);
+
+
+#endif /* UVERBS_H */
diff --git a/sys/ofed/drivers/infiniband/core/uverbs_cmd.c b/sys/ofed/drivers/infiniband/core/uverbs_cmd.c
new file mode 100644
index 0000000..3520182
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/uverbs_cmd.c
@@ -0,0 +1,3022 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 PathScale, Inc. All rights reserved.
+ * Copyright (c) 2006 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/file.h>
+#include <linux/fs.h>
+
+#include <asm/uaccess.h>
+#include <asm/fcntl.h>
+
+#include "uverbs.h"
+
+static struct lock_class_key pd_lock_key;
+static struct lock_class_key mr_lock_key;
+static struct lock_class_key cq_lock_key;
+static struct lock_class_key qp_lock_key;
+static struct lock_class_key ah_lock_key;
+static struct lock_class_key srq_lock_key;
+
+#define INIT_UDATA(udata, ibuf, obuf, ilen, olen) \
+ do { \
+ (udata)->inbuf = (void __user *) (ibuf); \
+ (udata)->outbuf = (void __user *) (obuf); \
+ (udata)->inlen = (ilen); \
+ (udata)->outlen = (olen); \
+ } while (0)
+
+/*
+ * The ib_uobject locking scheme is as follows:
+ *
+ * - ib_uverbs_idr_lock protects the uverbs idrs themselves, so it
+ * needs to be held during all idr operations. When an object is
+ * looked up, a reference must be taken on the object's kref before
+ * dropping this lock.
+ *
+ * - Each object also has an rwsem. This rwsem must be held for
+ * reading while an operation that uses the object is performed.
+ * For example, while registering an MR, the associated PD's
+ * uobject.mutex must be held for reading. The rwsem must be held
+ * for writing while initializing or destroying an object.
+ *
+ * - In addition, each object has a "live" flag. If this flag is not
+ * set, then lookups of the object will fail even if it is found in
+ * the idr. This handles a reader that blocks and does not acquire
+ * the rwsem until after the object is destroyed. The destroy
+ * operation will set the live flag to 0 and then drop the rwsem;
+ * this will allow the reader to acquire the rwsem, see that the
+ * live flag is 0, and then drop the rwsem and its reference to
+ * object. The underlying storage will not be freed until the last
+ * reference to the object is dropped.
+ */
+
+static void init_uobj(struct ib_uobject *uobj, u64 user_handle,
+ struct ib_ucontext *context, struct lock_class_key *key)
+{
+ uobj->user_handle = user_handle;
+ uobj->context = context;
+ kref_init(&uobj->ref);
+ init_rwsem(&uobj->mutex);
+ lockdep_set_class(&uobj->mutex, key);
+ uobj->live = 0;
+}
+
+static void release_uobj(struct kref *kref)
+{
+ kfree(container_of(kref, struct ib_uobject, ref));
+}
+
+static void put_uobj(struct ib_uobject *uobj)
+{
+ kref_put(&uobj->ref, release_uobj);
+}
+
+static void put_uobj_read(struct ib_uobject *uobj)
+{
+ up_read(&uobj->mutex);
+ put_uobj(uobj);
+}
+
+static void put_uobj_write(struct ib_uobject *uobj)
+{
+ up_write(&uobj->mutex);
+ put_uobj(uobj);
+}
+
+static int idr_add_uobj(struct idr *idr, struct ib_uobject *uobj)
+{
+ int ret;
+
+retry:
+ if (!idr_pre_get(idr, GFP_KERNEL))
+ return -ENOMEM;
+
+ spin_lock(&ib_uverbs_idr_lock);
+ ret = idr_get_new(idr, uobj, &uobj->id);
+ spin_unlock(&ib_uverbs_idr_lock);
+
+ if (ret == -EAGAIN)
+ goto retry;
+
+ return ret;
+}
+
+void idr_remove_uobj(struct idr *idr, struct ib_uobject *uobj)
+{
+ spin_lock(&ib_uverbs_idr_lock);
+ idr_remove(idr, uobj->id);
+ spin_unlock(&ib_uverbs_idr_lock);
+}
+
+static struct ib_uobject *__idr_get_uobj(struct idr *idr, int id,
+ struct ib_ucontext *context)
+{
+ struct ib_uobject *uobj;
+
+ spin_lock(&ib_uverbs_idr_lock);
+ uobj = idr_find(idr, id);
+ if (uobj) {
+ if (uobj->context == context)
+ kref_get(&uobj->ref);
+ else
+ uobj = NULL;
+ }
+ spin_unlock(&ib_uverbs_idr_lock);
+
+ return uobj;
+}
+
+static struct ib_uobject *idr_read_uobj(struct idr *idr, int id,
+ struct ib_ucontext *context, int nested)
+{
+ struct ib_uobject *uobj;
+
+ uobj = __idr_get_uobj(idr, id, context);
+ if (!uobj)
+ return NULL;
+
+ if (nested)
+ down_read_nested(&uobj->mutex, SINGLE_DEPTH_NESTING);
+ else
+ down_read(&uobj->mutex);
+ if (!uobj->live) {
+ put_uobj_read(uobj);
+ return NULL;
+ }
+
+ return uobj;
+}
+
+static struct ib_uobject *idr_write_uobj(struct idr *idr, int id,
+ struct ib_ucontext *context)
+{
+ struct ib_uobject *uobj;
+
+ uobj = __idr_get_uobj(idr, id, context);
+ if (!uobj)
+ return NULL;
+
+ down_write(&uobj->mutex);
+ if (!uobj->live) {
+ put_uobj_write(uobj);
+ return NULL;
+ }
+
+ return uobj;
+}
+
+static void *idr_read_obj(struct idr *idr, int id, struct ib_ucontext *context,
+ int nested)
+{
+ struct ib_uobject *uobj;
+
+ uobj = idr_read_uobj(idr, id, context, nested);
+ return uobj ? uobj->object : NULL;
+}
+
+static struct ib_pd *idr_read_pd(int pd_handle, struct ib_ucontext *context)
+{
+ return idr_read_obj(&ib_uverbs_pd_idr, pd_handle, context, 0);
+}
+
+static void put_pd_read(struct ib_pd *pd)
+{
+ put_uobj_read(pd->uobject);
+}
+
+static struct ib_cq *idr_read_cq(int cq_handle, struct ib_ucontext *context, int nested)
+{
+ return idr_read_obj(&ib_uverbs_cq_idr, cq_handle, context, nested);
+}
+
+static void put_cq_read(struct ib_cq *cq)
+{
+ put_uobj_read(cq->uobject);
+}
+
+static struct ib_ah *idr_read_ah(int ah_handle, struct ib_ucontext *context)
+{
+ return idr_read_obj(&ib_uverbs_ah_idr, ah_handle, context, 0);
+}
+
+static void put_ah_read(struct ib_ah *ah)
+{
+ put_uobj_read(ah->uobject);
+}
+
+static struct ib_qp *idr_read_qp(int qp_handle, struct ib_ucontext *context)
+{
+ return idr_read_obj(&ib_uverbs_qp_idr, qp_handle, context, 0);
+}
+
+static void put_qp_read(struct ib_qp *qp)
+{
+ put_uobj_read(qp->uobject);
+}
+
+static struct ib_srq *idr_read_srq(int srq_handle, struct ib_ucontext *context)
+{
+ return idr_read_obj(&ib_uverbs_srq_idr, srq_handle, context, 0);
+}
+
+static void put_srq_read(struct ib_srq *srq)
+{
+ put_uobj_read(srq->uobject);
+}
+
+static struct ib_xrcd *idr_read_xrcd(int xrcd_handle,
+ struct ib_ucontext *context,
+ struct ib_uobject **uobj)
+{
+ *uobj = idr_read_uobj(&ib_uverbs_xrc_domain_idr, xrcd_handle,
+ context, 0);
+ return *uobj ? (*uobj)->object : NULL;
+}
+
+static void put_xrcd_read(struct ib_uobject *uobj)
+{
+ put_uobj_read(uobj);
+}
+
+ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
+ const char __user *buf,
+ int in_len, int out_len)
+{
+ struct ib_uverbs_get_context cmd;
+ struct ib_uverbs_get_context_resp resp;
+ struct ib_udata udata;
+ struct ib_device *ibdev = file->device->ib_dev;
+ struct ib_ucontext *ucontext;
+ struct file *filp;
+ int ret;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ mutex_lock(&file->mutex);
+
+ if (file->ucontext) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ INIT_UDATA(&udata, buf + sizeof cmd,
+ (unsigned long) cmd.response + sizeof resp,
+ in_len - sizeof cmd, out_len - sizeof resp);
+
+ ucontext = ibdev->alloc_ucontext(ibdev, &udata);
+ if (IS_ERR(ucontext)) {
+ ret = PTR_ERR(file->ucontext);
+ goto err;
+ }
+
+ ucontext->device = ibdev;
+ INIT_LIST_HEAD(&ucontext->pd_list);
+ INIT_LIST_HEAD(&ucontext->mr_list);
+ INIT_LIST_HEAD(&ucontext->mw_list);
+ INIT_LIST_HEAD(&ucontext->cq_list);
+ INIT_LIST_HEAD(&ucontext->qp_list);
+ INIT_LIST_HEAD(&ucontext->srq_list);
+ INIT_LIST_HEAD(&ucontext->ah_list);
+ INIT_LIST_HEAD(&ucontext->xrc_domain_list);
+ ucontext->closing = 0;
+
+ resp.num_comp_vectors = file->device->num_comp_vectors;
+
+ filp = ib_uverbs_alloc_event_file(file, 1, &resp.async_fd);
+ if (IS_ERR(filp)) {
+ ret = PTR_ERR(filp);
+ goto err_free;
+ }
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto err_file;
+ }
+
+ file->async_file = filp->private_data;
+
+ INIT_IB_EVENT_HANDLER(&file->event_handler, file->device->ib_dev,
+ ib_uverbs_event_handler);
+ ret = ib_register_event_handler(&file->event_handler);
+ if (ret)
+ goto err_file;
+
+ kref_get(&file->async_file->ref);
+ kref_get(&file->ref);
+ file->ucontext = ucontext;
+
+ fd_install(resp.async_fd, filp);
+
+ mutex_unlock(&file->mutex);
+
+ return in_len;
+
+err_file:
+ put_unused_fd(resp.async_fd);
+ fput(filp);
+
+err_free:
+ ibdev->dealloc_ucontext(ucontext);
+
+err:
+ mutex_unlock(&file->mutex);
+ return ret;
+}
+
+ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
+ const char __user *buf,
+ int in_len, int out_len)
+{
+ struct ib_uverbs_query_device cmd;
+ struct ib_uverbs_query_device_resp resp;
+ struct ib_device_attr attr;
+ int ret;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ ret = ib_query_device(file->device->ib_dev, &attr);
+ if (ret)
+ return ret;
+
+ memset(&resp, 0, sizeof resp);
+
+ resp.fw_ver = attr.fw_ver;
+ resp.node_guid = file->device->ib_dev->node_guid;
+ resp.sys_image_guid = attr.sys_image_guid;
+ resp.max_mr_size = attr.max_mr_size;
+ resp.page_size_cap = attr.page_size_cap;
+ resp.vendor_id = attr.vendor_id;
+ resp.vendor_part_id = attr.vendor_part_id;
+ resp.hw_ver = attr.hw_ver;
+ resp.max_qp = attr.max_qp;
+ resp.max_qp_wr = attr.max_qp_wr;
+ resp.device_cap_flags = attr.device_cap_flags;
+ resp.max_sge = attr.max_sge;
+ resp.max_sge_rd = attr.max_sge_rd;
+ resp.max_cq = attr.max_cq;
+ resp.max_cqe = attr.max_cqe;
+ resp.max_mr = attr.max_mr;
+ resp.max_pd = attr.max_pd;
+ resp.max_qp_rd_atom = attr.max_qp_rd_atom;
+ resp.max_ee_rd_atom = attr.max_ee_rd_atom;
+ resp.max_res_rd_atom = attr.max_res_rd_atom;
+ resp.max_qp_init_rd_atom = attr.max_qp_init_rd_atom;
+ resp.max_ee_init_rd_atom = attr.max_ee_init_rd_atom;
+ resp.atomic_cap = attr.atomic_cap;
+ resp.max_ee = attr.max_ee;
+ resp.max_rdd = attr.max_rdd;
+ resp.max_mw = attr.max_mw;
+ resp.max_raw_ipv6_qp = attr.max_raw_ipv6_qp;
+ resp.max_raw_ethy_qp = attr.max_raw_ethy_qp;
+ resp.max_mcast_grp = attr.max_mcast_grp;
+ resp.max_mcast_qp_attach = attr.max_mcast_qp_attach;
+ resp.max_total_mcast_qp_attach = attr.max_total_mcast_qp_attach;
+ resp.max_ah = attr.max_ah;
+ resp.max_fmr = attr.max_fmr;
+ resp.max_map_per_fmr = attr.max_map_per_fmr;
+ resp.max_srq = attr.max_srq;
+ resp.max_srq_wr = attr.max_srq_wr;
+ resp.max_srq_sge = attr.max_srq_sge;
+ resp.max_pkeys = attr.max_pkeys;
+ resp.local_ca_ack_delay = attr.local_ca_ack_delay;
+ resp.phys_port_cnt = file->device->ib_dev->phys_port_cnt;
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp))
+ return -EFAULT;
+
+ return in_len;
+}
+
+ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
+ const char __user *buf,
+ int in_len, int out_len)
+{
+ struct ib_uverbs_query_port cmd;
+ struct ib_uverbs_query_port_resp resp;
+ struct ib_port_attr attr;
+ int ret;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ ret = ib_query_port(file->device->ib_dev, cmd.port_num, &attr);
+ if (ret)
+ return ret;
+
+ memset(&resp, 0, sizeof resp);
+
+ resp.state = attr.state;
+ resp.max_mtu = attr.max_mtu;
+ resp.active_mtu = attr.active_mtu;
+ resp.gid_tbl_len = attr.gid_tbl_len;
+ resp.port_cap_flags = attr.port_cap_flags;
+ resp.max_msg_sz = attr.max_msg_sz;
+ resp.bad_pkey_cntr = attr.bad_pkey_cntr;
+ resp.qkey_viol_cntr = attr.qkey_viol_cntr;
+ resp.pkey_tbl_len = attr.pkey_tbl_len;
+ resp.lid = attr.lid;
+ resp.sm_lid = attr.sm_lid;
+ resp.lmc = attr.lmc;
+ resp.max_vl_num = attr.max_vl_num;
+ resp.sm_sl = attr.sm_sl;
+ resp.subnet_timeout = attr.subnet_timeout;
+ resp.init_type_reply = attr.init_type_reply;
+ resp.active_width = attr.active_width;
+ resp.active_speed = attr.active_speed;
+ resp.phys_state = attr.phys_state;
+ resp.link_layer = attr.link_layer;
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp))
+ return -EFAULT;
+
+ return in_len;
+}
+
+ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
+ const char __user *buf,
+ int in_len, int out_len)
+{
+ struct ib_uverbs_alloc_pd cmd;
+ struct ib_uverbs_alloc_pd_resp resp;
+ struct ib_udata udata;
+ struct ib_uobject *uobj;
+ struct ib_pd *pd;
+ int ret;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ INIT_UDATA(&udata, buf + sizeof cmd,
+ (unsigned long) cmd.response + sizeof resp,
+ in_len - sizeof cmd, out_len - sizeof resp);
+
+ uobj = kmalloc(sizeof *uobj, GFP_KERNEL);
+ if (!uobj)
+ return -ENOMEM;
+
+ init_uobj(uobj, 0, file->ucontext, &pd_lock_key);
+ down_write(&uobj->mutex);
+
+ pd = file->device->ib_dev->alloc_pd(file->device->ib_dev,
+ file->ucontext, &udata);
+ if (IS_ERR(pd)) {
+ ret = PTR_ERR(pd);
+ goto err;
+ }
+
+ pd->device = file->device->ib_dev;
+ pd->uobject = uobj;
+ atomic_set(&pd->usecnt, 0);
+
+ uobj->object = pd;
+ ret = idr_add_uobj(&ib_uverbs_pd_idr, uobj);
+ if (ret)
+ goto err_idr;
+
+ memset(&resp, 0, sizeof resp);
+ resp.pd_handle = uobj->id;
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto err_copy;
+ }
+
+ mutex_lock(&file->mutex);
+ list_add_tail(&uobj->list, &file->ucontext->pd_list);
+ mutex_unlock(&file->mutex);
+
+ uobj->live = 1;
+
+ up_write(&uobj->mutex);
+
+ return in_len;
+
+err_copy:
+ idr_remove_uobj(&ib_uverbs_pd_idr, uobj);
+
+err_idr:
+ ib_dealloc_pd(pd);
+
+err:
+ put_uobj_write(uobj);
+ return ret;
+}
+
+ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
+ const char __user *buf,
+ int in_len, int out_len)
+{
+ struct ib_uverbs_dealloc_pd cmd;
+ struct ib_uobject *uobj;
+ int ret;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ uobj = idr_write_uobj(&ib_uverbs_pd_idr, cmd.pd_handle, file->ucontext);
+ if (!uobj)
+ return -EINVAL;
+
+ ret = ib_dealloc_pd(uobj->object);
+ if (!ret)
+ uobj->live = 0;
+
+ put_uobj_write(uobj);
+
+ if (ret)
+ return ret;
+
+ idr_remove_uobj(&ib_uverbs_pd_idr, uobj);
+
+ mutex_lock(&file->mutex);
+ list_del(&uobj->list);
+ mutex_unlock(&file->mutex);
+
+ put_uobj(uobj);
+
+ return in_len;
+}
+
+ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_reg_mr cmd;
+ struct ib_uverbs_reg_mr_resp resp;
+ struct ib_udata udata;
+ struct ib_uobject *uobj;
+ struct ib_pd *pd;
+ struct ib_mr *mr;
+ int ret;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ INIT_UDATA(&udata, buf + sizeof cmd,
+ (unsigned long) cmd.response + sizeof resp,
+ in_len - sizeof cmd, out_len - sizeof resp);
+
+ if ((cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK))
+ return -EINVAL;
+
+ /*
+ * Local write permission is required if remote write or
+ * remote atomic permission is also requested.
+ */
+ if (cmd.access_flags & (IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_REMOTE_WRITE) &&
+ !(cmd.access_flags & IB_ACCESS_LOCAL_WRITE))
+ return -EINVAL;
+
+ uobj = kmalloc(sizeof *uobj, GFP_KERNEL);
+ if (!uobj)
+ return -ENOMEM;
+
+ init_uobj(uobj, 0, file->ucontext, &mr_lock_key);
+ down_write(&uobj->mutex);
+
+ pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+ if (!pd) {
+ ret = -EINVAL;
+ goto err_free;
+ }
+
+ mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va,
+ cmd.access_flags, &udata);
+ if (IS_ERR(mr)) {
+ ret = PTR_ERR(mr);
+ goto err_put;
+ }
+
+ mr->device = pd->device;
+ mr->pd = pd;
+ mr->uobject = uobj;
+ atomic_inc(&pd->usecnt);
+ atomic_set(&mr->usecnt, 0);
+
+ uobj->object = mr;
+ ret = idr_add_uobj(&ib_uverbs_mr_idr, uobj);
+ if (ret)
+ goto err_unreg;
+
+ memset(&resp, 0, sizeof resp);
+ resp.lkey = mr->lkey;
+ resp.rkey = mr->rkey;
+ resp.mr_handle = uobj->id;
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto err_copy;
+ }
+
+ put_pd_read(pd);
+
+ mutex_lock(&file->mutex);
+ list_add_tail(&uobj->list, &file->ucontext->mr_list);
+ mutex_unlock(&file->mutex);
+
+ uobj->live = 1;
+
+ up_write(&uobj->mutex);
+
+ return in_len;
+
+err_copy:
+ idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
+
+err_unreg:
+ ib_dereg_mr(mr);
+
+err_put:
+ put_pd_read(pd);
+
+err_free:
+ put_uobj_write(uobj);
+ return ret;
+}
+
+ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_dereg_mr cmd;
+ struct ib_mr *mr;
+ struct ib_uobject *uobj;
+ int ret = -EINVAL;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ uobj = idr_write_uobj(&ib_uverbs_mr_idr, cmd.mr_handle, file->ucontext);
+ if (!uobj)
+ return -EINVAL;
+
+ mr = uobj->object;
+
+ ret = ib_dereg_mr(mr);
+ if (!ret)
+ uobj->live = 0;
+
+ put_uobj_write(uobj);
+
+ if (ret)
+ return ret;
+
+ idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
+
+ mutex_lock(&file->mutex);
+ list_del(&uobj->list);
+ mutex_unlock(&file->mutex);
+
+ put_uobj(uobj);
+
+ return in_len;
+}
+
+ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_create_comp_channel cmd;
+ struct ib_uverbs_create_comp_channel_resp resp;
+ struct file *filp;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ filp = ib_uverbs_alloc_event_file(file, 0, &resp.fd);
+ if (IS_ERR(filp))
+ return PTR_ERR(filp);
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp)) {
+ put_unused_fd(resp.fd);
+ fput(filp);
+ return -EFAULT;
+ }
+
+ fd_install(resp.fd, filp);
+ return in_len;
+}
+
+ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_create_cq cmd;
+ struct ib_uverbs_create_cq_resp resp;
+ struct ib_udata udata;
+ struct ib_ucq_object *obj;
+ struct ib_uverbs_event_file *ev_file = NULL;
+ struct ib_cq *cq;
+ int ret;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ INIT_UDATA(&udata, buf + sizeof cmd,
+ (unsigned long) cmd.response + sizeof resp,
+ in_len - sizeof cmd, out_len - sizeof resp);
+
+ if (cmd.comp_vector >= file->device->num_comp_vectors)
+ return -EINVAL;
+
+ obj = kmalloc(sizeof *obj, GFP_KERNEL);
+ if (!obj)
+ return -ENOMEM;
+
+ init_uobj(&obj->uobject, cmd.user_handle, file->ucontext, &cq_lock_key);
+ down_write(&obj->uobject.mutex);
+
+ if (cmd.comp_channel >= 0) {
+ ev_file = ib_uverbs_lookup_comp_file(cmd.comp_channel);
+ if (!ev_file) {
+ ret = -EINVAL;
+ goto err;
+ }
+ }
+
+ obj->uverbs_file = file;
+ obj->comp_events_reported = 0;
+ obj->async_events_reported = 0;
+ INIT_LIST_HEAD(&obj->comp_list);
+ INIT_LIST_HEAD(&obj->async_list);
+
+ cq = file->device->ib_dev->create_cq(file->device->ib_dev, cmd.cqe,
+ cmd.comp_vector,
+ file->ucontext, &udata);
+ if (IS_ERR(cq)) {
+ ret = PTR_ERR(cq);
+ goto err_file;
+ }
+
+ cq->device = file->device->ib_dev;
+ cq->uobject = &obj->uobject;
+ cq->comp_handler = ib_uverbs_comp_handler;
+ cq->event_handler = ib_uverbs_cq_event_handler;
+ cq->cq_context = ev_file;
+ atomic_set(&cq->usecnt, 0);
+
+ obj->uobject.object = cq;
+ ret = idr_add_uobj(&ib_uverbs_cq_idr, &obj->uobject);
+ if (ret)
+ goto err_free;
+
+ memset(&resp, 0, sizeof resp);
+ resp.cq_handle = obj->uobject.id;
+ resp.cqe = cq->cqe;
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto err_copy;
+ }
+
+ mutex_lock(&file->mutex);
+ list_add_tail(&obj->uobject.list, &file->ucontext->cq_list);
+ mutex_unlock(&file->mutex);
+
+ obj->uobject.live = 1;
+
+ up_write(&obj->uobject.mutex);
+
+ return in_len;
+
+err_copy:
+ idr_remove_uobj(&ib_uverbs_cq_idr, &obj->uobject);
+
+err_free:
+ ib_destroy_cq(cq);
+
+err_file:
+ if (ev_file)
+ ib_uverbs_release_ucq(file, ev_file, obj);
+
+err:
+ put_uobj_write(&obj->uobject);
+ return ret;
+}
+
+ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_resize_cq cmd;
+ struct ib_uverbs_resize_cq_resp resp;
+ struct ib_udata udata;
+ struct ib_cq *cq;
+ int ret = -EINVAL;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ INIT_UDATA(&udata, buf + sizeof cmd,
+ (unsigned long) cmd.response + sizeof resp,
+ in_len - sizeof cmd, out_len - sizeof resp);
+
+ cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0);
+ if (!cq)
+ return -EINVAL;
+
+ ret = cq->device->resize_cq(cq, cmd.cqe, &udata);
+ if (ret)
+ goto out;
+
+ resp.cqe = cq->cqe;
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp.cqe))
+ ret = -EFAULT;
+
+out:
+ put_cq_read(cq);
+
+ return ret ? ret : in_len;
+}
+
+ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_poll_cq cmd;
+ struct ib_uverbs_poll_cq_resp *resp;
+ struct ib_cq *cq;
+ struct ib_wc *wc;
+ int ret = 0;
+ int i;
+ int rsize;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ wc = kmalloc(cmd.ne * sizeof *wc, GFP_KERNEL);
+ if (!wc)
+ return -ENOMEM;
+
+ rsize = sizeof *resp + cmd.ne * sizeof(struct ib_uverbs_wc);
+ resp = kmalloc(rsize, GFP_KERNEL);
+ if (!resp) {
+ ret = -ENOMEM;
+ goto out_wc;
+ }
+
+ cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0);
+ if (!cq) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ resp->count = ib_poll_cq(cq, cmd.ne, wc);
+
+ put_cq_read(cq);
+
+ for (i = 0; i < resp->count; i++) {
+ resp->wc[i].wr_id = wc[i].wr_id;
+ resp->wc[i].status = wc[i].status;
+ resp->wc[i].opcode = wc[i].opcode;
+ resp->wc[i].vendor_err = wc[i].vendor_err;
+ resp->wc[i].byte_len = wc[i].byte_len;
+ resp->wc[i].ex.imm_data = (__u32 __force) wc[i].ex.imm_data;
+ resp->wc[i].qp_num = wc[i].qp->qp_num;
+ resp->wc[i].src_qp = wc[i].src_qp;
+ resp->wc[i].wc_flags = wc[i].wc_flags;
+ resp->wc[i].pkey_index = wc[i].pkey_index;
+ resp->wc[i].slid = wc[i].slid;
+ resp->wc[i].sl = wc[i].sl;
+ resp->wc[i].dlid_path_bits = wc[i].dlid_path_bits;
+ resp->wc[i].port_num = wc[i].port_num;
+ }
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response, resp, rsize))
+ ret = -EFAULT;
+
+out:
+ kfree(resp);
+
+out_wc:
+ kfree(wc);
+ return ret ? ret : in_len;
+}
+
+ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_req_notify_cq cmd;
+ struct ib_cq *cq;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0);
+ if (!cq)
+ return -EINVAL;
+
+ ib_req_notify_cq(cq, cmd.solicited_only ?
+ IB_CQ_SOLICITED : IB_CQ_NEXT_COMP);
+
+ put_cq_read(cq);
+
+ return in_len;
+}
+
+ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_destroy_cq cmd;
+ struct ib_uverbs_destroy_cq_resp resp;
+ struct ib_uobject *uobj;
+ struct ib_cq *cq;
+ struct ib_ucq_object *obj;
+ struct ib_uverbs_event_file *ev_file;
+ int ret = -EINVAL;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ uobj = idr_write_uobj(&ib_uverbs_cq_idr, cmd.cq_handle, file->ucontext);
+ if (!uobj)
+ return -EINVAL;
+ cq = uobj->object;
+ ev_file = cq->cq_context;
+ obj = container_of(cq->uobject, struct ib_ucq_object, uobject);
+
+ ret = ib_destroy_cq(cq);
+ if (!ret)
+ uobj->live = 0;
+
+ put_uobj_write(uobj);
+
+ if (ret)
+ return ret;
+
+ idr_remove_uobj(&ib_uverbs_cq_idr, uobj);
+
+ mutex_lock(&file->mutex);
+ list_del(&uobj->list);
+ mutex_unlock(&file->mutex);
+
+ ib_uverbs_release_ucq(file, ev_file, obj);
+
+ memset(&resp, 0, sizeof resp);
+ resp.comp_events_reported = obj->comp_events_reported;
+ resp.async_events_reported = obj->async_events_reported;
+
+ put_uobj(uobj);
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp))
+ return -EFAULT;
+
+ return in_len;
+}
+
+ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_create_qp cmd;
+ struct ib_uverbs_create_qp_resp resp;
+ struct ib_udata udata;
+ struct ib_uqp_object *obj;
+ struct ib_pd *pd;
+ struct ib_cq *scq, *rcq;
+ struct ib_srq *srq;
+ struct ib_qp *qp;
+ struct ib_qp_init_attr attr;
+ struct ib_xrcd *xrcd;
+ struct ib_uobject *xrcd_uobj;
+ int ret;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ INIT_UDATA(&udata, buf + sizeof cmd,
+ (unsigned long) cmd.response + sizeof resp,
+ in_len - sizeof cmd, out_len - sizeof resp);
+
+ obj = kmalloc(sizeof *obj, GFP_KERNEL);
+ if (!obj)
+ return -ENOMEM;
+
+ init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext, &qp_lock_key);
+ down_write(&obj->uevent.uobject.mutex);
+
+ srq = (cmd.is_srq && cmd.qp_type != IB_QPT_XRC) ?
+ idr_read_srq(cmd.srq_handle, file->ucontext) : NULL;
+ xrcd = cmd.qp_type == IB_QPT_XRC ?
+ idr_read_xrcd(cmd.srq_handle, file->ucontext, &xrcd_uobj) : NULL;
+ pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+ scq = idr_read_cq(cmd.send_cq_handle, file->ucontext, 0);
+ rcq = cmd.recv_cq_handle == cmd.send_cq_handle ?
+ scq : idr_read_cq(cmd.recv_cq_handle, file->ucontext, 1);
+
+ if (!pd || !scq || !rcq || (cmd.is_srq && !srq) ||
+ (cmd.qp_type == IB_QPT_XRC && !xrcd)) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+
+ attr.create_flags = 0;
+ attr.event_handler = ib_uverbs_qp_event_handler;
+ attr.qp_context = file;
+ attr.send_cq = scq;
+ attr.recv_cq = rcq;
+ attr.srq = srq;
+ attr.sq_sig_type = cmd.sq_sig_all ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR;
+ attr.qp_type = cmd.qp_type;
+ attr.xrc_domain = xrcd;
+ attr.create_flags = 0;
+
+ attr.cap.max_send_wr = cmd.max_send_wr;
+ attr.cap.max_recv_wr = cmd.max_recv_wr;
+ attr.cap.max_send_sge = cmd.max_send_sge;
+ attr.cap.max_recv_sge = cmd.max_recv_sge;
+ attr.cap.max_inline_data = cmd.max_inline_data;
+
+ obj->uevent.events_reported = 0;
+ INIT_LIST_HEAD(&obj->uevent.event_list);
+ INIT_LIST_HEAD(&obj->mcast_list);
+
+ qp = pd->device->create_qp(pd, &attr, &udata);
+ if (IS_ERR(qp)) {
+ ret = PTR_ERR(qp);
+ goto err_put;
+ }
+
+ qp->device = pd->device;
+ qp->pd = pd;
+ qp->send_cq = attr.send_cq;
+ qp->recv_cq = attr.recv_cq;
+ qp->srq = attr.srq;
+ qp->uobject = &obj->uevent.uobject;
+ qp->event_handler = attr.event_handler;
+ qp->qp_context = attr.qp_context;
+ qp->qp_type = attr.qp_type;
+ qp->xrcd = attr.xrc_domain;
+ atomic_inc(&pd->usecnt);
+ atomic_inc(&attr.send_cq->usecnt);
+ atomic_inc(&attr.recv_cq->usecnt);
+ if (attr.srq)
+ atomic_inc(&attr.srq->usecnt);
+ else if (attr.xrc_domain)
+ atomic_inc(&attr.xrc_domain->usecnt);
+
+ obj->uevent.uobject.object = qp;
+ ret = idr_add_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject);
+ if (ret)
+ goto err_destroy;
+
+ memset(&resp, 0, sizeof resp);
+ resp.qpn = qp->qp_num;
+ resp.qp_handle = obj->uevent.uobject.id;
+ resp.max_recv_sge = attr.cap.max_recv_sge;
+ resp.max_send_sge = attr.cap.max_send_sge;
+ resp.max_recv_wr = attr.cap.max_recv_wr;
+ resp.max_send_wr = attr.cap.max_send_wr;
+ resp.max_inline_data = attr.cap.max_inline_data;
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto err_copy;
+ }
+
+ put_pd_read(pd);
+ put_cq_read(scq);
+ if (rcq != scq)
+ put_cq_read(rcq);
+ if (srq)
+ put_srq_read(srq);
+ if (xrcd)
+ put_xrcd_read(xrcd_uobj);
+
+ mutex_lock(&file->mutex);
+ list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list);
+ mutex_unlock(&file->mutex);
+
+ obj->uevent.uobject.live = 1;
+
+ up_write(&obj->uevent.uobject.mutex);
+
+ return in_len;
+
+err_copy:
+ idr_remove_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject);
+
+err_destroy:
+ ib_destroy_qp(qp);
+
+err_put:
+ if (pd)
+ put_pd_read(pd);
+ if (scq)
+ put_cq_read(scq);
+ if (rcq && rcq != scq)
+ put_cq_read(rcq);
+ if (srq)
+ put_srq_read(srq);
+ if (xrcd)
+ put_xrcd_read(xrcd_uobj);
+
+ put_uobj_write(&obj->uevent.uobject);
+ return ret;
+}
+
+ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_query_qp cmd;
+ struct ib_uverbs_query_qp_resp resp;
+ struct ib_qp *qp;
+ struct ib_qp_attr *attr;
+ struct ib_qp_init_attr *init_attr;
+ int ret;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ attr = kmalloc(sizeof *attr, GFP_KERNEL);
+ init_attr = kmalloc(sizeof *init_attr, GFP_KERNEL);
+ if (!attr || !init_attr) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+ if (!qp) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = ib_query_qp(qp, attr, cmd.attr_mask, init_attr);
+
+ put_qp_read(qp);
+
+ if (ret)
+ goto out;
+
+ memset(&resp, 0, sizeof resp);
+
+ resp.qp_state = attr->qp_state;
+ resp.cur_qp_state = attr->cur_qp_state;
+ resp.path_mtu = attr->path_mtu;
+ resp.path_mig_state = attr->path_mig_state;
+ resp.qkey = attr->qkey;
+ resp.rq_psn = attr->rq_psn;
+ resp.sq_psn = attr->sq_psn;
+ resp.dest_qp_num = attr->dest_qp_num;
+ resp.qp_access_flags = attr->qp_access_flags;
+ resp.pkey_index = attr->pkey_index;
+ resp.alt_pkey_index = attr->alt_pkey_index;
+ resp.sq_draining = attr->sq_draining;
+ resp.max_rd_atomic = attr->max_rd_atomic;
+ resp.max_dest_rd_atomic = attr->max_dest_rd_atomic;
+ resp.min_rnr_timer = attr->min_rnr_timer;
+ resp.port_num = attr->port_num;
+ resp.timeout = attr->timeout;
+ resp.retry_cnt = attr->retry_cnt;
+ resp.rnr_retry = attr->rnr_retry;
+ resp.alt_port_num = attr->alt_port_num;
+ resp.alt_timeout = attr->alt_timeout;
+
+ memcpy(resp.dest.dgid, attr->ah_attr.grh.dgid.raw, 16);
+ resp.dest.flow_label = attr->ah_attr.grh.flow_label;
+ resp.dest.sgid_index = attr->ah_attr.grh.sgid_index;
+ resp.dest.hop_limit = attr->ah_attr.grh.hop_limit;
+ resp.dest.traffic_class = attr->ah_attr.grh.traffic_class;
+ resp.dest.dlid = attr->ah_attr.dlid;
+ resp.dest.sl = attr->ah_attr.sl;
+ resp.dest.src_path_bits = attr->ah_attr.src_path_bits;
+ resp.dest.static_rate = attr->ah_attr.static_rate;
+ resp.dest.is_global = !!(attr->ah_attr.ah_flags & IB_AH_GRH);
+ resp.dest.port_num = attr->ah_attr.port_num;
+
+ memcpy(resp.alt_dest.dgid, attr->alt_ah_attr.grh.dgid.raw, 16);
+ resp.alt_dest.flow_label = attr->alt_ah_attr.grh.flow_label;
+ resp.alt_dest.sgid_index = attr->alt_ah_attr.grh.sgid_index;
+ resp.alt_dest.hop_limit = attr->alt_ah_attr.grh.hop_limit;
+ resp.alt_dest.traffic_class = attr->alt_ah_attr.grh.traffic_class;
+ resp.alt_dest.dlid = attr->alt_ah_attr.dlid;
+ resp.alt_dest.sl = attr->alt_ah_attr.sl;
+ resp.alt_dest.src_path_bits = attr->alt_ah_attr.src_path_bits;
+ resp.alt_dest.static_rate = attr->alt_ah_attr.static_rate;
+ resp.alt_dest.is_global = !!(attr->alt_ah_attr.ah_flags & IB_AH_GRH);
+ resp.alt_dest.port_num = attr->alt_ah_attr.port_num;
+
+ resp.max_send_wr = init_attr->cap.max_send_wr;
+ resp.max_recv_wr = init_attr->cap.max_recv_wr;
+ resp.max_send_sge = init_attr->cap.max_send_sge;
+ resp.max_recv_sge = init_attr->cap.max_recv_sge;
+ resp.max_inline_data = init_attr->cap.max_inline_data;
+ resp.sq_sig_all = init_attr->sq_sig_type == IB_SIGNAL_ALL_WR;
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp))
+ ret = -EFAULT;
+
+out:
+ kfree(attr);
+ kfree(init_attr);
+
+ return ret ? ret : in_len;
+}
+
+ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_modify_qp cmd;
+ struct ib_udata udata;
+ struct ib_qp *qp;
+ struct ib_qp_attr *attr;
+ int ret;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ INIT_UDATA(&udata, buf + sizeof cmd, NULL, in_len - sizeof cmd,
+ out_len);
+
+ attr = kmalloc(sizeof *attr, GFP_KERNEL);
+ if (!attr)
+ return -ENOMEM;
+
+ qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+ if (!qp) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ attr->qp_state = cmd.qp_state;
+ attr->cur_qp_state = cmd.cur_qp_state;
+ attr->path_mtu = cmd.path_mtu;
+ attr->path_mig_state = cmd.path_mig_state;
+ attr->qkey = cmd.qkey;
+ attr->rq_psn = cmd.rq_psn;
+ attr->sq_psn = cmd.sq_psn;
+ attr->dest_qp_num = cmd.dest_qp_num;
+ attr->qp_access_flags = cmd.qp_access_flags;
+ attr->pkey_index = cmd.pkey_index;
+ attr->alt_pkey_index = cmd.alt_pkey_index;
+ attr->en_sqd_async_notify = cmd.en_sqd_async_notify;
+ attr->max_rd_atomic = cmd.max_rd_atomic;
+ attr->max_dest_rd_atomic = cmd.max_dest_rd_atomic;
+ attr->min_rnr_timer = cmd.min_rnr_timer;
+ attr->port_num = cmd.port_num;
+ attr->timeout = cmd.timeout;
+ attr->retry_cnt = cmd.retry_cnt;
+ attr->rnr_retry = cmd.rnr_retry;
+ attr->alt_port_num = cmd.alt_port_num;
+ attr->alt_timeout = cmd.alt_timeout;
+
+ memcpy(attr->ah_attr.grh.dgid.raw, cmd.dest.dgid, 16);
+ attr->ah_attr.grh.flow_label = cmd.dest.flow_label;
+ attr->ah_attr.grh.sgid_index = cmd.dest.sgid_index;
+ attr->ah_attr.grh.hop_limit = cmd.dest.hop_limit;
+ attr->ah_attr.grh.traffic_class = cmd.dest.traffic_class;
+ attr->ah_attr.dlid = cmd.dest.dlid;
+ attr->ah_attr.sl = cmd.dest.sl;
+ attr->ah_attr.src_path_bits = cmd.dest.src_path_bits;
+ attr->ah_attr.static_rate = cmd.dest.static_rate;
+ attr->ah_attr.ah_flags = cmd.dest.is_global ? IB_AH_GRH : 0;
+ attr->ah_attr.port_num = cmd.dest.port_num;
+
+ memcpy(attr->alt_ah_attr.grh.dgid.raw, cmd.alt_dest.dgid, 16);
+ attr->alt_ah_attr.grh.flow_label = cmd.alt_dest.flow_label;
+ attr->alt_ah_attr.grh.sgid_index = cmd.alt_dest.sgid_index;
+ attr->alt_ah_attr.grh.hop_limit = cmd.alt_dest.hop_limit;
+ attr->alt_ah_attr.grh.traffic_class = cmd.alt_dest.traffic_class;
+ attr->alt_ah_attr.dlid = cmd.alt_dest.dlid;
+ attr->alt_ah_attr.sl = cmd.alt_dest.sl;
+ attr->alt_ah_attr.src_path_bits = cmd.alt_dest.src_path_bits;
+ attr->alt_ah_attr.static_rate = cmd.alt_dest.static_rate;
+ attr->alt_ah_attr.ah_flags = cmd.alt_dest.is_global ? IB_AH_GRH : 0;
+ attr->alt_ah_attr.port_num = cmd.alt_dest.port_num;
+
+ ret = qp->device->modify_qp(qp, attr, cmd.attr_mask, &udata);
+
+ put_qp_read(qp);
+
+ if (ret)
+ goto out;
+
+ ret = in_len;
+
+out:
+ kfree(attr);
+
+ return ret;
+}
+
+ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_destroy_qp cmd;
+ struct ib_uverbs_destroy_qp_resp resp;
+ struct ib_uobject *uobj;
+ struct ib_qp *qp;
+ struct ib_uqp_object *obj;
+ int ret = -EINVAL;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ memset(&resp, 0, sizeof resp);
+
+ uobj = idr_write_uobj(&ib_uverbs_qp_idr, cmd.qp_handle, file->ucontext);
+ if (!uobj)
+ return -EINVAL;
+ qp = uobj->object;
+ obj = container_of(uobj, struct ib_uqp_object, uevent.uobject);
+
+ if (!list_empty(&obj->mcast_list)) {
+ put_uobj_write(uobj);
+ return -EBUSY;
+ }
+
+ ret = ib_destroy_qp(qp);
+ if (!ret)
+ uobj->live = 0;
+
+ put_uobj_write(uobj);
+
+ if (ret)
+ return ret;
+
+ idr_remove_uobj(&ib_uverbs_qp_idr, uobj);
+
+ mutex_lock(&file->mutex);
+ list_del(&uobj->list);
+ mutex_unlock(&file->mutex);
+
+ ib_uverbs_release_uevent(file, &obj->uevent);
+
+ resp.events_reported = obj->uevent.events_reported;
+
+ put_uobj(uobj);
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp))
+ return -EFAULT;
+
+ return in_len;
+}
+
+ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_post_send cmd;
+ struct ib_uverbs_post_send_resp resp;
+ struct ib_uverbs_send_wr *user_wr;
+ struct ib_send_wr *wr = NULL, *last, *next, *bad_wr;
+ struct ib_qp *qp;
+ int i, sg_ind;
+ int is_ud;
+ ssize_t ret = -EINVAL;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ if (in_len < sizeof cmd + cmd.wqe_size * cmd.wr_count +
+ cmd.sge_count * sizeof (struct ib_uverbs_sge))
+ return -EINVAL;
+
+ if (cmd.wqe_size < sizeof (struct ib_uverbs_send_wr))
+ return -EINVAL;
+
+ user_wr = kmalloc(cmd.wqe_size, GFP_KERNEL);
+ if (!user_wr)
+ return -ENOMEM;
+
+ qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+ if (!qp)
+ goto out;
+
+ is_ud = qp->qp_type == IB_QPT_UD;
+ sg_ind = 0;
+ last = NULL;
+ for (i = 0; i < cmd.wr_count; ++i) {
+ if (copy_from_user(user_wr,
+ buf + sizeof cmd + i * cmd.wqe_size,
+ cmd.wqe_size)) {
+ ret = -EFAULT;
+ goto out_put;
+ }
+
+ if (user_wr->num_sge + sg_ind > cmd.sge_count) {
+ ret = -EINVAL;
+ goto out_put;
+ }
+
+ next = kmalloc(ALIGN(sizeof *next, sizeof (struct ib_sge)) +
+ user_wr->num_sge * sizeof (struct ib_sge),
+ GFP_KERNEL);
+ if (!next) {
+ ret = -ENOMEM;
+ goto out_put;
+ }
+
+ if (!last)
+ wr = next;
+ else
+ last->next = next;
+ last = next;
+
+ next->next = NULL;
+ next->wr_id = user_wr->wr_id;
+ next->num_sge = user_wr->num_sge;
+ next->opcode = user_wr->opcode;
+ next->send_flags = user_wr->send_flags;
+
+ if (is_ud) {
+ next->wr.ud.ah = idr_read_ah(user_wr->wr.ud.ah,
+ file->ucontext);
+ if (!next->wr.ud.ah) {
+ ret = -EINVAL;
+ goto out_put;
+ }
+ next->wr.ud.remote_qpn = user_wr->wr.ud.remote_qpn;
+ next->wr.ud.remote_qkey = user_wr->wr.ud.remote_qkey;
+ } else {
+ switch (next->opcode) {
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ next->ex.imm_data =
+ (__be32 __force) user_wr->ex.imm_data;
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_READ:
+ next->wr.rdma.remote_addr =
+ user_wr->wr.rdma.remote_addr;
+ next->wr.rdma.rkey =
+ user_wr->wr.rdma.rkey;
+ break;
+ case IB_WR_SEND_WITH_IMM:
+ next->ex.imm_data =
+ (__be32 __force) user_wr->ex.imm_data;
+ break;
+ case IB_WR_SEND_WITH_INV:
+ next->ex.invalidate_rkey =
+ user_wr->ex.invalidate_rkey;
+ break;
+ case IB_WR_ATOMIC_CMP_AND_SWP:
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
+ next->wr.atomic.remote_addr =
+ user_wr->wr.atomic.remote_addr;
+ next->wr.atomic.compare_add =
+ user_wr->wr.atomic.compare_add;
+ next->wr.atomic.swap = user_wr->wr.atomic.swap;
+ next->wr.atomic.rkey = user_wr->wr.atomic.rkey;
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (next->num_sge) {
+ next->sg_list = (void *) next +
+ ALIGN(sizeof *next, sizeof (struct ib_sge));
+ if (copy_from_user(next->sg_list,
+ buf + sizeof cmd +
+ cmd.wr_count * cmd.wqe_size +
+ sg_ind * sizeof (struct ib_sge),
+ next->num_sge * sizeof (struct ib_sge))) {
+ ret = -EFAULT;
+ goto out_put;
+ }
+ sg_ind += next->num_sge;
+ } else
+ next->sg_list = NULL;
+ }
+
+ resp.bad_wr = 0;
+ ret = qp->device->post_send(qp, wr, &bad_wr);
+ if (ret)
+ for (next = wr; next; next = next->next) {
+ ++resp.bad_wr;
+ if (next == bad_wr)
+ break;
+ }
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp))
+ ret = -EFAULT;
+
+out_put:
+ put_qp_read(qp);
+
+ while (wr) {
+ if (is_ud && wr->wr.ud.ah)
+ put_ah_read(wr->wr.ud.ah);
+ next = wr->next;
+ kfree(wr);
+ wr = next;
+ }
+
+out:
+ kfree(user_wr);
+
+ return ret ? ret : in_len;
+}
+
+static struct ib_recv_wr *ib_uverbs_unmarshall_recv(const char __user *buf,
+ int in_len,
+ u32 wr_count,
+ u32 sge_count,
+ u32 wqe_size)
+{
+ struct ib_uverbs_recv_wr *user_wr;
+ struct ib_recv_wr *wr = NULL, *last, *next;
+ int sg_ind;
+ int i;
+ int ret;
+
+ if (in_len < wqe_size * wr_count +
+ sge_count * sizeof (struct ib_uverbs_sge))
+ return ERR_PTR(-EINVAL);
+
+ if (wqe_size < sizeof (struct ib_uverbs_recv_wr))
+ return ERR_PTR(-EINVAL);
+
+ user_wr = kmalloc(wqe_size, GFP_KERNEL);
+ if (!user_wr)
+ return ERR_PTR(-ENOMEM);
+
+ sg_ind = 0;
+ last = NULL;
+ for (i = 0; i < wr_count; ++i) {
+ if (copy_from_user(user_wr, buf + i * wqe_size,
+ wqe_size)) {
+ ret = -EFAULT;
+ goto err;
+ }
+
+ if (user_wr->num_sge + sg_ind > sge_count) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ next = kmalloc(ALIGN(sizeof *next, sizeof (struct ib_sge)) +
+ user_wr->num_sge * sizeof (struct ib_sge),
+ GFP_KERNEL);
+ if (!next) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ if (!last)
+ wr = next;
+ else
+ last->next = next;
+ last = next;
+
+ next->next = NULL;
+ next->wr_id = user_wr->wr_id;
+ next->num_sge = user_wr->num_sge;
+
+ if (next->num_sge) {
+ next->sg_list = (void *) next +
+ ALIGN(sizeof *next, sizeof (struct ib_sge));
+ if (copy_from_user(next->sg_list,
+ buf + wr_count * wqe_size +
+ sg_ind * sizeof (struct ib_sge),
+ next->num_sge * sizeof (struct ib_sge))) {
+ ret = -EFAULT;
+ goto err;
+ }
+ sg_ind += next->num_sge;
+ } else
+ next->sg_list = NULL;
+ }
+
+ kfree(user_wr);
+ return wr;
+
+err:
+ kfree(user_wr);
+
+ while (wr) {
+ next = wr->next;
+ kfree(wr);
+ wr = next;
+ }
+
+ return ERR_PTR(ret);
+}
+
+ssize_t ib_uverbs_post_recv(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_post_recv cmd;
+ struct ib_uverbs_post_recv_resp resp;
+ struct ib_recv_wr *wr, *next, *bad_wr;
+ struct ib_qp *qp;
+ ssize_t ret = -EINVAL;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ wr = ib_uverbs_unmarshall_recv(buf + sizeof cmd,
+ in_len - sizeof cmd, cmd.wr_count,
+ cmd.sge_count, cmd.wqe_size);
+ if (IS_ERR(wr))
+ return PTR_ERR(wr);
+
+ qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+ if (!qp)
+ goto out;
+
+ resp.bad_wr = 0;
+ ret = qp->device->post_recv(qp, wr, &bad_wr);
+
+ put_qp_read(qp);
+
+ if (ret)
+ for (next = wr; next; next = next->next) {
+ ++resp.bad_wr;
+ if (next == bad_wr)
+ break;
+ }
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp))
+ ret = -EFAULT;
+
+out:
+ while (wr) {
+ next = wr->next;
+ kfree(wr);
+ wr = next;
+ }
+
+ return ret ? ret : in_len;
+}
+
+ssize_t ib_uverbs_post_srq_recv(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_post_srq_recv cmd;
+ struct ib_uverbs_post_srq_recv_resp resp;
+ struct ib_recv_wr *wr, *next, *bad_wr;
+ struct ib_srq *srq;
+ ssize_t ret = -EINVAL;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ wr = ib_uverbs_unmarshall_recv(buf + sizeof cmd,
+ in_len - sizeof cmd, cmd.wr_count,
+ cmd.sge_count, cmd.wqe_size);
+ if (IS_ERR(wr))
+ return PTR_ERR(wr);
+
+ srq = idr_read_srq(cmd.srq_handle, file->ucontext);
+ if (!srq)
+ goto out;
+
+ resp.bad_wr = 0;
+ ret = srq->device->post_srq_recv(srq, wr, &bad_wr);
+
+ put_srq_read(srq);
+
+ if (ret)
+ for (next = wr; next; next = next->next) {
+ ++resp.bad_wr;
+ if (next == bad_wr)
+ break;
+ }
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp))
+ ret = -EFAULT;
+
+out:
+ while (wr) {
+ next = wr->next;
+ kfree(wr);
+ wr = next;
+ }
+
+ return ret ? ret : in_len;
+}
+
+ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_create_ah cmd;
+ struct ib_uverbs_create_ah_resp resp;
+ struct ib_uobject *uobj;
+ struct ib_pd *pd;
+ struct ib_ah *ah;
+ struct ib_ah_attr attr;
+ int ret;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ uobj = kmalloc(sizeof *uobj, GFP_KERNEL);
+ if (!uobj)
+ return -ENOMEM;
+
+ init_uobj(uobj, cmd.user_handle, file->ucontext, &ah_lock_key);
+ down_write(&uobj->mutex);
+
+ pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+ if (!pd) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ attr.dlid = cmd.attr.dlid;
+ attr.sl = cmd.attr.sl;
+ attr.src_path_bits = cmd.attr.src_path_bits;
+ attr.static_rate = cmd.attr.static_rate;
+ attr.ah_flags = cmd.attr.is_global ? IB_AH_GRH : 0;
+ attr.port_num = cmd.attr.port_num;
+ attr.grh.flow_label = cmd.attr.grh.flow_label;
+ attr.grh.sgid_index = cmd.attr.grh.sgid_index;
+ attr.grh.hop_limit = cmd.attr.grh.hop_limit;
+ attr.grh.traffic_class = cmd.attr.grh.traffic_class;
+ memcpy(attr.grh.dgid.raw, cmd.attr.grh.dgid, 16);
+
+ ah = ib_create_ah(pd, &attr);
+ if (IS_ERR(ah)) {
+ ret = PTR_ERR(ah);
+ goto err_put;
+ }
+
+ ah->uobject = uobj;
+ uobj->object = ah;
+
+ ret = idr_add_uobj(&ib_uverbs_ah_idr, uobj);
+ if (ret)
+ goto err_destroy;
+
+ resp.ah_handle = uobj->id;
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto err_copy;
+ }
+
+ put_pd_read(pd);
+
+ mutex_lock(&file->mutex);
+ list_add_tail(&uobj->list, &file->ucontext->ah_list);
+ mutex_unlock(&file->mutex);
+
+ uobj->live = 1;
+
+ up_write(&uobj->mutex);
+
+ return in_len;
+
+err_copy:
+ idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
+
+err_destroy:
+ ib_destroy_ah(ah);
+
+err_put:
+ put_pd_read(pd);
+
+err:
+ put_uobj_write(uobj);
+ return ret;
+}
+
+ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len, int out_len)
+{
+ struct ib_uverbs_destroy_ah cmd;
+ struct ib_ah *ah;
+ struct ib_uobject *uobj;
+ int ret;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ uobj = idr_write_uobj(&ib_uverbs_ah_idr, cmd.ah_handle, file->ucontext);
+ if (!uobj)
+ return -EINVAL;
+ ah = uobj->object;
+
+ ret = ib_destroy_ah(ah);
+ if (!ret)
+ uobj->live = 0;
+
+ put_uobj_write(uobj);
+
+ if (ret)
+ return ret;
+
+ idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
+
+ mutex_lock(&file->mutex);
+ list_del(&uobj->list);
+ mutex_unlock(&file->mutex);
+
+ put_uobj(uobj);
+
+ return in_len;
+}
+
+ssize_t ib_uverbs_attach_mcast(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_attach_mcast cmd;
+ struct ib_qp *qp;
+ struct ib_uqp_object *obj;
+ struct ib_uverbs_mcast_entry *mcast;
+ int ret;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+ if (!qp)
+ return -EINVAL;
+
+ obj = container_of(qp->uobject, struct ib_uqp_object, uevent.uobject);
+
+ list_for_each_entry(mcast, &obj->mcast_list, list)
+ if (cmd.mlid == mcast->lid &&
+ !memcmp(cmd.gid, mcast->gid.raw, sizeof mcast->gid.raw)) {
+ ret = 0;
+ goto out_put;
+ }
+
+ mcast = kmalloc(sizeof *mcast, GFP_KERNEL);
+ if (!mcast) {
+ ret = -ENOMEM;
+ goto out_put;
+ }
+
+ mcast->lid = cmd.mlid;
+ memcpy(mcast->gid.raw, cmd.gid, sizeof mcast->gid.raw);
+
+ ret = ib_attach_mcast(qp, &mcast->gid, cmd.mlid);
+ if (!ret)
+ list_add_tail(&mcast->list, &obj->mcast_list);
+ else
+ kfree(mcast);
+
+out_put:
+ put_qp_read(qp);
+
+ return ret ? ret : in_len;
+}
+
+ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_detach_mcast cmd;
+ struct ib_uqp_object *obj;
+ struct ib_qp *qp;
+ struct ib_uverbs_mcast_entry *mcast;
+ int ret = -EINVAL;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+ if (!qp)
+ return -EINVAL;
+
+ ret = ib_detach_mcast(qp, (union ib_gid *) cmd.gid, cmd.mlid);
+ if (ret)
+ goto out_put;
+
+ obj = container_of(qp->uobject, struct ib_uqp_object, uevent.uobject);
+
+ list_for_each_entry(mcast, &obj->mcast_list, list)
+ if (cmd.mlid == mcast->lid &&
+ !memcmp(cmd.gid, mcast->gid.raw, sizeof mcast->gid.raw)) {
+ list_del(&mcast->list);
+ kfree(mcast);
+ break;
+ }
+
+out_put:
+ put_qp_read(qp);
+
+ return ret ? ret : in_len;
+}
+
+ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_create_srq cmd;
+ struct ib_uverbs_create_srq_resp resp;
+ struct ib_udata udata;
+ struct ib_uevent_object *obj;
+ struct ib_pd *pd;
+ struct ib_srq *srq;
+ struct ib_srq_init_attr attr;
+ int ret;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ INIT_UDATA(&udata, buf + sizeof cmd,
+ (unsigned long) cmd.response + sizeof resp,
+ in_len - sizeof cmd, out_len - sizeof resp);
+
+ obj = kmalloc(sizeof *obj, GFP_KERNEL);
+ if (!obj)
+ return -ENOMEM;
+
+ init_uobj(&obj->uobject, cmd.user_handle, file->ucontext, &srq_lock_key);
+ down_write(&obj->uobject.mutex);
+
+ pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+ if (!pd) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ attr.event_handler = ib_uverbs_srq_event_handler;
+ attr.srq_context = file;
+ attr.attr.max_wr = cmd.max_wr;
+ attr.attr.max_sge = cmd.max_sge;
+ attr.attr.srq_limit = cmd.srq_limit;
+
+ obj->events_reported = 0;
+ INIT_LIST_HEAD(&obj->event_list);
+
+ srq = pd->device->create_srq(pd, &attr, &udata);
+ if (IS_ERR(srq)) {
+ ret = PTR_ERR(srq);
+ goto err_put;
+ }
+
+ srq->device = pd->device;
+ srq->pd = pd;
+ srq->uobject = &obj->uobject;
+ srq->event_handler = attr.event_handler;
+ srq->srq_context = attr.srq_context;
+ srq->xrc_cq = NULL;
+ srq->xrcd = NULL;
+ atomic_inc(&pd->usecnt);
+ atomic_set(&srq->usecnt, 0);
+
+ obj->uobject.object = srq;
+ ret = idr_add_uobj(&ib_uverbs_srq_idr, &obj->uobject);
+ if (ret)
+ goto err_destroy;
+
+ memset(&resp, 0, sizeof resp);
+ resp.srq_handle = obj->uobject.id;
+ resp.max_wr = attr.attr.max_wr;
+ resp.max_sge = attr.attr.max_sge;
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto err_copy;
+ }
+
+ put_pd_read(pd);
+
+ mutex_lock(&file->mutex);
+ list_add_tail(&obj->uobject.list, &file->ucontext->srq_list);
+ mutex_unlock(&file->mutex);
+
+ obj->uobject.live = 1;
+
+ up_write(&obj->uobject.mutex);
+
+ return in_len;
+
+err_copy:
+ idr_remove_uobj(&ib_uverbs_srq_idr, &obj->uobject);
+
+err_destroy:
+ ib_destroy_srq(srq);
+
+err_put:
+ put_pd_read(pd);
+
+err:
+ put_uobj_write(&obj->uobject);
+ return ret;
+}
+
+ssize_t ib_uverbs_create_xrc_srq(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_create_xrc_srq cmd;
+ struct ib_uverbs_create_srq_resp resp;
+ struct ib_udata udata;
+ struct ib_uevent_object *obj;
+ struct ib_pd *pd;
+ struct ib_srq *srq;
+ struct ib_cq *xrc_cq;
+ struct ib_xrcd *xrcd;
+ struct ib_srq_init_attr attr;
+ struct ib_uobject *xrcd_uobj;
+ int ret;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ INIT_UDATA(&udata, buf + sizeof cmd,
+ (unsigned long) cmd.response + sizeof resp,
+ in_len - sizeof cmd, out_len - sizeof resp);
+
+ obj = kmalloc(sizeof *obj, GFP_KERNEL);
+ if (!obj)
+ return -ENOMEM;
+
+ init_uobj(&obj->uobject, cmd.user_handle, file->ucontext,
+ &srq_lock_key);
+ down_write(&obj->uobject.mutex);
+
+ pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+ if (!pd) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ xrc_cq = idr_read_cq(cmd.xrc_cq, file->ucontext, 0);
+ if (!xrc_cq) {
+ ret = -EINVAL;
+ goto err_put_pd;
+ }
+
+ xrcd = idr_read_xrcd(cmd.xrcd_handle, file->ucontext, &xrcd_uobj);
+ if (!xrcd) {
+ ret = -EINVAL;
+ goto err_put_cq;
+ }
+
+
+ attr.event_handler = ib_uverbs_srq_event_handler;
+ attr.srq_context = file;
+ attr.attr.max_wr = cmd.max_wr;
+ attr.attr.max_sge = cmd.max_sge;
+ attr.attr.srq_limit = cmd.srq_limit;
+
+ obj->events_reported = 0;
+ INIT_LIST_HEAD(&obj->event_list);
+
+ srq = pd->device->create_xrc_srq(pd, xrc_cq, xrcd, &attr, &udata);
+ if (IS_ERR(srq)) {
+ ret = PTR_ERR(srq);
+ goto err_put;
+ }
+
+ srq->device = pd->device;
+ srq->pd = pd;
+ srq->uobject = &obj->uobject;
+ srq->event_handler = attr.event_handler;
+ srq->srq_context = attr.srq_context;
+ srq->xrc_cq = xrc_cq;
+ srq->xrcd = xrcd;
+ atomic_inc(&pd->usecnt);
+ atomic_inc(&xrc_cq->usecnt);
+ atomic_inc(&xrcd->usecnt);
+
+ atomic_set(&srq->usecnt, 0);
+
+ obj->uobject.object = srq;
+ ret = idr_add_uobj(&ib_uverbs_srq_idr, &obj->uobject);
+ if (ret)
+ goto err_destroy;
+
+ memset(&resp, 0, sizeof resp);
+ resp.srq_handle = obj->uobject.id;
+ resp.max_wr = attr.attr.max_wr;
+ resp.max_sge = attr.attr.max_sge;
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto err_copy;
+ }
+
+ put_xrcd_read(xrcd_uobj);
+ put_cq_read(xrc_cq);
+ put_pd_read(pd);
+
+ mutex_lock(&file->mutex);
+ list_add_tail(&obj->uobject.list, &file->ucontext->srq_list);
+ mutex_unlock(&file->mutex);
+
+ obj->uobject.live = 1;
+
+ up_write(&obj->uobject.mutex);
+
+ return in_len;
+
+err_copy:
+ idr_remove_uobj(&ib_uverbs_srq_idr, &obj->uobject);
+
+err_destroy:
+ ib_destroy_srq(srq);
+
+err_put:
+ put_xrcd_read(xrcd_uobj);
+
+err_put_cq:
+ put_cq_read(xrc_cq);
+
+err_put_pd:
+ put_pd_read(pd);
+
+err:
+ put_uobj_write(&obj->uobject);
+ return ret;
+}
+
+ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_modify_srq cmd;
+ struct ib_udata udata;
+ struct ib_srq *srq;
+ struct ib_srq_attr attr;
+ int ret;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ INIT_UDATA(&udata, buf + sizeof cmd, NULL, in_len - sizeof cmd,
+ out_len);
+
+ srq = idr_read_srq(cmd.srq_handle, file->ucontext);
+ if (!srq)
+ return -EINVAL;
+
+ attr.max_wr = cmd.max_wr;
+ attr.srq_limit = cmd.srq_limit;
+
+ ret = srq->device->modify_srq(srq, &attr, cmd.attr_mask, &udata);
+
+ put_srq_read(srq);
+
+ return ret ? ret : in_len;
+}
+
+ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file,
+ const char __user *buf,
+ int in_len, int out_len)
+{
+ struct ib_uverbs_query_srq cmd;
+ struct ib_uverbs_query_srq_resp resp;
+ struct ib_srq_attr attr;
+ struct ib_srq *srq;
+ int ret;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ srq = idr_read_srq(cmd.srq_handle, file->ucontext);
+ if (!srq)
+ return -EINVAL;
+
+ ret = ib_query_srq(srq, &attr);
+
+ put_srq_read(srq);
+
+ if (ret)
+ return ret;
+
+ memset(&resp, 0, sizeof resp);
+
+ resp.max_wr = attr.max_wr;
+ resp.max_sge = attr.max_sge;
+ resp.srq_limit = attr.srq_limit;
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp))
+ return -EFAULT;
+
+ return in_len;
+}
+
+ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_destroy_srq cmd;
+ struct ib_uverbs_destroy_srq_resp resp;
+ struct ib_uobject *uobj;
+ struct ib_srq *srq;
+ struct ib_uevent_object *obj;
+ int ret = -EINVAL;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ uobj = idr_write_uobj(&ib_uverbs_srq_idr, cmd.srq_handle, file->ucontext);
+ if (!uobj)
+ return -EINVAL;
+ srq = uobj->object;
+ obj = container_of(uobj, struct ib_uevent_object, uobject);
+
+ ret = ib_destroy_srq(srq);
+ if (!ret)
+ uobj->live = 0;
+
+ put_uobj_write(uobj);
+
+ if (ret)
+ return ret;
+
+ idr_remove_uobj(&ib_uverbs_srq_idr, uobj);
+
+ mutex_lock(&file->mutex);
+ list_del(&uobj->list);
+ mutex_unlock(&file->mutex);
+
+ ib_uverbs_release_uevent(file, obj);
+
+ memset(&resp, 0, sizeof resp);
+ resp.events_reported = obj->events_reported;
+
+ put_uobj(uobj);
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp))
+ ret = -EFAULT;
+
+ return ret ? ret : in_len;
+}
+
+static struct inode *xrc_file2inode(struct file *f)
+{
+ return f->f_dentry->d_inode;
+}
+
+struct xrcd_table_entry {
+ struct rb_node node;
+ struct inode *inode;
+ struct ib_xrcd *xrcd;
+};
+
+static int xrcd_table_insert(struct ib_device *dev,
+ struct inode *i_n,
+ struct ib_xrcd *xrcd)
+{
+ struct xrcd_table_entry *entry, *scan;
+ struct rb_node **p = &dev->ib_uverbs_xrcd_table.rb_node;
+ struct rb_node *parent = NULL;
+
+ entry = kmalloc(sizeof(struct xrcd_table_entry), GFP_KERNEL);
+ if (!entry)
+ return -ENOMEM;
+
+ entry->inode = i_n;
+ entry->xrcd = xrcd;
+
+ while (*p) {
+ parent = *p;
+ scan = rb_entry(parent, struct xrcd_table_entry, node);
+
+ if (i_n < scan->inode)
+ p = &(*p)->rb_left;
+ else if (i_n > scan->inode)
+ p = &(*p)->rb_right;
+ else {
+ kfree(entry);
+ return -EEXIST;
+ }
+ }
+
+ rb_link_node(&entry->node, parent, p);
+ rb_insert_color(&entry->node, &dev->ib_uverbs_xrcd_table);
+ igrab(i_n);
+ return 0;
+}
+
+static struct xrcd_table_entry *xrcd_table_search(struct ib_device *dev,
+ struct inode *i_n)
+{
+ struct xrcd_table_entry *scan;
+ struct rb_node **p = &dev->ib_uverbs_xrcd_table.rb_node;
+ struct rb_node *parent = NULL;
+
+ while (*p) {
+ parent = *p;
+ scan = rb_entry(parent, struct xrcd_table_entry, node);
+
+ if (i_n < scan->inode)
+ p = &(*p)->rb_left;
+ else if (i_n > scan->inode)
+ p = &(*p)->rb_right;
+ else
+ return scan;
+ }
+ return NULL;
+}
+
+static int find_xrcd(struct ib_device *dev, struct inode *i_n,
+ struct ib_xrcd **xrcd)
+{
+ struct xrcd_table_entry *entry;
+
+ entry = xrcd_table_search(dev, i_n);
+ if (!entry)
+ return -EINVAL;
+
+ *xrcd = entry->xrcd;
+ return 0;
+}
+
+
+static void xrcd_table_delete(struct ib_device *dev,
+ struct inode *i_n)
+{
+ struct xrcd_table_entry *entry = xrcd_table_search(dev, i_n);
+
+ if (entry) {
+ iput(i_n);
+ rb_erase(&entry->node, &dev->ib_uverbs_xrcd_table);
+ kfree(entry);
+ }
+}
+
+ssize_t ib_uverbs_open_xrc_domain(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_open_xrc_domain cmd;
+ struct ib_uverbs_open_xrc_domain_resp resp;
+ struct ib_udata udata;
+ struct ib_uobject *uobj;
+ struct ib_uxrcd_object *xrcd_uobj;
+ struct ib_xrcd *xrcd = NULL;
+ struct file *f = NULL;
+ struct inode *inode = NULL;
+ int ret = 0;
+ int new_xrcd = 0;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ INIT_UDATA(&udata, buf + sizeof cmd,
+ (unsigned long) cmd.response + sizeof resp,
+ in_len - sizeof cmd, out_len - sizeof resp);
+
+ mutex_lock(&file->device->ib_dev->xrcd_table_mutex);
+ if (cmd.fd != (u32) (-1)) {
+ /* search for file descriptor */
+ f = fget(cmd.fd);
+ if (!f) {
+ ret = -EBADF;
+ goto err_table_mutex_unlock;
+ }
+
+ inode = xrc_file2inode(f);
+ if (!inode) {
+ ret = -EBADF;
+ goto err_table_mutex_unlock;
+ }
+
+ ret = find_xrcd(file->device->ib_dev, inode, &xrcd);
+ if (ret && !(cmd.oflags & O_CREAT)) {
+ /* no file descriptor. Need CREATE flag */
+ ret = -EAGAIN;
+ goto err_table_mutex_unlock;
+ }
+
+ if (xrcd && cmd.oflags & O_EXCL) {
+ ret = -EINVAL;
+ goto err_table_mutex_unlock;
+ }
+ }
+
+ xrcd_uobj = kmalloc(sizeof *xrcd_uobj, GFP_KERNEL);
+ if (!xrcd_uobj) {
+ ret = -ENOMEM;
+ goto err_table_mutex_unlock;
+ }
+
+ uobj = &xrcd_uobj->uobject;
+ init_uobj(uobj, 0, file->ucontext, &pd_lock_key);
+ down_write(&uobj->mutex);
+
+ if (!xrcd) {
+ xrcd = file->device->ib_dev->alloc_xrcd(file->device->ib_dev,
+ file->ucontext, &udata);
+ if (IS_ERR(xrcd)) {
+ ret = PTR_ERR(xrcd);
+ goto err;
+ }
+ xrcd->uobject = (cmd.fd == -1) ? uobj : NULL;
+ xrcd->inode = inode;
+ xrcd->device = file->device->ib_dev;
+ atomic_set(&xrcd->usecnt, 0);
+ new_xrcd = 1;
+ }
+
+ uobj->object = xrcd;
+ ret = idr_add_uobj(&ib_uverbs_xrc_domain_idr, uobj);
+ if (ret)
+ goto err_idr;
+
+ memset(&resp, 0, sizeof resp);
+ resp.xrcd_handle = uobj->id;
+
+ if (inode) {
+ if (new_xrcd) {
+ /* create new inode/xrcd table entry */
+ ret = xrcd_table_insert(file->device->ib_dev, inode, xrcd);
+ if (ret)
+ goto err_insert_xrcd;
+ }
+ atomic_inc(&xrcd->usecnt);
+ }
+ if (f)
+ fput(f);
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto err_copy;
+ }
+
+ INIT_LIST_HEAD(&xrcd_uobj->xrc_reg_qp_list);
+
+ mutex_lock(&file->mutex);
+ list_add_tail(&uobj->list, &file->ucontext->xrc_domain_list);
+ mutex_unlock(&file->mutex);
+
+ uobj->live = 1;
+
+ up_write(&uobj->mutex);
+
+ mutex_unlock(&file->device->ib_dev->xrcd_table_mutex);
+ return in_len;
+
+err_copy:
+
+ if (inode) {
+ if (new_xrcd)
+ xrcd_table_delete(file->device->ib_dev, inode);
+ atomic_dec(&xrcd->usecnt);
+ }
+
+err_insert_xrcd:
+ idr_remove_uobj(&ib_uverbs_xrc_domain_idr, uobj);
+
+err_idr:
+ ib_dealloc_xrcd(xrcd);
+
+err:
+ put_uobj_write(uobj);
+
+err_table_mutex_unlock:
+
+ if (f)
+ fput(f);
+ mutex_unlock(&file->device->ib_dev->xrcd_table_mutex);
+ return ret;
+}
+
+ssize_t ib_uverbs_close_xrc_domain(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_close_xrc_domain cmd;
+ struct ib_uobject *uobj, *t_uobj;
+ struct ib_uxrcd_object *xrcd_uobj;
+ struct ib_xrcd *xrcd = NULL;
+ struct inode *inode = NULL;
+ int ret = 0;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ mutex_lock(&file->device->ib_dev->xrcd_table_mutex);
+ uobj = idr_write_uobj(&ib_uverbs_xrc_domain_idr, cmd.xrcd_handle,
+ file->ucontext);
+ if (!uobj) {
+ ret = -EINVAL;
+ goto err_unlock_mutex;
+ }
+
+ mutex_lock(&file->mutex);
+ if (!ret) {
+ list_for_each_entry(t_uobj, &file->ucontext->qp_list, list) {
+ struct ib_qp *qp = t_uobj->object;
+ if (qp->xrcd && qp->xrcd == uobj->object) {
+ ret = -EBUSY;
+ break;
+ }
+ }
+ }
+ if (!ret) {
+ list_for_each_entry(t_uobj, &file->ucontext->srq_list, list) {
+ struct ib_srq *srq = t_uobj->object;
+ if (srq->xrcd && srq->xrcd == uobj->object) {
+ ret = -EBUSY;
+ break;
+ }
+ }
+ }
+ mutex_unlock(&file->mutex);
+ if (ret) {
+ put_uobj_write(uobj);
+ goto err_unlock_mutex;
+ }
+
+ xrcd_uobj = container_of(uobj, struct ib_uxrcd_object, uobject);
+ if (!list_empty(&xrcd_uobj->xrc_reg_qp_list)) {
+ ret = -EBUSY;
+ put_uobj_write(uobj);
+ goto err_unlock_mutex;
+ }
+
+ xrcd = (struct ib_xrcd *) (uobj->object);
+ inode = xrcd->inode;
+
+ if (inode)
+ atomic_dec(&xrcd->usecnt);
+
+ ret = ib_dealloc_xrcd(uobj->object);
+ if (!ret)
+ uobj->live = 0;
+
+ put_uobj_write(uobj);
+
+ if (ret && !inode)
+ goto err_unlock_mutex;
+
+ if (!ret && inode)
+ xrcd_table_delete(file->device->ib_dev, inode);
+
+ idr_remove_uobj(&ib_uverbs_xrc_domain_idr, uobj);
+
+ mutex_lock(&file->mutex);
+ list_del(&uobj->list);
+ mutex_unlock(&file->mutex);
+
+ put_uobj(uobj);
+
+ mutex_unlock(&file->device->ib_dev->xrcd_table_mutex);
+ return in_len;
+
+err_unlock_mutex:
+ mutex_unlock(&file->device->ib_dev->xrcd_table_mutex);
+ return ret;
+}
+
+void ib_uverbs_dealloc_xrcd(struct ib_device *ib_dev,
+ struct ib_xrcd *xrcd)
+{
+ struct inode *inode = NULL;
+ int ret = 0;
+
+ inode = xrcd->inode;
+ if (inode)
+ atomic_dec(&xrcd->usecnt);
+
+ ret = ib_dealloc_xrcd(xrcd);
+ if (!ret && inode)
+ xrcd_table_delete(ib_dev, inode);
+}
+
+ssize_t ib_uverbs_create_xrc_rcv_qp(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_create_xrc_rcv_qp cmd;
+ struct ib_uverbs_create_xrc_rcv_qp_resp resp;
+ struct ib_uxrc_rcv_object *obj;
+ struct ib_qp_init_attr init_attr;
+ struct ib_xrcd *xrcd;
+ struct ib_uobject *uobj;
+ struct ib_uxrcd_object *xrcd_uobj;
+ u32 qp_num;
+ int err;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ obj = kzalloc(sizeof *obj, GFP_KERNEL);
+ if (!obj)
+ return -ENOMEM;
+
+ xrcd = idr_read_xrcd(cmd.xrc_domain_handle, file->ucontext, &uobj);
+ if (!xrcd) {
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ init_attr.event_handler = ib_uverbs_xrc_rcv_qp_event_handler;
+ init_attr.qp_context = file;
+ init_attr.srq = NULL;
+ init_attr.sq_sig_type =
+ cmd.sq_sig_all ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR;
+ init_attr.qp_type = IB_QPT_XRC;
+ init_attr.xrc_domain = xrcd;
+
+ init_attr.cap.max_send_wr = 1;
+ init_attr.cap.max_recv_wr = 0;
+ init_attr.cap.max_send_sge = 1;
+ init_attr.cap.max_recv_sge = 0;
+ init_attr.cap.max_inline_data = 0;
+
+ err = xrcd->device->create_xrc_rcv_qp(&init_attr, &qp_num);
+ if (err)
+ goto err_put;
+
+ memset(&resp, 0, sizeof resp);
+ resp.qpn = qp_num;
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp)) {
+ err = -EFAULT;
+ goto err_destroy;
+ }
+
+ atomic_inc(&xrcd->usecnt);
+ put_xrcd_read(uobj);
+ obj->qp_num = qp_num;
+ obj->domain_handle = cmd.xrc_domain_handle;
+ xrcd_uobj = container_of(uobj, struct ib_uxrcd_object, uobject);
+ mutex_lock(&file->device->ib_dev->xrcd_table_mutex);
+ list_add_tail(&obj->list, &xrcd_uobj->xrc_reg_qp_list);
+ mutex_unlock(&file->device->ib_dev->xrcd_table_mutex);
+
+ return in_len;
+
+err_destroy:
+ xrcd->device->unreg_xrc_rcv_qp(xrcd, file, qp_num);
+err_put:
+ put_xrcd_read(uobj);
+err_out:
+ kfree(obj);
+ return err;
+}
+
+ssize_t ib_uverbs_modify_xrc_rcv_qp(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_modify_xrc_rcv_qp cmd;
+ struct ib_qp_attr *attr;
+ struct ib_xrcd *xrcd;
+ struct ib_uobject *uobj;
+ int err;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ attr = kzalloc(sizeof *attr, GFP_KERNEL);
+ if (!attr)
+ return -ENOMEM;
+
+ xrcd = idr_read_xrcd(cmd.xrc_domain_handle, file->ucontext, &uobj);
+ if (!xrcd) {
+ kfree(attr);
+ return -EINVAL;
+ }
+
+ attr->qp_state = cmd.qp_state;
+ attr->cur_qp_state = cmd.cur_qp_state;
+ attr->qp_access_flags = cmd.qp_access_flags;
+ attr->pkey_index = cmd.pkey_index;
+ attr->port_num = cmd.port_num;
+ attr->path_mtu = cmd.path_mtu;
+ attr->path_mig_state = cmd.path_mig_state;
+ attr->qkey = cmd.qkey;
+ attr->rq_psn = cmd.rq_psn;
+ attr->sq_psn = cmd.sq_psn;
+ attr->dest_qp_num = cmd.dest_qp_num;
+ attr->alt_pkey_index = cmd.alt_pkey_index;
+ attr->en_sqd_async_notify = cmd.en_sqd_async_notify;
+ attr->max_rd_atomic = cmd.max_rd_atomic;
+ attr->max_dest_rd_atomic = cmd.max_dest_rd_atomic;
+ attr->min_rnr_timer = cmd.min_rnr_timer;
+ attr->port_num = cmd.port_num;
+ attr->timeout = cmd.timeout;
+ attr->retry_cnt = cmd.retry_cnt;
+ attr->rnr_retry = cmd.rnr_retry;
+ attr->alt_port_num = cmd.alt_port_num;
+ attr->alt_timeout = cmd.alt_timeout;
+
+ memcpy(attr->ah_attr.grh.dgid.raw, cmd.dest.dgid, 16);
+ attr->ah_attr.grh.flow_label = cmd.dest.flow_label;
+ attr->ah_attr.grh.sgid_index = cmd.dest.sgid_index;
+ attr->ah_attr.grh.hop_limit = cmd.dest.hop_limit;
+ attr->ah_attr.grh.traffic_class = cmd.dest.traffic_class;
+ attr->ah_attr.dlid = cmd.dest.dlid;
+ attr->ah_attr.sl = cmd.dest.sl;
+ attr->ah_attr.src_path_bits = cmd.dest.src_path_bits;
+ attr->ah_attr.static_rate = cmd.dest.static_rate;
+ attr->ah_attr.ah_flags = cmd.dest.is_global ? IB_AH_GRH : 0;
+ attr->ah_attr.port_num = cmd.dest.port_num;
+
+ memcpy(attr->alt_ah_attr.grh.dgid.raw, cmd.alt_dest.dgid, 16);
+ attr->alt_ah_attr.grh.flow_label = cmd.alt_dest.flow_label;
+ attr->alt_ah_attr.grh.sgid_index = cmd.alt_dest.sgid_index;
+ attr->alt_ah_attr.grh.hop_limit = cmd.alt_dest.hop_limit;
+ attr->alt_ah_attr.grh.traffic_class = cmd.alt_dest.traffic_class;
+ attr->alt_ah_attr.dlid = cmd.alt_dest.dlid;
+ attr->alt_ah_attr.sl = cmd.alt_dest.sl;
+ attr->alt_ah_attr.src_path_bits = cmd.alt_dest.src_path_bits;
+ attr->alt_ah_attr.static_rate = cmd.alt_dest.static_rate;
+ attr->alt_ah_attr.ah_flags = cmd.alt_dest.is_global ? IB_AH_GRH : 0;
+ attr->alt_ah_attr.port_num = cmd.alt_dest.port_num;
+
+ err = xrcd->device->modify_xrc_rcv_qp(xrcd, cmd.qp_num, attr, cmd.attr_mask);
+ put_xrcd_read(uobj);
+ kfree(attr);
+ return err ? err : in_len;
+}
+
+ssize_t ib_uverbs_query_xrc_rcv_qp(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_query_xrc_rcv_qp cmd;
+ struct ib_uverbs_query_qp_resp resp;
+ struct ib_qp_attr *attr;
+ struct ib_qp_init_attr *init_attr;
+ struct ib_xrcd *xrcd;
+ struct ib_uobject *uobj;
+ int ret;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ attr = kmalloc(sizeof *attr, GFP_KERNEL);
+ init_attr = kmalloc(sizeof *init_attr, GFP_KERNEL);
+ if (!attr || !init_attr) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ xrcd = idr_read_xrcd(cmd.xrc_domain_handle, file->ucontext, &uobj);
+ if (!xrcd) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = xrcd->device->query_xrc_rcv_qp(xrcd, cmd.qp_num, attr,
+ cmd.attr_mask, init_attr);
+
+ put_xrcd_read(uobj);
+
+ if (ret)
+ goto out;
+
+ memset(&resp, 0, sizeof resp);
+ resp.qp_state = attr->qp_state;
+ resp.cur_qp_state = attr->cur_qp_state;
+ resp.path_mtu = attr->path_mtu;
+ resp.path_mig_state = attr->path_mig_state;
+ resp.qkey = attr->qkey;
+ resp.rq_psn = attr->rq_psn;
+ resp.sq_psn = attr->sq_psn;
+ resp.dest_qp_num = attr->dest_qp_num;
+ resp.qp_access_flags = attr->qp_access_flags;
+ resp.pkey_index = attr->pkey_index;
+ resp.alt_pkey_index = attr->alt_pkey_index;
+ resp.sq_draining = attr->sq_draining;
+ resp.max_rd_atomic = attr->max_rd_atomic;
+ resp.max_dest_rd_atomic = attr->max_dest_rd_atomic;
+ resp.min_rnr_timer = attr->min_rnr_timer;
+ resp.port_num = attr->port_num;
+ resp.timeout = attr->timeout;
+ resp.retry_cnt = attr->retry_cnt;
+ resp.rnr_retry = attr->rnr_retry;
+ resp.alt_port_num = attr->alt_port_num;
+ resp.alt_timeout = attr->alt_timeout;
+
+ memcpy(resp.dest.dgid, attr->ah_attr.grh.dgid.raw, 16);
+ resp.dest.flow_label = attr->ah_attr.grh.flow_label;
+ resp.dest.sgid_index = attr->ah_attr.grh.sgid_index;
+ resp.dest.hop_limit = attr->ah_attr.grh.hop_limit;
+ resp.dest.traffic_class = attr->ah_attr.grh.traffic_class;
+ resp.dest.dlid = attr->ah_attr.dlid;
+ resp.dest.sl = attr->ah_attr.sl;
+ resp.dest.src_path_bits = attr->ah_attr.src_path_bits;
+ resp.dest.static_rate = attr->ah_attr.static_rate;
+ resp.dest.is_global = !!(attr->ah_attr.ah_flags & IB_AH_GRH);
+ resp.dest.port_num = attr->ah_attr.port_num;
+
+ memcpy(resp.alt_dest.dgid, attr->alt_ah_attr.grh.dgid.raw, 16);
+ resp.alt_dest.flow_label = attr->alt_ah_attr.grh.flow_label;
+ resp.alt_dest.sgid_index = attr->alt_ah_attr.grh.sgid_index;
+ resp.alt_dest.hop_limit = attr->alt_ah_attr.grh.hop_limit;
+ resp.alt_dest.traffic_class = attr->alt_ah_attr.grh.traffic_class;
+ resp.alt_dest.dlid = attr->alt_ah_attr.dlid;
+ resp.alt_dest.sl = attr->alt_ah_attr.sl;
+ resp.alt_dest.src_path_bits = attr->alt_ah_attr.src_path_bits;
+ resp.alt_dest.static_rate = attr->alt_ah_attr.static_rate;
+ resp.alt_dest.is_global = !!(attr->alt_ah_attr.ah_flags & IB_AH_GRH);
+ resp.alt_dest.port_num = attr->alt_ah_attr.port_num;
+
+ resp.max_send_wr = init_attr->cap.max_send_wr;
+ resp.max_recv_wr = init_attr->cap.max_recv_wr;
+ resp.max_send_sge = init_attr->cap.max_send_sge;
+ resp.max_recv_sge = init_attr->cap.max_recv_sge;
+ resp.max_inline_data = init_attr->cap.max_inline_data;
+ resp.sq_sig_all = init_attr->sq_sig_type == IB_SIGNAL_ALL_WR;
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp))
+ ret = -EFAULT;
+
+out:
+ kfree(attr);
+ kfree(init_attr);
+
+ return ret ? ret : in_len;
+}
+
+ssize_t ib_uverbs_reg_xrc_rcv_qp(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_reg_xrc_rcv_qp cmd;
+ struct ib_uxrc_rcv_object *qp_obj, *tmp;
+ struct ib_xrcd *xrcd;
+ struct ib_uobject *uobj;
+ struct ib_uxrcd_object *xrcd_uobj;
+ int ret;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ qp_obj = kmalloc(sizeof *qp_obj, GFP_KERNEL);
+ if (!qp_obj)
+ return -ENOMEM;
+
+ xrcd = idr_read_xrcd(cmd.xrc_domain_handle, file->ucontext, &uobj);
+ if (!xrcd) {
+ ret = -EINVAL;
+ goto err_out;
+ }
+
+ ret = xrcd->device->reg_xrc_rcv_qp(xrcd, file, cmd.qp_num);
+ if (ret)
+ goto err_put;
+
+ xrcd_uobj = container_of(uobj, struct ib_uxrcd_object, uobject);
+ mutex_lock(&file->device->ib_dev->xrcd_table_mutex);
+ list_for_each_entry(tmp, &xrcd_uobj->xrc_reg_qp_list, list)
+ if (cmd.qp_num == tmp->qp_num) {
+ kfree(qp_obj);
+ mutex_unlock(&file->device->ib_dev->xrcd_table_mutex);
+ put_xrcd_read(uobj);
+ return in_len;
+ }
+ qp_obj->qp_num = cmd.qp_num;
+ qp_obj->domain_handle = cmd.xrc_domain_handle;
+ list_add_tail(&qp_obj->list, &xrcd_uobj->xrc_reg_qp_list);
+ mutex_unlock(&file->device->ib_dev->xrcd_table_mutex);
+ atomic_inc(&xrcd->usecnt);
+ put_xrcd_read(uobj);
+ return in_len;
+
+err_put:
+ put_xrcd_read(uobj);
+err_out:
+
+ kfree(qp_obj);
+ return ret;
+}
+
+int ib_uverbs_cleanup_xrc_rcv_qp(struct ib_uverbs_file *file,
+ struct ib_xrcd *xrcd, u32 qp_num)
+{
+ int err;
+ err = xrcd->device->unreg_xrc_rcv_qp(xrcd, file, qp_num);
+ if (!err)
+ atomic_dec(&xrcd->usecnt);
+ return err;
+}
+
+ssize_t ib_uverbs_unreg_xrc_rcv_qp(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_unreg_xrc_rcv_qp cmd;
+ struct ib_uxrc_rcv_object *qp_obj, *tmp;
+ struct ib_xrcd *xrcd;
+ struct ib_uobject *uobj;
+ struct ib_uxrcd_object *xrcd_uobj;
+ int ret;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ xrcd = idr_read_xrcd(cmd.xrc_domain_handle, file->ucontext, &uobj);
+ if (!xrcd)
+ return -EINVAL;
+
+ ret = xrcd->device->unreg_xrc_rcv_qp(xrcd, file, cmd.qp_num);
+ if (ret) {
+ put_xrcd_read(uobj);
+ return -EINVAL;
+ }
+ atomic_dec(&xrcd->usecnt);
+
+ xrcd_uobj = container_of(uobj, struct ib_uxrcd_object, uobject);
+ mutex_lock(&file->device->ib_dev->xrcd_table_mutex);
+ list_for_each_entry_safe(qp_obj, tmp, &xrcd_uobj->xrc_reg_qp_list, list)
+ if (cmd.qp_num == qp_obj->qp_num) {
+ list_del(&qp_obj->list);
+ kfree(qp_obj);
+ break;
+ }
+ mutex_unlock(&file->device->ib_dev->xrcd_table_mutex);
+ put_xrcd_read(uobj);
+ return in_len;
+}
diff --git a/sys/ofed/drivers/infiniband/core/uverbs_main.c b/sys/ofed/drivers/infiniband/core/uverbs_main.c
new file mode 100644
index 0000000..380abd3
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/uverbs_main.c
@@ -0,0 +1,1012 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/cdev.h>
+
+#include <asm/uaccess.h>
+
+#include "uverbs.h"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("InfiniBand userspace verbs access");
+MODULE_LICENSE("Dual BSD/GPL");
+
+#define INFINIBANDEVENTFS_MAGIC 0x49426576 /* "IBev" */
+
+enum {
+ IB_UVERBS_MAJOR = 231,
+ IB_UVERBS_BASE_MINOR = 192,
+ IB_UVERBS_MAX_DEVICES = 32
+};
+
+#define IB_UVERBS_BASE_DEV MKDEV(IB_UVERBS_MAJOR, IB_UVERBS_BASE_MINOR)
+
+static struct class *uverbs_class;
+
+DEFINE_SPINLOCK(ib_uverbs_idr_lock);
+DEFINE_IDR(ib_uverbs_pd_idr);
+DEFINE_IDR(ib_uverbs_mr_idr);
+DEFINE_IDR(ib_uverbs_mw_idr);
+DEFINE_IDR(ib_uverbs_ah_idr);
+DEFINE_IDR(ib_uverbs_cq_idr);
+DEFINE_IDR(ib_uverbs_qp_idr);
+DEFINE_IDR(ib_uverbs_srq_idr);
+DEFINE_IDR(ib_uverbs_xrc_domain_idr);
+
+static spinlock_t map_lock;
+static struct ib_uverbs_device *dev_table[IB_UVERBS_MAX_DEVICES];
+static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES);
+
+static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len) = {
+ [IB_USER_VERBS_CMD_GET_CONTEXT] = ib_uverbs_get_context,
+ [IB_USER_VERBS_CMD_QUERY_DEVICE] = ib_uverbs_query_device,
+ [IB_USER_VERBS_CMD_QUERY_PORT] = ib_uverbs_query_port,
+ [IB_USER_VERBS_CMD_ALLOC_PD] = ib_uverbs_alloc_pd,
+ [IB_USER_VERBS_CMD_DEALLOC_PD] = ib_uverbs_dealloc_pd,
+ [IB_USER_VERBS_CMD_REG_MR] = ib_uverbs_reg_mr,
+ [IB_USER_VERBS_CMD_DEREG_MR] = ib_uverbs_dereg_mr,
+ [IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL] = ib_uverbs_create_comp_channel,
+ [IB_USER_VERBS_CMD_CREATE_CQ] = ib_uverbs_create_cq,
+ [IB_USER_VERBS_CMD_RESIZE_CQ] = ib_uverbs_resize_cq,
+ [IB_USER_VERBS_CMD_POLL_CQ] = ib_uverbs_poll_cq,
+ [IB_USER_VERBS_CMD_REQ_NOTIFY_CQ] = ib_uverbs_req_notify_cq,
+ [IB_USER_VERBS_CMD_DESTROY_CQ] = ib_uverbs_destroy_cq,
+ [IB_USER_VERBS_CMD_CREATE_QP] = ib_uverbs_create_qp,
+ [IB_USER_VERBS_CMD_QUERY_QP] = ib_uverbs_query_qp,
+ [IB_USER_VERBS_CMD_MODIFY_QP] = ib_uverbs_modify_qp,
+ [IB_USER_VERBS_CMD_DESTROY_QP] = ib_uverbs_destroy_qp,
+ [IB_USER_VERBS_CMD_POST_SEND] = ib_uverbs_post_send,
+ [IB_USER_VERBS_CMD_POST_RECV] = ib_uverbs_post_recv,
+ [IB_USER_VERBS_CMD_POST_SRQ_RECV] = ib_uverbs_post_srq_recv,
+ [IB_USER_VERBS_CMD_CREATE_AH] = ib_uverbs_create_ah,
+ [IB_USER_VERBS_CMD_DESTROY_AH] = ib_uverbs_destroy_ah,
+ [IB_USER_VERBS_CMD_ATTACH_MCAST] = ib_uverbs_attach_mcast,
+ [IB_USER_VERBS_CMD_DETACH_MCAST] = ib_uverbs_detach_mcast,
+ [IB_USER_VERBS_CMD_CREATE_SRQ] = ib_uverbs_create_srq,
+ [IB_USER_VERBS_CMD_MODIFY_SRQ] = ib_uverbs_modify_srq,
+ [IB_USER_VERBS_CMD_QUERY_SRQ] = ib_uverbs_query_srq,
+ [IB_USER_VERBS_CMD_DESTROY_SRQ] = ib_uverbs_destroy_srq,
+ [IB_USER_VERBS_CMD_CREATE_XRC_SRQ] = ib_uverbs_create_xrc_srq,
+ [IB_USER_VERBS_CMD_OPEN_XRC_DOMAIN] = ib_uverbs_open_xrc_domain,
+ [IB_USER_VERBS_CMD_CLOSE_XRC_DOMAIN] = ib_uverbs_close_xrc_domain,
+ [IB_USER_VERBS_CMD_CREATE_XRC_RCV_QP] = ib_uverbs_create_xrc_rcv_qp,
+ [IB_USER_VERBS_CMD_MODIFY_XRC_RCV_QP] = ib_uverbs_modify_xrc_rcv_qp,
+ [IB_USER_VERBS_CMD_QUERY_XRC_RCV_QP] = ib_uverbs_query_xrc_rcv_qp,
+ [IB_USER_VERBS_CMD_REG_XRC_RCV_QP] = ib_uverbs_reg_xrc_rcv_qp,
+ [IB_USER_VERBS_CMD_UNREG_XRC_RCV_QP] = ib_uverbs_unreg_xrc_rcv_qp,
+};
+
+#ifdef __linux__
+/* BSD Does not require a fake mountpoint for all files. */
+static struct vfsmount *uverbs_event_mnt;
+#endif
+
+static void ib_uverbs_add_one(struct ib_device *device);
+static void ib_uverbs_remove_one(struct ib_device *device);
+
+static void ib_uverbs_release_dev(struct kref *ref)
+{
+ struct ib_uverbs_device *dev =
+ container_of(ref, struct ib_uverbs_device, ref);
+
+ complete(&dev->comp);
+}
+
+static void ib_uverbs_release_event_file(struct kref *ref)
+{
+ struct ib_uverbs_event_file *file =
+ container_of(ref, struct ib_uverbs_event_file, ref);
+
+ kfree(file);
+}
+
+void ib_uverbs_release_ucq(struct ib_uverbs_file *file,
+ struct ib_uverbs_event_file *ev_file,
+ struct ib_ucq_object *uobj)
+{
+ struct ib_uverbs_event *evt, *tmp;
+
+ if (ev_file) {
+ spin_lock_irq(&ev_file->lock);
+ list_for_each_entry_safe(evt, tmp, &uobj->comp_list, obj_list) {
+ list_del(&evt->list);
+ kfree(evt);
+ }
+ spin_unlock_irq(&ev_file->lock);
+
+ kref_put(&ev_file->ref, ib_uverbs_release_event_file);
+ }
+
+ spin_lock_irq(&file->async_file->lock);
+ list_for_each_entry_safe(evt, tmp, &uobj->async_list, obj_list) {
+ list_del(&evt->list);
+ kfree(evt);
+ }
+ spin_unlock_irq(&file->async_file->lock);
+}
+
+void ib_uverbs_release_uevent(struct ib_uverbs_file *file,
+ struct ib_uevent_object *uobj)
+{
+ struct ib_uverbs_event *evt, *tmp;
+
+ spin_lock_irq(&file->async_file->lock);
+ list_for_each_entry_safe(evt, tmp, &uobj->event_list, obj_list) {
+ list_del(&evt->list);
+ kfree(evt);
+ }
+ spin_unlock_irq(&file->async_file->lock);
+}
+
+static void ib_uverbs_detach_umcast(struct ib_qp *qp,
+ struct ib_uqp_object *uobj)
+{
+ struct ib_uverbs_mcast_entry *mcast, *tmp;
+
+ list_for_each_entry_safe(mcast, tmp, &uobj->mcast_list, list) {
+ ib_detach_mcast(qp, &mcast->gid, mcast->lid);
+ list_del(&mcast->list);
+ kfree(mcast);
+ }
+}
+
+static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
+ struct ib_ucontext *context)
+{
+ struct ib_uobject *uobj, *tmp;
+
+ if (!context)
+ return 0;
+
+ context->closing = 1;
+
+ list_for_each_entry_safe(uobj, tmp, &context->ah_list, list) {
+ struct ib_ah *ah = uobj->object;
+
+ idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
+ ib_destroy_ah(ah);
+ kfree(uobj);
+ }
+
+ list_for_each_entry_safe(uobj, tmp, &context->qp_list, list) {
+ struct ib_qp *qp = uobj->object;
+ struct ib_uqp_object *uqp =
+ container_of(uobj, struct ib_uqp_object, uevent.uobject);
+
+ idr_remove_uobj(&ib_uverbs_qp_idr, uobj);
+ ib_uverbs_detach_umcast(qp, uqp);
+ ib_destroy_qp(qp);
+ ib_uverbs_release_uevent(file, &uqp->uevent);
+ kfree(uqp);
+ }
+
+
+ list_for_each_entry_safe(uobj, tmp, &context->srq_list, list) {
+ struct ib_srq *srq = uobj->object;
+ struct ib_uevent_object *uevent =
+ container_of(uobj, struct ib_uevent_object, uobject);
+
+ idr_remove_uobj(&ib_uverbs_srq_idr, uobj);
+ ib_destroy_srq(srq);
+ ib_uverbs_release_uevent(file, uevent);
+ kfree(uevent);
+ }
+
+ list_for_each_entry_safe(uobj, tmp, &context->cq_list, list) {
+ struct ib_cq *cq = uobj->object;
+ struct ib_uverbs_event_file *ev_file = cq->cq_context;
+ struct ib_ucq_object *ucq =
+ container_of(uobj, struct ib_ucq_object, uobject);
+
+ idr_remove_uobj(&ib_uverbs_cq_idr, uobj);
+ ib_destroy_cq(cq);
+ ib_uverbs_release_ucq(file, ev_file, ucq);
+ kfree(ucq);
+ }
+
+ /* XXX Free MWs */
+
+ list_for_each_entry_safe(uobj, tmp, &context->mr_list, list) {
+ struct ib_mr *mr = uobj->object;
+
+ idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
+ ib_dereg_mr(mr);
+ kfree(uobj);
+ }
+
+ mutex_lock(&file->device->ib_dev->xrcd_table_mutex);
+ list_for_each_entry_safe(uobj, tmp, &context->xrc_domain_list, list) {
+ struct ib_xrcd *xrcd = uobj->object;
+ struct ib_uxrc_rcv_object *xrc_qp_obj, *tmp1;
+ struct ib_uxrcd_object *xrcd_uobj =
+ container_of(uobj, struct ib_uxrcd_object, uobject);
+
+ list_for_each_entry_safe(xrc_qp_obj, tmp1,
+ &xrcd_uobj->xrc_reg_qp_list, list) {
+ list_del(&xrc_qp_obj->list);
+ ib_uverbs_cleanup_xrc_rcv_qp(file, xrcd,
+ xrc_qp_obj->qp_num);
+ kfree(xrc_qp_obj);
+ }
+
+ idr_remove_uobj(&ib_uverbs_xrc_domain_idr, uobj);
+ ib_uverbs_dealloc_xrcd(file->device->ib_dev, xrcd);
+ kfree(uobj);
+ }
+ mutex_unlock(&file->device->ib_dev->xrcd_table_mutex);
+
+ list_for_each_entry_safe(uobj, tmp, &context->pd_list, list) {
+ struct ib_pd *pd = uobj->object;
+
+ idr_remove_uobj(&ib_uverbs_pd_idr, uobj);
+ ib_dealloc_pd(pd);
+ kfree(uobj);
+ }
+
+ return context->device->dealloc_ucontext(context);
+}
+
+static void ib_uverbs_release_file(struct kref *ref)
+{
+ struct ib_uverbs_file *file =
+ container_of(ref, struct ib_uverbs_file, ref);
+
+ module_put(file->device->ib_dev->owner);
+ kref_put(&file->device->ref, ib_uverbs_release_dev);
+
+ kfree(file);
+}
+
+static ssize_t ib_uverbs_event_read(struct file *filp, char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct ib_uverbs_event_file *file = filp->private_data;
+ struct ib_uverbs_event *event;
+ int eventsz;
+ int ret = 0;
+
+ spin_lock_irq(&file->lock);
+
+ while (list_empty(&file->event_list)) {
+ spin_unlock_irq(&file->lock);
+
+ if (filp->f_flags & O_NONBLOCK)
+ return -EAGAIN;
+
+ if (wait_event_interruptible(file->poll_wait,
+ !list_empty(&file->event_list)))
+ return -ERESTARTSYS;
+
+ spin_lock_irq(&file->lock);
+ }
+
+ event = list_entry(file->event_list.next, struct ib_uverbs_event, list);
+
+ if (file->is_async)
+ eventsz = sizeof (struct ib_uverbs_async_event_desc);
+ else
+ eventsz = sizeof (struct ib_uverbs_comp_event_desc);
+
+ if (eventsz > count) {
+ ret = -EINVAL;
+ event = NULL;
+ } else {
+ list_del(file->event_list.next);
+ if (event->counter) {
+ ++(*event->counter);
+ list_del(&event->obj_list);
+ }
+ }
+
+ spin_unlock_irq(&file->lock);
+
+ if (event) {
+ if (copy_to_user(buf, event, eventsz))
+ ret = -EFAULT;
+ else
+ ret = eventsz;
+ }
+
+ kfree(event);
+
+ return ret;
+}
+
+static unsigned int ib_uverbs_event_poll(struct file *filp,
+ struct poll_table_struct *wait)
+{
+ unsigned int pollflags = 0;
+ struct ib_uverbs_event_file *file = filp->private_data;
+
+ file->filp = filp;
+ poll_wait(filp, &file->poll_wait, wait);
+
+ spin_lock_irq(&file->lock);
+ if (!list_empty(&file->event_list))
+ pollflags = POLLIN | POLLRDNORM;
+ spin_unlock_irq(&file->lock);
+
+ return pollflags;
+}
+
+static int ib_uverbs_event_fasync(int fd, struct file *filp, int on)
+{
+ struct ib_uverbs_event_file *file = filp->private_data;
+
+ return fasync_helper(fd, filp, on, &file->async_queue);
+}
+
+static int ib_uverbs_event_close(struct inode *inode, struct file *filp)
+{
+ struct ib_uverbs_event_file *file = filp->private_data;
+ struct ib_uverbs_event *entry, *tmp;
+
+ spin_lock_irq(&file->lock);
+ file->is_closed = 1;
+ list_for_each_entry_safe(entry, tmp, &file->event_list, list) {
+ if (entry->counter)
+ list_del(&entry->obj_list);
+ kfree(entry);
+ }
+ spin_unlock_irq(&file->lock);
+
+ if (file->is_async) {
+ ib_unregister_event_handler(&file->uverbs_file->event_handler);
+ kref_put(&file->uverbs_file->ref, ib_uverbs_release_file);
+ }
+ kref_put(&file->ref, ib_uverbs_release_event_file);
+
+ return 0;
+}
+
+static const struct file_operations uverbs_event_fops = {
+ .owner = THIS_MODULE,
+ .read = ib_uverbs_event_read,
+ .poll = ib_uverbs_event_poll,
+ .release = ib_uverbs_event_close,
+ .fasync = ib_uverbs_event_fasync
+};
+
+void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context)
+{
+ struct ib_uverbs_event_file *file = cq_context;
+ struct ib_ucq_object *uobj;
+ struct ib_uverbs_event *entry;
+ unsigned long flags;
+
+ if (!file)
+ return;
+
+ spin_lock_irqsave(&file->lock, flags);
+ if (file->is_closed) {
+ spin_unlock_irqrestore(&file->lock, flags);
+ return;
+ }
+
+ entry = kmalloc(sizeof *entry, GFP_ATOMIC);
+ if (!entry) {
+ spin_unlock_irqrestore(&file->lock, flags);
+ return;
+ }
+
+ uobj = container_of(cq->uobject, struct ib_ucq_object, uobject);
+
+ entry->desc.comp.cq_handle = cq->uobject->user_handle;
+ entry->counter = &uobj->comp_events_reported;
+
+ list_add_tail(&entry->list, &file->event_list);
+ list_add_tail(&entry->obj_list, &uobj->comp_list);
+ spin_unlock_irqrestore(&file->lock, flags);
+
+ wake_up_interruptible(&file->poll_wait);
+ if (file->filp)
+ selwakeup(&file->filp->f_selinfo);
+ kill_fasync(&file->async_queue, SIGIO, POLL_IN);
+}
+
+static void ib_uverbs_async_handler(struct ib_uverbs_file *file,
+ __u64 element, __u64 event,
+ struct list_head *obj_list,
+ u32 *counter)
+{
+ struct ib_uverbs_event *entry;
+ unsigned long flags;
+
+ spin_lock_irqsave(&file->async_file->lock, flags);
+ if (file->async_file->is_closed) {
+ spin_unlock_irqrestore(&file->async_file->lock, flags);
+ return;
+ }
+
+ entry = kmalloc(sizeof *entry, GFP_ATOMIC);
+ if (!entry) {
+ spin_unlock_irqrestore(&file->async_file->lock, flags);
+ return;
+ }
+
+ entry->desc.async.element = element;
+ entry->desc.async.event_type = event;
+ entry->counter = counter;
+
+ list_add_tail(&entry->list, &file->async_file->event_list);
+ if (obj_list)
+ list_add_tail(&entry->obj_list, obj_list);
+ spin_unlock_irqrestore(&file->async_file->lock, flags);
+
+ wake_up_interruptible(&file->async_file->poll_wait);
+ if (file->async_file->filp)
+ selwakeup(&file->async_file->filp->f_selinfo);
+ kill_fasync(&file->async_file->async_queue, SIGIO, POLL_IN);
+}
+
+void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr)
+{
+ struct ib_ucq_object *uobj = container_of(event->element.cq->uobject,
+ struct ib_ucq_object, uobject);
+
+ ib_uverbs_async_handler(uobj->uverbs_file, uobj->uobject.user_handle,
+ event->event, &uobj->async_list,
+ &uobj->async_events_reported);
+}
+
+void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr)
+{
+ struct ib_uevent_object *uobj;
+
+ uobj = container_of(event->element.qp->uobject,
+ struct ib_uevent_object, uobject);
+
+ ib_uverbs_async_handler(context_ptr, uobj->uobject.user_handle,
+ event->event, &uobj->event_list,
+ &uobj->events_reported);
+}
+
+void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr)
+{
+ struct ib_uevent_object *uobj;
+
+ uobj = container_of(event->element.srq->uobject,
+ struct ib_uevent_object, uobject);
+
+ ib_uverbs_async_handler(context_ptr, uobj->uobject.user_handle,
+ event->event, &uobj->event_list,
+ &uobj->events_reported);
+}
+
+void ib_uverbs_event_handler(struct ib_event_handler *handler,
+ struct ib_event *event)
+{
+ struct ib_uverbs_file *file =
+ container_of(handler, struct ib_uverbs_file, event_handler);
+
+ ib_uverbs_async_handler(file, event->element.port_num, event->event,
+ NULL, NULL);
+}
+
+void ib_uverbs_xrc_rcv_qp_event_handler(struct ib_event *event,
+ void *context_ptr)
+{
+ ib_uverbs_async_handler(context_ptr, event->element.xrc_qp_num,
+ event->event, NULL, NULL);
+}
+
+struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
+ int is_async, int *fd)
+{
+ struct ib_uverbs_event_file *ev_file;
+ struct file *filp;
+ int ret;
+
+ ev_file = kmalloc(sizeof *ev_file, GFP_KERNEL);
+ if (!ev_file)
+ return ERR_PTR(-ENOMEM);
+
+ kref_init(&ev_file->ref);
+ spin_lock_init(&ev_file->lock);
+ INIT_LIST_HEAD(&ev_file->event_list);
+ init_waitqueue_head(&ev_file->poll_wait);
+ ev_file->uverbs_file = uverbs_file;
+ ev_file->async_queue = NULL;
+ ev_file->is_async = is_async;
+ ev_file->is_closed = 0;
+ ev_file->filp = NULL;
+
+ *fd = get_unused_fd();
+ if (*fd < 0) {
+ ret = *fd;
+ goto err;
+ }
+
+ /*
+ * fops_get() can't fail here, because we're coming from a
+ * system call on a uverbs file, which will already have a
+ * module reference.
+ */
+ filp = alloc_file(uverbs_event_mnt, dget(uverbs_event_mnt->mnt_root),
+ FMODE_READ, fops_get(&uverbs_event_fops));
+ if (!filp) {
+ ret = -ENFILE;
+ goto err_fd;
+ }
+
+ filp->private_data = ev_file;
+
+ return filp;
+
+err_fd:
+ put_unused_fd(*fd);
+
+err:
+ kfree(ev_file);
+ return ERR_PTR(ret);
+}
+
+/*
+ * Look up a completion event file by FD. If lookup is successful,
+ * takes a ref to the event file struct that it returns; if
+ * unsuccessful, returns NULL.
+ */
+struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd)
+{
+ struct ib_uverbs_event_file *ev_file = NULL;
+ struct file *filp;
+
+ filp = fget(fd);
+ if (!filp)
+ return NULL;
+
+ if (filp->f_op != &uverbs_event_fops)
+ goto out;
+
+ ev_file = filp->private_data;
+ if (ev_file->is_async) {
+ ev_file = NULL;
+ goto out;
+ }
+
+ kref_get(&ev_file->ref);
+
+out:
+ fput(filp);
+ return ev_file;
+}
+
+static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct ib_uverbs_file *file = filp->private_data;
+ struct ib_uverbs_cmd_hdr hdr;
+
+ if (count < sizeof hdr)
+ return -EINVAL;
+
+ if (copy_from_user(&hdr, buf, sizeof hdr))
+ return -EFAULT;
+
+ if (hdr.in_words * 4 != count)
+ return -EINVAL;
+
+ if (hdr.command < 0 ||
+ hdr.command >= ARRAY_SIZE(uverbs_cmd_table) ||
+ !uverbs_cmd_table[hdr.command] ||
+ !(file->device->ib_dev->uverbs_cmd_mask & (1ull << hdr.command)))
+ return -EINVAL;
+
+ if (!file->ucontext &&
+ hdr.command != IB_USER_VERBS_CMD_GET_CONTEXT)
+ return -EINVAL;
+
+ return uverbs_cmd_table[hdr.command](file, buf + sizeof hdr,
+ hdr.in_words * 4, hdr.out_words * 4);
+}
+
+static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ struct ib_uverbs_file *file = filp->private_data;
+
+ if (!file->ucontext)
+ return -ENODEV;
+ else
+ return file->device->ib_dev->mmap(file->ucontext, vma);
+}
+
+/*
+ * ib_uverbs_open() does not need the BKL:
+ *
+ * - dev_table[] accesses are protected by map_lock, the
+ * ib_uverbs_device structures are properly reference counted, and
+ * everything else is purely local to the file being created, so
+ * races against other open calls are not a problem;
+ * - there is no ioctl method to race against;
+ * - the device is added to dev_table[] as the last part of module
+ * initialization, the open method will either immediately run
+ * -ENXIO, or all required initialization will be done.
+ */
+static int ib_uverbs_open(struct inode *inode, struct file *filp)
+{
+ struct ib_uverbs_device *dev;
+ struct ib_uverbs_file *file;
+ int ret;
+
+ spin_lock(&map_lock);
+ dev = dev_table[iminor(inode) - IB_UVERBS_BASE_MINOR];
+ if (dev)
+ kref_get(&dev->ref);
+ spin_unlock(&map_lock);
+
+ if (!dev)
+ return -ENXIO;
+
+ if (!try_module_get(dev->ib_dev->owner)) {
+ ret = -ENODEV;
+ goto err;
+ }
+
+ file = kmalloc(sizeof *file, GFP_KERNEL);
+ if (!file) {
+ ret = -ENOMEM;
+ goto err_module;
+ }
+
+ file->device = dev;
+ file->ucontext = NULL;
+ file->async_file = NULL;
+ kref_init(&file->ref);
+ mutex_init(&file->mutex);
+
+ filp->private_data = file;
+
+ return 0;
+
+err_module:
+ module_put(dev->ib_dev->owner);
+
+err:
+ kref_put(&dev->ref, ib_uverbs_release_dev);
+ return ret;
+}
+
+static int ib_uverbs_close(struct inode *inode, struct file *filp)
+{
+ struct ib_uverbs_file *file = filp->private_data;
+
+ ib_uverbs_cleanup_ucontext(file, file->ucontext);
+
+ if (file->async_file)
+ kref_put(&file->async_file->ref, ib_uverbs_release_event_file);
+
+ kref_put(&file->ref, ib_uverbs_release_file);
+
+ return 0;
+}
+
+static const struct file_operations uverbs_fops = {
+ .owner = THIS_MODULE,
+ .write = ib_uverbs_write,
+ .open = ib_uverbs_open,
+ .release = ib_uverbs_close
+};
+
+static const struct file_operations uverbs_mmap_fops = {
+ .owner = THIS_MODULE,
+ .write = ib_uverbs_write,
+ .mmap = ib_uverbs_mmap,
+ .open = ib_uverbs_open,
+ .release = ib_uverbs_close
+};
+
+static struct ib_client uverbs_client = {
+ .name = "uverbs",
+ .add = ib_uverbs_add_one,
+ .remove = ib_uverbs_remove_one
+};
+
+static ssize_t show_ibdev(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct ib_uverbs_device *dev = dev_get_drvdata(device);
+
+ if (!dev)
+ return -ENODEV;
+
+ return sprintf(buf, "%s\n", dev->ib_dev->name);
+}
+static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
+
+static ssize_t show_dev_abi_version(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct ib_uverbs_device *dev = dev_get_drvdata(device);
+
+ if (!dev)
+ return -ENODEV;
+
+ return sprintf(buf, "%d\n", dev->ib_dev->uverbs_abi_ver);
+}
+static DEVICE_ATTR(abi_version, S_IRUGO, show_dev_abi_version, NULL);
+
+static ssize_t show_abi_version(struct class *class, char *buf)
+{
+ return sprintf(buf, "%d\n", IB_USER_VERBS_ABI_VERSION);
+}
+static CLASS_ATTR(abi_version, S_IRUGO, show_abi_version, NULL);
+
+#include <linux/pci.h>
+
+static ssize_t
+show_dev_device(struct device *device, struct device_attribute *attr, char *buf)
+{
+ struct ib_uverbs_device *dev = dev_get_drvdata(device);
+
+ if (!dev)
+ return -ENODEV;
+
+ return sprintf(buf, "0x%04x\n",
+ ((struct pci_dev *)dev->ib_dev->dma_device)->device);
+}
+static DEVICE_ATTR(device, S_IRUGO, show_dev_device, NULL);
+
+static ssize_t
+show_dev_vendor(struct device *device, struct device_attribute *attr, char *buf)
+{
+ struct ib_uverbs_device *dev = dev_get_drvdata(device);
+
+ if (!dev)
+ return -ENODEV;
+
+ return sprintf(buf, "0x%04x\n",
+ ((struct pci_dev *)dev->ib_dev->dma_device)->vendor);
+}
+static DEVICE_ATTR(vendor, S_IRUGO, show_dev_vendor, NULL);
+
+struct attribute *device_attrs[] =
+{
+ &dev_attr_device.attr,
+ &dev_attr_vendor.attr,
+ NULL
+};
+
+static struct attribute_group device_group = {
+ .name = "device",
+ .attrs = device_attrs
+};
+
+static void ib_uverbs_add_one(struct ib_device *device)
+{
+ struct ib_uverbs_device *uverbs_dev;
+
+ if (!device->alloc_ucontext)
+ return;
+
+ uverbs_dev = kzalloc(sizeof *uverbs_dev, GFP_KERNEL);
+ if (!uverbs_dev)
+ return;
+
+ kref_init(&uverbs_dev->ref);
+ init_completion(&uverbs_dev->comp);
+
+ spin_lock(&map_lock);
+ uverbs_dev->devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES);
+ if (uverbs_dev->devnum >= IB_UVERBS_MAX_DEVICES) {
+ spin_unlock(&map_lock);
+ goto err;
+ }
+ set_bit(uverbs_dev->devnum, dev_map);
+ spin_unlock(&map_lock);
+
+ uverbs_dev->ib_dev = device;
+ uverbs_dev->num_comp_vectors = device->num_comp_vectors;
+
+ uverbs_dev->cdev = cdev_alloc();
+ if (!uverbs_dev->cdev)
+ goto err;
+ uverbs_dev->cdev->owner = THIS_MODULE;
+ uverbs_dev->cdev->ops = device->mmap ? &uverbs_mmap_fops : &uverbs_fops;
+ kobject_set_name(&uverbs_dev->cdev->kobj, "uverbs%d", uverbs_dev->devnum);
+ if (cdev_add(uverbs_dev->cdev, IB_UVERBS_BASE_DEV + uverbs_dev->devnum, 1))
+ goto err_cdev;
+
+ uverbs_dev->dev = device_create(uverbs_class, device->dma_device,
+ uverbs_dev->cdev->dev, uverbs_dev,
+ "uverbs%d", uverbs_dev->devnum);
+ if (IS_ERR(uverbs_dev->dev))
+ goto err_cdev;
+
+ if (device_create_file(uverbs_dev->dev, &dev_attr_ibdev))
+ goto err_class;
+ if (device_create_file(uverbs_dev->dev, &dev_attr_abi_version))
+ goto err_class;
+ if (sysfs_create_group(&uverbs_dev->dev->kobj, &device_group))
+ goto err_class;
+
+ spin_lock(&map_lock);
+ dev_table[uverbs_dev->devnum] = uverbs_dev;
+ spin_unlock(&map_lock);
+
+ ib_set_client_data(device, &uverbs_client, uverbs_dev);
+
+ return;
+
+err_class:
+ device_destroy(uverbs_class, uverbs_dev->cdev->dev);
+
+err_cdev:
+ cdev_del(uverbs_dev->cdev);
+ clear_bit(uverbs_dev->devnum, dev_map);
+
+err:
+ kref_put(&uverbs_dev->ref, ib_uverbs_release_dev);
+ wait_for_completion(&uverbs_dev->comp);
+ kfree(uverbs_dev);
+ return;
+}
+
+static void ib_uverbs_remove_one(struct ib_device *device)
+{
+ struct ib_uverbs_device *uverbs_dev = ib_get_client_data(device, &uverbs_client);
+
+ if (!uverbs_dev)
+ return;
+
+ sysfs_remove_group(&uverbs_dev->dev->kobj, &device_group);
+ dev_set_drvdata(uverbs_dev->dev, NULL);
+ device_destroy(uverbs_class, uverbs_dev->cdev->dev);
+ cdev_del(uverbs_dev->cdev);
+
+ spin_lock(&map_lock);
+ dev_table[uverbs_dev->devnum] = NULL;
+ spin_unlock(&map_lock);
+
+ clear_bit(uverbs_dev->devnum, dev_map);
+
+ kref_put(&uverbs_dev->ref, ib_uverbs_release_dev);
+ wait_for_completion(&uverbs_dev->comp);
+ kfree(uverbs_dev);
+}
+#ifdef __linux__
+static int uverbs_event_get_sb(struct file_system_type *fs_type, int flags,
+ const char *dev_name, void *data,
+ struct vfsmount *mnt)
+{
+ return get_sb_pseudo(fs_type, "infinibandevent:", NULL,
+ INFINIBANDEVENTFS_MAGIC, mnt);
+}
+
+static struct file_system_type uverbs_event_fs = {
+ /* No owner field so module can be unloaded */
+ .name = "infinibandeventfs",
+ .get_sb = uverbs_event_get_sb,
+ .kill_sb = kill_litter_super
+};
+#endif
+
+static int __init ib_uverbs_init(void)
+{
+ int ret;
+
+ spin_lock_init(&map_lock);
+
+ ret = register_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES,
+ "infiniband_verbs");
+ if (ret) {
+ printk(KERN_ERR "user_verbs: couldn't register device number\n");
+ goto out;
+ }
+
+ uverbs_class = class_create(THIS_MODULE, "infiniband_verbs");
+ if (IS_ERR(uverbs_class)) {
+ ret = PTR_ERR(uverbs_class);
+ printk(KERN_ERR "user_verbs: couldn't create class infiniband_verbs\n");
+ goto out_chrdev;
+ }
+
+ ret = class_create_file(uverbs_class, &class_attr_abi_version);
+ if (ret) {
+ printk(KERN_ERR "user_verbs: couldn't create abi_version attribute\n");
+ goto out_class;
+ }
+
+#ifdef __linux__
+ ret = register_filesystem(&uverbs_event_fs);
+ if (ret) {
+ printk(KERN_ERR "user_verbs: couldn't register infinibandeventfs\n");
+ goto out_class;
+ }
+
+ uverbs_event_mnt = kern_mount(&uverbs_event_fs);
+ if (IS_ERR(uverbs_event_mnt)) {
+ ret = PTR_ERR(uverbs_event_mnt);
+ printk(KERN_ERR "user_verbs: couldn't mount infinibandeventfs\n");
+ goto out_fs;
+ }
+#endif
+
+ ret = ib_register_client(&uverbs_client);
+ if (ret) {
+ printk(KERN_ERR "user_verbs: couldn't register client\n");
+ goto out_mnt;
+ }
+
+ return 0;
+
+out_mnt:
+#ifdef __linux__
+ mntput(uverbs_event_mnt);
+
+out_fs:
+ unregister_filesystem(&uverbs_event_fs);
+#endif
+
+out_class:
+ class_destroy(uverbs_class);
+
+out_chrdev:
+ unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES);
+
+out:
+ return ret;
+}
+
+static void __exit ib_uverbs_cleanup(void)
+{
+ ib_unregister_client(&uverbs_client);
+#ifdef __linux__
+ mntput(uverbs_event_mnt);
+ unregister_filesystem(&uverbs_event_fs);
+#endif
+ class_destroy(uverbs_class);
+ unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES);
+ idr_destroy(&ib_uverbs_pd_idr);
+ idr_destroy(&ib_uverbs_mr_idr);
+ idr_destroy(&ib_uverbs_mw_idr);
+ idr_destroy(&ib_uverbs_ah_idr);
+ idr_destroy(&ib_uverbs_cq_idr);
+ idr_destroy(&ib_uverbs_qp_idr);
+ idr_destroy(&ib_uverbs_srq_idr);
+}
+
+module_init(ib_uverbs_init);
+module_exit(ib_uverbs_cleanup);
diff --git a/sys/ofed/drivers/infiniband/core/uverbs_marshall.c b/sys/ofed/drivers/infiniband/core/uverbs_marshall.c
new file mode 100644
index 0000000..5440da0
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/uverbs_marshall.c
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_marshall.h>
+
+void ib_copy_ah_attr_to_user(struct ib_uverbs_ah_attr *dst,
+ struct ib_ah_attr *src)
+{
+ memcpy(dst->grh.dgid, src->grh.dgid.raw, sizeof src->grh.dgid);
+ dst->grh.flow_label = src->grh.flow_label;
+ dst->grh.sgid_index = src->grh.sgid_index;
+ dst->grh.hop_limit = src->grh.hop_limit;
+ dst->grh.traffic_class = src->grh.traffic_class;
+ dst->dlid = src->dlid;
+ dst->sl = src->sl;
+ dst->src_path_bits = src->src_path_bits;
+ dst->static_rate = src->static_rate;
+ dst->is_global = src->ah_flags & IB_AH_GRH ? 1 : 0;
+ dst->port_num = src->port_num;
+}
+EXPORT_SYMBOL(ib_copy_ah_attr_to_user);
+
+void ib_copy_qp_attr_to_user(struct ib_uverbs_qp_attr *dst,
+ struct ib_qp_attr *src)
+{
+ dst->cur_qp_state = src->cur_qp_state;
+ dst->path_mtu = src->path_mtu;
+ dst->path_mig_state = src->path_mig_state;
+ dst->qkey = src->qkey;
+ dst->rq_psn = src->rq_psn;
+ dst->sq_psn = src->sq_psn;
+ dst->dest_qp_num = src->dest_qp_num;
+ dst->qp_access_flags = src->qp_access_flags;
+
+ dst->max_send_wr = src->cap.max_send_wr;
+ dst->max_recv_wr = src->cap.max_recv_wr;
+ dst->max_send_sge = src->cap.max_send_sge;
+ dst->max_recv_sge = src->cap.max_recv_sge;
+ dst->max_inline_data = src->cap.max_inline_data;
+
+ ib_copy_ah_attr_to_user(&dst->ah_attr, &src->ah_attr);
+ ib_copy_ah_attr_to_user(&dst->alt_ah_attr, &src->alt_ah_attr);
+
+ dst->pkey_index = src->pkey_index;
+ dst->alt_pkey_index = src->alt_pkey_index;
+ dst->en_sqd_async_notify = src->en_sqd_async_notify;
+ dst->sq_draining = src->sq_draining;
+ dst->max_rd_atomic = src->max_rd_atomic;
+ dst->max_dest_rd_atomic = src->max_dest_rd_atomic;
+ dst->min_rnr_timer = src->min_rnr_timer;
+ dst->port_num = src->port_num;
+ dst->timeout = src->timeout;
+ dst->retry_cnt = src->retry_cnt;
+ dst->rnr_retry = src->rnr_retry;
+ dst->alt_port_num = src->alt_port_num;
+ dst->alt_timeout = src->alt_timeout;
+}
+EXPORT_SYMBOL(ib_copy_qp_attr_to_user);
+
+void ib_copy_path_rec_to_user(struct ib_user_path_rec *dst,
+ struct ib_sa_path_rec *src)
+{
+ memcpy(dst->dgid, src->dgid.raw, sizeof src->dgid);
+ memcpy(dst->sgid, src->sgid.raw, sizeof src->sgid);
+
+ dst->dlid = src->dlid;
+ dst->slid = src->slid;
+ dst->raw_traffic = src->raw_traffic;
+ dst->flow_label = src->flow_label;
+ dst->hop_limit = src->hop_limit;
+ dst->traffic_class = src->traffic_class;
+ dst->reversible = src->reversible;
+ dst->numb_path = src->numb_path;
+ dst->pkey = src->pkey;
+ dst->sl = src->sl;
+ dst->mtu_selector = src->mtu_selector;
+ dst->mtu = src->mtu;
+ dst->rate_selector = src->rate_selector;
+ dst->rate = src->rate;
+ dst->packet_life_time = src->packet_life_time;
+ dst->preference = src->preference;
+ dst->packet_life_time_selector = src->packet_life_time_selector;
+}
+EXPORT_SYMBOL(ib_copy_path_rec_to_user);
+
+void ib_copy_path_rec_from_user(struct ib_sa_path_rec *dst,
+ struct ib_user_path_rec *src)
+{
+ memcpy(dst->dgid.raw, src->dgid, sizeof dst->dgid);
+ memcpy(dst->sgid.raw, src->sgid, sizeof dst->sgid);
+
+ dst->dlid = src->dlid;
+ dst->slid = src->slid;
+ dst->raw_traffic = src->raw_traffic;
+ dst->flow_label = src->flow_label;
+ dst->hop_limit = src->hop_limit;
+ dst->traffic_class = src->traffic_class;
+ dst->reversible = src->reversible;
+ dst->numb_path = src->numb_path;
+ dst->pkey = src->pkey;
+ dst->sl = src->sl;
+ dst->mtu_selector = src->mtu_selector;
+ dst->mtu = src->mtu;
+ dst->rate_selector = src->rate_selector;
+ dst->rate = src->rate;
+ dst->packet_life_time = src->packet_life_time;
+ dst->preference = src->preference;
+ dst->packet_life_time_selector = src->packet_life_time_selector;
+}
+EXPORT_SYMBOL(ib_copy_path_rec_from_user);
diff --git a/sys/ofed/drivers/infiniband/core/verbs.c b/sys/ofed/drivers/infiniband/core/verbs.c
new file mode 100644
index 0000000..90bdeaa
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/core/verbs.c
@@ -0,0 +1,1073 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/string.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_cache.h>
+
+int ib_rate_to_mult(enum ib_rate rate)
+{
+ switch (rate) {
+ case IB_RATE_2_5_GBPS: return 1;
+ case IB_RATE_5_GBPS: return 2;
+ case IB_RATE_10_GBPS: return 4;
+ case IB_RATE_20_GBPS: return 8;
+ case IB_RATE_30_GBPS: return 12;
+ case IB_RATE_40_GBPS: return 16;
+ case IB_RATE_60_GBPS: return 24;
+ case IB_RATE_80_GBPS: return 32;
+ case IB_RATE_120_GBPS: return 48;
+ default: return -1;
+ }
+}
+EXPORT_SYMBOL(ib_rate_to_mult);
+
+enum ib_rate mult_to_ib_rate(int mult)
+{
+ switch (mult) {
+ case 1: return IB_RATE_2_5_GBPS;
+ case 2: return IB_RATE_5_GBPS;
+ case 4: return IB_RATE_10_GBPS;
+ case 8: return IB_RATE_20_GBPS;
+ case 12: return IB_RATE_30_GBPS;
+ case 16: return IB_RATE_40_GBPS;
+ case 24: return IB_RATE_60_GBPS;
+ case 32: return IB_RATE_80_GBPS;
+ case 48: return IB_RATE_120_GBPS;
+ default: return IB_RATE_PORT_CURRENT;
+ }
+}
+EXPORT_SYMBOL(mult_to_ib_rate);
+
+enum rdma_transport_type
+rdma_node_get_transport(enum rdma_node_type node_type)
+{
+ switch (node_type) {
+ case RDMA_NODE_IB_CA:
+ case RDMA_NODE_IB_SWITCH:
+ case RDMA_NODE_IB_ROUTER:
+ return RDMA_TRANSPORT_IB;
+ case RDMA_NODE_RNIC:
+ return RDMA_TRANSPORT_IWARP;
+ default:
+ BUG();
+ return 0;
+ }
+}
+EXPORT_SYMBOL(rdma_node_get_transport);
+
+enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_num)
+{
+ if (device->get_link_layer)
+ return device->get_link_layer(device, port_num);
+
+ switch (rdma_node_get_transport(device->node_type)) {
+ case RDMA_TRANSPORT_IB:
+ return IB_LINK_LAYER_INFINIBAND;
+ case RDMA_TRANSPORT_IWARP:
+ return IB_LINK_LAYER_ETHERNET;
+ default:
+ return IB_LINK_LAYER_UNSPECIFIED;
+ }
+}
+EXPORT_SYMBOL(rdma_port_get_link_layer);
+
+/* Protection domains */
+
+struct ib_pd *ib_alloc_pd(struct ib_device *device)
+{
+ struct ib_pd *pd;
+
+ pd = device->alloc_pd(device, NULL, NULL);
+
+ if (!IS_ERR(pd)) {
+ pd->device = device;
+ pd->uobject = NULL;
+ atomic_set(&pd->usecnt, 0);
+ }
+
+ return pd;
+}
+EXPORT_SYMBOL(ib_alloc_pd);
+
+int ib_dealloc_pd(struct ib_pd *pd)
+{
+ if (atomic_read(&pd->usecnt))
+ return -EBUSY;
+
+ return pd->device->dealloc_pd(pd);
+}
+EXPORT_SYMBOL(ib_dealloc_pd);
+
+/* Address handles */
+
+struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
+{
+ struct ib_ah *ah;
+
+ ah = pd->device->create_ah(pd, ah_attr);
+
+ if (!IS_ERR(ah)) {
+ ah->device = pd->device;
+ ah->pd = pd;
+ ah->uobject = NULL;
+ atomic_inc(&pd->usecnt);
+ }
+
+ return ah;
+}
+EXPORT_SYMBOL(ib_create_ah);
+
+int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc,
+ struct ib_grh *grh, struct ib_ah_attr *ah_attr)
+{
+ u32 flow_class;
+ u16 gid_index;
+ int ret;
+
+ memset(ah_attr, 0, sizeof *ah_attr);
+ ah_attr->dlid = wc->slid;
+ ah_attr->sl = wc->sl;
+ ah_attr->src_path_bits = wc->dlid_path_bits;
+ ah_attr->port_num = port_num;
+
+ if (wc->wc_flags & IB_WC_GRH) {
+ ah_attr->ah_flags = IB_AH_GRH;
+ ah_attr->grh.dgid = grh->sgid;
+
+ ret = ib_find_cached_gid(device, &grh->dgid, &port_num,
+ &gid_index);
+ if (ret)
+ return ret;
+
+ ah_attr->grh.sgid_index = (u8) gid_index;
+ flow_class = be32_to_cpu(grh->version_tclass_flow);
+ ah_attr->grh.flow_label = flow_class & 0xFFFFF;
+ ah_attr->grh.hop_limit = 0xFF;
+ ah_attr->grh.traffic_class = (flow_class >> 20) & 0xFF;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(ib_init_ah_from_wc);
+
+struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, struct ib_wc *wc,
+ struct ib_grh *grh, u8 port_num)
+{
+ struct ib_ah_attr ah_attr;
+ int ret;
+
+ ret = ib_init_ah_from_wc(pd->device, port_num, wc, grh, &ah_attr);
+ if (ret)
+ return ERR_PTR(ret);
+
+ return ib_create_ah(pd, &ah_attr);
+}
+EXPORT_SYMBOL(ib_create_ah_from_wc);
+
+int ib_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr)
+{
+ return ah->device->modify_ah ?
+ ah->device->modify_ah(ah, ah_attr) :
+ -ENOSYS;
+}
+EXPORT_SYMBOL(ib_modify_ah);
+
+int ib_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr)
+{
+ return ah->device->query_ah ?
+ ah->device->query_ah(ah, ah_attr) :
+ -ENOSYS;
+}
+EXPORT_SYMBOL(ib_query_ah);
+
+int ib_destroy_ah(struct ib_ah *ah)
+{
+ struct ib_pd *pd;
+ int ret;
+
+ pd = ah->pd;
+ ret = ah->device->destroy_ah(ah);
+ if (!ret)
+ atomic_dec(&pd->usecnt);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_destroy_ah);
+
+/* Shared receive queues */
+
+struct ib_srq *ib_create_srq(struct ib_pd *pd,
+ struct ib_srq_init_attr *srq_init_attr)
+{
+ struct ib_srq *srq;
+
+ if (!pd->device->create_srq)
+ return ERR_PTR(-ENOSYS);
+
+ srq = pd->device->create_srq(pd, srq_init_attr, NULL);
+
+ if (!IS_ERR(srq)) {
+ srq->device = pd->device;
+ srq->pd = pd;
+ srq->uobject = NULL;
+ srq->event_handler = srq_init_attr->event_handler;
+ srq->srq_context = srq_init_attr->srq_context;
+ srq->xrc_cq = NULL;
+ srq->xrcd = NULL;
+ atomic_inc(&pd->usecnt);
+ atomic_set(&srq->usecnt, 0);
+ }
+
+ return srq;
+}
+EXPORT_SYMBOL(ib_create_srq);
+
+struct ib_srq *ib_create_xrc_srq(struct ib_pd *pd,
+ struct ib_cq *xrc_cq,
+ struct ib_xrcd *xrcd,
+ struct ib_srq_init_attr *srq_init_attr)
+{
+ struct ib_srq *srq;
+
+ if (!pd->device->create_xrc_srq)
+ return ERR_PTR(-ENOSYS);
+
+ srq = pd->device->create_xrc_srq(pd, xrc_cq, xrcd, srq_init_attr, NULL);
+
+ if (!IS_ERR(srq)) {
+ srq->device = pd->device;
+ srq->pd = pd;
+ srq->uobject = NULL;
+ srq->event_handler = srq_init_attr->event_handler;
+ srq->srq_context = srq_init_attr->srq_context;
+ srq->xrc_cq = xrc_cq;
+ srq->xrcd = xrcd;
+ atomic_inc(&pd->usecnt);
+ atomic_inc(&xrcd->usecnt);
+ atomic_inc(&xrc_cq->usecnt);
+ atomic_set(&srq->usecnt, 0);
+ }
+
+ return srq;
+}
+EXPORT_SYMBOL(ib_create_xrc_srq);
+
+int ib_modify_srq(struct ib_srq *srq,
+ struct ib_srq_attr *srq_attr,
+ enum ib_srq_attr_mask srq_attr_mask)
+{
+ return srq->device->modify_srq ?
+ srq->device->modify_srq(srq, srq_attr, srq_attr_mask, NULL) :
+ -ENOSYS;
+}
+EXPORT_SYMBOL(ib_modify_srq);
+
+int ib_query_srq(struct ib_srq *srq,
+ struct ib_srq_attr *srq_attr)
+{
+ return srq->device->query_srq ?
+ srq->device->query_srq(srq, srq_attr) : -ENOSYS;
+}
+EXPORT_SYMBOL(ib_query_srq);
+
+int ib_destroy_srq(struct ib_srq *srq)
+{
+ struct ib_pd *pd;
+ struct ib_cq *xrc_cq;
+ struct ib_xrcd *xrcd;
+ int ret;
+
+ if (atomic_read(&srq->usecnt))
+ return -EBUSY;
+
+ pd = srq->pd;
+ xrc_cq = srq->xrc_cq;
+ xrcd = srq->xrcd;
+
+ ret = srq->device->destroy_srq(srq);
+ if (!ret) {
+ atomic_dec(&pd->usecnt);
+ if (xrc_cq)
+ atomic_dec(&xrc_cq->usecnt);
+ if (xrcd)
+ atomic_dec(&xrcd->usecnt);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_destroy_srq);
+
+/* Queue pairs */
+
+struct ib_qp *ib_create_qp(struct ib_pd *pd,
+ struct ib_qp_init_attr *qp_init_attr)
+{
+ struct ib_qp *qp;
+
+ qp = pd->device->create_qp(pd, qp_init_attr, NULL);
+
+ if (!IS_ERR(qp)) {
+ qp->device = pd->device;
+ qp->pd = pd;
+ qp->send_cq = qp_init_attr->send_cq;
+ qp->recv_cq = qp_init_attr->recv_cq;
+ qp->srq = qp_init_attr->srq;
+ qp->uobject = NULL;
+ qp->event_handler = qp_init_attr->event_handler;
+ qp->qp_context = qp_init_attr->qp_context;
+ qp->qp_type = qp_init_attr->qp_type;
+ qp->xrcd = qp->qp_type == IB_QPT_XRC ?
+ qp_init_attr->xrc_domain : NULL;
+ atomic_inc(&pd->usecnt);
+ atomic_inc(&qp_init_attr->send_cq->usecnt);
+ atomic_inc(&qp_init_attr->recv_cq->usecnt);
+ if (qp_init_attr->srq)
+ atomic_inc(&qp_init_attr->srq->usecnt);
+ if (qp->qp_type == IB_QPT_XRC)
+ atomic_inc(&qp->xrcd->usecnt);
+ }
+
+ return qp;
+}
+EXPORT_SYMBOL(ib_create_qp);
+
+static const struct {
+ int valid;
+ enum ib_qp_attr_mask req_param[IB_QPT_RAW_ETH + 1];
+ enum ib_qp_attr_mask opt_param[IB_QPT_RAW_ETH + 1];
+} qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = {
+ [IB_QPS_RESET] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_INIT] = {
+ .valid = 1,
+ .req_param = {
+ [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_QKEY),
+ [IB_QPT_RAW_ETH] = IB_QP_PORT,
+ [IB_QPT_UC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_RC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_XRC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ }
+ },
+ },
+ [IB_QPS_INIT] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_INIT] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_RC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_XRC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ }
+ },
+ [IB_QPS_RTR] = {
+ .valid = 1,
+ .req_param = {
+ [IB_QPT_UC] = (IB_QP_AV |
+ IB_QP_PATH_MTU |
+ IB_QP_DEST_QPN |
+ IB_QP_RQ_PSN),
+ [IB_QPT_RC] = (IB_QP_AV |
+ IB_QP_PATH_MTU |
+ IB_QP_DEST_QPN |
+ IB_QP_RQ_PSN |
+ IB_QP_MAX_DEST_RD_ATOMIC |
+ IB_QP_MIN_RNR_TIMER),
+ [IB_QPT_XRC] = (IB_QP_AV |
+ IB_QP_PATH_MTU |
+ IB_QP_DEST_QPN |
+ IB_QP_RQ_PSN |
+ IB_QP_MAX_DEST_RD_ATOMIC |
+ IB_QP_MIN_RNR_TIMER),
+ },
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX),
+ [IB_QPT_RC] = (IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX),
+ [IB_QPT_XRC] = (IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX),
+ [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ }
+ }
+ },
+ [IB_QPS_RTR] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_RTS] = {
+ .valid = 1,
+ .req_param = {
+ [IB_QPT_UD] = IB_QP_SQ_PSN,
+ [IB_QPT_UC] = IB_QP_SQ_PSN,
+ [IB_QPT_RC] = (IB_QP_TIMEOUT |
+ IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY |
+ IB_QP_SQ_PSN |
+ IB_QP_MAX_QP_RD_ATOMIC),
+ [IB_QPT_XRC] = (IB_QP_TIMEOUT |
+ IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY |
+ IB_QP_SQ_PSN |
+ IB_QP_MAX_QP_RD_ATOMIC),
+ [IB_QPT_SMI] = IB_QP_SQ_PSN,
+ [IB_QPT_GSI] = IB_QP_SQ_PSN,
+ },
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_RC] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_SMI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ }
+ }
+ },
+ [IB_QPS_RTS] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_RTS] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_CUR_STATE |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_ALT_PATH |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_RC] = (IB_QP_CUR_STATE |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_ALT_PATH |
+ IB_QP_PATH_MIG_STATE |
+ IB_QP_MIN_RNR_TIMER),
+ [IB_QPT_XRC] = (IB_QP_CUR_STATE |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_ALT_PATH |
+ IB_QP_PATH_MIG_STATE |
+ IB_QP_MIN_RNR_TIMER),
+ [IB_QPT_SMI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ }
+ },
+ [IB_QPS_SQD] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+ [IB_QPT_UC] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+ [IB_QPT_RC] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+ [IB_QPT_XRC] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+ [IB_QPT_SMI] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+ [IB_QPT_GSI] = IB_QP_EN_SQD_ASYNC_NOTIFY
+ }
+ },
+ },
+ [IB_QPS_SQD] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_RTS] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_RC] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_SMI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ }
+ },
+ [IB_QPS_SQD] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_AV |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_RC] = (IB_QP_PORT |
+ IB_QP_AV |
+ IB_QP_TIMEOUT |
+ IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY |
+ IB_QP_MAX_QP_RD_ATOMIC |
+ IB_QP_MAX_DEST_RD_ATOMIC |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC] = (IB_QP_PORT |
+ IB_QP_AV |
+ IB_QP_TIMEOUT |
+ IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY |
+ IB_QP_MAX_QP_RD_ATOMIC |
+ IB_QP_MAX_DEST_RD_ATOMIC |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ }
+ }
+ },
+ [IB_QPS_SQE] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_RTS] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_CUR_STATE |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_SMI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ }
+ }
+ },
+ [IB_QPS_ERR] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 }
+ }
+};
+
+int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state,
+ enum ib_qp_type type, enum ib_qp_attr_mask mask)
+{
+ enum ib_qp_attr_mask req_param, opt_param;
+
+ if (cur_state < 0 || cur_state > IB_QPS_ERR ||
+ next_state < 0 || next_state > IB_QPS_ERR)
+ return 0;
+
+ if (mask & IB_QP_CUR_STATE &&
+ cur_state != IB_QPS_RTR && cur_state != IB_QPS_RTS &&
+ cur_state != IB_QPS_SQD && cur_state != IB_QPS_SQE)
+ return 0;
+
+ if (!qp_state_table[cur_state][next_state].valid)
+ return 0;
+
+ req_param = qp_state_table[cur_state][next_state].req_param[type];
+ opt_param = qp_state_table[cur_state][next_state].opt_param[type];
+
+ if ((mask & req_param) != req_param)
+ return 0;
+
+ if (mask & ~(req_param | opt_param | IB_QP_STATE))
+ return 0;
+
+ return 1;
+}
+EXPORT_SYMBOL(ib_modify_qp_is_ok);
+
+int ib_modify_qp(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr,
+ int qp_attr_mask)
+{
+ return qp->device->modify_qp(qp, qp_attr, qp_attr_mask, NULL);
+}
+EXPORT_SYMBOL(ib_modify_qp);
+
+int ib_query_qp(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr,
+ int qp_attr_mask,
+ struct ib_qp_init_attr *qp_init_attr)
+{
+ return qp->device->query_qp ?
+ qp->device->query_qp(qp, qp_attr, qp_attr_mask, qp_init_attr) :
+ -ENOSYS;
+}
+EXPORT_SYMBOL(ib_query_qp);
+
+int ib_destroy_qp(struct ib_qp *qp)
+{
+ struct ib_pd *pd;
+ struct ib_cq *scq, *rcq;
+ struct ib_srq *srq;
+ struct ib_xrcd *xrcd;
+ enum ib_qp_type qp_type = qp->qp_type;
+ int ret;
+
+ pd = qp->pd;
+ scq = qp->send_cq;
+ rcq = qp->recv_cq;
+ srq = qp->srq;
+ xrcd = qp->xrcd;
+
+ ret = qp->device->destroy_qp(qp);
+ if (!ret) {
+ atomic_dec(&pd->usecnt);
+ atomic_dec(&scq->usecnt);
+ atomic_dec(&rcq->usecnt);
+ if (srq)
+ atomic_dec(&srq->usecnt);
+ if (qp_type == IB_QPT_XRC)
+ atomic_dec(&xrcd->usecnt);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_destroy_qp);
+
+/* Completion queues */
+
+struct ib_cq *ib_create_cq(struct ib_device *device,
+ ib_comp_handler comp_handler,
+ void (*event_handler)(struct ib_event *, void *),
+ void *cq_context, int cqe, int comp_vector)
+{
+ struct ib_cq *cq;
+
+ cq = device->create_cq(device, cqe, comp_vector, NULL, NULL);
+
+ if (!IS_ERR(cq)) {
+ cq->device = device;
+ cq->uobject = NULL;
+ cq->comp_handler = comp_handler;
+ cq->event_handler = event_handler;
+ cq->cq_context = cq_context;
+ atomic_set(&cq->usecnt, 0);
+ }
+
+ return cq;
+}
+EXPORT_SYMBOL(ib_create_cq);
+
+int ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period)
+{
+ return cq->device->modify_cq ?
+ cq->device->modify_cq(cq, cq_count, cq_period) : -ENOSYS;
+}
+EXPORT_SYMBOL(ib_modify_cq);
+
+int ib_destroy_cq(struct ib_cq *cq)
+{
+ if (atomic_read(&cq->usecnt))
+ return -EBUSY;
+
+ return cq->device->destroy_cq(cq);
+}
+EXPORT_SYMBOL(ib_destroy_cq);
+
+int ib_resize_cq(struct ib_cq *cq, int cqe)
+{
+ return cq->device->resize_cq ?
+ cq->device->resize_cq(cq, cqe, NULL) : -ENOSYS;
+}
+EXPORT_SYMBOL(ib_resize_cq);
+
+/* Memory regions */
+
+struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags)
+{
+ struct ib_mr *mr;
+
+ mr = pd->device->get_dma_mr(pd, mr_access_flags);
+
+ if (!IS_ERR(mr)) {
+ mr->device = pd->device;
+ mr->pd = pd;
+ mr->uobject = NULL;
+ atomic_inc(&pd->usecnt);
+ atomic_set(&mr->usecnt, 0);
+ }
+
+ return mr;
+}
+EXPORT_SYMBOL(ib_get_dma_mr);
+
+struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd,
+ struct ib_phys_buf *phys_buf_array,
+ int num_phys_buf,
+ int mr_access_flags,
+ u64 *iova_start)
+{
+ struct ib_mr *mr;
+
+ if (!pd->device->reg_phys_mr)
+ return ERR_PTR(-ENOSYS);
+
+ mr = pd->device->reg_phys_mr(pd, phys_buf_array, num_phys_buf,
+ mr_access_flags, iova_start);
+
+ if (!IS_ERR(mr)) {
+ mr->device = pd->device;
+ mr->pd = pd;
+ mr->uobject = NULL;
+ atomic_inc(&pd->usecnt);
+ atomic_set(&mr->usecnt, 0);
+ }
+
+ return mr;
+}
+EXPORT_SYMBOL(ib_reg_phys_mr);
+
+int ib_rereg_phys_mr(struct ib_mr *mr,
+ int mr_rereg_mask,
+ struct ib_pd *pd,
+ struct ib_phys_buf *phys_buf_array,
+ int num_phys_buf,
+ int mr_access_flags,
+ u64 *iova_start)
+{
+ struct ib_pd *old_pd;
+ int ret;
+
+ if (!mr->device->rereg_phys_mr)
+ return -ENOSYS;
+
+ if (atomic_read(&mr->usecnt))
+ return -EBUSY;
+
+ old_pd = mr->pd;
+
+ ret = mr->device->rereg_phys_mr(mr, mr_rereg_mask, pd,
+ phys_buf_array, num_phys_buf,
+ mr_access_flags, iova_start);
+
+ if (!ret && (mr_rereg_mask & IB_MR_REREG_PD)) {
+ atomic_dec(&old_pd->usecnt);
+ atomic_inc(&pd->usecnt);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_rereg_phys_mr);
+
+int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr)
+{
+ return mr->device->query_mr ?
+ mr->device->query_mr(mr, mr_attr) : -ENOSYS;
+}
+EXPORT_SYMBOL(ib_query_mr);
+
+int ib_dereg_mr(struct ib_mr *mr)
+{
+ struct ib_pd *pd;
+ int ret;
+
+ if (atomic_read(&mr->usecnt))
+ return -EBUSY;
+
+ pd = mr->pd;
+ ret = mr->device->dereg_mr(mr);
+ if (!ret)
+ atomic_dec(&pd->usecnt);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_dereg_mr);
+
+struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len)
+{
+ struct ib_mr *mr;
+
+ if (!pd->device->alloc_fast_reg_mr)
+ return ERR_PTR(-ENOSYS);
+
+ mr = pd->device->alloc_fast_reg_mr(pd, max_page_list_len);
+
+ if (!IS_ERR(mr)) {
+ mr->device = pd->device;
+ mr->pd = pd;
+ mr->uobject = NULL;
+ atomic_inc(&pd->usecnt);
+ atomic_set(&mr->usecnt, 0);
+ }
+
+ return mr;
+}
+EXPORT_SYMBOL(ib_alloc_fast_reg_mr);
+
+struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list(struct ib_device *device,
+ int max_page_list_len)
+{
+ struct ib_fast_reg_page_list *page_list;
+
+ if (!device->alloc_fast_reg_page_list)
+ return ERR_PTR(-ENOSYS);
+
+ page_list = device->alloc_fast_reg_page_list(device, max_page_list_len);
+
+ if (!IS_ERR(page_list)) {
+ page_list->device = device;
+ page_list->max_page_list_len = max_page_list_len;
+ }
+
+ return page_list;
+}
+EXPORT_SYMBOL(ib_alloc_fast_reg_page_list);
+
+void ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list)
+{
+ page_list->device->free_fast_reg_page_list(page_list);
+}
+EXPORT_SYMBOL(ib_free_fast_reg_page_list);
+
+/* Memory windows */
+
+struct ib_mw *ib_alloc_mw(struct ib_pd *pd)
+{
+ struct ib_mw *mw;
+
+ if (!pd->device->alloc_mw)
+ return ERR_PTR(-ENOSYS);
+
+ mw = pd->device->alloc_mw(pd);
+ if (!IS_ERR(mw)) {
+ mw->device = pd->device;
+ mw->pd = pd;
+ mw->uobject = NULL;
+ atomic_inc(&pd->usecnt);
+ }
+
+ return mw;
+}
+EXPORT_SYMBOL(ib_alloc_mw);
+
+int ib_dealloc_mw(struct ib_mw *mw)
+{
+ struct ib_pd *pd;
+ int ret;
+
+ pd = mw->pd;
+ ret = mw->device->dealloc_mw(mw);
+ if (!ret)
+ atomic_dec(&pd->usecnt);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_dealloc_mw);
+
+/* "Fast" memory regions */
+
+struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd,
+ int mr_access_flags,
+ struct ib_fmr_attr *fmr_attr)
+{
+ struct ib_fmr *fmr;
+
+ if (!pd->device->alloc_fmr)
+ return ERR_PTR(-ENOSYS);
+
+ fmr = pd->device->alloc_fmr(pd, mr_access_flags, fmr_attr);
+ if (!IS_ERR(fmr)) {
+ fmr->device = pd->device;
+ fmr->pd = pd;
+ atomic_inc(&pd->usecnt);
+ }
+
+ return fmr;
+}
+EXPORT_SYMBOL(ib_alloc_fmr);
+
+int ib_unmap_fmr(struct list_head *fmr_list)
+{
+ struct ib_fmr *fmr;
+
+ if (list_empty(fmr_list))
+ return 0;
+
+ fmr = list_entry(fmr_list->next, struct ib_fmr, list);
+ return fmr->device->unmap_fmr(fmr_list);
+}
+EXPORT_SYMBOL(ib_unmap_fmr);
+
+int ib_dealloc_fmr(struct ib_fmr *fmr)
+{
+ struct ib_pd *pd;
+ int ret;
+
+ pd = fmr->pd;
+ ret = fmr->device->dealloc_fmr(fmr);
+ if (!ret)
+ atomic_dec(&pd->usecnt);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_dealloc_fmr);
+
+/* Multicast groups */
+
+int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
+{
+ if (!qp->device->attach_mcast)
+ return -ENOSYS;
+
+ switch (rdma_node_get_transport(qp->device->node_type)) {
+ case RDMA_TRANSPORT_IB:
+ if (qp->qp_type == IB_QPT_RAW_ETH) {
+ /* In raw Etherent mgids the 63 msb's should be 0 */
+ if (gid->global.subnet_prefix & cpu_to_be64(~1ULL))
+ return -EINVAL;
+ } else if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD)
+ return -EINVAL;
+ break;
+ case RDMA_TRANSPORT_IWARP:
+ if (qp->qp_type != IB_QPT_RAW_ETH)
+ return -EINVAL;
+ break;
+ }
+ return qp->device->attach_mcast(qp, gid, lid);
+}
+EXPORT_SYMBOL(ib_attach_mcast);
+
+int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
+{
+ if (!qp->device->detach_mcast)
+ return -ENOSYS;
+
+ switch (rdma_node_get_transport(qp->device->node_type)) {
+ case RDMA_TRANSPORT_IB:
+ if (qp->qp_type == IB_QPT_RAW_ETH) {
+ /* In raw Etherent mgids the 63 msb's should be 0 */
+ if (gid->global.subnet_prefix & cpu_to_be64(~1ULL))
+ return -EINVAL;
+ } else if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD)
+ return -EINVAL;
+ break;
+ case RDMA_TRANSPORT_IWARP:
+ if (qp->qp_type != IB_QPT_RAW_ETH)
+ return -EINVAL;
+ break;
+ }
+ return qp->device->detach_mcast(qp, gid, lid);
+}
+EXPORT_SYMBOL(ib_detach_mcast);
+
+int ib_dealloc_xrcd(struct ib_xrcd *xrcd)
+{
+ if (atomic_read(&xrcd->usecnt))
+ return -EBUSY;
+
+ return xrcd->device->dealloc_xrcd(xrcd);
+}
+EXPORT_SYMBOL(ib_dealloc_xrcd);
+
+struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device)
+{
+ struct ib_xrcd *xrcd;
+
+ if (!device->alloc_xrcd)
+ return ERR_PTR(-ENOSYS);
+
+ xrcd = device->alloc_xrcd(device, NULL, NULL);
+ if (!IS_ERR(xrcd)) {
+ xrcd->device = device;
+ xrcd->inode = NULL;
+ xrcd->uobject = NULL;
+ atomic_set(&xrcd->usecnt, 0);
+ }
+ return xrcd;
+}
+EXPORT_SYMBOL(ib_alloc_xrcd);
+
diff --git a/sys/ofed/drivers/infiniband/debug/Makefile b/sys/ofed/drivers/infiniband/debug/Makefile
new file mode 100644
index 0000000..e9d9f4b
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/debug/Makefile
@@ -0,0 +1,3 @@
+EXTRA_CFLAGS := $(subst $(KERNEL_MEMTRACK_CFLAGS),,$(EXTRA_CFLAGS))
+
+obj-m += memtrack.o
diff --git a/sys/ofed/drivers/infiniband/debug/memtrack.c b/sys/ofed/drivers/infiniband/debug/memtrack.c
new file mode 100644
index 0000000..199b33b
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/debug/memtrack.c
@@ -0,0 +1,600 @@
+/*
+ This software is available to you under a choice of one of two
+ licenses. You may choose to be licensed under the terms of the GNU
+ General Public License (GPL) Version 2, available at
+ <http://www.fsf.org/copyleft/gpl.html>, or the OpenIB.org BSD
+ license, available in the LICENSE.TXT file accompanying this
+ software. These details are also available at
+ <http://openib.org/license.html>.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+
+ Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
+*/
+
+#define C_MEMTRACK_C
+
+#ifdef kmalloc
+ #undef kmalloc
+#endif
+#ifdef kfree
+ #undef kfree
+#endif
+#ifdef vmalloc
+ #undef vmalloc
+#endif
+#ifdef vfree
+ #undef vfree
+#endif
+#ifdef kmem_cache_alloc
+ #undef kmem_cache_alloc
+#endif
+#ifdef kmem_cache_free
+ #undef kmem_cache_free
+#endif
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/vmalloc.h>
+#include <linux/version.h>
+#include <asm/uaccess.h>
+#include <linux/proc_fs.h>
+#include <memtrack.h>
+
+#include <linux/moduleparam.h>
+
+
+MODULE_AUTHOR("Mellanox Technologies LTD.");
+MODULE_DESCRIPTION("Memory allocations tracking");
+MODULE_LICENSE("GPL");
+
+#define MEMTRACK_HASH_SZ ((1<<15)-19) /* prime: http://www.utm.edu/research/primes/lists/2small/0bit.html */
+#define MAX_FILENAME_LEN 31
+
+#define memtrack_spin_lock(spl, flags) spin_lock_irqsave(spl, flags)
+#define memtrack_spin_unlock(spl, flags) spin_unlock_irqrestore(spl, flags)
+
+/* if a bit is set then the corresponding allocation is tracked.
+ bit0 corresponds to MEMTRACK_KMALLOC, bit1 corresponds to MEMTRACK_VMALLOC etc. */
+static unsigned long track_mask = -1; /* effectively everything */
+module_param(track_mask, ulong, 0444);
+MODULE_PARM_DESC(track_mask, "bitmask definenig what is tracked");
+
+/* if a bit is set then the corresponding allocation is strictly tracked.
+ That is, before inserting the whole range is checked to not overlap any
+ of the allocations already in the database */
+static unsigned long strict_track_mask = 0; /* no strict tracking */
+module_param(strict_track_mask, ulong, 0444);
+MODULE_PARM_DESC(strict_track_mask, "bitmask which allocation requires strict tracking");
+
+typedef struct memtrack_meminfo_st {
+ unsigned long addr;
+ unsigned long size;
+ unsigned long line_num;
+ struct memtrack_meminfo_st *next;
+ struct list_head list; /* used to link all items from a certain type together */
+ char filename[MAX_FILENAME_LEN + 1]; /* putting the char array last is better for struct. packing */
+} memtrack_meminfo_t;
+
+static struct kmem_cache *meminfo_cache;
+
+typedef struct {
+ memtrack_meminfo_t *mem_hash[MEMTRACK_HASH_SZ];
+ spinlock_t hash_lock;
+ unsigned long count; /* size of memory tracked (*malloc) or number of objects tracked */
+ struct list_head tracked_objs_head; /* head of list of all objects */
+ int strict_track; /* if 1 then for each object inserted check if it overlaps any of the objects already in the list */
+} tracked_obj_desc_t;
+
+static tracked_obj_desc_t *tracked_objs_arr[MEMTRACK_NUM_OF_MEMTYPES];
+
+static const char *rsc_names[MEMTRACK_NUM_OF_MEMTYPES] = {
+ "kmalloc",
+ "vmalloc",
+ "kmem_cache_alloc"
+};
+
+
+static const char *rsc_free_names[MEMTRACK_NUM_OF_MEMTYPES] = {
+ "kfree",
+ "vfree",
+ "kmem_cache_free"
+};
+
+
+static inline const char *memtype_alloc_str(memtrack_memtype_t memtype)
+{
+ switch (memtype) {
+ case MEMTRACK_KMALLOC:
+ case MEMTRACK_VMALLOC:
+ case MEMTRACK_KMEM_OBJ:
+ return rsc_names[memtype];
+ default:
+ return "(Unknown allocation type)";
+ }
+}
+
+static inline const char *memtype_free_str(memtrack_memtype_t memtype)
+{
+ switch (memtype) {
+ case MEMTRACK_KMALLOC:
+ case MEMTRACK_VMALLOC:
+ case MEMTRACK_KMEM_OBJ:
+ return rsc_free_names[memtype];
+ default:
+ return "(Unknown allocation type)";
+ }
+}
+
+/*
+ * overlap_a_b
+ */
+static int overlap_a_b(unsigned long a_start, unsigned long a_end,
+ unsigned long b_start, unsigned long b_end)
+{
+ if ((b_start > a_end) || (a_start > b_end)) {
+ return 0;
+ }
+ return 1;
+}
+
+/*
+ * check_overlap
+ */
+static void check_overlap(memtrack_memtype_t memtype,
+ memtrack_meminfo_t * mem_info_p,
+ tracked_obj_desc_t * obj_desc_p)
+{
+ struct list_head *pos, *next;
+ memtrack_meminfo_t *cur;
+ unsigned long start_a, end_a, start_b, end_b;
+
+ list_for_each_safe(pos, next, &obj_desc_p->tracked_objs_head) {
+ cur = list_entry(pos, memtrack_meminfo_t, list);
+
+ start_a = mem_info_p->addr;
+ end_a = mem_info_p->addr + mem_info_p->size - 1;
+ start_b = cur->addr;
+ end_b = cur->addr + cur->size - 1;
+
+ if (overlap_a_b(start_a, end_a, start_b, end_b)) {
+ printk
+ ("%s overlaps! new_start=0x%lx, new_end=0x%lx, item_start=0x%lx, item_end=0x%lx\n",
+ memtype_alloc_str(memtype), mem_info_p->addr,
+ mem_info_p->addr + mem_info_p->size - 1, cur->addr,
+ cur->addr + cur->size - 1);
+ }
+ }
+}
+
+/* Invoke on memory allocation */
+void memtrack_alloc(memtrack_memtype_t memtype, unsigned long addr,
+ unsigned long size, const char *filename,
+ const unsigned long line_num, int alloc_flags)
+{
+ unsigned long hash_val;
+ memtrack_meminfo_t *cur_mem_info_p, *new_mem_info_p;
+ tracked_obj_desc_t *obj_desc_p;
+ unsigned long flags;
+
+ if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) {
+ printk("%s: Invalid memory type (%d)\n", __func__, memtype);
+ return;
+ }
+
+ if (!tracked_objs_arr[memtype]) {
+ /* object is not tracked */
+ return;
+ }
+ obj_desc_p = tracked_objs_arr[memtype];
+
+ hash_val = addr % MEMTRACK_HASH_SZ;
+
+ new_mem_info_p = (memtrack_meminfo_t *)
+ kmem_cache_alloc(meminfo_cache, alloc_flags);
+ if (new_mem_info_p == NULL) {
+ printk
+ ("%s: Failed allocating kmem_cache item for new mem_info. "
+ "Lost tracking on allocation at %s:%lu...\n", __func__,
+ filename, line_num);
+ return;
+ }
+ /* save allocation properties */
+ new_mem_info_p->addr = addr;
+ new_mem_info_p->size = size;
+ new_mem_info_p->line_num = line_num;
+ /* Make sure that we will print out the path tail if the given filename is longer
+ * than MAX_FILENAME_LEN. (otherwise, we will not see the name of the actual file
+ * in the printout -- only the path head!
+ */
+ if (strlen(filename) > MAX_FILENAME_LEN) {
+ strncpy(new_mem_info_p->filename, filename + strlen(filename) - MAX_FILENAME_LEN, MAX_FILENAME_LEN);
+ } else {
+ strncpy(new_mem_info_p->filename, filename, MAX_FILENAME_LEN);
+ }
+ new_mem_info_p->filename[MAX_FILENAME_LEN] = 0; /* NULL terminate anyway */
+
+ memtrack_spin_lock(&obj_desc_p->hash_lock, flags);
+ /* make sure given memory location is not already allocated */
+ cur_mem_info_p = obj_desc_p->mem_hash[hash_val];
+ while (cur_mem_info_p != NULL) {
+ if (cur_mem_info_p->addr == addr) {
+ /* Found given address in the database */
+ printk
+ ("mtl rsc inconsistency: %s: %s::%lu: %s @ addr=0x%lX which is already known from %s:%lu\n",
+ __func__, filename, line_num,
+ memtype_alloc_str(memtype), addr,
+ cur_mem_info_p->filename,
+ cur_mem_info_p->line_num);
+ memtrack_spin_unlock(&obj_desc_p->hash_lock, flags);
+ kmem_cache_free(meminfo_cache, new_mem_info_p);
+ return;
+ }
+ cur_mem_info_p = cur_mem_info_p->next;
+ }
+ /* not found - we can put in the hash bucket */
+ /* link as first */
+ new_mem_info_p->next = obj_desc_p->mem_hash[hash_val];
+ obj_desc_p->mem_hash[hash_val] = new_mem_info_p;
+ if (obj_desc_p->strict_track) {
+ check_overlap(memtype, new_mem_info_p, obj_desc_p);
+ }
+ obj_desc_p->count += size;
+ list_add(&new_mem_info_p->list, &obj_desc_p->tracked_objs_head);
+
+ memtrack_spin_unlock(&obj_desc_p->hash_lock, flags);
+ return;
+}
+
+/* Invoke on memory free */
+void memtrack_free(memtrack_memtype_t memtype, unsigned long addr,
+ const char *filename, const unsigned long line_num)
+{
+ unsigned long hash_val;
+ memtrack_meminfo_t *cur_mem_info_p, *prev_mem_info_p;
+ tracked_obj_desc_t *obj_desc_p;
+ unsigned long flags;
+
+ if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) {
+ printk("%s: Invalid memory type (%d)\n", __func__, memtype);
+ return;
+ }
+
+ if (!tracked_objs_arr[memtype]) {
+ /* object is not tracked */
+ return;
+ }
+ obj_desc_p = tracked_objs_arr[memtype];
+
+ hash_val = addr % MEMTRACK_HASH_SZ;
+
+ memtrack_spin_lock(&obj_desc_p->hash_lock, flags);
+ /* find mem_info of given memory location */
+ prev_mem_info_p = NULL;
+ cur_mem_info_p = obj_desc_p->mem_hash[hash_val];
+ while (cur_mem_info_p != NULL) {
+ if (cur_mem_info_p->addr == addr) {
+ /* Found given address in the database - remove from the bucket/list */
+ if (prev_mem_info_p == NULL) {
+ obj_desc_p->mem_hash[hash_val] = cur_mem_info_p->next; /* removing first */
+ } else {
+ prev_mem_info_p->next = cur_mem_info_p->next; /* "crossover" */
+ }
+ list_del(&cur_mem_info_p->list);
+
+ obj_desc_p->count -= cur_mem_info_p->size;
+ memtrack_spin_unlock(&obj_desc_p->hash_lock, flags);
+ kmem_cache_free(meminfo_cache, cur_mem_info_p);
+ return;
+ }
+ prev_mem_info_p = cur_mem_info_p;
+ cur_mem_info_p = cur_mem_info_p->next;
+ }
+
+ /* not found */
+ printk
+ ("mtl rsc inconsistency: %s: %s::%lu: %s for unknown address=0x%lX\n",
+ __func__, filename, line_num, memtype_free_str(memtype), addr);
+ memtrack_spin_unlock(&obj_desc_p->hash_lock, flags);
+ return;
+}
+
+/* Report current allocations status (for all memory types) */
+static void memtrack_report(void)
+{
+ memtrack_memtype_t memtype;
+ unsigned long cur_bucket;
+ memtrack_meminfo_t *cur_mem_info_p;
+ int serial = 1;
+ tracked_obj_desc_t *obj_desc_p;
+ unsigned long flags;
+
+ printk("%s: Currently known allocations:\n", __func__);
+ for (memtype = 0; memtype < MEMTRACK_NUM_OF_MEMTYPES; memtype++) {
+ if (tracked_objs_arr[memtype]) {
+ printk("%d) %s:\n", serial, memtype_alloc_str(memtype));
+ obj_desc_p = tracked_objs_arr[memtype];
+ /* Scan all buckets to find existing allocations */
+ /* TBD: this may be optimized by holding a linked list of all hash items */
+ for (cur_bucket = 0; cur_bucket < MEMTRACK_HASH_SZ;
+ cur_bucket++) {
+ memtrack_spin_lock(&obj_desc_p->hash_lock, flags); /* protect per bucket/list */
+ cur_mem_info_p =
+ obj_desc_p->mem_hash[cur_bucket];
+ while (cur_mem_info_p != NULL) { /* scan bucket */
+ printk("%s::%lu: %s(%lu)==%lX\n",
+ cur_mem_info_p->filename,
+ cur_mem_info_p->line_num,
+ memtype_alloc_str(memtype),
+ cur_mem_info_p->size,
+ cur_mem_info_p->addr);
+ cur_mem_info_p = cur_mem_info_p->next;
+ } /* while cur_mem_info_p */
+ memtrack_spin_unlock(&obj_desc_p->hash_lock, flags);
+ } /* for cur_bucket */
+ serial++;
+ }
+ } /* for memtype */
+}
+
+
+
+static struct proc_dir_entry *memtrack_tree;
+
+static memtrack_memtype_t get_rsc_by_name(const char *name)
+{
+ memtrack_memtype_t i;
+
+ for (i=0; i<MEMTRACK_NUM_OF_MEMTYPES; ++i) {
+ if (strcmp(name, rsc_names[i]) == 0) {
+ return i;
+ }
+ }
+
+ return i;
+}
+
+
+static ssize_t memtrack_read(struct file *filp,
+ char __user *buf,
+ size_t size,
+ loff_t *offset)
+{
+ unsigned long cur, flags;
+ loff_t pos = *offset;
+ static char kbuf[20];
+ static int file_len;
+ int _read, to_ret, left;
+ const char *fname;
+ memtrack_memtype_t memtype;
+
+ if (pos < 0)
+ return -EINVAL;
+
+ fname= filp->f_dentry->d_name.name;
+
+ memtype= get_rsc_by_name(fname);
+ if (memtype >= MEMTRACK_NUM_OF_MEMTYPES) {
+ printk("invalid file name\n");
+ return -EINVAL;
+ }
+
+ if ( pos == 0 ) {
+ memtrack_spin_lock(&tracked_objs_arr[memtype]->hash_lock, flags);
+ cur= tracked_objs_arr[memtype]->count;
+ memtrack_spin_unlock(&tracked_objs_arr[memtype]->hash_lock, flags);
+ _read = sprintf(kbuf, "%lu\n", cur);
+ if ( _read < 0 ) {
+ return _read;
+ }
+ else {
+ file_len = _read;
+ }
+ }
+
+ left = file_len - pos;
+ to_ret = (left < size) ? left : size;
+ if ( copy_to_user(buf, kbuf+pos, to_ret) ) {
+ return -EFAULT;
+ }
+ else {
+ *offset = pos + to_ret;
+ return to_ret;
+ }
+}
+
+static struct file_operations memtrack_proc_fops = {
+ .read = memtrack_read,
+};
+
+static const char *memtrack_proc_entry_name = "mt_memtrack";
+
+static int create_procfs_tree(void)
+{
+ struct proc_dir_entry *dir_ent;
+ struct proc_dir_entry *proc_ent;
+ int i, j;
+ unsigned long bit_mask;
+
+ dir_ent = proc_mkdir(memtrack_proc_entry_name, NULL);
+ if ( !dir_ent ) {
+ return -1;
+ }
+
+ memtrack_tree = dir_ent;
+
+ for (i=0, bit_mask=1; i<MEMTRACK_NUM_OF_MEMTYPES; ++i, bit_mask<<=1) {
+ if (bit_mask & track_mask) {
+ proc_ent = create_proc_entry(rsc_names[i], S_IRUGO, memtrack_tree);
+ if ( !proc_ent )
+ goto undo_create_root;
+
+ proc_ent->proc_fops = &memtrack_proc_fops;
+ }
+ }
+
+ goto exit_ok;
+
+undo_create_root:
+ for (j=0, bit_mask=1; j<i; ++j, bit_mask<<=1) {
+ if (bit_mask & track_mask) {
+ remove_proc_entry(rsc_names[j], memtrack_tree);
+ }
+ }
+ remove_proc_entry(memtrack_proc_entry_name, NULL);
+ return -1;
+
+exit_ok:
+ return 0;
+}
+
+
+static void destroy_procfs_tree(void)
+{
+ int i;
+ unsigned long bit_mask;
+
+ for (i=0, bit_mask=1; i<MEMTRACK_NUM_OF_MEMTYPES; ++i, bit_mask<<=1) {
+ if (bit_mask & track_mask) {
+ remove_proc_entry(rsc_names[i], memtrack_tree);
+ }
+ }
+ remove_proc_entry(memtrack_proc_entry_name, NULL);
+}
+
+
+/* module entry points */
+
+int init_module(void)
+{
+ memtrack_memtype_t i;
+ int j;
+ unsigned long bit_mask;
+
+
+ /* create a cache for the memtrack_meminfo_t strcutures */
+ meminfo_cache = kmem_cache_create("memtrack_meminfo_t",
+ sizeof(memtrack_meminfo_t), 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (!meminfo_cache) {
+ printk("memtrack::%s: failed to allocate meminfo cache\n", __func__);
+ return -1;
+ }
+
+ /* initialize array of descriptors */
+ memset(tracked_objs_arr, 0, sizeof(tracked_objs_arr));
+
+ /* create a tracking object descriptor for all required objects */
+ for (i = 0, bit_mask = 1; i < MEMTRACK_NUM_OF_MEMTYPES;
+ ++i, bit_mask <<= 1) {
+ if (bit_mask & track_mask) {
+ tracked_objs_arr[i] =
+ vmalloc(sizeof(tracked_obj_desc_t));
+ if (!tracked_objs_arr[i]) {
+ printk("memtrack: failed to allocate tracking object\n");
+ goto undo_cache_create;
+ }
+
+ memset(tracked_objs_arr[i], 0, sizeof(tracked_obj_desc_t));
+ spin_lock_init(&tracked_objs_arr[i]->hash_lock);
+ INIT_LIST_HEAD(&tracked_objs_arr[i]->tracked_objs_head);
+ if (bit_mask & strict_track_mask) {
+ tracked_objs_arr[i]->strict_track = 1;
+ } else {
+ tracked_objs_arr[i]->strict_track = 0;
+ }
+ }
+ }
+
+
+ if ( create_procfs_tree() ) {
+ printk("%s: create_procfs_tree() failed\n", __FILE__);
+ goto undo_cache_create;
+ }
+
+
+ printk("memtrack::%s done.\n", __func__);
+
+ return 0;
+
+undo_cache_create:
+ for (j=0; j<i; ++j) {
+ if (tracked_objs_arr[j]) {
+ vfree(tracked_objs_arr[j]);
+ }
+ }
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19)
+ if (kmem_cache_destroy(meminfo_cache) != 0) {
+ printk("Failed on kmem_cache_destroy !\n");
+ }
+#else
+ kmem_cache_destroy(meminfo_cache);
+#endif
+ return -1;
+}
+
+
+void cleanup_module(void)
+{
+ memtrack_memtype_t memtype;
+ unsigned long cur_bucket;
+ memtrack_meminfo_t *cur_mem_info_p, *next_mem_info_p;
+ tracked_obj_desc_t *obj_desc_p;
+ unsigned long flags;
+
+
+ memtrack_report();
+
+
+ destroy_procfs_tree();
+
+ /* clean up any hash table left-overs */
+ for (memtype = 0; memtype < MEMTRACK_NUM_OF_MEMTYPES; memtype++) {
+ /* Scan all buckets to find existing allocations */
+ /* TBD: this may be optimized by holding a linked list of all hash items */
+ if (tracked_objs_arr[memtype]) {
+ obj_desc_p = tracked_objs_arr[memtype];
+ for (cur_bucket = 0; cur_bucket < MEMTRACK_HASH_SZ;
+ cur_bucket++) {
+ memtrack_spin_lock(&obj_desc_p->hash_lock, flags); /* protect per bucket/list */
+ cur_mem_info_p =
+ obj_desc_p->mem_hash[cur_bucket];
+ while (cur_mem_info_p != NULL) { /* scan bucket */
+ next_mem_info_p = cur_mem_info_p->next; /* save "next" pointer before the "free" */
+ kmem_cache_free(meminfo_cache,
+ cur_mem_info_p);
+ cur_mem_info_p = next_mem_info_p;
+ } /* while cur_mem_info_p */
+ memtrack_spin_unlock(&obj_desc_p->hash_lock, flags);
+ } /* for cur_bucket */
+ vfree(obj_desc_p);
+ }
+ } /* for memtype */
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19)
+ if (kmem_cache_destroy(meminfo_cache) != 0) {
+ printk
+ ("memtrack::cleanup_module: Failed on kmem_cache_destroy !\n");
+ }
+#else
+ kmem_cache_destroy(meminfo_cache);
+#endif
+ printk("memtrack::cleanup_module done.\n");
+}
+
+EXPORT_SYMBOL(memtrack_alloc);
+EXPORT_SYMBOL(memtrack_free);
+
+//module_init(memtrack_init)
+//module_exit(memtrack_exit)
+
diff --git a/sys/ofed/drivers/infiniband/debug/memtrack.h b/sys/ofed/drivers/infiniband/debug/memtrack.h
new file mode 100644
index 0000000..e443a31
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/debug/memtrack.h
@@ -0,0 +1,45 @@
+/*
+ This software is available to you under a choice of one of two
+ licenses. You may choose to be licensed under the terms of the GNU
+ General Public License (GPL) Version 2, available at
+ <http://www.fsf.org/copyleft/gpl.html>, or the OpenIB.org BSD
+ license, available in the LICENSE.TXT file accompanying this
+ software. These details are also available at
+ <http://openib.org/license.html>.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+
+ Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
+*/
+
+#ifndef H_MEMTRACK_H
+#define H_MEMTRACK_H
+
+typedef enum {
+ MEMTRACK_KMALLOC,
+ MEMTRACK_VMALLOC,
+ MEMTRACK_KMEM_OBJ,
+ MEMTRACK_NUM_OF_MEMTYPES
+} memtrack_memtype_t;
+
+/* Invoke on memory allocation */
+void memtrack_alloc(memtrack_memtype_t memtype, unsigned long addr,
+ unsigned long size, const char *filename,
+ const unsigned long line_num, int alloc_flags);
+
+/* Invoke on memory free */
+void memtrack_free(memtrack_memtype_t memtype, unsigned long addr,
+ const char *filename, const unsigned long line_num);
+
+/* Report current allocations status (for all memory types) */
+/* we do not export this function since it is used by cleanup_module only */
+/* void memtrack_report(void); */
+
+#endif
diff --git a/sys/ofed/drivers/infiniband/debug/mtrack.h b/sys/ofed/drivers/infiniband/debug/mtrack.h
new file mode 100644
index 0000000..337d9c3
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/debug/mtrack.h
@@ -0,0 +1,138 @@
+#ifndef __mtrack_h_
+#define __mtrack_h_
+
+#include <memtrack.h>
+
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/version.h>
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14)
+#define RDMA_KZALLOC_H
+#define kzalloc(size, flags) ({ \
+ void *__memtrack_kz_addr; \
+ \
+ __memtrack_kz_addr = kmalloc(size, flags); \
+ if ( __memtrack_kz_addr ) { \
+ memset( __memtrack_kz_addr, 0, size) ; \
+ } \
+ __memtrack_kz_addr; \
+})
+
+#else
+#define kzalloc(size, flags) ({ \
+ void *__memtrack_addr; \
+ \
+ __memtrack_addr = kzalloc(size, flags); \
+ if ( __memtrack_addr && (size)) { \
+ memtrack_alloc(MEMTRACK_KMALLOC, (unsigned long)(__memtrack_addr), size, __FILE__, __LINE__, flags); \
+ } \
+ __memtrack_addr; \
+})
+
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19)
+#define kcalloc(n, size, flags) kzalloc((n)*(size), flags)
+#else
+#define kcalloc(n, size, flags) ({ \
+ void *__memtrack_addr; \
+ \
+ __memtrack_addr = kcalloc(n, size, flags); \
+ if ( __memtrack_addr && (size)) { \
+ memtrack_alloc(MEMTRACK_KMALLOC, (unsigned long)(__memtrack_addr), (n)*(size), __FILE__, __LINE__, flags); \
+ } \
+ __memtrack_addr; \
+})
+#endif
+
+
+
+#ifdef ZERO_OR_NULL_PTR
+#define kmalloc(sz, flgs) ({ \
+ void *__memtrack_addr; \
+ \
+ __memtrack_addr = kmalloc(sz, flgs); \
+ if ( !ZERO_OR_NULL_PTR(__memtrack_addr)) { \
+ memtrack_alloc(MEMTRACK_KMALLOC, (unsigned long)(__memtrack_addr), sz, __FILE__, __LINE__, flgs); \
+ } \
+ __memtrack_addr; \
+})
+#else
+#define kmalloc(sz, flgs) ({ \
+ void *__memtrack_addr; \
+ \
+ __memtrack_addr = kmalloc(sz, flgs); \
+ if ( __memtrack_addr ) { \
+ memtrack_alloc(MEMTRACK_KMALLOC, (unsigned long)(__memtrack_addr), sz, __FILE__, __LINE__, flgs); \
+ } \
+ __memtrack_addr; \
+})
+
+#endif
+
+#ifdef ZERO_OR_NULL_PTR
+#define kfree(addr) ({ \
+ void *__memtrack_addr = (void *)addr; \
+ if ( !ZERO_OR_NULL_PTR(__memtrack_addr) ) { \
+ memtrack_free(MEMTRACK_KMALLOC, (unsigned long)(__memtrack_addr), __FILE__, __LINE__); \
+ } \
+ kfree(__memtrack_addr); \
+})
+#else
+#define kfree(addr) ({ \
+ void *__memtrack_addr = (void *)addr; \
+ if ( __memtrack_addr ) { \
+ memtrack_free(MEMTRACK_KMALLOC, (unsigned long)(__memtrack_addr), __FILE__, __LINE__); \
+ } \
+ kfree(__memtrack_addr); \
+})
+#endif
+
+
+
+
+
+
+#define vmalloc(size) ({ \
+ void *__memtrack_addr; \
+ \
+ __memtrack_addr = vmalloc(size); \
+ if ( __memtrack_addr ) { \
+ memtrack_alloc(MEMTRACK_VMALLOC, (unsigned long)(__memtrack_addr), size, __FILE__, __LINE__, GFP_ATOMIC); \
+ } \
+ __memtrack_addr; \
+})
+
+
+#define vfree(addr) ({ \
+ void *__memtrack_addr = (void *)addr; \
+ if ( __memtrack_addr ) { \
+ memtrack_free(MEMTRACK_VMALLOC, (unsigned long)(__memtrack_addr), __FILE__, __LINE__); \
+ } \
+ vfree(__memtrack_addr); \
+})
+
+
+#define kmem_cache_alloc(cache, flags) ({ \
+ void *__memtrack_addr; \
+ \
+ __memtrack_addr = kmem_cache_alloc(cache, flags); \
+ if ( __memtrack_addr ) { \
+ memtrack_alloc(MEMTRACK_KMEM_OBJ, (unsigned long)(__memtrack_addr), 1, __FILE__, __LINE__, flags); \
+ } \
+ __memtrack_addr; \
+})
+
+
+#define kmem_cache_free(cache, addr) ({ \
+ void *__memtrack_addr = (void *)addr; \
+ if ( __memtrack_addr ) { \
+ memtrack_free(MEMTRACK_KMEM_OBJ, (unsigned long)(__memtrack_addr), __FILE__, __LINE__); \
+ } \
+ kmem_cache_free(cache, __memtrack_addr); \
+})
+
+
+#endif /* __mtrack_h_ */
+
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/Kconfig b/sys/ofed/drivers/infiniband/hw/mlx4/Kconfig
new file mode 100644
index 0000000..4175a4b
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/Kconfig
@@ -0,0 +1,8 @@
+config MLX4_INFINIBAND
+ tristate "Mellanox ConnectX HCA support"
+ select MLX4_CORE
+ ---help---
+ This driver provides low-level InfiniBand support for
+ Mellanox ConnectX PCI Express host channel adapters (HCAs).
+ This is required to use InfiniBand protocols such as
+ IP-over-IB or SRP with these devices.
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/Makefile b/sys/ofed/drivers/infiniband/hw/mlx4/Makefile
new file mode 100644
index 0000000..ce885a8
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/Makefile
@@ -0,0 +1,4 @@
+obj-$(CONFIG_MLX4_INFINIBAND) += mlx4_ib.o
+
+mlx4_ib-y := ah.o cq.o doorbell.o mad.o main.o mr.o qp.o srq.o
+mlx4_ib-y += wc.o
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/ah.c b/sys/ofed/drivers/infiniband/hw/mlx4/ah.c
new file mode 100644
index 0000000..26251b47
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/ah.c
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "mlx4_ib.h"
+#include <rdma/ib_addr.h>
+#include <linux/inet.h>
+#include <linux/string.h>
+#include <rdma/ib_cache.h>
+
+int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah_attr,
+ u8 *mac, int *is_mcast, u8 port)
+{
+ struct mlx4_ib_iboe *iboe = &dev->iboe;
+ struct in6_addr in6;
+
+ *is_mcast = 0;
+ spin_lock(&iboe->lock);
+ if (!iboe->netdevs[port - 1]) {
+ spin_unlock(&iboe->lock);
+ return -EINVAL;
+ }
+ spin_unlock(&iboe->lock);
+
+ memcpy(&in6, ah_attr->grh.dgid.raw, sizeof in6);
+ if (rdma_link_local_addr(&in6))
+ rdma_get_ll_mac(&in6, mac);
+ else if (rdma_is_multicast_addr(&in6)) {
+ rdma_get_mcast_mac(&in6, mac);
+ *is_mcast = 1;
+ } else
+ return -EINVAL;
+
+ return 0;
+}
+
+static struct ib_ah *create_ib_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr,
+ struct mlx4_ib_ah *ah)
+{
+ struct mlx4_dev *dev = to_mdev(pd->device)->dev;
+
+ ah->av.ib.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24));
+ ah->av.ib.g_slid = ah_attr->src_path_bits;
+ if (ah_attr->ah_flags & IB_AH_GRH) {
+ ah->av.ib.g_slid |= 0x80;
+ ah->av.ib.gid_index = ah_attr->grh.sgid_index;
+ ah->av.ib.hop_limit = ah_attr->grh.hop_limit;
+ ah->av.ib.sl_tclass_flowlabel |=
+ cpu_to_be32((ah_attr->grh.traffic_class << 20) |
+ ah_attr->grh.flow_label);
+ memcpy(ah->av.ib.dgid, ah_attr->grh.dgid.raw, 16);
+ }
+
+ ah->av.ib.dlid = cpu_to_be16(ah_attr->dlid);
+ if (ah_attr->static_rate) {
+ ah->av.ib.stat_rate = ah_attr->static_rate + MLX4_STAT_RATE_OFFSET;
+ while (ah->av.ib.stat_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET &&
+ !(1 << ah->av.ib.stat_rate & dev->caps.stat_rate_support))
+ --ah->av.ib.stat_rate;
+ }
+ ah->av.ib.sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 28);
+
+ return &ah->ibah;
+}
+
+static struct ib_ah *create_iboe_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr,
+ struct mlx4_ib_ah *ah)
+{
+ struct mlx4_ib_dev *ibdev = to_mdev(pd->device);
+ struct mlx4_dev *dev = ibdev->dev;
+ u8 mac[6];
+ int err;
+ int is_mcast;
+ u16 vlan_tag;
+ union ib_gid sgid;
+
+ err = mlx4_ib_resolve_grh(ibdev, ah_attr, mac, &is_mcast, ah_attr->port_num);
+ if (err)
+ return ERR_PTR(err);
+
+ memcpy(ah->av.eth.mac, mac, 6);
+ err = ib_get_cached_gid(pd->device, ah_attr->port_num, ah_attr->grh.sgid_index, &sgid);
+ if (err)
+ return ERR_PTR(err);
+ vlan_tag = rdma_get_vlan_id(&sgid);
+ if (vlan_tag < 0x1000)
+ vlan_tag |= (ah_attr->sl & 7) << 13;
+ ah->av.eth.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24));
+ ah->av.eth.gid_index = ah_attr->grh.sgid_index;
+ ah->av.eth.vlan = cpu_to_be16(vlan_tag);
+ if (ah_attr->static_rate) {
+ ah->av.eth.stat_rate = ah_attr->static_rate + MLX4_STAT_RATE_OFFSET;
+ while (ah->av.eth.stat_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET &&
+ !(1 << ah->av.eth.stat_rate & dev->caps.stat_rate_support))
+ --ah->av.eth.stat_rate;
+ }
+
+ /*
+ * HW requires multicast LID so we just choose one.
+ */
+ if (is_mcast)
+ ah->av.ib.dlid = cpu_to_be16(0xc000);
+
+ memcpy(ah->av.eth.dgid, ah_attr->grh.dgid.raw, 16);
+ ah->av.eth.sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 28);
+
+ return &ah->ibah;
+}
+
+struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
+{
+ struct mlx4_ib_ah *ah;
+ struct ib_ah *ret;
+
+ ah = kzalloc(sizeof *ah, GFP_ATOMIC);
+ if (!ah)
+ return ERR_PTR(-ENOMEM);
+
+ if (rdma_port_get_link_layer(pd->device, ah_attr->port_num) == IB_LINK_LAYER_ETHERNET) {
+ if (!(ah_attr->ah_flags & IB_AH_GRH)) {
+ ret = ERR_PTR(-EINVAL);
+ goto out;
+ } else {
+ /* TBD: need to handle the case when we get called
+ in an atomic context and there we might sleep. We
+ don't expect this currently since we're working with
+ link local addresses which we can translate without
+ going to sleep */
+ ret = create_iboe_ah(pd, ah_attr, ah);
+ if (IS_ERR(ret))
+ goto out;
+ else
+ return ret;
+ }
+ } else
+ return create_ib_ah(pd, ah_attr, ah); /* never fails */
+
+out:
+ kfree(ah);
+ return ret;
+}
+
+int mlx4_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
+{
+ struct mlx4_ib_ah *ah = to_mah(ibah);
+ enum rdma_link_layer ll;
+
+ memset(ah_attr, 0, sizeof *ah_attr);
+ ah_attr->sl = be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28;
+ ah_attr->port_num = be32_to_cpu(ah->av.ib.port_pd) >> 24;
+ ll = rdma_port_get_link_layer(ibah->device, ah_attr->port_num);
+ ah_attr->dlid = ll == IB_LINK_LAYER_INFINIBAND ? be16_to_cpu(ah->av.ib.dlid) : 0;
+ if (ah->av.ib.stat_rate)
+ ah_attr->static_rate = ah->av.ib.stat_rate - MLX4_STAT_RATE_OFFSET;
+ ah_attr->src_path_bits = ah->av.ib.g_slid & 0x7F;
+
+ if (mlx4_ib_ah_grh_present(ah)) {
+ ah_attr->ah_flags = IB_AH_GRH;
+
+ ah_attr->grh.traffic_class =
+ be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20;
+ ah_attr->grh.flow_label =
+ be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) & 0xfffff;
+ ah_attr->grh.hop_limit = ah->av.ib.hop_limit;
+ ah_attr->grh.sgid_index = ah->av.ib.gid_index;
+ memcpy(ah_attr->grh.dgid.raw, ah->av.ib.dgid, 16);
+ }
+
+ return 0;
+}
+
+int mlx4_ib_destroy_ah(struct ib_ah *ah)
+{
+ kfree(to_mah(ah));
+ return 0;
+}
+
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/cq.c b/sys/ofed/drivers/infiniband/hw/mlx4/cq.c
new file mode 100644
index 0000000..31cd00d
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/cq.c
@@ -0,0 +1,861 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx4/cq.h>
+#include <linux/mlx4/qp.h>
+#include <linux/mlx4/srq.h>
+
+#include "mlx4_ib.h"
+#include "user.h"
+
+/* Which firmware version adds support for Resize CQ */
+#define MLX4_FW_VER_RESIZE_CQ mlx4_fw_ver(2, 5, 0)
+
+static void mlx4_ib_cq_comp(struct mlx4_cq *cq)
+{
+ struct ib_cq *ibcq = &to_mibcq(cq)->ibcq;
+ ibcq->comp_handler(ibcq, ibcq->cq_context);
+}
+
+static void mlx4_ib_cq_event(struct mlx4_cq *cq, enum mlx4_event type)
+{
+ struct ib_event event;
+ struct ib_cq *ibcq;
+
+ if (type != MLX4_EVENT_TYPE_CQ_ERROR) {
+ printk(KERN_WARNING "mlx4_ib: Unexpected event type %d "
+ "on CQ %06x\n", type, cq->cqn);
+ return;
+ }
+
+ ibcq = &to_mibcq(cq)->ibcq;
+ if (ibcq->event_handler) {
+ event.device = ibcq->device;
+ event.event = IB_EVENT_CQ_ERR;
+ event.element.cq = ibcq;
+ ibcq->event_handler(&event, ibcq->cq_context);
+ }
+}
+
+static void *get_cqe_from_buf(struct mlx4_ib_cq_buf *buf, int n)
+{
+ return mlx4_buf_offset(&buf->buf, n * sizeof (struct mlx4_cqe));
+}
+
+static void *get_cqe(struct mlx4_ib_cq *cq, int n)
+{
+ return get_cqe_from_buf(&cq->buf, n);
+}
+
+static void *get_sw_cqe(struct mlx4_ib_cq *cq, int n)
+{
+ struct mlx4_cqe *cqe = get_cqe(cq, n & cq->ibcq.cqe);
+
+ return (!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
+ !!(n & (cq->ibcq.cqe + 1))) ? NULL : cqe;
+}
+
+static struct mlx4_cqe *next_cqe_sw(struct mlx4_ib_cq *cq)
+{
+ return get_sw_cqe(cq, cq->mcq.cons_index);
+}
+
+int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period)
+{
+ struct mlx4_ib_cq *mcq = to_mcq(cq);
+ struct mlx4_ib_dev *dev = to_mdev(cq->device);
+
+ return mlx4_cq_modify(dev->dev, &mcq->mcq, cq_count, cq_period);
+}
+
+static int mlx4_ib_alloc_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf *buf, int nent)
+{
+ int err;
+
+ err = mlx4_buf_alloc(dev->dev, nent * sizeof(struct mlx4_cqe),
+ PAGE_SIZE * 2, &buf->buf);
+
+ if (err)
+ goto out;
+
+ err = mlx4_mtt_init(dev->dev, buf->buf.npages, buf->buf.page_shift,
+ &buf->mtt);
+ if (err)
+ goto err_buf;
+
+ err = mlx4_buf_write_mtt(dev->dev, &buf->mtt, &buf->buf);
+ if (err)
+ goto err_mtt;
+
+ return 0;
+
+err_mtt:
+ mlx4_mtt_cleanup(dev->dev, &buf->mtt);
+
+err_buf:
+ mlx4_buf_free(dev->dev, nent * sizeof(struct mlx4_cqe),
+ &buf->buf);
+
+out:
+ return err;
+}
+
+static void mlx4_ib_free_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf *buf, int cqe)
+{
+ mlx4_buf_free(dev->dev, (cqe + 1) * sizeof(struct mlx4_cqe), &buf->buf);
+}
+
+static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, struct ib_ucontext *context,
+ struct mlx4_ib_cq_buf *buf, struct ib_umem **umem,
+ u64 buf_addr, int cqe)
+{
+ int err;
+
+ *umem = ib_umem_get(context, buf_addr, cqe * sizeof (struct mlx4_cqe),
+ IB_ACCESS_LOCAL_WRITE, 1);
+ if (IS_ERR(*umem))
+ return PTR_ERR(*umem);
+
+ err = mlx4_mtt_init(dev->dev, ib_umem_page_count(*umem),
+ ilog2((*umem)->page_size), &buf->mtt);
+ if (err)
+ goto err_buf;
+
+ err = mlx4_ib_umem_write_mtt(dev, &buf->mtt, *umem);
+ if (err)
+ goto err_mtt;
+
+ return 0;
+
+err_mtt:
+ mlx4_mtt_cleanup(dev->dev, &buf->mtt);
+
+err_buf:
+ ib_umem_release(*umem);
+
+ return err;
+}
+
+struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector,
+ struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ struct mlx4_ib_cq *cq;
+ struct mlx4_uar *uar;
+ int err;
+
+ if (entries < 1 || entries > dev->dev->caps.max_cqes) {
+ mlx4_ib_dbg("invalid num of entries: %d", entries);
+ return ERR_PTR(-EINVAL);
+ }
+
+ cq = kzalloc(sizeof *cq, GFP_KERNEL);
+ if (!cq)
+ return ERR_PTR(-ENOMEM);
+
+ entries = roundup_pow_of_two(entries + 1);
+ cq->ibcq.cqe = entries - 1;
+ mutex_init(&cq->resize_mutex);
+ spin_lock_init(&cq->lock);
+ cq->resize_buf = NULL;
+ cq->resize_umem = NULL;
+
+ if (context) {
+ struct mlx4_ib_create_cq ucmd;
+
+ if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
+ err = -EFAULT;
+ goto err_cq;
+ }
+
+ err = mlx4_ib_get_cq_umem(dev, context, &cq->buf, &cq->umem,
+ ucmd.buf_addr, entries);
+ if (err)
+ goto err_cq;
+
+ err = mlx4_ib_db_map_user(to_mucontext(context), ucmd.db_addr,
+ &cq->db);
+ if (err)
+ goto err_mtt;
+
+ uar = &to_mucontext(context)->uar;
+ } else {
+ err = mlx4_db_alloc(dev->dev, &cq->db, 1);
+ if (err)
+ goto err_cq;
+
+ cq->mcq.set_ci_db = cq->db.db;
+ cq->mcq.arm_db = cq->db.db + 1;
+ *cq->mcq.set_ci_db = 0;
+ *cq->mcq.arm_db = 0;
+
+ err = mlx4_ib_alloc_cq_buf(dev, &cq->buf, entries);
+ if (err)
+ goto err_db;
+
+ uar = &dev->priv_uar;
+ }
+
+ err = mlx4_cq_alloc(dev->dev, entries, &cq->buf.mtt, uar,
+ cq->db.dma, &cq->mcq,
+ vector == IB_CQ_VECTOR_LEAST_ATTACHED ?
+ MLX4_LEAST_ATTACHED_VECTOR : vector, 0);
+ if (err)
+ goto err_dbmap;
+
+ cq->mcq.comp = mlx4_ib_cq_comp;
+ cq->mcq.event = mlx4_ib_cq_event;
+
+ if (context)
+ if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof (__u32))) {
+ err = -EFAULT;
+ goto err_dbmap;
+ }
+
+ return &cq->ibcq;
+
+err_dbmap:
+ if (context)
+ mlx4_ib_db_unmap_user(to_mucontext(context), &cq->db);
+
+err_mtt:
+ mlx4_mtt_cleanup(dev->dev, &cq->buf.mtt);
+
+ if (context)
+ ib_umem_release(cq->umem);
+ else
+ mlx4_ib_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe);
+
+err_db:
+ if (!context)
+ mlx4_db_free(dev->dev, &cq->db);
+
+err_cq:
+ kfree(cq);
+
+ return ERR_PTR(err);
+}
+
+static int mlx4_alloc_resize_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq *cq,
+ int entries)
+{
+ int err;
+
+ if (cq->resize_buf)
+ return -EBUSY;
+
+ cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_ATOMIC);
+ if (!cq->resize_buf)
+ return -ENOMEM;
+
+ err = mlx4_ib_alloc_cq_buf(dev, &cq->resize_buf->buf, entries);
+ if (err) {
+ kfree(cq->resize_buf);
+ cq->resize_buf = NULL;
+ return err;
+ }
+
+ cq->resize_buf->cqe = entries - 1;
+
+ return 0;
+}
+
+static int mlx4_alloc_resize_umem(struct mlx4_ib_dev *dev, struct mlx4_ib_cq *cq,
+ int entries, struct ib_udata *udata)
+{
+ struct mlx4_ib_resize_cq ucmd;
+ int err;
+
+ if (cq->resize_umem)
+ return -EBUSY;
+
+ if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd))
+ return -EFAULT;
+
+ cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_ATOMIC);
+ if (!cq->resize_buf)
+ return -ENOMEM;
+
+ err = mlx4_ib_get_cq_umem(dev, cq->umem->context, &cq->resize_buf->buf,
+ &cq->resize_umem, ucmd.buf_addr, entries);
+ if (err) {
+ kfree(cq->resize_buf);
+ cq->resize_buf = NULL;
+ return err;
+ }
+
+ cq->resize_buf->cqe = entries - 1;
+
+ return 0;
+}
+
+static int mlx4_ib_get_outstanding_cqes(struct mlx4_ib_cq *cq)
+{
+ u32 i;
+
+ i = cq->mcq.cons_index;
+ while (get_sw_cqe(cq, i & cq->ibcq.cqe))
+ ++i;
+
+ return i - cq->mcq.cons_index;
+}
+
+static void mlx4_ib_cq_resize_copy_cqes(struct mlx4_ib_cq *cq)
+{
+ struct mlx4_cqe *cqe, *new_cqe;
+ int i;
+
+ i = cq->mcq.cons_index;
+ cqe = get_cqe(cq, i & cq->ibcq.cqe);
+ while ((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) != MLX4_CQE_OPCODE_RESIZE) {
+ new_cqe = get_cqe_from_buf(&cq->resize_buf->buf,
+ (i + 1) & cq->resize_buf->cqe);
+ memcpy(new_cqe, get_cqe(cq, i & cq->ibcq.cqe), sizeof(struct mlx4_cqe));
+ new_cqe->owner_sr_opcode = (cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK) |
+ (((i + 1) & (cq->resize_buf->cqe + 1)) ? MLX4_CQE_OWNER_MASK : 0);
+ cqe = get_cqe(cq, ++i & cq->ibcq.cqe);
+ }
+ ++cq->mcq.cons_index;
+}
+
+int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibcq->device);
+ struct mlx4_ib_cq *cq = to_mcq(ibcq);
+ struct mlx4_mtt mtt;
+ int outst_cqe;
+ int err;
+
+ if (dev->dev->caps.fw_ver < MLX4_FW_VER_RESIZE_CQ)
+ return -ENOSYS;
+
+ mutex_lock(&cq->resize_mutex);
+
+ if (entries < 1 || entries > dev->dev->caps.max_cqes) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ entries = roundup_pow_of_two(entries + 1);
+ if (entries == ibcq->cqe + 1) {
+ err = 0;
+ goto out;
+ }
+
+ if (ibcq->uobject) {
+ err = mlx4_alloc_resize_umem(dev, cq, entries, udata);
+ if (err)
+ goto out;
+ } else {
+ /* Can't be smaller than the number of outstanding CQEs */
+ outst_cqe = mlx4_ib_get_outstanding_cqes(cq);
+ if (entries < outst_cqe + 1) {
+ err = 0;
+ goto out;
+ }
+
+ err = mlx4_alloc_resize_buf(dev, cq, entries);
+ if (err)
+ goto out;
+ }
+
+ mtt = cq->buf.mtt;
+
+ err = mlx4_cq_resize(dev->dev, &cq->mcq, entries, &cq->resize_buf->buf.mtt);
+ if (err)
+ goto err_buf;
+
+ mlx4_mtt_cleanup(dev->dev, &mtt);
+ if (ibcq->uobject) {
+ cq->buf = cq->resize_buf->buf;
+ cq->ibcq.cqe = cq->resize_buf->cqe;
+ ib_umem_release(cq->umem);
+ cq->umem = cq->resize_umem;
+
+ kfree(cq->resize_buf);
+ cq->resize_buf = NULL;
+ cq->resize_umem = NULL;
+ } else {
+ struct mlx4_ib_cq_buf tmp_buf;
+ int tmp_cqe = 0;
+
+ spin_lock_irq(&cq->lock);
+ if (cq->resize_buf) {
+ mlx4_ib_cq_resize_copy_cqes(cq);
+ tmp_buf = cq->buf;
+ tmp_cqe = cq->ibcq.cqe;
+ cq->buf = cq->resize_buf->buf;
+ cq->ibcq.cqe = cq->resize_buf->cqe;
+
+ kfree(cq->resize_buf);
+ cq->resize_buf = NULL;
+ }
+ spin_unlock_irq(&cq->lock);
+
+ if (tmp_cqe)
+ mlx4_ib_free_cq_buf(dev, &tmp_buf, tmp_cqe);
+ }
+
+ goto out;
+
+err_buf:
+ mlx4_mtt_cleanup(dev->dev, &cq->resize_buf->buf.mtt);
+ if (!ibcq->uobject)
+ mlx4_ib_free_cq_buf(dev, &cq->resize_buf->buf,
+ cq->resize_buf->cqe);
+
+ kfree(cq->resize_buf);
+ cq->resize_buf = NULL;
+
+ if (cq->resize_umem) {
+ ib_umem_release(cq->resize_umem);
+ cq->resize_umem = NULL;
+ }
+
+out:
+ mutex_unlock(&cq->resize_mutex);
+ return err;
+}
+
+int mlx4_ib_destroy_cq(struct ib_cq *cq)
+{
+ struct mlx4_ib_dev *dev = to_mdev(cq->device);
+ struct mlx4_ib_cq *mcq = to_mcq(cq);
+
+ mlx4_cq_free(dev->dev, &mcq->mcq);
+ mlx4_mtt_cleanup(dev->dev, &mcq->buf.mtt);
+
+ if (cq->uobject) {
+ mlx4_ib_db_unmap_user(to_mucontext(cq->uobject->context), &mcq->db);
+ ib_umem_release(mcq->umem);
+ } else {
+ mlx4_ib_free_cq_buf(dev, &mcq->buf, cq->cqe);
+ mlx4_db_free(dev->dev, &mcq->db);
+ }
+
+ kfree(mcq);
+
+ return 0;
+}
+
+static void dump_cqe(void *cqe)
+{
+ __be32 *buf = cqe;
+
+ printk(KERN_DEBUG "CQE contents %08x %08x %08x %08x %08x %08x %08x %08x\n",
+ be32_to_cpu(buf[0]), be32_to_cpu(buf[1]), be32_to_cpu(buf[2]),
+ be32_to_cpu(buf[3]), be32_to_cpu(buf[4]), be32_to_cpu(buf[5]),
+ be32_to_cpu(buf[6]), be32_to_cpu(buf[7]));
+}
+
+static void mlx4_ib_handle_error_cqe(struct mlx4_err_cqe *cqe,
+ struct ib_wc *wc)
+{
+ if (cqe->syndrome == MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR) {
+ printk(KERN_DEBUG "local QP operation err "
+ "(QPN %06x, WQE index %x, vendor syndrome %02x, "
+ "opcode = %02x)\n",
+ be32_to_cpu(cqe->my_qpn), be16_to_cpu(cqe->wqe_index),
+ cqe->vendor_err_syndrome,
+ cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK);
+ dump_cqe(cqe);
+ }
+
+ switch (cqe->syndrome) {
+ case MLX4_CQE_SYNDROME_LOCAL_LENGTH_ERR:
+ wc->status = IB_WC_LOC_LEN_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR:
+ wc->status = IB_WC_LOC_QP_OP_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_LOCAL_PROT_ERR:
+ wc->status = IB_WC_LOC_PROT_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_WR_FLUSH_ERR:
+ wc->status = IB_WC_WR_FLUSH_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_MW_BIND_ERR:
+ wc->status = IB_WC_MW_BIND_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_BAD_RESP_ERR:
+ wc->status = IB_WC_BAD_RESP_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_LOCAL_ACCESS_ERR:
+ wc->status = IB_WC_LOC_ACCESS_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR:
+ wc->status = IB_WC_REM_INV_REQ_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_REMOTE_ACCESS_ERR:
+ wc->status = IB_WC_REM_ACCESS_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_REMOTE_OP_ERR:
+ wc->status = IB_WC_REM_OP_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR:
+ wc->status = IB_WC_RETRY_EXC_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_RNR_RETRY_EXC_ERR:
+ wc->status = IB_WC_RNR_RETRY_EXC_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_REMOTE_ABORTED_ERR:
+ wc->status = IB_WC_REM_ABORT_ERR;
+ break;
+ default:
+ wc->status = IB_WC_GENERAL_ERR;
+ break;
+ }
+
+ wc->vendor_err = cqe->vendor_err_syndrome;
+}
+
+static int mlx4_ib_ipoib_csum_ok(__be16 status, __be16 checksum)
+{
+ return ((status & cpu_to_be16(MLX4_CQE_STATUS_IPV4 |
+ MLX4_CQE_STATUS_IPV4F |
+ MLX4_CQE_STATUS_IPV4OPT |
+ MLX4_CQE_STATUS_IPV6 |
+ MLX4_CQE_STATUS_IPOK)) ==
+ cpu_to_be16(MLX4_CQE_STATUS_IPV4 |
+ MLX4_CQE_STATUS_IPOK)) &&
+ (status & cpu_to_be16(MLX4_CQE_STATUS_UDP |
+ MLX4_CQE_STATUS_TCP)) &&
+ checksum == cpu_to_be16(0xffff);
+}
+
+static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq,
+ struct mlx4_ib_qp **cur_qp,
+ struct ib_wc *wc)
+{
+ struct mlx4_cqe *cqe;
+ struct mlx4_qp *mqp;
+ struct mlx4_ib_wq *wq;
+ struct mlx4_ib_srq *srq;
+ struct mlx4_srq *msrq;
+ int is_send;
+ int is_error;
+ u32 g_mlpath_rqpn;
+ int is_xrc_recv = 0;
+ u16 wqe_ctr;
+
+repoll:
+ cqe = next_cqe_sw(cq);
+ if (!cqe)
+ return -EAGAIN;
+
+ ++cq->mcq.cons_index;
+
+ /*
+ * Make sure we read CQ entry contents after we've checked the
+ * ownership bit.
+ */
+ rmb();
+
+ is_send = cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK;
+ is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
+ MLX4_CQE_OPCODE_ERROR;
+
+ if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_OPCODE_NOP &&
+ is_send)) {
+ printk(KERN_WARNING "Completion for NOP opcode detected!\n");
+ return -EINVAL;
+ }
+
+ /* Resize CQ in progress */
+ if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_CQE_OPCODE_RESIZE)) {
+ if (cq->resize_buf) {
+ struct mlx4_ib_dev *dev = to_mdev(cq->ibcq.device);
+
+ mlx4_ib_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe);
+ cq->buf = cq->resize_buf->buf;
+ cq->ibcq.cqe = cq->resize_buf->cqe;
+
+ kfree(cq->resize_buf);
+ cq->resize_buf = NULL;
+ }
+
+ goto repoll;
+ }
+
+ if ((be32_to_cpu(cqe->vlan_my_qpn) & (1 << 23)) && !is_send) {
+ /*
+ * We do not have to take the XRC SRQ table lock here,
+ * because CQs will be locked while XRC SRQs are removed
+ * from the table.
+ */
+ msrq = __mlx4_srq_lookup(to_mdev(cq->ibcq.device)->dev,
+ be32_to_cpu(cqe->g_mlpath_rqpn) &
+ 0xffffff);
+ if (unlikely(!msrq)) {
+ printk(KERN_WARNING "CQ %06x with entry for unknown "
+ "XRC SRQ %06x\n", cq->mcq.cqn,
+ be32_to_cpu(cqe->g_mlpath_rqpn) & 0xffffff);
+ return -EINVAL;
+ }
+ is_xrc_recv = 1;
+ srq = to_mibsrq(msrq);
+ } else if (!*cur_qp ||
+ (be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) != (*cur_qp)->mqp.qpn) {
+ /*
+ * We do not have to take the QP table lock here,
+ * because CQs will be locked while QPs are removed
+ * from the table.
+ */
+ mqp = __mlx4_qp_lookup(to_mdev(cq->ibcq.device)->dev,
+ be32_to_cpu(cqe->vlan_my_qpn));
+ if (unlikely(!mqp)) {
+ printk(KERN_WARNING "CQ %06x with entry for unknown QPN %06x\n",
+ cq->mcq.cqn, be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK);
+ return -EINVAL;
+ }
+
+ *cur_qp = to_mibqp(mqp);
+ }
+
+ wc->qp = is_xrc_recv ? NULL: &(*cur_qp)->ibqp;
+
+ if (is_send) {
+ wq = &(*cur_qp)->sq;
+ if (!(*cur_qp)->sq_signal_bits) {
+ wqe_ctr = be16_to_cpu(cqe->wqe_index);
+ wq->tail += (u16) (wqe_ctr - (u16) wq->tail);
+ }
+ wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+ ++wq->tail;
+ } else if (is_xrc_recv) {
+ wqe_ctr = be16_to_cpu(cqe->wqe_index);
+ wc->wr_id = srq->wrid[wqe_ctr];
+ mlx4_ib_free_srq_wqe(srq, wqe_ctr);
+ } else if ((*cur_qp)->ibqp.srq) {
+ srq = to_msrq((*cur_qp)->ibqp.srq);
+ wqe_ctr = be16_to_cpu(cqe->wqe_index);
+ wc->wr_id = srq->wrid[wqe_ctr];
+ mlx4_ib_free_srq_wqe(srq, wqe_ctr);
+ } else {
+ wq = &(*cur_qp)->rq;
+ wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+ ++wq->tail;
+ }
+
+ if (unlikely(is_error)) {
+ mlx4_ib_handle_error_cqe((struct mlx4_err_cqe *) cqe, wc);
+ return 0;
+ }
+
+ wc->status = IB_WC_SUCCESS;
+
+ if (is_send) {
+ wc->wc_flags = 0;
+ switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
+ case MLX4_OPCODE_RDMA_WRITE_IMM:
+ wc->wc_flags |= IB_WC_WITH_IMM;
+ case MLX4_OPCODE_RDMA_WRITE:
+ wc->opcode = IB_WC_RDMA_WRITE;
+ break;
+ case MLX4_OPCODE_SEND_IMM:
+ wc->wc_flags |= IB_WC_WITH_IMM;
+ case MLX4_OPCODE_SEND:
+ case MLX4_OPCODE_SEND_INVAL:
+ wc->opcode = IB_WC_SEND;
+ break;
+ case MLX4_OPCODE_RDMA_READ:
+ wc->opcode = IB_WC_RDMA_READ;
+ wc->byte_len = be32_to_cpu(cqe->byte_cnt);
+ break;
+ case MLX4_OPCODE_ATOMIC_CS:
+ wc->opcode = IB_WC_COMP_SWAP;
+ wc->byte_len = 8;
+ break;
+ case MLX4_OPCODE_ATOMIC_FA:
+ wc->opcode = IB_WC_FETCH_ADD;
+ wc->byte_len = 8;
+ break;
+ case MLX4_OPCODE_MASKED_ATOMIC_CS:
+ wc->opcode = IB_WC_MASKED_COMP_SWAP;
+ wc->byte_len = 8;
+ break;
+ case MLX4_OPCODE_MASKED_ATOMIC_FA:
+ wc->opcode = IB_WC_MASKED_FETCH_ADD;
+ wc->byte_len = 8;
+ break;
+ case MLX4_OPCODE_BIND_MW:
+ wc->opcode = IB_WC_BIND_MW;
+ break;
+ case MLX4_OPCODE_LSO:
+ wc->opcode = IB_WC_LSO;
+ break;
+ case MLX4_OPCODE_FMR:
+ wc->opcode = IB_WC_FAST_REG_MR;
+ break;
+ case MLX4_OPCODE_LOCAL_INVAL:
+ wc->opcode = IB_WC_LOCAL_INV;
+ break;
+ }
+ } else {
+ wc->byte_len = be32_to_cpu(cqe->byte_cnt);
+
+ switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
+ case MLX4_RECV_OPCODE_RDMA_WRITE_IMM:
+ wc->opcode = IB_WC_RECV_RDMA_WITH_IMM;
+ wc->wc_flags = IB_WC_WITH_IMM;
+ wc->ex.imm_data = cqe->immed_rss_invalid;
+ break;
+ case MLX4_RECV_OPCODE_SEND_INVAL:
+ wc->opcode = IB_WC_RECV;
+ wc->wc_flags = IB_WC_WITH_INVALIDATE;
+ wc->ex.invalidate_rkey = be32_to_cpu(cqe->immed_rss_invalid);
+ break;
+ case MLX4_RECV_OPCODE_SEND:
+ wc->opcode = IB_WC_RECV;
+ wc->wc_flags = 0;
+ break;
+ case MLX4_RECV_OPCODE_SEND_IMM:
+ wc->opcode = IB_WC_RECV;
+ wc->wc_flags = IB_WC_WITH_IMM;
+ wc->ex.imm_data = cqe->immed_rss_invalid;
+ break;
+ }
+
+ wc->slid = be16_to_cpu(cqe->rlid);
+ wc->sl = be16_to_cpu(cqe->sl_vid) >> 12;
+ g_mlpath_rqpn = be32_to_cpu(cqe->g_mlpath_rqpn);
+ wc->src_qp = g_mlpath_rqpn & 0xffffff;
+ wc->dlid_path_bits = (g_mlpath_rqpn >> 24) & 0x7f;
+ wc->wc_flags |= g_mlpath_rqpn & 0x80000000 ? IB_WC_GRH : 0;
+ wc->pkey_index = be32_to_cpu(cqe->immed_rss_invalid) & 0x7f;
+ wc->csum_ok = mlx4_ib_ipoib_csum_ok(cqe->status, cqe->checksum);
+ }
+
+ return 0;
+}
+
+int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
+{
+ struct mlx4_ib_cq *cq = to_mcq(ibcq);
+ struct mlx4_ib_qp *cur_qp = NULL;
+ unsigned long flags;
+ int npolled;
+ int err = 0;
+
+ spin_lock_irqsave(&cq->lock, flags);
+
+ for (npolled = 0; npolled < num_entries; ++npolled) {
+ err = mlx4_ib_poll_one(cq, &cur_qp, wc + npolled);
+ if (err)
+ break;
+ }
+
+ if (npolled)
+ mlx4_cq_set_ci(&cq->mcq);
+
+ spin_unlock_irqrestore(&cq->lock, flags);
+
+ if (err == 0 || err == -EAGAIN)
+ return npolled;
+ else
+ return err;
+}
+
+int mlx4_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
+{
+ mlx4_cq_arm(&to_mcq(ibcq)->mcq,
+ (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ?
+ MLX4_CQ_DB_REQ_NOT_SOL : MLX4_CQ_DB_REQ_NOT,
+ to_mdev(ibcq->device)->priv_uar.map,
+ MLX4_GET_DOORBELL_LOCK(&to_mdev(ibcq->device)->uar_lock));
+
+ return 0;
+}
+
+void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq)
+{
+ u32 prod_index;
+ int nfreed = 0;
+ struct mlx4_cqe *cqe, *dest;
+ u8 owner_bit;
+ int is_xrc_srq = 0;
+
+ if (srq && srq->ibsrq.xrc_cq)
+ is_xrc_srq = 1;
+
+ /*
+ * First we need to find the current producer index, so we
+ * know where to start cleaning from. It doesn't matter if HW
+ * adds new entries after this loop -- the QP we're worried
+ * about is already in RESET, so the new entries won't come
+ * from our QP and therefore don't need to be checked.
+ */
+ for (prod_index = cq->mcq.cons_index; get_sw_cqe(cq, prod_index); ++prod_index)
+ if (prod_index == cq->mcq.cons_index + cq->ibcq.cqe)
+ break;
+
+ /*
+ * Now sweep backwards through the CQ, removing CQ entries
+ * that match our QP by copying older entries on top of them.
+ */
+ while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) {
+ cqe = get_cqe(cq, prod_index & cq->ibcq.cqe);
+ if (((be32_to_cpu(cqe->vlan_my_qpn) & 0xffffff) == qpn) ||
+ (is_xrc_srq &&
+ (be32_to_cpu(cqe->g_mlpath_rqpn) & 0xffffff) ==
+ srq->msrq.srqn)) {
+ if (srq && !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK))
+ mlx4_ib_free_srq_wqe(srq, be16_to_cpu(cqe->wqe_index));
+ ++nfreed;
+ } else if (nfreed) {
+ dest = get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe);
+ owner_bit = dest->owner_sr_opcode & MLX4_CQE_OWNER_MASK;
+ memcpy(dest, cqe, sizeof *cqe);
+ dest->owner_sr_opcode = owner_bit |
+ (dest->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK);
+ }
+ }
+
+ if (nfreed) {
+ cq->mcq.cons_index += nfreed;
+ /*
+ * Make sure update of buffer contents is done before
+ * updating consumer index.
+ */
+ wmb();
+ mlx4_cq_set_ci(&cq->mcq);
+ }
+}
+
+void mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq)
+{
+ spin_lock_irq(&cq->lock);
+ __mlx4_ib_cq_clean(cq, qpn, srq);
+ spin_unlock_irq(&cq->lock);
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/doorbell.c b/sys/ofed/drivers/infiniband/hw/mlx4/doorbell.c
new file mode 100644
index 0000000..8aee423
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/doorbell.c
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/slab.h>
+
+#include "mlx4_ib.h"
+
+struct mlx4_ib_user_db_page {
+ struct list_head list;
+ struct ib_umem *umem;
+ unsigned long user_virt;
+ int refcnt;
+};
+
+int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt,
+ struct mlx4_db *db)
+{
+ struct mlx4_ib_user_db_page *page;
+ struct ib_umem_chunk *chunk;
+ int err = 0;
+
+ mutex_lock(&context->db_page_mutex);
+
+ list_for_each_entry(page, &context->db_page_list, list)
+ if (page->user_virt == (virt & PAGE_MASK))
+ goto found;
+
+ page = kmalloc(sizeof *page, GFP_KERNEL);
+ if (!page) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ page->user_virt = (virt & PAGE_MASK);
+ page->refcnt = 0;
+ page->umem = ib_umem_get(&context->ibucontext, virt & PAGE_MASK,
+ PAGE_SIZE, 0, 0);
+ if (IS_ERR(page->umem)) {
+ err = PTR_ERR(page->umem);
+ kfree(page);
+ goto out;
+ }
+
+ list_add(&page->list, &context->db_page_list);
+
+found:
+ chunk = list_entry(page->umem->chunk_list.next, struct ib_umem_chunk, list);
+ db->dma = sg_dma_address(chunk->page_list) + (virt & ~PAGE_MASK);
+ db->u.user_page = page;
+ ++page->refcnt;
+
+out:
+ mutex_unlock(&context->db_page_mutex);
+
+ return err;
+}
+
+void mlx4_ib_db_unmap_user(struct mlx4_ib_ucontext *context, struct mlx4_db *db)
+{
+ mutex_lock(&context->db_page_mutex);
+
+ if (!--db->u.user_page->refcnt) {
+ list_del(&db->u.user_page->list);
+ ib_umem_release(db->u.user_page->umem);
+ kfree(db->u.user_page);
+ }
+
+ mutex_unlock(&context->db_page_mutex);
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/mad.c b/sys/ofed/drivers/infiniband/hw/mlx4/mad.c
new file mode 100644
index 0000000..2bb87ab
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/mad.c
@@ -0,0 +1,452 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_smi.h>
+
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4_ib.h"
+
+enum {
+ MLX4_IB_VENDOR_CLASS1 = 0x9,
+ MLX4_IB_VENDOR_CLASS2 = 0xa
+};
+
+int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int ignore_mkey, int ignore_bkey,
+ int port, struct ib_wc *in_wc, struct ib_grh *in_grh,
+ void *in_mad, void *response_mad)
+{
+ struct mlx4_cmd_mailbox *inmailbox, *outmailbox;
+ void *inbox;
+ int err;
+ u32 in_modifier = port;
+ u8 op_modifier = 0;
+
+ inmailbox = mlx4_alloc_cmd_mailbox(dev->dev);
+ if (IS_ERR(inmailbox))
+ return PTR_ERR(inmailbox);
+ inbox = inmailbox->buf;
+
+ outmailbox = mlx4_alloc_cmd_mailbox(dev->dev);
+ if (IS_ERR(outmailbox)) {
+ mlx4_free_cmd_mailbox(dev->dev, inmailbox);
+ return PTR_ERR(outmailbox);
+ }
+
+ memcpy(inbox, in_mad, 256);
+
+ /*
+ * Key check traps can't be generated unless we have in_wc to
+ * tell us where to send the trap.
+ */
+ if (ignore_mkey || !in_wc)
+ op_modifier |= 0x1;
+ if (ignore_bkey || !in_wc)
+ op_modifier |= 0x2;
+
+ if (in_wc) {
+ struct {
+ __be32 my_qpn;
+ u32 reserved1;
+ __be32 rqpn;
+ u8 sl;
+ u8 g_path;
+ u16 reserved2[2];
+ __be16 pkey;
+ u32 reserved3[11];
+ u8 grh[40];
+ } *ext_info;
+
+ memset(inbox + 256, 0, 256);
+ ext_info = inbox + 256;
+
+ ext_info->my_qpn = cpu_to_be32(in_wc->qp->qp_num);
+ ext_info->rqpn = cpu_to_be32(in_wc->src_qp);
+ ext_info->sl = in_wc->sl << 4;
+ ext_info->g_path = in_wc->dlid_path_bits |
+ (in_wc->wc_flags & IB_WC_GRH ? 0x80 : 0);
+ ext_info->pkey = cpu_to_be16(in_wc->pkey_index);
+
+ if (in_grh)
+ memcpy(ext_info->grh, in_grh, 40);
+
+ op_modifier |= 0x4;
+
+ in_modifier |= in_wc->slid << 16;
+ }
+
+ err = mlx4_cmd_box(dev->dev, inmailbox->dma, outmailbox->dma,
+ in_modifier, op_modifier,
+ MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C);
+
+ if (!err)
+ memcpy(response_mad, outmailbox->buf, 256);
+
+ mlx4_free_cmd_mailbox(dev->dev, inmailbox);
+ mlx4_free_cmd_mailbox(dev->dev, outmailbox);
+
+ return err;
+}
+
+static void update_sm_ah(struct mlx4_ib_dev *dev, u8 port_num, u16 lid, u8 sl)
+{
+ struct ib_ah *new_ah;
+ struct ib_ah_attr ah_attr;
+
+ if (!dev->send_agent[port_num - 1][0])
+ return;
+
+ memset(&ah_attr, 0, sizeof ah_attr);
+ ah_attr.dlid = lid;
+ ah_attr.sl = sl;
+ ah_attr.port_num = port_num;
+
+ new_ah = ib_create_ah(dev->send_agent[port_num - 1][0]->qp->pd,
+ &ah_attr);
+ if (IS_ERR(new_ah))
+ return;
+
+ spin_lock(&dev->sm_lock);
+ if (dev->sm_ah[port_num - 1])
+ ib_destroy_ah(dev->sm_ah[port_num - 1]);
+ dev->sm_ah[port_num - 1] = new_ah;
+ spin_unlock(&dev->sm_lock);
+}
+
+/*
+ * Snoop SM MADs for port info and P_Key table sets, so we can
+ * synthesize LID change and P_Key change events.
+ */
+static void smp_snoop(struct ib_device *ibdev, u8 port_num, struct ib_mad *mad,
+ u16 prev_lid)
+{
+ struct ib_event event;
+
+ if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+ mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) &&
+ mad->mad_hdr.method == IB_MGMT_METHOD_SET) {
+ if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO) {
+ struct ib_port_info *pinfo =
+ (struct ib_port_info *) ((struct ib_smp *) mad)->data;
+ u16 lid = be16_to_cpu(pinfo->lid);
+
+ update_sm_ah(to_mdev(ibdev), port_num,
+ be16_to_cpu(pinfo->sm_lid),
+ pinfo->neighbormtu_mastersmsl & 0xf);
+
+ event.device = ibdev;
+ event.element.port_num = port_num;
+
+ if (pinfo->clientrereg_resv_subnetto & 0x80) {
+ event.event = IB_EVENT_CLIENT_REREGISTER;
+ ib_dispatch_event(&event);
+ }
+
+ if (prev_lid != lid) {
+ event.event = IB_EVENT_LID_CHANGE;
+ ib_dispatch_event(&event);
+ }
+ }
+
+ if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PKEY_TABLE) {
+ event.device = ibdev;
+ event.event = IB_EVENT_PKEY_CHANGE;
+ event.element.port_num = port_num;
+ ib_dispatch_event(&event);
+ }
+ }
+}
+
+static void node_desc_override(struct ib_device *dev,
+ struct ib_mad *mad)
+{
+ if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+ mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) &&
+ mad->mad_hdr.method == IB_MGMT_METHOD_GET_RESP &&
+ mad->mad_hdr.attr_id == IB_SMP_ATTR_NODE_DESC) {
+ spin_lock(&to_mdev(dev)->sm_lock);
+ memcpy(((struct ib_smp *) mad)->data, dev->node_desc, 64);
+ spin_unlock(&to_mdev(dev)->sm_lock);
+ }
+}
+
+static void forward_trap(struct mlx4_ib_dev *dev, u8 port_num, struct ib_mad *mad)
+{
+ int qpn = mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED;
+ struct ib_mad_send_buf *send_buf;
+ struct ib_mad_agent *agent = dev->send_agent[port_num - 1][qpn];
+ int ret;
+
+ if (agent) {
+ send_buf = ib_create_send_mad(agent, qpn, 0, 0, IB_MGMT_MAD_HDR,
+ IB_MGMT_MAD_DATA, GFP_ATOMIC);
+ /*
+ * We rely here on the fact that MLX QPs don't use the
+ * address handle after the send is posted (this is
+ * wrong following the IB spec strictly, but we know
+ * it's OK for our devices).
+ */
+ spin_lock(&dev->sm_lock);
+ memcpy(send_buf->mad, mad, sizeof *mad);
+ if ((send_buf->ah = dev->sm_ah[port_num - 1]))
+ ret = ib_post_send_mad(send_buf, NULL);
+ else
+ ret = -EINVAL;
+ spin_unlock(&dev->sm_lock);
+
+ if (ret)
+ ib_free_send_mad(send_buf);
+ }
+}
+
+static int is_vendor_id(__be16 attr_id)
+{
+ return (attr_id & IB_SMP_ATTR_VENDOR_MASK) == IB_SMP_ATTR_VENDOR_MASK;
+}
+
+static int supported_vendor_id(__be16 attr_id)
+{
+ return 1;
+}
+
+static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+ struct ib_wc *in_wc, struct ib_grh *in_grh,
+ struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+ u16 slid, prev_lid = 0;
+ int err;
+ struct ib_port_attr pattr;
+
+ slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE);
+
+ if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) {
+ forward_trap(to_mdev(ibdev), port_num, in_mad);
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+ }
+
+ if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+ in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+ if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET &&
+ in_mad->mad_hdr.method != IB_MGMT_METHOD_SET &&
+ in_mad->mad_hdr.method != IB_MGMT_METHOD_TRAP_REPRESS)
+ return IB_MAD_RESULT_SUCCESS;
+
+ /*
+ * Don't process SMInfo queries or vendor-specific
+ * MADs -- the SMA can't handle them.
+ */
+ if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO ||
+ (is_vendor_id(in_mad->mad_hdr.attr_id) &&
+ !supported_vendor_id(in_mad->mad_hdr.attr_id)))
+ return IB_MAD_RESULT_SUCCESS;
+ } else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT ||
+ in_mad->mad_hdr.mgmt_class == MLX4_IB_VENDOR_CLASS1 ||
+ in_mad->mad_hdr.mgmt_class == MLX4_IB_VENDOR_CLASS2 ||
+ in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_CONG_MGMT) {
+ if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET &&
+ in_mad->mad_hdr.method != IB_MGMT_METHOD_SET)
+ return IB_MAD_RESULT_SUCCESS;
+ } else
+ return IB_MAD_RESULT_SUCCESS;
+
+ if ((in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+ in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) &&
+ in_mad->mad_hdr.method == IB_MGMT_METHOD_SET &&
+ in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO &&
+ !ib_query_port(ibdev, port_num, &pattr))
+ prev_lid = pattr.lid;
+
+ err = mlx4_MAD_IFC(to_mdev(ibdev),
+ mad_flags & IB_MAD_IGNORE_MKEY,
+ mad_flags & IB_MAD_IGNORE_BKEY,
+ port_num, in_wc, in_grh, in_mad, out_mad);
+ if (err)
+ return IB_MAD_RESULT_FAILURE;
+
+ if (!out_mad->mad_hdr.status) {
+ smp_snoop(ibdev, port_num, in_mad, prev_lid);
+ node_desc_override(ibdev, out_mad);
+ }
+
+ /* set return bit in status of directed route responses */
+ if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ out_mad->mad_hdr.status |= cpu_to_be16(1 << 15);
+
+ if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS)
+ /* no response for trap repress */
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+}
+
+static __be32 be64_to_be32(__be64 b64)
+{
+ return cpu_to_be32(be64_to_cpu(b64) & 0xffffffff);
+}
+
+static void edit_counters(struct mlx4_counters *cnt, void *data)
+{
+ *(__be32 *)(data + 40 + 24) = be64_to_be32(cnt->tx_bytes);
+ *(__be32 *)(data + 40 + 28) = be64_to_be32(cnt->rx_bytes);
+ *(__be32 *)(data + 40 + 32) = be64_to_be32(cnt->tx_frames);
+ *(__be32 *)(data + 40 + 36) = be64_to_be32(cnt->rx_frames);
+}
+
+static void edit_ext_counters(struct mlx4_counters_ext *cnt, void *data)
+{
+ *(__be32 *)(data + 40 + 24) = be64_to_be32(cnt->tx_uni_bytes);
+ *(__be32 *)(data + 40 + 28) = be64_to_be32(cnt->rx_uni_bytes);
+ *(__be32 *)(data + 40 + 32) = be64_to_be32(cnt->tx_uni_frames);
+ *(__be32 *)(data + 40 + 36) = be64_to_be32(cnt->rx_uni_frames);
+ *(__be32 *)(data + 40 + 8) = be64_to_be32(cnt->rx_err_frames);
+}
+
+static int rdmaoe_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+ struct ib_wc *in_wc, struct ib_grh *in_grh,
+ struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+ struct mlx4_cmd_mailbox *mailbox;
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ int err;
+ u32 inmod = dev->counters[port_num - 1] & 0xffff;
+ int mode;
+
+ if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_PERF_MGMT)
+ return -EINVAL;
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev->dev);
+ if (IS_ERR(mailbox))
+ return IB_MAD_RESULT_FAILURE;
+
+ err = mlx4_cmd_box(dev->dev, 0, mailbox->dma, inmod, 0,
+ MLX4_CMD_QUERY_IF_STAT, MLX4_CMD_TIME_CLASS_C);
+ if (err)
+ err = IB_MAD_RESULT_FAILURE;
+ else {
+ memset(out_mad->data, 0, sizeof out_mad->data);
+ mode = be32_to_cpu(((struct mlx4_counters *)mailbox->buf)->counter_mode) & 0xf;
+ switch (mode) {
+ case 0:
+ edit_counters(mailbox->buf, out_mad->data);
+ err = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+ break;
+ case 1:
+ edit_ext_counters(mailbox->buf, out_mad->data);
+ err = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+ break;
+ default:
+ err = IB_MAD_RESULT_FAILURE;
+ }
+ }
+
+ mlx4_free_cmd_mailbox(dev->dev, mailbox);
+
+ return err;
+}
+
+int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+ struct ib_wc *in_wc, struct ib_grh *in_grh,
+ struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+ switch (rdma_port_get_link_layer(ibdev, port_num)) {
+ case IB_LINK_LAYER_INFINIBAND:
+ return ib_process_mad(ibdev, mad_flags, port_num, in_wc,
+ in_grh, in_mad, out_mad);
+ case IB_LINK_LAYER_ETHERNET:
+ return rdmaoe_process_mad(ibdev, mad_flags, port_num, in_wc,
+ in_grh, in_mad, out_mad);
+ default:
+ return -EINVAL;
+ }
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ ib_free_send_mad(mad_send_wc->send_buf);
+}
+
+int mlx4_ib_mad_init(struct mlx4_ib_dev *dev)
+{
+ struct ib_mad_agent *agent;
+ int p, q;
+ int ret;
+ enum rdma_link_layer ll;
+
+ for (p = 0; p < dev->num_ports; ++p) {
+ ll = rdma_port_get_link_layer(&dev->ib_dev, p + 1);
+ for (q = 0; q <= 1; ++q) {
+ if (ll == IB_LINK_LAYER_INFINIBAND) {
+ agent = ib_register_mad_agent(&dev->ib_dev, p + 1,
+ q ? IB_QPT_GSI : IB_QPT_SMI,
+ NULL, 0, send_handler,
+ NULL, NULL);
+ if (IS_ERR(agent)) {
+ ret = PTR_ERR(agent);
+ goto err;
+ }
+ dev->send_agent[p][q] = agent;
+ } else
+ dev->send_agent[p][q] = NULL;
+ }
+ }
+
+ return 0;
+
+err:
+ for (p = 0; p < dev->num_ports; ++p)
+ for (q = 0; q <= 1; ++q)
+ if (dev->send_agent[p][q])
+ ib_unregister_mad_agent(dev->send_agent[p][q]);
+
+ return ret;
+}
+
+void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev)
+{
+ struct ib_mad_agent *agent;
+ int p, q;
+
+ for (p = 0; p < dev->num_ports; ++p) {
+ for (q = 0; q <= 1; ++q) {
+ agent = dev->send_agent[p][q];
+ if (agent) {
+ dev->send_agent[p][q] = NULL;
+ ib_unregister_mad_agent(agent);
+ }
+ }
+
+ if (dev->sm_ah[p])
+ ib_destroy_ah(dev->sm_ah[p]);
+ }
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/main.c b/sys/ofed/drivers/infiniband/hw/mlx4/main.c
new file mode 100644
index 0000000..bc99414
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/main.c
@@ -0,0 +1,1580 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/if_vlan.h>
+
+#include <rdma/ib_smi.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_addr.h>
+
+#include <linux/mlx4/driver.h>
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4_ib.h"
+#include "user.h"
+#include "wc.h"
+
+#define DRV_NAME MLX4_IB_DRV_NAME
+#define DRV_VERSION "1.0-ofed1.5.2"
+#define DRV_RELDATE "August 4, 2010"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+
+#ifdef CONFIG_MLX4_DEBUG
+
+int mlx4_ib_debug_level = 0;
+module_param_named(debug_level, mlx4_ib_debug_level, int, 0644);
+MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
+
+#endif /* CONFIG_MLX4_DEBUG */
+
+static const char mlx4_ib_version[] =
+ DRV_NAME ": Mellanox ConnectX InfiniBand driver v"
+ DRV_VERSION " (" DRV_RELDATE ")\n";
+
+static void *get_ibdev(struct mlx4_dev *dev, void *ctx, u8 port)
+{
+ struct mlx4_ib_dev *mlxibdev = ctx;
+ return &mlxibdev->ib_dev;
+}
+
+struct update_gid_work {
+ struct work_struct work;
+ union ib_gid gids[128];
+ int port;
+ struct mlx4_ib_dev *dev;
+};
+
+static struct workqueue_struct *wq;
+
+static void init_query_mad(struct ib_smp *mad)
+{
+ mad->base_version = 1;
+ mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+ mad->class_version = 1;
+ mad->method = IB_MGMT_METHOD_GET;
+}
+
+static union ib_gid zgid;
+
+static int mlx4_ib_query_device(struct ib_device *ibdev,
+ struct ib_device_attr *props)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ int err = -ENOMEM;
+
+ in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_NODE_INFO;
+
+ err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, 1, NULL, NULL, in_mad, out_mad);
+ if (err)
+ goto out;
+
+ memset(props, 0, sizeof *props);
+
+ props->fw_ver = dev->dev->caps.fw_ver;
+ props->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT |
+ IB_DEVICE_PORT_ACTIVE_EVENT |
+ IB_DEVICE_SYS_IMAGE_GUID |
+ IB_DEVICE_RC_RNR_NAK_GEN |
+ IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
+ if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR)
+ props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
+ if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR)
+ props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
+ if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_APM)
+ props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
+ if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UD_AV_PORT)
+ props->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE;
+ if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_IPOIB_CSUM)
+ props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM;
+ if (dev->dev->caps.max_gso_sz && dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BLH)
+ props->device_cap_flags |= IB_DEVICE_UD_TSO;
+ if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_RESERVED_LKEY)
+ props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY;
+ if ((dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_LOCAL_INV) &&
+ (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_REMOTE_INV) &&
+ (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_FAST_REG_WR))
+ props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
+ if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC)
+ props->device_cap_flags |= IB_DEVICE_XRC;
+ if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_RAW_ETY)
+ props->max_raw_ethy_qp = dev->ib_dev.phys_port_cnt;
+
+ props->vendor_id = be32_to_cpup((__be32 *) (out_mad->data + 36)) &
+ 0xffffff;
+ props->vendor_part_id = be16_to_cpup((__be16 *) (out_mad->data + 30));
+ props->hw_ver = be32_to_cpup((__be32 *) (out_mad->data + 32));
+ memcpy(&props->sys_image_guid, out_mad->data + 4, 8);
+
+ props->max_mr_size = ~0ull;
+ props->page_size_cap = dev->dev->caps.page_size_cap;
+ props->max_qp = dev->dev->caps.num_qps - dev->dev->caps.reserved_qps;
+ props->max_qp_wr = dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE;
+ props->max_sge = min(dev->dev->caps.max_sq_sg,
+ dev->dev->caps.max_rq_sg);
+ props->max_cq = dev->dev->caps.num_cqs - dev->dev->caps.reserved_cqs;
+ props->max_cqe = dev->dev->caps.max_cqes;
+ props->max_mr = dev->dev->caps.num_mpts - dev->dev->caps.reserved_mrws;
+ props->max_pd = dev->dev->caps.num_pds - dev->dev->caps.reserved_pds;
+ props->max_qp_rd_atom = dev->dev->caps.max_qp_dest_rdma;
+ props->max_qp_init_rd_atom = dev->dev->caps.max_qp_init_rdma;
+ props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp;
+ props->max_srq = dev->dev->caps.num_srqs - dev->dev->caps.reserved_srqs;
+ props->max_srq_wr = dev->dev->caps.max_srq_wqes - 1;
+ props->max_srq_sge = dev->dev->caps.max_srq_sge;
+ props->max_fast_reg_page_list_len = MAX_FAST_REG_PAGES;
+ props->local_ca_ack_delay = dev->dev->caps.local_ca_ack_delay;
+ props->atomic_cap = dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_ATOMIC ?
+ IB_ATOMIC_HCA : IB_ATOMIC_NONE;
+ props->masked_atomic_cap = IB_ATOMIC_HCA;
+ props->max_pkeys = dev->dev->caps.pkey_table_len[1];
+ props->max_mcast_grp = dev->dev->caps.num_mgms + dev->dev->caps.num_amgms;
+ props->max_mcast_qp_attach = dev->dev->caps.num_qp_per_mgm;
+ props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
+ props->max_mcast_grp;
+ props->max_map_per_fmr = (1 << (32 - ilog2(dev->dev->caps.num_mpts))) - 1;
+
+out:
+ kfree(in_mad);
+ kfree(out_mad);
+
+ return err;
+}
+
+static enum rdma_link_layer
+mlx4_ib_port_link_layer(struct ib_device *device, u8 port_num)
+{
+ struct mlx4_dev *dev = to_mdev(device)->dev;
+
+ return dev->caps.port_mask[port_num] == MLX4_PORT_TYPE_IB ?
+ IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET;
+}
+
+static void ib_link_query_port(struct ib_device *ibdev, u8 port,
+ struct ib_port_attr *props,
+ struct ib_smp *out_mad)
+{
+ props->lid = be16_to_cpup((__be16 *) (out_mad->data + 16));
+ props->lmc = out_mad->data[34] & 0x7;
+ props->sm_lid = be16_to_cpup((__be16 *) (out_mad->data + 18));
+ props->sm_sl = out_mad->data[36] & 0xf;
+ props->state = out_mad->data[32] & 0xf;
+ props->phys_state = out_mad->data[33] >> 4;
+ props->port_cap_flags = be32_to_cpup((__be32 *) (out_mad->data + 20));
+ props->gid_tbl_len = to_mdev(ibdev)->dev->caps.gid_table_len[port];
+ props->max_msg_sz = to_mdev(ibdev)->dev->caps.max_msg_sz;
+ props->pkey_tbl_len = to_mdev(ibdev)->dev->caps.pkey_table_len[port];
+ props->bad_pkey_cntr = be16_to_cpup((__be16 *) (out_mad->data + 46));
+ props->qkey_viol_cntr = be16_to_cpup((__be16 *) (out_mad->data + 48));
+ props->active_width = out_mad->data[31] & 0xf;
+ props->active_speed = out_mad->data[35] >> 4;
+ props->max_mtu = out_mad->data[41] & 0xf;
+ props->active_mtu = out_mad->data[36] >> 4;
+ props->subnet_timeout = out_mad->data[51] & 0x1f;
+ props->max_vl_num = out_mad->data[37] >> 4;
+ props->init_type_reply = out_mad->data[41] >> 4;
+ props->link_layer = IB_LINK_LAYER_INFINIBAND;
+}
+
+#ifdef notyet
+static int eth_to_ib_width(int w)
+{
+ switch (w) {
+ case 4:
+ return IB_WIDTH_4X;
+ case 8:
+ case 16:
+ return IB_WIDTH_8X;
+ case 32:
+ return IB_WIDTH_12X;
+ default:
+ return IB_WIDTH_1X;
+ }
+}
+
+static int eth_to_ib_speed(int s)
+{
+ switch (s) {
+ case 256:
+ return 1;
+ case 512:
+ return 2;
+ case 1024:
+ return 4;
+ default:
+ return 1;
+ }
+}
+#endif
+
+static u8 state_to_phys_state(enum ib_port_state state)
+{
+ return state == IB_PORT_ACTIVE ? 5 : 3;
+}
+
+static int eth_link_query_port(struct ib_device *ibdev, u8 port,
+ struct ib_port_attr *props,
+ struct ib_smp *out_mad)
+{
+ struct mlx4_ib_iboe *iboe = &to_mdev(ibdev)->iboe;
+ struct net_device *ndev;
+ enum ib_mtu tmp;
+
+ props->active_width = IB_WIDTH_4X;
+ props->active_speed = 1;
+ props->port_cap_flags = IB_PORT_CM_SUP;
+ props->gid_tbl_len = to_mdev(ibdev)->dev->caps.gid_table_len[port];
+ props->max_msg_sz = to_mdev(ibdev)->dev->caps.max_msg_sz;
+ props->pkey_tbl_len = 1;
+ props->bad_pkey_cntr = be16_to_cpup((__be16 *) (out_mad->data + 46));
+ props->qkey_viol_cntr = be16_to_cpup((__be16 *) (out_mad->data + 48));
+ props->max_mtu = IB_MTU_2048;
+ props->subnet_timeout = 0;
+ props->max_vl_num = out_mad->data[37] >> 4;
+ props->init_type_reply = 0;
+ props->link_layer = IB_LINK_LAYER_ETHERNET;
+ props->state = IB_PORT_DOWN;
+ props->phys_state = state_to_phys_state(props->state);
+ props->active_mtu = IB_MTU_256;
+ spin_lock(&iboe->lock);
+ ndev = iboe->netdevs[port - 1];
+ if (!ndev)
+ goto out;
+
+#ifdef __linux__
+ tmp = iboe_get_mtu(ndev->mtu);
+#else
+ tmp = iboe_get_mtu(ndev->if_mtu);
+#endif
+ props->active_mtu = tmp ? min(props->max_mtu, tmp) : IB_MTU_256;
+ props->state = netif_carrier_ok(ndev) && netif_oper_up(ndev) ?
+ IB_PORT_ACTIVE : IB_PORT_DOWN;
+ props->phys_state = state_to_phys_state(props->state);
+
+out:
+ spin_unlock(&iboe->lock);
+ return 0;
+}
+
+static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port,
+ struct ib_port_attr *props)
+{
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ int err = -ENOMEM;
+
+ in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ memset(props, 0, sizeof *props);
+
+ init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_PORT_INFO;
+ in_mad->attr_mod = cpu_to_be32(port);
+
+ err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad);
+ if (err)
+ goto out;
+
+ mlx4_ib_port_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND ?
+ ib_link_query_port(ibdev, port, props, out_mad) :
+ eth_link_query_port(ibdev, port, props, out_mad);
+
+out:
+ kfree(in_mad);
+ kfree(out_mad);
+
+ return err;
+}
+
+static int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
+ union ib_gid *gid)
+{
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ int err = -ENOMEM;
+
+ in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_PORT_INFO;
+ in_mad->attr_mod = cpu_to_be32(port);
+
+ err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad);
+ if (err)
+ goto out;
+
+ memcpy(gid->raw, out_mad->data + 8, 8);
+
+ init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_GUID_INFO;
+ in_mad->attr_mod = cpu_to_be32(index / 8);
+
+ err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad);
+ if (err)
+ goto out;
+
+ memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8);
+
+out:
+ kfree(in_mad);
+ kfree(out_mad);
+ return err;
+}
+
+static int iboe_query_gid(struct ib_device *ibdev, u8 port, int index,
+ union ib_gid *gid)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+
+ *gid = dev->iboe.gid_table[port - 1][index];
+
+ return 0;
+}
+
+static int mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
+ union ib_gid *gid)
+{
+ if (rdma_port_get_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND)
+ return __mlx4_ib_query_gid(ibdev, port, index, gid);
+ else
+ return iboe_query_gid(ibdev, port, index, gid);
+}
+
+static int mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+ u16 *pkey)
+{
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ int err = -ENOMEM;
+
+ in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_PKEY_TABLE;
+ in_mad->attr_mod = cpu_to_be32(index / 32);
+
+ err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad);
+ if (err)
+ goto out;
+
+ *pkey = be16_to_cpu(((__be16 *) out_mad->data)[index % 32]);
+
+out:
+ kfree(in_mad);
+ kfree(out_mad);
+ return err;
+}
+
+static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask,
+ struct ib_device_modify *props)
+{
+ struct mlx4_cmd_mailbox *mailbox;
+ int err;
+
+ if (mask & ~IB_DEVICE_MODIFY_NODE_DESC)
+ return -EOPNOTSUPP;
+
+ if (!(mask & IB_DEVICE_MODIFY_NODE_DESC))
+ return 0;
+
+ spin_lock(&to_mdev(ibdev)->sm_lock);
+ memcpy(ibdev->node_desc, props->node_desc, 64);
+ spin_unlock(&to_mdev(ibdev)->sm_lock);
+
+ /* if possible, pass node desc to FW, so it can generate
+ * a 144 trap. If cmd fails, just ignore.
+ */
+ mailbox = mlx4_alloc_cmd_mailbox(to_mdev(ibdev)->dev);
+ if (IS_ERR(mailbox))
+ return 0;
+
+ memset(mailbox->buf, 0, 256);
+ memcpy(mailbox->buf, props->node_desc, 64);
+ err = mlx4_cmd(to_mdev(ibdev)->dev, mailbox->dma, 1, 0,
+ MLX4_CMD_SET_NODE, MLX4_CMD_TIME_CLASS_A);
+ if (err)
+ mlx4_ib_dbg("SET_NODE command failed (%d)", err);
+
+ mlx4_free_cmd_mailbox(to_mdev(ibdev)->dev, mailbox);
+
+ return 0;
+}
+
+static int mlx4_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols,
+ u32 cap_mask)
+{
+ struct mlx4_cmd_mailbox *mailbox;
+ int err;
+ u8 is_eth = dev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH;
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev->dev);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+
+ memset(mailbox->buf, 0, 256);
+
+ if (dev->dev->flags & MLX4_FLAG_OLD_PORT_CMDS) {
+ *(u8 *) mailbox->buf = !!reset_qkey_viols << 6;
+ ((__be32 *) mailbox->buf)[2] = cpu_to_be32(cap_mask);
+ } else {
+ ((u8 *) mailbox->buf)[3] = !!reset_qkey_viols;
+ ((__be32 *) mailbox->buf)[1] = cpu_to_be32(cap_mask);
+ }
+
+ err = mlx4_cmd(dev->dev, mailbox->dma, port, is_eth, MLX4_CMD_SET_PORT,
+ MLX4_CMD_TIME_CLASS_B);
+
+ mlx4_free_cmd_mailbox(dev->dev, mailbox);
+ return err;
+}
+
+static int mlx4_ib_modify_port(struct ib_device *ibdev, u8 port, int mask,
+ struct ib_port_modify *props)
+{
+ struct ib_port_attr attr;
+ u32 cap_mask;
+ int err;
+
+ mutex_lock(&to_mdev(ibdev)->cap_mask_mutex);
+
+ err = mlx4_ib_query_port(ibdev, port, &attr);
+ if (err)
+ goto out;
+
+ cap_mask = (attr.port_cap_flags | props->set_port_cap_mask) &
+ ~props->clr_port_cap_mask;
+
+ err = mlx4_SET_PORT(to_mdev(ibdev), port,
+ !!(mask & IB_PORT_RESET_QKEY_CNTR),
+ cap_mask);
+
+out:
+ mutex_unlock(&to_mdev(ibdev)->cap_mask_mutex);
+ return err;
+}
+
+static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev,
+ struct ib_udata *udata)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ struct mlx4_ib_ucontext *context;
+ struct mlx4_ib_alloc_ucontext_resp resp;
+ int err;
+
+ if (!dev->ib_active)
+ return ERR_PTR(-EAGAIN);
+
+ resp.qp_tab_size = dev->dev->caps.num_qps;
+
+ if (mlx4_wc_enabled()) {
+ resp.bf_reg_size = dev->dev->caps.bf_reg_size;
+ resp.bf_regs_per_page = dev->dev->caps.bf_regs_per_page;
+ } else {
+ resp.bf_reg_size = 0;
+ resp.bf_regs_per_page = 0;
+ }
+
+ context = kzalloc(sizeof *context, GFP_KERNEL);
+ if (!context)
+ return ERR_PTR(-ENOMEM);
+
+ err = mlx4_uar_alloc(to_mdev(ibdev)->dev, &context->uar);
+ if (err) {
+ kfree(context);
+ return ERR_PTR(err);
+ }
+
+ INIT_LIST_HEAD(&context->db_page_list);
+ mutex_init(&context->db_page_mutex);
+
+ err = ib_copy_to_udata(udata, &resp, sizeof resp);
+ if (err) {
+ mlx4_uar_free(to_mdev(ibdev)->dev, &context->uar);
+ kfree(context);
+ return ERR_PTR(-EFAULT);
+ }
+
+ return &context->ibucontext;
+}
+
+static int mlx4_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
+{
+ struct mlx4_ib_ucontext *context = to_mucontext(ibcontext);
+
+ mlx4_uar_free(to_mdev(ibcontext->device)->dev, &context->uar);
+ kfree(context);
+
+ return 0;
+}
+
+static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+ struct mlx4_ib_dev *dev = to_mdev(context->device);
+
+ if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+ return -EINVAL;
+
+ if (vma->vm_pgoff == 0) {
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+ if (io_remap_pfn_range(vma, vma->vm_start,
+ to_mucontext(context)->uar.pfn,
+ PAGE_SIZE, vma->vm_page_prot))
+ return -EAGAIN;
+ } else if (vma->vm_pgoff == 1 && dev->dev->caps.bf_reg_size != 0) {
+ vma->vm_page_prot = pgprot_wc(vma->vm_page_prot);
+
+ if (io_remap_pfn_range(vma, vma->vm_start,
+ to_mucontext(context)->uar.pfn +
+ dev->dev->caps.num_uars,
+ PAGE_SIZE, vma->vm_page_prot))
+ return -EAGAIN;
+ } else
+ return -EINVAL;
+
+ return 0;
+}
+
+static struct ib_pd *mlx4_ib_alloc_pd(struct ib_device *ibdev,
+ struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ struct mlx4_ib_pd *pd;
+ int err;
+
+ pd = kzalloc(sizeof *pd, GFP_KERNEL);
+ if (!pd)
+ return ERR_PTR(-ENOMEM);
+
+ err = mlx4_pd_alloc(to_mdev(ibdev)->dev, &pd->pdn);
+ if (err) {
+ kfree(pd);
+ return ERR_PTR(err);
+ }
+
+ if (context)
+ if (ib_copy_to_udata(udata, &pd->pdn, sizeof (__u32))) {
+ mlx4_pd_free(to_mdev(ibdev)->dev, pd->pdn);
+ kfree(pd);
+ return ERR_PTR(-EFAULT);
+ }
+
+ return &pd->ibpd;
+}
+
+static int mlx4_ib_dealloc_pd(struct ib_pd *pd)
+{
+ mlx4_pd_free(to_mdev(pd->device)->dev, to_mpd(pd)->pdn);
+ kfree(pd);
+
+ return 0;
+}
+
+static int add_gid_entry(struct ib_qp *ibqp, union ib_gid *gid)
+{
+ struct mlx4_ib_qp *mqp = to_mqp(ibqp);
+ struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
+ struct gid_entry *ge;
+
+ ge = kzalloc(sizeof *ge, GFP_KERNEL);
+ if (!ge)
+ return -ENOMEM;
+
+ ge->gid = *gid;
+ if (mlx4_ib_add_mc(mdev, mqp, gid)) {
+ ge->port = mqp->port;
+ ge->added = 1;
+ }
+
+ mutex_lock(&mqp->mutex);
+ list_add_tail(&ge->list, &mqp->gid_list);
+ mutex_unlock(&mqp->mutex);
+
+ return 0;
+}
+
+int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp,
+ union ib_gid *gid)
+{
+ u8 mac[6];
+ struct net_device *ndev;
+ int ret = 0;
+
+ if (!mqp->port)
+ return 0;
+ spin_lock(&mdev->iboe.lock);
+ ndev = mdev->iboe.netdevs[mqp->port - 1];
+ if (ndev)
+ dev_hold(ndev);
+ spin_unlock(&mdev->iboe.lock);
+ if (ndev) {
+ rdma_get_mcast_mac((struct in6_addr *)gid, mac);
+ rtnl_lock();
+ dev_mc_add(mdev->iboe.netdevs[mqp->port - 1], mac, 6, 0);
+ ret = 1;
+ rtnl_unlock();
+ dev_put(ndev);
+ }
+
+ return ret;
+}
+
+static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ int err;
+ struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
+ struct mlx4_ib_qp *mqp = to_mqp(ibqp);
+
+ err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, !!(mqp->flags &
+ MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK),
+ (ibqp->qp_type == IB_QPT_RAW_ETH) ?
+ MLX4_MCAST_PROT_EN : MLX4_MCAST_PROT_IB);
+ if (err)
+ return err;
+
+ err = add_gid_entry(ibqp, gid);
+ if (err)
+ goto err_add;
+
+ return 0;
+
+err_add:
+ mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw,
+ (ibqp->qp_type == IB_QPT_RAW_ETH) ?
+ MLX4_MCAST_PROT_EN : MLX4_MCAST_PROT_IB);
+ return err;
+}
+
+static struct gid_entry *find_gid_entry(struct mlx4_ib_qp *qp, u8 *raw)
+{
+ struct gid_entry *ge;
+ struct gid_entry *tmp;
+ struct gid_entry *ret = NULL;
+
+ list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) {
+ if (!memcmp(raw, ge->gid.raw, 16)) {
+ ret = ge;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ int err;
+ struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
+ struct mlx4_ib_qp *mqp = to_mqp(ibqp);
+ u8 mac[6];
+ struct net_device *ndev;
+ struct gid_entry *ge;
+
+ err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw,
+ (ibqp->qp_type == IB_QPT_RAW_ETH) ?
+ MLX4_MCAST_PROT_EN : MLX4_MCAST_PROT_IB);
+ if (err)
+ return err;
+
+ mutex_lock(&mqp->mutex);
+ ge = find_gid_entry(mqp, gid->raw);
+ if (ge) {
+ spin_lock(&mdev->iboe.lock);
+ ndev = ge->added ? mdev->iboe.netdevs[ge->port - 1] : NULL;
+ if (ndev)
+ dev_hold(ndev);
+ spin_unlock(&mdev->iboe.lock);
+ rdma_get_mcast_mac((struct in6_addr *)gid, mac);
+ if (ndev) {
+ rtnl_lock();
+ dev_mc_delete(mdev->iboe.netdevs[ge->port - 1], mac, 6, 0);
+ rtnl_unlock();
+ dev_put(ndev);
+ }
+ list_del(&ge->list);
+ kfree(ge);
+ } else
+ printk(KERN_WARNING "could not find mgid entry\n");
+
+ mutex_unlock(&mqp->mutex);
+
+ return 0;
+}
+
+static void mlx4_dummy_comp_handler(struct ib_cq *cq, void *cq_context)
+{
+}
+
+static struct ib_xrcd *mlx4_ib_alloc_xrcd(struct ib_device *ibdev,
+ struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ struct mlx4_ib_xrcd *xrcd;
+ struct mlx4_ib_dev *mdev = to_mdev(ibdev);
+ struct ib_pd *pd;
+ struct ib_cq *cq;
+ int err;
+
+ if (!(mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC))
+ return ERR_PTR(-ENOSYS);
+
+ xrcd = kmalloc(sizeof *xrcd, GFP_KERNEL);
+ if (!xrcd)
+ return ERR_PTR(-ENOMEM);
+
+ err = mlx4_xrcd_alloc(mdev->dev, &xrcd->xrcdn);
+ if (err)
+ goto err_xrcd;
+
+ pd = mlx4_ib_alloc_pd(ibdev, NULL, NULL);
+ if (IS_ERR(pd)) {
+ err = PTR_ERR(pd);
+ goto err_pd;
+ }
+ pd->device = ibdev;
+
+ cq = mlx4_ib_create_cq(ibdev, 1, 0, NULL, NULL);
+ if (IS_ERR(cq)) {
+ err = PTR_ERR(cq);
+ goto err_cq;
+ }
+ cq->device = ibdev;
+ cq->comp_handler = mlx4_dummy_comp_handler;
+
+ if (context)
+ if (ib_copy_to_udata(udata, &xrcd->xrcdn, sizeof(__u32))) {
+ err = -EFAULT;
+ goto err_copy;
+ }
+
+ xrcd->cq = cq;
+ xrcd->pd = pd;
+ return &xrcd->ibxrcd;
+
+err_copy:
+ mlx4_ib_destroy_cq(cq);
+err_cq:
+ mlx4_ib_dealloc_pd(pd);
+err_pd:
+ mlx4_xrcd_free(mdev->dev, xrcd->xrcdn);
+err_xrcd:
+ kfree(xrcd);
+ return ERR_PTR(err);
+}
+
+static int mlx4_ib_dealloc_xrcd(struct ib_xrcd *xrcd)
+{
+ struct mlx4_ib_xrcd *mxrcd = to_mxrcd(xrcd);
+
+ mlx4_ib_destroy_cq(mxrcd->cq);
+ mlx4_ib_dealloc_pd(mxrcd->pd);
+ mlx4_xrcd_free(to_mdev(xrcd->device)->dev, to_mxrcd(xrcd)->xrcdn);
+ kfree(xrcd);
+
+ return 0;
+}
+
+
+static int init_node_data(struct mlx4_ib_dev *dev)
+{
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ int err = -ENOMEM;
+
+ in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_NODE_DESC;
+
+ err = mlx4_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad);
+ if (err)
+ goto out;
+
+ memcpy(dev->ib_dev.node_desc, out_mad->data, 64);
+
+ in_mad->attr_id = IB_SMP_ATTR_NODE_INFO;
+
+ err = mlx4_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad);
+ if (err)
+ goto out;
+
+ dev->dev->rev_id = be32_to_cpup((__be32 *) (out_mad->data + 32));
+ memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8);
+
+out:
+ kfree(in_mad);
+ kfree(out_mad);
+ return err;
+}
+
+static ssize_t show_hca(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct mlx4_ib_dev *dev =
+ container_of(device, struct mlx4_ib_dev, ib_dev.dev);
+ return sprintf(buf, "MT%d\n", dev->dev->pdev->device);
+}
+
+static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct mlx4_ib_dev *dev =
+ container_of(device, struct mlx4_ib_dev, ib_dev.dev);
+ return sprintf(buf, "%d.%d.%d\n", (int) (dev->dev->caps.fw_ver >> 32),
+ (int) (dev->dev->caps.fw_ver >> 16) & 0xffff,
+ (int) dev->dev->caps.fw_ver & 0xffff);
+}
+
+static ssize_t show_rev(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct mlx4_ib_dev *dev =
+ container_of(device, struct mlx4_ib_dev, ib_dev.dev);
+ return sprintf(buf, "%x\n", dev->dev->rev_id);
+}
+
+static ssize_t show_board(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct mlx4_ib_dev *dev =
+ container_of(device, struct mlx4_ib_dev, ib_dev.dev);
+ return sprintf(buf, "%.*s\n", MLX4_BOARD_ID_LEN,
+ dev->dev->board_id);
+}
+
+static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
+static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
+static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
+static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
+
+static struct device_attribute *mlx4_class_attributes[] = {
+ &dev_attr_hw_rev,
+ &dev_attr_fw_ver,
+ &dev_attr_hca_type,
+ &dev_attr_board_id
+};
+
+/*
+ * create show function and a device_attribute struct pointing to
+ * the function for _name
+ */
+#define DEVICE_DIAG_RPRT_ATTR(_name, _offset, _op_mod) \
+static ssize_t show_rprt_##_name(struct device *dev, \
+ struct device_attribute *attr, \
+ char *buf){ \
+ return show_diag_rprt(dev, buf, _offset, _op_mod); \
+} \
+static DEVICE_ATTR(_name, S_IRUGO, show_rprt_##_name, NULL);
+
+#define MLX4_DIAG_RPRT_CLEAR_DIAGS 3
+
+static size_t show_diag_rprt(struct device *device, char *buf,
+ u32 offset, u8 op_modifier)
+{
+ size_t ret;
+ u32 counter_offset = offset;
+ u32 diag_counter = 0;
+ struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev,
+ ib_dev.dev);
+
+ ret = mlx4_query_diag_counters(dev->dev, 1, op_modifier,
+ &counter_offset, &diag_counter);
+ if (ret)
+ return ret;
+
+ return sprintf(buf,"%d\n", diag_counter);
+}
+
+static ssize_t clear_diag_counters(struct device *device,
+ struct device_attribute *attr,
+ const char *buf, size_t length)
+{
+ size_t ret;
+ struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev,
+ ib_dev.dev);
+
+ ret = mlx4_query_diag_counters(dev->dev, 0, MLX4_DIAG_RPRT_CLEAR_DIAGS,
+ NULL, NULL);
+ if (ret)
+ return ret;
+
+ return length;
+}
+
+DEVICE_DIAG_RPRT_ATTR(rq_num_lle , 0x00, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_lle , 0x04, 2);
+DEVICE_DIAG_RPRT_ATTR(rq_num_lqpoe , 0x08, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_lqpoe , 0x0C, 2);
+DEVICE_DIAG_RPRT_ATTR(rq_num_leeoe , 0x10, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_leeoe , 0x14, 2);
+DEVICE_DIAG_RPRT_ATTR(rq_num_lpe , 0x18, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_lpe , 0x1C, 2);
+DEVICE_DIAG_RPRT_ATTR(rq_num_wrfe , 0x20, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_wrfe , 0x24, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_mwbe , 0x2C, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_bre , 0x34, 2);
+DEVICE_DIAG_RPRT_ATTR(rq_num_lae , 0x38, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_rire , 0x44, 2);
+DEVICE_DIAG_RPRT_ATTR(rq_num_rire , 0x48, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_rae , 0x4C, 2);
+DEVICE_DIAG_RPRT_ATTR(rq_num_rae , 0x50, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_roe , 0x54, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_tree , 0x5C, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_rree , 0x64, 2);
+DEVICE_DIAG_RPRT_ATTR(rq_num_rnr , 0x68, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_rnr , 0x6C, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_rabrte , 0x7C, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_ieecne , 0x84, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_ieecse , 0x8C, 2);
+DEVICE_DIAG_RPRT_ATTR(rq_num_oos , 0x100, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_oos , 0x104, 2);
+DEVICE_DIAG_RPRT_ATTR(rq_num_mce , 0x108, 2);
+DEVICE_DIAG_RPRT_ATTR(rq_num_rsync , 0x110, 2);
+DEVICE_DIAG_RPRT_ATTR(sq_num_rsync , 0x114, 2);
+DEVICE_DIAG_RPRT_ATTR(rq_num_udsdprd , 0x118, 2);
+DEVICE_DIAG_RPRT_ATTR(rq_num_ucsdprd , 0x120, 2);
+DEVICE_DIAG_RPRT_ATTR(num_cqovf , 0x1A0, 2);
+DEVICE_DIAG_RPRT_ATTR(num_eqovf , 0x1A4, 2);
+DEVICE_DIAG_RPRT_ATTR(num_baddb , 0x1A8, 2);
+
+static DEVICE_ATTR(clear_diag, S_IWUGO, NULL, clear_diag_counters);
+
+static struct attribute *diag_rprt_attrs[] = {
+ &dev_attr_rq_num_lle.attr,
+ &dev_attr_sq_num_lle.attr,
+ &dev_attr_rq_num_lqpoe.attr,
+ &dev_attr_sq_num_lqpoe.attr,
+ &dev_attr_rq_num_leeoe.attr,
+ &dev_attr_sq_num_leeoe.attr,
+ &dev_attr_rq_num_lpe.attr,
+ &dev_attr_sq_num_lpe.attr,
+ &dev_attr_rq_num_wrfe.attr,
+ &dev_attr_sq_num_wrfe.attr,
+ &dev_attr_sq_num_mwbe.attr,
+ &dev_attr_sq_num_bre.attr,
+ &dev_attr_rq_num_lae.attr,
+ &dev_attr_sq_num_rire.attr,
+ &dev_attr_rq_num_rire.attr,
+ &dev_attr_sq_num_rae.attr,
+ &dev_attr_rq_num_rae.attr,
+ &dev_attr_sq_num_roe.attr,
+ &dev_attr_sq_num_tree.attr,
+ &dev_attr_sq_num_rree.attr,
+ &dev_attr_rq_num_rnr.attr,
+ &dev_attr_sq_num_rnr.attr,
+ &dev_attr_sq_num_rabrte.attr,
+ &dev_attr_sq_num_ieecne.attr,
+ &dev_attr_sq_num_ieecse.attr,
+ &dev_attr_rq_num_oos.attr,
+ &dev_attr_sq_num_oos.attr,
+ &dev_attr_rq_num_mce.attr,
+ &dev_attr_rq_num_rsync.attr,
+ &dev_attr_sq_num_rsync.attr,
+ &dev_attr_rq_num_udsdprd.attr,
+ &dev_attr_rq_num_ucsdprd.attr,
+ &dev_attr_num_cqovf.attr,
+ &dev_attr_num_eqovf.attr,
+ &dev_attr_num_baddb.attr,
+ &dev_attr_clear_diag.attr,
+ NULL
+};
+
+struct attribute_group diag_counters_group = {
+ .name = "diag_counters",
+ .attrs = diag_rprt_attrs
+};
+
+static void mlx4_addrconf_ifid_eui48(u8 *eui, u16 vlan_id, struct net_device *dev)
+{
+#ifdef __linux__
+ memcpy(eui, dev->dev_addr, 3);
+ memcpy(eui + 5, dev->dev_addr + 3, 3);
+#else
+ memcpy(eui, IF_LLADDR(dev), 3);
+ memcpy(eui + 5, IF_LLADDR(dev) + 3, 3);
+#endif
+ if (vlan_id < 0x1000) {
+ eui[3] = vlan_id >> 8;
+ eui[4] = vlan_id & 0xff;
+ } else {
+ eui[3] = 0xff;
+ eui[4] = 0xfe;
+ }
+ eui[0] ^= 2;
+}
+
+static void update_gids_task(struct work_struct *work)
+{
+ struct update_gid_work *gw = container_of(work, struct update_gid_work, work);
+ struct mlx4_cmd_mailbox *mailbox;
+ union ib_gid *gids;
+ int err;
+ struct mlx4_dev *dev = gw->dev->dev;
+ struct ib_event event;
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev);
+ if (IS_ERR(mailbox)) {
+ printk(KERN_WARNING "update gid table failed %ld\n", PTR_ERR(mailbox));
+ return;
+ }
+
+ gids = mailbox->buf;
+ memcpy(gids, gw->gids, sizeof gw->gids);
+
+ err = mlx4_cmd(dev, mailbox->dma, MLX4_SET_PORT_GID_TABLE << 8 | gw->port,
+ 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B);
+ if (err)
+ printk(KERN_WARNING "set port command failed\n");
+ else {
+ memcpy(gw->dev->iboe.gid_table[gw->port - 1], gw->gids, sizeof gw->gids);
+ event.device = &gw->dev->ib_dev;
+ event.element.port_num = gw->port;
+ event.event = IB_EVENT_GID_CHANGE;
+ ib_dispatch_event(&event);
+ }
+
+ mlx4_free_cmd_mailbox(dev, mailbox);
+ kfree(gw);
+}
+
+enum {
+ MLX4_MAX_EFF_VLANS = 128 - MLX4_VLAN_REGULAR,
+};
+
+static int update_ipv6_gids(struct mlx4_ib_dev *dev, int port, int clear)
+{
+ struct net_device *ndev = dev->iboe.netdevs[port - 1];
+ struct update_gid_work *work;
+ struct net_device *tmp;
+ int i;
+ u8 *hits;
+ int ret;
+ union ib_gid gid;
+ int tofree;
+ int found;
+ int need_update = 0;
+ u16 vid;
+
+ work = kzalloc(sizeof *work, GFP_ATOMIC);
+ if (!work)
+ return -ENOMEM;
+
+ hits = kzalloc(MLX4_MAX_EFF_VLANS + 1, GFP_ATOMIC);
+ if (!hits) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+#ifdef __linux__
+ read_lock(&dev_base_lock);
+ for_each_netdev(&init_net, tmp) {
+#else
+ IFNET_RLOCK();
+ TAILQ_FOREACH(tmp, &V_ifnet, if_link) {
+#endif
+ if (ndev && (tmp == ndev || rdma_vlan_dev_real_dev(tmp) == ndev)) {
+ gid.global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
+ vid = rdma_vlan_dev_vlan_id(tmp);
+ mlx4_addrconf_ifid_eui48(&gid.raw[8], vid, ndev);
+ found = 0;
+ tofree = -1;
+ for (i = 0; i < MLX4_MAX_EFF_VLANS + 1; ++i) {
+ if (tofree < 0 &&
+ !memcmp(&dev->iboe.gid_table[port - 1][i], &zgid, sizeof zgid))
+ tofree = i;
+ if (!memcmp(&dev->iboe.gid_table[port - 1][i], &gid, sizeof gid)) {
+ hits[i] = 1;
+ found = 1;
+ break;
+ }
+ }
+
+ if (!found) {
+ if (tmp == ndev && (memcmp(&dev->iboe.gid_table[port - 1][0], &gid, sizeof gid) || !memcmp(&dev->iboe.gid_table[port - 1][0], &zgid, sizeof gid))) {
+ dev->iboe.gid_table[port - 1][0] = gid;
+ ++need_update;
+ hits[0] = 1;
+ } else if (tofree >= 0) {
+ dev->iboe.gid_table[port - 1][tofree] = gid;
+ hits[tofree] = 1;
+ ++need_update;
+ }
+ }
+ }
+#ifdef __linux__
+ }
+ read_unlock(&dev_base_lock);
+#else
+ }
+ IFNET_RUNLOCK();
+#endif
+
+ for (i = 0; i < MLX4_MAX_EFF_VLANS + 1; ++i)
+ if (!hits[i]) {
+ if (memcmp(&dev->iboe.gid_table[port - 1][i], &zgid, sizeof zgid))
+ ++need_update;
+ dev->iboe.gid_table[port - 1][i] = zgid;
+ }
+
+
+ if (need_update) {
+ memcpy(work->gids, dev->iboe.gid_table[port - 1], sizeof work->gids);
+ INIT_WORK(&work->work, update_gids_task);
+ work->port = port;
+ work->dev = dev;
+ queue_work(wq, &work->work);
+ } else
+ kfree(work);
+
+ kfree(hits);
+ return 0;
+
+out:
+ kfree(work);
+ return ret;
+}
+
+static void handle_en_event(struct mlx4_ib_dev *dev, int port, unsigned long event)
+{
+ switch (event) {
+ case NETDEV_UP:
+#ifdef __linux__
+ case NETDEV_CHANGEADDR:
+#endif
+ update_ipv6_gids(dev, port, 0);
+ break;
+
+ case NETDEV_DOWN:
+ update_ipv6_gids(dev, port, 1);
+ dev->iboe.netdevs[port - 1] = NULL;
+ }
+}
+
+static void netdev_added(struct mlx4_ib_dev *dev, int port)
+{
+ update_ipv6_gids(dev, port, 0);
+}
+
+static void netdev_removed(struct mlx4_ib_dev *dev, int port)
+{
+ update_ipv6_gids(dev, port, 1);
+}
+
+static int mlx4_ib_netdev_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = ptr;
+ struct mlx4_ib_dev *ibdev;
+ struct net_device *oldnd;
+ struct mlx4_ib_iboe *iboe;
+ int port;
+
+#ifdef __linux__
+ if (!net_eq(dev_net(dev), &init_net))
+ return NOTIFY_DONE;
+#endif
+
+ ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb);
+ iboe = &ibdev->iboe;
+
+ spin_lock(&iboe->lock);
+ mlx4_foreach_ib_transport_port(port, ibdev->dev) {
+ oldnd = iboe->netdevs[port - 1];
+ iboe->netdevs[port - 1] = mlx4_get_prot_dev(ibdev->dev, MLX4_PROT_EN, port);
+ if (oldnd != iboe->netdevs[port - 1]) {
+ if (iboe->netdevs[port - 1])
+ netdev_added(ibdev, port);
+ else
+ netdev_removed(ibdev, port);
+ }
+ }
+
+ if (dev == iboe->netdevs[0] ||
+ (iboe->netdevs[0] && rdma_vlan_dev_real_dev(dev) == iboe->netdevs[0]))
+ handle_en_event(ibdev, 1, event);
+ else if (dev == iboe->netdevs[1]
+ || (iboe->netdevs[1] && rdma_vlan_dev_real_dev(dev) == iboe->netdevs[1]))
+ handle_en_event(ibdev, 2, event);
+
+ spin_unlock(&iboe->lock);
+
+ return NOTIFY_DONE;
+}
+
+static void *mlx4_ib_add(struct mlx4_dev *dev)
+{
+ static int mlx4_ib_version_printed;
+ struct mlx4_ib_dev *ibdev;
+ int num_ports = 0;
+ int i;
+ int err;
+ struct mlx4_ib_iboe *iboe;
+ int k;
+
+ if (!mlx4_ib_version_printed) {
+ printk(KERN_INFO "%s", mlx4_ib_version);
+ ++mlx4_ib_version_printed;
+ }
+
+ mlx4_foreach_ib_transport_port(i, dev)
+ num_ports++;
+
+ /* No point in registering a device with no ports... */
+ if (num_ports == 0)
+ return NULL;
+
+ ibdev = (struct mlx4_ib_dev *) ib_alloc_device(sizeof *ibdev);
+ if (!ibdev) {
+ dev_err(&dev->pdev->dev, "Device struct alloc failed\n");
+ return NULL;
+ }
+
+ iboe = &ibdev->iboe;
+
+ if (mlx4_pd_alloc(dev, &ibdev->priv_pdn))
+ goto err_dealloc;
+
+ if (mlx4_uar_alloc(dev, &ibdev->priv_uar))
+ goto err_pd;
+
+ ibdev->priv_uar.map = ioremap(ibdev->priv_uar.pfn << PAGE_SHIFT, PAGE_SIZE);
+ if (!ibdev->priv_uar.map)
+ goto err_uar;
+ MLX4_INIT_DOORBELL_LOCK(&ibdev->uar_lock);
+
+ ibdev->dev = dev;
+
+ strlcpy(ibdev->ib_dev.name, "mlx4_%d", IB_DEVICE_NAME_MAX);
+ ibdev->ib_dev.owner = THIS_MODULE;
+ ibdev->ib_dev.node_type = RDMA_NODE_IB_CA;
+ ibdev->ib_dev.local_dma_lkey = dev->caps.reserved_lkey;
+ ibdev->num_ports = num_ports;
+ ibdev->ib_dev.phys_port_cnt = ibdev->num_ports;
+ ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors;
+ ibdev->ib_dev.dma_device = &dev->pdev->dev;
+
+ ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION;
+ ibdev->ib_dev.uverbs_cmd_mask =
+ (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
+ (1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_REG_MR) |
+ (1ull << IB_USER_VERBS_CMD_DEREG_MR) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
+ (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_QP) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_QP) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
+ (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) |
+ (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ);
+
+ ibdev->ib_dev.query_device = mlx4_ib_query_device;
+ ibdev->ib_dev.query_port = mlx4_ib_query_port;
+ ibdev->ib_dev.get_link_layer = mlx4_ib_port_link_layer;
+ ibdev->ib_dev.query_gid = mlx4_ib_query_gid;
+ ibdev->ib_dev.query_pkey = mlx4_ib_query_pkey;
+ ibdev->ib_dev.modify_device = mlx4_ib_modify_device;
+ ibdev->ib_dev.modify_port = mlx4_ib_modify_port;
+ ibdev->ib_dev.alloc_ucontext = mlx4_ib_alloc_ucontext;
+ ibdev->ib_dev.dealloc_ucontext = mlx4_ib_dealloc_ucontext;
+ ibdev->ib_dev.mmap = mlx4_ib_mmap;
+ ibdev->ib_dev.alloc_pd = mlx4_ib_alloc_pd;
+ ibdev->ib_dev.dealloc_pd = mlx4_ib_dealloc_pd;
+ ibdev->ib_dev.create_ah = mlx4_ib_create_ah;
+ ibdev->ib_dev.query_ah = mlx4_ib_query_ah;
+ ibdev->ib_dev.destroy_ah = mlx4_ib_destroy_ah;
+ ibdev->ib_dev.create_srq = mlx4_ib_create_srq;
+ ibdev->ib_dev.modify_srq = mlx4_ib_modify_srq;
+ ibdev->ib_dev.query_srq = mlx4_ib_query_srq;
+ ibdev->ib_dev.destroy_srq = mlx4_ib_destroy_srq;
+ ibdev->ib_dev.post_srq_recv = mlx4_ib_post_srq_recv;
+ ibdev->ib_dev.create_qp = mlx4_ib_create_qp;
+ ibdev->ib_dev.modify_qp = mlx4_ib_modify_qp;
+ ibdev->ib_dev.query_qp = mlx4_ib_query_qp;
+ ibdev->ib_dev.destroy_qp = mlx4_ib_destroy_qp;
+ ibdev->ib_dev.post_send = mlx4_ib_post_send;
+ ibdev->ib_dev.post_recv = mlx4_ib_post_recv;
+ ibdev->ib_dev.create_cq = mlx4_ib_create_cq;
+ ibdev->ib_dev.modify_cq = mlx4_ib_modify_cq;
+ ibdev->ib_dev.resize_cq = mlx4_ib_resize_cq;
+ ibdev->ib_dev.destroy_cq = mlx4_ib_destroy_cq;
+ ibdev->ib_dev.poll_cq = mlx4_ib_poll_cq;
+ ibdev->ib_dev.req_notify_cq = mlx4_ib_arm_cq;
+ ibdev->ib_dev.get_dma_mr = mlx4_ib_get_dma_mr;
+ ibdev->ib_dev.reg_user_mr = mlx4_ib_reg_user_mr;
+ ibdev->ib_dev.dereg_mr = mlx4_ib_dereg_mr;
+ ibdev->ib_dev.alloc_fast_reg_mr = mlx4_ib_alloc_fast_reg_mr;
+ ibdev->ib_dev.alloc_fast_reg_page_list = mlx4_ib_alloc_fast_reg_page_list;
+ ibdev->ib_dev.free_fast_reg_page_list = mlx4_ib_free_fast_reg_page_list;
+ ibdev->ib_dev.attach_mcast = mlx4_ib_mcg_attach;
+ ibdev->ib_dev.detach_mcast = mlx4_ib_mcg_detach;
+ ibdev->ib_dev.process_mad = mlx4_ib_process_mad;
+
+ ibdev->ib_dev.alloc_fmr = mlx4_ib_fmr_alloc;
+ ibdev->ib_dev.map_phys_fmr = mlx4_ib_map_phys_fmr;
+ ibdev->ib_dev.unmap_fmr = mlx4_ib_unmap_fmr;
+ ibdev->ib_dev.dealloc_fmr = mlx4_ib_fmr_dealloc;
+ if (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) {
+ ibdev->ib_dev.create_xrc_srq = mlx4_ib_create_xrc_srq;
+ ibdev->ib_dev.alloc_xrcd = mlx4_ib_alloc_xrcd;
+ ibdev->ib_dev.dealloc_xrcd = mlx4_ib_dealloc_xrcd;
+ ibdev->ib_dev.create_xrc_rcv_qp = mlx4_ib_create_xrc_rcv_qp;
+ ibdev->ib_dev.modify_xrc_rcv_qp = mlx4_ib_modify_xrc_rcv_qp;
+ ibdev->ib_dev.query_xrc_rcv_qp = mlx4_ib_query_xrc_rcv_qp;
+ ibdev->ib_dev.reg_xrc_rcv_qp = mlx4_ib_reg_xrc_rcv_qp;
+ ibdev->ib_dev.unreg_xrc_rcv_qp = mlx4_ib_unreg_xrc_rcv_qp;
+ ibdev->ib_dev.uverbs_cmd_mask |=
+ (1ull << IB_USER_VERBS_CMD_CREATE_XRC_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_OPEN_XRC_DOMAIN) |
+ (1ull << IB_USER_VERBS_CMD_CLOSE_XRC_DOMAIN) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_XRC_RCV_QP) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_XRC_RCV_QP) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_XRC_RCV_QP) |
+ (1ull << IB_USER_VERBS_CMD_REG_XRC_RCV_QP) |
+ (1ull << IB_USER_VERBS_CMD_UNREG_XRC_RCV_QP);
+ }
+
+
+ spin_lock_init(&iboe->lock);
+ if (init_node_data(ibdev))
+ goto err_map;
+
+ for (k = 0; k < ibdev->num_ports; ++k) {
+ err = mlx4_counter_alloc(ibdev->dev, &ibdev->counters[k]);
+ if (err)
+ ibdev->counters[k] = -1;
+ else
+ mlx4_set_iboe_counter(dev, ibdev->counters[k], k + 1);
+ }
+
+ spin_lock_init(&ibdev->sm_lock);
+ mutex_init(&ibdev->cap_mask_mutex);
+ mutex_init(&ibdev->xrc_reg_mutex);
+
+ if (ib_register_device(&ibdev->ib_dev))
+ goto err_counter;
+
+ if (mlx4_ib_mad_init(ibdev))
+ goto err_reg;
+ if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE && !iboe->nb.notifier_call) {
+ iboe->nb.notifier_call = mlx4_ib_netdev_event;
+ err = register_netdevice_notifier(&iboe->nb);
+ if (err)
+ goto err_reg;
+ }
+ for (i = 0; i < ARRAY_SIZE(mlx4_class_attributes); ++i) {
+ if (device_create_file(&ibdev->ib_dev.dev,
+ mlx4_class_attributes[i]))
+ goto err_notif;
+ }
+
+ if(sysfs_create_group(&ibdev->ib_dev.dev.kobj, &diag_counters_group))
+ goto err_notif;
+
+ ibdev->ib_active = 1;
+
+ return ibdev;
+
+err_notif:
+ if (unregister_netdevice_notifier(&ibdev->iboe.nb))
+ printk(KERN_WARNING "failure unregistering notifier\n");
+ flush_workqueue(wq);
+
+err_reg:
+ ib_unregister_device(&ibdev->ib_dev);
+
+err_counter:
+ for (; k; --k)
+ mlx4_counter_free(ibdev->dev, ibdev->counters[k - 1]);
+
+err_map:
+ iounmap(ibdev->priv_uar.map);
+
+err_uar:
+ mlx4_uar_free(dev, &ibdev->priv_uar);
+
+err_pd:
+ mlx4_pd_free(dev, ibdev->priv_pdn);
+
+err_dealloc:
+ ib_dealloc_device(&ibdev->ib_dev);
+
+ return NULL;
+}
+
+static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr)
+{
+ struct mlx4_ib_dev *ibdev = ibdev_ptr;
+ int p;
+ int k;
+
+ sysfs_remove_group(&ibdev->ib_dev.dev.kobj, &diag_counters_group);
+
+ mlx4_ib_mad_cleanup(ibdev);
+ ib_unregister_device(&ibdev->ib_dev);
+ for (k = 0; k < ibdev->num_ports; ++k)
+ mlx4_counter_free(ibdev->dev, ibdev->counters[k]);
+
+ if (ibdev->iboe.nb.notifier_call) {
+ unregister_netdevice_notifier(&ibdev->iboe.nb);
+ flush_workqueue(wq);
+ ibdev->iboe.nb.notifier_call = NULL;
+ }
+ iounmap(ibdev->priv_uar.map);
+
+ mlx4_foreach_port(p, dev, MLX4_PORT_TYPE_IB)
+ mlx4_CLOSE_PORT(dev, p);
+
+ mlx4_uar_free(dev, &ibdev->priv_uar);
+ mlx4_pd_free(dev, ibdev->priv_pdn);
+ ib_dealloc_device(&ibdev->ib_dev);
+}
+
+static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr,
+ enum mlx4_dev_event event, int port)
+{
+ struct ib_event ibev;
+ struct mlx4_ib_dev *ibdev = to_mdev((struct ib_device *) ibdev_ptr);
+
+ if (port > ibdev->num_ports)
+ return;
+
+ switch (event) {
+ case MLX4_DEV_EVENT_PORT_UP:
+ ibev.event = IB_EVENT_PORT_ACTIVE;
+ break;
+
+ case MLX4_DEV_EVENT_PORT_DOWN:
+ ibev.event = IB_EVENT_PORT_ERR;
+ break;
+
+ case MLX4_DEV_EVENT_CATASTROPHIC_ERROR:
+ ibdev->ib_active = 0;
+ ibev.event = IB_EVENT_DEVICE_FATAL;
+ break;
+
+ default:
+ return;
+ }
+
+ ibev.device = ibdev_ptr;
+ ibev.element.port_num = port;
+
+ ib_dispatch_event(&ibev);
+}
+
+static struct mlx4_interface mlx4_ib_interface = {
+ .add = mlx4_ib_add,
+ .remove = mlx4_ib_remove,
+ .event = mlx4_ib_event,
+ .get_prot_dev = get_ibdev,
+ .protocol = MLX4_PROT_IB,
+};
+
+static int __init mlx4_ib_init(void)
+{
+ int err;
+
+ wq = create_singlethread_workqueue("mlx4_ib");
+ if (!wq)
+ return -ENOMEM;
+
+ err = mlx4_register_interface(&mlx4_ib_interface);
+ if (err) {
+ destroy_workqueue(wq);
+ return err;
+ }
+
+ return 0;
+}
+
+static void __exit mlx4_ib_cleanup(void)
+{
+ mlx4_unregister_interface(&mlx4_ib_interface);
+ destroy_workqueue(wq);
+}
+
+module_init_order(mlx4_ib_init, SI_ORDER_MIDDLE);
+module_exit(mlx4_ib_cleanup);
+
+#undef MODULE_VERSION
+#include <sys/module.h>
+static int
+mlx4ib_evhand(module_t mod, int event, void *arg)
+{
+ return (0);
+}
+static moduledata_t mlx4ib_mod = {
+ .name = "mlx4ib",
+ .evhand = mlx4ib_evhand,
+};
+DECLARE_MODULE(mlx4ib, mlx4ib_mod, SI_SUB_SMP, SI_ORDER_ANY);
+MODULE_DEPEND(mlx4ib, mlx4, 1, 1, 1);
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/mlx4_ib.h b/sys/ofed/drivers/infiniband/hw/mlx4/mlx4_ib.h
new file mode 100644
index 0000000..b8f6996
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -0,0 +1,409 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_IB_H
+#define MLX4_IB_H
+
+#include <linux/compiler.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
+
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/doorbell.h>
+
+
+#define MLX4_IB_DRV_NAME "mlx4_ib"
+
+#ifdef CONFIG_MLX4_DEBUG
+extern int mlx4_ib_debug_level;
+
+#define mlx4_ib_dbg(format, arg...) \
+ do { \
+ if (mlx4_ib_debug_level) \
+ printk(KERN_DEBUG "<" MLX4_IB_DRV_NAME "> %s: " format "\n",\
+ __func__, ## arg); \
+ } while (0)
+
+#else /* CONFIG_MLX4_DEBUG */
+
+#define mlx4_ib_dbg(format, arg...) do {} while (0)
+
+#endif /* CONFIG_MLX4_DEBUG */
+
+enum {
+ MLX4_IB_SQ_MIN_WQE_SHIFT = 6
+};
+
+#define MLX4_IB_SQ_HEADROOM(shift) ((2048 >> (shift)) + 1)
+#define MLX4_IB_SQ_MAX_SPARE (MLX4_IB_SQ_HEADROOM(MLX4_IB_SQ_MIN_WQE_SHIFT))
+
+struct mlx4_ib_ucontext {
+ struct ib_ucontext ibucontext;
+ struct mlx4_uar uar;
+ struct list_head db_page_list;
+ struct mutex db_page_mutex;
+};
+
+struct mlx4_ib_pd {
+ struct ib_pd ibpd;
+ u32 pdn;
+};
+
+struct mlx4_ib_xrcd {
+ struct ib_xrcd ibxrcd;
+ u32 xrcdn;
+ struct ib_pd *pd;
+ struct ib_cq *cq;
+};
+
+struct mlx4_ib_cq_buf {
+ struct mlx4_buf buf;
+ struct mlx4_mtt mtt;
+};
+
+struct mlx4_ib_cq_resize {
+ struct mlx4_ib_cq_buf buf;
+ int cqe;
+};
+
+struct mlx4_ib_cq {
+ struct ib_cq ibcq;
+ struct mlx4_cq mcq;
+ struct mlx4_ib_cq_buf buf;
+ struct mlx4_ib_cq_resize *resize_buf;
+ struct mlx4_db db;
+ spinlock_t lock;
+ struct mutex resize_mutex;
+ struct ib_umem *umem;
+ struct ib_umem *resize_umem;
+};
+
+struct mlx4_ib_mr {
+ struct ib_mr ibmr;
+ struct mlx4_mr mmr;
+ struct ib_umem *umem;
+};
+
+struct mlx4_ib_fast_reg_page_list {
+ struct ib_fast_reg_page_list ibfrpl;
+ __be64 *mapped_page_list;
+ dma_addr_t map;
+};
+
+struct mlx4_ib_fmr {
+ struct ib_fmr ibfmr;
+ struct mlx4_fmr mfmr;
+};
+
+struct mlx4_ib_wq {
+ u64 *wrid;
+ spinlock_t lock;
+ int wqe_cnt;
+ int max_post;
+ int max_gs;
+ int offset;
+ int wqe_shift;
+ unsigned head;
+ unsigned tail;
+};
+
+enum mlx4_ib_qp_flags {
+ MLX4_IB_QP_LSO = 1 << 0,
+ MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK = 1 << 1,
+ MLX4_IB_XRC_RCV = 1 << 2,
+};
+
+struct gid_entry {
+ struct list_head list;
+ union ib_gid gid;
+ int added;
+ u8 port;
+};
+
+struct mlx4_ib_qp {
+ struct ib_qp ibqp;
+ struct mlx4_qp mqp;
+ struct mlx4_buf buf;
+
+ struct mlx4_db db;
+ struct mlx4_ib_wq rq;
+
+ u32 doorbell_qpn;
+ __be32 sq_signal_bits;
+ unsigned sq_next_wqe;
+ int sq_max_wqes_per_wr;
+ int sq_spare_wqes;
+ struct mlx4_ib_wq sq;
+
+ struct ib_umem *umem;
+ struct mlx4_mtt mtt;
+ int buf_size;
+ struct mutex mutex;
+ u32 flags;
+ struct list_head xrc_reg_list;
+ spinlock_t xrc_reg_list_lock;
+ u16 xrcdn;
+ u8 port;
+ u8 alt_port;
+ u8 atomic_rd_en;
+ u8 resp_depth;
+ u8 sq_no_prefetch;
+ u8 state;
+ int mlx_type;
+ struct list_head gid_list;
+ int max_inline_data;
+ struct mlx4_bf bf;
+};
+
+struct mlx4_ib_srq {
+ struct ib_srq ibsrq;
+ struct mlx4_srq msrq;
+ struct mlx4_buf buf;
+ struct mlx4_db db;
+ u64 *wrid;
+ spinlock_t lock;
+ int head;
+ int tail;
+ u16 wqe_ctr;
+ struct ib_umem *umem;
+ struct mlx4_mtt mtt;
+ struct mutex mutex;
+};
+
+struct mlx4_ib_ah {
+ struct ib_ah ibah;
+ union mlx4_ext_av av;
+};
+
+struct mlx4_ib_iboe {
+ spinlock_t lock;
+ struct net_device *netdevs[MLX4_MAX_PORTS];
+ struct notifier_block nb;
+ union ib_gid gid_table[MLX4_MAX_PORTS][128];
+};
+
+struct mlx4_ib_dev {
+ struct ib_device ib_dev;
+ struct mlx4_dev *dev;
+ int num_ports;
+ struct mlx4_uar priv_uar;
+ u32 priv_pdn;
+ MLX4_DECLARE_DOORBELL_LOCK(uar_lock);
+
+ struct ib_mad_agent *send_agent[MLX4_MAX_PORTS][2];
+ struct ib_ah *sm_ah[MLX4_MAX_PORTS];
+ spinlock_t sm_lock;
+
+ struct mutex cap_mask_mutex;
+ struct mutex xrc_reg_mutex;
+ int ib_active;
+ struct mlx4_ib_iboe iboe;
+ int counters[MLX4_MAX_PORTS];
+};
+
+static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev)
+{
+ return container_of(ibdev, struct mlx4_ib_dev, ib_dev);
+}
+
+static inline struct mlx4_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext)
+{
+ return container_of(ibucontext, struct mlx4_ib_ucontext, ibucontext);
+}
+
+static inline struct mlx4_ib_pd *to_mpd(struct ib_pd *ibpd)
+{
+ return container_of(ibpd, struct mlx4_ib_pd, ibpd);
+}
+
+static inline struct mlx4_ib_xrcd *to_mxrcd(struct ib_xrcd *ibxrcd)
+{
+ return container_of(ibxrcd, struct mlx4_ib_xrcd, ibxrcd);
+}
+
+static inline struct mlx4_ib_cq *to_mcq(struct ib_cq *ibcq)
+{
+ return container_of(ibcq, struct mlx4_ib_cq, ibcq);
+}
+
+static inline struct mlx4_ib_cq *to_mibcq(struct mlx4_cq *mcq)
+{
+ return container_of(mcq, struct mlx4_ib_cq, mcq);
+}
+
+static inline struct mlx4_ib_mr *to_mmr(struct ib_mr *ibmr)
+{
+ return container_of(ibmr, struct mlx4_ib_mr, ibmr);
+}
+
+static inline struct mlx4_ib_fast_reg_page_list *to_mfrpl(struct ib_fast_reg_page_list *ibfrpl)
+{
+ return container_of(ibfrpl, struct mlx4_ib_fast_reg_page_list, ibfrpl);
+}
+
+static inline struct mlx4_ib_fmr *to_mfmr(struct ib_fmr *ibfmr)
+{
+ return container_of(ibfmr, struct mlx4_ib_fmr, ibfmr);
+}
+static inline struct mlx4_ib_qp *to_mqp(struct ib_qp *ibqp)
+{
+ return container_of(ibqp, struct mlx4_ib_qp, ibqp);
+}
+
+static inline struct mlx4_ib_qp *to_mibqp(struct mlx4_qp *mqp)
+{
+ return container_of(mqp, struct mlx4_ib_qp, mqp);
+}
+
+static inline struct mlx4_ib_srq *to_msrq(struct ib_srq *ibsrq)
+{
+ return container_of(ibsrq, struct mlx4_ib_srq, ibsrq);
+}
+
+static inline struct mlx4_ib_srq *to_mibsrq(struct mlx4_srq *msrq)
+{
+ return container_of(msrq, struct mlx4_ib_srq, msrq);
+}
+
+static inline struct mlx4_ib_ah *to_mah(struct ib_ah *ibah)
+{
+ return container_of(ibah, struct mlx4_ib_ah, ibah);
+}
+
+int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt,
+ struct mlx4_db *db);
+void mlx4_ib_db_unmap_user(struct mlx4_ib_ucontext *context, struct mlx4_db *db);
+
+struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc);
+int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt,
+ struct ib_umem *umem);
+struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt_addr, int access_flags,
+ struct ib_udata *udata);
+int mlx4_ib_dereg_mr(struct ib_mr *mr);
+struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd,
+ int max_page_list_len);
+struct ib_fast_reg_page_list *mlx4_ib_alloc_fast_reg_page_list(struct ib_device *ibdev,
+ int page_list_len);
+void mlx4_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list);
+
+int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period);
+int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata);
+struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector,
+ struct ib_ucontext *context,
+ struct ib_udata *udata);
+int mlx4_ib_destroy_cq(struct ib_cq *cq);
+int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
+int mlx4_ib_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
+void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq);
+void mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq);
+
+struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr);
+int mlx4_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr);
+int mlx4_ib_destroy_ah(struct ib_ah *ah);
+
+struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
+ struct ib_srq_init_attr *init_attr,
+ struct ib_udata *udata);
+struct ib_srq *mlx4_ib_create_xrc_srq(struct ib_pd *pd,
+ struct ib_cq *xrc_cq,
+ struct ib_xrcd *xrcd,
+ struct ib_srq_init_attr *init_attr,
+ struct ib_udata *udata);
+int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+ enum ib_srq_attr_mask attr_mask, struct ib_udata *udata);
+int mlx4_ib_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr);
+int mlx4_ib_destroy_srq(struct ib_srq *srq);
+void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index);
+int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr);
+
+struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata);
+int mlx4_ib_destroy_qp(struct ib_qp *qp);
+int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata);
+int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask,
+ struct ib_qp_init_attr *qp_init_attr);
+int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr);
+int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr);
+
+int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int ignore_mkey, int ignore_bkey,
+ int port, struct ib_wc *in_wc, struct ib_grh *in_grh,
+ void *in_mad, void *response_mad);
+int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+ struct ib_wc *in_wc, struct ib_grh *in_grh,
+ struct ib_mad *in_mad, struct ib_mad *out_mad);
+int mlx4_ib_mad_init(struct mlx4_ib_dev *dev);
+void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev);
+
+struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int mr_access_flags,
+ struct ib_fmr_attr *fmr_attr);
+int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, int npages,
+ u64 iova);
+int mlx4_ib_unmap_fmr(struct list_head *fmr_list);
+int mlx4_ib_fmr_dealloc(struct ib_fmr *fmr);
+int mlx4_ib_create_xrc_rcv_qp(struct ib_qp_init_attr *init_attr,
+ u32 *qp_num);
+int mlx4_ib_modify_xrc_rcv_qp(struct ib_xrcd *xrcd, u32 qp_num,
+ struct ib_qp_attr *attr, int attr_mask);
+int mlx4_ib_query_xrc_rcv_qp(struct ib_xrcd *xrcd, u32 qp_num,
+ struct ib_qp_attr *attr, int attr_mask,
+ struct ib_qp_init_attr *init_attr);
+int mlx4_ib_reg_xrc_rcv_qp(struct ib_xrcd *xrcd, void *context, u32 qp_num);
+int mlx4_ib_unreg_xrc_rcv_qp(struct ib_xrcd *xrcd, void *context, u32 qp_num);
+
+
+int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah_attr,
+ u8 *mac, int *is_mcast, u8 port);
+
+static inline int mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah)
+{
+ u8 port = be32_to_cpu(ah->av.ib.port_pd) >> 24 & 3;
+
+ if (rdma_port_get_link_layer(ah->ibah.device, port) == IB_LINK_LAYER_ETHERNET)
+ return 1;
+
+ return !!(ah->av.ib.g_slid & 0x80);
+}
+
+int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp,
+ union ib_gid *gid);
+
+#endif /* MLX4_IB_H */
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/mr.c b/sys/ofed/drivers/infiniband/hw/mlx4/mr.c
new file mode 100644
index 0000000..c49b460
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/mr.c
@@ -0,0 +1,424 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "mlx4_ib.h"
+
+static u32 convert_access(int acc)
+{
+ return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX4_PERM_ATOMIC : 0) |
+ (acc & IB_ACCESS_REMOTE_WRITE ? MLX4_PERM_REMOTE_WRITE : 0) |
+ (acc & IB_ACCESS_REMOTE_READ ? MLX4_PERM_REMOTE_READ : 0) |
+ (acc & IB_ACCESS_LOCAL_WRITE ? MLX4_PERM_LOCAL_WRITE : 0) |
+ MLX4_PERM_LOCAL_READ;
+}
+
+struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc)
+{
+ struct mlx4_ib_mr *mr;
+ int err;
+
+ mr = kmalloc(sizeof *mr, GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+ err = mlx4_mr_alloc(to_mdev(pd->device)->dev, to_mpd(pd)->pdn, 0,
+ ~0ull, convert_access(acc), 0, 0, &mr->mmr);
+ if (err)
+ goto err_free;
+
+ err = mlx4_mr_enable(to_mdev(pd->device)->dev, &mr->mmr);
+ if (err)
+ goto err_mr;
+
+ mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;
+ mr->umem = NULL;
+
+ return &mr->ibmr;
+
+err_mr:
+ mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr);
+
+err_free:
+ kfree(mr);
+
+ return ERR_PTR(err);
+}
+
+int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt,
+ struct ib_umem *umem)
+{
+ u64 *pages;
+ struct ib_umem_chunk *chunk;
+ int i, j, k;
+ int n;
+ int len;
+ int err = 0;
+
+ pages = (u64 *) __get_free_page(GFP_KERNEL);
+ if (!pages)
+ return -ENOMEM;
+
+ i = n = 0;
+
+ list_for_each_entry(chunk, &umem->chunk_list, list)
+ for (j = 0; j < chunk->nmap; ++j) {
+ len = sg_dma_len(&chunk->page_list[j]) >> mtt->page_shift;
+ for (k = 0; k < len; ++k) {
+ pages[i++] = sg_dma_address(&chunk->page_list[j]) +
+ umem->page_size * k;
+ /*
+ * Be friendly to mlx4_write_mtt() and
+ * pass it chunks of appropriate size.
+ */
+ if (i == PAGE_SIZE / sizeof (u64)) {
+ err = mlx4_write_mtt(dev->dev, mtt, n,
+ i, pages);
+ if (err)
+ goto out;
+ n += i;
+ i = 0;
+ }
+ }
+ }
+
+ if (i)
+ err = mlx4_write_mtt(dev->dev, mtt, n, i, pages);
+
+out:
+ free_page((unsigned long) pages);
+ return err;
+}
+
+static int handle_hugetlb_user_mr(struct ib_pd *pd, struct mlx4_ib_mr *mr,
+ u64 start, u64 virt_addr, int access_flags)
+{
+#if defined(CONFIG_HUGETLB_PAGE) && !defined(__powerpc__) && !defined(__ia64__)
+ struct mlx4_ib_dev *dev = to_mdev(pd->device);
+ struct ib_umem_chunk *chunk;
+ unsigned dsize;
+ dma_addr_t daddr;
+ unsigned cur_size = 0;
+ dma_addr_t uninitialized_var(cur_addr);
+ int n;
+ struct ib_umem *umem = mr->umem;
+ u64 *arr;
+ int err = 0;
+ int i;
+ int j = 0;
+ int off = start & (HPAGE_SIZE - 1);
+
+ n = DIV_ROUND_UP(off + umem->length, HPAGE_SIZE);
+ arr = kmalloc(n * sizeof *arr, GFP_KERNEL);
+ if (!arr)
+ return -ENOMEM;
+
+ list_for_each_entry(chunk, &umem->chunk_list, list)
+ for (i = 0; i < chunk->nmap; ++i) {
+ daddr = sg_dma_address(&chunk->page_list[i]);
+ dsize = sg_dma_len(&chunk->page_list[i]);
+ if (!cur_size) {
+ cur_addr = daddr;
+ cur_size = dsize;
+ } else if (cur_addr + cur_size != daddr) {
+ err = -EINVAL;
+ goto out;
+ } else
+ cur_size += dsize;
+
+ if (cur_size > HPAGE_SIZE) {
+ err = -EINVAL;
+ goto out;
+ } else if (cur_size == HPAGE_SIZE) {
+ cur_size = 0;
+ arr[j++] = cur_addr;
+ }
+ }
+
+ if (cur_size) {
+ arr[j++] = cur_addr;
+ }
+
+ err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, umem->length,
+ convert_access(access_flags), n, HPAGE_SHIFT, &mr->mmr);
+ if (err)
+ goto out;
+
+ err = mlx4_write_mtt(dev->dev, &mr->mmr.mtt, 0, n, arr);
+
+out:
+ kfree(arr);
+ return err;
+#else
+ return -ENOSYS;
+#endif
+}
+
+struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt_addr, int access_flags,
+ struct ib_udata *udata)
+{
+ struct mlx4_ib_dev *dev = to_mdev(pd->device);
+ struct mlx4_ib_mr *mr;
+ int shift;
+ int err;
+ int n;
+
+ mr = kmalloc(sizeof *mr, GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+ mr->umem = ib_umem_get(pd->uobject->context, start, length,
+ access_flags, 0);
+ if (IS_ERR(mr->umem)) {
+ err = PTR_ERR(mr->umem);
+ goto err_free;
+ }
+
+ if (!mr->umem->hugetlb ||
+ handle_hugetlb_user_mr(pd, mr, start, virt_addr, access_flags)) {
+ n = ib_umem_page_count(mr->umem);
+ shift = ilog2(mr->umem->page_size);
+
+ err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, length,
+ convert_access(access_flags), n, shift, &mr->mmr);
+ if (err)
+ goto err_umem;
+
+ err = mlx4_ib_umem_write_mtt(dev, &mr->mmr.mtt, mr->umem);
+ if (err)
+ goto err_mr;
+ }
+
+ err = mlx4_mr_enable(dev->dev, &mr->mmr);
+ if (err)
+ goto err_mr;
+
+ mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;
+
+ return &mr->ibmr;
+
+err_mr:
+ mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr);
+
+err_umem:
+ ib_umem_release(mr->umem);
+
+err_free:
+ kfree(mr);
+
+ return ERR_PTR(err);
+}
+
+int mlx4_ib_dereg_mr(struct ib_mr *ibmr)
+{
+ struct mlx4_ib_mr *mr = to_mmr(ibmr);
+
+ mlx4_mr_free(to_mdev(ibmr->device)->dev, &mr->mmr);
+ if (mr->umem)
+ ib_umem_release(mr->umem);
+ kfree(mr);
+
+ return 0;
+}
+
+struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd,
+ int max_page_list_len)
+{
+ struct mlx4_ib_dev *dev = to_mdev(pd->device);
+ struct mlx4_ib_mr *mr;
+ int err;
+
+ mr = kmalloc(sizeof *mr, GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+ err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, 0, 0, 0,
+ max_page_list_len, 0, &mr->mmr);
+ if (err)
+ goto err_free;
+
+ err = mlx4_mr_enable(dev->dev, &mr->mmr);
+ if (err)
+ goto err_mr;
+
+ mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;
+ mr->umem = NULL;
+
+ return &mr->ibmr;
+
+err_mr:
+ mlx4_mr_free(dev->dev, &mr->mmr);
+
+err_free:
+ kfree(mr);
+ return ERR_PTR(err);
+}
+
+struct ib_fast_reg_page_list *mlx4_ib_alloc_fast_reg_page_list(struct ib_device *ibdev,
+ int page_list_len)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ struct mlx4_ib_fast_reg_page_list *mfrpl;
+ int size = page_list_len * sizeof (u64);
+
+ if (page_list_len > MAX_FAST_REG_PAGES)
+ return ERR_PTR(-EINVAL);
+
+ mfrpl = kmalloc(sizeof *mfrpl, GFP_KERNEL);
+ if (!mfrpl)
+ return ERR_PTR(-ENOMEM);
+
+ mfrpl->ibfrpl.page_list = kmalloc(size, GFP_KERNEL);
+ if (!mfrpl->ibfrpl.page_list)
+ goto err_free;
+
+ mfrpl->mapped_page_list = dma_alloc_coherent(&dev->dev->pdev->dev,
+ size, &mfrpl->map,
+ GFP_KERNEL);
+ if (!mfrpl->mapped_page_list)
+ goto err_free;
+
+ WARN_ON(mfrpl->map & 0x3f);
+
+ return &mfrpl->ibfrpl;
+
+err_free:
+ kfree(mfrpl->ibfrpl.page_list);
+ kfree(mfrpl);
+ return ERR_PTR(-ENOMEM);
+}
+
+void mlx4_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list)
+{
+ struct mlx4_ib_dev *dev = to_mdev(page_list->device);
+ struct mlx4_ib_fast_reg_page_list *mfrpl = to_mfrpl(page_list);
+ int size = page_list->max_page_list_len * sizeof (u64);
+
+ dma_free_coherent(&dev->dev->pdev->dev, size, mfrpl->mapped_page_list,
+ mfrpl->map);
+ kfree(mfrpl->ibfrpl.page_list);
+ kfree(mfrpl);
+}
+
+struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int acc,
+ struct ib_fmr_attr *fmr_attr)
+{
+ struct mlx4_ib_dev *dev = to_mdev(pd->device);
+ struct mlx4_ib_fmr *fmr;
+ int err = -ENOMEM;
+
+ fmr = kmalloc(sizeof *fmr, GFP_KERNEL);
+ if (!fmr)
+ return ERR_PTR(-ENOMEM);
+
+ err = mlx4_fmr_alloc(dev->dev, to_mpd(pd)->pdn, convert_access(acc),
+ fmr_attr->max_pages, fmr_attr->max_maps,
+ fmr_attr->page_shift, &fmr->mfmr);
+ if (err)
+ goto err_free;
+
+ err = mlx4_fmr_enable(to_mdev(pd->device)->dev, &fmr->mfmr);
+ if (err)
+ goto err_mr;
+
+ fmr->ibfmr.rkey = fmr->ibfmr.lkey = fmr->mfmr.mr.key;
+
+ return &fmr->ibfmr;
+
+err_mr:
+ mlx4_mr_free(to_mdev(pd->device)->dev, &fmr->mfmr.mr);
+
+err_free:
+ kfree(fmr);
+
+ return ERR_PTR(err);
+}
+
+int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+ int npages, u64 iova)
+{
+ struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr);
+ struct mlx4_ib_dev *dev = to_mdev(ifmr->ibfmr.device);
+
+ return mlx4_map_phys_fmr(dev->dev, &ifmr->mfmr, page_list, npages, iova,
+ &ifmr->ibfmr.lkey, &ifmr->ibfmr.rkey);
+}
+
+int mlx4_ib_unmap_fmr(struct list_head *fmr_list)
+{
+ struct ib_fmr *ibfmr;
+ int err;
+ struct mlx4_dev *mdev = NULL;
+
+ list_for_each_entry(ibfmr, fmr_list, list) {
+ if (mdev && to_mdev(ibfmr->device)->dev != mdev)
+ return -EINVAL;
+ mdev = to_mdev(ibfmr->device)->dev;
+ }
+
+ if (!mdev)
+ return 0;
+
+ list_for_each_entry(ibfmr, fmr_list, list) {
+ struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr);
+
+ mlx4_fmr_unmap(mdev, &ifmr->mfmr, &ifmr->ibfmr.lkey, &ifmr->ibfmr.rkey);
+ }
+
+ /*
+ * Make sure all MPT status updates are visible before issuing
+ * SYNC_TPT firmware command.
+ */
+ wmb();
+
+ err = mlx4_SYNC_TPT(mdev);
+ if (err)
+ printk(KERN_WARNING "mlx4_ib: SYNC_TPT error %d when "
+ "unmapping FMRs\n", err);
+
+ return 0;
+}
+
+int mlx4_ib_fmr_dealloc(struct ib_fmr *ibfmr)
+{
+ struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr);
+ struct mlx4_ib_dev *dev = to_mdev(ibfmr->device);
+ int err;
+
+ err = mlx4_fmr_free(dev->dev, &ifmr->mfmr);
+
+ if (!err)
+ kfree(ifmr);
+
+ return err;
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/qp.c b/sys/ofed/drivers/infiniband/hw/mlx4/qp.c
new file mode 100644
index 0000000..8958c1e
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/qp.c
@@ -0,0 +1,2770 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/log2.h>
+#include <linux/netdevice.h>
+
+#include <rdma/ib_cache.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_addr.h>
+
+#include <linux/mlx4/qp.h>
+#include <linux/io.h>
+
+#include "mlx4_ib.h"
+#include "user.h"
+
+enum {
+ MLX4_IB_ACK_REQ_FREQ = 8,
+};
+
+enum {
+ MLX4_IB_DEFAULT_SCHED_QUEUE = 0x83,
+ MLX4_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f,
+ MLX4_IB_LINK_TYPE_IB = 0,
+ MLX4_IB_LINK_TYPE_ETH = 1,
+};
+
+enum {
+ /*
+ * Largest possible UD header: send with GRH and immediate data.
+ * 4 bytes added to accommodate for eth header instead of lrh
+ */
+ MLX4_IB_UD_HEADER_SIZE = 76,
+ MLX4_IB_MAX_RAW_ETY_HDR_SIZE = 12
+};
+
+enum {
+ MLX4_IBOE_ETHERTYPE = 0x8915
+};
+
+struct mlx4_ib_xrc_reg_entry {
+ struct list_head list;
+ void *context;
+};
+
+struct mlx4_ib_sqp {
+ struct mlx4_ib_qp qp;
+ int pkey_index;
+ u32 qkey;
+ u32 send_psn;
+ struct ib_ud_header ud_header;
+ u8 header_buf[MLX4_IB_UD_HEADER_SIZE];
+};
+
+enum {
+ MLX4_IB_MIN_SQ_STRIDE = 6
+};
+
+static const __be32 mlx4_ib_opcode[] = {
+ [IB_WR_SEND] = cpu_to_be32(MLX4_OPCODE_SEND),
+ [IB_WR_LSO] = cpu_to_be32(MLX4_OPCODE_LSO),
+ [IB_WR_SEND_WITH_IMM] = cpu_to_be32(MLX4_OPCODE_SEND_IMM),
+ [IB_WR_RDMA_WRITE] = cpu_to_be32(MLX4_OPCODE_RDMA_WRITE),
+ [IB_WR_RDMA_WRITE_WITH_IMM] = cpu_to_be32(MLX4_OPCODE_RDMA_WRITE_IMM),
+ [IB_WR_RDMA_READ] = cpu_to_be32(MLX4_OPCODE_RDMA_READ),
+ [IB_WR_ATOMIC_CMP_AND_SWP] = cpu_to_be32(MLX4_OPCODE_ATOMIC_CS),
+ [IB_WR_ATOMIC_FETCH_AND_ADD] = cpu_to_be32(MLX4_OPCODE_ATOMIC_FA),
+ [IB_WR_SEND_WITH_INV] = cpu_to_be32(MLX4_OPCODE_SEND_INVAL),
+ [IB_WR_LOCAL_INV] = cpu_to_be32(MLX4_OPCODE_LOCAL_INVAL),
+ [IB_WR_FAST_REG_MR] = cpu_to_be32(MLX4_OPCODE_FMR),
+ [IB_WR_MASKED_ATOMIC_CMP_AND_SWP] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_CS),
+ [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_FA),
+};
+
+#ifndef wc_wmb
+ #if defined(__i386__)
+ #define wc_wmb() __asm volatile("lock; addl $0,0(%%esp) " ::: "memory")
+ #elif defined(__x86_64__)
+ #define wc_wmb() __asm volatile("sfence" ::: "memory")
+ #elif defined(__ia64__)
+ #define wc_wmb() __asm volatile("fwb" ::: "memory")
+ #else
+ #define wc_wmb() wmb()
+ #endif
+#endif
+
+
+static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp)
+{
+ return container_of(mqp, struct mlx4_ib_sqp, qp);
+}
+
+static int is_sqp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
+{
+ return qp->mqp.qpn >= dev->dev->caps.sqp_start &&
+ qp->mqp.qpn <= dev->dev->caps.sqp_start + 3;
+}
+
+static int is_qp0(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
+{
+ return qp->mqp.qpn >= dev->dev->caps.sqp_start &&
+ qp->mqp.qpn <= dev->dev->caps.sqp_start + 1;
+}
+
+static void *get_wqe(struct mlx4_ib_qp *qp, int offset)
+{
+ return mlx4_buf_offset(&qp->buf, offset);
+}
+
+static void *get_recv_wqe(struct mlx4_ib_qp *qp, int n)
+{
+ return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift));
+}
+
+static void *get_send_wqe(struct mlx4_ib_qp *qp, int n)
+{
+ return get_wqe(qp, qp->sq.offset + (n << qp->sq.wqe_shift));
+}
+
+/*
+ * Stamp a SQ WQE so that it is invalid if prefetched by marking the
+ * first four bytes of every 64 byte chunk with
+ * 0x7FFFFFF | (invalid_ownership_value << 31).
+ *
+ * When the max work request size is less than or equal to the WQE
+ * basic block size, as an optimization, we can stamp all WQEs with
+ * 0xffffffff, and skip the very first chunk of each WQE.
+ */
+static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size)
+{
+ __be32 *wqe;
+ int i;
+ int s;
+ int ind;
+ void *buf;
+ __be32 stamp;
+ struct mlx4_wqe_ctrl_seg *ctrl;
+
+ if (qp->sq_max_wqes_per_wr > 1) {
+ s = roundup(size, 1U << qp->sq.wqe_shift);
+ for (i = 0; i < s; i += 64) {
+ ind = (i >> qp->sq.wqe_shift) + n;
+ stamp = ind & qp->sq.wqe_cnt ? cpu_to_be32(0x7fffffff) :
+ cpu_to_be32(0xffffffff);
+ buf = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
+ wqe = buf + (i & ((1 << qp->sq.wqe_shift) - 1));
+ *wqe = stamp;
+ }
+ } else {
+ ctrl = buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
+ s = (ctrl->fence_size & 0x3f) << 4;
+ for (i = 64; i < s; i += 64) {
+ wqe = buf + i;
+ *wqe = cpu_to_be32(0xffffffff);
+ }
+ }
+}
+
+static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size)
+{
+ struct mlx4_wqe_ctrl_seg *ctrl;
+ struct mlx4_wqe_inline_seg *inl;
+ void *wqe;
+ int s;
+
+ ctrl = wqe = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
+ s = sizeof(struct mlx4_wqe_ctrl_seg);
+
+ if (qp->ibqp.qp_type == IB_QPT_UD) {
+ struct mlx4_wqe_datagram_seg *dgram = wqe + sizeof *ctrl;
+ struct mlx4_av *av = (struct mlx4_av *)dgram->av;
+ memset(dgram, 0, sizeof *dgram);
+ av->port_pd = cpu_to_be32((qp->port << 24) | to_mpd(qp->ibqp.pd)->pdn);
+ s += sizeof(struct mlx4_wqe_datagram_seg);
+ }
+
+ /* Pad the remainder of the WQE with an inline data segment. */
+ if (size > s) {
+ inl = wqe + s;
+ inl->byte_count = cpu_to_be32(1 << 31 | (size - s - sizeof *inl));
+ }
+ ctrl->srcrb_flags = 0;
+ ctrl->fence_size = size / 16;
+ /*
+ * Make sure descriptor is fully written before setting ownership bit
+ * (because HW can start executing as soon as we do).
+ */
+ wmb();
+
+ ctrl->owner_opcode = cpu_to_be32(MLX4_OPCODE_NOP | MLX4_WQE_CTRL_NEC) |
+ (n & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);
+
+ stamp_send_wqe(qp, n + qp->sq_spare_wqes, size);
+}
+
+/* Post NOP WQE to prevent wrap-around in the middle of WR */
+static inline unsigned pad_wraparound(struct mlx4_ib_qp *qp, int ind)
+{
+ unsigned s = qp->sq.wqe_cnt - (ind & (qp->sq.wqe_cnt - 1));
+ if (unlikely(s < qp->sq_max_wqes_per_wr)) {
+ post_nop_wqe(qp, ind, s << qp->sq.wqe_shift);
+ ind += s;
+ }
+ return ind;
+}
+
+static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type)
+{
+ struct ib_event event;
+ struct mlx4_ib_qp *mqp = to_mibqp(qp);
+ struct ib_qp *ibqp = &mqp->ibqp;
+ struct mlx4_ib_xrc_reg_entry *ctx_entry;
+ unsigned long flags;
+
+ if (type == MLX4_EVENT_TYPE_PATH_MIG)
+ to_mibqp(qp)->port = to_mibqp(qp)->alt_port;
+
+ if (ibqp->event_handler) {
+ event.device = ibqp->device;
+ switch (type) {
+ case MLX4_EVENT_TYPE_PATH_MIG:
+ event.event = IB_EVENT_PATH_MIG;
+ break;
+ case MLX4_EVENT_TYPE_COMM_EST:
+ event.event = IB_EVENT_COMM_EST;
+ break;
+ case MLX4_EVENT_TYPE_SQ_DRAINED:
+ event.event = IB_EVENT_SQ_DRAINED;
+ break;
+ case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE:
+ event.event = IB_EVENT_QP_LAST_WQE_REACHED;
+ break;
+ case MLX4_EVENT_TYPE_WQ_CATAS_ERROR:
+ event.event = IB_EVENT_QP_FATAL;
+ break;
+ case MLX4_EVENT_TYPE_PATH_MIG_FAILED:
+ event.event = IB_EVENT_PATH_MIG_ERR;
+ break;
+ case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+ event.event = IB_EVENT_QP_REQ_ERR;
+ break;
+ case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR:
+ event.event = IB_EVENT_QP_ACCESS_ERR;
+ break;
+ default:
+ printk(KERN_WARNING "mlx4_ib: Unexpected event type %d "
+ "on QP %06x\n", type, qp->qpn);
+ return;
+ }
+
+ if (unlikely(ibqp->qp_type == IB_QPT_XRC &&
+ mqp->flags & MLX4_IB_XRC_RCV)) {
+ event.event |= IB_XRC_QP_EVENT_FLAG;
+ event.element.xrc_qp_num = ibqp->qp_num;
+ spin_lock_irqsave(&mqp->xrc_reg_list_lock, flags);
+ list_for_each_entry(ctx_entry, &mqp->xrc_reg_list, list)
+ ibqp->event_handler(&event, ctx_entry->context);
+ spin_unlock_irqrestore(&mqp->xrc_reg_list_lock, flags);
+ return;
+ }
+ event.element.qp = ibqp;
+ ibqp->event_handler(&event, ibqp->qp_context);
+ }
+}
+
+static int send_wqe_overhead(enum ib_qp_type type, u32 flags)
+{
+ /*
+ * UD WQEs must have a datagram segment.
+ * RC and UC WQEs might have a remote address segment.
+ * MLX WQEs need two extra inline data segments (for the UD
+ * header and space for the ICRC).
+ */
+ switch (type) {
+ case IB_QPT_UD:
+ return sizeof (struct mlx4_wqe_ctrl_seg) +
+ sizeof (struct mlx4_wqe_datagram_seg) +
+ ((flags & MLX4_IB_QP_LSO) ? 128 : 0);
+ case IB_QPT_UC:
+ return sizeof (struct mlx4_wqe_ctrl_seg) +
+ sizeof (struct mlx4_wqe_raddr_seg);
+ case IB_QPT_XRC:
+ case IB_QPT_RC:
+ return sizeof (struct mlx4_wqe_ctrl_seg) +
+ sizeof (struct mlx4_wqe_atomic_seg) +
+ sizeof (struct mlx4_wqe_raddr_seg);
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ return sizeof (struct mlx4_wqe_ctrl_seg) +
+ ALIGN(MLX4_IB_UD_HEADER_SIZE +
+ DIV_ROUND_UP(MLX4_IB_UD_HEADER_SIZE,
+ MLX4_INLINE_ALIGN) *
+ sizeof (struct mlx4_wqe_inline_seg),
+ sizeof (struct mlx4_wqe_data_seg)) +
+ ALIGN(4 +
+ sizeof (struct mlx4_wqe_inline_seg),
+ sizeof (struct mlx4_wqe_data_seg));
+ case IB_QPT_RAW_ETY:
+ return sizeof(struct mlx4_wqe_ctrl_seg) +
+ ALIGN(MLX4_IB_MAX_RAW_ETY_HDR_SIZE +
+ sizeof(struct mlx4_wqe_inline_seg),
+ sizeof(struct mlx4_wqe_data_seg));
+
+ default:
+ return sizeof (struct mlx4_wqe_ctrl_seg);
+ }
+}
+
+static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
+ int is_user, int has_srq_or_is_xrc, struct mlx4_ib_qp *qp)
+{
+ /* Sanity check RQ size before proceeding */
+ if (cap->max_recv_wr > dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE ||
+ cap->max_recv_sge >
+ min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg)) {
+ mlx4_ib_dbg("Requested RQ size (sge or wr) too large");
+ return -EINVAL;
+ }
+
+ if (has_srq_or_is_xrc) {
+ /* QPs attached to an SRQ should have no RQ */
+ if (cap->max_recv_wr) {
+ mlx4_ib_dbg("non-zero RQ size for QP using SRQ");
+ return -EINVAL;
+ }
+
+ qp->rq.wqe_cnt = qp->rq.max_gs = 0;
+ } else {
+ /* HW requires >= 1 RQ entry with >= 1 gather entry */
+ if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge)) {
+ mlx4_ib_dbg("user QP RQ has 0 wr's or 0 sge's "
+ "(wr: 0x%x, sge: 0x%x)", cap->max_recv_wr,
+ cap->max_recv_sge);
+ return -EINVAL;
+ }
+
+ qp->rq.wqe_cnt = roundup_pow_of_two(max(1U, cap->max_recv_wr));
+ qp->rq.max_gs = roundup_pow_of_two(max(1U, cap->max_recv_sge));
+ qp->rq.wqe_shift = ilog2(qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg));
+ }
+
+ /* leave userspace return values as they were, so as not to break ABI */
+ if (is_user) {
+ cap->max_recv_wr = qp->rq.max_post = qp->rq.wqe_cnt;
+ cap->max_recv_sge = qp->rq.max_gs;
+ } else {
+ cap->max_recv_wr = qp->rq.max_post =
+ min(dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE, qp->rq.wqe_cnt);
+ cap->max_recv_sge = min(qp->rq.max_gs,
+ min(dev->dev->caps.max_sq_sg,
+ dev->dev->caps.max_rq_sg));
+ }
+ /* We don't support inline sends for kernel QPs (yet) */
+
+
+ return 0;
+}
+
+static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
+ enum ib_qp_type type, struct mlx4_ib_qp *qp)
+{
+ int s;
+
+ /* Sanity check SQ size before proceeding */
+ if (cap->max_send_wr > (dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE) ||
+ cap->max_send_sge >
+ min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg) ||
+ cap->max_inline_data + send_wqe_overhead(type, qp->flags) +
+ sizeof (struct mlx4_wqe_inline_seg) > dev->dev->caps.max_sq_desc_sz) {
+ mlx4_ib_dbg("Requested SQ resources exceed device maxima");
+ return -EINVAL;
+ }
+
+ /*
+ * For MLX transport we need 2 extra S/G entries:
+ * one for the header and one for the checksum at the end
+ */
+ if ((type == IB_QPT_SMI || type == IB_QPT_GSI) &&
+ cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg) {
+ mlx4_ib_dbg("No space for SQP hdr/csum sge's");
+ return -EINVAL;
+ }
+
+ if (type == IB_QPT_RAW_ETY &&
+ cap->max_send_sge + 1 > dev->dev->caps.max_sq_sg) {
+ mlx4_ib_dbg("No space for RAW ETY hdr");
+ return -EINVAL;
+ }
+
+ s = max(cap->max_send_sge * sizeof (struct mlx4_wqe_data_seg),
+ cap->max_inline_data + sizeof (struct mlx4_wqe_inline_seg)) +
+ send_wqe_overhead(type, qp->flags);
+
+ if (s > dev->dev->caps.max_sq_desc_sz)
+ return -EINVAL;
+
+ /*
+ * Hermon supports shrinking WQEs, such that a single work
+ * request can include multiple units of 1 << wqe_shift. This
+ * way, work requests can differ in size, and do not have to
+ * be a power of 2 in size, saving memory and speeding up send
+ * WR posting. Unfortunately, if we do this then the
+ * wqe_index field in CQEs can't be used to look up the WR ID
+ * anymore, so we do this only if selective signaling is off.
+ *
+ * Further, on 32-bit platforms, we can't use vmap() to make
+ * the QP buffer virtually contigious. Thus we have to use
+ * constant-sized WRs to make sure a WR is always fully within
+ * a single page-sized chunk.
+ *
+ * Finally, we use NOP work requests to pad the end of the
+ * work queue, to avoid wrap-around in the middle of WR. We
+ * set NEC bit to avoid getting completions with error for
+ * these NOP WRs, but since NEC is only supported starting
+ * with firmware 2.2.232, we use constant-sized WRs for older
+ * firmware.
+ *
+ * And, since MLX QPs only support SEND, we use constant-sized
+ * WRs in this case.
+ *
+ * We look for the smallest value of wqe_shift such that the
+ * resulting number of wqes does not exceed device
+ * capabilities.
+ *
+ * We set WQE size to at least 64 bytes, this way stamping
+ * invalidates each WQE.
+ */
+ if (dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC &&
+ qp->sq_signal_bits && BITS_PER_LONG == 64 &&
+ type != IB_QPT_SMI && type != IB_QPT_GSI && type != IB_QPT_RAW_ETY)
+ qp->sq.wqe_shift = ilog2(64);
+ else
+ qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s));
+
+ for (;;) {
+ qp->sq_max_wqes_per_wr = DIV_ROUND_UP(s, 1U << qp->sq.wqe_shift);
+
+ /*
+ * We need to leave 2 KB + 1 WR of headroom in the SQ to
+ * allow HW to prefetch.
+ */
+ qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + qp->sq_max_wqes_per_wr;
+ qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr *
+ qp->sq_max_wqes_per_wr +
+ qp->sq_spare_wqes);
+
+ if (qp->sq.wqe_cnt <= dev->dev->caps.max_wqes)
+ break;
+
+ if (qp->sq_max_wqes_per_wr <= 1)
+ return -EINVAL;
+
+ ++qp->sq.wqe_shift;
+ }
+
+ qp->sq.max_gs = (min(dev->dev->caps.max_sq_desc_sz,
+ (qp->sq_max_wqes_per_wr << qp->sq.wqe_shift)) -
+ send_wqe_overhead(type, qp->flags)) /
+ sizeof (struct mlx4_wqe_data_seg);
+
+ qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
+ (qp->sq.wqe_cnt << qp->sq.wqe_shift);
+ if (qp->rq.wqe_shift > qp->sq.wqe_shift) {
+ qp->rq.offset = 0;
+ qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
+ } else {
+ qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift;
+ qp->sq.offset = 0;
+ }
+
+ cap->max_send_wr = qp->sq.max_post =
+ (qp->sq.wqe_cnt - qp->sq_spare_wqes) / qp->sq_max_wqes_per_wr;
+ cap->max_send_sge = min(qp->sq.max_gs,
+ min(dev->dev->caps.max_sq_sg,
+ dev->dev->caps.max_rq_sg));
+ qp->max_inline_data = cap->max_inline_data;
+
+ return 0;
+}
+
+static int set_user_sq_size(struct mlx4_ib_dev *dev,
+ struct mlx4_ib_qp *qp,
+ struct mlx4_ib_create_qp *ucmd)
+{
+ /* Sanity check SQ size before proceeding */
+ if ((1 << ucmd->log_sq_bb_count) > dev->dev->caps.max_wqes ||
+ ucmd->log_sq_stride >
+ ilog2(roundup_pow_of_two(dev->dev->caps.max_sq_desc_sz)) ||
+ ucmd->log_sq_stride < MLX4_IB_MIN_SQ_STRIDE) {
+ mlx4_ib_dbg("Requested max wqes or wqe stride exceeds max");
+ return -EINVAL;
+ }
+
+ qp->sq.wqe_cnt = 1 << ucmd->log_sq_bb_count;
+ qp->sq.wqe_shift = ucmd->log_sq_stride;
+
+ qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
+ (qp->sq.wqe_cnt << qp->sq.wqe_shift);
+
+ return 0;
+}
+
+static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata, int sqpn, struct mlx4_ib_qp *qp)
+{
+ int qpn;
+ int err;
+
+ mutex_init(&qp->mutex);
+ spin_lock_init(&qp->sq.lock);
+ spin_lock_init(&qp->rq.lock);
+ spin_lock_init(&qp->xrc_reg_list_lock);
+ INIT_LIST_HEAD(&qp->gid_list);
+
+ qp->state = IB_QPS_RESET;
+ if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
+ qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
+
+ err = set_rq_size(dev, &init_attr->cap, !!pd->uobject,
+ !!init_attr->srq || !!init_attr->xrc_domain , qp);
+ if (err)
+ goto err;
+
+ if (pd->uobject) {
+ struct mlx4_ib_create_qp ucmd;
+
+ if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
+ err = -EFAULT;
+ goto err;
+ }
+
+ qp->sq_no_prefetch = ucmd.sq_no_prefetch;
+
+ err = set_user_sq_size(dev, qp, &ucmd);
+ if (err)
+ goto err;
+
+ qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr,
+ qp->buf_size, 0, 0);
+ if (IS_ERR(qp->umem)) {
+ err = PTR_ERR(qp->umem);
+ mlx4_ib_dbg("ib_umem_get error (%d)", err);
+ goto err;
+ }
+
+ err = mlx4_mtt_init(dev->dev, ib_umem_page_count(qp->umem),
+ ilog2(qp->umem->page_size), &qp->mtt);
+ if (err) {
+ mlx4_ib_dbg("mlx4_mtt_init error (%d)", err);
+ goto err_buf;
+ }
+
+ err = mlx4_ib_umem_write_mtt(dev, &qp->mtt, qp->umem);
+ if (err) {
+ mlx4_ib_dbg("mlx4_ib_umem_write_mtt error (%d)", err);
+ goto err_mtt;
+ }
+
+ if (!init_attr->srq && init_attr->qp_type != IB_QPT_XRC) {
+ err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context),
+ ucmd.db_addr, &qp->db);
+ if (err) {
+ mlx4_ib_dbg("mlx4_ib_db_map_user error (%d)", err);
+ goto err_mtt;
+ }
+ }
+ } else {
+ qp->sq_no_prefetch = 0;
+
+ if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)
+ qp->flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK;
+
+ if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)
+ qp->flags |= MLX4_IB_QP_LSO;
+
+ err = set_kernel_sq_size(dev, &init_attr->cap, init_attr->qp_type, qp);
+ if (err)
+ goto err;
+
+ if (!init_attr->srq && init_attr->qp_type != IB_QPT_XRC) {
+ err = mlx4_db_alloc(dev->dev, &qp->db, 0);
+ if (err)
+ goto err;
+
+ *qp->db.db = 0;
+ }
+
+ if (qp->max_inline_data) {
+ err = mlx4_bf_alloc(dev->dev, &qp->bf);
+ if (err) {
+ mlx4_ib_dbg("failed to allocate blue flame register (%d)", err);
+ qp->bf.uar = &dev->priv_uar;
+ }
+ } else
+ qp->bf.uar = &dev->priv_uar;
+
+ if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2, &qp->buf)) {
+ err = -ENOMEM;
+ goto err_db;
+ }
+
+ err = mlx4_mtt_init(dev->dev, qp->buf.npages, qp->buf.page_shift,
+ &qp->mtt);
+ if (err) {
+ mlx4_ib_dbg("kernel qp mlx4_mtt_init error (%d)", err);
+ goto err_buf;
+ }
+
+ err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf);
+ if (err) {
+ mlx4_ib_dbg("mlx4_buf_write_mtt error (%d)", err);
+ goto err_mtt;
+ }
+
+ qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof (u64), GFP_KERNEL);
+ qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof (u64), GFP_KERNEL);
+
+ if (!qp->sq.wrid || !qp->rq.wrid) {
+ err = -ENOMEM;
+ goto err_wrid;
+ }
+ }
+
+ if (sqpn) {
+ qpn = sqpn;
+ } else {
+ err = mlx4_qp_reserve_range(dev->dev, 1, 1, &qpn);
+ if (err)
+ goto err_wrid;
+ }
+
+ err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp);
+ if (err)
+ goto err_qpn;
+
+ if (init_attr->qp_type == IB_QPT_XRC)
+ qp->mqp.qpn |= (1 << 23);
+
+ /*
+ * Hardware wants QPN written in big-endian order (after
+ * shifting) for send doorbell. Precompute this value to save
+ * a little bit when posting sends.
+ */
+ qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);
+
+ qp->mqp.event = mlx4_ib_qp_event;
+
+ return 0;
+
+err_qpn:
+ if (!sqpn)
+ mlx4_qp_release_range(dev->dev, qpn, 1);
+
+err_wrid:
+ if (pd->uobject) {
+ if (!init_attr->srq && init_attr->qp_type != IB_QPT_XRC)
+ mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context),
+ &qp->db);
+ } else {
+ kfree(qp->sq.wrid);
+ kfree(qp->rq.wrid);
+ }
+
+err_mtt:
+ mlx4_mtt_cleanup(dev->dev, &qp->mtt);
+
+err_buf:
+ if (pd->uobject)
+ ib_umem_release(qp->umem);
+ else
+ mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);
+
+err_db:
+ if (!pd->uobject && !init_attr->srq && init_attr->qp_type != IB_QPT_XRC)
+ mlx4_db_free(dev->dev, &qp->db);
+
+ if (qp->max_inline_data)
+ mlx4_bf_free(dev->dev, &qp->bf);
+
+err:
+ return err;
+}
+
+static enum mlx4_qp_state to_mlx4_state(enum ib_qp_state state)
+{
+ switch (state) {
+ case IB_QPS_RESET: return MLX4_QP_STATE_RST;
+ case IB_QPS_INIT: return MLX4_QP_STATE_INIT;
+ case IB_QPS_RTR: return MLX4_QP_STATE_RTR;
+ case IB_QPS_RTS: return MLX4_QP_STATE_RTS;
+ case IB_QPS_SQD: return MLX4_QP_STATE_SQD;
+ case IB_QPS_SQE: return MLX4_QP_STATE_SQER;
+ case IB_QPS_ERR: return MLX4_QP_STATE_ERR;
+ default: return -1;
+ }
+}
+
+static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq)
+{
+ if (send_cq == recv_cq)
+ spin_lock_irq(&send_cq->lock);
+ else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
+ spin_lock_irq(&send_cq->lock);
+ spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING);
+ } else {
+ spin_lock_irq(&recv_cq->lock);
+ spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING);
+ }
+}
+
+static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq)
+{
+ if (send_cq == recv_cq)
+ spin_unlock_irq(&send_cq->lock);
+ else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
+ spin_unlock(&recv_cq->lock);
+ spin_unlock_irq(&send_cq->lock);
+ } else {
+ spin_unlock(&send_cq->lock);
+ spin_unlock_irq(&recv_cq->lock);
+ }
+}
+
+static void del_gid_entries(struct mlx4_ib_qp *qp)
+{
+ struct gid_entry *ge, *tmp;
+
+ list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) {
+ list_del(&ge->list);
+ kfree(ge);
+ }
+}
+
+static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
+ int is_user)
+{
+ struct mlx4_ib_cq *send_cq, *recv_cq;
+
+ if (qp->state != IB_QPS_RESET)
+ if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state),
+ MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp))
+ printk(KERN_WARNING "mlx4_ib: modify QP %06x to RESET failed.\n",
+ qp->mqp.qpn);
+
+ send_cq = to_mcq(qp->ibqp.send_cq);
+ recv_cq = to_mcq(qp->ibqp.recv_cq);
+
+ mlx4_ib_lock_cqs(send_cq, recv_cq);
+
+ if (!is_user) {
+ __mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn,
+ qp->ibqp.srq ? to_msrq(qp->ibqp.srq): NULL);
+ if (send_cq != recv_cq)
+ __mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL);
+ }
+
+ mlx4_qp_remove(dev->dev, &qp->mqp);
+
+ mlx4_ib_unlock_cqs(send_cq, recv_cq);
+
+ mlx4_qp_free(dev->dev, &qp->mqp);
+
+ if (!is_sqp(dev, qp))
+ mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1);
+
+ mlx4_mtt_cleanup(dev->dev, &qp->mtt);
+
+ if (is_user) {
+ if (!qp->ibqp.srq && qp->ibqp.qp_type != IB_QPT_XRC)
+ mlx4_ib_db_unmap_user(to_mucontext(qp->ibqp.uobject->context),
+ &qp->db);
+ ib_umem_release(qp->umem);
+ } else {
+ kfree(qp->sq.wrid);
+ kfree(qp->rq.wrid);
+ mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);
+ if (qp->max_inline_data)
+ mlx4_bf_free(dev->dev, &qp->bf);
+ if (!qp->ibqp.srq && qp->ibqp.qp_type != IB_QPT_XRC)
+ mlx4_db_free(dev->dev, &qp->db);
+ }
+
+ del_gid_entries(qp);
+}
+
+struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata)
+{
+ struct mlx4_ib_dev *dev = to_mdev(pd->device);
+ struct mlx4_ib_sqp *sqp;
+ struct mlx4_ib_qp *qp;
+ int err;
+
+ /*
+ * We only support LSO and multicast loopback blocking, and
+ * only for kernel UD QPs.
+ */
+ if (init_attr->create_flags & ~(IB_QP_CREATE_IPOIB_UD_LSO |
+ IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK))
+ return ERR_PTR(-EINVAL);
+
+ if (init_attr->create_flags &&
+ (pd->uobject || init_attr->qp_type != IB_QPT_UD))
+ return ERR_PTR(-EINVAL);
+
+ switch (init_attr->qp_type) {
+ case IB_QPT_XRC:
+ if (!(dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC))
+ return ERR_PTR(-ENOSYS);
+ case IB_QPT_RC:
+ case IB_QPT_UC:
+ case IB_QPT_UD:
+ case IB_QPT_RAW_ETH:
+ {
+ qp = kzalloc(sizeof *qp, GFP_KERNEL);
+ if (!qp)
+ return ERR_PTR(-ENOMEM);
+
+ err = create_qp_common(dev, pd, init_attr, udata, 0, qp);
+ if (err) {
+ kfree(qp);
+ return ERR_PTR(err);
+ }
+
+ if (init_attr->qp_type == IB_QPT_XRC)
+ qp->xrcdn = to_mxrcd(init_attr->xrc_domain)->xrcdn;
+ else
+ qp->xrcdn = 0;
+
+ qp->ibqp.qp_num = qp->mqp.qpn;
+
+ break;
+ }
+ case IB_QPT_RAW_ETY:
+ if (!(dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_RAW_ETY))
+ return ERR_PTR(-ENOSYS);
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ {
+ /* Userspace is not allowed to create special QPs: */
+ if (pd->uobject) {
+ mlx4_ib_dbg("Userspace is not allowed to create special QPs");
+ return ERR_PTR(-EINVAL);
+ }
+
+ sqp = kzalloc(sizeof *sqp, GFP_KERNEL);
+ if (!sqp)
+ return ERR_PTR(-ENOMEM);
+
+ qp = &sqp->qp;
+
+ err = create_qp_common(dev, pd, init_attr, udata,
+ dev->dev->caps.sqp_start +
+ (init_attr->qp_type == IB_QPT_RAW_ETY ? 4 :
+ (init_attr->qp_type == IB_QPT_SMI ? 0 : 2)) +
+ init_attr->port_num - 1,
+ qp);
+ if (err) {
+ kfree(sqp);
+ return ERR_PTR(err);
+ }
+
+ qp->port = init_attr->port_num;
+ qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 : 1;
+
+ break;
+ }
+ default:
+ mlx4_ib_dbg("Invalid QP type requested for create_qp (%d)",
+ init_attr->qp_type);
+ return ERR_PTR(-EINVAL);
+ }
+
+ return &qp->ibqp;
+}
+
+int mlx4_ib_destroy_qp(struct ib_qp *qp)
+{
+ struct mlx4_ib_dev *dev = to_mdev(qp->device);
+ struct mlx4_ib_qp *mqp = to_mqp(qp);
+
+ if (is_qp0(dev, mqp))
+ mlx4_CLOSE_PORT(dev->dev, mqp->port);
+
+ destroy_qp_common(dev, mqp, !!qp->pd->uobject);
+
+ if (is_sqp(dev, mqp))
+ kfree(to_msqp(mqp));
+ else
+ kfree(mqp);
+
+ return 0;
+}
+
+static int to_mlx4_st(enum ib_qp_type type)
+{
+ switch (type) {
+ case IB_QPT_RC: return MLX4_QP_ST_RC;
+ case IB_QPT_UC: return MLX4_QP_ST_UC;
+ case IB_QPT_UD: return MLX4_QP_ST_UD;
+ case IB_QPT_XRC: return MLX4_QP_ST_XRC;
+ case IB_QPT_RAW_ETY:
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ case IB_QPT_RAW_ETH: return MLX4_QP_ST_MLX;
+ default: return -1;
+ }
+}
+
+static __be32 to_mlx4_access_flags(struct mlx4_ib_qp *qp, const struct ib_qp_attr *attr,
+ int attr_mask)
+{
+ u8 dest_rd_atomic;
+ u32 access_flags;
+ u32 hw_access_flags = 0;
+
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+ dest_rd_atomic = attr->max_dest_rd_atomic;
+ else
+ dest_rd_atomic = qp->resp_depth;
+
+ if (attr_mask & IB_QP_ACCESS_FLAGS)
+ access_flags = attr->qp_access_flags;
+ else
+ access_flags = qp->atomic_rd_en;
+
+ if (!dest_rd_atomic)
+ access_flags &= IB_ACCESS_REMOTE_WRITE;
+
+ if (access_flags & IB_ACCESS_REMOTE_READ)
+ hw_access_flags |= MLX4_QP_BIT_RRE;
+ if (access_flags & IB_ACCESS_REMOTE_ATOMIC)
+ hw_access_flags |= MLX4_QP_BIT_RAE;
+ if (access_flags & IB_ACCESS_REMOTE_WRITE)
+ hw_access_flags |= MLX4_QP_BIT_RWE;
+
+ return cpu_to_be32(hw_access_flags);
+}
+
+static void store_sqp_attrs(struct mlx4_ib_sqp *sqp, const struct ib_qp_attr *attr,
+ int attr_mask)
+{
+ if (attr_mask & IB_QP_PKEY_INDEX)
+ sqp->pkey_index = attr->pkey_index;
+ if (attr_mask & IB_QP_QKEY)
+ sqp->qkey = attr->qkey;
+ if (attr_mask & IB_QP_SQ_PSN)
+ sqp->send_psn = attr->sq_psn;
+}
+
+static void mlx4_set_sched(struct mlx4_qp_path *path, u8 port)
+{
+ path->sched_queue = (path->sched_queue & 0xbf) | ((port - 1) << 6);
+}
+
+static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah,
+ struct mlx4_qp_path *path, u8 port)
+{
+ int err;
+ int is_eth = rdma_port_get_link_layer(&dev->ib_dev, port) ==
+ IB_LINK_LAYER_ETHERNET;
+ u8 mac[6];
+ int is_mcast;
+ u16 vlan_tag;
+ int vidx;
+
+ path->grh_mylmc = ah->src_path_bits & 0x7f;
+ path->rlid = cpu_to_be16(ah->dlid);
+ if (ah->static_rate) {
+ path->static_rate = ah->static_rate + MLX4_STAT_RATE_OFFSET;
+ while (path->static_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET &&
+ !(1 << path->static_rate & dev->dev->caps.stat_rate_support))
+ --path->static_rate;
+ } else
+ path->static_rate = 0;
+
+ if (ah->ah_flags & IB_AH_GRH) {
+ if (ah->grh.sgid_index >= dev->dev->caps.gid_table_len[port]) {
+ printk(KERN_ERR "sgid_index (%u) too large. max is %d\n",
+ ah->grh.sgid_index, dev->dev->caps.gid_table_len[port] - 1);
+ return -1;
+ }
+
+ path->grh_mylmc |= 1 << 7;
+ path->mgid_index = ah->grh.sgid_index;
+ path->hop_limit = ah->grh.hop_limit;
+ path->tclass_flowlabel =
+ cpu_to_be32((ah->grh.traffic_class << 20) |
+ (ah->grh.flow_label));
+ memcpy(path->rgid, ah->grh.dgid.raw, 16);
+ }
+
+ if (is_eth) {
+ path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE |
+ ((port - 1) << 6) | ((ah->sl & 0x7) << 3) | ((ah->sl & 8) >> 1);
+
+ if (!(ah->ah_flags & IB_AH_GRH))
+ return -1;
+
+ err = mlx4_ib_resolve_grh(dev, ah, mac, &is_mcast, port);
+ if (err)
+ return err;
+
+ memcpy(path->dmac, mac, 6);
+ path->ackto = MLX4_IB_LINK_TYPE_ETH;
+ /* use index 0 into MAC table for IBoE */
+ path->grh_mylmc &= 0x80;
+
+ vlan_tag = rdma_get_vlan_id(&dev->iboe.gid_table[port - 1][ah->grh.sgid_index]);
+ if (vlan_tag < 0x1000) {
+ if (mlx4_find_cached_vlan(dev->dev, port, vlan_tag, &vidx))
+ return -ENOENT;
+
+ path->vlan_index = vidx;
+ path->fl = 1 << 6;
+ }
+ } else
+ path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE |
+ ((port - 1) << 6) | ((ah->sl & 0xf) << 2);
+
+ return 0;
+}
+
+static void update_mcg_macs(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
+{
+ struct gid_entry *ge, *tmp;
+
+ list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) {
+ if (!ge->added && mlx4_ib_add_mc(dev, qp, &ge->gid)) {
+ ge->added = 1;
+ ge->port = qp->port;
+ }
+ }
+}
+
+static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
+ const struct ib_qp_attr *attr, int attr_mask,
+ enum ib_qp_state cur_state, enum ib_qp_state new_state)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
+ struct mlx4_ib_qp *qp = to_mqp(ibqp);
+ struct mlx4_qp_context *context;
+ enum mlx4_qp_optpar optpar = 0;
+ int sqd_event;
+ int err = -EINVAL;
+
+ context = kzalloc(sizeof *context, GFP_KERNEL);
+ if (!context)
+ return -ENOMEM;
+
+ context->flags = cpu_to_be32((to_mlx4_state(new_state) << 28) |
+ (to_mlx4_st(ibqp->qp_type) << 16));
+
+ if (!(attr_mask & IB_QP_PATH_MIG_STATE))
+ context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11);
+ else {
+ optpar |= MLX4_QP_OPTPAR_PM_STATE;
+ switch (attr->path_mig_state) {
+ case IB_MIG_MIGRATED:
+ context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11);
+ break;
+ case IB_MIG_REARM:
+ context->flags |= cpu_to_be32(MLX4_QP_PM_REARM << 11);
+ break;
+ case IB_MIG_ARMED:
+ context->flags |= cpu_to_be32(MLX4_QP_PM_ARMED << 11);
+ break;
+ }
+ }
+ if (ibqp->qp_type == IB_QPT_RAW_ETH)
+ context->mtu_msgmax = 0xff;
+ else if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI ||
+ ibqp->qp_type == IB_QPT_RAW_ETY)
+ context->mtu_msgmax = (IB_MTU_4096 << 5) | 11;
+ else if (ibqp->qp_type == IB_QPT_UD) {
+ if (qp->flags & MLX4_IB_QP_LSO)
+ context->mtu_msgmax = (IB_MTU_4096 << 5) |
+ ilog2(dev->dev->caps.max_gso_sz);
+ else
+ context->mtu_msgmax = (IB_MTU_4096 << 5) | 12;
+ } else if (attr_mask & IB_QP_PATH_MTU) {
+ if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_4096) {
+ printk(KERN_ERR "path MTU (%u) is invalid\n",
+ attr->path_mtu);
+ goto out;
+ }
+ context->mtu_msgmax = (attr->path_mtu << 5) |
+ ilog2(dev->dev->caps.max_msg_sz);
+ }
+
+ if (qp->rq.wqe_cnt)
+ context->rq_size_stride = ilog2(qp->rq.wqe_cnt) << 3;
+ context->rq_size_stride |= qp->rq.wqe_shift - 4;
+
+ if (qp->sq.wqe_cnt)
+ context->sq_size_stride = ilog2(qp->sq.wqe_cnt) << 3;
+ context->sq_size_stride |= qp->sq.wqe_shift - 4;
+
+ if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
+ context->sq_size_stride |= !!qp->sq_no_prefetch << 7;
+ if (ibqp->qp_type == IB_QPT_XRC)
+ context->xrcd = cpu_to_be32((u32) qp->xrcdn);
+ }
+
+ if (qp->ibqp.uobject)
+ context->usr_page = cpu_to_be32(to_mucontext(ibqp->uobject->context)->uar.index);
+ else
+ context->usr_page = cpu_to_be32(qp->bf.uar->index);
+
+ if (attr_mask & IB_QP_DEST_QPN)
+ context->remote_qpn = cpu_to_be32(attr->dest_qp_num);
+
+ if (attr_mask & IB_QP_PORT) {
+ if (cur_state == IB_QPS_SQD && new_state == IB_QPS_SQD &&
+ !(attr_mask & IB_QP_AV)) {
+ mlx4_set_sched(&context->pri_path, attr->port_num);
+ optpar |= MLX4_QP_OPTPAR_SCHED_QUEUE;
+ }
+ }
+
+ if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR &&
+ dev->counters[qp->port - 1] != -1) {
+ context->pri_path.counter_index = dev->counters[qp->port - 1];
+ optpar |= MLX4_QP_OPTPAR_COUNTER_INDEX;
+ }
+
+ if (attr_mask & IB_QP_PKEY_INDEX) {
+ context->pri_path.pkey_index = attr->pkey_index;
+ optpar |= MLX4_QP_OPTPAR_PKEY_INDEX;
+ }
+
+ if (attr_mask & IB_QP_AV) {
+ if (mlx4_set_path(dev, &attr->ah_attr, &context->pri_path,
+ attr_mask & IB_QP_PORT ? attr->port_num : qp->port)) {
+ mlx4_ib_dbg("qpn 0x%x: could not set pri path params",
+ ibqp->qp_num);
+ goto out;
+ }
+
+ optpar |= (MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH |
+ MLX4_QP_OPTPAR_SCHED_QUEUE);
+ }
+
+ if (attr_mask & IB_QP_TIMEOUT) {
+ context->pri_path.ackto |= (attr->timeout << 3);
+ optpar |= MLX4_QP_OPTPAR_ACK_TIMEOUT;
+ }
+
+ if (attr_mask & IB_QP_ALT_PATH) {
+ if (attr->alt_port_num == 0 ||
+ attr->alt_port_num > dev->num_ports) {
+ mlx4_ib_dbg("qpn 0x%x: invalid alternate port num (%d)",
+ ibqp->qp_num, attr->alt_port_num);
+ goto out;
+ }
+
+ if (attr->alt_pkey_index >=
+ dev->dev->caps.pkey_table_len[attr->alt_port_num]) {
+ mlx4_ib_dbg("qpn 0x%x: invalid alt pkey index (0x%x)",
+ ibqp->qp_num, attr->alt_pkey_index);
+ goto out;
+ }
+
+ if (mlx4_set_path(dev, &attr->alt_ah_attr, &context->alt_path,
+ attr->alt_port_num)) {
+ mlx4_ib_dbg("qpn 0x%x: could not set alt path params",
+ ibqp->qp_num);
+ goto out;
+ }
+
+ context->alt_path.pkey_index = attr->alt_pkey_index;
+ context->alt_path.ackto = attr->alt_timeout << 3;
+ optpar |= MLX4_QP_OPTPAR_ALT_ADDR_PATH;
+ }
+
+ context->pd = cpu_to_be32(to_mpd(ibqp->pd)->pdn);
+ context->params1 = cpu_to_be32(MLX4_IB_ACK_REQ_FREQ << 28);
+
+ /* Set "fast registration enabled" for all kernel QPs */
+ if (!qp->ibqp.uobject)
+ context->params1 |= cpu_to_be32(1 << 11);
+
+ if (attr_mask & IB_QP_RNR_RETRY) {
+ context->params1 |= cpu_to_be32(attr->rnr_retry << 13);
+ optpar |= MLX4_QP_OPTPAR_RNR_RETRY;
+ }
+
+ if (attr_mask & IB_QP_RETRY_CNT) {
+ context->params1 |= cpu_to_be32(attr->retry_cnt << 16);
+ optpar |= MLX4_QP_OPTPAR_RETRY_COUNT;
+ }
+
+ if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) {
+ if (attr->max_rd_atomic)
+ context->params1 |=
+ cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21);
+ optpar |= MLX4_QP_OPTPAR_SRA_MAX;
+ }
+
+ if (attr_mask & IB_QP_SQ_PSN)
+ context->next_send_psn = cpu_to_be32(attr->sq_psn);
+
+ context->cqn_send = cpu_to_be32(to_mcq(ibqp->send_cq)->mcq.cqn);
+
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
+ if (attr->max_dest_rd_atomic)
+ context->params2 |=
+ cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21);
+ optpar |= MLX4_QP_OPTPAR_RRA_MAX;
+ }
+
+ if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) {
+ context->params2 |= to_mlx4_access_flags(qp, attr, attr_mask);
+ optpar |= MLX4_QP_OPTPAR_RWE | MLX4_QP_OPTPAR_RRE | MLX4_QP_OPTPAR_RAE;
+ }
+
+ if (ibqp->srq)
+ context->params2 |= cpu_to_be32(MLX4_QP_BIT_RIC);
+
+ if (attr_mask & IB_QP_MIN_RNR_TIMER) {
+ context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24);
+ optpar |= MLX4_QP_OPTPAR_RNR_TIMEOUT;
+ }
+ if (attr_mask & IB_QP_RQ_PSN)
+ context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn);
+
+ context->cqn_recv = cpu_to_be32(to_mcq(ibqp->recv_cq)->mcq.cqn);
+
+ if (attr_mask & IB_QP_QKEY) {
+ context->qkey = cpu_to_be32(attr->qkey);
+ optpar |= MLX4_QP_OPTPAR_Q_KEY;
+ }
+
+ if (ibqp->srq)
+ context->srqn = cpu_to_be32(1 << 24 | to_msrq(ibqp->srq)->msrq.srqn);
+
+ if (!ibqp->srq && ibqp->qp_type != IB_QPT_XRC &&
+ cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
+ context->db_rec_addr = cpu_to_be64(qp->db.dma);
+
+ if (cur_state == IB_QPS_INIT &&
+ new_state == IB_QPS_RTR &&
+ (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI ||
+ ibqp->qp_type == IB_QPT_UD || ibqp->qp_type == IB_QPT_RAW_ETY ||
+ ibqp->qp_type == IB_QPT_RAW_ETH)) {
+ context->pri_path.sched_queue = (qp->port - 1) << 6;
+ if (is_qp0(dev, qp))
+ context->pri_path.sched_queue |= MLX4_IB_DEFAULT_QP0_SCHED_QUEUE;
+ else
+ context->pri_path.sched_queue |= MLX4_IB_DEFAULT_SCHED_QUEUE;
+ }
+
+ if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD &&
+ attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify)
+ sqd_event = 1;
+ else
+ sqd_event = 0;
+
+ if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
+ context->rlkey |= (1 << 4);
+
+ /*
+ * Before passing a kernel QP to the HW, make sure that the
+ * ownership bits of the send queue are set and the SQ
+ * headroom is stamped so that the hardware doesn't start
+ * processing stale work requests.
+ */
+ if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
+ struct mlx4_wqe_ctrl_seg *ctrl;
+ int i;
+
+ for (i = 0; i < qp->sq.wqe_cnt; ++i) {
+ ctrl = get_send_wqe(qp, i);
+ ctrl->owner_opcode = cpu_to_be32(1 << 31);
+ if (qp->sq_max_wqes_per_wr == 1)
+ ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4);
+
+ stamp_send_wqe(qp, i, 1 << qp->sq.wqe_shift);
+ }
+ }
+
+ err = mlx4_qp_modify(dev->dev, &qp->mtt, to_mlx4_state(cur_state),
+ to_mlx4_state(new_state), context, optpar,
+ sqd_event, &qp->mqp);
+ if (err)
+ goto out;
+
+ qp->state = new_state;
+
+ if (attr_mask & IB_QP_ACCESS_FLAGS)
+ qp->atomic_rd_en = attr->qp_access_flags;
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+ qp->resp_depth = attr->max_dest_rd_atomic;
+ if (attr_mask & IB_QP_PORT) {
+ qp->port = attr->port_num;
+ update_mcg_macs(dev, qp);
+ }
+ if (attr_mask & IB_QP_ALT_PATH)
+ qp->alt_port = attr->alt_port_num;
+
+ if (is_sqp(dev, qp))
+ store_sqp_attrs(to_msqp(qp), attr, attr_mask);
+
+ /*
+ * If we moved QP0 to RTR, bring the IB link up; if we moved
+ * QP0 to RESET or ERROR, bring the link back down.
+ */
+ if (is_qp0(dev, qp)) {
+ if (cur_state != IB_QPS_RTR && new_state == IB_QPS_RTR)
+ if (mlx4_INIT_PORT(dev->dev, qp->port))
+ printk(KERN_WARNING "INIT_PORT failed for port %d\n",
+ qp->port);
+
+ if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR &&
+ (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR))
+ mlx4_CLOSE_PORT(dev->dev, qp->port);
+ }
+
+ /*
+ * If we moved a kernel QP to RESET, clean up all old CQ
+ * entries and reinitialize the QP.
+ */
+ if (new_state == IB_QPS_RESET && !ibqp->uobject) {
+ mlx4_ib_cq_clean(to_mcq(ibqp->recv_cq), qp->mqp.qpn,
+ ibqp->srq ? to_msrq(ibqp->srq): NULL);
+ if (ibqp->send_cq != ibqp->recv_cq)
+ mlx4_ib_cq_clean(to_mcq(ibqp->send_cq), qp->mqp.qpn, NULL);
+
+ qp->rq.head = 0;
+ qp->rq.tail = 0;
+ qp->sq.head = 0;
+ qp->sq.tail = 0;
+ qp->sq_next_wqe = 0;
+ if (!ibqp->srq && ibqp->qp_type != IB_QPT_XRC)
+ *qp->db.db = 0;
+ }
+
+out:
+ kfree(context);
+ return err;
+}
+
+int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
+ struct mlx4_ib_qp *qp = to_mqp(ibqp);
+ enum ib_qp_state cur_state, new_state;
+ int err = -EINVAL;
+
+ mutex_lock(&qp->mutex);
+
+ cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state;
+ new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
+
+ if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask)) {
+ mlx4_ib_dbg("qpn 0x%x: invalid attribute mask specified "
+ "for transition %d to %d. qp_type %d, attr_mask 0x%x",
+ ibqp->qp_num, cur_state, new_state,
+ ibqp->qp_type, attr_mask);
+ goto out;
+ }
+
+ if ((attr_mask & IB_QP_PORT) && (ibqp->qp_type != IB_QPT_RAW_ETH) &&
+ (attr->port_num == 0 || attr->port_num > dev->num_ports)) {
+ mlx4_ib_dbg("qpn 0x%x: invalid port number (%d) specified "
+ "for transition %d to %d. qp_type %d",
+ ibqp->qp_num, attr->port_num, cur_state,
+ new_state, ibqp->qp_type);
+ goto out;
+ }
+
+ if ((attr_mask & IB_QP_PORT) && (ibqp->qp_type == IB_QPT_RAW_ETH) &&
+ (rdma_port_get_link_layer(&dev->ib_dev, attr->port_num)
+ != IB_LINK_LAYER_ETHERNET)) {
+ mlx4_ib_dbg("qpn 0x%x: invalid port (%d) specified (not RDMAoE)"
+ "for transition %d to %d. qp_type %d",
+ ibqp->qp_num, attr->port_num, cur_state,
+ new_state, ibqp->qp_type);
+ goto out;
+ }
+
+ if (attr_mask & IB_QP_PKEY_INDEX) {
+ int p = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
+ if (attr->pkey_index >= dev->dev->caps.pkey_table_len[p]) {
+ mlx4_ib_dbg("qpn 0x%x: invalid pkey index (%d) specified "
+ "for transition %d to %d. qp_type %d",
+ ibqp->qp_num, attr->pkey_index, cur_state,
+ new_state, ibqp->qp_type);
+ goto out;
+ }
+ }
+
+ if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
+ attr->max_rd_atomic > dev->dev->caps.max_qp_init_rdma) {
+ mlx4_ib_dbg("qpn 0x%x: max_rd_atomic (%d) too large. "
+ "Transition %d to %d. qp_type %d",
+ ibqp->qp_num, attr->max_rd_atomic, cur_state,
+ new_state, ibqp->qp_type);
+ goto out;
+ }
+
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
+ attr->max_dest_rd_atomic > dev->dev->caps.max_qp_dest_rdma) {
+ mlx4_ib_dbg("qpn 0x%x: max_dest_rd_atomic (%d) too large. "
+ "Transition %d to %d. qp_type %d",
+ ibqp->qp_num, attr->max_dest_rd_atomic, cur_state,
+ new_state, ibqp->qp_type);
+ goto out;
+ }
+
+ if (cur_state == new_state && cur_state == IB_QPS_RESET) {
+ err = 0;
+ goto out;
+ }
+
+ err = __mlx4_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state);
+
+out:
+ mutex_unlock(&qp->mutex);
+ return err;
+}
+
+static int build_raw_ety_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
+ void *wqe, unsigned *mlx_seg_len)
+{
+ int payload = 0;
+ int header_size, packet_length;
+ struct mlx4_wqe_mlx_seg *mlx = wqe;
+ struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx;
+ u32 *lrh = wqe + sizeof *mlx + sizeof *inl;
+ int i;
+
+ /* Only IB_WR_SEND is supported */
+ if (wr->opcode != IB_WR_SEND)
+ return -EINVAL;
+
+ for (i = 0; i < wr->num_sge; ++i)
+ payload += wr->sg_list[i].length;
+
+ header_size = IB_LRH_BYTES + 4; /* LRH + RAW_HEADER (32 bits) */
+
+ /* headers + payload and round up */
+ packet_length = (header_size + payload + 3) / 4;
+
+ mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
+
+ mlx->flags |= cpu_to_be32(MLX4_WQE_MLX_ICRC |
+ (wr->wr.raw_ety.lrh->service_level << 8));
+
+ mlx->rlid = wr->wr.raw_ety.lrh->destination_lid;
+
+ wr->wr.raw_ety.lrh->packet_length = cpu_to_be16(packet_length);
+
+ ib_lrh_header_pack(wr->wr.raw_ety.lrh, lrh);
+ lrh += IB_LRH_BYTES / 4; /* LRH size is a dword multiple */
+ *lrh = cpu_to_be32(wr->wr.raw_ety.eth_type);
+
+ inl->byte_count = cpu_to_be32(1 << 31 | header_size);
+
+ *mlx_seg_len =
+ ALIGN(sizeof(struct mlx4_wqe_inline_seg) + header_size, 16);
+
+ return 0;
+}
+
+static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
+ void *wqe, unsigned *mlx_seg_len)
+{
+ struct ib_device *ib_dev = &to_mdev(sqp->qp.ibqp.device)->ib_dev;
+ struct mlx4_wqe_mlx_seg *mlx = wqe;
+ struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx;
+ struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah);
+ u16 pkey;
+ int send_size;
+ int header_size;
+ int spc;
+ int i;
+ union ib_gid sgid;
+ int is_eth;
+ int is_grh;
+ int is_vlan = 0;
+ int err;
+ u16 vlan;
+
+ vlan = 0;
+ send_size = 0;
+ for (i = 0; i < wr->num_sge; ++i)
+ send_size += wr->sg_list[i].length;
+
+ is_eth = rdma_port_get_link_layer(sqp->qp.ibqp.device, sqp->qp.port) == IB_LINK_LAYER_ETHERNET;
+ is_grh = mlx4_ib_ah_grh_present(ah);
+ err = ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24,
+ ah->av.ib.gid_index, &sgid);
+ if (err)
+ return err;
+ if (is_eth) {
+ is_vlan = rdma_get_vlan_id(&sgid) < 0x1000;
+ vlan = rdma_get_vlan_id(&sgid);
+ }
+
+ ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh, 0, &sqp->ud_header);
+ if (!is_eth) {
+ sqp->ud_header.lrh.service_level =
+ be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28;
+ sqp->ud_header.lrh.destination_lid = ah->av.ib.dlid;
+ sqp->ud_header.lrh.source_lid = cpu_to_be16(ah->av.ib.g_slid & 0x7f);
+ }
+
+ if (is_grh) {
+ sqp->ud_header.grh.traffic_class =
+ (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff;
+ sqp->ud_header.grh.flow_label =
+ ah->av.ib.sl_tclass_flowlabel & cpu_to_be32(0xfffff);
+ sqp->ud_header.grh.hop_limit = ah->av.ib.hop_limit;
+ ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24,
+ ah->av.ib.gid_index, &sqp->ud_header.grh.source_gid);
+ memcpy(sqp->ud_header.grh.destination_gid.raw,
+ ah->av.ib.dgid, 16);
+ }
+
+ mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
+
+ if (!is_eth) {
+ mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MLX4_WQE_MLX_VL15 : 0) |
+ (sqp->ud_header.lrh.destination_lid ==
+ IB_LID_PERMISSIVE ? MLX4_WQE_MLX_SLR : 0) |
+ (sqp->ud_header.lrh.service_level << 8));
+ mlx->rlid = sqp->ud_header.lrh.destination_lid;
+ }
+
+ switch (wr->opcode) {
+ case IB_WR_SEND:
+ sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY;
+ sqp->ud_header.immediate_present = 0;
+ break;
+ case IB_WR_SEND_WITH_IMM:
+ sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
+ sqp->ud_header.immediate_present = 1;
+ sqp->ud_header.immediate_data = wr->ex.imm_data;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (is_eth) {
+ u8 *smac;
+
+ memcpy(sqp->ud_header.eth.dmac_h, ah->av.eth.mac, 6);
+#ifdef __linux__
+ smac = to_mdev(sqp->qp.ibqp.device)->iboe.netdevs[sqp->qp.port - 1]->dev_addr; /* fixme: cache this value */
+#else
+ smac = IF_LLADDR(to_mdev(sqp->qp.ibqp.device)->iboe.netdevs[sqp->qp.port - 1]); /* fixme: cache this value */
+#endif
+ memcpy(sqp->ud_header.eth.smac_h, smac, 6);
+ if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6))
+ mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK);
+ if (!is_vlan)
+ sqp->ud_header.eth.type = cpu_to_be16(MLX4_IBOE_ETHERTYPE);
+ else {
+ u16 pcp;
+
+ sqp->ud_header.vlan.type = cpu_to_be16(MLX4_IBOE_ETHERTYPE);
+ pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 27 & 3) << 13;
+ sqp->ud_header.vlan.tag = cpu_to_be16(vlan | pcp);
+ }
+ } else {
+ sqp->ud_header.lrh.virtual_lane = !sqp->qp.ibqp.qp_num ? 15 : 0;
+ if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE)
+ sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE;
+ }
+ sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED);
+ if (!sqp->qp.ibqp.qp_num)
+ ib_get_cached_pkey(ib_dev, sqp->qp.port, sqp->pkey_index, &pkey);
+ else
+ ib_get_cached_pkey(ib_dev, sqp->qp.port, wr->wr.ud.pkey_index, &pkey);
+ sqp->ud_header.bth.pkey = cpu_to_be16(pkey);
+ sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+ sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
+ sqp->ud_header.deth.qkey = cpu_to_be32(wr->wr.ud.remote_qkey & 0x80000000 ?
+ sqp->qkey : wr->wr.ud.remote_qkey);
+ sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num);
+
+ header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf);
+
+ if (0) {
+ printk(KERN_ERR "built UD header of size %d:\n", header_size);
+ for (i = 0; i < header_size / 4; ++i) {
+ if (i % 8 == 0)
+ printk(" [%02x] ", i * 4);
+ printk(" %08x",
+ be32_to_cpu(((__be32 *) sqp->header_buf)[i]));
+ if ((i + 1) % 8 == 0)
+ printk("\n");
+ }
+ printk("\n");
+ }
+
+ /*
+ * Inline data segments may not cross a 64 byte boundary. If
+ * our UD header is bigger than the space available up to the
+ * next 64 byte boundary in the WQE, use two inline data
+ * segments to hold the UD header.
+ */
+ spc = MLX4_INLINE_ALIGN -
+ ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
+ if (header_size <= spc) {
+ inl->byte_count = cpu_to_be32(1 << 31 | header_size);
+ memcpy(inl + 1, sqp->header_buf, header_size);
+ i = 1;
+ } else {
+ inl->byte_count = cpu_to_be32(1 << 31 | spc);
+ memcpy(inl + 1, sqp->header_buf, spc);
+
+ inl = (void *) (inl + 1) + spc;
+ memcpy(inl + 1, sqp->header_buf + spc, header_size - spc);
+ /*
+ * Need a barrier here to make sure all the data is
+ * visible before the byte_count field is set.
+ * Otherwise the HCA prefetcher could grab the 64-byte
+ * chunk with this inline segment and get a valid (!=
+ * 0xffffffff) byte count but stale data, and end up
+ * generating a packet with bad headers.
+ *
+ * The first inline segment's byte_count field doesn't
+ * need a barrier, because it comes after a
+ * control/MLX segment and therefore is at an offset
+ * of 16 mod 64.
+ */
+ wmb();
+ inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc));
+ i = 2;
+ }
+
+ *mlx_seg_len =
+ ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16);
+ return 0;
+}
+
+static int mlx4_wq_overflow(struct mlx4_ib_wq *wq, int nreq, struct ib_cq *ib_cq)
+{
+ unsigned cur;
+ struct mlx4_ib_cq *cq;
+
+ cur = wq->head - wq->tail;
+ if (likely(cur + nreq < wq->max_post))
+ return 0;
+
+ cq = to_mcq(ib_cq);
+ spin_lock(&cq->lock);
+ cur = wq->head - wq->tail;
+ spin_unlock(&cq->lock);
+
+ return cur + nreq >= wq->max_post;
+}
+
+static __be32 convert_access(int acc)
+{
+ return (acc & IB_ACCESS_REMOTE_ATOMIC ? cpu_to_be32(MLX4_WQE_FMR_PERM_ATOMIC) : 0) |
+ (acc & IB_ACCESS_REMOTE_WRITE ? cpu_to_be32(MLX4_WQE_FMR_PERM_REMOTE_WRITE) : 0) |
+ (acc & IB_ACCESS_REMOTE_READ ? cpu_to_be32(MLX4_WQE_FMR_PERM_REMOTE_READ) : 0) |
+ (acc & IB_ACCESS_LOCAL_WRITE ? cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_WRITE) : 0) |
+ cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_READ);
+}
+
+static void set_fmr_seg(struct mlx4_wqe_fmr_seg *fseg, struct ib_send_wr *wr)
+{
+ struct mlx4_ib_fast_reg_page_list *mfrpl = to_mfrpl(wr->wr.fast_reg.page_list);
+ int i;
+
+ for (i = 0; i < wr->wr.fast_reg.page_list_len; ++i)
+ mfrpl->mapped_page_list[i] =
+ cpu_to_be64(wr->wr.fast_reg.page_list->page_list[i] |
+ MLX4_MTT_FLAG_PRESENT);
+
+ fseg->flags = convert_access(wr->wr.fast_reg.access_flags);
+ fseg->mem_key = cpu_to_be32(wr->wr.fast_reg.rkey);
+ fseg->buf_list = cpu_to_be64(mfrpl->map);
+ fseg->start_addr = cpu_to_be64(wr->wr.fast_reg.iova_start);
+ fseg->reg_len = cpu_to_be64(wr->wr.fast_reg.length);
+ fseg->offset = 0; /* XXX -- is this just for ZBVA? */
+ fseg->page_size = cpu_to_be32(wr->wr.fast_reg.page_shift);
+ fseg->reserved[0] = 0;
+ fseg->reserved[1] = 0;
+}
+
+static void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg, u32 rkey)
+{
+ iseg->flags = 0;
+ iseg->mem_key = cpu_to_be32(rkey);
+ iseg->guest_id = 0;
+ iseg->pa = 0;
+}
+
+static __always_inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg,
+ u64 remote_addr, u32 rkey)
+{
+ rseg->raddr = cpu_to_be64(remote_addr);
+ rseg->rkey = cpu_to_be32(rkey);
+ rseg->reserved = 0;
+}
+
+static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, struct ib_send_wr *wr)
+{
+ if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
+ aseg->swap_add = cpu_to_be64(wr->wr.atomic.swap);
+ aseg->compare = cpu_to_be64(wr->wr.atomic.compare_add);
+ } else if (wr->opcode == IB_WR_MASKED_ATOMIC_FETCH_AND_ADD) {
+ aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add);
+ aseg->compare = cpu_to_be64(wr->wr.atomic.compare_add_mask);
+ } else {
+ aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add);
+ aseg->compare = 0;
+ }
+
+}
+
+static void set_masked_atomic_seg(struct mlx4_wqe_masked_atomic_seg *aseg,
+ struct ib_send_wr *wr)
+{
+ aseg->swap_add = cpu_to_be64(wr->wr.atomic.swap);
+ aseg->swap_add_mask = cpu_to_be64(wr->wr.atomic.swap_mask);
+ aseg->compare = cpu_to_be64(wr->wr.atomic.compare_add);
+ aseg->compare_mask = cpu_to_be64(wr->wr.atomic.compare_add_mask);
+}
+
+static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
+ struct ib_send_wr *wr, __be16 *vlan)
+{
+ memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av));
+ dseg->dqpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+ dseg->qkey = cpu_to_be32(wr->wr.ud.remote_qkey);
+ dseg->vlan = to_mah(wr->wr.ud.ah)->av.eth.vlan;
+ memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->av.eth.mac, 6);
+ *vlan = dseg->vlan;
+}
+
+static void set_mlx_icrc_seg(void *dseg)
+{
+ u32 *t = dseg;
+ struct mlx4_wqe_inline_seg *iseg = dseg;
+
+ t[1] = 0;
+
+ /*
+ * Need a barrier here before writing the byte_count field to
+ * make sure that all the data is visible before the
+ * byte_count field is set. Otherwise, if the segment begins
+ * a new cacheline, the HCA prefetcher could grab the 64-byte
+ * chunk and get a valid (!= * 0xffffffff) byte count but
+ * stale data, and end up sending the wrong data.
+ */
+ wmb();
+
+ iseg->byte_count = cpu_to_be32((1 << 31) | 4);
+}
+
+static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg)
+{
+ dseg->lkey = cpu_to_be32(sg->lkey);
+ dseg->addr = cpu_to_be64(sg->addr);
+
+ /*
+ * Need a barrier here before writing the byte_count field to
+ * make sure that all the data is visible before the
+ * byte_count field is set. Otherwise, if the segment begins
+ * a new cacheline, the HCA prefetcher could grab the 64-byte
+ * chunk and get a valid (!= * 0xffffffff) byte count but
+ * stale data, and end up sending the wrong data.
+ */
+ wmb();
+
+ dseg->byte_count = cpu_to_be32(sg->length);
+}
+
+static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg)
+{
+ dseg->byte_count = cpu_to_be32(sg->length);
+ dseg->lkey = cpu_to_be32(sg->lkey);
+ dseg->addr = cpu_to_be64(sg->addr);
+}
+
+static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, struct ib_send_wr *wr,
+ struct mlx4_ib_qp *qp, unsigned *lso_seg_len,
+ __be32 *lso_hdr_sz, int *blh)
+{
+ unsigned halign = ALIGN(sizeof *wqe + wr->wr.ud.hlen, 16);
+
+ *blh = unlikely(halign > 64) ? 1 : 0;
+
+ if (unlikely(!(qp->flags & MLX4_IB_QP_LSO) &&
+ wr->num_sge > qp->sq.max_gs - (halign >> 4)))
+ return -EINVAL;
+
+ memcpy(wqe->header, wr->wr.ud.header, wr->wr.ud.hlen);
+
+ *lso_hdr_sz = cpu_to_be32((wr->wr.ud.mss - wr->wr.ud.hlen) << 16 |
+ wr->wr.ud.hlen);
+ *lso_seg_len = halign;
+ return 0;
+}
+
+static __be32 send_ieth(struct ib_send_wr *wr)
+{
+ switch (wr->opcode) {
+ case IB_WR_SEND_WITH_IMM:
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ return wr->ex.imm_data;
+
+ case IB_WR_SEND_WITH_INV:
+ return cpu_to_be32(wr->ex.invalidate_rkey);
+
+ default:
+ return 0;
+ }
+}
+
+static int lay_inline_data(struct mlx4_ib_qp *qp, struct ib_send_wr *wr,
+ void *wqe, int *sz)
+{
+ struct mlx4_wqe_inline_seg *seg;
+ void *addr;
+ int len, seg_len;
+ int num_seg;
+ int off, to_copy;
+ int i;
+ int inl = 0;
+
+ seg = wqe;
+ wqe += sizeof *seg;
+ off = ((unsigned long)wqe) & (unsigned long)(MLX4_INLINE_ALIGN - 1);
+ num_seg = 0;
+ seg_len = 0;
+
+ for (i = 0; i < wr->num_sge; ++i) {
+ addr = (void *) (unsigned long)(wr->sg_list[i].addr);
+ len = wr->sg_list[i].length;
+ inl += len;
+
+ if (inl > qp->max_inline_data) {
+ inl = 0;
+ return -1;
+ }
+
+ while (len >= MLX4_INLINE_ALIGN - off) {
+ to_copy = MLX4_INLINE_ALIGN - off;
+ memcpy(wqe, addr, to_copy);
+ len -= to_copy;
+ wqe += to_copy;
+ addr += to_copy;
+ seg_len += to_copy;
+ wmb(); /* see comment below */
+ seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
+ seg_len = 0;
+ seg = wqe;
+ wqe += sizeof *seg;
+ off = sizeof *seg;
+ ++num_seg;
+ }
+
+ memcpy(wqe, addr, len);
+ wqe += len;
+ seg_len += len;
+ off += len;
+ }
+
+ if (seg_len) {
+ ++num_seg;
+ /*
+ * Need a barrier here to make sure
+ * all the data is visible before the
+ * byte_count field is set. Otherwise
+ * the HCA prefetcher could grab the
+ * 64-byte chunk with this inline
+ * segment and get a valid (!=
+ * 0xffffffff) byte count but stale
+ * data, and end up sending the wrong
+ * data.
+ */
+ wmb();
+ seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
+ }
+
+ *sz = (inl + num_seg * sizeof *seg + 15) / 16;
+
+ return 0;
+}
+
+/*
+ * Avoid using memcpy() to copy to BlueFlame page, since memcpy()
+ * implementations may use move-string-buffer assembler instructions,
+ * which do not guarantee order of copying.
+ */
+static void mlx4_bf_copy(unsigned long *dst, unsigned long *src, unsigned bytecnt)
+{
+ __iowrite64_copy(dst, src, bytecnt / 8);
+}
+
+int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr)
+{
+ struct mlx4_ib_qp *qp = to_mqp(ibqp);
+ void *wqe;
+ struct mlx4_wqe_ctrl_seg *ctrl;
+ struct mlx4_wqe_data_seg *dseg;
+ unsigned long flags;
+ int nreq;
+ int err = 0;
+ unsigned ind;
+ int uninitialized_var(stamp);
+ int uninitialized_var(size);
+ unsigned uninitialized_var(seglen);
+ __be32 dummy;
+ __be32 *lso_wqe;
+ __be32 uninitialized_var(lso_hdr_sz);
+ int i;
+ int blh = 0;
+ __be16 vlan = 0;
+ int inl = 0;
+
+ ctrl = NULL;
+ spin_lock_irqsave(&qp->sq.lock, flags);
+
+ ind = qp->sq_next_wqe;
+
+ for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ lso_wqe = &dummy;
+
+ if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
+ mlx4_ib_dbg("QP 0x%x: WQE overflow", ibqp->qp_num);
+ err = -ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ if (unlikely(wr->num_sge > qp->sq.max_gs)) {
+ mlx4_ib_dbg("QP 0x%x: too many sg entries (%d)",
+ ibqp->qp_num, wr->num_sge);
+ err = -EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
+ *((u32 *) (&ctrl->vlan_tag)) = 0;
+ qp->sq.wrid[(qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
+
+ ctrl->srcrb_flags =
+ (wr->send_flags & IB_SEND_SIGNALED ?
+ cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
+ (wr->send_flags & IB_SEND_SOLICITED ?
+ cpu_to_be32(MLX4_WQE_CTRL_SOLICITED) : 0) |
+ ((wr->send_flags & IB_SEND_IP_CSUM) ?
+ cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM |
+ MLX4_WQE_CTRL_TCP_UDP_CSUM) : 0) |
+ qp->sq_signal_bits;
+
+ ctrl->imm = send_ieth(wr);
+
+ wqe += sizeof *ctrl;
+ size = sizeof *ctrl / 16;
+
+ switch (ibqp->qp_type) {
+ case IB_QPT_XRC:
+ ctrl->srcrb_flags |=
+ cpu_to_be32(wr->xrc_remote_srq_num << 8);
+ /* fall thru */
+ case IB_QPT_RC:
+ case IB_QPT_UC:
+ switch (wr->opcode) {
+ case IB_WR_ATOMIC_CMP_AND_SWP:
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
+ case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD:
+ set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
+ wr->wr.atomic.rkey);
+ wqe += sizeof (struct mlx4_wqe_raddr_seg);
+
+ set_atomic_seg(wqe, wr);
+ wqe += sizeof (struct mlx4_wqe_atomic_seg);
+
+ size += (sizeof (struct mlx4_wqe_raddr_seg) +
+ sizeof (struct mlx4_wqe_atomic_seg)) / 16;
+
+ break;
+
+ case IB_WR_MASKED_ATOMIC_CMP_AND_SWP:
+ set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
+ wr->wr.atomic.rkey);
+ wqe += sizeof (struct mlx4_wqe_raddr_seg);
+
+ set_masked_atomic_seg(wqe, wr);
+ wqe += sizeof (struct mlx4_wqe_masked_atomic_seg);
+
+ size += (sizeof (struct mlx4_wqe_raddr_seg) +
+ sizeof (struct mlx4_wqe_masked_atomic_seg)) / 16;
+
+ break;
+
+ case IB_WR_RDMA_READ:
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
+ wr->wr.rdma.rkey);
+ wqe += sizeof (struct mlx4_wqe_raddr_seg);
+ size += sizeof (struct mlx4_wqe_raddr_seg) / 16;
+ break;
+
+ case IB_WR_LOCAL_INV:
+ ctrl->srcrb_flags |=
+ cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER);
+ set_local_inv_seg(wqe, wr->ex.invalidate_rkey);
+ wqe += sizeof (struct mlx4_wqe_local_inval_seg);
+ size += sizeof (struct mlx4_wqe_local_inval_seg) / 16;
+ break;
+
+ case IB_WR_FAST_REG_MR:
+ ctrl->srcrb_flags |=
+ cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER);
+ set_fmr_seg(wqe, wr);
+ wqe += sizeof (struct mlx4_wqe_fmr_seg);
+ size += sizeof (struct mlx4_wqe_fmr_seg) / 16;
+ break;
+
+ default:
+ /* No extra segments required for sends */
+ break;
+ }
+ break;
+
+ case IB_QPT_UD:
+ set_datagram_seg(wqe, wr, &vlan);
+ wqe += sizeof (struct mlx4_wqe_datagram_seg);
+ size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
+
+ if (wr->opcode == IB_WR_LSO) {
+ err = build_lso_seg(wqe, wr, qp, &seglen, &lso_hdr_sz, &blh);
+ if (unlikely(err)) {
+ *bad_wr = wr;
+ goto out;
+ }
+ lso_wqe = (__be32 *) wqe;
+ wqe += seglen;
+ size += seglen / 16;
+ }
+ break;
+
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ err = build_mlx_header(to_msqp(qp), wr, ctrl, &seglen);
+ if (unlikely(err)) {
+ *bad_wr = wr;
+ goto out;
+ }
+ wqe += seglen;
+ size += seglen / 16;
+ break;
+
+ case IB_QPT_RAW_ETY:
+ err = build_raw_ety_header(to_msqp(qp), wr, ctrl,
+ &seglen);
+ if (unlikely(err)) {
+ *bad_wr = wr;
+ goto out;
+ }
+ wqe += seglen;
+ size += seglen / 16;
+ break;
+
+ default:
+ break;
+ }
+
+ /*
+ * Write data segments in reverse order, so as to
+ * overwrite cacheline stamp last within each
+ * cacheline. This avoids issues with WQE
+ * prefetching.
+ */
+
+ dseg = wqe;
+ dseg += wr->num_sge - 1;
+
+ /* Add one more inline data segment for ICRC for MLX sends */
+ if (unlikely(qp->ibqp.qp_type == IB_QPT_SMI ||
+ qp->ibqp.qp_type == IB_QPT_GSI)) {
+ set_mlx_icrc_seg(dseg + 1);
+ size += sizeof (struct mlx4_wqe_data_seg) / 16;
+ }
+
+ if (wr->send_flags & IB_SEND_INLINE && wr->num_sge) {
+ int sz;
+ err = lay_inline_data(qp, wr, wqe, &sz);
+ if (!err) {
+ inl = 1;
+ size += sz;
+ }
+ } else {
+ size += wr->num_sge * (sizeof (struct mlx4_wqe_data_seg) / 16);
+ for (i = wr->num_sge - 1; i >= 0; --i, --dseg)
+ set_data_seg(dseg, wr->sg_list + i);
+ }
+
+ /*
+ * Possibly overwrite stamping in cacheline with LSO
+ * segment only after making sure all data segments
+ * are written.
+ */
+ wmb();
+ *lso_wqe = lso_hdr_sz;
+
+ ctrl->fence_size = (wr->send_flags & IB_SEND_FENCE ?
+ MLX4_WQE_CTRL_FENCE : 0) | size;
+
+ if (vlan) {
+ ctrl->ins_vlan = 1 << 6;
+ ctrl->vlan_tag = vlan;
+ }
+
+ /*
+ * Make sure descriptor is fully written before
+ * setting ownership bit (because HW can start
+ * executing as soon as we do).
+ */
+ wmb();
+
+ if (wr->opcode < 0 || wr->opcode >= ARRAY_SIZE(mlx4_ib_opcode)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] |
+ (ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0) |
+ (blh ? cpu_to_be32(1 << 6) : 0);
+
+ stamp = ind + qp->sq_spare_wqes;
+ ind += DIV_ROUND_UP(size * 16, 1U << qp->sq.wqe_shift);
+
+ /*
+ * We can improve latency by not stamping the last
+ * send queue WQE until after ringing the doorbell, so
+ * only stamp here if there are still more WQEs to post.
+ *
+ * Same optimization applies to padding with NOP wqe
+ * in case of WQE shrinking (used to prevent wrap-around
+ * in the middle of WR).
+ */
+ if (wr->next) {
+ stamp_send_wqe(qp, stamp, size * 16);
+ ind = pad_wraparound(qp, ind);
+ }
+ }
+
+out:
+ if (nreq == 1 && inl && size > 1 && size < qp->bf.buf_size / 16) {
+ ctrl->owner_opcode |= htonl((qp->sq_next_wqe & 0xffff) << 8);
+ *(u32 *) (&ctrl->vlan_tag) |= qp->doorbell_qpn;
+ /*
+ * Make sure that descriptor is written to memory
+ * before writing to BlueFlame page.
+ */
+ wmb();
+
+ ++qp->sq.head;
+
+ mlx4_bf_copy(qp->bf.reg + qp->bf.offset, (unsigned long *) ctrl,
+ ALIGN(size * 16, 64));
+ wc_wmb();
+
+ qp->bf.offset ^= qp->bf.buf_size;
+
+ } else if (nreq) {
+ qp->sq.head += nreq;
+
+ /*
+ * Make sure that descriptors are written before
+ * doorbell record.
+ */
+ wmb();
+
+ writel(qp->doorbell_qpn, qp->bf.uar->map + MLX4_SEND_DOORBELL);
+
+ /*
+ * Make sure doorbells don't leak out of SQ spinlock
+ * and reach the HCA out of order.
+ */
+ mmiowb();
+
+ }
+
+ if (likely(nreq)) {
+ stamp_send_wqe(qp, stamp, size * 16);
+ ind = pad_wraparound(qp, ind);
+ qp->sq_next_wqe = ind;
+ }
+
+ spin_unlock_irqrestore(&qp->sq.lock, flags);
+
+ return err;
+}
+
+int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr)
+{
+ struct mlx4_ib_qp *qp = to_mqp(ibqp);
+ struct mlx4_wqe_data_seg *scat;
+ unsigned long flags;
+ int err = 0;
+ int nreq;
+ int ind;
+ int i;
+
+ spin_lock_irqsave(&qp->rq.lock, flags);
+
+ ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
+
+ for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ if (mlx4_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) {
+ mlx4_ib_dbg("QP 0x%x: WQE overflow", ibqp->qp_num);
+ err = -ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ if (unlikely(wr->num_sge > qp->rq.max_gs)) {
+ mlx4_ib_dbg("QP 0x%x: too many sg entries (%d)",
+ ibqp->qp_num, wr->num_sge);
+ err = -EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ scat = get_recv_wqe(qp, ind);
+
+ for (i = 0; i < wr->num_sge; ++i)
+ __set_data_seg(scat + i, wr->sg_list + i);
+
+ if (i < qp->rq.max_gs) {
+ scat[i].byte_count = 0;
+ scat[i].lkey = cpu_to_be32(MLX4_INVALID_LKEY);
+ scat[i].addr = 0;
+ }
+
+ qp->rq.wrid[ind] = wr->wr_id;
+
+ ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
+ }
+
+out:
+ if (likely(nreq)) {
+ qp->rq.head += nreq;
+
+ /*
+ * Make sure that descriptors are written before
+ * doorbell record.
+ */
+ wmb();
+
+ *qp->db.db = cpu_to_be32(qp->rq.head & 0xffff);
+ }
+
+ spin_unlock_irqrestore(&qp->rq.lock, flags);
+
+ return err;
+}
+
+static inline enum ib_qp_state to_ib_qp_state(enum mlx4_qp_state mlx4_state)
+{
+ switch (mlx4_state) {
+ case MLX4_QP_STATE_RST: return IB_QPS_RESET;
+ case MLX4_QP_STATE_INIT: return IB_QPS_INIT;
+ case MLX4_QP_STATE_RTR: return IB_QPS_RTR;
+ case MLX4_QP_STATE_RTS: return IB_QPS_RTS;
+ case MLX4_QP_STATE_SQ_DRAINING:
+ case MLX4_QP_STATE_SQD: return IB_QPS_SQD;
+ case MLX4_QP_STATE_SQER: return IB_QPS_SQE;
+ case MLX4_QP_STATE_ERR: return IB_QPS_ERR;
+ default: return -1;
+ }
+}
+
+static inline enum ib_mig_state to_ib_mig_state(int mlx4_mig_state)
+{
+ switch (mlx4_mig_state) {
+ case MLX4_QP_PM_ARMED: return IB_MIG_ARMED;
+ case MLX4_QP_PM_REARM: return IB_MIG_REARM;
+ case MLX4_QP_PM_MIGRATED: return IB_MIG_MIGRATED;
+ default: return -1;
+ }
+}
+
+static int to_ib_qp_access_flags(int mlx4_flags)
+{
+ int ib_flags = 0;
+
+ if (mlx4_flags & MLX4_QP_BIT_RRE)
+ ib_flags |= IB_ACCESS_REMOTE_READ;
+ if (mlx4_flags & MLX4_QP_BIT_RWE)
+ ib_flags |= IB_ACCESS_REMOTE_WRITE;
+ if (mlx4_flags & MLX4_QP_BIT_RAE)
+ ib_flags |= IB_ACCESS_REMOTE_ATOMIC;
+
+ return ib_flags;
+}
+
+static void to_ib_ah_attr(struct mlx4_ib_dev *ib_dev, struct ib_ah_attr *ib_ah_attr,
+ struct mlx4_qp_path *path)
+{
+ struct mlx4_dev *dev = ib_dev->dev;
+ int is_eth;
+
+ memset(ib_ah_attr, 0, sizeof *ib_ah_attr);
+ ib_ah_attr->port_num = path->sched_queue & 0x40 ? 2 : 1;
+
+ if (ib_ah_attr->port_num == 0 || ib_ah_attr->port_num > dev->caps.num_ports)
+ return;
+
+ is_eth = rdma_port_get_link_layer(&ib_dev->ib_dev, ib_ah_attr->port_num) ==
+ IB_LINK_LAYER_ETHERNET;
+ if (is_eth)
+ ib_ah_attr->sl = ((path->sched_queue >> 3) & 0x7) |
+ ((path->sched_queue & 4) << 1);
+ else
+ ib_ah_attr->sl = (path->sched_queue >> 2) & 0xf;
+
+ ib_ah_attr->dlid = be16_to_cpu(path->rlid);
+
+ ib_ah_attr->src_path_bits = path->grh_mylmc & 0x7f;
+ ib_ah_attr->static_rate = path->static_rate ? path->static_rate - 5 : 0;
+ ib_ah_attr->ah_flags = (path->grh_mylmc & (1 << 7)) ? IB_AH_GRH : 0;
+ if (ib_ah_attr->ah_flags) {
+ ib_ah_attr->grh.sgid_index = path->mgid_index;
+ ib_ah_attr->grh.hop_limit = path->hop_limit;
+ ib_ah_attr->grh.traffic_class =
+ (be32_to_cpu(path->tclass_flowlabel) >> 20) & 0xff;
+ ib_ah_attr->grh.flow_label =
+ be32_to_cpu(path->tclass_flowlabel) & 0xfffff;
+ memcpy(ib_ah_attr->grh.dgid.raw,
+ path->rgid, sizeof ib_ah_attr->grh.dgid.raw);
+ }
+}
+
+int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask,
+ struct ib_qp_init_attr *qp_init_attr)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
+ struct mlx4_ib_qp *qp = to_mqp(ibqp);
+ struct mlx4_qp_context context;
+ int mlx4_state;
+ int err = 0;
+
+ mutex_lock(&qp->mutex);
+
+ if (qp->state == IB_QPS_RESET) {
+ qp_attr->qp_state = IB_QPS_RESET;
+ goto done;
+ }
+
+ err = mlx4_qp_query(dev->dev, &qp->mqp, &context);
+ if (err) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ mlx4_state = be32_to_cpu(context.flags) >> 28;
+
+ qp->state = to_ib_qp_state(mlx4_state);
+ qp_attr->qp_state = qp->state;
+ qp_attr->path_mtu = context.mtu_msgmax >> 5;
+ qp_attr->path_mig_state =
+ to_ib_mig_state((be32_to_cpu(context.flags) >> 11) & 0x3);
+ qp_attr->qkey = be32_to_cpu(context.qkey);
+ qp_attr->rq_psn = be32_to_cpu(context.rnr_nextrecvpsn) & 0xffffff;
+ qp_attr->sq_psn = be32_to_cpu(context.next_send_psn) & 0xffffff;
+ qp_attr->dest_qp_num = be32_to_cpu(context.remote_qpn) & 0xffffff;
+ qp_attr->qp_access_flags =
+ to_ib_qp_access_flags(be32_to_cpu(context.params2));
+
+ if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC ||
+ qp->ibqp.qp_type == IB_QPT_XRC) {
+ to_ib_ah_attr(dev, &qp_attr->ah_attr, &context.pri_path);
+ to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context.alt_path);
+ qp_attr->alt_pkey_index = context.alt_path.pkey_index & 0x7f;
+ qp_attr->alt_port_num = qp_attr->alt_ah_attr.port_num;
+ }
+
+ qp_attr->pkey_index = context.pri_path.pkey_index & 0x7f;
+ if (qp_attr->qp_state == IB_QPS_INIT)
+ qp_attr->port_num = qp->port;
+ else
+ qp_attr->port_num = context.pri_path.sched_queue & 0x40 ? 2 : 1;
+
+ /* qp_attr->en_sqd_async_notify is only applicable in modify qp */
+ qp_attr->sq_draining = mlx4_state == MLX4_QP_STATE_SQ_DRAINING;
+
+ qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context.params1) >> 21) & 0x7);
+
+ qp_attr->max_dest_rd_atomic =
+ 1 << ((be32_to_cpu(context.params2) >> 21) & 0x7);
+ qp_attr->min_rnr_timer =
+ (be32_to_cpu(context.rnr_nextrecvpsn) >> 24) & 0x1f;
+ qp_attr->timeout = context.pri_path.ackto >> 3;
+ qp_attr->retry_cnt = (be32_to_cpu(context.params1) >> 16) & 0x7;
+ qp_attr->rnr_retry = (be32_to_cpu(context.params1) >> 13) & 0x7;
+ qp_attr->alt_timeout = context.alt_path.ackto >> 3;
+
+done:
+ qp_attr->cur_qp_state = qp_attr->qp_state;
+ qp_attr->cap.max_recv_wr = qp->rq.wqe_cnt;
+ qp_attr->cap.max_recv_sge = qp->rq.max_gs;
+
+ if (!ibqp->uobject) {
+ qp_attr->cap.max_send_wr = qp->sq.wqe_cnt;
+ qp_attr->cap.max_send_sge = qp->sq.max_gs;
+ } else {
+ qp_attr->cap.max_send_wr = 0;
+ qp_attr->cap.max_send_sge = 0;
+ }
+
+ /*
+ * We don't support inline sends for kernel QPs (yet), and we
+ * don't know what userspace's value should be.
+ */
+ qp_attr->cap.max_inline_data = 0;
+
+ qp_init_attr->cap = qp_attr->cap;
+
+ qp_init_attr->create_flags = 0;
+ if (qp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK)
+ qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK;
+
+ if (qp->flags & MLX4_IB_QP_LSO)
+ qp_init_attr->create_flags |= IB_QP_CREATE_IPOIB_UD_LSO;
+
+out:
+ mutex_unlock(&qp->mutex);
+ return err;
+}
+
+int mlx4_ib_create_xrc_rcv_qp(struct ib_qp_init_attr *init_attr,
+ u32 *qp_num)
+{
+ struct mlx4_ib_dev *dev = to_mdev(init_attr->xrc_domain->device);
+ struct mlx4_ib_xrcd *xrcd = to_mxrcd(init_attr->xrc_domain);
+ struct mlx4_ib_qp *qp;
+ struct ib_qp *ibqp;
+ struct mlx4_ib_xrc_reg_entry *ctx_entry;
+ unsigned long flags;
+ int err;
+
+ if (!(dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC))
+ return -ENOSYS;
+
+ if (init_attr->qp_type != IB_QPT_XRC)
+ return -EINVAL;
+
+ ctx_entry = kmalloc(sizeof *ctx_entry, GFP_KERNEL);
+ if (!ctx_entry)
+ return -ENOMEM;
+
+ qp = kzalloc(sizeof *qp, GFP_KERNEL);
+ if (!qp) {
+ kfree(ctx_entry);
+ return -ENOMEM;
+ }
+ mutex_lock(&dev->xrc_reg_mutex);
+ qp->flags = MLX4_IB_XRC_RCV;
+ qp->xrcdn = to_mxrcd(init_attr->xrc_domain)->xrcdn;
+ INIT_LIST_HEAD(&qp->xrc_reg_list);
+ err = create_qp_common(dev, xrcd->pd, init_attr, NULL, 0, qp);
+ if (err) {
+ mutex_unlock(&dev->xrc_reg_mutex);
+ kfree(ctx_entry);
+ kfree(qp);
+ return err;
+ }
+
+ ibqp = &qp->ibqp;
+ /* set the ibpq attributes which will be used by the mlx4 module */
+ ibqp->qp_num = qp->mqp.qpn;
+ ibqp->device = init_attr->xrc_domain->device;
+ ibqp->pd = xrcd->pd;
+ ibqp->send_cq = ibqp->recv_cq = xrcd->cq;
+ ibqp->event_handler = init_attr->event_handler;
+ ibqp->qp_context = init_attr->qp_context;
+ ibqp->qp_type = init_attr->qp_type;
+ ibqp->xrcd = init_attr->xrc_domain;
+
+ mutex_lock(&qp->mutex);
+ ctx_entry->context = init_attr->qp_context;
+ spin_lock_irqsave(&qp->xrc_reg_list_lock, flags);
+ list_add_tail(&ctx_entry->list, &qp->xrc_reg_list);
+ spin_unlock_irqrestore(&qp->xrc_reg_list_lock, flags);
+ mutex_unlock(&qp->mutex);
+ mutex_unlock(&dev->xrc_reg_mutex);
+ *qp_num = qp->mqp.qpn;
+ return 0;
+}
+
+int mlx4_ib_modify_xrc_rcv_qp(struct ib_xrcd *ibxrcd, u32 qp_num,
+ struct ib_qp_attr *attr, int attr_mask)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibxrcd->device);
+ struct mlx4_ib_xrcd *xrcd = to_mxrcd(ibxrcd);
+ struct mlx4_qp *mqp;
+ struct mlx4_ib_qp *mibqp;
+ int err = -EINVAL;
+
+ if (!(dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC))
+ return -ENOSYS;
+
+ mutex_lock(&dev->xrc_reg_mutex);
+ mqp = mlx4_qp_lookup_lock(dev->dev, qp_num);
+ if (unlikely(!mqp)) {
+ printk(KERN_WARNING "mlx4_ib_reg_xrc_rcv_qp: "
+ "unknown QPN %06x\n", qp_num);
+ goto err_out;
+ }
+
+ mibqp = to_mibqp(mqp);
+
+ if (!(mibqp->flags & MLX4_IB_XRC_RCV) || !mibqp->ibqp.xrcd ||
+ xrcd->xrcdn != to_mxrcd(mibqp->ibqp.xrcd)->xrcdn)
+ goto err_out;
+
+ err = mlx4_ib_modify_qp(&mibqp->ibqp, attr, attr_mask, NULL);
+ mutex_unlock(&dev->xrc_reg_mutex);
+ return err;
+
+err_out:
+ mutex_unlock(&dev->xrc_reg_mutex);
+ return err;
+}
+
+int mlx4_ib_query_xrc_rcv_qp(struct ib_xrcd *ibxrcd, u32 qp_num,
+ struct ib_qp_attr *qp_attr, int qp_attr_mask,
+ struct ib_qp_init_attr *qp_init_attr)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibxrcd->device);
+ struct mlx4_ib_xrcd *xrcd = to_mxrcd(ibxrcd);
+ struct mlx4_ib_qp *qp;
+ struct mlx4_qp *mqp;
+ struct mlx4_qp_context context;
+ int mlx4_state;
+ int err = -EINVAL;
+
+ if (!(dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC))
+ return -ENOSYS;
+
+ mutex_lock(&dev->xrc_reg_mutex);
+ mqp = mlx4_qp_lookup_lock(dev->dev, qp_num);
+ if (unlikely(!mqp)) {
+ printk(KERN_WARNING "mlx4_ib_reg_xrc_rcv_qp: "
+ "unknown QPN %06x\n", qp_num);
+ goto err_out;
+ }
+
+ qp = to_mibqp(mqp);
+ if (!(qp->flags & MLX4_IB_XRC_RCV) || !(qp->ibqp.xrcd) ||
+ xrcd->xrcdn != to_mxrcd(qp->ibqp.xrcd)->xrcdn)
+ goto err_out;
+
+ if (qp->state == IB_QPS_RESET) {
+ qp_attr->qp_state = IB_QPS_RESET;
+ goto done;
+ }
+
+ err = mlx4_qp_query(dev->dev, mqp, &context);
+ if (err)
+ goto err_out;
+
+ mlx4_state = be32_to_cpu(context.flags) >> 28;
+
+ qp_attr->qp_state = to_ib_qp_state(mlx4_state);
+ qp_attr->path_mtu = context.mtu_msgmax >> 5;
+ qp_attr->path_mig_state =
+ to_ib_mig_state((be32_to_cpu(context.flags) >> 11) & 0x3);
+ qp_attr->qkey = be32_to_cpu(context.qkey);
+ qp_attr->rq_psn = be32_to_cpu(context.rnr_nextrecvpsn) & 0xffffff;
+ qp_attr->sq_psn = be32_to_cpu(context.next_send_psn) & 0xffffff;
+ qp_attr->dest_qp_num = be32_to_cpu(context.remote_qpn) & 0xffffff;
+ qp_attr->qp_access_flags =
+ to_ib_qp_access_flags(be32_to_cpu(context.params2));
+
+ if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC ||
+ qp->ibqp.qp_type == IB_QPT_XRC) {
+ to_ib_ah_attr(dev, &qp_attr->ah_attr, &context.pri_path);
+ to_ib_ah_attr(dev, &qp_attr->alt_ah_attr,
+ &context.alt_path);
+ qp_attr->alt_pkey_index = context.alt_path.pkey_index & 0x7f;
+ qp_attr->alt_port_num = qp_attr->alt_ah_attr.port_num;
+ }
+
+ qp_attr->pkey_index = context.pri_path.pkey_index & 0x7f;
+ if (qp_attr->qp_state == IB_QPS_INIT)
+ qp_attr->port_num = qp->port;
+ else
+ qp_attr->port_num = context.pri_path.sched_queue & 0x40 ? 2 : 1;
+
+ /* qp_attr->en_sqd_async_notify is only applicable in modify qp */
+ qp_attr->sq_draining = mlx4_state == MLX4_QP_STATE_SQ_DRAINING;
+
+ qp_attr->max_rd_atomic =
+ 1 << ((be32_to_cpu(context.params1) >> 21) & 0x7);
+
+ qp_attr->max_dest_rd_atomic =
+ 1 << ((be32_to_cpu(context.params2) >> 21) & 0x7);
+ qp_attr->min_rnr_timer =
+ (be32_to_cpu(context.rnr_nextrecvpsn) >> 24) & 0x1f;
+ qp_attr->timeout = context.pri_path.ackto >> 3;
+ qp_attr->retry_cnt = (be32_to_cpu(context.params1) >> 16) & 0x7;
+ qp_attr->rnr_retry = (be32_to_cpu(context.params1) >> 13) & 0x7;
+ qp_attr->alt_timeout = context.alt_path.ackto >> 3;
+
+done:
+ qp_attr->cur_qp_state = qp_attr->qp_state;
+ qp_attr->cap.max_recv_wr = 0;
+ qp_attr->cap.max_recv_sge = 0;
+ qp_attr->cap.max_send_wr = 0;
+ qp_attr->cap.max_send_sge = 0;
+ qp_attr->cap.max_inline_data = 0;
+ qp_init_attr->cap = qp_attr->cap;
+
+ mutex_unlock(&dev->xrc_reg_mutex);
+ return 0;
+
+err_out:
+ mutex_unlock(&dev->xrc_reg_mutex);
+ return err;
+}
+
+int mlx4_ib_reg_xrc_rcv_qp(struct ib_xrcd *xrcd, void *context, u32 qp_num)
+{
+
+ struct mlx4_ib_xrcd *mxrcd = to_mxrcd(xrcd);
+
+ struct mlx4_qp *mqp;
+ struct mlx4_ib_qp *mibqp;
+ struct mlx4_ib_xrc_reg_entry *ctx_entry, *tmp;
+ unsigned long flags;
+ int err = -EINVAL;
+
+ mutex_lock(&to_mdev(xrcd->device)->xrc_reg_mutex);
+ mqp = mlx4_qp_lookup_lock(to_mdev(xrcd->device)->dev, qp_num);
+ if (unlikely(!mqp)) {
+ printk(KERN_WARNING "mlx4_ib_reg_xrc_rcv_qp: "
+ "unknown QPN %06x\n", qp_num);
+ goto err_out;
+ }
+
+ mibqp = to_mibqp(mqp);
+
+ if (!(mibqp->flags & MLX4_IB_XRC_RCV) || !(mibqp->ibqp.xrcd) ||
+ mxrcd->xrcdn != to_mxrcd(mibqp->ibqp.xrcd)->xrcdn)
+ goto err_out;
+
+ ctx_entry = kmalloc(sizeof *ctx_entry, GFP_KERNEL);
+ if (!ctx_entry) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+
+ mutex_lock(&mibqp->mutex);
+ list_for_each_entry(tmp, &mibqp->xrc_reg_list, list)
+ if (tmp->context == context) {
+ mutex_unlock(&mibqp->mutex);
+ kfree(ctx_entry);
+ mutex_unlock(&to_mdev(xrcd->device)->xrc_reg_mutex);
+ return 0;
+ }
+
+ ctx_entry->context = context;
+ spin_lock_irqsave(&mibqp->xrc_reg_list_lock, flags);
+ list_add_tail(&ctx_entry->list, &mibqp->xrc_reg_list);
+ spin_unlock_irqrestore(&mibqp->xrc_reg_list_lock, flags);
+ mutex_unlock(&mibqp->mutex);
+ mutex_unlock(&to_mdev(xrcd->device)->xrc_reg_mutex);
+ return 0;
+
+err_out:
+ mutex_unlock(&to_mdev(xrcd->device)->xrc_reg_mutex);
+ return err;
+}
+
+int mlx4_ib_unreg_xrc_rcv_qp(struct ib_xrcd *xrcd, void *context, u32 qp_num)
+{
+
+ struct mlx4_ib_xrcd *mxrcd = to_mxrcd(xrcd);
+
+ struct mlx4_qp *mqp;
+ struct mlx4_ib_qp *mibqp;
+ struct mlx4_ib_xrc_reg_entry *ctx_entry, *tmp;
+ unsigned long flags;
+ int found = 0;
+ int err = -EINVAL;
+
+ mutex_lock(&to_mdev(xrcd->device)->xrc_reg_mutex);
+ mqp = mlx4_qp_lookup_lock(to_mdev(xrcd->device)->dev, qp_num);
+ if (unlikely(!mqp)) {
+ printk(KERN_WARNING "mlx4_ib_unreg_xrc_rcv_qp: "
+ "unknown QPN %06x\n", qp_num);
+ goto err_out;
+ }
+
+ mibqp = to_mibqp(mqp);
+
+ if (!(mibqp->flags & MLX4_IB_XRC_RCV) ||
+ mxrcd->xrcdn != (mibqp->xrcdn & 0xffff))
+ goto err_out;
+
+ mutex_lock(&mibqp->mutex);
+ spin_lock_irqsave(&mibqp->xrc_reg_list_lock, flags);
+ list_for_each_entry_safe(ctx_entry, tmp, &mibqp->xrc_reg_list, list)
+ if (ctx_entry->context == context) {
+ found = 1;
+ list_del(&ctx_entry->list);
+ spin_unlock_irqrestore(&mibqp->xrc_reg_list_lock, flags);
+ kfree(ctx_entry);
+ break;
+ }
+
+ if (!found)
+ spin_unlock_irqrestore(&mibqp->xrc_reg_list_lock, flags);
+ mutex_unlock(&mibqp->mutex);
+ if (!found)
+ goto err_out;
+
+ /* destroy the QP if the registration list is empty */
+ if (list_empty(&mibqp->xrc_reg_list))
+ mlx4_ib_destroy_qp(&mibqp->ibqp);
+
+ mutex_unlock(&to_mdev(xrcd->device)->xrc_reg_mutex);
+ return 0;
+
+err_out:
+ mutex_unlock(&to_mdev(xrcd->device)->xrc_reg_mutex);
+ return err;
+}
+
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/srq.c b/sys/ofed/drivers/infiniband/hw/mlx4/srq.c
new file mode 100644
index 0000000..90918c7
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/srq.c
@@ -0,0 +1,401 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx4/qp.h>
+#include <linux/mlx4/srq.h>
+
+#include "mlx4_ib.h"
+#include "user.h"
+
+static void *get_wqe(struct mlx4_ib_srq *srq, int n)
+{
+ return mlx4_buf_offset(&srq->buf, n << srq->msrq.wqe_shift);
+}
+
+static void mlx4_ib_srq_event(struct mlx4_srq *srq, enum mlx4_event type)
+{
+ struct ib_event event;
+ struct ib_srq *ibsrq = &to_mibsrq(srq)->ibsrq;
+
+ if (ibsrq->event_handler) {
+ event.device = ibsrq->device;
+ event.element.srq = ibsrq;
+ switch (type) {
+ case MLX4_EVENT_TYPE_SRQ_LIMIT:
+ event.event = IB_EVENT_SRQ_LIMIT_REACHED;
+ break;
+ case MLX4_EVENT_TYPE_SRQ_CATAS_ERROR:
+ event.event = IB_EVENT_SRQ_ERR;
+ break;
+ default:
+ printk(KERN_WARNING "mlx4_ib: Unexpected event type %d "
+ "on SRQ %06x\n", type, srq->srqn);
+ return;
+ }
+
+ ibsrq->event_handler(&event, ibsrq->srq_context);
+ }
+}
+
+struct ib_srq *mlx4_ib_create_xrc_srq(struct ib_pd *pd,
+ struct ib_cq *xrc_cq,
+ struct ib_xrcd *xrcd,
+ struct ib_srq_init_attr *init_attr,
+ struct ib_udata *udata)
+{
+ struct mlx4_ib_dev *dev = to_mdev(pd->device);
+ struct mlx4_ib_srq *srq;
+ struct mlx4_wqe_srq_next_seg *next;
+ u32 cqn;
+ u16 xrcdn;
+ int desc_size;
+ int buf_size;
+ int err;
+ int i;
+
+ /* Sanity check SRQ size before proceeding */
+ if (init_attr->attr.max_wr >= dev->dev->caps.max_srq_wqes ||
+ init_attr->attr.max_sge > dev->dev->caps.max_srq_sge) {
+ mlx4_ib_dbg("a size param is out of range. "
+ "max_wr = 0x%x, max_sge = 0x%x",
+ init_attr->attr.max_wr, init_attr->attr.max_sge);
+ return ERR_PTR(-EINVAL);
+ }
+
+ srq = kzalloc(sizeof *srq, GFP_KERNEL);
+ if (!srq)
+ return ERR_PTR(-ENOMEM);
+
+ mutex_init(&srq->mutex);
+ spin_lock_init(&srq->lock);
+ srq->msrq.max = roundup_pow_of_two(init_attr->attr.max_wr + 1);
+ srq->msrq.max_gs = init_attr->attr.max_sge;
+
+ desc_size = max(32UL,
+ roundup_pow_of_two(sizeof (struct mlx4_wqe_srq_next_seg) +
+ srq->msrq.max_gs *
+ sizeof (struct mlx4_wqe_data_seg)));
+ srq->msrq.wqe_shift = ilog2(desc_size);
+
+ buf_size = srq->msrq.max * desc_size;
+
+ if (pd->uobject) {
+ struct mlx4_ib_create_srq ucmd;
+
+ if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
+ err = -EFAULT;
+ goto err_srq;
+ }
+
+ srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr,
+ buf_size, 0, 0);
+ if (IS_ERR(srq->umem)) {
+ err = PTR_ERR(srq->umem);
+ goto err_srq;
+ }
+
+ err = mlx4_mtt_init(dev->dev, ib_umem_page_count(srq->umem),
+ ilog2(srq->umem->page_size), &srq->mtt);
+ if (err)
+ goto err_buf;
+
+ err = mlx4_ib_umem_write_mtt(dev, &srq->mtt, srq->umem);
+ if (err)
+ goto err_mtt;
+
+ err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context),
+ ucmd.db_addr, &srq->db);
+ if (err)
+ goto err_mtt;
+ } else {
+ struct mlx4_wqe_data_seg *scatter;
+
+ err = mlx4_db_alloc(dev->dev, &srq->db, 0);
+ if (err)
+ goto err_srq;
+
+ *srq->db.db = 0;
+
+ if (mlx4_buf_alloc(dev->dev, buf_size, PAGE_SIZE * 2, &srq->buf)) {
+ err = -ENOMEM;
+ goto err_db;
+ }
+
+ srq->head = 0;
+ srq->tail = srq->msrq.max - 1;
+ srq->wqe_ctr = 0;
+
+ for (i = 0; i < srq->msrq.max; ++i) {
+ next = get_wqe(srq, i);
+ next->next_wqe_index =
+ cpu_to_be16((i + 1) & (srq->msrq.max - 1));
+
+ for (scatter = (void *) (next + 1);
+ (void *) scatter < (void *) next + desc_size;
+ ++scatter)
+ scatter->lkey = cpu_to_be32(MLX4_INVALID_LKEY);
+ }
+
+ err = mlx4_mtt_init(dev->dev, srq->buf.npages, srq->buf.page_shift,
+ &srq->mtt);
+ if (err)
+ goto err_buf;
+
+ err = mlx4_buf_write_mtt(dev->dev, &srq->mtt, &srq->buf);
+ if (err)
+ goto err_mtt;
+
+ srq->wrid = kmalloc(srq->msrq.max * sizeof (u64), GFP_KERNEL);
+ if (!srq->wrid) {
+ err = -ENOMEM;
+ goto err_mtt;
+ }
+ }
+
+ cqn = xrc_cq ? (u32) (to_mcq(xrc_cq)->mcq.cqn) : 0;
+ xrcdn = xrcd ? (u16) (to_mxrcd(xrcd)->xrcdn) :
+ (u16) dev->dev->caps.reserved_xrcds;
+
+ err = mlx4_srq_alloc(dev->dev, to_mpd(pd)->pdn, cqn, xrcdn, &srq->mtt,
+ srq->db.dma, &srq->msrq);
+ if (err)
+ goto err_wrid;
+
+ srq->msrq.event = mlx4_ib_srq_event;
+
+ if (pd->uobject) {
+ if (ib_copy_to_udata(udata, &srq->msrq.srqn, sizeof (__u32))) {
+ err = -EFAULT;
+ goto err_wrid;
+ }
+ } else
+ srq->ibsrq.xrc_srq_num = srq->msrq.srqn;
+
+ init_attr->attr.max_wr = srq->msrq.max - 1;
+
+ return &srq->ibsrq;
+
+err_wrid:
+ if (pd->uobject)
+ mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &srq->db);
+ else
+ kfree(srq->wrid);
+
+err_mtt:
+ mlx4_mtt_cleanup(dev->dev, &srq->mtt);
+
+err_buf:
+ if (pd->uobject)
+ ib_umem_release(srq->umem);
+ else
+ mlx4_buf_free(dev->dev, buf_size, &srq->buf);
+
+err_db:
+ if (!pd->uobject)
+ mlx4_db_free(dev->dev, &srq->db);
+
+err_srq:
+ kfree(srq);
+
+ return ERR_PTR(err);
+}
+
+int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+ enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibsrq->device);
+ struct mlx4_ib_srq *srq = to_msrq(ibsrq);
+ int ret;
+
+ /* We don't support resizing SRQs (yet?) */
+ if (attr_mask & IB_SRQ_MAX_WR) {
+ mlx4_ib_dbg("resize not yet supported");
+ return -EINVAL;
+ }
+
+ if (attr_mask & IB_SRQ_LIMIT) {
+ if (attr->srq_limit >= srq->msrq.max){
+ mlx4_ib_dbg("limit (0x%x) too high", attr->srq_limit);
+ return -EINVAL;
+ }
+
+ mutex_lock(&srq->mutex);
+ ret = mlx4_srq_arm(dev->dev, &srq->msrq, attr->srq_limit);
+ mutex_unlock(&srq->mutex);
+
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
+ struct ib_srq_init_attr *init_attr,
+ struct ib_udata *udata)
+{
+ return mlx4_ib_create_xrc_srq(pd, NULL, NULL, init_attr, udata);
+}
+
+int mlx4_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibsrq->device);
+ struct mlx4_ib_srq *srq = to_msrq(ibsrq);
+ int ret;
+ int limit_watermark;
+
+ ret = mlx4_srq_query(dev->dev, &srq->msrq, &limit_watermark);
+ if (ret)
+ return ret;
+
+ srq_attr->srq_limit = limit_watermark;
+ srq_attr->max_wr = srq->msrq.max - 1;
+ srq_attr->max_sge = srq->msrq.max_gs;
+
+ return 0;
+}
+
+int mlx4_ib_destroy_srq(struct ib_srq *srq)
+{
+ struct mlx4_ib_dev *dev = to_mdev(srq->device);
+ struct mlx4_ib_srq *msrq = to_msrq(srq);
+ struct mlx4_ib_cq *cq;
+
+ mlx4_srq_invalidate(dev->dev, &msrq->msrq);
+
+ if (srq->xrc_cq && !srq->uobject) {
+ cq = to_mcq(srq->xrc_cq);
+ spin_lock_irq(&cq->lock);
+ __mlx4_ib_cq_clean(cq, -1, msrq);
+ mlx4_srq_remove(dev->dev, &msrq->msrq);
+ spin_unlock_irq(&cq->lock);
+ } else
+ mlx4_srq_remove(dev->dev, &msrq->msrq);
+
+ mlx4_srq_free(dev->dev, &msrq->msrq);
+ mlx4_mtt_cleanup(dev->dev, &msrq->mtt);
+
+ if (srq->uobject) {
+ mlx4_ib_db_unmap_user(to_mucontext(srq->uobject->context), &msrq->db);
+ ib_umem_release(msrq->umem);
+ } else {
+ kfree(msrq->wrid);
+ mlx4_buf_free(dev->dev, msrq->msrq.max << msrq->msrq.wqe_shift,
+ &msrq->buf);
+ mlx4_db_free(dev->dev, &msrq->db);
+ }
+
+ kfree(msrq);
+
+ return 0;
+}
+
+void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index)
+{
+ struct mlx4_wqe_srq_next_seg *next;
+
+ /* always called with interrupts disabled. */
+ spin_lock(&srq->lock);
+
+ next = get_wqe(srq, srq->tail);
+ next->next_wqe_index = cpu_to_be16(wqe_index);
+ srq->tail = wqe_index;
+
+ spin_unlock(&srq->lock);
+}
+
+int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr)
+{
+ struct mlx4_ib_srq *srq = to_msrq(ibsrq);
+ struct mlx4_wqe_srq_next_seg *next;
+ struct mlx4_wqe_data_seg *scat;
+ unsigned long flags;
+ int err = 0;
+ int nreq;
+ int i;
+
+ spin_lock_irqsave(&srq->lock, flags);
+
+ for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ if (unlikely(wr->num_sge > srq->msrq.max_gs)) {
+ mlx4_ib_dbg("srq num 0x%x: num s/g entries too large (%d)",
+ srq->msrq.srqn, wr->num_sge);
+ err = -EINVAL;
+ *bad_wr = wr;
+ break;
+ }
+
+ if (unlikely(srq->head == srq->tail)) {
+ mlx4_ib_dbg("srq num 0x%x: No entries available to post.",
+ srq->msrq.srqn);
+ err = -ENOMEM;
+ *bad_wr = wr;
+ break;
+ }
+
+ srq->wrid[srq->head] = wr->wr_id;
+
+ next = get_wqe(srq, srq->head);
+ srq->head = be16_to_cpu(next->next_wqe_index);
+ scat = (struct mlx4_wqe_data_seg *) (next + 1);
+
+ for (i = 0; i < wr->num_sge; ++i) {
+ scat[i].byte_count = cpu_to_be32(wr->sg_list[i].length);
+ scat[i].lkey = cpu_to_be32(wr->sg_list[i].lkey);
+ scat[i].addr = cpu_to_be64(wr->sg_list[i].addr);
+ }
+
+ if (i < srq->msrq.max_gs) {
+ scat[i].byte_count = 0;
+ scat[i].lkey = cpu_to_be32(MLX4_INVALID_LKEY);
+ scat[i].addr = 0;
+ }
+ }
+
+ if (likely(nreq)) {
+ srq->wqe_ctr += nreq;
+
+ /*
+ * Make sure that descriptors are written before
+ * doorbell record.
+ */
+ wmb();
+
+ *srq->db.db = cpu_to_be32(srq->wqe_ctr);
+ }
+
+ spin_unlock_irqrestore(&srq->lock, flags);
+
+ return err;
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/user.h b/sys/ofed/drivers/infiniband/hw/mlx4/user.h
new file mode 100644
index 0000000..13beede
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/user.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_IB_USER_H
+#define MLX4_IB_USER_H
+
+#include <linux/types.h>
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define MLX4_IB_UVERBS_ABI_VERSION 3
+
+/*
+ * Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ * In particular do not use pointer types -- pass pointers in __u64
+ * instead.
+ */
+
+struct mlx4_ib_alloc_ucontext_resp {
+ __u32 qp_tab_size;
+ __u16 bf_reg_size;
+ __u16 bf_regs_per_page;
+};
+
+struct mlx4_ib_alloc_pd_resp {
+ __u32 pdn;
+ __u32 reserved;
+};
+
+struct mlx4_ib_create_cq {
+ __u64 buf_addr;
+ __u64 db_addr;
+};
+
+struct mlx4_ib_create_cq_resp {
+ __u32 cqn;
+ __u32 reserved;
+};
+
+struct mlx4_ib_resize_cq {
+ __u64 buf_addr;
+};
+
+struct mlx4_ib_create_srq {
+ __u64 buf_addr;
+ __u64 db_addr;
+};
+
+struct mlx4_ib_create_srq_resp {
+ __u32 srqn;
+ __u32 reserved;
+};
+
+struct mlx4_ib_create_qp {
+ __u64 buf_addr;
+ __u64 db_addr;
+ __u8 log_sq_bb_count;
+ __u8 log_sq_stride;
+ __u8 sq_no_prefetch;
+ __u8 reserved[5];
+};
+
+#endif /* MLX4_IB_USER_H */
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/wc.c b/sys/ofed/drivers/infiniband/hw/mlx4/wc.c
new file mode 100644
index 0000000..827de14
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/wc.c
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2006-2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/pci.h>
+#include "wc.h"
+
+#if defined(__i386__) || defined(__x86_64__)
+
+pgprot_t pgprot_wc(pgprot_t _prot)
+{
+ return pgprot_writecombine(_prot);
+}
+
+int mlx4_wc_enabled(void)
+{
+ return 1;
+}
+
+#elif defined(CONFIG_PPC64)
+
+pgprot_t pgprot_wc(pgprot_t _prot)
+{
+ return __pgprot((pgprot_val(_prot) | _PAGE_NO_CACHE) &
+ ~(pgprot_t)_PAGE_GUARDED);
+}
+
+int mlx4_wc_enabled(void)
+{
+ return 1;
+}
+
+#else /* !(defined(__i386__) || defined(__x86_64__)) */
+
+pgprot_t pgprot_wc(pgprot_t _prot)
+{
+ return pgprot_noncached(_prot);
+}
+
+int mlx4_wc_enabled(void)
+{
+ return 0;
+}
+
+#endif
+
diff --git a/sys/ofed/drivers/infiniband/hw/mlx4/wc.h b/sys/ofed/drivers/infiniband/hw/mlx4/wc.h
new file mode 100644
index 0000000..f32fe1e
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mlx4/wc.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2006-2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef mlx4_WC_H
+#define mlx4_WC_H
+
+#include <asm/pgtable.h>
+
+int mlx4_wc_enabled(void);
+pgprot_t pgprot_wc(pgprot_t _prot);
+
+#endif
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/Kconfig b/sys/ofed/drivers/infiniband/hw/mthca/Kconfig
new file mode 100644
index 0000000..03efc07
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/Kconfig
@@ -0,0 +1,17 @@
+config INFINIBAND_MTHCA
+ tristate "Mellanox HCA support"
+ depends on PCI
+ ---help---
+ This is a low-level driver for Mellanox InfiniHost host
+ channel adapters (HCAs), including the MT23108 PCI-X HCA
+ ("Tavor") and the MT25208 PCI Express HCA ("Arbel").
+
+config INFINIBAND_MTHCA_DEBUG
+ bool "Verbose debugging output" if EMBEDDED
+ depends on INFINIBAND_MTHCA
+ default y
+ ---help---
+ This option causes debugging code to be compiled into the
+ mthca driver. The output can be turned on via the
+ debug_level module parameter (which can also be set after
+ the driver is loaded through sysfs).
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/Makefile b/sys/ofed/drivers/infiniband/hw/mthca/Makefile
new file mode 100644
index 0000000..e388d95
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/Makefile
@@ -0,0 +1,7 @@
+obj-$(CONFIG_INFINIBAND_MTHCA) += ib_mthca.o
+
+ib_mthca-y := mthca_main.o mthca_cmd.o mthca_profile.o mthca_reset.o \
+ mthca_allocator.o mthca_eq.o mthca_pd.o mthca_cq.o \
+ mthca_mr.o mthca_qp.o mthca_av.o mthca_mcg.o mthca_mad.o \
+ mthca_provider.o mthca_memfree.o mthca_uar.o mthca_srq.o \
+ mthca_catas.o
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_allocator.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_allocator.c
new file mode 100644
index 0000000..c5ccc2d
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_allocator.c
@@ -0,0 +1,301 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/bitmap.h>
+
+#include "mthca_dev.h"
+
+/* Trivial bitmap-based allocator */
+u32 mthca_alloc(struct mthca_alloc *alloc)
+{
+ unsigned long flags;
+ u32 obj;
+
+ spin_lock_irqsave(&alloc->lock, flags);
+
+ obj = find_next_zero_bit(alloc->table, alloc->max, alloc->last);
+ if (obj >= alloc->max) {
+ alloc->top = (alloc->top + alloc->max) & alloc->mask;
+ obj = find_first_zero_bit(alloc->table, alloc->max);
+ }
+
+ if (obj < alloc->max) {
+ set_bit(obj, alloc->table);
+ obj |= alloc->top;
+ } else
+ obj = -1;
+
+ spin_unlock_irqrestore(&alloc->lock, flags);
+
+ return obj;
+}
+
+void mthca_free(struct mthca_alloc *alloc, u32 obj)
+{
+ unsigned long flags;
+
+ obj &= alloc->max - 1;
+
+ spin_lock_irqsave(&alloc->lock, flags);
+
+ clear_bit(obj, alloc->table);
+ alloc->last = min(alloc->last, obj);
+ alloc->top = (alloc->top + alloc->max) & alloc->mask;
+
+ spin_unlock_irqrestore(&alloc->lock, flags);
+}
+
+int mthca_alloc_init(struct mthca_alloc *alloc, u32 num, u32 mask,
+ u32 reserved)
+{
+ int i;
+
+ /* num must be a power of 2 */
+ if (num != 1 << (ffs(num) - 1))
+ return -EINVAL;
+
+ alloc->last = 0;
+ alloc->top = 0;
+ alloc->max = num;
+ alloc->mask = mask;
+ spin_lock_init(&alloc->lock);
+ alloc->table = kmalloc(BITS_TO_LONGS(num) * sizeof (long),
+ GFP_KERNEL);
+ if (!alloc->table)
+ return -ENOMEM;
+
+ bitmap_zero(alloc->table, num);
+ for (i = 0; i < reserved; ++i)
+ set_bit(i, alloc->table);
+
+ return 0;
+}
+
+void mthca_alloc_cleanup(struct mthca_alloc *alloc)
+{
+ kfree(alloc->table);
+}
+
+/*
+ * Array of pointers with lazy allocation of leaf pages. Callers of
+ * _get, _set and _clear methods must use a lock or otherwise
+ * serialize access to the array.
+ */
+
+#define MTHCA_ARRAY_MASK (PAGE_SIZE / sizeof (void *) - 1)
+
+void *mthca_array_get(struct mthca_array *array, int index)
+{
+ int p = (index * sizeof (void *)) >> PAGE_SHIFT;
+
+ if (array->page_list[p].page)
+ return array->page_list[p].page[index & MTHCA_ARRAY_MASK];
+ else
+ return NULL;
+}
+
+int mthca_array_set(struct mthca_array *array, int index, void *value)
+{
+ int p = (index * sizeof (void *)) >> PAGE_SHIFT;
+
+ /* Allocate with GFP_ATOMIC because we'll be called with locks held. */
+ if (!array->page_list[p].page)
+ array->page_list[p].page = (void **) get_zeroed_page(GFP_ATOMIC);
+
+ if (!array->page_list[p].page)
+ return -ENOMEM;
+
+ array->page_list[p].page[index & MTHCA_ARRAY_MASK] = value;
+ ++array->page_list[p].used;
+
+ return 0;
+}
+
+void mthca_array_clear(struct mthca_array *array, int index)
+{
+ int p = (index * sizeof (void *)) >> PAGE_SHIFT;
+
+ if (--array->page_list[p].used == 0) {
+ free_page((unsigned long) array->page_list[p].page);
+ array->page_list[p].page = NULL;
+ } else
+ array->page_list[p].page[index & MTHCA_ARRAY_MASK] = NULL;
+
+ if (array->page_list[p].used < 0)
+ pr_debug("Array %p index %d page %d with ref count %d < 0\n",
+ array, index, p, array->page_list[p].used);
+}
+
+int mthca_array_init(struct mthca_array *array, int nent)
+{
+ int npage = (nent * sizeof (void *) + PAGE_SIZE - 1) / PAGE_SIZE;
+ int i;
+
+ array->page_list = kmalloc(npage * sizeof *array->page_list, GFP_KERNEL);
+ if (!array->page_list)
+ return -ENOMEM;
+
+ for (i = 0; i < npage; ++i) {
+ array->page_list[i].page = NULL;
+ array->page_list[i].used = 0;
+ }
+
+ return 0;
+}
+
+void mthca_array_cleanup(struct mthca_array *array, int nent)
+{
+ int i;
+
+ for (i = 0; i < (nent * sizeof (void *) + PAGE_SIZE - 1) / PAGE_SIZE; ++i)
+ free_page((unsigned long) array->page_list[i].page);
+
+ kfree(array->page_list);
+}
+
+/*
+ * Handling for queue buffers -- we allocate a bunch of memory and
+ * register it in a memory region at HCA virtual address 0. If the
+ * requested size is > max_direct, we split the allocation into
+ * multiple pages, so we don't require too much contiguous memory.
+ */
+
+int mthca_buf_alloc(struct mthca_dev *dev, int size, int max_direct,
+ union mthca_buf *buf, int *is_direct, struct mthca_pd *pd,
+ int hca_write, struct mthca_mr *mr)
+{
+ int err = -ENOMEM;
+ int npages, shift;
+ u64 *dma_list = NULL;
+ dma_addr_t t;
+ int i;
+
+ if (size <= max_direct) {
+ *is_direct = 1;
+ npages = 1;
+ shift = get_order(size) + PAGE_SHIFT;
+
+ buf->direct.buf = dma_alloc_coherent(&dev->pdev->dev,
+ size, &t, GFP_KERNEL);
+ if (!buf->direct.buf)
+ return -ENOMEM;
+
+ pci_unmap_addr_set(&buf->direct, mapping, t);
+
+ memset(buf->direct.buf, 0, size);
+
+ while (t & ((1 << shift) - 1)) {
+ --shift;
+ npages *= 2;
+ }
+
+ dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL);
+ if (!dma_list)
+ goto err_free;
+
+ for (i = 0; i < npages; ++i)
+ dma_list[i] = t + i * (1 << shift);
+ } else {
+ *is_direct = 0;
+ npages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
+ shift = PAGE_SHIFT;
+
+ dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL);
+ if (!dma_list)
+ return -ENOMEM;
+
+ buf->page_list = kmalloc(npages * sizeof *buf->page_list,
+ GFP_KERNEL);
+ if (!buf->page_list)
+ goto err_out;
+
+ for (i = 0; i < npages; ++i)
+ buf->page_list[i].buf = NULL;
+
+ for (i = 0; i < npages; ++i) {
+ buf->page_list[i].buf =
+ dma_alloc_coherent(&dev->pdev->dev, PAGE_SIZE,
+ &t, GFP_KERNEL);
+ if (!buf->page_list[i].buf)
+ goto err_free;
+
+ dma_list[i] = t;
+ pci_unmap_addr_set(&buf->page_list[i], mapping, t);
+
+ clear_page(buf->page_list[i].buf);
+ }
+ }
+
+ err = mthca_mr_alloc_phys(dev, pd->pd_num,
+ dma_list, shift, npages,
+ 0, size,
+ MTHCA_MPT_FLAG_LOCAL_READ |
+ (hca_write ? MTHCA_MPT_FLAG_LOCAL_WRITE : 0),
+ mr);
+ if (err)
+ goto err_free;
+
+ kfree(dma_list);
+
+ return 0;
+
+err_free:
+ mthca_buf_free(dev, size, buf, *is_direct, NULL);
+
+err_out:
+ kfree(dma_list);
+
+ return err;
+}
+
+void mthca_buf_free(struct mthca_dev *dev, int size, union mthca_buf *buf,
+ int is_direct, struct mthca_mr *mr)
+{
+ int i;
+
+ if (mr)
+ mthca_free_mr(dev, mr);
+
+ if (is_direct)
+ dma_free_coherent(&dev->pdev->dev, size, buf->direct.buf,
+ pci_unmap_addr(&buf->direct, mapping));
+ else {
+ for (i = 0; i < (size + PAGE_SIZE - 1) / PAGE_SIZE; ++i)
+ dma_free_coherent(&dev->pdev->dev, PAGE_SIZE,
+ buf->page_list[i].buf,
+ pci_unmap_addr(&buf->page_list[i],
+ mapping));
+ kfree(buf->page_list);
+ }
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_av.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_av.c
new file mode 100644
index 0000000..32f6c63
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_av.c
@@ -0,0 +1,374 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/string.h>
+#include <linux/slab.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_cache.h>
+
+#include "mthca_dev.h"
+
+enum {
+ MTHCA_RATE_TAVOR_FULL = 0,
+ MTHCA_RATE_TAVOR_1X = 1,
+ MTHCA_RATE_TAVOR_4X = 2,
+ MTHCA_RATE_TAVOR_1X_DDR = 3
+};
+
+enum {
+ MTHCA_RATE_MEMFREE_FULL = 0,
+ MTHCA_RATE_MEMFREE_QUARTER = 1,
+ MTHCA_RATE_MEMFREE_EIGHTH = 2,
+ MTHCA_RATE_MEMFREE_HALF = 3
+};
+
+struct mthca_av {
+ __be32 port_pd;
+ u8 reserved1;
+ u8 g_slid;
+ __be16 dlid;
+ u8 reserved2;
+ u8 gid_index;
+ u8 msg_sr;
+ u8 hop_limit;
+ __be32 sl_tclass_flowlabel;
+ __be32 dgid[4];
+};
+
+static enum ib_rate memfree_rate_to_ib(u8 mthca_rate, u8 port_rate)
+{
+ switch (mthca_rate) {
+ case MTHCA_RATE_MEMFREE_EIGHTH:
+ return mult_to_ib_rate(port_rate >> 3);
+ case MTHCA_RATE_MEMFREE_QUARTER:
+ return mult_to_ib_rate(port_rate >> 2);
+ case MTHCA_RATE_MEMFREE_HALF:
+ return mult_to_ib_rate(port_rate >> 1);
+ case MTHCA_RATE_MEMFREE_FULL:
+ default:
+ return mult_to_ib_rate(port_rate);
+ }
+}
+
+static enum ib_rate tavor_rate_to_ib(u8 mthca_rate, u8 port_rate)
+{
+ switch (mthca_rate) {
+ case MTHCA_RATE_TAVOR_1X: return IB_RATE_2_5_GBPS;
+ case MTHCA_RATE_TAVOR_1X_DDR: return IB_RATE_5_GBPS;
+ case MTHCA_RATE_TAVOR_4X: return IB_RATE_10_GBPS;
+ default: return mult_to_ib_rate(port_rate);
+ }
+}
+
+enum ib_rate mthca_rate_to_ib(struct mthca_dev *dev, u8 mthca_rate, u8 port)
+{
+ if (mthca_is_memfree(dev)) {
+ /* Handle old Arbel FW */
+ if (dev->limits.stat_rate_support == 0x3 && mthca_rate)
+ return IB_RATE_2_5_GBPS;
+
+ return memfree_rate_to_ib(mthca_rate, dev->rate[port - 1]);
+ } else
+ return tavor_rate_to_ib(mthca_rate, dev->rate[port - 1]);
+}
+
+static u8 ib_rate_to_memfree(u8 req_rate, u8 cur_rate)
+{
+ if (cur_rate <= req_rate)
+ return 0;
+
+ /*
+ * Inter-packet delay (IPD) to get from rate X down to a rate
+ * no more than Y is (X - 1) / Y.
+ */
+ switch ((cur_rate - 1) / req_rate) {
+ case 0: return MTHCA_RATE_MEMFREE_FULL;
+ case 1: return MTHCA_RATE_MEMFREE_HALF;
+ case 2: /* fall through */
+ case 3: return MTHCA_RATE_MEMFREE_QUARTER;
+ default: return MTHCA_RATE_MEMFREE_EIGHTH;
+ }
+}
+
+static u8 ib_rate_to_tavor(u8 static_rate)
+{
+ switch (static_rate) {
+ case IB_RATE_2_5_GBPS: return MTHCA_RATE_TAVOR_1X;
+ case IB_RATE_5_GBPS: return MTHCA_RATE_TAVOR_1X_DDR;
+ case IB_RATE_10_GBPS: return MTHCA_RATE_TAVOR_4X;
+ default: return MTHCA_RATE_TAVOR_FULL;
+ }
+}
+
+u8 mthca_get_rate(struct mthca_dev *dev, int static_rate, u8 port)
+{
+ u8 rate;
+
+ if (!static_rate || ib_rate_to_mult(static_rate) >= dev->rate[port - 1])
+ return 0;
+
+ if (mthca_is_memfree(dev))
+ rate = ib_rate_to_memfree(ib_rate_to_mult(static_rate),
+ dev->rate[port - 1]);
+ else
+ rate = ib_rate_to_tavor(static_rate);
+
+ if (!(dev->limits.stat_rate_support & (1 << rate)))
+ rate = 1;
+
+ return rate;
+}
+
+int mthca_create_ah(struct mthca_dev *dev,
+ struct mthca_pd *pd,
+ struct ib_ah_attr *ah_attr,
+ struct mthca_ah *ah)
+{
+ u32 index = -1;
+ struct mthca_av *av = NULL;
+
+ ah->type = MTHCA_AH_PCI_POOL;
+
+ if (mthca_is_memfree(dev)) {
+ ah->av = kmalloc(sizeof *ah->av, GFP_ATOMIC);
+ if (!ah->av)
+ return -ENOMEM;
+
+ ah->type = MTHCA_AH_KMALLOC;
+ av = ah->av;
+ } else if (!atomic_read(&pd->sqp_count) &&
+ !(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) {
+ index = mthca_alloc(&dev->av_table.alloc);
+
+ /* fall back to allocate in host memory */
+ if (index == -1)
+ goto on_hca_fail;
+
+ av = kmalloc(sizeof *av, GFP_ATOMIC);
+ if (!av)
+ goto on_hca_fail;
+
+ ah->type = MTHCA_AH_ON_HCA;
+ ah->avdma = dev->av_table.ddr_av_base +
+ index * MTHCA_AV_SIZE;
+ }
+
+on_hca_fail:
+ if (ah->type == MTHCA_AH_PCI_POOL) {
+ ah->av = pci_pool_alloc(dev->av_table.pool,
+ GFP_ATOMIC, &ah->avdma);
+ if (!ah->av)
+ return -ENOMEM;
+
+ av = ah->av;
+ }
+
+ ah->key = pd->ntmr.ibmr.lkey;
+
+ memset(av, 0, MTHCA_AV_SIZE);
+
+ av->port_pd = cpu_to_be32(pd->pd_num | (ah_attr->port_num << 24));
+ av->g_slid = ah_attr->src_path_bits;
+ av->dlid = cpu_to_be16(ah_attr->dlid);
+ av->msg_sr = (3 << 4) | /* 2K message */
+ mthca_get_rate(dev, ah_attr->static_rate, ah_attr->port_num);
+ av->sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 28);
+ if (ah_attr->ah_flags & IB_AH_GRH) {
+ av->g_slid |= 0x80;
+ av->gid_index = (ah_attr->port_num - 1) * dev->limits.gid_table_len +
+ ah_attr->grh.sgid_index;
+ av->hop_limit = ah_attr->grh.hop_limit;
+ av->sl_tclass_flowlabel |=
+ cpu_to_be32((ah_attr->grh.traffic_class << 20) |
+ ah_attr->grh.flow_label);
+ memcpy(av->dgid, ah_attr->grh.dgid.raw, 16);
+ } else {
+ /* Arbel workaround -- low byte of GID must be 2 */
+ av->dgid[3] = cpu_to_be32(2);
+ }
+
+ if (0) {
+ int j;
+
+ mthca_dbg(dev, "Created UDAV at %p/%08lx:\n",
+ av, (unsigned long) ah->avdma);
+ for (j = 0; j < 8; ++j)
+ printk(KERN_DEBUG " [%2x] %08x\n",
+ j * 4, be32_to_cpu(((__be32 *) av)[j]));
+ }
+
+ if (ah->type == MTHCA_AH_ON_HCA) {
+ memcpy_toio(dev->av_table.av_map + index * MTHCA_AV_SIZE,
+ av, MTHCA_AV_SIZE);
+ kfree(av);
+ }
+
+ return 0;
+}
+
+int mthca_destroy_ah(struct mthca_dev *dev, struct mthca_ah *ah)
+{
+ switch (ah->type) {
+ case MTHCA_AH_ON_HCA:
+ mthca_free(&dev->av_table.alloc,
+ (ah->avdma - dev->av_table.ddr_av_base) /
+ MTHCA_AV_SIZE);
+ break;
+
+ case MTHCA_AH_PCI_POOL:
+ pci_pool_free(dev->av_table.pool, ah->av, ah->avdma);
+ break;
+
+ case MTHCA_AH_KMALLOC:
+ kfree(ah->av);
+ break;
+ }
+
+ return 0;
+}
+
+int mthca_ah_grh_present(struct mthca_ah *ah)
+{
+ return !!(ah->av->g_slid & 0x80);
+}
+
+int mthca_read_ah(struct mthca_dev *dev, struct mthca_ah *ah,
+ struct ib_ud_header *header)
+{
+ if (ah->type == MTHCA_AH_ON_HCA)
+ return -EINVAL;
+
+ header->lrh.service_level = be32_to_cpu(ah->av->sl_tclass_flowlabel) >> 28;
+ header->lrh.destination_lid = ah->av->dlid;
+ header->lrh.source_lid = cpu_to_be16(ah->av->g_slid & 0x7f);
+ if (mthca_ah_grh_present(ah)) {
+ header->grh.traffic_class =
+ (be32_to_cpu(ah->av->sl_tclass_flowlabel) >> 20) & 0xff;
+ header->grh.flow_label =
+ ah->av->sl_tclass_flowlabel & cpu_to_be32(0xfffff);
+ header->grh.hop_limit = ah->av->hop_limit;
+ ib_get_cached_gid(&dev->ib_dev,
+ be32_to_cpu(ah->av->port_pd) >> 24,
+ ah->av->gid_index % dev->limits.gid_table_len,
+ &header->grh.source_gid);
+ memcpy(header->grh.destination_gid.raw,
+ ah->av->dgid, 16);
+ }
+
+ return 0;
+}
+
+int mthca_ah_query(struct ib_ah *ibah, struct ib_ah_attr *attr)
+{
+ struct mthca_ah *ah = to_mah(ibah);
+ struct mthca_dev *dev = to_mdev(ibah->device);
+
+ /* Only implement for MAD and memfree ah for now. */
+ if (ah->type == MTHCA_AH_ON_HCA)
+ return -ENOSYS;
+
+ memset(attr, 0, sizeof *attr);
+ attr->dlid = be16_to_cpu(ah->av->dlid);
+ attr->sl = be32_to_cpu(ah->av->sl_tclass_flowlabel) >> 28;
+ attr->port_num = be32_to_cpu(ah->av->port_pd) >> 24;
+ attr->static_rate = mthca_rate_to_ib(dev, ah->av->msg_sr & 0x7,
+ attr->port_num);
+ attr->src_path_bits = ah->av->g_slid & 0x7F;
+ attr->ah_flags = mthca_ah_grh_present(ah) ? IB_AH_GRH : 0;
+
+ if (attr->ah_flags) {
+ attr->grh.traffic_class =
+ be32_to_cpu(ah->av->sl_tclass_flowlabel) >> 20;
+ attr->grh.flow_label =
+ be32_to_cpu(ah->av->sl_tclass_flowlabel) & 0xfffff;
+ attr->grh.hop_limit = ah->av->hop_limit;
+ attr->grh.sgid_index = ah->av->gid_index &
+ (dev->limits.gid_table_len - 1);
+ memcpy(attr->grh.dgid.raw, ah->av->dgid, 16);
+ }
+
+ return 0;
+}
+
+int mthca_init_av_table(struct mthca_dev *dev)
+{
+ int err;
+
+ if (mthca_is_memfree(dev))
+ return 0;
+
+ err = mthca_alloc_init(&dev->av_table.alloc,
+ dev->av_table.num_ddr_avs,
+ dev->av_table.num_ddr_avs - 1,
+ 0);
+ if (err)
+ return err;
+
+ dev->av_table.pool = pci_pool_create("mthca_av", dev->pdev,
+ MTHCA_AV_SIZE,
+ MTHCA_AV_SIZE, 0);
+ if (!dev->av_table.pool)
+ goto out_free_alloc;
+
+ if (!(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) {
+ dev->av_table.av_map = ioremap(pci_resource_start(dev->pdev, 4) +
+ dev->av_table.ddr_av_base -
+ dev->ddr_start,
+ dev->av_table.num_ddr_avs *
+ MTHCA_AV_SIZE);
+ if (!dev->av_table.av_map)
+ goto out_free_pool;
+ } else
+ dev->av_table.av_map = NULL;
+
+ return 0;
+
+ out_free_pool:
+ pci_pool_destroy(dev->av_table.pool);
+
+ out_free_alloc:
+ mthca_alloc_cleanup(&dev->av_table.alloc);
+ return -ENOMEM;
+}
+
+void mthca_cleanup_av_table(struct mthca_dev *dev)
+{
+ if (mthca_is_memfree(dev))
+ return;
+
+ if (dev->av_table.av_map)
+ iounmap(dev->av_table.av_map);
+ pci_pool_destroy(dev->av_table.pool);
+ mthca_alloc_cleanup(&dev->av_table.alloc);
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_catas.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_catas.c
new file mode 100644
index 0000000..b200170
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_catas.c
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2005 Cisco Systems. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+#include <linux/workqueue.h>
+
+#include "mthca_dev.h"
+
+enum {
+ MTHCA_CATAS_TYPE_INTERNAL = 0,
+ MTHCA_CATAS_TYPE_UPLINK = 3,
+ MTHCA_CATAS_TYPE_DDR = 4,
+ MTHCA_CATAS_TYPE_PARITY = 5,
+};
+
+#define MTHCA_CATAS_POLL_INTERVAL (5 * HZ)
+
+static DEFINE_SPINLOCK(catas_lock);
+
+static LIST_HEAD(catas_list);
+static struct workqueue_struct *catas_wq;
+static struct work_struct catas_work;
+
+static int catas_reset_disable;
+module_param_named(catas_reset_disable, catas_reset_disable, int, 0644);
+MODULE_PARM_DESC(catas_reset_disable, "disable reset on catastrophic event if nonzero");
+
+static void catas_reset(struct work_struct *work)
+{
+ struct mthca_dev *dev, *tmpdev;
+ LIST_HEAD(tlist);
+ int ret;
+
+ mutex_lock(&mthca_device_mutex);
+
+ spin_lock_irq(&catas_lock);
+ list_splice_init(&catas_list, &tlist);
+ spin_unlock_irq(&catas_lock);
+
+ list_for_each_entry_safe(dev, tmpdev, &tlist, catas_err.list) {
+ struct pci_dev *pdev = dev->pdev;
+ ret = __mthca_restart_one(dev->pdev);
+ /* 'dev' now is not valid */
+ if (ret)
+ printk(KERN_ERR "mthca %s: Reset failed (%d)\n",
+ pci_name(pdev), ret);
+ else {
+ struct mthca_dev *d = pci_get_drvdata(pdev);
+ mthca_dbg(d, "Reset succeeded\n");
+ }
+ }
+
+ mutex_unlock(&mthca_device_mutex);
+}
+
+static void handle_catas(struct mthca_dev *dev)
+{
+ struct ib_event event;
+ unsigned long flags;
+ const char *type;
+ int i;
+
+ event.device = &dev->ib_dev;
+ event.event = IB_EVENT_DEVICE_FATAL;
+ event.element.port_num = 0;
+ dev->active = 0;
+
+ ib_dispatch_event(&event);
+
+ switch (swab32(readl(dev->catas_err.map)) >> 24) {
+ case MTHCA_CATAS_TYPE_INTERNAL:
+ type = "internal error";
+ break;
+ case MTHCA_CATAS_TYPE_UPLINK:
+ type = "uplink bus error";
+ break;
+ case MTHCA_CATAS_TYPE_DDR:
+ type = "DDR data error";
+ break;
+ case MTHCA_CATAS_TYPE_PARITY:
+ type = "internal parity error";
+ break;
+ default:
+ type = "unknown error";
+ break;
+ }
+
+ mthca_err(dev, "Catastrophic error detected: %s\n", type);
+ for (i = 0; i < dev->catas_err.size; ++i)
+ mthca_err(dev, " buf[%02x]: %08x\n",
+ i, swab32(readl(dev->catas_err.map + i)));
+
+ if (catas_reset_disable)
+ return;
+
+ spin_lock_irqsave(&catas_lock, flags);
+ list_add(&dev->catas_err.list, &catas_list);
+ queue_work(catas_wq, &catas_work);
+ spin_unlock_irqrestore(&catas_lock, flags);
+}
+
+static void poll_catas(unsigned long dev_ptr)
+{
+ struct mthca_dev *dev = (struct mthca_dev *) dev_ptr;
+ int i;
+
+ for (i = 0; i < dev->catas_err.size; ++i)
+ if (readl(dev->catas_err.map + i)) {
+ handle_catas(dev);
+ return;
+ }
+
+ mod_timer(&dev->catas_err.timer,
+ round_jiffies(jiffies + MTHCA_CATAS_POLL_INTERVAL));
+}
+
+void mthca_start_catas_poll(struct mthca_dev *dev)
+{
+ unsigned long addr;
+
+ init_timer(&dev->catas_err.timer);
+ dev->catas_err.map = NULL;
+
+ addr = pci_resource_start(dev->pdev, 0) +
+ ((pci_resource_len(dev->pdev, 0) - 1) &
+ dev->catas_err.addr);
+
+ dev->catas_err.map = ioremap(addr, dev->catas_err.size * 4);
+ if (!dev->catas_err.map) {
+ mthca_warn(dev, "couldn't map catastrophic error region "
+ "at 0x%lx/0x%x\n", addr, dev->catas_err.size * 4);
+ return;
+ }
+
+ dev->catas_err.timer.data = (unsigned long) dev;
+ dev->catas_err.timer.function = poll_catas;
+ dev->catas_err.timer.expires = jiffies + MTHCA_CATAS_POLL_INTERVAL;
+ INIT_LIST_HEAD(&dev->catas_err.list);
+ add_timer(&dev->catas_err.timer);
+}
+
+void mthca_stop_catas_poll(struct mthca_dev *dev)
+{
+ del_timer_sync(&dev->catas_err.timer);
+
+ if (dev->catas_err.map)
+ iounmap(dev->catas_err.map);
+
+ spin_lock_irq(&catas_lock);
+ list_del(&dev->catas_err.list);
+ spin_unlock_irq(&catas_lock);
+}
+
+int __init mthca_catas_init(void)
+{
+ INIT_WORK(&catas_work, catas_reset);
+
+ catas_wq = create_singlethread_workqueue("mthcacatas");
+ if (!catas_wq)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void mthca_catas_cleanup(void)
+{
+ destroy_workqueue(catas_wq);
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_cmd.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_cmd.c
new file mode 100644
index 0000000..81e2838
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_cmd.c
@@ -0,0 +1,1931 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/pci.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <asm/io.h>
+#include <rdma/ib_mad.h>
+
+#include "mthca_dev.h"
+#include "mthca_config_reg.h"
+#include "mthca_cmd.h"
+#include "mthca_memfree.h"
+
+#define CMD_POLL_TOKEN 0xffff
+
+enum {
+ HCR_IN_PARAM_OFFSET = 0x00,
+ HCR_IN_MODIFIER_OFFSET = 0x08,
+ HCR_OUT_PARAM_OFFSET = 0x0c,
+ HCR_TOKEN_OFFSET = 0x14,
+ HCR_STATUS_OFFSET = 0x18,
+
+ HCR_OPMOD_SHIFT = 12,
+ HCA_E_BIT = 22,
+ HCR_GO_BIT = 23
+};
+
+enum {
+ /* initialization and general commands */
+ CMD_SYS_EN = 0x1,
+ CMD_SYS_DIS = 0x2,
+ CMD_MAP_FA = 0xfff,
+ CMD_UNMAP_FA = 0xffe,
+ CMD_RUN_FW = 0xff6,
+ CMD_MOD_STAT_CFG = 0x34,
+ CMD_QUERY_DEV_LIM = 0x3,
+ CMD_QUERY_FW = 0x4,
+ CMD_ENABLE_LAM = 0xff8,
+ CMD_DISABLE_LAM = 0xff7,
+ CMD_QUERY_DDR = 0x5,
+ CMD_QUERY_ADAPTER = 0x6,
+ CMD_INIT_HCA = 0x7,
+ CMD_CLOSE_HCA = 0x8,
+ CMD_INIT_IB = 0x9,
+ CMD_CLOSE_IB = 0xa,
+ CMD_QUERY_HCA = 0xb,
+ CMD_SET_IB = 0xc,
+ CMD_ACCESS_DDR = 0x2e,
+ CMD_MAP_ICM = 0xffa,
+ CMD_UNMAP_ICM = 0xff9,
+ CMD_MAP_ICM_AUX = 0xffc,
+ CMD_UNMAP_ICM_AUX = 0xffb,
+ CMD_SET_ICM_SIZE = 0xffd,
+
+ /* TPT commands */
+ CMD_SW2HW_MPT = 0xd,
+ CMD_QUERY_MPT = 0xe,
+ CMD_HW2SW_MPT = 0xf,
+ CMD_READ_MTT = 0x10,
+ CMD_WRITE_MTT = 0x11,
+ CMD_SYNC_TPT = 0x2f,
+
+ /* EQ commands */
+ CMD_MAP_EQ = 0x12,
+ CMD_SW2HW_EQ = 0x13,
+ CMD_HW2SW_EQ = 0x14,
+ CMD_QUERY_EQ = 0x15,
+
+ /* CQ commands */
+ CMD_SW2HW_CQ = 0x16,
+ CMD_HW2SW_CQ = 0x17,
+ CMD_QUERY_CQ = 0x18,
+ CMD_RESIZE_CQ = 0x2c,
+
+ /* SRQ commands */
+ CMD_SW2HW_SRQ = 0x35,
+ CMD_HW2SW_SRQ = 0x36,
+ CMD_QUERY_SRQ = 0x37,
+ CMD_ARM_SRQ = 0x40,
+
+ /* QP/EE commands */
+ CMD_RST2INIT_QPEE = 0x19,
+ CMD_INIT2RTR_QPEE = 0x1a,
+ CMD_RTR2RTS_QPEE = 0x1b,
+ CMD_RTS2RTS_QPEE = 0x1c,
+ CMD_SQERR2RTS_QPEE = 0x1d,
+ CMD_2ERR_QPEE = 0x1e,
+ CMD_RTS2SQD_QPEE = 0x1f,
+ CMD_SQD2SQD_QPEE = 0x38,
+ CMD_SQD2RTS_QPEE = 0x20,
+ CMD_ERR2RST_QPEE = 0x21,
+ CMD_QUERY_QPEE = 0x22,
+ CMD_INIT2INIT_QPEE = 0x2d,
+ CMD_SUSPEND_QPEE = 0x32,
+ CMD_UNSUSPEND_QPEE = 0x33,
+ /* special QPs and management commands */
+ CMD_CONF_SPECIAL_QP = 0x23,
+ CMD_MAD_IFC = 0x24,
+
+ /* multicast commands */
+ CMD_READ_MGM = 0x25,
+ CMD_WRITE_MGM = 0x26,
+ CMD_MGID_HASH = 0x27,
+
+ /* miscellaneous commands */
+ CMD_DIAG_RPRT = 0x30,
+ CMD_NOP = 0x31,
+
+ /* debug commands */
+ CMD_QUERY_DEBUG_MSG = 0x2a,
+ CMD_SET_DEBUG_MSG = 0x2b,
+};
+
+/*
+ * According to Mellanox code, FW may be starved and never complete
+ * commands. So we can't use strict timeouts described in PRM -- we
+ * just arbitrarily select 60 seconds for now.
+ */
+#if 0
+/*
+ * Round up and add 1 to make sure we get the full wait time (since we
+ * will be starting in the middle of a jiffy)
+ */
+enum {
+ CMD_TIME_CLASS_A = (HZ + 999) / 1000 + 1,
+ CMD_TIME_CLASS_B = (HZ + 99) / 100 + 1,
+ CMD_TIME_CLASS_C = (HZ + 9) / 10 + 1,
+ CMD_TIME_CLASS_D = 60 * HZ
+};
+#else
+#define CMD_TIME_CLASS_A (60 * HZ)
+#define CMD_TIME_CLASS_B (60 * HZ)
+#define CMD_TIME_CLASS_C (60 * HZ)
+#define CMD_TIME_CLASS_D (60 * HZ)
+#endif
+
+#define GO_BIT_TIMEOUT (HZ * 10)
+
+struct mthca_cmd_context {
+ struct completion done;
+ int result;
+ int next;
+ u64 out_param;
+ u16 token;
+ u8 status;
+};
+
+static int fw_cmd_doorbell = 0;
+module_param(fw_cmd_doorbell, int, 0644);
+MODULE_PARM_DESC(fw_cmd_doorbell, "post FW commands through doorbell page if nonzero "
+ "(and supported by FW)");
+
+static inline int go_bit(struct mthca_dev *dev)
+{
+ return readl(dev->hcr + HCR_STATUS_OFFSET) &
+ swab32(1 << HCR_GO_BIT);
+}
+
+static void mthca_cmd_post_dbell(struct mthca_dev *dev,
+ u64 in_param,
+ u64 out_param,
+ u32 in_modifier,
+ u8 op_modifier,
+ u16 op,
+ u16 token)
+{
+ void __iomem *ptr = dev->cmd.dbell_map;
+ u16 *offs = dev->cmd.dbell_offsets;
+
+ __raw_writel((__force u32) cpu_to_be32(in_param >> 32), ptr + offs[0]);
+ wmb();
+ __raw_writel((__force u32) cpu_to_be32(in_param & 0xfffffffful), ptr + offs[1]);
+ wmb();
+ __raw_writel((__force u32) cpu_to_be32(in_modifier), ptr + offs[2]);
+ wmb();
+ __raw_writel((__force u32) cpu_to_be32(out_param >> 32), ptr + offs[3]);
+ wmb();
+ __raw_writel((__force u32) cpu_to_be32(out_param & 0xfffffffful), ptr + offs[4]);
+ wmb();
+ __raw_writel((__force u32) cpu_to_be32(token << 16), ptr + offs[5]);
+ wmb();
+ __raw_writel((__force u32) cpu_to_be32((1 << HCR_GO_BIT) |
+ (1 << HCA_E_BIT) |
+ (op_modifier << HCR_OPMOD_SHIFT) |
+ op), ptr + offs[6]);
+ wmb();
+ __raw_writel((__force u32) 0, ptr + offs[7]);
+ wmb();
+}
+
+static int mthca_cmd_post_hcr(struct mthca_dev *dev,
+ u64 in_param,
+ u64 out_param,
+ u32 in_modifier,
+ u8 op_modifier,
+ u16 op,
+ u16 token,
+ int event)
+{
+ if (event) {
+ unsigned long end = jiffies + GO_BIT_TIMEOUT;
+
+ while (go_bit(dev) && time_before(jiffies, end))
+ sched_yield();
+ }
+
+ if (go_bit(dev))
+ return -EAGAIN;
+
+ /*
+ * We use writel (instead of something like memcpy_toio)
+ * because writes of less than 32 bits to the HCR don't work
+ * (and some architectures such as ia64 implement memcpy_toio
+ * in terms of writeb).
+ */
+ __raw_writel((__force u32) cpu_to_be32(in_param >> 32), dev->hcr + 0 * 4);
+ __raw_writel((__force u32) cpu_to_be32(in_param & 0xfffffffful), dev->hcr + 1 * 4);
+ __raw_writel((__force u32) cpu_to_be32(in_modifier), dev->hcr + 2 * 4);
+ __raw_writel((__force u32) cpu_to_be32(out_param >> 32), dev->hcr + 3 * 4);
+ __raw_writel((__force u32) cpu_to_be32(out_param & 0xfffffffful), dev->hcr + 4 * 4);
+ __raw_writel((__force u32) cpu_to_be32(token << 16), dev->hcr + 5 * 4);
+
+ /* __raw_writel may not order writes. */
+ wmb();
+
+ __raw_writel((__force u32) cpu_to_be32((1 << HCR_GO_BIT) |
+ (event ? (1 << HCA_E_BIT) : 0) |
+ (op_modifier << HCR_OPMOD_SHIFT) |
+ op), dev->hcr + 6 * 4);
+
+ return 0;
+}
+
+static int mthca_cmd_post(struct mthca_dev *dev,
+ u64 in_param,
+ u64 out_param,
+ u32 in_modifier,
+ u8 op_modifier,
+ u16 op,
+ u16 token,
+ int event)
+{
+ int err = 0;
+
+ mutex_lock(&dev->cmd.hcr_mutex);
+
+ if (event && dev->cmd.flags & MTHCA_CMD_POST_DOORBELLS && fw_cmd_doorbell)
+ mthca_cmd_post_dbell(dev, in_param, out_param, in_modifier,
+ op_modifier, op, token);
+ else
+ err = mthca_cmd_post_hcr(dev, in_param, out_param, in_modifier,
+ op_modifier, op, token, event);
+
+ /*
+ * Make sure that our HCR writes don't get mixed in with
+ * writes from another CPU starting a FW command.
+ */
+ mmiowb();
+
+ mutex_unlock(&dev->cmd.hcr_mutex);
+ return err;
+}
+
+static int mthca_cmd_poll(struct mthca_dev *dev,
+ u64 in_param,
+ u64 *out_param,
+ int out_is_imm,
+ u32 in_modifier,
+ u8 op_modifier,
+ u16 op,
+ unsigned long timeout,
+ u8 *status)
+{
+ int err = 0;
+ unsigned long end;
+
+ down(&dev->cmd.poll_sem);
+
+ err = mthca_cmd_post(dev, in_param,
+ out_param ? *out_param : 0,
+ in_modifier, op_modifier,
+ op, CMD_POLL_TOKEN, 0);
+ if (err)
+ goto out;
+
+ end = timeout + jiffies;
+ while (go_bit(dev) && time_before(jiffies, end))
+ sched_yield();
+
+ if (go_bit(dev)) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ if (out_is_imm)
+ *out_param =
+ (u64) be32_to_cpu((__force __be32)
+ __raw_readl(dev->hcr + HCR_OUT_PARAM_OFFSET)) << 32 |
+ (u64) be32_to_cpu((__force __be32)
+ __raw_readl(dev->hcr + HCR_OUT_PARAM_OFFSET + 4));
+
+ *status = be32_to_cpu((__force __be32) __raw_readl(dev->hcr + HCR_STATUS_OFFSET)) >> 24;
+
+out:
+ up(&dev->cmd.poll_sem);
+ return err;
+}
+
+void mthca_cmd_event(struct mthca_dev *dev,
+ u16 token,
+ u8 status,
+ u64 out_param)
+{
+ struct mthca_cmd_context *context =
+ &dev->cmd.context[token & dev->cmd.token_mask];
+
+ /* previously timed out command completing at long last */
+ if (token != context->token)
+ return;
+
+ context->result = 0;
+ context->status = status;
+ context->out_param = out_param;
+
+ complete(&context->done);
+}
+
+static int mthca_cmd_wait(struct mthca_dev *dev,
+ u64 in_param,
+ u64 *out_param,
+ int out_is_imm,
+ u32 in_modifier,
+ u8 op_modifier,
+ u16 op,
+ unsigned long timeout,
+ u8 *status)
+{
+ int err = 0;
+ struct mthca_cmd_context *context;
+
+ down(&dev->cmd.event_sem);
+
+ spin_lock(&dev->cmd.context_lock);
+ BUG_ON(dev->cmd.free_head < 0);
+ context = &dev->cmd.context[dev->cmd.free_head];
+ context->token += dev->cmd.token_mask + 1;
+ dev->cmd.free_head = context->next;
+ spin_unlock(&dev->cmd.context_lock);
+
+ init_completion(&context->done);
+
+ err = mthca_cmd_post(dev, in_param,
+ out_param ? *out_param : 0,
+ in_modifier, op_modifier,
+ op, context->token, 1);
+ if (err)
+ goto out;
+
+ if (!wait_for_completion_timeout(&context->done, timeout)) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ err = context->result;
+ if (err)
+ goto out;
+
+ *status = context->status;
+ if (*status)
+ mthca_dbg(dev, "Command %02x completed with status %02x\n",
+ op, *status);
+
+ if (out_is_imm)
+ *out_param = context->out_param;
+
+out:
+ spin_lock(&dev->cmd.context_lock);
+ context->next = dev->cmd.free_head;
+ dev->cmd.free_head = context - dev->cmd.context;
+ spin_unlock(&dev->cmd.context_lock);
+
+ up(&dev->cmd.event_sem);
+ return err;
+}
+
+/* Invoke a command with an output mailbox */
+static int mthca_cmd_box(struct mthca_dev *dev,
+ u64 in_param,
+ u64 out_param,
+ u32 in_modifier,
+ u8 op_modifier,
+ u16 op,
+ unsigned long timeout,
+ u8 *status)
+{
+ if (dev->cmd.flags & MTHCA_CMD_USE_EVENTS)
+ return mthca_cmd_wait(dev, in_param, &out_param, 0,
+ in_modifier, op_modifier, op,
+ timeout, status);
+ else
+ return mthca_cmd_poll(dev, in_param, &out_param, 0,
+ in_modifier, op_modifier, op,
+ timeout, status);
+}
+
+/* Invoke a command with no output parameter */
+static int mthca_cmd(struct mthca_dev *dev,
+ u64 in_param,
+ u32 in_modifier,
+ u8 op_modifier,
+ u16 op,
+ unsigned long timeout,
+ u8 *status)
+{
+ return mthca_cmd_box(dev, in_param, 0, in_modifier,
+ op_modifier, op, timeout, status);
+}
+
+/*
+ * Invoke a command with an immediate output parameter (and copy the
+ * output into the caller's out_param pointer after the command
+ * executes).
+ */
+static int mthca_cmd_imm(struct mthca_dev *dev,
+ u64 in_param,
+ u64 *out_param,
+ u32 in_modifier,
+ u8 op_modifier,
+ u16 op,
+ unsigned long timeout,
+ u8 *status)
+{
+ if (dev->cmd.flags & MTHCA_CMD_USE_EVENTS)
+ return mthca_cmd_wait(dev, in_param, out_param, 1,
+ in_modifier, op_modifier, op,
+ timeout, status);
+ else
+ return mthca_cmd_poll(dev, in_param, out_param, 1,
+ in_modifier, op_modifier, op,
+ timeout, status);
+}
+
+int mthca_cmd_init(struct mthca_dev *dev)
+{
+ mutex_init(&dev->cmd.hcr_mutex);
+ sema_init(&dev->cmd.poll_sem, 1);
+ dev->cmd.flags = 0;
+
+ dev->hcr = ioremap(pci_resource_start(dev->pdev, 0) + MTHCA_HCR_BASE,
+ MTHCA_HCR_SIZE);
+ if (!dev->hcr) {
+ mthca_err(dev, "Couldn't map command register.");
+ return -ENOMEM;
+ }
+
+ dev->cmd.pool = pci_pool_create("mthca_cmd", dev->pdev,
+ MTHCA_MAILBOX_SIZE,
+ MTHCA_MAILBOX_SIZE, 0);
+ if (!dev->cmd.pool) {
+ iounmap(dev->hcr);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+void mthca_cmd_cleanup(struct mthca_dev *dev)
+{
+ pci_pool_destroy(dev->cmd.pool);
+ iounmap(dev->hcr);
+ if (dev->cmd.flags & MTHCA_CMD_POST_DOORBELLS)
+ iounmap(dev->cmd.dbell_map);
+}
+
+/*
+ * Switch to using events to issue FW commands (should be called after
+ * event queue to command events has been initialized).
+ */
+int mthca_cmd_use_events(struct mthca_dev *dev)
+{
+ int i;
+
+ dev->cmd.context = kmalloc(dev->cmd.max_cmds *
+ sizeof (struct mthca_cmd_context),
+ GFP_KERNEL);
+ if (!dev->cmd.context)
+ return -ENOMEM;
+
+ for (i = 0; i < dev->cmd.max_cmds; ++i) {
+ dev->cmd.context[i].token = i;
+ dev->cmd.context[i].next = i + 1;
+ }
+
+ dev->cmd.context[dev->cmd.max_cmds - 1].next = -1;
+ dev->cmd.free_head = 0;
+
+ sema_init(&dev->cmd.event_sem, dev->cmd.max_cmds);
+ spin_lock_init(&dev->cmd.context_lock);
+
+ for (dev->cmd.token_mask = 1;
+ dev->cmd.token_mask < dev->cmd.max_cmds;
+ dev->cmd.token_mask <<= 1)
+ ; /* nothing */
+ --dev->cmd.token_mask;
+
+ dev->cmd.flags |= MTHCA_CMD_USE_EVENTS;
+
+ down(&dev->cmd.poll_sem);
+
+ return 0;
+}
+
+/*
+ * Switch back to polling (used when shutting down the device)
+ */
+void mthca_cmd_use_polling(struct mthca_dev *dev)
+{
+ int i;
+
+ dev->cmd.flags &= ~MTHCA_CMD_USE_EVENTS;
+
+ for (i = 0; i < dev->cmd.max_cmds; ++i)
+ down(&dev->cmd.event_sem);
+
+ kfree(dev->cmd.context);
+
+ up(&dev->cmd.poll_sem);
+}
+
+struct mthca_mailbox *mthca_alloc_mailbox(struct mthca_dev *dev,
+ gfp_t gfp_mask)
+{
+ struct mthca_mailbox *mailbox;
+
+ mailbox = kmalloc(sizeof *mailbox, gfp_mask);
+ if (!mailbox)
+ return ERR_PTR(-ENOMEM);
+
+ mailbox->buf = pci_pool_alloc(dev->cmd.pool, gfp_mask, &mailbox->dma);
+ if (!mailbox->buf) {
+ kfree(mailbox);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return mailbox;
+}
+
+void mthca_free_mailbox(struct mthca_dev *dev, struct mthca_mailbox *mailbox)
+{
+ if (!mailbox)
+ return;
+
+ pci_pool_free(dev->cmd.pool, mailbox->buf, mailbox->dma);
+ kfree(mailbox);
+}
+
+int mthca_SYS_EN(struct mthca_dev *dev, u8 *status)
+{
+ u64 out;
+ int ret;
+
+ ret = mthca_cmd_imm(dev, 0, &out, 0, 0, CMD_SYS_EN, CMD_TIME_CLASS_D, status);
+
+ if (*status == MTHCA_CMD_STAT_DDR_MEM_ERR)
+ mthca_warn(dev, "SYS_EN DDR error: syn=%x, sock=%d, "
+ "sladdr=%d, SPD source=%s\n",
+ (int) (out >> 6) & 0xf, (int) (out >> 4) & 3,
+ (int) (out >> 1) & 7, (int) out & 1 ? "NVMEM" : "DIMM");
+
+ return ret;
+}
+
+int mthca_SYS_DIS(struct mthca_dev *dev, u8 *status)
+{
+ return mthca_cmd(dev, 0, 0, 0, CMD_SYS_DIS, CMD_TIME_CLASS_C, status);
+}
+
+static int mthca_map_cmd(struct mthca_dev *dev, u16 op, struct mthca_icm *icm,
+ u64 virt, u8 *status)
+{
+ struct mthca_mailbox *mailbox;
+ struct mthca_icm_iter iter;
+ __be64 *pages;
+ int lg;
+ int nent = 0;
+ int i;
+ int err = 0;
+ int ts = 0, tc = 0;
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ memset(mailbox->buf, 0, MTHCA_MAILBOX_SIZE);
+ pages = mailbox->buf;
+
+ for (mthca_icm_first(icm, &iter);
+ !mthca_icm_last(&iter);
+ mthca_icm_next(&iter)) {
+ /*
+ * We have to pass pages that are aligned to their
+ * size, so find the least significant 1 in the
+ * address or size and use that as our log2 size.
+ */
+ lg = ffs(mthca_icm_addr(&iter) | mthca_icm_size(&iter)) - 1;
+ if (lg < MTHCA_ICM_PAGE_SHIFT) {
+ mthca_warn(dev, "Got FW area not aligned to %d (%llx/%lx).\n",
+ MTHCA_ICM_PAGE_SIZE,
+ (unsigned long long) mthca_icm_addr(&iter),
+ mthca_icm_size(&iter));
+ err = -EINVAL;
+ goto out;
+ }
+ for (i = 0; i < mthca_icm_size(&iter) >> lg; ++i) {
+ if (virt != -1) {
+ pages[nent * 2] = cpu_to_be64(virt);
+ virt += 1 << lg;
+ }
+
+ pages[nent * 2 + 1] =
+ cpu_to_be64((mthca_icm_addr(&iter) + (i << lg)) |
+ (lg - MTHCA_ICM_PAGE_SHIFT));
+ ts += 1 << (lg - 10);
+ ++tc;
+
+ if (++nent == MTHCA_MAILBOX_SIZE / 16) {
+ err = mthca_cmd(dev, mailbox->dma, nent, 0, op,
+ CMD_TIME_CLASS_B, status);
+ if (err || *status)
+ goto out;
+ nent = 0;
+ }
+ }
+ }
+
+ if (nent)
+ err = mthca_cmd(dev, mailbox->dma, nent, 0, op,
+ CMD_TIME_CLASS_B, status);
+
+ switch (op) {
+ case CMD_MAP_FA:
+ mthca_dbg(dev, "Mapped %d chunks/%d KB for FW.\n", tc, ts);
+ break;
+ case CMD_MAP_ICM_AUX:
+ mthca_dbg(dev, "Mapped %d chunks/%d KB for ICM aux.\n", tc, ts);
+ break;
+ case CMD_MAP_ICM:
+ mthca_dbg(dev, "Mapped %d chunks/%d KB at %llx for ICM.\n",
+ tc, ts, (unsigned long long) virt - (ts << 10));
+ break;
+ }
+
+out:
+ mthca_free_mailbox(dev, mailbox);
+ return err;
+}
+
+int mthca_MAP_FA(struct mthca_dev *dev, struct mthca_icm *icm, u8 *status)
+{
+ return mthca_map_cmd(dev, CMD_MAP_FA, icm, -1, status);
+}
+
+int mthca_UNMAP_FA(struct mthca_dev *dev, u8 *status)
+{
+ return mthca_cmd(dev, 0, 0, 0, CMD_UNMAP_FA, CMD_TIME_CLASS_B, status);
+}
+
+int mthca_RUN_FW(struct mthca_dev *dev, u8 *status)
+{
+ return mthca_cmd(dev, 0, 0, 0, CMD_RUN_FW, CMD_TIME_CLASS_A, status);
+}
+
+static void mthca_setup_cmd_doorbells(struct mthca_dev *dev, u64 base)
+{
+ unsigned long addr;
+ u16 max_off = 0;
+ int i;
+
+ for (i = 0; i < 8; ++i)
+ max_off = max(max_off, dev->cmd.dbell_offsets[i]);
+
+ if ((base & PAGE_MASK) != ((base + max_off) & PAGE_MASK)) {
+ mthca_warn(dev, "Firmware doorbell region at 0x%016llx, "
+ "length 0x%x crosses a page boundary\n",
+ (unsigned long long) base, max_off);
+ return;
+ }
+
+ addr = pci_resource_start(dev->pdev, 2) +
+ ((pci_resource_len(dev->pdev, 2) - 1) & base);
+ dev->cmd.dbell_map = ioremap(addr, max_off + sizeof(u32));
+ if (!dev->cmd.dbell_map)
+ return;
+
+ dev->cmd.flags |= MTHCA_CMD_POST_DOORBELLS;
+ mthca_dbg(dev, "Mapped doorbell page for posting FW commands\n");
+}
+
+int mthca_QUERY_FW(struct mthca_dev *dev, u8 *status)
+{
+ struct mthca_mailbox *mailbox;
+ u32 *outbox;
+ u64 base;
+ u32 tmp;
+ int err = 0;
+ u8 lg;
+ int i;
+
+#define QUERY_FW_OUT_SIZE 0x100
+#define QUERY_FW_VER_OFFSET 0x00
+#define QUERY_FW_MAX_CMD_OFFSET 0x0f
+#define QUERY_FW_ERR_START_OFFSET 0x30
+#define QUERY_FW_ERR_SIZE_OFFSET 0x38
+
+#define QUERY_FW_CMD_DB_EN_OFFSET 0x10
+#define QUERY_FW_CMD_DB_OFFSET 0x50
+#define QUERY_FW_CMD_DB_BASE 0x60
+
+#define QUERY_FW_START_OFFSET 0x20
+#define QUERY_FW_END_OFFSET 0x28
+
+#define QUERY_FW_SIZE_OFFSET 0x00
+#define QUERY_FW_CLR_INT_BASE_OFFSET 0x20
+#define QUERY_FW_EQ_ARM_BASE_OFFSET 0x40
+#define QUERY_FW_EQ_SET_CI_BASE_OFFSET 0x48
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ outbox = mailbox->buf;
+
+ err = mthca_cmd_box(dev, 0, mailbox->dma, 0, 0, CMD_QUERY_FW,
+ CMD_TIME_CLASS_A, status);
+
+ if (err)
+ goto out;
+
+ MTHCA_GET(dev->fw_ver, outbox, QUERY_FW_VER_OFFSET);
+ /*
+ * FW subminor version is at more significant bits than minor
+ * version, so swap here.
+ */
+ dev->fw_ver = (dev->fw_ver & 0xffff00000000ull) |
+ ((dev->fw_ver & 0xffff0000ull) >> 16) |
+ ((dev->fw_ver & 0x0000ffffull) << 16);
+
+ MTHCA_GET(lg, outbox, QUERY_FW_MAX_CMD_OFFSET);
+ dev->cmd.max_cmds = 1 << lg;
+
+ mthca_dbg(dev, "FW version %012llx, max commands %d\n",
+ (unsigned long long) dev->fw_ver, dev->cmd.max_cmds);
+
+ MTHCA_GET(dev->catas_err.addr, outbox, QUERY_FW_ERR_START_OFFSET);
+ MTHCA_GET(dev->catas_err.size, outbox, QUERY_FW_ERR_SIZE_OFFSET);
+
+ mthca_dbg(dev, "Catastrophic error buffer at 0x%llx, size 0x%x\n",
+ (unsigned long long) dev->catas_err.addr, dev->catas_err.size);
+
+ MTHCA_GET(tmp, outbox, QUERY_FW_CMD_DB_EN_OFFSET);
+ if (tmp & 0x1) {
+ mthca_dbg(dev, "FW supports commands through doorbells\n");
+
+ MTHCA_GET(base, outbox, QUERY_FW_CMD_DB_BASE);
+ for (i = 0; i < MTHCA_CMD_NUM_DBELL_DWORDS; ++i)
+ MTHCA_GET(dev->cmd.dbell_offsets[i], outbox,
+ QUERY_FW_CMD_DB_OFFSET + (i << 1));
+
+ mthca_setup_cmd_doorbells(dev, base);
+ }
+
+ if (mthca_is_memfree(dev)) {
+ MTHCA_GET(dev->fw.arbel.fw_pages, outbox, QUERY_FW_SIZE_OFFSET);
+ MTHCA_GET(dev->fw.arbel.clr_int_base, outbox, QUERY_FW_CLR_INT_BASE_OFFSET);
+ MTHCA_GET(dev->fw.arbel.eq_arm_base, outbox, QUERY_FW_EQ_ARM_BASE_OFFSET);
+ MTHCA_GET(dev->fw.arbel.eq_set_ci_base, outbox, QUERY_FW_EQ_SET_CI_BASE_OFFSET);
+ mthca_dbg(dev, "FW size %d KB\n", dev->fw.arbel.fw_pages << 2);
+
+ /*
+ * Round up number of system pages needed in case
+ * MTHCA_ICM_PAGE_SIZE < PAGE_SIZE.
+ */
+ dev->fw.arbel.fw_pages =
+ ALIGN(dev->fw.arbel.fw_pages, PAGE_SIZE / MTHCA_ICM_PAGE_SIZE) >>
+ (PAGE_SHIFT - MTHCA_ICM_PAGE_SHIFT);
+
+ mthca_dbg(dev, "Clear int @ %llx, EQ arm @ %llx, EQ set CI @ %llx\n",
+ (unsigned long long) dev->fw.arbel.clr_int_base,
+ (unsigned long long) dev->fw.arbel.eq_arm_base,
+ (unsigned long long) dev->fw.arbel.eq_set_ci_base);
+ } else {
+ MTHCA_GET(dev->fw.tavor.fw_start, outbox, QUERY_FW_START_OFFSET);
+ MTHCA_GET(dev->fw.tavor.fw_end, outbox, QUERY_FW_END_OFFSET);
+
+ mthca_dbg(dev, "FW size %d KB (start %llx, end %llx)\n",
+ (int) ((dev->fw.tavor.fw_end - dev->fw.tavor.fw_start) >> 10),
+ (unsigned long long) dev->fw.tavor.fw_start,
+ (unsigned long long) dev->fw.tavor.fw_end);
+ }
+
+out:
+ mthca_free_mailbox(dev, mailbox);
+ return err;
+}
+
+int mthca_ENABLE_LAM(struct mthca_dev *dev, u8 *status)
+{
+ struct mthca_mailbox *mailbox;
+ u8 info;
+ u32 *outbox;
+ int err = 0;
+
+#define ENABLE_LAM_OUT_SIZE 0x100
+#define ENABLE_LAM_START_OFFSET 0x00
+#define ENABLE_LAM_END_OFFSET 0x08
+#define ENABLE_LAM_INFO_OFFSET 0x13
+
+#define ENABLE_LAM_INFO_HIDDEN_FLAG (1 << 4)
+#define ENABLE_LAM_INFO_ECC_MASK 0x3
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ outbox = mailbox->buf;
+
+ err = mthca_cmd_box(dev, 0, mailbox->dma, 0, 0, CMD_ENABLE_LAM,
+ CMD_TIME_CLASS_C, status);
+
+ if (err)
+ goto out;
+
+ if (*status == MTHCA_CMD_STAT_LAM_NOT_PRE)
+ goto out;
+
+ MTHCA_GET(dev->ddr_start, outbox, ENABLE_LAM_START_OFFSET);
+ MTHCA_GET(dev->ddr_end, outbox, ENABLE_LAM_END_OFFSET);
+ MTHCA_GET(info, outbox, ENABLE_LAM_INFO_OFFSET);
+
+ if (!!(info & ENABLE_LAM_INFO_HIDDEN_FLAG) !=
+ !!(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) {
+ mthca_info(dev, "FW reports that HCA-attached memory "
+ "is %s hidden; does not match PCI config\n",
+ (info & ENABLE_LAM_INFO_HIDDEN_FLAG) ?
+ "" : "not");
+ }
+ if (info & ENABLE_LAM_INFO_HIDDEN_FLAG)
+ mthca_dbg(dev, "HCA-attached memory is hidden.\n");
+
+ mthca_dbg(dev, "HCA memory size %d KB (start %llx, end %llx)\n",
+ (int) ((dev->ddr_end - dev->ddr_start) >> 10),
+ (unsigned long long) dev->ddr_start,
+ (unsigned long long) dev->ddr_end);
+
+out:
+ mthca_free_mailbox(dev, mailbox);
+ return err;
+}
+
+int mthca_DISABLE_LAM(struct mthca_dev *dev, u8 *status)
+{
+ return mthca_cmd(dev, 0, 0, 0, CMD_SYS_DIS, CMD_TIME_CLASS_C, status);
+}
+
+int mthca_QUERY_DDR(struct mthca_dev *dev, u8 *status)
+{
+ struct mthca_mailbox *mailbox;
+ u8 info;
+ u32 *outbox;
+ int err = 0;
+
+#define QUERY_DDR_OUT_SIZE 0x100
+#define QUERY_DDR_START_OFFSET 0x00
+#define QUERY_DDR_END_OFFSET 0x08
+#define QUERY_DDR_INFO_OFFSET 0x13
+
+#define QUERY_DDR_INFO_HIDDEN_FLAG (1 << 4)
+#define QUERY_DDR_INFO_ECC_MASK 0x3
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ outbox = mailbox->buf;
+
+ err = mthca_cmd_box(dev, 0, mailbox->dma, 0, 0, CMD_QUERY_DDR,
+ CMD_TIME_CLASS_A, status);
+
+ if (err)
+ goto out;
+
+ MTHCA_GET(dev->ddr_start, outbox, QUERY_DDR_START_OFFSET);
+ MTHCA_GET(dev->ddr_end, outbox, QUERY_DDR_END_OFFSET);
+ MTHCA_GET(info, outbox, QUERY_DDR_INFO_OFFSET);
+
+ if (!!(info & QUERY_DDR_INFO_HIDDEN_FLAG) !=
+ !!(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) {
+ mthca_info(dev, "FW reports that HCA-attached memory "
+ "is %s hidden; does not match PCI config\n",
+ (info & QUERY_DDR_INFO_HIDDEN_FLAG) ?
+ "" : "not");
+ }
+ if (info & QUERY_DDR_INFO_HIDDEN_FLAG)
+ mthca_dbg(dev, "HCA-attached memory is hidden.\n");
+
+ mthca_dbg(dev, "HCA memory size %d KB (start %llx, end %llx)\n",
+ (int) ((dev->ddr_end - dev->ddr_start) >> 10),
+ (unsigned long long) dev->ddr_start,
+ (unsigned long long) dev->ddr_end);
+
+out:
+ mthca_free_mailbox(dev, mailbox);
+ return err;
+}
+
+int mthca_QUERY_DEV_LIM(struct mthca_dev *dev,
+ struct mthca_dev_lim *dev_lim, u8 *status)
+{
+ struct mthca_mailbox *mailbox;
+ u32 *outbox;
+ u8 field;
+ u16 size;
+ u16 stat_rate;
+ int err;
+
+#define QUERY_DEV_LIM_OUT_SIZE 0x100
+#define QUERY_DEV_LIM_MAX_SRQ_SZ_OFFSET 0x10
+#define QUERY_DEV_LIM_MAX_QP_SZ_OFFSET 0x11
+#define QUERY_DEV_LIM_RSVD_QP_OFFSET 0x12
+#define QUERY_DEV_LIM_MAX_QP_OFFSET 0x13
+#define QUERY_DEV_LIM_RSVD_SRQ_OFFSET 0x14
+#define QUERY_DEV_LIM_MAX_SRQ_OFFSET 0x15
+#define QUERY_DEV_LIM_RSVD_EEC_OFFSET 0x16
+#define QUERY_DEV_LIM_MAX_EEC_OFFSET 0x17
+#define QUERY_DEV_LIM_MAX_CQ_SZ_OFFSET 0x19
+#define QUERY_DEV_LIM_RSVD_CQ_OFFSET 0x1a
+#define QUERY_DEV_LIM_MAX_CQ_OFFSET 0x1b
+#define QUERY_DEV_LIM_MAX_MPT_OFFSET 0x1d
+#define QUERY_DEV_LIM_RSVD_EQ_OFFSET 0x1e
+#define QUERY_DEV_LIM_MAX_EQ_OFFSET 0x1f
+#define QUERY_DEV_LIM_RSVD_MTT_OFFSET 0x20
+#define QUERY_DEV_LIM_MAX_MRW_SZ_OFFSET 0x21
+#define QUERY_DEV_LIM_RSVD_MRW_OFFSET 0x22
+#define QUERY_DEV_LIM_MAX_MTT_SEG_OFFSET 0x23
+#define QUERY_DEV_LIM_MAX_AV_OFFSET 0x27
+#define QUERY_DEV_LIM_MAX_REQ_QP_OFFSET 0x29
+#define QUERY_DEV_LIM_MAX_RES_QP_OFFSET 0x2b
+#define QUERY_DEV_LIM_MAX_RDMA_OFFSET 0x2f
+#define QUERY_DEV_LIM_RSZ_SRQ_OFFSET 0x33
+#define QUERY_DEV_LIM_ACK_DELAY_OFFSET 0x35
+#define QUERY_DEV_LIM_MTU_WIDTH_OFFSET 0x36
+#define QUERY_DEV_LIM_VL_PORT_OFFSET 0x37
+#define QUERY_DEV_LIM_MAX_GID_OFFSET 0x3b
+#define QUERY_DEV_LIM_RATE_SUPPORT_OFFSET 0x3c
+#define QUERY_DEV_LIM_MAX_PKEY_OFFSET 0x3f
+#define QUERY_DEV_LIM_FLAGS_OFFSET 0x44
+#define QUERY_DEV_LIM_RSVD_UAR_OFFSET 0x48
+#define QUERY_DEV_LIM_UAR_SZ_OFFSET 0x49
+#define QUERY_DEV_LIM_PAGE_SZ_OFFSET 0x4b
+#define QUERY_DEV_LIM_MAX_SG_OFFSET 0x51
+#define QUERY_DEV_LIM_MAX_DESC_SZ_OFFSET 0x52
+#define QUERY_DEV_LIM_MAX_SG_RQ_OFFSET 0x55
+#define QUERY_DEV_LIM_MAX_DESC_SZ_RQ_OFFSET 0x56
+#define QUERY_DEV_LIM_MAX_QP_MCG_OFFSET 0x61
+#define QUERY_DEV_LIM_RSVD_MCG_OFFSET 0x62
+#define QUERY_DEV_LIM_MAX_MCG_OFFSET 0x63
+#define QUERY_DEV_LIM_RSVD_PD_OFFSET 0x64
+#define QUERY_DEV_LIM_MAX_PD_OFFSET 0x65
+#define QUERY_DEV_LIM_RSVD_RDD_OFFSET 0x66
+#define QUERY_DEV_LIM_MAX_RDD_OFFSET 0x67
+#define QUERY_DEV_LIM_EEC_ENTRY_SZ_OFFSET 0x80
+#define QUERY_DEV_LIM_QPC_ENTRY_SZ_OFFSET 0x82
+#define QUERY_DEV_LIM_EEEC_ENTRY_SZ_OFFSET 0x84
+#define QUERY_DEV_LIM_EQPC_ENTRY_SZ_OFFSET 0x86
+#define QUERY_DEV_LIM_EQC_ENTRY_SZ_OFFSET 0x88
+#define QUERY_DEV_LIM_CQC_ENTRY_SZ_OFFSET 0x8a
+#define QUERY_DEV_LIM_SRQ_ENTRY_SZ_OFFSET 0x8c
+#define QUERY_DEV_LIM_UAR_ENTRY_SZ_OFFSET 0x8e
+#define QUERY_DEV_LIM_MTT_ENTRY_SZ_OFFSET 0x90
+#define QUERY_DEV_LIM_MPT_ENTRY_SZ_OFFSET 0x92
+#define QUERY_DEV_LIM_PBL_SZ_OFFSET 0x96
+#define QUERY_DEV_LIM_BMME_FLAGS_OFFSET 0x97
+#define QUERY_DEV_LIM_RSVD_LKEY_OFFSET 0x98
+#define QUERY_DEV_LIM_LAMR_OFFSET 0x9f
+#define QUERY_DEV_LIM_MAX_ICM_SZ_OFFSET 0xa0
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ outbox = mailbox->buf;
+
+ err = mthca_cmd_box(dev, 0, mailbox->dma, 0, 0, CMD_QUERY_DEV_LIM,
+ CMD_TIME_CLASS_A, status);
+
+ if (err)
+ goto out;
+
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_QP_OFFSET);
+ dev_lim->reserved_qps = 1 << (field & 0xf);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_OFFSET);
+ dev_lim->max_qps = 1 << (field & 0x1f);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_SRQ_OFFSET);
+ dev_lim->reserved_srqs = 1 << (field >> 4);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SRQ_OFFSET);
+ dev_lim->max_srqs = 1 << (field & 0x1f);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_EEC_OFFSET);
+ dev_lim->reserved_eecs = 1 << (field & 0xf);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_EEC_OFFSET);
+ dev_lim->max_eecs = 1 << (field & 0x1f);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_CQ_SZ_OFFSET);
+ dev_lim->max_cq_sz = 1 << field;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_CQ_OFFSET);
+ dev_lim->reserved_cqs = 1 << (field & 0xf);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_CQ_OFFSET);
+ dev_lim->max_cqs = 1 << (field & 0x1f);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MPT_OFFSET);
+ dev_lim->max_mpts = 1 << (field & 0x3f);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_EQ_OFFSET);
+ dev_lim->reserved_eqs = 1 << (field & 0xf);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_EQ_OFFSET);
+ dev_lim->max_eqs = 1 << (field & 0x7);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MTT_OFFSET);
+ if (mthca_is_memfree(dev))
+ dev_lim->reserved_mtts = ALIGN((1 << (field >> 4)) * sizeof(u64),
+ dev->limits.mtt_seg_size) / dev->limits.mtt_seg_size;
+ else
+ dev_lim->reserved_mtts = 1 << (field >> 4);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MRW_SZ_OFFSET);
+ dev_lim->max_mrw_sz = 1 << field;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MRW_OFFSET);
+ dev_lim->reserved_mrws = 1 << (field & 0xf);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MTT_SEG_OFFSET);
+ dev_lim->max_mtt_seg = 1 << (field & 0x3f);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_REQ_QP_OFFSET);
+ dev_lim->max_requester_per_qp = 1 << (field & 0x3f);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_RES_QP_OFFSET);
+ dev_lim->max_responder_per_qp = 1 << (field & 0x3f);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_RDMA_OFFSET);
+ dev_lim->max_rdma_global = 1 << (field & 0x3f);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_ACK_DELAY_OFFSET);
+ dev_lim->local_ca_ack_delay = field & 0x1f;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MTU_WIDTH_OFFSET);
+ dev_lim->max_mtu = field >> 4;
+ dev_lim->max_port_width = field & 0xf;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_VL_PORT_OFFSET);
+ dev_lim->max_vl = field >> 4;
+ dev_lim->num_ports = field & 0xf;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_GID_OFFSET);
+ dev_lim->max_gids = 1 << (field & 0xf);
+ MTHCA_GET(stat_rate, outbox, QUERY_DEV_LIM_RATE_SUPPORT_OFFSET);
+ dev_lim->stat_rate_support = stat_rate;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_PKEY_OFFSET);
+ dev_lim->max_pkeys = 1 << (field & 0xf);
+ MTHCA_GET(dev_lim->flags, outbox, QUERY_DEV_LIM_FLAGS_OFFSET);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_UAR_OFFSET);
+ dev_lim->reserved_uars = field >> 4;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_UAR_SZ_OFFSET);
+ dev_lim->uar_size = 1 << ((field & 0x3f) + 20);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_PAGE_SZ_OFFSET);
+ dev_lim->min_page_sz = 1 << field;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SG_OFFSET);
+ dev_lim->max_sg = field;
+
+ MTHCA_GET(size, outbox, QUERY_DEV_LIM_MAX_DESC_SZ_OFFSET);
+ dev_lim->max_desc_sz = size;
+
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_MCG_OFFSET);
+ dev_lim->max_qp_per_mcg = 1 << field;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MCG_OFFSET);
+ dev_lim->reserved_mgms = field & 0xf;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MCG_OFFSET);
+ dev_lim->max_mcgs = 1 << field;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_PD_OFFSET);
+ dev_lim->reserved_pds = field >> 4;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_PD_OFFSET);
+ dev_lim->max_pds = 1 << (field & 0x3f);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_RDD_OFFSET);
+ dev_lim->reserved_rdds = field >> 4;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_RDD_OFFSET);
+ dev_lim->max_rdds = 1 << (field & 0x3f);
+
+ MTHCA_GET(size, outbox, QUERY_DEV_LIM_EEC_ENTRY_SZ_OFFSET);
+ dev_lim->eec_entry_sz = size;
+ MTHCA_GET(size, outbox, QUERY_DEV_LIM_QPC_ENTRY_SZ_OFFSET);
+ dev_lim->qpc_entry_sz = size;
+ MTHCA_GET(size, outbox, QUERY_DEV_LIM_EEEC_ENTRY_SZ_OFFSET);
+ dev_lim->eeec_entry_sz = size;
+ MTHCA_GET(size, outbox, QUERY_DEV_LIM_EQPC_ENTRY_SZ_OFFSET);
+ dev_lim->eqpc_entry_sz = size;
+ MTHCA_GET(size, outbox, QUERY_DEV_LIM_EQC_ENTRY_SZ_OFFSET);
+ dev_lim->eqc_entry_sz = size;
+ MTHCA_GET(size, outbox, QUERY_DEV_LIM_CQC_ENTRY_SZ_OFFSET);
+ dev_lim->cqc_entry_sz = size;
+ MTHCA_GET(size, outbox, QUERY_DEV_LIM_SRQ_ENTRY_SZ_OFFSET);
+ dev_lim->srq_entry_sz = size;
+ MTHCA_GET(size, outbox, QUERY_DEV_LIM_UAR_ENTRY_SZ_OFFSET);
+ dev_lim->uar_scratch_entry_sz = size;
+
+ if (mthca_is_memfree(dev)) {
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SRQ_SZ_OFFSET);
+ dev_lim->max_srq_sz = 1 << field;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_SZ_OFFSET);
+ dev_lim->max_qp_sz = 1 << field;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSZ_SRQ_OFFSET);
+ dev_lim->hca.arbel.resize_srq = field & 1;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SG_RQ_OFFSET);
+ dev_lim->max_sg = min_t(int, field, dev_lim->max_sg);
+ MTHCA_GET(size, outbox, QUERY_DEV_LIM_MAX_DESC_SZ_RQ_OFFSET);
+ dev_lim->max_desc_sz = min_t(int, size, dev_lim->max_desc_sz);
+ MTHCA_GET(size, outbox, QUERY_DEV_LIM_MPT_ENTRY_SZ_OFFSET);
+ dev_lim->mpt_entry_sz = size;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_PBL_SZ_OFFSET);
+ dev_lim->hca.arbel.max_pbl_sz = 1 << (field & 0x3f);
+ MTHCA_GET(dev_lim->hca.arbel.bmme_flags, outbox,
+ QUERY_DEV_LIM_BMME_FLAGS_OFFSET);
+ MTHCA_GET(dev_lim->hca.arbel.reserved_lkey, outbox,
+ QUERY_DEV_LIM_RSVD_LKEY_OFFSET);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_LAMR_OFFSET);
+ dev_lim->hca.arbel.lam_required = field & 1;
+ MTHCA_GET(dev_lim->hca.arbel.max_icm_sz, outbox,
+ QUERY_DEV_LIM_MAX_ICM_SZ_OFFSET);
+
+ if (dev_lim->hca.arbel.bmme_flags & 1)
+ mthca_dbg(dev, "Base MM extensions: yes "
+ "(flags %d, max PBL %d, rsvd L_Key %08x)\n",
+ dev_lim->hca.arbel.bmme_flags,
+ dev_lim->hca.arbel.max_pbl_sz,
+ dev_lim->hca.arbel.reserved_lkey);
+ else
+ mthca_dbg(dev, "Base MM extensions: no\n");
+
+ mthca_dbg(dev, "Max ICM size %lld MB\n",
+ (unsigned long long) dev_lim->hca.arbel.max_icm_sz >> 20);
+ } else {
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SRQ_SZ_OFFSET);
+ dev_lim->max_srq_sz = (1 << field) - 1;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_SZ_OFFSET);
+ dev_lim->max_qp_sz = (1 << field) - 1;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_AV_OFFSET);
+ dev_lim->hca.tavor.max_avs = 1 << (field & 0x3f);
+ dev_lim->mpt_entry_sz = MTHCA_MPT_ENTRY_SIZE;
+ }
+
+ mthca_dbg(dev, "Max QPs: %d, reserved QPs: %d, entry size: %d\n",
+ dev_lim->max_qps, dev_lim->reserved_qps, dev_lim->qpc_entry_sz);
+ mthca_dbg(dev, "Max SRQs: %d, reserved SRQs: %d, entry size: %d\n",
+ dev_lim->max_srqs, dev_lim->reserved_srqs, dev_lim->srq_entry_sz);
+ mthca_dbg(dev, "Max CQs: %d, reserved CQs: %d, entry size: %d\n",
+ dev_lim->max_cqs, dev_lim->reserved_cqs, dev_lim->cqc_entry_sz);
+ mthca_dbg(dev, "Max EQs: %d, reserved EQs: %d, entry size: %d\n",
+ dev_lim->max_eqs, dev_lim->reserved_eqs, dev_lim->eqc_entry_sz);
+ mthca_dbg(dev, "reserved MPTs: %d, reserved MTTs: %d\n",
+ dev_lim->reserved_mrws, dev_lim->reserved_mtts);
+ mthca_dbg(dev, "Max PDs: %d, reserved PDs: %d, reserved UARs: %d\n",
+ dev_lim->max_pds, dev_lim->reserved_pds, dev_lim->reserved_uars);
+ mthca_dbg(dev, "Max QP/MCG: %d, reserved MGMs: %d\n",
+ dev_lim->max_pds, dev_lim->reserved_mgms);
+ mthca_dbg(dev, "Max CQEs: %d, max WQEs: %d, max SRQ WQEs: %d\n",
+ dev_lim->max_cq_sz, dev_lim->max_qp_sz, dev_lim->max_srq_sz);
+
+ mthca_dbg(dev, "Flags: %08x\n", dev_lim->flags);
+
+out:
+ mthca_free_mailbox(dev, mailbox);
+ return err;
+}
+
+static void get_board_id(void *vsd, char *board_id)
+{
+ int i;
+
+#define VSD_OFFSET_SIG1 0x00
+#define VSD_OFFSET_SIG2 0xde
+#define VSD_OFFSET_MLX_BOARD_ID 0xd0
+#define VSD_OFFSET_TS_BOARD_ID 0x20
+
+#define VSD_SIGNATURE_TOPSPIN 0x5ad
+
+ memset(board_id, 0, MTHCA_BOARD_ID_LEN);
+
+ if (be16_to_cpup(vsd + VSD_OFFSET_SIG1) == VSD_SIGNATURE_TOPSPIN &&
+ be16_to_cpup(vsd + VSD_OFFSET_SIG2) == VSD_SIGNATURE_TOPSPIN) {
+ strlcpy(board_id, vsd + VSD_OFFSET_TS_BOARD_ID, MTHCA_BOARD_ID_LEN);
+ } else {
+ /*
+ * The board ID is a string but the firmware byte
+ * swaps each 4-byte word before passing it back to
+ * us. Therefore we need to swab it before printing.
+ */
+ for (i = 0; i < 4; ++i)
+ ((u32 *) board_id)[i] =
+ swab32(*(u32 *) (vsd + VSD_OFFSET_MLX_BOARD_ID + i * 4));
+ }
+}
+
+int mthca_QUERY_ADAPTER(struct mthca_dev *dev,
+ struct mthca_adapter *adapter, u8 *status)
+{
+ struct mthca_mailbox *mailbox;
+ u32 *outbox;
+ int err;
+
+#define QUERY_ADAPTER_OUT_SIZE 0x100
+#define QUERY_ADAPTER_VENDOR_ID_OFFSET 0x00
+#define QUERY_ADAPTER_DEVICE_ID_OFFSET 0x04
+#define QUERY_ADAPTER_REVISION_ID_OFFSET 0x08
+#define QUERY_ADAPTER_INTA_PIN_OFFSET 0x10
+#define QUERY_ADAPTER_VSD_OFFSET 0x20
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ outbox = mailbox->buf;
+
+ err = mthca_cmd_box(dev, 0, mailbox->dma, 0, 0, CMD_QUERY_ADAPTER,
+ CMD_TIME_CLASS_A, status);
+
+ if (err)
+ goto out;
+
+ if (!mthca_is_memfree(dev)) {
+ MTHCA_GET(adapter->vendor_id, outbox,
+ QUERY_ADAPTER_VENDOR_ID_OFFSET);
+ MTHCA_GET(adapter->device_id, outbox,
+ QUERY_ADAPTER_DEVICE_ID_OFFSET);
+ MTHCA_GET(adapter->revision_id, outbox,
+ QUERY_ADAPTER_REVISION_ID_OFFSET);
+ }
+ MTHCA_GET(adapter->inta_pin, outbox, QUERY_ADAPTER_INTA_PIN_OFFSET);
+
+ get_board_id(outbox + QUERY_ADAPTER_VSD_OFFSET / 4,
+ adapter->board_id);
+
+out:
+ mthca_free_mailbox(dev, mailbox);
+ return err;
+}
+
+int mthca_INIT_HCA(struct mthca_dev *dev,
+ struct mthca_init_hca_param *param,
+ u8 *status)
+{
+ struct mthca_mailbox *mailbox;
+ __be32 *inbox;
+ int err;
+
+#define INIT_HCA_IN_SIZE 0x200
+#define INIT_HCA_FLAGS1_OFFSET 0x00c
+#define INIT_HCA_FLAGS2_OFFSET 0x014
+#define INIT_HCA_QPC_OFFSET 0x020
+#define INIT_HCA_QPC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x10)
+#define INIT_HCA_LOG_QP_OFFSET (INIT_HCA_QPC_OFFSET + 0x17)
+#define INIT_HCA_EEC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x20)
+#define INIT_HCA_LOG_EEC_OFFSET (INIT_HCA_QPC_OFFSET + 0x27)
+#define INIT_HCA_SRQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x28)
+#define INIT_HCA_LOG_SRQ_OFFSET (INIT_HCA_QPC_OFFSET + 0x2f)
+#define INIT_HCA_CQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x30)
+#define INIT_HCA_LOG_CQ_OFFSET (INIT_HCA_QPC_OFFSET + 0x37)
+#define INIT_HCA_EQPC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x40)
+#define INIT_HCA_EEEC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x50)
+#define INIT_HCA_EQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x60)
+#define INIT_HCA_LOG_EQ_OFFSET (INIT_HCA_QPC_OFFSET + 0x67)
+#define INIT_HCA_RDB_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x70)
+#define INIT_HCA_UDAV_OFFSET 0x0b0
+#define INIT_HCA_UDAV_LKEY_OFFSET (INIT_HCA_UDAV_OFFSET + 0x0)
+#define INIT_HCA_UDAV_PD_OFFSET (INIT_HCA_UDAV_OFFSET + 0x4)
+#define INIT_HCA_MCAST_OFFSET 0x0c0
+#define INIT_HCA_MC_BASE_OFFSET (INIT_HCA_MCAST_OFFSET + 0x00)
+#define INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x12)
+#define INIT_HCA_MC_HASH_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x16)
+#define INIT_HCA_LOG_MC_TABLE_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x1b)
+#define INIT_HCA_TPT_OFFSET 0x0f0
+#define INIT_HCA_MPT_BASE_OFFSET (INIT_HCA_TPT_OFFSET + 0x00)
+#define INIT_HCA_MTT_SEG_SZ_OFFSET (INIT_HCA_TPT_OFFSET + 0x09)
+#define INIT_HCA_LOG_MPT_SZ_OFFSET (INIT_HCA_TPT_OFFSET + 0x0b)
+#define INIT_HCA_MTT_BASE_OFFSET (INIT_HCA_TPT_OFFSET + 0x10)
+#define INIT_HCA_UAR_OFFSET 0x120
+#define INIT_HCA_UAR_BASE_OFFSET (INIT_HCA_UAR_OFFSET + 0x00)
+#define INIT_HCA_UARC_SZ_OFFSET (INIT_HCA_UAR_OFFSET + 0x09)
+#define INIT_HCA_LOG_UAR_SZ_OFFSET (INIT_HCA_UAR_OFFSET + 0x0a)
+#define INIT_HCA_UAR_PAGE_SZ_OFFSET (INIT_HCA_UAR_OFFSET + 0x0b)
+#define INIT_HCA_UAR_SCATCH_BASE_OFFSET (INIT_HCA_UAR_OFFSET + 0x10)
+#define INIT_HCA_UAR_CTX_BASE_OFFSET (INIT_HCA_UAR_OFFSET + 0x18)
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ inbox = mailbox->buf;
+
+ memset(inbox, 0, INIT_HCA_IN_SIZE);
+
+ if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT)
+ MTHCA_PUT(inbox, 0x1, INIT_HCA_FLAGS1_OFFSET);
+
+#if defined(__LITTLE_ENDIAN)
+ *(inbox + INIT_HCA_FLAGS2_OFFSET / 4) &= ~cpu_to_be32(1 << 1);
+#elif defined(__BIG_ENDIAN)
+ *(inbox + INIT_HCA_FLAGS2_OFFSET / 4) |= cpu_to_be32(1 << 1);
+#else
+#error Host endianness not defined
+#endif
+ /* Check port for UD address vector: */
+ *(inbox + INIT_HCA_FLAGS2_OFFSET / 4) |= cpu_to_be32(1);
+
+ /* Enable IPoIB checksumming if we can: */
+ if (dev->device_cap_flags & IB_DEVICE_UD_IP_CSUM)
+ *(inbox + INIT_HCA_FLAGS2_OFFSET / 4) |= cpu_to_be32(7 << 3);
+
+ /* We leave wqe_quota, responder_exu, etc as 0 (default) */
+
+ /* QPC/EEC/CQC/EQC/RDB attributes */
+
+ MTHCA_PUT(inbox, param->qpc_base, INIT_HCA_QPC_BASE_OFFSET);
+ MTHCA_PUT(inbox, param->log_num_qps, INIT_HCA_LOG_QP_OFFSET);
+ MTHCA_PUT(inbox, param->eec_base, INIT_HCA_EEC_BASE_OFFSET);
+ MTHCA_PUT(inbox, param->log_num_eecs, INIT_HCA_LOG_EEC_OFFSET);
+ MTHCA_PUT(inbox, param->srqc_base, INIT_HCA_SRQC_BASE_OFFSET);
+ MTHCA_PUT(inbox, param->log_num_srqs, INIT_HCA_LOG_SRQ_OFFSET);
+ MTHCA_PUT(inbox, param->cqc_base, INIT_HCA_CQC_BASE_OFFSET);
+ MTHCA_PUT(inbox, param->log_num_cqs, INIT_HCA_LOG_CQ_OFFSET);
+ MTHCA_PUT(inbox, param->eqpc_base, INIT_HCA_EQPC_BASE_OFFSET);
+ MTHCA_PUT(inbox, param->eeec_base, INIT_HCA_EEEC_BASE_OFFSET);
+ MTHCA_PUT(inbox, param->eqc_base, INIT_HCA_EQC_BASE_OFFSET);
+ MTHCA_PUT(inbox, param->log_num_eqs, INIT_HCA_LOG_EQ_OFFSET);
+ MTHCA_PUT(inbox, param->rdb_base, INIT_HCA_RDB_BASE_OFFSET);
+
+ /* UD AV attributes */
+
+ /* multicast attributes */
+
+ MTHCA_PUT(inbox, param->mc_base, INIT_HCA_MC_BASE_OFFSET);
+ MTHCA_PUT(inbox, param->log_mc_entry_sz, INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET);
+ MTHCA_PUT(inbox, param->mc_hash_sz, INIT_HCA_MC_HASH_SZ_OFFSET);
+ MTHCA_PUT(inbox, param->log_mc_table_sz, INIT_HCA_LOG_MC_TABLE_SZ_OFFSET);
+
+ /* TPT attributes */
+
+ MTHCA_PUT(inbox, param->mpt_base, INIT_HCA_MPT_BASE_OFFSET);
+ if (!mthca_is_memfree(dev))
+ MTHCA_PUT(inbox, param->mtt_seg_sz, INIT_HCA_MTT_SEG_SZ_OFFSET);
+ MTHCA_PUT(inbox, param->log_mpt_sz, INIT_HCA_LOG_MPT_SZ_OFFSET);
+ MTHCA_PUT(inbox, param->mtt_base, INIT_HCA_MTT_BASE_OFFSET);
+
+ /* UAR attributes */
+ {
+ u8 uar_page_sz = PAGE_SHIFT - 12;
+ MTHCA_PUT(inbox, uar_page_sz, INIT_HCA_UAR_PAGE_SZ_OFFSET);
+ }
+
+ MTHCA_PUT(inbox, param->uar_scratch_base, INIT_HCA_UAR_SCATCH_BASE_OFFSET);
+
+ if (mthca_is_memfree(dev)) {
+ MTHCA_PUT(inbox, param->log_uarc_sz, INIT_HCA_UARC_SZ_OFFSET);
+ MTHCA_PUT(inbox, param->log_uar_sz, INIT_HCA_LOG_UAR_SZ_OFFSET);
+ MTHCA_PUT(inbox, param->uarc_base, INIT_HCA_UAR_CTX_BASE_OFFSET);
+ }
+
+ err = mthca_cmd(dev, mailbox->dma, 0, 0, CMD_INIT_HCA, CMD_TIME_CLASS_D, status);
+
+ mthca_free_mailbox(dev, mailbox);
+ return err;
+}
+
+int mthca_INIT_IB(struct mthca_dev *dev,
+ struct mthca_init_ib_param *param,
+ int port, u8 *status)
+{
+ struct mthca_mailbox *mailbox;
+ u32 *inbox;
+ int err;
+ u32 flags;
+
+#define INIT_IB_IN_SIZE 56
+#define INIT_IB_FLAGS_OFFSET 0x00
+#define INIT_IB_FLAG_SIG (1 << 18)
+#define INIT_IB_FLAG_NG (1 << 17)
+#define INIT_IB_FLAG_G0 (1 << 16)
+#define INIT_IB_VL_SHIFT 4
+#define INIT_IB_PORT_WIDTH_SHIFT 8
+#define INIT_IB_MTU_SHIFT 12
+#define INIT_IB_MAX_GID_OFFSET 0x06
+#define INIT_IB_MAX_PKEY_OFFSET 0x0a
+#define INIT_IB_GUID0_OFFSET 0x10
+#define INIT_IB_NODE_GUID_OFFSET 0x18
+#define INIT_IB_SI_GUID_OFFSET 0x20
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ inbox = mailbox->buf;
+
+ memset(inbox, 0, INIT_IB_IN_SIZE);
+
+ flags = 0;
+ flags |= param->set_guid0 ? INIT_IB_FLAG_G0 : 0;
+ flags |= param->set_node_guid ? INIT_IB_FLAG_NG : 0;
+ flags |= param->set_si_guid ? INIT_IB_FLAG_SIG : 0;
+ flags |= param->vl_cap << INIT_IB_VL_SHIFT;
+ flags |= param->port_width << INIT_IB_PORT_WIDTH_SHIFT;
+ flags |= param->mtu_cap << INIT_IB_MTU_SHIFT;
+ MTHCA_PUT(inbox, flags, INIT_IB_FLAGS_OFFSET);
+
+ MTHCA_PUT(inbox, param->gid_cap, INIT_IB_MAX_GID_OFFSET);
+ MTHCA_PUT(inbox, param->pkey_cap, INIT_IB_MAX_PKEY_OFFSET);
+ MTHCA_PUT(inbox, param->guid0, INIT_IB_GUID0_OFFSET);
+ MTHCA_PUT(inbox, param->node_guid, INIT_IB_NODE_GUID_OFFSET);
+ MTHCA_PUT(inbox, param->si_guid, INIT_IB_SI_GUID_OFFSET);
+
+ err = mthca_cmd(dev, mailbox->dma, port, 0, CMD_INIT_IB,
+ CMD_TIME_CLASS_A, status);
+
+ mthca_free_mailbox(dev, mailbox);
+ return err;
+}
+
+int mthca_CLOSE_IB(struct mthca_dev *dev, int port, u8 *status)
+{
+ return mthca_cmd(dev, 0, port, 0, CMD_CLOSE_IB, CMD_TIME_CLASS_A, status);
+}
+
+int mthca_CLOSE_HCA(struct mthca_dev *dev, int panic, u8 *status)
+{
+ return mthca_cmd(dev, 0, 0, panic, CMD_CLOSE_HCA, CMD_TIME_CLASS_C, status);
+}
+
+int mthca_SET_IB(struct mthca_dev *dev, struct mthca_set_ib_param *param,
+ int port, u8 *status)
+{
+ struct mthca_mailbox *mailbox;
+ u32 *inbox;
+ int err;
+ u32 flags = 0;
+
+#define SET_IB_IN_SIZE 0x40
+#define SET_IB_FLAGS_OFFSET 0x00
+#define SET_IB_FLAG_SIG (1 << 18)
+#define SET_IB_FLAG_RQK (1 << 0)
+#define SET_IB_CAP_MASK_OFFSET 0x04
+#define SET_IB_SI_GUID_OFFSET 0x08
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ inbox = mailbox->buf;
+
+ memset(inbox, 0, SET_IB_IN_SIZE);
+
+ flags |= param->set_si_guid ? SET_IB_FLAG_SIG : 0;
+ flags |= param->reset_qkey_viol ? SET_IB_FLAG_RQK : 0;
+ MTHCA_PUT(inbox, flags, SET_IB_FLAGS_OFFSET);
+
+ MTHCA_PUT(inbox, param->cap_mask, SET_IB_CAP_MASK_OFFSET);
+ MTHCA_PUT(inbox, param->si_guid, SET_IB_SI_GUID_OFFSET);
+
+ err = mthca_cmd(dev, mailbox->dma, port, 0, CMD_SET_IB,
+ CMD_TIME_CLASS_B, status);
+
+ mthca_free_mailbox(dev, mailbox);
+ return err;
+}
+
+int mthca_MAP_ICM(struct mthca_dev *dev, struct mthca_icm *icm, u64 virt, u8 *status)
+{
+ return mthca_map_cmd(dev, CMD_MAP_ICM, icm, virt, status);
+}
+
+int mthca_MAP_ICM_page(struct mthca_dev *dev, u64 dma_addr, u64 virt, u8 *status)
+{
+ struct mthca_mailbox *mailbox;
+ __be64 *inbox;
+ int err;
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ inbox = mailbox->buf;
+
+ inbox[0] = cpu_to_be64(virt);
+ inbox[1] = cpu_to_be64(dma_addr);
+
+ err = mthca_cmd(dev, mailbox->dma, 1, 0, CMD_MAP_ICM,
+ CMD_TIME_CLASS_B, status);
+
+ mthca_free_mailbox(dev, mailbox);
+
+ if (!err)
+ mthca_dbg(dev, "Mapped page at %llx to %llx for ICM.\n",
+ (unsigned long long) dma_addr, (unsigned long long) virt);
+
+ return err;
+}
+
+int mthca_UNMAP_ICM(struct mthca_dev *dev, u64 virt, u32 page_count, u8 *status)
+{
+ mthca_dbg(dev, "Unmapping %d pages at %llx from ICM.\n",
+ page_count, (unsigned long long) virt);
+
+ return mthca_cmd(dev, virt, page_count, 0, CMD_UNMAP_ICM, CMD_TIME_CLASS_B, status);
+}
+
+int mthca_MAP_ICM_AUX(struct mthca_dev *dev, struct mthca_icm *icm, u8 *status)
+{
+ return mthca_map_cmd(dev, CMD_MAP_ICM_AUX, icm, -1, status);
+}
+
+int mthca_UNMAP_ICM_AUX(struct mthca_dev *dev, u8 *status)
+{
+ return mthca_cmd(dev, 0, 0, 0, CMD_UNMAP_ICM_AUX, CMD_TIME_CLASS_B, status);
+}
+
+int mthca_SET_ICM_SIZE(struct mthca_dev *dev, u64 icm_size, u64 *aux_pages,
+ u8 *status)
+{
+ int ret = mthca_cmd_imm(dev, icm_size, aux_pages, 0, 0, CMD_SET_ICM_SIZE,
+ CMD_TIME_CLASS_A, status);
+
+ if (ret || status)
+ return ret;
+
+ /*
+ * Round up number of system pages needed in case
+ * MTHCA_ICM_PAGE_SIZE < PAGE_SIZE.
+ */
+ *aux_pages = ALIGN(*aux_pages, PAGE_SIZE / MTHCA_ICM_PAGE_SIZE) >>
+ (PAGE_SHIFT - MTHCA_ICM_PAGE_SHIFT);
+
+ return 0;
+}
+
+int mthca_SW2HW_MPT(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+ int mpt_index, u8 *status)
+{
+ return mthca_cmd(dev, mailbox->dma, mpt_index, 0, CMD_SW2HW_MPT,
+ CMD_TIME_CLASS_B, status);
+}
+
+int mthca_HW2SW_MPT(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+ int mpt_index, u8 *status)
+{
+ return mthca_cmd_box(dev, 0, mailbox ? mailbox->dma : 0, mpt_index,
+ !mailbox, CMD_HW2SW_MPT,
+ CMD_TIME_CLASS_B, status);
+}
+
+int mthca_WRITE_MTT(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+ int num_mtt, u8 *status)
+{
+ return mthca_cmd(dev, mailbox->dma, num_mtt, 0, CMD_WRITE_MTT,
+ CMD_TIME_CLASS_B, status);
+}
+
+int mthca_SYNC_TPT(struct mthca_dev *dev, u8 *status)
+{
+ return mthca_cmd(dev, 0, 0, 0, CMD_SYNC_TPT, CMD_TIME_CLASS_B, status);
+}
+
+int mthca_MAP_EQ(struct mthca_dev *dev, u64 event_mask, int unmap,
+ int eq_num, u8 *status)
+{
+ mthca_dbg(dev, "%s mask %016llx for eqn %d\n",
+ unmap ? "Clearing" : "Setting",
+ (unsigned long long) event_mask, eq_num);
+ return mthca_cmd(dev, event_mask, (unmap << 31) | eq_num,
+ 0, CMD_MAP_EQ, CMD_TIME_CLASS_B, status);
+}
+
+int mthca_SW2HW_EQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+ int eq_num, u8 *status)
+{
+ return mthca_cmd(dev, mailbox->dma, eq_num, 0, CMD_SW2HW_EQ,
+ CMD_TIME_CLASS_A, status);
+}
+
+int mthca_HW2SW_EQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+ int eq_num, u8 *status)
+{
+ return mthca_cmd_box(dev, 0, mailbox->dma, eq_num, 0,
+ CMD_HW2SW_EQ,
+ CMD_TIME_CLASS_A, status);
+}
+
+int mthca_SW2HW_CQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+ int cq_num, u8 *status)
+{
+ return mthca_cmd(dev, mailbox->dma, cq_num, 0, CMD_SW2HW_CQ,
+ CMD_TIME_CLASS_A, status);
+}
+
+int mthca_HW2SW_CQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+ int cq_num, u8 *status)
+{
+ return mthca_cmd_box(dev, 0, mailbox->dma, cq_num, 0,
+ CMD_HW2SW_CQ,
+ CMD_TIME_CLASS_A, status);
+}
+
+int mthca_RESIZE_CQ(struct mthca_dev *dev, int cq_num, u32 lkey, u8 log_size,
+ u8 *status)
+{
+ struct mthca_mailbox *mailbox;
+ __be32 *inbox;
+ int err;
+
+#define RESIZE_CQ_IN_SIZE 0x40
+#define RESIZE_CQ_LOG_SIZE_OFFSET 0x0c
+#define RESIZE_CQ_LKEY_OFFSET 0x1c
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ inbox = mailbox->buf;
+
+ memset(inbox, 0, RESIZE_CQ_IN_SIZE);
+ /*
+ * Leave start address fields zeroed out -- mthca assumes that
+ * MRs for CQs always start at virtual address 0.
+ */
+ MTHCA_PUT(inbox, log_size, RESIZE_CQ_LOG_SIZE_OFFSET);
+ MTHCA_PUT(inbox, lkey, RESIZE_CQ_LKEY_OFFSET);
+
+ err = mthca_cmd(dev, mailbox->dma, cq_num, 1, CMD_RESIZE_CQ,
+ CMD_TIME_CLASS_B, status);
+
+ mthca_free_mailbox(dev, mailbox);
+ return err;
+}
+
+int mthca_SW2HW_SRQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+ int srq_num, u8 *status)
+{
+ return mthca_cmd(dev, mailbox->dma, srq_num, 0, CMD_SW2HW_SRQ,
+ CMD_TIME_CLASS_A, status);
+}
+
+int mthca_HW2SW_SRQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+ int srq_num, u8 *status)
+{
+ return mthca_cmd_box(dev, 0, mailbox->dma, srq_num, 0,
+ CMD_HW2SW_SRQ,
+ CMD_TIME_CLASS_A, status);
+}
+
+int mthca_QUERY_SRQ(struct mthca_dev *dev, u32 num,
+ struct mthca_mailbox *mailbox, u8 *status)
+{
+ return mthca_cmd_box(dev, 0, mailbox->dma, num, 0,
+ CMD_QUERY_SRQ, CMD_TIME_CLASS_A, status);
+}
+
+int mthca_ARM_SRQ(struct mthca_dev *dev, int srq_num, int limit, u8 *status)
+{
+ return mthca_cmd(dev, limit, srq_num, 0, CMD_ARM_SRQ,
+ CMD_TIME_CLASS_B, status);
+}
+
+int mthca_MODIFY_QP(struct mthca_dev *dev, enum ib_qp_state cur,
+ enum ib_qp_state next, u32 num, int is_ee,
+ struct mthca_mailbox *mailbox, u32 optmask,
+ u8 *status)
+{
+ static const u16 op[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = {
+ [IB_QPS_RESET] = {
+ [IB_QPS_RESET] = CMD_ERR2RST_QPEE,
+ [IB_QPS_ERR] = CMD_2ERR_QPEE,
+ [IB_QPS_INIT] = CMD_RST2INIT_QPEE,
+ },
+ [IB_QPS_INIT] = {
+ [IB_QPS_RESET] = CMD_ERR2RST_QPEE,
+ [IB_QPS_ERR] = CMD_2ERR_QPEE,
+ [IB_QPS_INIT] = CMD_INIT2INIT_QPEE,
+ [IB_QPS_RTR] = CMD_INIT2RTR_QPEE,
+ },
+ [IB_QPS_RTR] = {
+ [IB_QPS_RESET] = CMD_ERR2RST_QPEE,
+ [IB_QPS_ERR] = CMD_2ERR_QPEE,
+ [IB_QPS_RTS] = CMD_RTR2RTS_QPEE,
+ },
+ [IB_QPS_RTS] = {
+ [IB_QPS_RESET] = CMD_ERR2RST_QPEE,
+ [IB_QPS_ERR] = CMD_2ERR_QPEE,
+ [IB_QPS_RTS] = CMD_RTS2RTS_QPEE,
+ [IB_QPS_SQD] = CMD_RTS2SQD_QPEE,
+ },
+ [IB_QPS_SQD] = {
+ [IB_QPS_RESET] = CMD_ERR2RST_QPEE,
+ [IB_QPS_ERR] = CMD_2ERR_QPEE,
+ [IB_QPS_RTS] = CMD_SQD2RTS_QPEE,
+ [IB_QPS_SQD] = CMD_SQD2SQD_QPEE,
+ },
+ [IB_QPS_SQE] = {
+ [IB_QPS_RESET] = CMD_ERR2RST_QPEE,
+ [IB_QPS_ERR] = CMD_2ERR_QPEE,
+ [IB_QPS_RTS] = CMD_SQERR2RTS_QPEE,
+ },
+ [IB_QPS_ERR] = {
+ [IB_QPS_RESET] = CMD_ERR2RST_QPEE,
+ [IB_QPS_ERR] = CMD_2ERR_QPEE,
+ }
+ };
+
+ u8 op_mod = 0;
+ int my_mailbox = 0;
+ int err;
+
+ if (op[cur][next] == CMD_ERR2RST_QPEE) {
+ op_mod = 3; /* don't write outbox, any->reset */
+
+ /* For debugging */
+ if (!mailbox) {
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (!IS_ERR(mailbox)) {
+ my_mailbox = 1;
+ op_mod = 2; /* write outbox, any->reset */
+ } else
+ mailbox = NULL;
+ }
+
+ err = mthca_cmd_box(dev, 0, mailbox ? mailbox->dma : 0,
+ (!!is_ee << 24) | num, op_mod,
+ op[cur][next], CMD_TIME_CLASS_C, status);
+
+ if (0 && mailbox) {
+ int i;
+ mthca_dbg(dev, "Dumping QP context:\n");
+ printk(" %08x\n", be32_to_cpup(mailbox->buf));
+ for (i = 0; i < 0x100 / 4; ++i) {
+ if (i % 8 == 0)
+ printk("[%02x] ", i * 4);
+ printk(" %08x",
+ be32_to_cpu(((__be32 *) mailbox->buf)[i + 2]));
+ if ((i + 1) % 8 == 0)
+ printk("\n");
+ }
+ }
+
+ if (my_mailbox)
+ mthca_free_mailbox(dev, mailbox);
+ } else {
+ if (0) {
+ int i;
+ mthca_dbg(dev, "Dumping QP context:\n");
+ printk(" opt param mask: %08x\n", be32_to_cpup(mailbox->buf));
+ for (i = 0; i < 0x100 / 4; ++i) {
+ if (i % 8 == 0)
+ printk(" [%02x] ", i * 4);
+ printk(" %08x",
+ be32_to_cpu(((__be32 *) mailbox->buf)[i + 2]));
+ if ((i + 1) % 8 == 0)
+ printk("\n");
+ }
+ }
+
+ err = mthca_cmd(dev, mailbox->dma, optmask | (!!is_ee << 24) | num,
+ op_mod, op[cur][next], CMD_TIME_CLASS_C, status);
+ }
+
+ return err;
+}
+
+int mthca_QUERY_QP(struct mthca_dev *dev, u32 num, int is_ee,
+ struct mthca_mailbox *mailbox, u8 *status)
+{
+ return mthca_cmd_box(dev, 0, mailbox->dma, (!!is_ee << 24) | num, 0,
+ CMD_QUERY_QPEE, CMD_TIME_CLASS_A, status);
+}
+
+int mthca_CONF_SPECIAL_QP(struct mthca_dev *dev, int type, u32 qpn,
+ u8 *status)
+{
+ u8 op_mod;
+
+ switch (type) {
+ case IB_QPT_SMI:
+ op_mod = 0;
+ break;
+ case IB_QPT_GSI:
+ op_mod = 1;
+ break;
+ case IB_QPT_RAW_IPV6:
+ op_mod = 2;
+ break;
+ case IB_QPT_RAW_ETY:
+ op_mod = 3;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return mthca_cmd(dev, 0, qpn, op_mod, CMD_CONF_SPECIAL_QP,
+ CMD_TIME_CLASS_B, status);
+}
+
+int mthca_MAD_IFC(struct mthca_dev *dev, int ignore_mkey, int ignore_bkey,
+ int port, struct ib_wc *in_wc, struct ib_grh *in_grh,
+ void *in_mad, void *response_mad, u8 *status)
+{
+ struct mthca_mailbox *inmailbox, *outmailbox;
+ void *inbox;
+ int err;
+ u32 in_modifier = port;
+ u8 op_modifier = 0;
+
+#define MAD_IFC_BOX_SIZE 0x400
+#define MAD_IFC_MY_QPN_OFFSET 0x100
+#define MAD_IFC_RQPN_OFFSET 0x108
+#define MAD_IFC_SL_OFFSET 0x10c
+#define MAD_IFC_G_PATH_OFFSET 0x10d
+#define MAD_IFC_RLID_OFFSET 0x10e
+#define MAD_IFC_PKEY_OFFSET 0x112
+#define MAD_IFC_GRH_OFFSET 0x140
+
+ inmailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(inmailbox))
+ return PTR_ERR(inmailbox);
+ inbox = inmailbox->buf;
+
+ outmailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(outmailbox)) {
+ mthca_free_mailbox(dev, inmailbox);
+ return PTR_ERR(outmailbox);
+ }
+
+ memcpy(inbox, in_mad, 256);
+
+ /*
+ * Key check traps can't be generated unless we have in_wc to
+ * tell us where to send the trap.
+ */
+ if (ignore_mkey || !in_wc)
+ op_modifier |= 0x1;
+ if (ignore_bkey || !in_wc)
+ op_modifier |= 0x2;
+
+ if (in_wc) {
+ u8 val;
+
+ memset(inbox + 256, 0, 256);
+
+ MTHCA_PUT(inbox, in_wc->qp->qp_num, MAD_IFC_MY_QPN_OFFSET);
+ MTHCA_PUT(inbox, in_wc->src_qp, MAD_IFC_RQPN_OFFSET);
+
+ val = in_wc->sl << 4;
+ MTHCA_PUT(inbox, val, MAD_IFC_SL_OFFSET);
+
+ val = in_wc->dlid_path_bits |
+ (in_wc->wc_flags & IB_WC_GRH ? 0x80 : 0);
+ MTHCA_PUT(inbox, val, MAD_IFC_G_PATH_OFFSET);
+
+ MTHCA_PUT(inbox, in_wc->slid, MAD_IFC_RLID_OFFSET);
+ MTHCA_PUT(inbox, in_wc->pkey_index, MAD_IFC_PKEY_OFFSET);
+
+ if (in_grh)
+ memcpy(inbox + MAD_IFC_GRH_OFFSET, in_grh, 40);
+
+ op_modifier |= 0x4;
+
+ in_modifier |= in_wc->slid << 16;
+ }
+
+ err = mthca_cmd_box(dev, inmailbox->dma, outmailbox->dma,
+ in_modifier, op_modifier,
+ CMD_MAD_IFC, CMD_TIME_CLASS_C, status);
+
+ if (!err && !*status)
+ memcpy(response_mad, outmailbox->buf, 256);
+
+ mthca_free_mailbox(dev, inmailbox);
+ mthca_free_mailbox(dev, outmailbox);
+ return err;
+}
+
+int mthca_READ_MGM(struct mthca_dev *dev, int index,
+ struct mthca_mailbox *mailbox, u8 *status)
+{
+ return mthca_cmd_box(dev, 0, mailbox->dma, index, 0,
+ CMD_READ_MGM, CMD_TIME_CLASS_A, status);
+}
+
+int mthca_WRITE_MGM(struct mthca_dev *dev, int index,
+ struct mthca_mailbox *mailbox, u8 *status)
+{
+ return mthca_cmd(dev, mailbox->dma, index, 0, CMD_WRITE_MGM,
+ CMD_TIME_CLASS_A, status);
+}
+
+int mthca_MGID_HASH(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+ u16 *hash, u8 *status)
+{
+ u64 imm;
+ int err;
+
+ err = mthca_cmd_imm(dev, mailbox->dma, &imm, 0, 0, CMD_MGID_HASH,
+ CMD_TIME_CLASS_A, status);
+
+ *hash = imm;
+ return err;
+}
+
+int mthca_NOP(struct mthca_dev *dev, u8 *status)
+{
+ return mthca_cmd(dev, 0, 0x1f, 0, CMD_NOP, msecs_to_jiffies(100), status);
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_cmd.h b/sys/ofed/drivers/infiniband/hw/mthca/mthca_cmd.h
new file mode 100644
index 0000000..6efd326
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_cmd.h
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006 Cisco Systems. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MTHCA_CMD_H
+#define MTHCA_CMD_H
+
+#include <rdma/ib_verbs.h>
+
+#define MTHCA_MAILBOX_SIZE 4096
+
+enum {
+ /* command completed successfully: */
+ MTHCA_CMD_STAT_OK = 0x00,
+ /* Internal error (such as a bus error) occurred while processing command: */
+ MTHCA_CMD_STAT_INTERNAL_ERR = 0x01,
+ /* Operation/command not supported or opcode modifier not supported: */
+ MTHCA_CMD_STAT_BAD_OP = 0x02,
+ /* Parameter not supported or parameter out of range: */
+ MTHCA_CMD_STAT_BAD_PARAM = 0x03,
+ /* System not enabled or bad system state: */
+ MTHCA_CMD_STAT_BAD_SYS_STATE = 0x04,
+ /* Attempt to access reserved or unallocaterd resource: */
+ MTHCA_CMD_STAT_BAD_RESOURCE = 0x05,
+ /* Requested resource is currently executing a command, or is otherwise busy: */
+ MTHCA_CMD_STAT_RESOURCE_BUSY = 0x06,
+ /* memory error: */
+ MTHCA_CMD_STAT_DDR_MEM_ERR = 0x07,
+ /* Required capability exceeds device limits: */
+ MTHCA_CMD_STAT_EXCEED_LIM = 0x08,
+ /* Resource is not in the appropriate state or ownership: */
+ MTHCA_CMD_STAT_BAD_RES_STATE = 0x09,
+ /* Index out of range: */
+ MTHCA_CMD_STAT_BAD_INDEX = 0x0a,
+ /* FW image corrupted: */
+ MTHCA_CMD_STAT_BAD_NVMEM = 0x0b,
+ /* Attempt to modify a QP/EE which is not in the presumed state: */
+ MTHCA_CMD_STAT_BAD_QPEE_STATE = 0x10,
+ /* Bad segment parameters (Address/Size): */
+ MTHCA_CMD_STAT_BAD_SEG_PARAM = 0x20,
+ /* Memory Region has Memory Windows bound to: */
+ MTHCA_CMD_STAT_REG_BOUND = 0x21,
+ /* HCA local attached memory not present: */
+ MTHCA_CMD_STAT_LAM_NOT_PRE = 0x22,
+ /* Bad management packet (silently discarded): */
+ MTHCA_CMD_STAT_BAD_PKT = 0x30,
+ /* More outstanding CQEs in CQ than new CQ size: */
+ MTHCA_CMD_STAT_BAD_SIZE = 0x40
+};
+
+enum {
+ MTHCA_TRANS_INVALID = 0,
+ MTHCA_TRANS_RST2INIT,
+ MTHCA_TRANS_INIT2INIT,
+ MTHCA_TRANS_INIT2RTR,
+ MTHCA_TRANS_RTR2RTS,
+ MTHCA_TRANS_RTS2RTS,
+ MTHCA_TRANS_SQERR2RTS,
+ MTHCA_TRANS_ANY2ERR,
+ MTHCA_TRANS_RTS2SQD,
+ MTHCA_TRANS_SQD2SQD,
+ MTHCA_TRANS_SQD2RTS,
+ MTHCA_TRANS_ANY2RST,
+};
+
+enum {
+ DEV_LIM_FLAG_RC = 1 << 0,
+ DEV_LIM_FLAG_UC = 1 << 1,
+ DEV_LIM_FLAG_UD = 1 << 2,
+ DEV_LIM_FLAG_RD = 1 << 3,
+ DEV_LIM_FLAG_RAW_IPV6 = 1 << 4,
+ DEV_LIM_FLAG_RAW_ETHER = 1 << 5,
+ DEV_LIM_FLAG_SRQ = 1 << 6,
+ DEV_LIM_FLAG_IPOIB_CSUM = 1 << 7,
+ DEV_LIM_FLAG_BAD_PKEY_CNTR = 1 << 8,
+ DEV_LIM_FLAG_BAD_QKEY_CNTR = 1 << 9,
+ DEV_LIM_FLAG_MW = 1 << 16,
+ DEV_LIM_FLAG_AUTO_PATH_MIG = 1 << 17,
+ DEV_LIM_FLAG_ATOMIC = 1 << 18,
+ DEV_LIM_FLAG_RAW_MULTI = 1 << 19,
+ DEV_LIM_FLAG_UD_AV_PORT_ENFORCE = 1 << 20,
+ DEV_LIM_FLAG_UD_MULTI = 1 << 21,
+};
+
+struct mthca_mailbox {
+ dma_addr_t dma;
+ void *buf;
+};
+
+struct mthca_dev_lim {
+ int max_srq_sz;
+ int max_qp_sz;
+ int reserved_qps;
+ int max_qps;
+ int reserved_srqs;
+ int max_srqs;
+ int reserved_eecs;
+ int max_eecs;
+ int max_cq_sz;
+ int reserved_cqs;
+ int max_cqs;
+ int max_mpts;
+ int reserved_eqs;
+ int max_eqs;
+ int reserved_mtts;
+ int max_mrw_sz;
+ int reserved_mrws;
+ int max_mtt_seg;
+ int max_requester_per_qp;
+ int max_responder_per_qp;
+ int max_rdma_global;
+ int local_ca_ack_delay;
+ int max_mtu;
+ int max_port_width;
+ int max_vl;
+ int num_ports;
+ int max_gids;
+ u16 stat_rate_support;
+ int max_pkeys;
+ u32 flags;
+ int reserved_uars;
+ int uar_size;
+ int min_page_sz;
+ int max_sg;
+ int max_desc_sz;
+ int max_qp_per_mcg;
+ int reserved_mgms;
+ int max_mcgs;
+ int reserved_pds;
+ int max_pds;
+ int reserved_rdds;
+ int max_rdds;
+ int eec_entry_sz;
+ int qpc_entry_sz;
+ int eeec_entry_sz;
+ int eqpc_entry_sz;
+ int eqc_entry_sz;
+ int cqc_entry_sz;
+ int srq_entry_sz;
+ int uar_scratch_entry_sz;
+ int mpt_entry_sz;
+ union {
+ struct {
+ int max_avs;
+ } tavor;
+ struct {
+ int resize_srq;
+ int max_pbl_sz;
+ u8 bmme_flags;
+ u32 reserved_lkey;
+ int lam_required;
+ u64 max_icm_sz;
+ } arbel;
+ } hca;
+};
+
+struct mthca_adapter {
+ u32 vendor_id;
+ u32 device_id;
+ u32 revision_id;
+ char board_id[MTHCA_BOARD_ID_LEN];
+ u8 inta_pin;
+};
+
+struct mthca_init_hca_param {
+ u64 qpc_base;
+ u64 eec_base;
+ u64 srqc_base;
+ u64 cqc_base;
+ u64 eqpc_base;
+ u64 eeec_base;
+ u64 eqc_base;
+ u64 rdb_base;
+ u64 mc_base;
+ u64 mpt_base;
+ u64 mtt_base;
+ u64 uar_scratch_base;
+ u64 uarc_base;
+ u16 log_mc_entry_sz;
+ u16 mc_hash_sz;
+ u8 log_num_qps;
+ u8 log_num_eecs;
+ u8 log_num_srqs;
+ u8 log_num_cqs;
+ u8 log_num_eqs;
+ u8 log_mc_table_sz;
+ u8 mtt_seg_sz;
+ u8 log_mpt_sz;
+ u8 log_uar_sz;
+ u8 log_uarc_sz;
+};
+
+struct mthca_init_ib_param {
+ int port_width;
+ int vl_cap;
+ int mtu_cap;
+ u16 gid_cap;
+ u16 pkey_cap;
+ int set_guid0;
+ u64 guid0;
+ int set_node_guid;
+ u64 node_guid;
+ int set_si_guid;
+ u64 si_guid;
+};
+
+struct mthca_set_ib_param {
+ int set_si_guid;
+ int reset_qkey_viol;
+ u64 si_guid;
+ u32 cap_mask;
+};
+
+int mthca_cmd_init(struct mthca_dev *dev);
+void mthca_cmd_cleanup(struct mthca_dev *dev);
+int mthca_cmd_use_events(struct mthca_dev *dev);
+void mthca_cmd_use_polling(struct mthca_dev *dev);
+void mthca_cmd_event(struct mthca_dev *dev, u16 token,
+ u8 status, u64 out_param);
+
+struct mthca_mailbox *mthca_alloc_mailbox(struct mthca_dev *dev,
+ gfp_t gfp_mask);
+void mthca_free_mailbox(struct mthca_dev *dev, struct mthca_mailbox *mailbox);
+
+int mthca_SYS_EN(struct mthca_dev *dev, u8 *status);
+int mthca_SYS_DIS(struct mthca_dev *dev, u8 *status);
+int mthca_MAP_FA(struct mthca_dev *dev, struct mthca_icm *icm, u8 *status);
+int mthca_UNMAP_FA(struct mthca_dev *dev, u8 *status);
+int mthca_RUN_FW(struct mthca_dev *dev, u8 *status);
+int mthca_QUERY_FW(struct mthca_dev *dev, u8 *status);
+int mthca_ENABLE_LAM(struct mthca_dev *dev, u8 *status);
+int mthca_DISABLE_LAM(struct mthca_dev *dev, u8 *status);
+int mthca_QUERY_DDR(struct mthca_dev *dev, u8 *status);
+int mthca_QUERY_DEV_LIM(struct mthca_dev *dev,
+ struct mthca_dev_lim *dev_lim, u8 *status);
+int mthca_QUERY_ADAPTER(struct mthca_dev *dev,
+ struct mthca_adapter *adapter, u8 *status);
+int mthca_INIT_HCA(struct mthca_dev *dev,
+ struct mthca_init_hca_param *param,
+ u8 *status);
+int mthca_INIT_IB(struct mthca_dev *dev,
+ struct mthca_init_ib_param *param,
+ int port, u8 *status);
+int mthca_CLOSE_IB(struct mthca_dev *dev, int port, u8 *status);
+int mthca_CLOSE_HCA(struct mthca_dev *dev, int panic, u8 *status);
+int mthca_SET_IB(struct mthca_dev *dev, struct mthca_set_ib_param *param,
+ int port, u8 *status);
+int mthca_MAP_ICM(struct mthca_dev *dev, struct mthca_icm *icm, u64 virt, u8 *status);
+int mthca_MAP_ICM_page(struct mthca_dev *dev, u64 dma_addr, u64 virt, u8 *status);
+int mthca_UNMAP_ICM(struct mthca_dev *dev, u64 virt, u32 page_count, u8 *status);
+int mthca_MAP_ICM_AUX(struct mthca_dev *dev, struct mthca_icm *icm, u8 *status);
+int mthca_UNMAP_ICM_AUX(struct mthca_dev *dev, u8 *status);
+int mthca_SET_ICM_SIZE(struct mthca_dev *dev, u64 icm_size, u64 *aux_pages,
+ u8 *status);
+int mthca_SW2HW_MPT(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+ int mpt_index, u8 *status);
+int mthca_HW2SW_MPT(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+ int mpt_index, u8 *status);
+int mthca_WRITE_MTT(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+ int num_mtt, u8 *status);
+int mthca_SYNC_TPT(struct mthca_dev *dev, u8 *status);
+int mthca_MAP_EQ(struct mthca_dev *dev, u64 event_mask, int unmap,
+ int eq_num, u8 *status);
+int mthca_SW2HW_EQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+ int eq_num, u8 *status);
+int mthca_HW2SW_EQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+ int eq_num, u8 *status);
+int mthca_SW2HW_CQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+ int cq_num, u8 *status);
+int mthca_HW2SW_CQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+ int cq_num, u8 *status);
+int mthca_RESIZE_CQ(struct mthca_dev *dev, int cq_num, u32 lkey, u8 log_size,
+ u8 *status);
+int mthca_SW2HW_SRQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+ int srq_num, u8 *status);
+int mthca_HW2SW_SRQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+ int srq_num, u8 *status);
+int mthca_QUERY_SRQ(struct mthca_dev *dev, u32 num,
+ struct mthca_mailbox *mailbox, u8 *status);
+int mthca_ARM_SRQ(struct mthca_dev *dev, int srq_num, int limit, u8 *status);
+int mthca_MODIFY_QP(struct mthca_dev *dev, enum ib_qp_state cur,
+ enum ib_qp_state next, u32 num, int is_ee,
+ struct mthca_mailbox *mailbox, u32 optmask,
+ u8 *status);
+int mthca_QUERY_QP(struct mthca_dev *dev, u32 num, int is_ee,
+ struct mthca_mailbox *mailbox, u8 *status);
+int mthca_CONF_SPECIAL_QP(struct mthca_dev *dev, int type, u32 qpn,
+ u8 *status);
+int mthca_MAD_IFC(struct mthca_dev *dev, int ignore_mkey, int ignore_bkey,
+ int port, struct ib_wc *in_wc, struct ib_grh *in_grh,
+ void *in_mad, void *response_mad, u8 *status);
+int mthca_READ_MGM(struct mthca_dev *dev, int index,
+ struct mthca_mailbox *mailbox, u8 *status);
+int mthca_WRITE_MGM(struct mthca_dev *dev, int index,
+ struct mthca_mailbox *mailbox, u8 *status);
+int mthca_MGID_HASH(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+ u16 *hash, u8 *status);
+int mthca_NOP(struct mthca_dev *dev, u8 *status);
+
+#endif /* MTHCA_CMD_H */
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_config_reg.h b/sys/ofed/drivers/infiniband/hw/mthca/mthca_config_reg.h
new file mode 100644
index 0000000..75671f7
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_config_reg.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MTHCA_CONFIG_REG_H
+#define MTHCA_CONFIG_REG_H
+
+#include <asm/page.h>
+
+#define MTHCA_HCR_BASE 0x80680
+#define MTHCA_HCR_SIZE 0x0001c
+#define MTHCA_ECR_BASE 0x80700
+#define MTHCA_ECR_SIZE 0x00008
+#define MTHCA_ECR_CLR_BASE 0x80708
+#define MTHCA_ECR_CLR_SIZE 0x00008
+#define MTHCA_MAP_ECR_SIZE (MTHCA_ECR_SIZE + MTHCA_ECR_CLR_SIZE)
+#define MTHCA_CLR_INT_BASE 0xf00d8
+#define MTHCA_CLR_INT_SIZE 0x00008
+#define MTHCA_EQ_SET_CI_SIZE (8 * 32)
+
+#endif /* MTHCA_CONFIG_REG_H */
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_cq.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_cq.c
new file mode 100644
index 0000000..aa75d26
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_cq.c
@@ -0,0 +1,992 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/hardirq.h>
+#include <linux/sched.h>
+
+#include <asm/io.h>
+
+#include <rdma/ib_pack.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+#include "mthca_memfree.h"
+
+enum {
+ MTHCA_MAX_DIRECT_CQ_SIZE = 4 * PAGE_SIZE
+};
+
+enum {
+ MTHCA_CQ_ENTRY_SIZE = 0x20
+};
+
+enum {
+ MTHCA_ATOMIC_BYTE_LEN = 8
+};
+
+/*
+ * Must be packed because start is 64 bits but only aligned to 32 bits.
+ */
+struct mthca_cq_context {
+ __be32 flags;
+ __be64 start;
+ __be32 logsize_usrpage;
+ __be32 error_eqn; /* Tavor only */
+ __be32 comp_eqn;
+ __be32 pd;
+ __be32 lkey;
+ __be32 last_notified_index;
+ __be32 solicit_producer_index;
+ __be32 consumer_index;
+ __be32 producer_index;
+ __be32 cqn;
+ __be32 ci_db; /* Arbel only */
+ __be32 state_db; /* Arbel only */
+ u32 reserved;
+} __attribute__((packed));
+
+#define MTHCA_CQ_STATUS_OK ( 0 << 28)
+#define MTHCA_CQ_STATUS_OVERFLOW ( 9 << 28)
+#define MTHCA_CQ_STATUS_WRITE_FAIL (10 << 28)
+#define MTHCA_CQ_FLAG_TR ( 1 << 18)
+#define MTHCA_CQ_FLAG_OI ( 1 << 17)
+#define MTHCA_CQ_STATE_DISARMED ( 0 << 8)
+#define MTHCA_CQ_STATE_ARMED ( 1 << 8)
+#define MTHCA_CQ_STATE_ARMED_SOL ( 4 << 8)
+#define MTHCA_EQ_STATE_FIRED (10 << 8)
+
+enum {
+ MTHCA_ERROR_CQE_OPCODE_MASK = 0xfe
+};
+
+enum {
+ SYNDROME_LOCAL_LENGTH_ERR = 0x01,
+ SYNDROME_LOCAL_QP_OP_ERR = 0x02,
+ SYNDROME_LOCAL_EEC_OP_ERR = 0x03,
+ SYNDROME_LOCAL_PROT_ERR = 0x04,
+ SYNDROME_WR_FLUSH_ERR = 0x05,
+ SYNDROME_MW_BIND_ERR = 0x06,
+ SYNDROME_BAD_RESP_ERR = 0x10,
+ SYNDROME_LOCAL_ACCESS_ERR = 0x11,
+ SYNDROME_REMOTE_INVAL_REQ_ERR = 0x12,
+ SYNDROME_REMOTE_ACCESS_ERR = 0x13,
+ SYNDROME_REMOTE_OP_ERR = 0x14,
+ SYNDROME_RETRY_EXC_ERR = 0x15,
+ SYNDROME_RNR_RETRY_EXC_ERR = 0x16,
+ SYNDROME_LOCAL_RDD_VIOL_ERR = 0x20,
+ SYNDROME_REMOTE_INVAL_RD_REQ_ERR = 0x21,
+ SYNDROME_REMOTE_ABORTED_ERR = 0x22,
+ SYNDROME_INVAL_EECN_ERR = 0x23,
+ SYNDROME_INVAL_EEC_STATE_ERR = 0x24
+};
+
+struct mthca_cqe {
+ __be32 my_qpn;
+ __be32 my_ee;
+ __be32 rqpn;
+ u8 sl_ipok;
+ u8 g_mlpath;
+ __be16 rlid;
+ __be32 imm_etype_pkey_eec;
+ __be32 byte_cnt;
+ __be32 wqe;
+ u8 opcode;
+ u8 is_send;
+ u8 reserved;
+ u8 owner;
+};
+
+struct mthca_err_cqe {
+ __be32 my_qpn;
+ u32 reserved1[3];
+ u8 syndrome;
+ u8 vendor_err;
+ __be16 db_cnt;
+ u32 reserved2;
+ __be32 wqe;
+ u8 opcode;
+ u8 reserved3[2];
+ u8 owner;
+};
+
+#define MTHCA_CQ_ENTRY_OWNER_SW (0 << 7)
+#define MTHCA_CQ_ENTRY_OWNER_HW (1 << 7)
+
+#define MTHCA_TAVOR_CQ_DB_INC_CI (1 << 24)
+#define MTHCA_TAVOR_CQ_DB_REQ_NOT (2 << 24)
+#define MTHCA_TAVOR_CQ_DB_REQ_NOT_SOL (3 << 24)
+#define MTHCA_TAVOR_CQ_DB_SET_CI (4 << 24)
+#define MTHCA_TAVOR_CQ_DB_REQ_NOT_MULT (5 << 24)
+
+#define MTHCA_ARBEL_CQ_DB_REQ_NOT_SOL (1 << 24)
+#define MTHCA_ARBEL_CQ_DB_REQ_NOT (2 << 24)
+#define MTHCA_ARBEL_CQ_DB_REQ_NOT_MULT (3 << 24)
+
+static inline struct mthca_cqe *get_cqe_from_buf(struct mthca_cq_buf *buf,
+ int entry)
+{
+ if (buf->is_direct)
+ return buf->queue.direct.buf + (entry * MTHCA_CQ_ENTRY_SIZE);
+ else
+ return buf->queue.page_list[entry * MTHCA_CQ_ENTRY_SIZE / PAGE_SIZE].buf
+ + (entry * MTHCA_CQ_ENTRY_SIZE) % PAGE_SIZE;
+}
+
+static inline struct mthca_cqe *get_cqe(struct mthca_cq *cq, int entry)
+{
+ return get_cqe_from_buf(&cq->buf, entry);
+}
+
+static inline struct mthca_cqe *cqe_sw(struct mthca_cqe *cqe)
+{
+ return MTHCA_CQ_ENTRY_OWNER_HW & cqe->owner ? NULL : cqe;
+}
+
+static inline struct mthca_cqe *next_cqe_sw(struct mthca_cq *cq)
+{
+ return cqe_sw(get_cqe(cq, cq->cons_index & cq->ibcq.cqe));
+}
+
+static inline void set_cqe_hw(struct mthca_cqe *cqe)
+{
+ cqe->owner = MTHCA_CQ_ENTRY_OWNER_HW;
+}
+
+static void dump_cqe(struct mthca_dev *dev, void *cqe_ptr)
+{
+ __be32 *cqe = cqe_ptr;
+
+ (void) cqe; /* avoid warning if mthca_dbg compiled away... */
+ mthca_dbg(dev, "CQE contents %08x %08x %08x %08x %08x %08x %08x %08x\n",
+ be32_to_cpu(cqe[0]), be32_to_cpu(cqe[1]), be32_to_cpu(cqe[2]),
+ be32_to_cpu(cqe[3]), be32_to_cpu(cqe[4]), be32_to_cpu(cqe[5]),
+ be32_to_cpu(cqe[6]), be32_to_cpu(cqe[7]));
+}
+
+/*
+ * incr is ignored in native Arbel (mem-free) mode, so cq->cons_index
+ * should be correct before calling update_cons_index().
+ */
+static inline void update_cons_index(struct mthca_dev *dev, struct mthca_cq *cq,
+ int incr)
+{
+ if (mthca_is_memfree(dev)) {
+ *cq->set_ci_db = cpu_to_be32(cq->cons_index);
+ wmb();
+ } else {
+ mthca_write64(MTHCA_TAVOR_CQ_DB_INC_CI | cq->cqn, incr - 1,
+ dev->kar + MTHCA_CQ_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+ /*
+ * Make sure doorbells don't leak out of CQ spinlock
+ * and reach the HCA out of order:
+ */
+ mmiowb();
+ }
+}
+
+void mthca_cq_completion(struct mthca_dev *dev, u32 cqn)
+{
+ struct mthca_cq *cq;
+
+ cq = mthca_array_get(&dev->cq_table.cq, cqn & (dev->limits.num_cqs - 1));
+
+ if (!cq) {
+ mthca_warn(dev, "Completion event for bogus CQ %08x\n", cqn);
+ return;
+ }
+
+ ++cq->arm_sn;
+
+ cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
+}
+
+void mthca_cq_event(struct mthca_dev *dev, u32 cqn,
+ enum ib_event_type event_type)
+{
+ struct mthca_cq *cq;
+ struct ib_event event;
+
+ spin_lock(&dev->cq_table.lock);
+
+ cq = mthca_array_get(&dev->cq_table.cq, cqn & (dev->limits.num_cqs - 1));
+ if (cq)
+ ++cq->refcount;
+
+ spin_unlock(&dev->cq_table.lock);
+
+ if (!cq) {
+ mthca_warn(dev, "Async event for bogus CQ %08x\n", cqn);
+ return;
+ }
+
+ event.device = &dev->ib_dev;
+ event.event = event_type;
+ event.element.cq = &cq->ibcq;
+ if (cq->ibcq.event_handler)
+ cq->ibcq.event_handler(&event, cq->ibcq.cq_context);
+
+ spin_lock(&dev->cq_table.lock);
+ if (!--cq->refcount)
+ wake_up(&cq->wait);
+ spin_unlock(&dev->cq_table.lock);
+}
+
+static inline int is_recv_cqe(struct mthca_cqe *cqe)
+{
+ if ((cqe->opcode & MTHCA_ERROR_CQE_OPCODE_MASK) ==
+ MTHCA_ERROR_CQE_OPCODE_MASK)
+ return !(cqe->opcode & 0x01);
+ else
+ return !(cqe->is_send & 0x80);
+}
+
+void mthca_cq_clean(struct mthca_dev *dev, struct mthca_cq *cq, u32 qpn,
+ struct mthca_srq *srq)
+{
+ struct mthca_cqe *cqe;
+ u32 prod_index;
+ int i, nfreed = 0;
+
+ spin_lock_irq(&cq->lock);
+
+ /*
+ * First we need to find the current producer index, so we
+ * know where to start cleaning from. It doesn't matter if HW
+ * adds new entries after this loop -- the QP we're worried
+ * about is already in RESET, so the new entries won't come
+ * from our QP and therefore don't need to be checked.
+ */
+ for (prod_index = cq->cons_index;
+ cqe_sw(get_cqe(cq, prod_index & cq->ibcq.cqe));
+ ++prod_index)
+ if (prod_index == cq->cons_index + cq->ibcq.cqe)
+ break;
+
+ if (0)
+ mthca_dbg(dev, "Cleaning QPN %06x from CQN %06x; ci %d, pi %d\n",
+ qpn, cq->cqn, cq->cons_index, prod_index);
+
+ /*
+ * Now sweep backwards through the CQ, removing CQ entries
+ * that match our QP by copying older entries on top of them.
+ */
+ while ((int) --prod_index - (int) cq->cons_index >= 0) {
+ cqe = get_cqe(cq, prod_index & cq->ibcq.cqe);
+ if (cqe->my_qpn == cpu_to_be32(qpn)) {
+ if (srq && is_recv_cqe(cqe))
+ mthca_free_srq_wqe(srq, be32_to_cpu(cqe->wqe));
+ ++nfreed;
+ } else if (nfreed)
+ memcpy(get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe),
+ cqe, MTHCA_CQ_ENTRY_SIZE);
+ }
+
+ if (nfreed) {
+ for (i = 0; i < nfreed; ++i)
+ set_cqe_hw(get_cqe(cq, (cq->cons_index + i) & cq->ibcq.cqe));
+ wmb();
+ cq->cons_index += nfreed;
+ update_cons_index(dev, cq, nfreed);
+ }
+
+ spin_unlock_irq(&cq->lock);
+}
+
+void mthca_cq_resize_copy_cqes(struct mthca_cq *cq)
+{
+ int i;
+
+ /*
+ * In Tavor mode, the hardware keeps the consumer and producer
+ * indices mod the CQ size. Since we might be making the CQ
+ * bigger, we need to deal with the case where the producer
+ * index wrapped around before the CQ was resized.
+ */
+ if (!mthca_is_memfree(to_mdev(cq->ibcq.device)) &&
+ cq->ibcq.cqe < cq->resize_buf->cqe) {
+ cq->cons_index &= cq->ibcq.cqe;
+ if (cqe_sw(get_cqe(cq, cq->ibcq.cqe)))
+ cq->cons_index -= cq->ibcq.cqe + 1;
+ }
+
+ for (i = cq->cons_index; cqe_sw(get_cqe(cq, i & cq->ibcq.cqe)); ++i)
+ memcpy(get_cqe_from_buf(&cq->resize_buf->buf,
+ i & cq->resize_buf->cqe),
+ get_cqe(cq, i & cq->ibcq.cqe), MTHCA_CQ_ENTRY_SIZE);
+}
+
+int mthca_alloc_cq_buf(struct mthca_dev *dev, struct mthca_cq_buf *buf, int nent)
+{
+ int ret;
+ int i;
+
+ ret = mthca_buf_alloc(dev, nent * MTHCA_CQ_ENTRY_SIZE,
+ MTHCA_MAX_DIRECT_CQ_SIZE,
+ &buf->queue, &buf->is_direct,
+ &dev->driver_pd, 1, &buf->mr);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < nent; ++i)
+ set_cqe_hw(get_cqe_from_buf(buf, i));
+
+ return 0;
+}
+
+void mthca_free_cq_buf(struct mthca_dev *dev, struct mthca_cq_buf *buf, int cqe)
+{
+ mthca_buf_free(dev, (cqe + 1) * MTHCA_CQ_ENTRY_SIZE, &buf->queue,
+ buf->is_direct, &buf->mr);
+}
+
+static void handle_error_cqe(struct mthca_dev *dev, struct mthca_cq *cq,
+ struct mthca_qp *qp, int wqe_index, int is_send,
+ struct mthca_err_cqe *cqe,
+ struct ib_wc *entry, int *free_cqe)
+{
+ int dbd;
+ __be32 new_wqe;
+
+ if (cqe->syndrome == SYNDROME_LOCAL_QP_OP_ERR) {
+ mthca_dbg(dev, "local QP operation err "
+ "(QPN %06x, WQE @ %08x, CQN %06x, index %d)\n",
+ be32_to_cpu(cqe->my_qpn), be32_to_cpu(cqe->wqe),
+ cq->cqn, cq->cons_index);
+ dump_cqe(dev, cqe);
+ }
+
+ /*
+ * For completions in error, only work request ID, status, vendor error
+ * (and freed resource count for RD) have to be set.
+ */
+ switch (cqe->syndrome) {
+ case SYNDROME_LOCAL_LENGTH_ERR:
+ entry->status = IB_WC_LOC_LEN_ERR;
+ break;
+ case SYNDROME_LOCAL_QP_OP_ERR:
+ entry->status = IB_WC_LOC_QP_OP_ERR;
+ break;
+ case SYNDROME_LOCAL_EEC_OP_ERR:
+ entry->status = IB_WC_LOC_EEC_OP_ERR;
+ break;
+ case SYNDROME_LOCAL_PROT_ERR:
+ entry->status = IB_WC_LOC_PROT_ERR;
+ break;
+ case SYNDROME_WR_FLUSH_ERR:
+ entry->status = IB_WC_WR_FLUSH_ERR;
+ break;
+ case SYNDROME_MW_BIND_ERR:
+ entry->status = IB_WC_MW_BIND_ERR;
+ break;
+ case SYNDROME_BAD_RESP_ERR:
+ entry->status = IB_WC_BAD_RESP_ERR;
+ break;
+ case SYNDROME_LOCAL_ACCESS_ERR:
+ entry->status = IB_WC_LOC_ACCESS_ERR;
+ break;
+ case SYNDROME_REMOTE_INVAL_REQ_ERR:
+ entry->status = IB_WC_REM_INV_REQ_ERR;
+ break;
+ case SYNDROME_REMOTE_ACCESS_ERR:
+ entry->status = IB_WC_REM_ACCESS_ERR;
+ break;
+ case SYNDROME_REMOTE_OP_ERR:
+ entry->status = IB_WC_REM_OP_ERR;
+ break;
+ case SYNDROME_RETRY_EXC_ERR:
+ entry->status = IB_WC_RETRY_EXC_ERR;
+ break;
+ case SYNDROME_RNR_RETRY_EXC_ERR:
+ entry->status = IB_WC_RNR_RETRY_EXC_ERR;
+ break;
+ case SYNDROME_LOCAL_RDD_VIOL_ERR:
+ entry->status = IB_WC_LOC_RDD_VIOL_ERR;
+ break;
+ case SYNDROME_REMOTE_INVAL_RD_REQ_ERR:
+ entry->status = IB_WC_REM_INV_RD_REQ_ERR;
+ break;
+ case SYNDROME_REMOTE_ABORTED_ERR:
+ entry->status = IB_WC_REM_ABORT_ERR;
+ break;
+ case SYNDROME_INVAL_EECN_ERR:
+ entry->status = IB_WC_INV_EECN_ERR;
+ break;
+ case SYNDROME_INVAL_EEC_STATE_ERR:
+ entry->status = IB_WC_INV_EEC_STATE_ERR;
+ break;
+ default:
+ entry->status = IB_WC_GENERAL_ERR;
+ break;
+ }
+
+ entry->vendor_err = cqe->vendor_err;
+
+ /*
+ * Mem-free HCAs always generate one CQE per WQE, even in the
+ * error case, so we don't have to check the doorbell count, etc.
+ */
+ if (mthca_is_memfree(dev))
+ return;
+
+ mthca_free_err_wqe(dev, qp, is_send, wqe_index, &dbd, &new_wqe);
+
+ /*
+ * If we're at the end of the WQE chain, or we've used up our
+ * doorbell count, free the CQE. Otherwise just update it for
+ * the next poll operation.
+ */
+ if (!(new_wqe & cpu_to_be32(0x3f)) || (!cqe->db_cnt && dbd))
+ return;
+
+ be16_add_cpu(&cqe->db_cnt, -dbd);
+ cqe->wqe = new_wqe;
+ cqe->syndrome = SYNDROME_WR_FLUSH_ERR;
+
+ *free_cqe = 0;
+}
+
+static inline int mthca_poll_one(struct mthca_dev *dev,
+ struct mthca_cq *cq,
+ struct mthca_qp **cur_qp,
+ int *freed,
+ struct ib_wc *entry)
+{
+ struct mthca_wq *wq;
+ struct mthca_cqe *cqe;
+ int wqe_index;
+ int is_error;
+ int is_send;
+ int free_cqe = 1;
+ int err = 0;
+ u16 checksum;
+
+ cqe = next_cqe_sw(cq);
+ if (!cqe)
+ return -EAGAIN;
+
+ /*
+ * Make sure we read CQ entry contents after we've checked the
+ * ownership bit.
+ */
+ rmb();
+
+ if (0) {
+ mthca_dbg(dev, "%x/%d: CQE -> QPN %06x, WQE @ %08x\n",
+ cq->cqn, cq->cons_index, be32_to_cpu(cqe->my_qpn),
+ be32_to_cpu(cqe->wqe));
+ dump_cqe(dev, cqe);
+ }
+
+ is_error = (cqe->opcode & MTHCA_ERROR_CQE_OPCODE_MASK) ==
+ MTHCA_ERROR_CQE_OPCODE_MASK;
+ is_send = is_error ? cqe->opcode & 0x01 : cqe->is_send & 0x80;
+
+ if (!*cur_qp || be32_to_cpu(cqe->my_qpn) != (*cur_qp)->qpn) {
+ /*
+ * We do not have to take the QP table lock here,
+ * because CQs will be locked while QPs are removed
+ * from the table.
+ */
+ *cur_qp = mthca_array_get(&dev->qp_table.qp,
+ be32_to_cpu(cqe->my_qpn) &
+ (dev->limits.num_qps - 1));
+ if (!*cur_qp) {
+ mthca_warn(dev, "CQ entry for unknown QP %06x\n",
+ be32_to_cpu(cqe->my_qpn) & 0xffffff);
+ err = -EINVAL;
+ goto out;
+ }
+ }
+
+ entry->qp = &(*cur_qp)->ibqp;
+
+ if (is_send) {
+ wq = &(*cur_qp)->sq;
+ wqe_index = ((be32_to_cpu(cqe->wqe) - (*cur_qp)->send_wqe_offset)
+ >> wq->wqe_shift);
+ entry->wr_id = (*cur_qp)->wrid[wqe_index];
+ } else if ((*cur_qp)->ibqp.srq) {
+ struct mthca_srq *srq = to_msrq((*cur_qp)->ibqp.srq);
+ u32 wqe = be32_to_cpu(cqe->wqe);
+ wq = NULL;
+ wqe_index = wqe >> srq->wqe_shift;
+ entry->wr_id = srq->wrid[wqe_index];
+ mthca_free_srq_wqe(srq, wqe);
+ } else {
+ s32 wqe;
+ wq = &(*cur_qp)->rq;
+ wqe = be32_to_cpu(cqe->wqe);
+ wqe_index = wqe >> wq->wqe_shift;
+ /*
+ * WQE addr == base - 1 might be reported in receive completion
+ * with error instead of (rq size - 1) by Sinai FW 1.0.800 and
+ * Arbel FW 5.1.400. This bug should be fixed in later FW revs.
+ */
+ if (unlikely(wqe_index < 0))
+ wqe_index = wq->max - 1;
+ entry->wr_id = (*cur_qp)->wrid[wqe_index + (*cur_qp)->sq.max];
+ }
+
+ if (wq) {
+ if (wq->last_comp < wqe_index)
+ wq->tail += wqe_index - wq->last_comp;
+ else
+ wq->tail += wqe_index + wq->max - wq->last_comp;
+
+ wq->last_comp = wqe_index;
+ }
+
+ if (is_error) {
+ handle_error_cqe(dev, cq, *cur_qp, wqe_index, is_send,
+ (struct mthca_err_cqe *) cqe,
+ entry, &free_cqe);
+ goto out;
+ }
+
+ if (is_send) {
+ entry->wc_flags = 0;
+ switch (cqe->opcode) {
+ case MTHCA_OPCODE_RDMA_WRITE:
+ entry->opcode = IB_WC_RDMA_WRITE;
+ break;
+ case MTHCA_OPCODE_RDMA_WRITE_IMM:
+ entry->opcode = IB_WC_RDMA_WRITE;
+ entry->wc_flags |= IB_WC_WITH_IMM;
+ break;
+ case MTHCA_OPCODE_SEND:
+ entry->opcode = IB_WC_SEND;
+ break;
+ case MTHCA_OPCODE_SEND_IMM:
+ entry->opcode = IB_WC_SEND;
+ entry->wc_flags |= IB_WC_WITH_IMM;
+ break;
+ case MTHCA_OPCODE_RDMA_READ:
+ entry->opcode = IB_WC_RDMA_READ;
+ entry->byte_len = be32_to_cpu(cqe->byte_cnt);
+ break;
+ case MTHCA_OPCODE_ATOMIC_CS:
+ entry->opcode = IB_WC_COMP_SWAP;
+ entry->byte_len = MTHCA_ATOMIC_BYTE_LEN;
+ break;
+ case MTHCA_OPCODE_ATOMIC_FA:
+ entry->opcode = IB_WC_FETCH_ADD;
+ entry->byte_len = MTHCA_ATOMIC_BYTE_LEN;
+ break;
+ case MTHCA_OPCODE_BIND_MW:
+ entry->opcode = IB_WC_BIND_MW;
+ break;
+ default:
+ entry->opcode = MTHCA_OPCODE_INVALID;
+ break;
+ }
+ } else {
+ entry->byte_len = be32_to_cpu(cqe->byte_cnt);
+ switch (cqe->opcode & 0x1f) {
+ case IB_OPCODE_SEND_LAST_WITH_IMMEDIATE:
+ case IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE:
+ entry->wc_flags = IB_WC_WITH_IMM;
+ entry->ex.imm_data = cqe->imm_etype_pkey_eec;
+ entry->opcode = IB_WC_RECV;
+ break;
+ case IB_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE:
+ case IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE:
+ entry->wc_flags = IB_WC_WITH_IMM;
+ entry->ex.imm_data = cqe->imm_etype_pkey_eec;
+ entry->opcode = IB_WC_RECV_RDMA_WITH_IMM;
+ break;
+ default:
+ entry->wc_flags = 0;
+ entry->opcode = IB_WC_RECV;
+ break;
+ }
+ entry->slid = be16_to_cpu(cqe->rlid);
+ entry->sl = cqe->sl_ipok >> 4;
+ entry->src_qp = be32_to_cpu(cqe->rqpn) & 0xffffff;
+ entry->dlid_path_bits = cqe->g_mlpath & 0x7f;
+ entry->pkey_index = be32_to_cpu(cqe->imm_etype_pkey_eec) >> 16;
+ entry->wc_flags |= cqe->g_mlpath & 0x80 ? IB_WC_GRH : 0;
+ checksum = (be32_to_cpu(cqe->rqpn) >> 24) |
+ ((be32_to_cpu(cqe->my_ee) >> 16) & 0xff00);
+ entry->csum_ok = (cqe->sl_ipok & 1 && checksum == 0xffff);
+ }
+
+ entry->status = IB_WC_SUCCESS;
+
+ out:
+ if (likely(free_cqe)) {
+ set_cqe_hw(cqe);
+ ++(*freed);
+ ++cq->cons_index;
+ }
+
+ return err;
+}
+
+int mthca_poll_cq(struct ib_cq *ibcq, int num_entries,
+ struct ib_wc *entry)
+{
+ struct mthca_dev *dev = to_mdev(ibcq->device);
+ struct mthca_cq *cq = to_mcq(ibcq);
+ struct mthca_qp *qp = NULL;
+ unsigned long flags;
+ int err = 0;
+ int freed = 0;
+ int npolled;
+
+ spin_lock_irqsave(&cq->lock, flags);
+
+ npolled = 0;
+repoll:
+ while (npolled < num_entries) {
+ err = mthca_poll_one(dev, cq, &qp,
+ &freed, entry + npolled);
+ if (err)
+ break;
+ ++npolled;
+ }
+
+ if (freed) {
+ wmb();
+ update_cons_index(dev, cq, freed);
+ }
+
+ /*
+ * If a CQ resize is in progress and we discovered that the
+ * old buffer is empty, then peek in the new buffer, and if
+ * it's not empty, switch to the new buffer and continue
+ * polling there.
+ */
+ if (unlikely(err == -EAGAIN && cq->resize_buf &&
+ cq->resize_buf->state == CQ_RESIZE_READY)) {
+ /*
+ * In Tavor mode, the hardware keeps the producer
+ * index modulo the CQ size. Since we might be making
+ * the CQ bigger, we need to mask our consumer index
+ * using the size of the old CQ buffer before looking
+ * in the new CQ buffer.
+ */
+ if (!mthca_is_memfree(dev))
+ cq->cons_index &= cq->ibcq.cqe;
+
+ if (cqe_sw(get_cqe_from_buf(&cq->resize_buf->buf,
+ cq->cons_index & cq->resize_buf->cqe))) {
+ struct mthca_cq_buf tbuf;
+ int tcqe;
+
+ tbuf = cq->buf;
+ tcqe = cq->ibcq.cqe;
+ cq->buf = cq->resize_buf->buf;
+ cq->ibcq.cqe = cq->resize_buf->cqe;
+
+ cq->resize_buf->buf = tbuf;
+ cq->resize_buf->cqe = tcqe;
+ cq->resize_buf->state = CQ_RESIZE_SWAPPED;
+
+ goto repoll;
+ }
+ }
+
+ spin_unlock_irqrestore(&cq->lock, flags);
+
+ return err == 0 || err == -EAGAIN ? npolled : err;
+}
+
+int mthca_tavor_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags)
+{
+ u32 dbhi = ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ?
+ MTHCA_TAVOR_CQ_DB_REQ_NOT_SOL :
+ MTHCA_TAVOR_CQ_DB_REQ_NOT) |
+ to_mcq(cq)->cqn;
+
+ mthca_write64(dbhi, 0xffffffff, to_mdev(cq->device)->kar + MTHCA_CQ_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&to_mdev(cq->device)->doorbell_lock));
+
+ return 0;
+}
+
+int mthca_arbel_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
+{
+ struct mthca_cq *cq = to_mcq(ibcq);
+ __be32 db_rec[2];
+ u32 dbhi;
+ u32 sn = cq->arm_sn & 3;
+
+ db_rec[0] = cpu_to_be32(cq->cons_index);
+ db_rec[1] = cpu_to_be32((cq->cqn << 8) | (2 << 5) | (sn << 3) |
+ ((flags & IB_CQ_SOLICITED_MASK) ==
+ IB_CQ_SOLICITED ? 1 : 2));
+
+ mthca_write_db_rec(db_rec, cq->arm_db);
+
+ /*
+ * Make sure that the doorbell record in host memory is
+ * written before ringing the doorbell via PCI MMIO.
+ */
+ wmb();
+
+ dbhi = (sn << 28) |
+ ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ?
+ MTHCA_ARBEL_CQ_DB_REQ_NOT_SOL :
+ MTHCA_ARBEL_CQ_DB_REQ_NOT) | cq->cqn;
+
+ mthca_write64(dbhi, cq->cons_index,
+ to_mdev(ibcq->device)->kar + MTHCA_CQ_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&to_mdev(ibcq->device)->doorbell_lock));
+
+ return 0;
+}
+
+int mthca_init_cq(struct mthca_dev *dev, int nent,
+ struct mthca_ucontext *ctx, u32 pdn,
+ struct mthca_cq *cq)
+{
+ struct mthca_mailbox *mailbox;
+ struct mthca_cq_context *cq_context;
+ int err = -ENOMEM;
+ u8 status;
+
+ cq->ibcq.cqe = nent - 1;
+ cq->is_kernel = !ctx;
+
+ cq->cqn = mthca_alloc(&dev->cq_table.alloc);
+ if (cq->cqn == -1)
+ return -ENOMEM;
+
+ if (mthca_is_memfree(dev)) {
+ err = mthca_table_get(dev, dev->cq_table.table, cq->cqn);
+ if (err)
+ goto err_out;
+
+ if (cq->is_kernel) {
+ cq->arm_sn = 1;
+
+ err = -ENOMEM;
+
+ cq->set_ci_db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_CQ_SET_CI,
+ cq->cqn, &cq->set_ci_db);
+ if (cq->set_ci_db_index < 0)
+ goto err_out_icm;
+
+ cq->arm_db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_CQ_ARM,
+ cq->cqn, &cq->arm_db);
+ if (cq->arm_db_index < 0)
+ goto err_out_ci;
+ }
+ }
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox))
+ goto err_out_arm;
+
+ cq_context = mailbox->buf;
+
+ if (cq->is_kernel) {
+ err = mthca_alloc_cq_buf(dev, &cq->buf, nent);
+ if (err)
+ goto err_out_mailbox;
+ }
+
+ spin_lock_init(&cq->lock);
+ cq->refcount = 1;
+ init_waitqueue_head(&cq->wait);
+ mutex_init(&cq->mutex);
+
+ memset(cq_context, 0, sizeof *cq_context);
+ cq_context->flags = cpu_to_be32(MTHCA_CQ_STATUS_OK |
+ MTHCA_CQ_STATE_DISARMED |
+ MTHCA_CQ_FLAG_TR);
+ cq_context->logsize_usrpage = cpu_to_be32((ffs(nent) - 1) << 24);
+ if (ctx)
+ cq_context->logsize_usrpage |= cpu_to_be32(ctx->uar.index);
+ else
+ cq_context->logsize_usrpage |= cpu_to_be32(dev->driver_uar.index);
+ cq_context->error_eqn = cpu_to_be32(dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn);
+ cq_context->comp_eqn = cpu_to_be32(dev->eq_table.eq[MTHCA_EQ_COMP].eqn);
+ cq_context->pd = cpu_to_be32(pdn);
+ cq_context->lkey = cpu_to_be32(cq->buf.mr.ibmr.lkey);
+ cq_context->cqn = cpu_to_be32(cq->cqn);
+
+ if (mthca_is_memfree(dev)) {
+ cq_context->ci_db = cpu_to_be32(cq->set_ci_db_index);
+ cq_context->state_db = cpu_to_be32(cq->arm_db_index);
+ }
+
+ err = mthca_SW2HW_CQ(dev, mailbox, cq->cqn, &status);
+ if (err) {
+ mthca_warn(dev, "SW2HW_CQ failed (%d)\n", err);
+ goto err_out_free_mr;
+ }
+
+ if (status) {
+ mthca_warn(dev, "SW2HW_CQ returned status 0x%02x\n",
+ status);
+ err = -EINVAL;
+ goto err_out_free_mr;
+ }
+
+ spin_lock_irq(&dev->cq_table.lock);
+ if (mthca_array_set(&dev->cq_table.cq,
+ cq->cqn & (dev->limits.num_cqs - 1),
+ cq)) {
+ spin_unlock_irq(&dev->cq_table.lock);
+ goto err_out_free_mr;
+ }
+ spin_unlock_irq(&dev->cq_table.lock);
+
+ cq->cons_index = 0;
+
+ mthca_free_mailbox(dev, mailbox);
+
+ return 0;
+
+err_out_free_mr:
+ if (cq->is_kernel)
+ mthca_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe);
+
+err_out_mailbox:
+ mthca_free_mailbox(dev, mailbox);
+
+err_out_arm:
+ if (cq->is_kernel && mthca_is_memfree(dev))
+ mthca_free_db(dev, MTHCA_DB_TYPE_CQ_ARM, cq->arm_db_index);
+
+err_out_ci:
+ if (cq->is_kernel && mthca_is_memfree(dev))
+ mthca_free_db(dev, MTHCA_DB_TYPE_CQ_SET_CI, cq->set_ci_db_index);
+
+err_out_icm:
+ mthca_table_put(dev, dev->cq_table.table, cq->cqn);
+
+err_out:
+ mthca_free(&dev->cq_table.alloc, cq->cqn);
+
+ return err;
+}
+
+static inline int get_cq_refcount(struct mthca_dev *dev, struct mthca_cq *cq)
+{
+ int c;
+
+ spin_lock_irq(&dev->cq_table.lock);
+ c = cq->refcount;
+ spin_unlock_irq(&dev->cq_table.lock);
+
+ return c;
+}
+
+void mthca_free_cq(struct mthca_dev *dev,
+ struct mthca_cq *cq)
+{
+ struct mthca_mailbox *mailbox;
+ int err;
+ u8 status;
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox)) {
+ mthca_warn(dev, "No memory for mailbox to free CQ.\n");
+ return;
+ }
+
+ err = mthca_HW2SW_CQ(dev, mailbox, cq->cqn, &status);
+ if (err)
+ mthca_warn(dev, "HW2SW_CQ failed (%d)\n", err);
+ else if (status)
+ mthca_warn(dev, "HW2SW_CQ returned status 0x%02x\n", status);
+
+ if (0) {
+ __be32 *ctx = mailbox->buf;
+ int j;
+
+ printk(KERN_ERR "context for CQN %x (cons index %x, next sw %d)\n",
+ cq->cqn, cq->cons_index,
+ cq->is_kernel ? !!next_cqe_sw(cq) : 0);
+ for (j = 0; j < 16; ++j)
+ printk(KERN_ERR "[%2x] %08x\n", j * 4, be32_to_cpu(ctx[j]));
+ }
+
+ spin_lock_irq(&dev->cq_table.lock);
+ mthca_array_clear(&dev->cq_table.cq,
+ cq->cqn & (dev->limits.num_cqs - 1));
+ --cq->refcount;
+ spin_unlock_irq(&dev->cq_table.lock);
+
+ if (dev->mthca_flags & MTHCA_FLAG_MSI_X)
+ synchronize_irq(dev->eq_table.eq[MTHCA_EQ_COMP].msi_x_vector);
+ else
+ synchronize_irq(dev->pdev->irq);
+
+ wait_event(cq->wait, !get_cq_refcount(dev, cq));
+
+ if (cq->is_kernel) {
+ mthca_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe);
+ if (mthca_is_memfree(dev)) {
+ mthca_free_db(dev, MTHCA_DB_TYPE_CQ_ARM, cq->arm_db_index);
+ mthca_free_db(dev, MTHCA_DB_TYPE_CQ_SET_CI, cq->set_ci_db_index);
+ }
+ }
+
+ mthca_table_put(dev, dev->cq_table.table, cq->cqn);
+ mthca_free(&dev->cq_table.alloc, cq->cqn);
+ mthca_free_mailbox(dev, mailbox);
+}
+
+int mthca_init_cq_table(struct mthca_dev *dev)
+{
+ int err;
+
+ spin_lock_init(&dev->cq_table.lock);
+
+ err = mthca_alloc_init(&dev->cq_table.alloc,
+ dev->limits.num_cqs,
+ (1 << 24) - 1,
+ dev->limits.reserved_cqs);
+ if (err)
+ return err;
+
+ err = mthca_array_init(&dev->cq_table.cq,
+ dev->limits.num_cqs);
+ if (err)
+ mthca_alloc_cleanup(&dev->cq_table.alloc);
+
+ return err;
+}
+
+void mthca_cleanup_cq_table(struct mthca_dev *dev)
+{
+ mthca_array_cleanup(&dev->cq_table.cq, dev->limits.num_cqs);
+ mthca_alloc_cleanup(&dev->cq_table.alloc);
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_dev.h b/sys/ofed/drivers/infiniband/hw/mthca/mthca_dev.h
new file mode 100644
index 0000000..14e3f62
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_dev.h
@@ -0,0 +1,596 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MTHCA_DEV_H
+#define MTHCA_DEV_H
+
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/timer.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/semaphore.h>
+
+#include "mthca_provider.h"
+#include "mthca_doorbell.h"
+
+#define DRV_NAME "ib_mthca"
+#define PFX DRV_NAME ": "
+#define DRV_VERSION "1.0-ofed1.5.2"
+#define DRV_RELDATE "August 4, 2010"
+
+enum {
+ MTHCA_FLAG_DDR_HIDDEN = 1 << 1,
+ MTHCA_FLAG_SRQ = 1 << 2,
+ MTHCA_FLAG_MSI_X = 1 << 3,
+ MTHCA_FLAG_NO_LAM = 1 << 4,
+ MTHCA_FLAG_FMR = 1 << 5,
+ MTHCA_FLAG_MEMFREE = 1 << 6,
+ MTHCA_FLAG_PCIE = 1 << 7,
+ MTHCA_FLAG_SINAI_OPT = 1 << 8
+};
+
+enum {
+ MTHCA_MAX_PORTS = 2
+};
+
+enum {
+ MTHCA_BOARD_ID_LEN = 64
+};
+
+enum {
+ MTHCA_EQ_CONTEXT_SIZE = 0x40,
+ MTHCA_CQ_CONTEXT_SIZE = 0x40,
+ MTHCA_QP_CONTEXT_SIZE = 0x200,
+ MTHCA_RDB_ENTRY_SIZE = 0x20,
+ MTHCA_AV_SIZE = 0x20,
+ MTHCA_MGM_ENTRY_SIZE = 0x100,
+
+ /* Arbel FW gives us these, but we need them for Tavor */
+ MTHCA_MPT_ENTRY_SIZE = 0x40,
+ MTHCA_MTT_SEG_SIZE = 0x40,
+
+ MTHCA_QP_PER_MGM = 4 * (MTHCA_MGM_ENTRY_SIZE / 16 - 2)
+};
+
+enum {
+ MTHCA_EQ_CMD,
+ MTHCA_EQ_ASYNC,
+ MTHCA_EQ_COMP,
+ MTHCA_NUM_EQ
+};
+
+enum {
+ MTHCA_OPCODE_NOP = 0x00,
+ MTHCA_OPCODE_RDMA_WRITE = 0x08,
+ MTHCA_OPCODE_RDMA_WRITE_IMM = 0x09,
+ MTHCA_OPCODE_SEND = 0x0a,
+ MTHCA_OPCODE_SEND_IMM = 0x0b,
+ MTHCA_OPCODE_RDMA_READ = 0x10,
+ MTHCA_OPCODE_ATOMIC_CS = 0x11,
+ MTHCA_OPCODE_ATOMIC_FA = 0x12,
+ MTHCA_OPCODE_BIND_MW = 0x18,
+ MTHCA_OPCODE_INVALID = 0xff
+};
+
+enum {
+ MTHCA_CMD_USE_EVENTS = 1 << 0,
+ MTHCA_CMD_POST_DOORBELLS = 1 << 1
+};
+
+enum {
+ MTHCA_CMD_NUM_DBELL_DWORDS = 8
+};
+
+struct mthca_cmd {
+ struct pci_pool *pool;
+ struct mutex hcr_mutex;
+ struct semaphore poll_sem;
+ struct semaphore event_sem;
+ int max_cmds;
+ spinlock_t context_lock;
+ int free_head;
+ struct mthca_cmd_context *context;
+ u16 token_mask;
+ u32 flags;
+ void __iomem *dbell_map;
+ u16 dbell_offsets[MTHCA_CMD_NUM_DBELL_DWORDS];
+};
+
+struct mthca_limits {
+ int num_ports;
+ int vl_cap;
+ int mtu_cap;
+ int gid_table_len;
+ int pkey_table_len;
+ int local_ca_ack_delay;
+ int num_uars;
+ int max_sg;
+ int num_qps;
+ int max_wqes;
+ int max_desc_sz;
+ int max_qp_init_rdma;
+ int reserved_qps;
+ int num_srqs;
+ int max_srq_wqes;
+ int max_srq_sge;
+ int reserved_srqs;
+ int num_eecs;
+ int reserved_eecs;
+ int num_cqs;
+ int max_cqes;
+ int reserved_cqs;
+ int num_eqs;
+ int reserved_eqs;
+ int num_mpts;
+ int num_mtt_segs;
+ int mtt_seg_size;
+ int fmr_reserved_mtts;
+ int reserved_mtts;
+ int reserved_mrws;
+ int reserved_uars;
+ int num_mgms;
+ int num_amgms;
+ int reserved_mcgs;
+ int num_pds;
+ int reserved_pds;
+ u32 page_size_cap;
+ u32 flags;
+ u16 stat_rate_support;
+ u8 port_width_cap;
+};
+
+struct mthca_alloc {
+ u32 last;
+ u32 top;
+ u32 max;
+ u32 mask;
+ spinlock_t lock;
+ unsigned long *table;
+};
+
+struct mthca_array {
+ struct {
+ void **page;
+ int used;
+ } *page_list;
+};
+
+struct mthca_uar_table {
+ struct mthca_alloc alloc;
+ u64 uarc_base;
+ int uarc_size;
+};
+
+struct mthca_pd_table {
+ struct mthca_alloc alloc;
+};
+
+struct mthca_buddy {
+ unsigned long **bits;
+ int *num_free;
+ int max_order;
+ spinlock_t lock;
+};
+
+struct mthca_mr_table {
+ struct mthca_alloc mpt_alloc;
+ struct mthca_buddy mtt_buddy;
+ struct mthca_buddy *fmr_mtt_buddy;
+ u64 mtt_base;
+ u64 mpt_base;
+ struct mthca_icm_table *mtt_table;
+ struct mthca_icm_table *mpt_table;
+ struct {
+ void __iomem *mpt_base;
+ void __iomem *mtt_base;
+ struct mthca_buddy mtt_buddy;
+ } tavor_fmr;
+};
+
+struct mthca_eq_table {
+ struct mthca_alloc alloc;
+ void __iomem *clr_int;
+ u32 clr_mask;
+ u32 arm_mask;
+ struct mthca_eq eq[MTHCA_NUM_EQ];
+ u64 icm_virt;
+ struct page *icm_page;
+ dma_addr_t icm_dma;
+ int have_irq;
+ u8 inta_pin;
+};
+
+struct mthca_cq_table {
+ struct mthca_alloc alloc;
+ spinlock_t lock;
+ struct mthca_array cq;
+ struct mthca_icm_table *table;
+};
+
+struct mthca_srq_table {
+ struct mthca_alloc alloc;
+ spinlock_t lock;
+ struct mthca_array srq;
+ struct mthca_icm_table *table;
+};
+
+struct mthca_qp_table {
+ struct mthca_alloc alloc;
+ u32 rdb_base;
+ int rdb_shift;
+ int sqp_start;
+ spinlock_t lock;
+ struct mthca_array qp;
+ struct mthca_icm_table *qp_table;
+ struct mthca_icm_table *eqp_table;
+ struct mthca_icm_table *rdb_table;
+};
+
+struct mthca_av_table {
+ struct pci_pool *pool;
+ int num_ddr_avs;
+ u64 ddr_av_base;
+ void __iomem *av_map;
+ struct mthca_alloc alloc;
+};
+
+struct mthca_mcg_table {
+ struct mutex mutex;
+ struct mthca_alloc alloc;
+ struct mthca_icm_table *table;
+};
+
+struct mthca_catas_err {
+ u64 addr;
+ u32 __iomem *map;
+ u32 size;
+ struct timer_list timer;
+ struct list_head list;
+};
+
+extern struct mutex mthca_device_mutex;
+
+struct mthca_dev {
+ struct ib_device ib_dev;
+ struct pci_dev *pdev;
+
+ int hca_type;
+ unsigned long mthca_flags;
+ unsigned long device_cap_flags;
+
+ u32 rev_id;
+ char board_id[MTHCA_BOARD_ID_LEN];
+
+ /* firmware info */
+ u64 fw_ver;
+ union {
+ struct {
+ u64 fw_start;
+ u64 fw_end;
+ } tavor;
+ struct {
+ u64 clr_int_base;
+ u64 eq_arm_base;
+ u64 eq_set_ci_base;
+ struct mthca_icm *fw_icm;
+ struct mthca_icm *aux_icm;
+ u16 fw_pages;
+ } arbel;
+ } fw;
+
+ u64 ddr_start;
+ u64 ddr_end;
+
+ MTHCA_DECLARE_DOORBELL_LOCK(doorbell_lock)
+ struct mutex cap_mask_mutex;
+
+ void __iomem *hcr;
+ void __iomem *kar;
+ void __iomem *clr_base;
+ union {
+ struct {
+ void __iomem *ecr_base;
+ } tavor;
+ struct {
+ void __iomem *eq_arm;
+ void __iomem *eq_set_ci_base;
+ } arbel;
+ } eq_regs;
+
+ struct mthca_cmd cmd;
+ struct mthca_limits limits;
+
+ struct mthca_uar_table uar_table;
+ struct mthca_pd_table pd_table;
+ struct mthca_mr_table mr_table;
+ struct mthca_eq_table eq_table;
+ struct mthca_cq_table cq_table;
+ struct mthca_srq_table srq_table;
+ struct mthca_qp_table qp_table;
+ struct mthca_av_table av_table;
+ struct mthca_mcg_table mcg_table;
+
+ struct mthca_catas_err catas_err;
+
+ struct mthca_uar driver_uar;
+ struct mthca_db_table *db_tab;
+ struct mthca_pd driver_pd;
+ struct mthca_mr driver_mr;
+
+ struct ib_mad_agent *send_agent[MTHCA_MAX_PORTS][2];
+ struct ib_ah *sm_ah[MTHCA_MAX_PORTS];
+ spinlock_t sm_lock;
+ u8 rate[MTHCA_MAX_PORTS];
+ int active;
+};
+
+#ifdef CONFIG_INFINIBAND_MTHCA_DEBUG
+extern int mthca_debug_level;
+
+#define mthca_dbg(mdev, format, arg...) \
+ do { \
+ if (mthca_debug_level) \
+ dev_printk(KERN_DEBUG, &mdev->pdev->dev, format, ## arg); \
+ } while (0)
+
+#else /* CONFIG_INFINIBAND_MTHCA_DEBUG */
+
+#define mthca_dbg(mdev, format, arg...) do { (void) mdev; } while (0)
+
+#endif /* CONFIG_INFINIBAND_MTHCA_DEBUG */
+
+#define mthca_err(mdev, format, arg...) \
+ dev_err(&mdev->pdev->dev, format, ## arg)
+#define mthca_info(mdev, format, arg...) \
+ dev_info(&mdev->pdev->dev, format, ## arg)
+#define mthca_warn(mdev, format, arg...) \
+ dev_warn(&mdev->pdev->dev, format, ## arg)
+
+extern void __buggy_use_of_MTHCA_GET(void);
+extern void __buggy_use_of_MTHCA_PUT(void);
+
+#define MTHCA_GET(dest, source, offset) \
+ do { \
+ void *__p = (char *) (source) + (offset); \
+ switch (sizeof (dest)) { \
+ case 1: (dest) = *(u8 *) __p; break; \
+ case 2: (dest) = be16_to_cpup(__p); break; \
+ case 4: (dest) = be32_to_cpup(__p); break; \
+ case 8: (dest) = be64_to_cpup(__p); break; \
+ default: __buggy_use_of_MTHCA_GET(); \
+ } \
+ } while (0)
+
+#define MTHCA_PUT(dest, source, offset) \
+ do { \
+ void *__d = ((char *) (dest) + (offset)); \
+ switch (sizeof(source)) { \
+ case 1: *(u8 *) __d = (source); break; \
+ case 2: *(__be16 *) __d = cpu_to_be16(source); break; \
+ case 4: *(__be32 *) __d = cpu_to_be32(source); break; \
+ case 8: *(__be64 *) __d = cpu_to_be64(source); break; \
+ default: __buggy_use_of_MTHCA_PUT(); \
+ } \
+ } while (0)
+
+int mthca_reset(struct mthca_dev *mdev);
+
+u32 mthca_alloc(struct mthca_alloc *alloc);
+void mthca_free(struct mthca_alloc *alloc, u32 obj);
+int mthca_alloc_init(struct mthca_alloc *alloc, u32 num, u32 mask,
+ u32 reserved);
+void mthca_alloc_cleanup(struct mthca_alloc *alloc);
+void *mthca_array_get(struct mthca_array *array, int index);
+int mthca_array_set(struct mthca_array *array, int index, void *value);
+void mthca_array_clear(struct mthca_array *array, int index);
+int mthca_array_init(struct mthca_array *array, int nent);
+void mthca_array_cleanup(struct mthca_array *array, int nent);
+int mthca_buf_alloc(struct mthca_dev *dev, int size, int max_direct,
+ union mthca_buf *buf, int *is_direct, struct mthca_pd *pd,
+ int hca_write, struct mthca_mr *mr);
+void mthca_buf_free(struct mthca_dev *dev, int size, union mthca_buf *buf,
+ int is_direct, struct mthca_mr *mr);
+
+int mthca_init_uar_table(struct mthca_dev *dev);
+int mthca_init_pd_table(struct mthca_dev *dev);
+int mthca_init_mr_table(struct mthca_dev *dev);
+int mthca_init_eq_table(struct mthca_dev *dev);
+int mthca_init_cq_table(struct mthca_dev *dev);
+int mthca_init_srq_table(struct mthca_dev *dev);
+int mthca_init_qp_table(struct mthca_dev *dev);
+int mthca_init_av_table(struct mthca_dev *dev);
+int mthca_init_mcg_table(struct mthca_dev *dev);
+
+void mthca_cleanup_uar_table(struct mthca_dev *dev);
+void mthca_cleanup_pd_table(struct mthca_dev *dev);
+void mthca_cleanup_mr_table(struct mthca_dev *dev);
+void mthca_cleanup_eq_table(struct mthca_dev *dev);
+void mthca_cleanup_cq_table(struct mthca_dev *dev);
+void mthca_cleanup_srq_table(struct mthca_dev *dev);
+void mthca_cleanup_qp_table(struct mthca_dev *dev);
+void mthca_cleanup_av_table(struct mthca_dev *dev);
+void mthca_cleanup_mcg_table(struct mthca_dev *dev);
+
+int mthca_register_device(struct mthca_dev *dev);
+void mthca_unregister_device(struct mthca_dev *dev);
+
+void mthca_start_catas_poll(struct mthca_dev *dev);
+void mthca_stop_catas_poll(struct mthca_dev *dev);
+int __mthca_restart_one(struct pci_dev *pdev);
+int mthca_catas_init(void);
+void mthca_catas_cleanup(void);
+
+int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar);
+void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar);
+
+int mthca_pd_alloc(struct mthca_dev *dev, int privileged, struct mthca_pd *pd);
+void mthca_pd_free(struct mthca_dev *dev, struct mthca_pd *pd);
+
+int mthca_write_mtt_size(struct mthca_dev *dev);
+
+struct mthca_mtt *mthca_alloc_mtt(struct mthca_dev *dev, int size);
+void mthca_free_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt);
+int mthca_write_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt,
+ int start_index, u64 *buffer_list, int list_len);
+int mthca_mr_alloc(struct mthca_dev *dev, u32 pd, int buffer_size_shift,
+ u64 iova, u64 total_size, u32 access, struct mthca_mr *mr);
+int mthca_mr_alloc_notrans(struct mthca_dev *dev, u32 pd,
+ u32 access, struct mthca_mr *mr);
+int mthca_mr_alloc_phys(struct mthca_dev *dev, u32 pd,
+ u64 *buffer_list, int buffer_size_shift,
+ int list_len, u64 iova, u64 total_size,
+ u32 access, struct mthca_mr *mr);
+void mthca_free_mr(struct mthca_dev *dev, struct mthca_mr *mr);
+
+int mthca_fmr_alloc(struct mthca_dev *dev, u32 pd,
+ u32 access, struct mthca_fmr *fmr);
+int mthca_tavor_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+ int list_len, u64 iova);
+void mthca_tavor_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr);
+int mthca_arbel_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+ int list_len, u64 iova);
+void mthca_arbel_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr);
+int mthca_free_fmr(struct mthca_dev *dev, struct mthca_fmr *fmr);
+
+int mthca_map_eq_icm(struct mthca_dev *dev, u64 icm_virt);
+void mthca_unmap_eq_icm(struct mthca_dev *dev);
+
+int mthca_poll_cq(struct ib_cq *ibcq, int num_entries,
+ struct ib_wc *entry);
+int mthca_tavor_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
+int mthca_arbel_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
+int mthca_init_cq(struct mthca_dev *dev, int nent,
+ struct mthca_ucontext *ctx, u32 pdn,
+ struct mthca_cq *cq);
+void mthca_free_cq(struct mthca_dev *dev,
+ struct mthca_cq *cq);
+void mthca_cq_completion(struct mthca_dev *dev, u32 cqn);
+void mthca_cq_event(struct mthca_dev *dev, u32 cqn,
+ enum ib_event_type event_type);
+void mthca_cq_clean(struct mthca_dev *dev, struct mthca_cq *cq, u32 qpn,
+ struct mthca_srq *srq);
+void mthca_cq_resize_copy_cqes(struct mthca_cq *cq);
+int mthca_alloc_cq_buf(struct mthca_dev *dev, struct mthca_cq_buf *buf, int nent);
+void mthca_free_cq_buf(struct mthca_dev *dev, struct mthca_cq_buf *buf, int cqe);
+
+int mthca_alloc_srq(struct mthca_dev *dev, struct mthca_pd *pd,
+ struct ib_srq_attr *attr, struct mthca_srq *srq);
+void mthca_free_srq(struct mthca_dev *dev, struct mthca_srq *srq);
+int mthca_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+ enum ib_srq_attr_mask attr_mask, struct ib_udata *udata);
+int mthca_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr);
+int mthca_max_srq_sge(struct mthca_dev *dev);
+void mthca_srq_event(struct mthca_dev *dev, u32 srqn,
+ enum ib_event_type event_type);
+void mthca_free_srq_wqe(struct mthca_srq *srq, u32 wqe_addr);
+int mthca_tavor_post_srq_recv(struct ib_srq *srq, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr);
+int mthca_arbel_post_srq_recv(struct ib_srq *srq, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr);
+
+void mthca_qp_event(struct mthca_dev *dev, u32 qpn,
+ enum ib_event_type event_type);
+int mthca_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask,
+ struct ib_qp_init_attr *qp_init_attr);
+int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
+ struct ib_udata *udata);
+int mthca_tavor_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr);
+int mthca_tavor_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr);
+int mthca_arbel_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr);
+int mthca_arbel_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr);
+void mthca_free_err_wqe(struct mthca_dev *dev, struct mthca_qp *qp, int is_send,
+ int index, int *dbd, __be32 *new_wqe);
+int mthca_alloc_qp(struct mthca_dev *dev,
+ struct mthca_pd *pd,
+ struct mthca_cq *send_cq,
+ struct mthca_cq *recv_cq,
+ enum ib_qp_type type,
+ enum ib_sig_type send_policy,
+ struct ib_qp_cap *cap,
+ struct mthca_qp *qp);
+int mthca_alloc_sqp(struct mthca_dev *dev,
+ struct mthca_pd *pd,
+ struct mthca_cq *send_cq,
+ struct mthca_cq *recv_cq,
+ enum ib_sig_type send_policy,
+ struct ib_qp_cap *cap,
+ int qpn,
+ int port,
+ struct mthca_sqp *sqp);
+void mthca_free_qp(struct mthca_dev *dev, struct mthca_qp *qp);
+int mthca_create_ah(struct mthca_dev *dev,
+ struct mthca_pd *pd,
+ struct ib_ah_attr *ah_attr,
+ struct mthca_ah *ah);
+int mthca_destroy_ah(struct mthca_dev *dev, struct mthca_ah *ah);
+int mthca_read_ah(struct mthca_dev *dev, struct mthca_ah *ah,
+ struct ib_ud_header *header);
+int mthca_ah_query(struct ib_ah *ibah, struct ib_ah_attr *attr);
+int mthca_ah_grh_present(struct mthca_ah *ah);
+u8 mthca_get_rate(struct mthca_dev *dev, int static_rate, u8 port);
+enum ib_rate mthca_rate_to_ib(struct mthca_dev *dev, u8 mthca_rate, u8 port);
+
+int mthca_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
+int mthca_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
+
+int mthca_process_mad(struct ib_device *ibdev,
+ int mad_flags,
+ u8 port_num,
+ struct ib_wc *in_wc,
+ struct ib_grh *in_grh,
+ struct ib_mad *in_mad,
+ struct ib_mad *out_mad);
+int mthca_create_agents(struct mthca_dev *dev);
+void mthca_free_agents(struct mthca_dev *dev);
+
+static inline struct mthca_dev *to_mdev(struct ib_device *ibdev)
+{
+ return container_of(ibdev, struct mthca_dev, ib_dev);
+}
+
+static inline int mthca_is_memfree(struct mthca_dev *dev)
+{
+ return dev->mthca_flags & MTHCA_FLAG_MEMFREE;
+}
+
+#endif /* MTHCA_DEV_H */
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_doorbell.h b/sys/ofed/drivers/infiniband/hw/mthca/mthca_doorbell.h
new file mode 100644
index 0000000..14f51ef
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_doorbell.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/types.h>
+
+#define MTHCA_RD_DOORBELL 0x00
+#define MTHCA_SEND_DOORBELL 0x10
+#define MTHCA_RECEIVE_DOORBELL 0x18
+#define MTHCA_CQ_DOORBELL 0x20
+#define MTHCA_EQ_DOORBELL 0x28
+
+#if BITS_PER_LONG == 64
+/*
+ * Assume that we can just write a 64-bit doorbell atomically. s390
+ * actually doesn't have writeq() but S/390 systems don't even have
+ * PCI so we won't worry about it.
+ */
+
+#define MTHCA_DECLARE_DOORBELL_LOCK(name)
+#define MTHCA_INIT_DOORBELL_LOCK(ptr) do { } while (0)
+#define MTHCA_GET_DOORBELL_LOCK(ptr) (NULL)
+
+static inline void mthca_write64_raw(__be64 val, void __iomem *dest)
+{
+ __raw_writeq((__force u64) val, dest);
+}
+
+static inline void mthca_write64(u32 hi, u32 lo, void __iomem *dest,
+ spinlock_t *doorbell_lock)
+{
+ __raw_writeq((__force u64) cpu_to_be64((u64) hi << 32 | lo), dest);
+}
+
+static inline void mthca_write_db_rec(__be32 val[2], __be32 *db)
+{
+ *(u64 *) db = *(u64 *) val;
+}
+
+#else
+
+/*
+ * Just fall back to a spinlock to protect the doorbell if
+ * BITS_PER_LONG is 32 -- there's no portable way to do atomic 64-bit
+ * MMIO writes.
+ */
+
+#define MTHCA_DECLARE_DOORBELL_LOCK(name) spinlock_t name;
+#define MTHCA_INIT_DOORBELL_LOCK(ptr) spin_lock_init(ptr)
+#define MTHCA_GET_DOORBELL_LOCK(ptr) (ptr)
+
+static inline void mthca_write64_raw(__be64 val, void __iomem *dest)
+{
+ __raw_writel(((__force u32 *) &val)[0], dest);
+ __raw_writel(((__force u32 *) &val)[1], dest + 4);
+}
+
+static inline void mthca_write64(u32 hi, u32 lo, void __iomem *dest,
+ spinlock_t *doorbell_lock)
+{
+ unsigned long flags;
+
+ hi = (__force u32) cpu_to_be32(hi);
+ lo = (__force u32) cpu_to_be32(lo);
+
+ spin_lock_irqsave(doorbell_lock, flags);
+ __raw_writel(hi, dest);
+ __raw_writel(lo, dest + 4);
+ spin_unlock_irqrestore(doorbell_lock, flags);
+}
+
+static inline void mthca_write_db_rec(__be32 val[2], __be32 *db)
+{
+ db[0] = val[0];
+ wmb();
+ db[1] = val[1];
+}
+
+#endif
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_eq.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_eq.c
new file mode 100644
index 0000000..90e4e45
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_eq.c
@@ -0,0 +1,920 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+#include "mthca_config_reg.h"
+
+enum {
+ MTHCA_NUM_ASYNC_EQE = 0x80,
+ MTHCA_NUM_CMD_EQE = 0x80,
+ MTHCA_NUM_SPARE_EQE = 0x80,
+ MTHCA_EQ_ENTRY_SIZE = 0x20
+};
+
+/*
+ * Must be packed because start is 64 bits but only aligned to 32 bits.
+ */
+struct mthca_eq_context {
+ __be32 flags;
+ __be64 start;
+ __be32 logsize_usrpage;
+ __be32 tavor_pd; /* reserved for Arbel */
+ u8 reserved1[3];
+ u8 intr;
+ __be32 arbel_pd; /* lost_count for Tavor */
+ __be32 lkey;
+ u32 reserved2[2];
+ __be32 consumer_index;
+ __be32 producer_index;
+ u32 reserved3[4];
+} __attribute__((packed));
+
+#define MTHCA_EQ_STATUS_OK ( 0 << 28)
+#define MTHCA_EQ_STATUS_OVERFLOW ( 9 << 28)
+#define MTHCA_EQ_STATUS_WRITE_FAIL (10 << 28)
+#define MTHCA_EQ_OWNER_SW ( 0 << 24)
+#define MTHCA_EQ_OWNER_HW ( 1 << 24)
+#define MTHCA_EQ_FLAG_TR ( 1 << 18)
+#define MTHCA_EQ_FLAG_OI ( 1 << 17)
+#define MTHCA_EQ_STATE_ARMED ( 1 << 8)
+#define MTHCA_EQ_STATE_FIRED ( 2 << 8)
+#define MTHCA_EQ_STATE_ALWAYS_ARMED ( 3 << 8)
+#define MTHCA_EQ_STATE_ARBEL ( 8 << 8)
+
+enum {
+ MTHCA_EVENT_TYPE_COMP = 0x00,
+ MTHCA_EVENT_TYPE_PATH_MIG = 0x01,
+ MTHCA_EVENT_TYPE_COMM_EST = 0x02,
+ MTHCA_EVENT_TYPE_SQ_DRAINED = 0x03,
+ MTHCA_EVENT_TYPE_SRQ_QP_LAST_WQE = 0x13,
+ MTHCA_EVENT_TYPE_SRQ_LIMIT = 0x14,
+ MTHCA_EVENT_TYPE_CQ_ERROR = 0x04,
+ MTHCA_EVENT_TYPE_WQ_CATAS_ERROR = 0x05,
+ MTHCA_EVENT_TYPE_EEC_CATAS_ERROR = 0x06,
+ MTHCA_EVENT_TYPE_PATH_MIG_FAILED = 0x07,
+ MTHCA_EVENT_TYPE_WQ_INVAL_REQ_ERROR = 0x10,
+ MTHCA_EVENT_TYPE_WQ_ACCESS_ERROR = 0x11,
+ MTHCA_EVENT_TYPE_SRQ_CATAS_ERROR = 0x12,
+ MTHCA_EVENT_TYPE_LOCAL_CATAS_ERROR = 0x08,
+ MTHCA_EVENT_TYPE_PORT_CHANGE = 0x09,
+ MTHCA_EVENT_TYPE_EQ_OVERFLOW = 0x0f,
+ MTHCA_EVENT_TYPE_ECC_DETECT = 0x0e,
+ MTHCA_EVENT_TYPE_CMD = 0x0a
+};
+
+#define MTHCA_ASYNC_EVENT_MASK ((1ULL << MTHCA_EVENT_TYPE_PATH_MIG) | \
+ (1ULL << MTHCA_EVENT_TYPE_COMM_EST) | \
+ (1ULL << MTHCA_EVENT_TYPE_SQ_DRAINED) | \
+ (1ULL << MTHCA_EVENT_TYPE_CQ_ERROR) | \
+ (1ULL << MTHCA_EVENT_TYPE_WQ_CATAS_ERROR) | \
+ (1ULL << MTHCA_EVENT_TYPE_EEC_CATAS_ERROR) | \
+ (1ULL << MTHCA_EVENT_TYPE_PATH_MIG_FAILED) | \
+ (1ULL << MTHCA_EVENT_TYPE_WQ_INVAL_REQ_ERROR) | \
+ (1ULL << MTHCA_EVENT_TYPE_WQ_ACCESS_ERROR) | \
+ (1ULL << MTHCA_EVENT_TYPE_LOCAL_CATAS_ERROR) | \
+ (1ULL << MTHCA_EVENT_TYPE_PORT_CHANGE) | \
+ (1ULL << MTHCA_EVENT_TYPE_ECC_DETECT))
+#define MTHCA_SRQ_EVENT_MASK ((1ULL << MTHCA_EVENT_TYPE_SRQ_CATAS_ERROR) | \
+ (1ULL << MTHCA_EVENT_TYPE_SRQ_QP_LAST_WQE) | \
+ (1ULL << MTHCA_EVENT_TYPE_SRQ_LIMIT))
+#define MTHCA_CMD_EVENT_MASK (1ULL << MTHCA_EVENT_TYPE_CMD)
+
+#define MTHCA_EQ_DB_INC_CI (1 << 24)
+#define MTHCA_EQ_DB_REQ_NOT (2 << 24)
+#define MTHCA_EQ_DB_DISARM_CQ (3 << 24)
+#define MTHCA_EQ_DB_SET_CI (4 << 24)
+#define MTHCA_EQ_DB_ALWAYS_ARM (5 << 24)
+
+struct mthca_eqe {
+ u8 reserved1;
+ u8 type;
+ u8 reserved2;
+ u8 subtype;
+ union {
+ u32 raw[6];
+ struct {
+ __be32 cqn;
+ } __attribute__((packed)) comp;
+ struct {
+ u16 reserved1;
+ __be16 token;
+ u32 reserved2;
+ u8 reserved3[3];
+ u8 status;
+ __be64 out_param;
+ } __attribute__((packed)) cmd;
+ struct {
+ __be32 qpn;
+ } __attribute__((packed)) qp;
+ struct {
+ __be32 srqn;
+ } __attribute__((packed)) srq;
+ struct {
+ __be32 cqn;
+ u32 reserved1;
+ u8 reserved2[3];
+ u8 syndrome;
+ } __attribute__((packed)) cq_err;
+ struct {
+ u32 reserved1[2];
+ __be32 port;
+ } __attribute__((packed)) port_change;
+ } event;
+ u8 reserved3[3];
+ u8 owner;
+} __attribute__((packed));
+
+#define MTHCA_EQ_ENTRY_OWNER_SW (0 << 7)
+#define MTHCA_EQ_ENTRY_OWNER_HW (1 << 7)
+
+static inline u64 async_mask(struct mthca_dev *dev)
+{
+ return dev->mthca_flags & MTHCA_FLAG_SRQ ?
+ MTHCA_ASYNC_EVENT_MASK | MTHCA_SRQ_EVENT_MASK :
+ MTHCA_ASYNC_EVENT_MASK;
+}
+
+static inline void tavor_set_eq_ci(struct mthca_dev *dev, struct mthca_eq *eq, u32 ci)
+{
+ /*
+ * This barrier makes sure that all updates to ownership bits
+ * done by set_eqe_hw() hit memory before the consumer index
+ * is updated. set_eq_ci() allows the HCA to possibly write
+ * more EQ entries, and we want to avoid the exceedingly
+ * unlikely possibility of the HCA writing an entry and then
+ * having set_eqe_hw() overwrite the owner field.
+ */
+ wmb();
+ mthca_write64(MTHCA_EQ_DB_SET_CI | eq->eqn, ci & (eq->nent - 1),
+ dev->kar + MTHCA_EQ_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+}
+
+static inline void arbel_set_eq_ci(struct mthca_dev *dev, struct mthca_eq *eq, u32 ci)
+{
+ /* See comment in tavor_set_eq_ci() above. */
+ wmb();
+ __raw_writel((__force u32) cpu_to_be32(ci),
+ dev->eq_regs.arbel.eq_set_ci_base + eq->eqn * 8);
+ /* We still want ordering, just not swabbing, so add a barrier */
+ mb();
+}
+
+static inline void set_eq_ci(struct mthca_dev *dev, struct mthca_eq *eq, u32 ci)
+{
+ if (mthca_is_memfree(dev))
+ arbel_set_eq_ci(dev, eq, ci);
+ else
+ tavor_set_eq_ci(dev, eq, ci);
+}
+
+static inline void tavor_eq_req_not(struct mthca_dev *dev, int eqn)
+{
+ mthca_write64(MTHCA_EQ_DB_REQ_NOT | eqn, 0,
+ dev->kar + MTHCA_EQ_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+}
+
+static inline void arbel_eq_req_not(struct mthca_dev *dev, u32 eqn_mask)
+{
+ writel(eqn_mask, dev->eq_regs.arbel.eq_arm);
+}
+
+static inline void disarm_cq(struct mthca_dev *dev, int eqn, int cqn)
+{
+ if (!mthca_is_memfree(dev)) {
+ mthca_write64(MTHCA_EQ_DB_DISARM_CQ | eqn, cqn,
+ dev->kar + MTHCA_EQ_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+ }
+}
+
+static inline struct mthca_eqe *get_eqe(struct mthca_eq *eq, u32 entry)
+{
+ unsigned long off = (entry & (eq->nent - 1)) * MTHCA_EQ_ENTRY_SIZE;
+ return eq->page_list[off / PAGE_SIZE].buf + off % PAGE_SIZE;
+}
+
+static inline struct mthca_eqe *next_eqe_sw(struct mthca_eq *eq)
+{
+ struct mthca_eqe *eqe;
+ eqe = get_eqe(eq, eq->cons_index);
+ return (MTHCA_EQ_ENTRY_OWNER_HW & eqe->owner) ? NULL : eqe;
+}
+
+static inline void set_eqe_hw(struct mthca_eqe *eqe)
+{
+ eqe->owner = MTHCA_EQ_ENTRY_OWNER_HW;
+}
+
+static void port_change(struct mthca_dev *dev, int port, int active)
+{
+ struct ib_event record;
+
+ mthca_dbg(dev, "Port change to %s for port %d\n",
+ active ? "active" : "down", port);
+
+ record.device = &dev->ib_dev;
+ record.event = active ? IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
+ record.element.port_num = port;
+
+ ib_dispatch_event(&record);
+}
+
+static int mthca_eq_int(struct mthca_dev *dev, struct mthca_eq *eq)
+{
+ struct mthca_eqe *eqe;
+ int disarm_cqn;
+ int eqes_found = 0;
+ int set_ci = 0;
+
+ while ((eqe = next_eqe_sw(eq))) {
+ /*
+ * Make sure we read EQ entry contents after we've
+ * checked the ownership bit.
+ */
+ rmb();
+
+ switch (eqe->type) {
+ case MTHCA_EVENT_TYPE_COMP:
+ disarm_cqn = be32_to_cpu(eqe->event.comp.cqn) & 0xffffff;
+ disarm_cq(dev, eq->eqn, disarm_cqn);
+ mthca_cq_completion(dev, disarm_cqn);
+ break;
+
+ case MTHCA_EVENT_TYPE_PATH_MIG:
+ mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+ IB_EVENT_PATH_MIG);
+ break;
+
+ case MTHCA_EVENT_TYPE_COMM_EST:
+ mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+ IB_EVENT_COMM_EST);
+ break;
+
+ case MTHCA_EVENT_TYPE_SQ_DRAINED:
+ mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+ IB_EVENT_SQ_DRAINED);
+ break;
+
+ case MTHCA_EVENT_TYPE_SRQ_QP_LAST_WQE:
+ mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+ IB_EVENT_QP_LAST_WQE_REACHED);
+ break;
+
+ case MTHCA_EVENT_TYPE_SRQ_LIMIT:
+ mthca_srq_event(dev, be32_to_cpu(eqe->event.srq.srqn) & 0xffffff,
+ IB_EVENT_SRQ_LIMIT_REACHED);
+ break;
+
+ case MTHCA_EVENT_TYPE_WQ_CATAS_ERROR:
+ mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+ IB_EVENT_QP_FATAL);
+ break;
+
+ case MTHCA_EVENT_TYPE_PATH_MIG_FAILED:
+ mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+ IB_EVENT_PATH_MIG_ERR);
+ break;
+
+ case MTHCA_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+ mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+ IB_EVENT_QP_REQ_ERR);
+ break;
+
+ case MTHCA_EVENT_TYPE_WQ_ACCESS_ERROR:
+ mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+ IB_EVENT_QP_ACCESS_ERR);
+ break;
+
+ case MTHCA_EVENT_TYPE_CMD:
+ mthca_cmd_event(dev,
+ be16_to_cpu(eqe->event.cmd.token),
+ eqe->event.cmd.status,
+ be64_to_cpu(eqe->event.cmd.out_param));
+ break;
+
+ case MTHCA_EVENT_TYPE_PORT_CHANGE:
+ port_change(dev,
+ (be32_to_cpu(eqe->event.port_change.port) >> 28) & 3,
+ eqe->subtype == 0x4);
+ break;
+
+ case MTHCA_EVENT_TYPE_CQ_ERROR:
+ mthca_warn(dev, "CQ %s on CQN %06x\n",
+ eqe->event.cq_err.syndrome == 1 ?
+ "overrun" : "access violation",
+ be32_to_cpu(eqe->event.cq_err.cqn) & 0xffffff);
+ mthca_cq_event(dev, be32_to_cpu(eqe->event.cq_err.cqn),
+ IB_EVENT_CQ_ERR);
+ break;
+
+ case MTHCA_EVENT_TYPE_EQ_OVERFLOW:
+ mthca_warn(dev, "EQ overrun on EQN %d\n", eq->eqn);
+ break;
+
+ case MTHCA_EVENT_TYPE_EEC_CATAS_ERROR:
+ case MTHCA_EVENT_TYPE_SRQ_CATAS_ERROR:
+ case MTHCA_EVENT_TYPE_LOCAL_CATAS_ERROR:
+ case MTHCA_EVENT_TYPE_ECC_DETECT:
+ default:
+ mthca_warn(dev, "Unhandled event %02x(%02x) on EQ %d\n",
+ eqe->type, eqe->subtype, eq->eqn);
+ break;
+ };
+
+ set_eqe_hw(eqe);
+ ++eq->cons_index;
+ eqes_found = 1;
+ ++set_ci;
+
+ /*
+ * The HCA will think the queue has overflowed if we
+ * don't tell it we've been processing events. We
+ * create our EQs with MTHCA_NUM_SPARE_EQE extra
+ * entries, so we must update our consumer index at
+ * least that often.
+ */
+ if (unlikely(set_ci >= MTHCA_NUM_SPARE_EQE)) {
+ /*
+ * Conditional on hca_type is OK here because
+ * this is a rare case, not the fast path.
+ */
+ set_eq_ci(dev, eq, eq->cons_index);
+ set_ci = 0;
+ }
+ }
+
+ /*
+ * Rely on caller to set consumer index so that we don't have
+ * to test hca_type in our interrupt handling fast path.
+ */
+ return eqes_found;
+}
+
+static irqreturn_t mthca_tavor_interrupt(int irq, void *dev_ptr)
+{
+ struct mthca_dev *dev = dev_ptr;
+ u32 ecr;
+ int i;
+
+ if (dev->eq_table.clr_mask)
+ writel(dev->eq_table.clr_mask, dev->eq_table.clr_int);
+
+ ecr = readl(dev->eq_regs.tavor.ecr_base + 4);
+ if (!ecr)
+ return IRQ_NONE;
+
+ writel(ecr, dev->eq_regs.tavor.ecr_base +
+ MTHCA_ECR_CLR_BASE - MTHCA_ECR_BASE + 4);
+
+ for (i = 0; i < MTHCA_NUM_EQ; ++i)
+ if (ecr & dev->eq_table.eq[i].eqn_mask) {
+ if (mthca_eq_int(dev, &dev->eq_table.eq[i]))
+ tavor_set_eq_ci(dev, &dev->eq_table.eq[i],
+ dev->eq_table.eq[i].cons_index);
+ tavor_eq_req_not(dev, dev->eq_table.eq[i].eqn);
+ }
+
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t mthca_tavor_msi_x_interrupt(int irq, void *eq_ptr)
+{
+ struct mthca_eq *eq = eq_ptr;
+ struct mthca_dev *dev = eq->dev;
+
+ mthca_eq_int(dev, eq);
+ tavor_set_eq_ci(dev, eq, eq->cons_index);
+ tavor_eq_req_not(dev, eq->eqn);
+
+ /* MSI-X vectors always belong to us */
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t mthca_arbel_interrupt(int irq, void *dev_ptr)
+{
+ struct mthca_dev *dev = dev_ptr;
+ int work = 0;
+ int i;
+
+ if (dev->eq_table.clr_mask)
+ writel(dev->eq_table.clr_mask, dev->eq_table.clr_int);
+
+ for (i = 0; i < MTHCA_NUM_EQ; ++i)
+ if (mthca_eq_int(dev, &dev->eq_table.eq[i])) {
+ work = 1;
+ arbel_set_eq_ci(dev, &dev->eq_table.eq[i],
+ dev->eq_table.eq[i].cons_index);
+ }
+
+ arbel_eq_req_not(dev, dev->eq_table.arm_mask);
+
+ return IRQ_RETVAL(work);
+}
+
+static irqreturn_t mthca_arbel_msi_x_interrupt(int irq, void *eq_ptr)
+{
+ struct mthca_eq *eq = eq_ptr;
+ struct mthca_dev *dev = eq->dev;
+
+ mthca_eq_int(dev, eq);
+ arbel_set_eq_ci(dev, eq, eq->cons_index);
+ arbel_eq_req_not(dev, eq->eqn_mask);
+
+ /* MSI-X vectors always belong to us */
+ return IRQ_HANDLED;
+}
+
+static int mthca_create_eq(struct mthca_dev *dev,
+ int nent,
+ u8 intr,
+ struct mthca_eq *eq)
+{
+ int npages;
+ u64 *dma_list = NULL;
+ dma_addr_t t;
+ struct mthca_mailbox *mailbox;
+ struct mthca_eq_context *eq_context;
+ int err = -ENOMEM;
+ int i;
+ u8 status;
+
+ eq->dev = dev;
+ eq->nent = roundup_pow_of_two(max(nent, 2));
+ npages = ALIGN(eq->nent * MTHCA_EQ_ENTRY_SIZE, PAGE_SIZE) / PAGE_SIZE;
+
+ eq->page_list = kmalloc(npages * sizeof *eq->page_list,
+ GFP_KERNEL);
+ if (!eq->page_list)
+ goto err_out;
+
+ for (i = 0; i < npages; ++i)
+ eq->page_list[i].buf = NULL;
+
+ dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL);
+ if (!dma_list)
+ goto err_out_free;
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox))
+ goto err_out_free;
+ eq_context = mailbox->buf;
+
+ for (i = 0; i < npages; ++i) {
+ eq->page_list[i].buf = dma_alloc_coherent(&dev->pdev->dev,
+ PAGE_SIZE, &t, GFP_KERNEL);
+ if (!eq->page_list[i].buf)
+ goto err_out_free_pages;
+
+ dma_list[i] = t;
+ pci_unmap_addr_set(&eq->page_list[i], mapping, t);
+
+ clear_page(eq->page_list[i].buf);
+ }
+
+ for (i = 0; i < eq->nent; ++i)
+ set_eqe_hw(get_eqe(eq, i));
+
+ eq->eqn = mthca_alloc(&dev->eq_table.alloc);
+ if (eq->eqn == -1)
+ goto err_out_free_pages;
+
+ err = mthca_mr_alloc_phys(dev, dev->driver_pd.pd_num,
+ dma_list, PAGE_SHIFT, npages,
+ 0, npages * PAGE_SIZE,
+ MTHCA_MPT_FLAG_LOCAL_WRITE |
+ MTHCA_MPT_FLAG_LOCAL_READ,
+ &eq->mr);
+ if (err)
+ goto err_out_free_eq;
+
+ memset(eq_context, 0, sizeof *eq_context);
+ eq_context->flags = cpu_to_be32(MTHCA_EQ_STATUS_OK |
+ MTHCA_EQ_OWNER_HW |
+ MTHCA_EQ_STATE_ARMED |
+ MTHCA_EQ_FLAG_TR);
+ if (mthca_is_memfree(dev))
+ eq_context->flags |= cpu_to_be32(MTHCA_EQ_STATE_ARBEL);
+
+ eq_context->logsize_usrpage = cpu_to_be32((ffs(eq->nent) - 1) << 24);
+ if (mthca_is_memfree(dev)) {
+ eq_context->arbel_pd = cpu_to_be32(dev->driver_pd.pd_num);
+ } else {
+ eq_context->logsize_usrpage |= cpu_to_be32(dev->driver_uar.index);
+ eq_context->tavor_pd = cpu_to_be32(dev->driver_pd.pd_num);
+ }
+ eq_context->intr = intr;
+ eq_context->lkey = cpu_to_be32(eq->mr.ibmr.lkey);
+
+ err = mthca_SW2HW_EQ(dev, mailbox, eq->eqn, &status);
+ if (err) {
+ mthca_warn(dev, "SW2HW_EQ failed (%d)\n", err);
+ goto err_out_free_mr;
+ }
+ if (status) {
+ mthca_warn(dev, "SW2HW_EQ returned status 0x%02x\n",
+ status);
+ err = -EINVAL;
+ goto err_out_free_mr;
+ }
+
+ kfree(dma_list);
+ mthca_free_mailbox(dev, mailbox);
+
+ eq->eqn_mask = swab32(1 << eq->eqn);
+ eq->cons_index = 0;
+
+ dev->eq_table.arm_mask |= eq->eqn_mask;
+
+ mthca_dbg(dev, "Allocated EQ %d with %d entries\n",
+ eq->eqn, eq->nent);
+
+ return err;
+
+ err_out_free_mr:
+ mthca_free_mr(dev, &eq->mr);
+
+ err_out_free_eq:
+ mthca_free(&dev->eq_table.alloc, eq->eqn);
+
+ err_out_free_pages:
+ for (i = 0; i < npages; ++i)
+ if (eq->page_list[i].buf)
+ dma_free_coherent(&dev->pdev->dev, PAGE_SIZE,
+ eq->page_list[i].buf,
+ pci_unmap_addr(&eq->page_list[i],
+ mapping));
+
+ mthca_free_mailbox(dev, mailbox);
+
+ err_out_free:
+ kfree(eq->page_list);
+ kfree(dma_list);
+
+ err_out:
+ return err;
+}
+
+static void mthca_free_eq(struct mthca_dev *dev,
+ struct mthca_eq *eq)
+{
+ struct mthca_mailbox *mailbox;
+ int err;
+ u8 status;
+ int npages = (eq->nent * MTHCA_EQ_ENTRY_SIZE + PAGE_SIZE - 1) /
+ PAGE_SIZE;
+ int i;
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox))
+ return;
+
+ err = mthca_HW2SW_EQ(dev, mailbox, eq->eqn, &status);
+ if (err)
+ mthca_warn(dev, "HW2SW_EQ failed (%d)\n", err);
+ if (status)
+ mthca_warn(dev, "HW2SW_EQ returned status 0x%02x\n", status);
+
+ dev->eq_table.arm_mask &= ~eq->eqn_mask;
+
+ if (0) {
+ mthca_dbg(dev, "Dumping EQ context %02x:\n", eq->eqn);
+ for (i = 0; i < sizeof (struct mthca_eq_context) / 4; ++i) {
+ if (i % 4 == 0)
+ printk("[%02x] ", i * 4);
+ printk(" %08x", be32_to_cpup(mailbox->buf + i * 4));
+ if ((i + 1) % 4 == 0)
+ printk("\n");
+ }
+ }
+
+ mthca_free_mr(dev, &eq->mr);
+ for (i = 0; i < npages; ++i)
+ pci_free_consistent(dev->pdev, PAGE_SIZE,
+ eq->page_list[i].buf,
+ pci_unmap_addr(&eq->page_list[i], mapping));
+
+ kfree(eq->page_list);
+ mthca_free_mailbox(dev, mailbox);
+}
+
+static void mthca_free_irqs(struct mthca_dev *dev)
+{
+ int i;
+
+ if (dev->eq_table.have_irq)
+ free_irq(dev->pdev->irq, dev);
+ for (i = 0; i < MTHCA_NUM_EQ; ++i)
+ if (dev->eq_table.eq[i].have_irq) {
+ free_irq(dev->eq_table.eq[i].msi_x_vector,
+ dev->eq_table.eq + i);
+ dev->eq_table.eq[i].have_irq = 0;
+ }
+}
+
+static int mthca_map_reg(struct mthca_dev *dev,
+ unsigned long offset, unsigned long size,
+ void __iomem **map)
+{
+ unsigned long base = pci_resource_start(dev->pdev, 0);
+
+ *map = ioremap(base + offset, size);
+ if (!*map)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static int mthca_map_eq_regs(struct mthca_dev *dev)
+{
+ if (mthca_is_memfree(dev)) {
+ /*
+ * We assume that the EQ arm and EQ set CI registers
+ * fall within the first BAR. We can't trust the
+ * values firmware gives us, since those addresses are
+ * valid on the HCA's side of the PCI bus but not
+ * necessarily the host side.
+ */
+ if (mthca_map_reg(dev, (pci_resource_len(dev->pdev, 0) - 1) &
+ dev->fw.arbel.clr_int_base, MTHCA_CLR_INT_SIZE,
+ &dev->clr_base)) {
+ mthca_err(dev, "Couldn't map interrupt clear register, "
+ "aborting.\n");
+ return -ENOMEM;
+ }
+
+ /*
+ * Add 4 because we limit ourselves to EQs 0 ... 31,
+ * so we only need the low word of the register.
+ */
+ if (mthca_map_reg(dev, ((pci_resource_len(dev->pdev, 0) - 1) &
+ dev->fw.arbel.eq_arm_base) + 4, 4,
+ &dev->eq_regs.arbel.eq_arm)) {
+ mthca_err(dev, "Couldn't map EQ arm register, aborting.\n");
+ iounmap(dev->clr_base);
+ return -ENOMEM;
+ }
+
+ if (mthca_map_reg(dev, (pci_resource_len(dev->pdev, 0) - 1) &
+ dev->fw.arbel.eq_set_ci_base,
+ MTHCA_EQ_SET_CI_SIZE,
+ &dev->eq_regs.arbel.eq_set_ci_base)) {
+ mthca_err(dev, "Couldn't map EQ CI register, aborting.\n");
+ iounmap(dev->eq_regs.arbel.eq_arm);
+ iounmap(dev->clr_base);
+ return -ENOMEM;
+ }
+ } else {
+ if (mthca_map_reg(dev, MTHCA_CLR_INT_BASE, MTHCA_CLR_INT_SIZE,
+ &dev->clr_base)) {
+ mthca_err(dev, "Couldn't map interrupt clear register, "
+ "aborting.\n");
+ return -ENOMEM;
+ }
+
+ if (mthca_map_reg(dev, MTHCA_ECR_BASE,
+ MTHCA_ECR_SIZE + MTHCA_ECR_CLR_SIZE,
+ &dev->eq_regs.tavor.ecr_base)) {
+ mthca_err(dev, "Couldn't map ecr register, "
+ "aborting.\n");
+ iounmap(dev->clr_base);
+ return -ENOMEM;
+ }
+ }
+
+ return 0;
+
+}
+
+static void mthca_unmap_eq_regs(struct mthca_dev *dev)
+{
+ if (mthca_is_memfree(dev)) {
+ iounmap(dev->eq_regs.arbel.eq_set_ci_base);
+ iounmap(dev->eq_regs.arbel.eq_arm);
+ iounmap(dev->clr_base);
+ } else {
+ iounmap(dev->eq_regs.tavor.ecr_base);
+ iounmap(dev->clr_base);
+ }
+}
+
+int mthca_map_eq_icm(struct mthca_dev *dev, u64 icm_virt)
+{
+ int ret;
+ u8 status;
+
+ /*
+ * We assume that mapping one page is enough for the whole EQ
+ * context table. This is fine with all current HCAs, because
+ * we only use 32 EQs and each EQ uses 32 bytes of context
+ * memory, or 1 KB total.
+ */
+ dev->eq_table.icm_virt = icm_virt;
+ dev->eq_table.icm_page = alloc_page(GFP_HIGHUSER);
+ if (!dev->eq_table.icm_page)
+ return -ENOMEM;
+ dev->eq_table.icm_dma = pci_map_page(dev->pdev, dev->eq_table.icm_page, 0,
+ PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
+ if (pci_dma_mapping_error(dev->pdev, dev->eq_table.icm_dma)) {
+ __free_page(dev->eq_table.icm_page);
+ return -ENOMEM;
+ }
+
+ ret = mthca_MAP_ICM_page(dev, dev->eq_table.icm_dma, icm_virt, &status);
+ if (!ret && status)
+ ret = -EINVAL;
+ if (ret) {
+ pci_unmap_page(dev->pdev, dev->eq_table.icm_dma, PAGE_SIZE,
+ PCI_DMA_BIDIRECTIONAL);
+ __free_page(dev->eq_table.icm_page);
+ }
+
+ return ret;
+}
+
+void mthca_unmap_eq_icm(struct mthca_dev *dev)
+{
+ u8 status;
+
+ mthca_UNMAP_ICM(dev, dev->eq_table.icm_virt, 1, &status);
+ pci_unmap_page(dev->pdev, dev->eq_table.icm_dma, PAGE_SIZE,
+ PCI_DMA_BIDIRECTIONAL);
+ __free_page(dev->eq_table.icm_page);
+}
+
+int mthca_init_eq_table(struct mthca_dev *dev)
+{
+ int err;
+ u8 status;
+ u8 intr;
+ int i;
+
+ err = mthca_alloc_init(&dev->eq_table.alloc,
+ dev->limits.num_eqs,
+ dev->limits.num_eqs - 1,
+ dev->limits.reserved_eqs);
+ if (err)
+ return err;
+
+ err = mthca_map_eq_regs(dev);
+ if (err)
+ goto err_out_free;
+
+ if (dev->mthca_flags & MTHCA_FLAG_MSI_X) {
+ dev->eq_table.clr_mask = 0;
+ } else {
+ dev->eq_table.clr_mask =
+ swab32(1 << (dev->eq_table.inta_pin & 31));
+ dev->eq_table.clr_int = dev->clr_base +
+ (dev->eq_table.inta_pin < 32 ? 4 : 0);
+ }
+
+ dev->eq_table.arm_mask = 0;
+
+ intr = dev->eq_table.inta_pin;
+
+ err = mthca_create_eq(dev, dev->limits.num_cqs + MTHCA_NUM_SPARE_EQE,
+ (dev->mthca_flags & MTHCA_FLAG_MSI_X) ? 128 : intr,
+ &dev->eq_table.eq[MTHCA_EQ_COMP]);
+ if (err)
+ goto err_out_unmap;
+
+ err = mthca_create_eq(dev, MTHCA_NUM_ASYNC_EQE + MTHCA_NUM_SPARE_EQE,
+ (dev->mthca_flags & MTHCA_FLAG_MSI_X) ? 129 : intr,
+ &dev->eq_table.eq[MTHCA_EQ_ASYNC]);
+ if (err)
+ goto err_out_comp;
+
+ err = mthca_create_eq(dev, MTHCA_NUM_CMD_EQE + MTHCA_NUM_SPARE_EQE,
+ (dev->mthca_flags & MTHCA_FLAG_MSI_X) ? 130 : intr,
+ &dev->eq_table.eq[MTHCA_EQ_CMD]);
+ if (err)
+ goto err_out_async;
+
+ if (dev->mthca_flags & MTHCA_FLAG_MSI_X) {
+ static const char *eq_name[] = {
+ [MTHCA_EQ_COMP] = DRV_NAME " (comp)",
+ [MTHCA_EQ_ASYNC] = DRV_NAME " (async)",
+ [MTHCA_EQ_CMD] = DRV_NAME " (cmd)"
+ };
+
+ for (i = 0; i < MTHCA_NUM_EQ; ++i) {
+ err = request_irq(dev->eq_table.eq[i].msi_x_vector,
+ mthca_is_memfree(dev) ?
+ mthca_arbel_msi_x_interrupt :
+ mthca_tavor_msi_x_interrupt,
+ 0, eq_name[i], dev->eq_table.eq + i);
+ if (err)
+ goto err_out_cmd;
+ dev->eq_table.eq[i].have_irq = 1;
+ }
+ } else {
+ err = request_irq(dev->pdev->irq,
+ mthca_is_memfree(dev) ?
+ mthca_arbel_interrupt :
+ mthca_tavor_interrupt,
+ IRQF_SHARED, DRV_NAME, dev);
+ if (err)
+ goto err_out_cmd;
+ dev->eq_table.have_irq = 1;
+ }
+
+ err = mthca_MAP_EQ(dev, async_mask(dev),
+ 0, dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn, &status);
+ if (err)
+ mthca_warn(dev, "MAP_EQ for async EQ %d failed (%d)\n",
+ dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn, err);
+ if (status)
+ mthca_warn(dev, "MAP_EQ for async EQ %d returned status 0x%02x\n",
+ dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn, status);
+
+ err = mthca_MAP_EQ(dev, MTHCA_CMD_EVENT_MASK,
+ 0, dev->eq_table.eq[MTHCA_EQ_CMD].eqn, &status);
+ if (err)
+ mthca_warn(dev, "MAP_EQ for cmd EQ %d failed (%d)\n",
+ dev->eq_table.eq[MTHCA_EQ_CMD].eqn, err);
+ if (status)
+ mthca_warn(dev, "MAP_EQ for cmd EQ %d returned status 0x%02x\n",
+ dev->eq_table.eq[MTHCA_EQ_CMD].eqn, status);
+
+ for (i = 0; i < MTHCA_NUM_EQ; ++i)
+ if (mthca_is_memfree(dev))
+ arbel_eq_req_not(dev, dev->eq_table.eq[i].eqn_mask);
+ else
+ tavor_eq_req_not(dev, dev->eq_table.eq[i].eqn);
+
+ return 0;
+
+err_out_cmd:
+ mthca_free_irqs(dev);
+ mthca_free_eq(dev, &dev->eq_table.eq[MTHCA_EQ_CMD]);
+
+err_out_async:
+ mthca_free_eq(dev, &dev->eq_table.eq[MTHCA_EQ_ASYNC]);
+
+err_out_comp:
+ mthca_free_eq(dev, &dev->eq_table.eq[MTHCA_EQ_COMP]);
+
+err_out_unmap:
+ mthca_unmap_eq_regs(dev);
+
+err_out_free:
+ mthca_alloc_cleanup(&dev->eq_table.alloc);
+ return err;
+}
+
+void mthca_cleanup_eq_table(struct mthca_dev *dev)
+{
+ u8 status;
+ int i;
+
+ mthca_free_irqs(dev);
+
+ mthca_MAP_EQ(dev, async_mask(dev),
+ 1, dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn, &status);
+ mthca_MAP_EQ(dev, MTHCA_CMD_EVENT_MASK,
+ 1, dev->eq_table.eq[MTHCA_EQ_CMD].eqn, &status);
+
+ for (i = 0; i < MTHCA_NUM_EQ; ++i)
+ mthca_free_eq(dev, &dev->eq_table.eq[i]);
+
+ mthca_unmap_eq_regs(dev);
+
+ mthca_alloc_cleanup(&dev->eq_table.alloc);
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_mad.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_mad.c
new file mode 100644
index 0000000..5648659
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_mad.c
@@ -0,0 +1,346 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/string.h>
+#include <linux/slab.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_mad.h>
+#include <rdma/ib_smi.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+enum {
+ MTHCA_VENDOR_CLASS1 = 0x9,
+ MTHCA_VENDOR_CLASS2 = 0xa
+};
+
+static int mthca_update_rate(struct mthca_dev *dev, u8 port_num)
+{
+ struct ib_port_attr *tprops = NULL;
+ int ret;
+
+ tprops = kmalloc(sizeof *tprops, GFP_KERNEL);
+ if (!tprops)
+ return -ENOMEM;
+
+ ret = ib_query_port(&dev->ib_dev, port_num, tprops);
+ if (ret) {
+ printk(KERN_WARNING "ib_query_port failed (%d) for %s port %d\n",
+ ret, dev->ib_dev.name, port_num);
+ goto out;
+ }
+
+ dev->rate[port_num - 1] = tprops->active_speed *
+ ib_width_enum_to_int(tprops->active_width);
+
+out:
+ kfree(tprops);
+ return ret;
+}
+
+static void update_sm_ah(struct mthca_dev *dev,
+ u8 port_num, u16 lid, u8 sl)
+{
+ struct ib_ah *new_ah;
+ struct ib_ah_attr ah_attr;
+ unsigned long flags;
+
+ if (!dev->send_agent[port_num - 1][0])
+ return;
+
+ memset(&ah_attr, 0, sizeof ah_attr);
+ ah_attr.dlid = lid;
+ ah_attr.sl = sl;
+ ah_attr.port_num = port_num;
+
+ new_ah = ib_create_ah(dev->send_agent[port_num - 1][0]->qp->pd,
+ &ah_attr);
+ if (IS_ERR(new_ah))
+ return;
+
+ spin_lock_irqsave(&dev->sm_lock, flags);
+ if (dev->sm_ah[port_num - 1])
+ ib_destroy_ah(dev->sm_ah[port_num - 1]);
+ dev->sm_ah[port_num - 1] = new_ah;
+ spin_unlock_irqrestore(&dev->sm_lock, flags);
+}
+
+/*
+ * Snoop SM MADs for port info and P_Key table sets, so we can
+ * synthesize LID change and P_Key change events.
+ */
+static void smp_snoop(struct ib_device *ibdev,
+ u8 port_num,
+ struct ib_mad *mad,
+ u16 prev_lid)
+{
+ struct ib_event event;
+
+ if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+ mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) &&
+ mad->mad_hdr.method == IB_MGMT_METHOD_SET) {
+ if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO) {
+ struct ib_port_info *pinfo =
+ (struct ib_port_info *) ((struct ib_smp *) mad)->data;
+ u16 lid = be16_to_cpu(pinfo->lid);
+
+ mthca_update_rate(to_mdev(ibdev), port_num);
+ update_sm_ah(to_mdev(ibdev), port_num,
+ be16_to_cpu(pinfo->sm_lid),
+ pinfo->neighbormtu_mastersmsl & 0xf);
+
+ event.device = ibdev;
+ event.element.port_num = port_num;
+
+ if (pinfo->clientrereg_resv_subnetto & 0x80) {
+ event.event = IB_EVENT_CLIENT_REREGISTER;
+ ib_dispatch_event(&event);
+ }
+
+ if (prev_lid != lid) {
+ event.event = IB_EVENT_LID_CHANGE;
+ ib_dispatch_event(&event);
+ }
+ }
+
+ if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PKEY_TABLE) {
+ event.device = ibdev;
+ event.event = IB_EVENT_PKEY_CHANGE;
+ event.element.port_num = port_num;
+ ib_dispatch_event(&event);
+ }
+ }
+}
+
+static void node_desc_override(struct ib_device *dev,
+ struct ib_mad *mad)
+{
+ if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+ mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) &&
+ mad->mad_hdr.method == IB_MGMT_METHOD_GET_RESP &&
+ mad->mad_hdr.attr_id == IB_SMP_ATTR_NODE_DESC) {
+ mutex_lock(&to_mdev(dev)->cap_mask_mutex);
+ memcpy(((struct ib_smp *) mad)->data, dev->node_desc, 64);
+ mutex_unlock(&to_mdev(dev)->cap_mask_mutex);
+ }
+}
+
+static void forward_trap(struct mthca_dev *dev,
+ u8 port_num,
+ struct ib_mad *mad)
+{
+ int qpn = mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED;
+ struct ib_mad_send_buf *send_buf;
+ struct ib_mad_agent *agent = dev->send_agent[port_num - 1][qpn];
+ int ret;
+ unsigned long flags;
+
+ if (agent) {
+ send_buf = ib_create_send_mad(agent, qpn, 0, 0, IB_MGMT_MAD_HDR,
+ IB_MGMT_MAD_DATA, GFP_ATOMIC);
+ /*
+ * We rely here on the fact that MLX QPs don't use the
+ * address handle after the send is posted (this is
+ * wrong following the IB spec strictly, but we know
+ * it's OK for our devices).
+ */
+ spin_lock_irqsave(&dev->sm_lock, flags);
+ memcpy(send_buf->mad, mad, sizeof *mad);
+ if ((send_buf->ah = dev->sm_ah[port_num - 1]))
+ ret = ib_post_send_mad(send_buf, NULL);
+ else
+ ret = -EINVAL;
+ spin_unlock_irqrestore(&dev->sm_lock, flags);
+
+ if (ret)
+ ib_free_send_mad(send_buf);
+ }
+}
+
+int mthca_process_mad(struct ib_device *ibdev,
+ int mad_flags,
+ u8 port_num,
+ struct ib_wc *in_wc,
+ struct ib_grh *in_grh,
+ struct ib_mad *in_mad,
+ struct ib_mad *out_mad)
+{
+ int err;
+ u8 status;
+ u16 slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE);
+ u16 prev_lid = 0;
+ struct ib_port_attr pattr;
+
+ /* Forward locally generated traps to the SM */
+ if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP &&
+ slid == 0) {
+ forward_trap(to_mdev(ibdev), port_num, in_mad);
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+ }
+
+ /*
+ * Only handle SM gets, sets and trap represses for SM class
+ *
+ * Only handle PMA and Mellanox vendor-specific class gets and
+ * sets for other classes.
+ */
+ if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+ in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+ if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET &&
+ in_mad->mad_hdr.method != IB_MGMT_METHOD_SET &&
+ in_mad->mad_hdr.method != IB_MGMT_METHOD_TRAP_REPRESS)
+ return IB_MAD_RESULT_SUCCESS;
+
+ /*
+ * Don't process SMInfo queries or vendor-specific
+ * MADs -- the SMA can't handle them.
+ */
+ if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO ||
+ ((in_mad->mad_hdr.attr_id & IB_SMP_ATTR_VENDOR_MASK) ==
+ IB_SMP_ATTR_VENDOR_MASK))
+ return IB_MAD_RESULT_SUCCESS;
+ } else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT ||
+ in_mad->mad_hdr.mgmt_class == MTHCA_VENDOR_CLASS1 ||
+ in_mad->mad_hdr.mgmt_class == MTHCA_VENDOR_CLASS2) {
+ if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET &&
+ in_mad->mad_hdr.method != IB_MGMT_METHOD_SET)
+ return IB_MAD_RESULT_SUCCESS;
+ } else
+ return IB_MAD_RESULT_SUCCESS;
+ if ((in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+ in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) &&
+ in_mad->mad_hdr.method == IB_MGMT_METHOD_SET &&
+ in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO &&
+ !ib_query_port(ibdev, port_num, &pattr))
+ prev_lid = pattr.lid;
+
+ err = mthca_MAD_IFC(to_mdev(ibdev),
+ mad_flags & IB_MAD_IGNORE_MKEY,
+ mad_flags & IB_MAD_IGNORE_BKEY,
+ port_num, in_wc, in_grh, in_mad, out_mad,
+ &status);
+ if (err) {
+ mthca_err(to_mdev(ibdev), "MAD_IFC failed\n");
+ return IB_MAD_RESULT_FAILURE;
+ }
+ if (status == MTHCA_CMD_STAT_BAD_PKT)
+ return IB_MAD_RESULT_SUCCESS;
+ if (status) {
+ mthca_err(to_mdev(ibdev), "MAD_IFC returned status %02x\n",
+ status);
+ return IB_MAD_RESULT_FAILURE;
+ }
+
+ if (!out_mad->mad_hdr.status) {
+ smp_snoop(ibdev, port_num, in_mad, prev_lid);
+ node_desc_override(ibdev, out_mad);
+ }
+
+ /* set return bit in status of directed route responses */
+ if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ out_mad->mad_hdr.status |= cpu_to_be16(1 << 15);
+
+ if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS)
+ /* no response for trap repress */
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ ib_free_send_mad(mad_send_wc->send_buf);
+}
+
+int mthca_create_agents(struct mthca_dev *dev)
+{
+ struct ib_mad_agent *agent;
+ int p, q;
+ int ret;
+
+ spin_lock_init(&dev->sm_lock);
+
+ for (p = 0; p < dev->limits.num_ports; ++p)
+ for (q = 0; q <= 1; ++q) {
+ agent = ib_register_mad_agent(&dev->ib_dev, p + 1,
+ q ? IB_QPT_GSI : IB_QPT_SMI,
+ NULL, 0, send_handler,
+ NULL, NULL);
+ if (IS_ERR(agent)) {
+ ret = PTR_ERR(agent);
+ goto err;
+ }
+ dev->send_agent[p][q] = agent;
+ }
+
+
+ for (p = 1; p <= dev->limits.num_ports; ++p) {
+ ret = mthca_update_rate(dev, p);
+ if (ret) {
+ mthca_err(dev, "Failed to obtain port %d rate."
+ " aborting.\n", p);
+ goto err;
+ }
+ }
+
+ return 0;
+
+err:
+ for (p = 0; p < dev->limits.num_ports; ++p)
+ for (q = 0; q <= 1; ++q)
+ if (dev->send_agent[p][q])
+ ib_unregister_mad_agent(dev->send_agent[p][q]);
+
+ return ret;
+}
+
+void mthca_free_agents(struct mthca_dev *dev)
+{
+ struct ib_mad_agent *agent;
+ int p, q;
+
+ for (p = 0; p < dev->limits.num_ports; ++p) {
+ for (q = 0; q <= 1; ++q) {
+ agent = dev->send_agent[p][q];
+ dev->send_agent[p][q] = NULL;
+ ib_unregister_mad_agent(agent);
+ }
+
+ if (dev->sm_ah[p])
+ ib_destroy_ah(dev->sm_ah[p]);
+ }
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_main.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_main.c
new file mode 100644
index 0000000..772cf8c
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_main.c
@@ -0,0 +1,1360 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+
+#include "mthca_dev.h"
+#include "mthca_config_reg.h"
+#include "mthca_cmd.h"
+#include "mthca_profile.h"
+#include "mthca_memfree.h"
+#include "mthca_wqe.h"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("Mellanox InfiniBand HCA low-level driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+
+#ifdef CONFIG_INFINIBAND_MTHCA_DEBUG
+
+int mthca_debug_level = 0;
+module_param_named(debug_level, mthca_debug_level, int, 0644);
+MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
+
+#endif /* CONFIG_INFINIBAND_MTHCA_DEBUG */
+
+#ifdef CONFIG_PCI_MSI
+
+static int msi_x = 1;
+module_param(msi_x, int, 0444);
+MODULE_PARM_DESC(msi_x, "attempt to use MSI-X if nonzero");
+
+#else /* CONFIG_PCI_MSI */
+
+#define msi_x (0)
+
+#endif /* CONFIG_PCI_MSI */
+
+static int tune_pci = 0;
+module_param(tune_pci, int, 0444);
+MODULE_PARM_DESC(tune_pci, "increase PCI burst from the default set by BIOS if nonzero");
+
+DEFINE_MUTEX(mthca_device_mutex);
+
+#define MTHCA_DEFAULT_NUM_QP (1 << 16)
+#define MTHCA_DEFAULT_RDB_PER_QP (1 << 2)
+#define MTHCA_DEFAULT_NUM_CQ (1 << 16)
+#define MTHCA_DEFAULT_NUM_MCG (1 << 13)
+#define MTHCA_DEFAULT_NUM_MPT (1 << 17)
+#define MTHCA_DEFAULT_NUM_MTT (1 << 20)
+#define MTHCA_DEFAULT_NUM_UDAV (1 << 15)
+#define MTHCA_DEFAULT_NUM_RESERVED_MTTS (1 << 18)
+#define MTHCA_DEFAULT_NUM_UARC_SIZE (1 << 18)
+
+static struct mthca_profile hca_profile = {
+ .num_qp = MTHCA_DEFAULT_NUM_QP,
+ .rdb_per_qp = MTHCA_DEFAULT_RDB_PER_QP,
+ .num_cq = MTHCA_DEFAULT_NUM_CQ,
+ .num_mcg = MTHCA_DEFAULT_NUM_MCG,
+ .num_mpt = MTHCA_DEFAULT_NUM_MPT,
+ .num_mtt = MTHCA_DEFAULT_NUM_MTT,
+ .num_udav = MTHCA_DEFAULT_NUM_UDAV, /* Tavor only */
+ .fmr_reserved_mtts = MTHCA_DEFAULT_NUM_RESERVED_MTTS, /* Tavor only */
+ .uarc_size = MTHCA_DEFAULT_NUM_UARC_SIZE, /* Arbel only */
+};
+
+module_param_named(num_qp, hca_profile.num_qp, int, 0444);
+MODULE_PARM_DESC(num_qp, "maximum number of QPs per HCA");
+
+module_param_named(rdb_per_qp, hca_profile.rdb_per_qp, int, 0444);
+MODULE_PARM_DESC(rdb_per_qp, "number of RDB buffers per QP");
+
+module_param_named(num_cq, hca_profile.num_cq, int, 0444);
+MODULE_PARM_DESC(num_cq, "maximum number of CQs per HCA");
+
+module_param_named(num_mcg, hca_profile.num_mcg, int, 0444);
+MODULE_PARM_DESC(num_mcg, "maximum number of multicast groups per HCA");
+
+module_param_named(num_mpt, hca_profile.num_mpt, int, 0444);
+MODULE_PARM_DESC(num_mpt,
+ "maximum number of memory protection table entries per HCA");
+
+module_param_named(num_mtt, hca_profile.num_mtt, int, 0444);
+MODULE_PARM_DESC(num_mtt,
+ "maximum number of memory translation table segments per HCA");
+
+module_param_named(num_udav, hca_profile.num_udav, int, 0444);
+MODULE_PARM_DESC(num_udav, "maximum number of UD address vectors per HCA");
+
+module_param_named(fmr_reserved_mtts, hca_profile.fmr_reserved_mtts, int, 0444);
+MODULE_PARM_DESC(fmr_reserved_mtts,
+ "number of memory translation table segments reserved for FMR");
+
+static int log_mtts_per_seg;
+module_param_named(log_mtts_per_seg, log_mtts_per_seg, int, 0444);
+MODULE_PARM_DESC(log_mtts_per_seg, "Log2 number of MTT entries per segment (1-5)");
+
+static char mthca_version[] __devinitdata =
+ DRV_NAME ": Mellanox InfiniBand HCA driver v"
+ DRV_VERSION " (" DRV_RELDATE ")\n";
+
+static int mthca_tune_pci(struct mthca_dev *mdev)
+{
+ if (!tune_pci)
+ return 0;
+
+ /* First try to max out Read Byte Count */
+ if (pci_find_capability(mdev->pdev, PCI_CAP_ID_PCIX)) {
+ if (pcix_set_mmrbc(mdev->pdev, pcix_get_max_mmrbc(mdev->pdev))) {
+ mthca_err(mdev, "Couldn't set PCI-X max read count, "
+ "aborting.\n");
+ return -ENODEV;
+ }
+ } else if (!(mdev->mthca_flags & MTHCA_FLAG_PCIE))
+ mthca_info(mdev, "No PCI-X capability, not setting RBC.\n");
+
+ if (pci_find_capability(mdev->pdev, PCI_CAP_ID_EXP)) {
+ if (pcie_set_readrq(mdev->pdev, 4096)) {
+ mthca_err(mdev, "Couldn't write PCI Express read request, "
+ "aborting.\n");
+ return -ENODEV;
+ }
+ } else if (mdev->mthca_flags & MTHCA_FLAG_PCIE)
+ mthca_info(mdev, "No PCI Express capability, "
+ "not setting Max Read Request Size.\n");
+
+ return 0;
+}
+
+static int mthca_dev_lim(struct mthca_dev *mdev, struct mthca_dev_lim *dev_lim)
+{
+ int err;
+ u8 status;
+
+ mdev->limits.mtt_seg_size = (1 << log_mtts_per_seg) * 8;
+ err = mthca_QUERY_DEV_LIM(mdev, dev_lim, &status);
+ if (err) {
+ mthca_err(mdev, "QUERY_DEV_LIM command failed, aborting.\n");
+ return err;
+ }
+ if (status) {
+ mthca_err(mdev, "QUERY_DEV_LIM returned status 0x%02x, "
+ "aborting.\n", status);
+ return -EINVAL;
+ }
+ if (dev_lim->min_page_sz > PAGE_SIZE) {
+ mthca_err(mdev, "HCA minimum page size of %d bigger than "
+ "kernel PAGE_SIZE of %d, aborting.\n",
+ dev_lim->min_page_sz, PAGE_SIZE);
+ return -ENODEV;
+ }
+ if (dev_lim->num_ports > MTHCA_MAX_PORTS) {
+ mthca_err(mdev, "HCA has %d ports, but we only support %d, "
+ "aborting.\n",
+ dev_lim->num_ports, MTHCA_MAX_PORTS);
+ return -ENODEV;
+ }
+
+ if (dev_lim->uar_size > pci_resource_len(mdev->pdev, 2)) {
+ mthca_err(mdev, "HCA reported UAR size of 0x%x bigger than "
+ "PCI resource 2 size of 0x%llx, aborting.\n",
+ dev_lim->uar_size,
+ (unsigned long long)pci_resource_len(mdev->pdev, 2));
+ return -ENODEV;
+ }
+
+ mdev->limits.num_ports = dev_lim->num_ports;
+ mdev->limits.vl_cap = dev_lim->max_vl;
+ mdev->limits.mtu_cap = dev_lim->max_mtu;
+ mdev->limits.gid_table_len = dev_lim->max_gids;
+ mdev->limits.pkey_table_len = dev_lim->max_pkeys;
+ mdev->limits.local_ca_ack_delay = dev_lim->local_ca_ack_delay;
+ /*
+ * Need to allow for worst case send WQE overhead and check
+ * whether max_desc_sz imposes a lower limit than max_sg; UD
+ * send has the biggest overhead.
+ */
+ mdev->limits.max_sg = min_t(int, dev_lim->max_sg,
+ (dev_lim->max_desc_sz -
+ sizeof (struct mthca_next_seg) -
+ (mthca_is_memfree(mdev) ?
+ sizeof (struct mthca_arbel_ud_seg) :
+ sizeof (struct mthca_tavor_ud_seg))) /
+ sizeof (struct mthca_data_seg));
+ mdev->limits.max_wqes = dev_lim->max_qp_sz;
+ mdev->limits.max_qp_init_rdma = dev_lim->max_requester_per_qp;
+ mdev->limits.reserved_qps = dev_lim->reserved_qps;
+ mdev->limits.max_srq_wqes = dev_lim->max_srq_sz;
+ mdev->limits.reserved_srqs = dev_lim->reserved_srqs;
+ mdev->limits.reserved_eecs = dev_lim->reserved_eecs;
+ mdev->limits.max_desc_sz = dev_lim->max_desc_sz;
+ mdev->limits.max_srq_sge = mthca_max_srq_sge(mdev);
+ /*
+ * Subtract 1 from the limit because we need to allocate a
+ * spare CQE so the HCA HW can tell the difference between an
+ * empty CQ and a full CQ.
+ */
+ mdev->limits.max_cqes = dev_lim->max_cq_sz - 1;
+ mdev->limits.reserved_cqs = dev_lim->reserved_cqs;
+ mdev->limits.reserved_eqs = dev_lim->reserved_eqs;
+ mdev->limits.reserved_mtts = dev_lim->reserved_mtts;
+ mdev->limits.reserved_mrws = dev_lim->reserved_mrws;
+ mdev->limits.reserved_uars = dev_lim->reserved_uars;
+ mdev->limits.reserved_pds = dev_lim->reserved_pds;
+ mdev->limits.port_width_cap = dev_lim->max_port_width;
+ mdev->limits.page_size_cap = ~(u32) (dev_lim->min_page_sz - 1);
+ mdev->limits.flags = dev_lim->flags;
+ /*
+ * For old FW that doesn't return static rate support, use a
+ * value of 0x3 (only static rate values of 0 or 1 are handled),
+ * except on Sinai, where even old FW can handle static rate
+ * values of 2 and 3.
+ */
+ if (dev_lim->stat_rate_support)
+ mdev->limits.stat_rate_support = dev_lim->stat_rate_support;
+ else if (mdev->mthca_flags & MTHCA_FLAG_SINAI_OPT)
+ mdev->limits.stat_rate_support = 0xf;
+ else
+ mdev->limits.stat_rate_support = 0x3;
+
+ /* IB_DEVICE_RESIZE_MAX_WR not supported by driver.
+ May be doable since hardware supports it for SRQ.
+
+ IB_DEVICE_N_NOTIFY_CQ is supported by hardware but not by driver.
+
+ IB_DEVICE_SRQ_RESIZE is supported by hardware but SRQ is not
+ supported by driver. */
+ mdev->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT |
+ IB_DEVICE_PORT_ACTIVE_EVENT |
+ IB_DEVICE_SYS_IMAGE_GUID |
+ IB_DEVICE_RC_RNR_NAK_GEN;
+
+ if (dev_lim->flags & DEV_LIM_FLAG_BAD_PKEY_CNTR)
+ mdev->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
+
+ if (dev_lim->flags & DEV_LIM_FLAG_BAD_QKEY_CNTR)
+ mdev->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
+
+ if (dev_lim->flags & DEV_LIM_FLAG_RAW_MULTI)
+ mdev->device_cap_flags |= IB_DEVICE_RAW_MULTI;
+
+ if (dev_lim->flags & DEV_LIM_FLAG_AUTO_PATH_MIG)
+ mdev->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
+
+ if (dev_lim->flags & DEV_LIM_FLAG_UD_AV_PORT_ENFORCE)
+ mdev->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE;
+
+ if (dev_lim->flags & DEV_LIM_FLAG_SRQ)
+ mdev->mthca_flags |= MTHCA_FLAG_SRQ;
+
+ if (mthca_is_memfree(mdev))
+ if (dev_lim->flags & DEV_LIM_FLAG_IPOIB_CSUM)
+ mdev->device_cap_flags |= IB_DEVICE_UD_IP_CSUM;
+
+ return 0;
+}
+
+static int mthca_init_tavor(struct mthca_dev *mdev)
+{
+ s64 size;
+ u8 status;
+ int err;
+ struct mthca_dev_lim dev_lim;
+ struct mthca_profile profile;
+ struct mthca_init_hca_param init_hca;
+
+ err = mthca_SYS_EN(mdev, &status);
+ if (err) {
+ mthca_err(mdev, "SYS_EN command failed, aborting.\n");
+ return err;
+ }
+ if (status) {
+ mthca_err(mdev, "SYS_EN returned status 0x%02x, "
+ "aborting.\n", status);
+ return -EINVAL;
+ }
+
+ err = mthca_QUERY_FW(mdev, &status);
+ if (err) {
+ mthca_err(mdev, "QUERY_FW command failed, aborting.\n");
+ goto err_disable;
+ }
+ if (status) {
+ mthca_err(mdev, "QUERY_FW returned status 0x%02x, "
+ "aborting.\n", status);
+ err = -EINVAL;
+ goto err_disable;
+ }
+ err = mthca_QUERY_DDR(mdev, &status);
+ if (err) {
+ mthca_err(mdev, "QUERY_DDR command failed, aborting.\n");
+ goto err_disable;
+ }
+ if (status) {
+ mthca_err(mdev, "QUERY_DDR returned status 0x%02x, "
+ "aborting.\n", status);
+ err = -EINVAL;
+ goto err_disable;
+ }
+
+ err = mthca_dev_lim(mdev, &dev_lim);
+ if (err) {
+ mthca_err(mdev, "QUERY_DEV_LIM command failed, aborting.\n");
+ goto err_disable;
+ }
+
+ profile = hca_profile;
+ profile.num_uar = dev_lim.uar_size / PAGE_SIZE;
+ profile.uarc_size = 0;
+ if (mdev->mthca_flags & MTHCA_FLAG_SRQ)
+ profile.num_srq = dev_lim.max_srqs;
+
+ size = mthca_make_profile(mdev, &profile, &dev_lim, &init_hca);
+ if (size < 0) {
+ err = size;
+ goto err_disable;
+ }
+
+ err = mthca_INIT_HCA(mdev, &init_hca, &status);
+ if (err) {
+ mthca_err(mdev, "INIT_HCA command failed, aborting.\n");
+ goto err_disable;
+ }
+ if (status) {
+ mthca_err(mdev, "INIT_HCA returned status 0x%02x, "
+ "aborting.\n", status);
+ err = -EINVAL;
+ goto err_disable;
+ }
+
+ return 0;
+
+err_disable:
+ mthca_SYS_DIS(mdev, &status);
+
+ return err;
+}
+
+static int mthca_load_fw(struct mthca_dev *mdev)
+{
+ u8 status;
+ int err;
+
+ /* FIXME: use HCA-attached memory for FW if present */
+
+ mdev->fw.arbel.fw_icm =
+ mthca_alloc_icm(mdev, mdev->fw.arbel.fw_pages,
+ GFP_HIGHUSER | __GFP_NOWARN, 0);
+ if (!mdev->fw.arbel.fw_icm) {
+ mthca_err(mdev, "Couldn't allocate FW area, aborting.\n");
+ return -ENOMEM;
+ }
+
+ err = mthca_MAP_FA(mdev, mdev->fw.arbel.fw_icm, &status);
+ if (err) {
+ mthca_err(mdev, "MAP_FA command failed, aborting.\n");
+ goto err_free;
+ }
+ if (status) {
+ mthca_err(mdev, "MAP_FA returned status 0x%02x, aborting.\n", status);
+ err = -EINVAL;
+ goto err_free;
+ }
+ err = mthca_RUN_FW(mdev, &status);
+ if (err) {
+ mthca_err(mdev, "RUN_FW command failed, aborting.\n");
+ goto err_unmap_fa;
+ }
+ if (status) {
+ mthca_err(mdev, "RUN_FW returned status 0x%02x, aborting.\n", status);
+ err = -EINVAL;
+ goto err_unmap_fa;
+ }
+
+ return 0;
+
+err_unmap_fa:
+ mthca_UNMAP_FA(mdev, &status);
+
+err_free:
+ mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0);
+ return err;
+}
+
+static int mthca_init_icm(struct mthca_dev *mdev,
+ struct mthca_dev_lim *dev_lim,
+ struct mthca_init_hca_param *init_hca,
+ u64 icm_size)
+{
+ u64 aux_pages;
+ u8 status;
+ int err;
+
+ err = mthca_SET_ICM_SIZE(mdev, icm_size, &aux_pages, &status);
+ if (err) {
+ mthca_err(mdev, "SET_ICM_SIZE command failed, aborting.\n");
+ return err;
+ }
+ if (status) {
+ mthca_err(mdev, "SET_ICM_SIZE returned status 0x%02x, "
+ "aborting.\n", status);
+ return -EINVAL;
+ }
+
+ mthca_dbg(mdev, "%lld KB of HCA context requires %lld KB aux memory.\n",
+ (unsigned long long) icm_size >> 10,
+ (unsigned long long) aux_pages << 2);
+
+ mdev->fw.arbel.aux_icm = mthca_alloc_icm(mdev, aux_pages,
+ GFP_HIGHUSER | __GFP_NOWARN, 0);
+ if (!mdev->fw.arbel.aux_icm) {
+ mthca_err(mdev, "Couldn't allocate aux memory, aborting.\n");
+ return -ENOMEM;
+ }
+
+ err = mthca_MAP_ICM_AUX(mdev, mdev->fw.arbel.aux_icm, &status);
+ if (err) {
+ mthca_err(mdev, "MAP_ICM_AUX command failed, aborting.\n");
+ goto err_free_aux;
+ }
+ if (status) {
+ mthca_err(mdev, "MAP_ICM_AUX returned status 0x%02x, aborting.\n", status);
+ err = -EINVAL;
+ goto err_free_aux;
+ }
+
+ err = mthca_map_eq_icm(mdev, init_hca->eqc_base);
+ if (err) {
+ mthca_err(mdev, "Failed to map EQ context memory, aborting.\n");
+ goto err_unmap_aux;
+ }
+
+ /* CPU writes to non-reserved MTTs, while HCA might DMA to reserved mtts */
+ mdev->limits.reserved_mtts = ALIGN(mdev->limits.reserved_mtts * mdev->limits.mtt_seg_size,
+ dma_get_cache_alignment()) / mdev->limits.mtt_seg_size;
+
+ mdev->mr_table.mtt_table = mthca_alloc_icm_table(mdev, init_hca->mtt_base,
+ mdev->limits.mtt_seg_size,
+ mdev->limits.num_mtt_segs,
+ mdev->limits.reserved_mtts,
+ 1, 0);
+ if (!mdev->mr_table.mtt_table) {
+ mthca_err(mdev, "Failed to map MTT context memory, aborting.\n");
+ err = -ENOMEM;
+ goto err_unmap_eq;
+ }
+
+ mdev->mr_table.mpt_table = mthca_alloc_icm_table(mdev, init_hca->mpt_base,
+ dev_lim->mpt_entry_sz,
+ mdev->limits.num_mpts,
+ mdev->limits.reserved_mrws,
+ 1, 1);
+ if (!mdev->mr_table.mpt_table) {
+ mthca_err(mdev, "Failed to map MPT context memory, aborting.\n");
+ err = -ENOMEM;
+ goto err_unmap_mtt;
+ }
+
+ mdev->qp_table.qp_table = mthca_alloc_icm_table(mdev, init_hca->qpc_base,
+ dev_lim->qpc_entry_sz,
+ mdev->limits.num_qps,
+ mdev->limits.reserved_qps,
+ 0, 0);
+ if (!mdev->qp_table.qp_table) {
+ mthca_err(mdev, "Failed to map QP context memory, aborting.\n");
+ err = -ENOMEM;
+ goto err_unmap_mpt;
+ }
+
+ mdev->qp_table.eqp_table = mthca_alloc_icm_table(mdev, init_hca->eqpc_base,
+ dev_lim->eqpc_entry_sz,
+ mdev->limits.num_qps,
+ mdev->limits.reserved_qps,
+ 0, 0);
+ if (!mdev->qp_table.eqp_table) {
+ mthca_err(mdev, "Failed to map EQP context memory, aborting.\n");
+ err = -ENOMEM;
+ goto err_unmap_qp;
+ }
+
+ mdev->qp_table.rdb_table = mthca_alloc_icm_table(mdev, init_hca->rdb_base,
+ MTHCA_RDB_ENTRY_SIZE,
+ mdev->limits.num_qps <<
+ mdev->qp_table.rdb_shift, 0,
+ 0, 0);
+ if (!mdev->qp_table.rdb_table) {
+ mthca_err(mdev, "Failed to map RDB context memory, aborting\n");
+ err = -ENOMEM;
+ goto err_unmap_eqp;
+ }
+
+ mdev->cq_table.table = mthca_alloc_icm_table(mdev, init_hca->cqc_base,
+ dev_lim->cqc_entry_sz,
+ mdev->limits.num_cqs,
+ mdev->limits.reserved_cqs,
+ 0, 0);
+ if (!mdev->cq_table.table) {
+ mthca_err(mdev, "Failed to map CQ context memory, aborting.\n");
+ err = -ENOMEM;
+ goto err_unmap_rdb;
+ }
+
+ if (mdev->mthca_flags & MTHCA_FLAG_SRQ) {
+ mdev->srq_table.table =
+ mthca_alloc_icm_table(mdev, init_hca->srqc_base,
+ dev_lim->srq_entry_sz,
+ mdev->limits.num_srqs,
+ mdev->limits.reserved_srqs,
+ 0, 0);
+ if (!mdev->srq_table.table) {
+ mthca_err(mdev, "Failed to map SRQ context memory, "
+ "aborting.\n");
+ err = -ENOMEM;
+ goto err_unmap_cq;
+ }
+ }
+
+ /*
+ * It's not strictly required, but for simplicity just map the
+ * whole multicast group table now. The table isn't very big
+ * and it's a lot easier than trying to track ref counts.
+ */
+ mdev->mcg_table.table = mthca_alloc_icm_table(mdev, init_hca->mc_base,
+ MTHCA_MGM_ENTRY_SIZE,
+ mdev->limits.num_mgms +
+ mdev->limits.num_amgms,
+ mdev->limits.num_mgms +
+ mdev->limits.num_amgms,
+ 0, 0);
+ if (!mdev->mcg_table.table) {
+ mthca_err(mdev, "Failed to map MCG context memory, aborting.\n");
+ err = -ENOMEM;
+ goto err_unmap_srq;
+ }
+
+ return 0;
+
+err_unmap_srq:
+ if (mdev->mthca_flags & MTHCA_FLAG_SRQ)
+ mthca_free_icm_table(mdev, mdev->srq_table.table);
+
+err_unmap_cq:
+ mthca_free_icm_table(mdev, mdev->cq_table.table);
+
+err_unmap_rdb:
+ mthca_free_icm_table(mdev, mdev->qp_table.rdb_table);
+
+err_unmap_eqp:
+ mthca_free_icm_table(mdev, mdev->qp_table.eqp_table);
+
+err_unmap_qp:
+ mthca_free_icm_table(mdev, mdev->qp_table.qp_table);
+
+err_unmap_mpt:
+ mthca_free_icm_table(mdev, mdev->mr_table.mpt_table);
+
+err_unmap_mtt:
+ mthca_free_icm_table(mdev, mdev->mr_table.mtt_table);
+
+err_unmap_eq:
+ mthca_unmap_eq_icm(mdev);
+
+err_unmap_aux:
+ mthca_UNMAP_ICM_AUX(mdev, &status);
+
+err_free_aux:
+ mthca_free_icm(mdev, mdev->fw.arbel.aux_icm, 0);
+
+ return err;
+}
+
+static void mthca_free_icms(struct mthca_dev *mdev)
+{
+ u8 status;
+
+ mthca_free_icm_table(mdev, mdev->mcg_table.table);
+ if (mdev->mthca_flags & MTHCA_FLAG_SRQ)
+ mthca_free_icm_table(mdev, mdev->srq_table.table);
+ mthca_free_icm_table(mdev, mdev->cq_table.table);
+ mthca_free_icm_table(mdev, mdev->qp_table.rdb_table);
+ mthca_free_icm_table(mdev, mdev->qp_table.eqp_table);
+ mthca_free_icm_table(mdev, mdev->qp_table.qp_table);
+ mthca_free_icm_table(mdev, mdev->mr_table.mpt_table);
+ mthca_free_icm_table(mdev, mdev->mr_table.mtt_table);
+ mthca_unmap_eq_icm(mdev);
+
+ mthca_UNMAP_ICM_AUX(mdev, &status);
+ mthca_free_icm(mdev, mdev->fw.arbel.aux_icm, 0);
+}
+
+static int mthca_init_arbel(struct mthca_dev *mdev)
+{
+ struct mthca_dev_lim dev_lim;
+ struct mthca_profile profile;
+ struct mthca_init_hca_param init_hca;
+ s64 icm_size;
+ u8 status;
+ int err;
+
+ err = mthca_QUERY_FW(mdev, &status);
+ if (err) {
+ mthca_err(mdev, "QUERY_FW command failed, aborting.\n");
+ return err;
+ }
+ if (status) {
+ mthca_err(mdev, "QUERY_FW returned status 0x%02x, "
+ "aborting.\n", status);
+ return -EINVAL;
+ }
+
+ err = mthca_ENABLE_LAM(mdev, &status);
+ if (err) {
+ mthca_err(mdev, "ENABLE_LAM command failed, aborting.\n");
+ return err;
+ }
+ if (status == MTHCA_CMD_STAT_LAM_NOT_PRE) {
+ mthca_dbg(mdev, "No HCA-attached memory (running in MemFree mode)\n");
+ mdev->mthca_flags |= MTHCA_FLAG_NO_LAM;
+ } else if (status) {
+ mthca_err(mdev, "ENABLE_LAM returned status 0x%02x, "
+ "aborting.\n", status);
+ return -EINVAL;
+ }
+
+ err = mthca_load_fw(mdev);
+ if (err) {
+ mthca_err(mdev, "Failed to start FW, aborting.\n");
+ goto err_disable;
+ }
+
+ err = mthca_dev_lim(mdev, &dev_lim);
+ if (err) {
+ mthca_err(mdev, "QUERY_DEV_LIM command failed, aborting.\n");
+ goto err_stop_fw;
+ }
+
+ profile = hca_profile;
+ profile.num_uar = dev_lim.uar_size / PAGE_SIZE;
+ profile.num_udav = 0;
+ if (mdev->mthca_flags & MTHCA_FLAG_SRQ)
+ profile.num_srq = dev_lim.max_srqs;
+
+ icm_size = mthca_make_profile(mdev, &profile, &dev_lim, &init_hca);
+ if (icm_size < 0) {
+ err = icm_size;
+ goto err_stop_fw;
+ }
+
+ err = mthca_init_icm(mdev, &dev_lim, &init_hca, icm_size);
+ if (err)
+ goto err_stop_fw;
+
+ err = mthca_INIT_HCA(mdev, &init_hca, &status);
+ if (err) {
+ mthca_err(mdev, "INIT_HCA command failed, aborting.\n");
+ goto err_free_icm;
+ }
+ if (status) {
+ mthca_err(mdev, "INIT_HCA returned status 0x%02x, "
+ "aborting.\n", status);
+ err = -EINVAL;
+ goto err_free_icm;
+ }
+
+ return 0;
+
+err_free_icm:
+ mthca_free_icms(mdev);
+
+err_stop_fw:
+ mthca_UNMAP_FA(mdev, &status);
+ mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0);
+
+err_disable:
+ if (!(mdev->mthca_flags & MTHCA_FLAG_NO_LAM))
+ mthca_DISABLE_LAM(mdev, &status);
+
+ return err;
+}
+
+static void mthca_close_hca(struct mthca_dev *mdev)
+{
+ u8 status;
+
+ mthca_CLOSE_HCA(mdev, 0, &status);
+
+ if (mthca_is_memfree(mdev)) {
+ mthca_free_icms(mdev);
+
+ mthca_UNMAP_FA(mdev, &status);
+ mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0);
+
+ if (!(mdev->mthca_flags & MTHCA_FLAG_NO_LAM))
+ mthca_DISABLE_LAM(mdev, &status);
+ } else
+ mthca_SYS_DIS(mdev, &status);
+}
+
+static int mthca_init_hca(struct mthca_dev *mdev)
+{
+ u8 status;
+ int err;
+ struct mthca_adapter adapter;
+
+ if (mthca_is_memfree(mdev))
+ err = mthca_init_arbel(mdev);
+ else
+ err = mthca_init_tavor(mdev);
+
+ if (err)
+ return err;
+
+ err = mthca_QUERY_ADAPTER(mdev, &adapter, &status);
+ if (err) {
+ mthca_err(mdev, "QUERY_ADAPTER command failed, aborting.\n");
+ goto err_close;
+ }
+ if (status) {
+ mthca_err(mdev, "QUERY_ADAPTER returned status 0x%02x, "
+ "aborting.\n", status);
+ err = -EINVAL;
+ goto err_close;
+ }
+
+ mdev->eq_table.inta_pin = adapter.inta_pin;
+ if (!mthca_is_memfree(mdev))
+ mdev->rev_id = adapter.revision_id;
+ memcpy(mdev->board_id, adapter.board_id, sizeof mdev->board_id);
+
+ return 0;
+
+err_close:
+ mthca_close_hca(mdev);
+ return err;
+}
+
+static int mthca_setup_hca(struct mthca_dev *dev)
+{
+ int err;
+ u8 status;
+
+ MTHCA_INIT_DOORBELL_LOCK(&dev->doorbell_lock);
+
+ err = mthca_init_uar_table(dev);
+ if (err) {
+ mthca_err(dev, "Failed to initialize "
+ "user access region table, aborting.\n");
+ return err;
+ }
+
+ err = mthca_uar_alloc(dev, &dev->driver_uar);
+ if (err) {
+ mthca_err(dev, "Failed to allocate driver access region, "
+ "aborting.\n");
+ goto err_uar_table_free;
+ }
+
+ dev->kar = ioremap(dev->driver_uar.pfn << PAGE_SHIFT, PAGE_SIZE);
+ if (!dev->kar) {
+ mthca_err(dev, "Couldn't map kernel access region, "
+ "aborting.\n");
+ err = -ENOMEM;
+ goto err_uar_free;
+ }
+
+ err = mthca_init_pd_table(dev);
+ if (err) {
+ mthca_err(dev, "Failed to initialize "
+ "protection domain table, aborting.\n");
+ goto err_kar_unmap;
+ }
+
+ err = mthca_init_mr_table(dev);
+ if (err) {
+ mthca_err(dev, "Failed to initialize "
+ "memory region table, aborting.\n");
+ goto err_pd_table_free;
+ }
+
+ err = mthca_pd_alloc(dev, 1, &dev->driver_pd);
+ if (err) {
+ mthca_err(dev, "Failed to create driver PD, "
+ "aborting.\n");
+ goto err_mr_table_free;
+ }
+
+ err = mthca_init_eq_table(dev);
+ if (err) {
+ mthca_err(dev, "Failed to initialize "
+ "event queue table, aborting.\n");
+ goto err_pd_free;
+ }
+
+ err = mthca_cmd_use_events(dev);
+ if (err) {
+ mthca_err(dev, "Failed to switch to event-driven "
+ "firmware commands, aborting.\n");
+ goto err_eq_table_free;
+ }
+
+ err = mthca_NOP(dev, &status);
+ if (err || status) {
+ if (dev->mthca_flags & MTHCA_FLAG_MSI_X) {
+ mthca_warn(dev, "NOP command failed to generate interrupt "
+ "(IRQ %d).\n",
+ dev->eq_table.eq[MTHCA_EQ_CMD].msi_x_vector);
+ mthca_warn(dev, "Trying again with MSI-X disabled.\n");
+ } else {
+ mthca_err(dev, "NOP command failed to generate interrupt "
+ "(IRQ %d), aborting.\n",
+ dev->pdev->irq);
+ mthca_err(dev, "BIOS or ACPI interrupt routing problem?\n");
+ }
+
+ goto err_cmd_poll;
+ }
+
+ mthca_dbg(dev, "NOP command IRQ test passed\n");
+
+ err = mthca_init_cq_table(dev);
+ if (err) {
+ mthca_err(dev, "Failed to initialize "
+ "completion queue table, aborting.\n");
+ goto err_cmd_poll;
+ }
+
+ err = mthca_init_srq_table(dev);
+ if (err) {
+ mthca_err(dev, "Failed to initialize "
+ "shared receive queue table, aborting.\n");
+ goto err_cq_table_free;
+ }
+
+ err = mthca_init_qp_table(dev);
+ if (err) {
+ mthca_err(dev, "Failed to initialize "
+ "queue pair table, aborting.\n");
+ goto err_srq_table_free;
+ }
+
+ err = mthca_init_av_table(dev);
+ if (err) {
+ mthca_err(dev, "Failed to initialize "
+ "address vector table, aborting.\n");
+ goto err_qp_table_free;
+ }
+
+ err = mthca_init_mcg_table(dev);
+ if (err) {
+ mthca_err(dev, "Failed to initialize "
+ "multicast group table, aborting.\n");
+ goto err_av_table_free;
+ }
+
+ return 0;
+
+err_av_table_free:
+ mthca_cleanup_av_table(dev);
+
+err_qp_table_free:
+ mthca_cleanup_qp_table(dev);
+
+err_srq_table_free:
+ mthca_cleanup_srq_table(dev);
+
+err_cq_table_free:
+ mthca_cleanup_cq_table(dev);
+
+err_cmd_poll:
+ mthca_cmd_use_polling(dev);
+
+err_eq_table_free:
+ mthca_cleanup_eq_table(dev);
+
+err_pd_free:
+ mthca_pd_free(dev, &dev->driver_pd);
+
+err_mr_table_free:
+ mthca_cleanup_mr_table(dev);
+
+err_pd_table_free:
+ mthca_cleanup_pd_table(dev);
+
+err_kar_unmap:
+ iounmap(dev->kar);
+
+err_uar_free:
+ mthca_uar_free(dev, &dev->driver_uar);
+
+err_uar_table_free:
+ mthca_cleanup_uar_table(dev);
+ return err;
+}
+
+static int mthca_enable_msi_x(struct mthca_dev *mdev)
+{
+ struct msix_entry entries[3];
+ int err;
+
+ entries[0].entry = 0;
+ entries[1].entry = 1;
+ entries[2].entry = 2;
+
+ err = pci_enable_msix(mdev->pdev, entries, ARRAY_SIZE(entries));
+ if (err) {
+ if (err > 0)
+ mthca_info(mdev, "Only %d MSI-X vectors available, "
+ "not using MSI-X\n", err);
+ return err;
+ }
+
+ mdev->eq_table.eq[MTHCA_EQ_COMP ].msi_x_vector = entries[0].vector;
+ mdev->eq_table.eq[MTHCA_EQ_ASYNC].msi_x_vector = entries[1].vector;
+ mdev->eq_table.eq[MTHCA_EQ_CMD ].msi_x_vector = entries[2].vector;
+
+ return 0;
+}
+
+/* Types of supported HCA */
+enum {
+ TAVOR, /* MT23108 */
+ ARBEL_COMPAT, /* MT25208 in Tavor compat mode */
+ ARBEL_NATIVE, /* MT25208 with extended features */
+ SINAI /* MT25204 */
+};
+
+#define MTHCA_FW_VER(major, minor, subminor) \
+ (((u64) (major) << 32) | ((u64) (minor) << 16) | (u64) (subminor))
+
+static struct {
+ u64 latest_fw;
+ u32 flags;
+} mthca_hca_table[] = {
+ [TAVOR] = { .latest_fw = MTHCA_FW_VER(3, 5, 0),
+ .flags = 0 },
+ [ARBEL_COMPAT] = { .latest_fw = MTHCA_FW_VER(4, 8, 200),
+ .flags = MTHCA_FLAG_PCIE },
+ [ARBEL_NATIVE] = { .latest_fw = MTHCA_FW_VER(5, 3, 0),
+ .flags = MTHCA_FLAG_MEMFREE |
+ MTHCA_FLAG_PCIE },
+ [SINAI] = { .latest_fw = MTHCA_FW_VER(1, 2, 0),
+ .flags = MTHCA_FLAG_MEMFREE |
+ MTHCA_FLAG_PCIE |
+ MTHCA_FLAG_SINAI_OPT }
+};
+
+static int __mthca_init_one(struct pci_dev *pdev, int hca_type)
+{
+ int ddr_hidden = 0;
+ int err;
+ struct mthca_dev *mdev;
+
+ printk(KERN_INFO PFX "Initializing %s\n",
+ pci_name(pdev));
+
+ err = pci_enable_device(pdev);
+ if (err) {
+ dev_err(&pdev->dev, "Cannot enable PCI device, "
+ "aborting.\n");
+ return err;
+ }
+
+ /*
+ * Check for BARs. We expect 0: 1MB, 2: 8MB, 4: DDR (may not
+ * be present)
+ */
+ if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM) ||
+ pci_resource_len(pdev, 0) != 1 << 20) {
+ dev_err(&pdev->dev, "Missing DCS, aborting.\n");
+ err = -ENODEV;
+ goto err_disable_pdev;
+ }
+ if (!(pci_resource_flags(pdev, 2) & IORESOURCE_MEM)) {
+ dev_err(&pdev->dev, "Missing UAR, aborting.\n");
+ err = -ENODEV;
+ goto err_disable_pdev;
+ }
+ if (!(pci_resource_flags(pdev, 4) & IORESOURCE_MEM))
+ ddr_hidden = 1;
+
+ err = pci_request_regions(pdev, DRV_NAME);
+ if (err) {
+ dev_err(&pdev->dev, "Cannot obtain PCI resources, "
+ "aborting.\n");
+ goto err_disable_pdev;
+ }
+
+ pci_set_master(pdev);
+
+ err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+ if (err) {
+ dev_warn(&pdev->dev, "Warning: couldn't set 64-bit PCI DMA mask.\n");
+ err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+ if (err) {
+ dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting.\n");
+ goto err_free_res;
+ }
+ }
+ err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+ if (err) {
+ dev_warn(&pdev->dev, "Warning: couldn't set 64-bit "
+ "consistent PCI DMA mask.\n");
+ err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
+ if (err) {
+ dev_err(&pdev->dev, "Can't set consistent PCI DMA mask, "
+ "aborting.\n");
+ goto err_free_res;
+ }
+ }
+
+ mdev = (struct mthca_dev *) ib_alloc_device(sizeof *mdev);
+ if (!mdev) {
+ dev_err(&pdev->dev, "Device struct alloc failed, "
+ "aborting.\n");
+ err = -ENOMEM;
+ goto err_free_res;
+ }
+
+ mdev->pdev = pdev;
+
+ mdev->mthca_flags = mthca_hca_table[hca_type].flags;
+ if (ddr_hidden)
+ mdev->mthca_flags |= MTHCA_FLAG_DDR_HIDDEN;
+
+ /*
+ * Now reset the HCA before we touch the PCI capabilities or
+ * attempt a firmware command, since a boot ROM may have left
+ * the HCA in an undefined state.
+ */
+ err = mthca_reset(mdev);
+ if (err) {
+ mthca_err(mdev, "Failed to reset HCA, aborting.\n");
+ goto err_free_dev;
+ }
+
+ if (mthca_cmd_init(mdev)) {
+ mthca_err(mdev, "Failed to init command interface, aborting.\n");
+ goto err_free_dev;
+ }
+
+ err = mthca_tune_pci(mdev);
+ if (err)
+ goto err_cmd;
+
+ err = mthca_init_hca(mdev);
+ if (err)
+ goto err_cmd;
+
+ if (mdev->fw_ver < mthca_hca_table[hca_type].latest_fw) {
+ mthca_warn(mdev, "HCA FW version %d.%d.%03d is old (%d.%d.%03d is current).\n",
+ (int) (mdev->fw_ver >> 32), (int) (mdev->fw_ver >> 16) & 0xffff,
+ (int) (mdev->fw_ver & 0xffff),
+ (int) (mthca_hca_table[hca_type].latest_fw >> 32),
+ (int) (mthca_hca_table[hca_type].latest_fw >> 16) & 0xffff,
+ (int) (mthca_hca_table[hca_type].latest_fw & 0xffff));
+ mthca_warn(mdev, "If you have problems, try updating your HCA FW.\n");
+ }
+
+ if (msi_x && !mthca_enable_msi_x(mdev))
+ mdev->mthca_flags |= MTHCA_FLAG_MSI_X;
+
+ err = mthca_setup_hca(mdev);
+ if (err == -EBUSY && (mdev->mthca_flags & MTHCA_FLAG_MSI_X)) {
+ if (mdev->mthca_flags & MTHCA_FLAG_MSI_X)
+ pci_disable_msix(pdev);
+ mdev->mthca_flags &= ~MTHCA_FLAG_MSI_X;
+
+ err = mthca_setup_hca(mdev);
+ }
+
+ if (err)
+ goto err_close;
+
+ err = mthca_register_device(mdev);
+ if (err)
+ goto err_cleanup;
+
+ err = mthca_create_agents(mdev);
+ if (err)
+ goto err_unregister;
+
+ pci_set_drvdata(pdev, mdev);
+ mdev->hca_type = hca_type;
+
+ mdev->active = 1;
+
+ return 0;
+
+err_unregister:
+ mthca_unregister_device(mdev);
+
+err_cleanup:
+ mthca_cleanup_mcg_table(mdev);
+ mthca_cleanup_av_table(mdev);
+ mthca_cleanup_qp_table(mdev);
+ mthca_cleanup_srq_table(mdev);
+ mthca_cleanup_cq_table(mdev);
+ mthca_cmd_use_polling(mdev);
+ mthca_cleanup_eq_table(mdev);
+
+ mthca_pd_free(mdev, &mdev->driver_pd);
+
+ mthca_cleanup_mr_table(mdev);
+ mthca_cleanup_pd_table(mdev);
+ mthca_cleanup_uar_table(mdev);
+
+err_close:
+ if (mdev->mthca_flags & MTHCA_FLAG_MSI_X)
+ pci_disable_msix(pdev);
+
+ mthca_close_hca(mdev);
+
+err_cmd:
+ mthca_cmd_cleanup(mdev);
+
+err_free_dev:
+ ib_dealloc_device(&mdev->ib_dev);
+
+err_free_res:
+ pci_release_regions(pdev);
+
+err_disable_pdev:
+ pci_disable_device(pdev);
+ pci_set_drvdata(pdev, NULL);
+ return err;
+}
+
+static void __mthca_remove_one(struct pci_dev *pdev)
+{
+ struct mthca_dev *mdev = pci_get_drvdata(pdev);
+ u8 status;
+ int p;
+
+ if (mdev) {
+ mthca_free_agents(mdev);
+ mthca_unregister_device(mdev);
+
+ for (p = 1; p <= mdev->limits.num_ports; ++p)
+ mthca_CLOSE_IB(mdev, p, &status);
+
+ mthca_cleanup_mcg_table(mdev);
+ mthca_cleanup_av_table(mdev);
+ mthca_cleanup_qp_table(mdev);
+ mthca_cleanup_srq_table(mdev);
+ mthca_cleanup_cq_table(mdev);
+ mthca_cmd_use_polling(mdev);
+ mthca_cleanup_eq_table(mdev);
+
+ mthca_pd_free(mdev, &mdev->driver_pd);
+
+ mthca_cleanup_mr_table(mdev);
+ mthca_cleanup_pd_table(mdev);
+
+ iounmap(mdev->kar);
+ mthca_uar_free(mdev, &mdev->driver_uar);
+ mthca_cleanup_uar_table(mdev);
+ mthca_close_hca(mdev);
+ mthca_cmd_cleanup(mdev);
+
+ if (mdev->mthca_flags & MTHCA_FLAG_MSI_X)
+ pci_disable_msix(pdev);
+
+ ib_dealloc_device(&mdev->ib_dev);
+ pci_release_regions(pdev);
+ pci_disable_device(pdev);
+ pci_set_drvdata(pdev, NULL);
+ }
+}
+
+int __mthca_restart_one(struct pci_dev *pdev)
+{
+ struct mthca_dev *mdev;
+ int hca_type;
+
+ mdev = pci_get_drvdata(pdev);
+ if (!mdev)
+ return -ENODEV;
+ hca_type = mdev->hca_type;
+ __mthca_remove_one(pdev);
+ return __mthca_init_one(pdev, hca_type);
+}
+
+static int __devinit mthca_init_one(struct pci_dev *pdev,
+ const struct pci_device_id *id)
+{
+ static int mthca_version_printed = 0;
+ int ret;
+
+ mutex_lock(&mthca_device_mutex);
+
+ if (!mthca_version_printed) {
+ printk(KERN_INFO "%s", mthca_version);
+ ++mthca_version_printed;
+ }
+
+ if (id->driver_data >= ARRAY_SIZE(mthca_hca_table)) {
+ printk(KERN_ERR PFX "%s has invalid driver data %lx\n",
+ pci_name(pdev), id->driver_data);
+ mutex_unlock(&mthca_device_mutex);
+ return -ENODEV;
+ }
+
+ ret = __mthca_init_one(pdev, id->driver_data);
+
+ mutex_unlock(&mthca_device_mutex);
+
+ return ret;
+}
+
+static void __devexit mthca_remove_one(struct pci_dev *pdev)
+{
+ mutex_lock(&mthca_device_mutex);
+ __mthca_remove_one(pdev);
+ mutex_unlock(&mthca_device_mutex);
+}
+
+static struct pci_device_id mthca_pci_table[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_TAVOR),
+ .driver_data = TAVOR },
+ { PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_TAVOR),
+ .driver_data = TAVOR },
+ { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT),
+ .driver_data = ARBEL_COMPAT },
+ { PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT),
+ .driver_data = ARBEL_COMPAT },
+ { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_ARBEL),
+ .driver_data = ARBEL_NATIVE },
+ { PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_ARBEL),
+ .driver_data = ARBEL_NATIVE },
+ { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_SINAI),
+ .driver_data = SINAI },
+ { PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_SINAI),
+ .driver_data = SINAI },
+ { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_SINAI_OLD),
+ .driver_data = SINAI },
+ { PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_SINAI_OLD),
+ .driver_data = SINAI },
+ { 0, }
+};
+
+MODULE_DEVICE_TABLE(pci, mthca_pci_table);
+
+static struct pci_driver mthca_driver = {
+ .name = DRV_NAME,
+ .id_table = mthca_pci_table,
+ .probe = mthca_init_one,
+ .remove = __devexit_p(mthca_remove_one)
+};
+
+static void __init __mthca_check_profile_val(const char *name, int *pval,
+ int pval_default)
+{
+ /* value must be positive and power of 2 */
+ int old_pval = *pval;
+
+ if (old_pval <= 0)
+ *pval = pval_default;
+ else
+ *pval = roundup_pow_of_two(old_pval);
+
+ if (old_pval != *pval) {
+ printk(KERN_WARNING PFX "Invalid value %d for %s in module parameter.\n",
+ old_pval, name);
+ printk(KERN_WARNING PFX "Corrected %s to %d.\n", name, *pval);
+ }
+}
+
+#define mthca_check_profile_val(name, default) \
+ __mthca_check_profile_val(#name, &hca_profile.name, default)
+
+static void __init mthca_validate_profile(void)
+{
+ mthca_check_profile_val(num_qp, MTHCA_DEFAULT_NUM_QP);
+ mthca_check_profile_val(rdb_per_qp, MTHCA_DEFAULT_RDB_PER_QP);
+ mthca_check_profile_val(num_cq, MTHCA_DEFAULT_NUM_CQ);
+ mthca_check_profile_val(num_mcg, MTHCA_DEFAULT_NUM_MCG);
+ mthca_check_profile_val(num_mpt, MTHCA_DEFAULT_NUM_MPT);
+ mthca_check_profile_val(num_mtt, MTHCA_DEFAULT_NUM_MTT);
+ mthca_check_profile_val(num_udav, MTHCA_DEFAULT_NUM_UDAV);
+ mthca_check_profile_val(fmr_reserved_mtts, MTHCA_DEFAULT_NUM_RESERVED_MTTS);
+
+ if (hca_profile.fmr_reserved_mtts >= hca_profile.num_mtt) {
+ printk(KERN_WARNING PFX "Invalid fmr_reserved_mtts module parameter %d.\n",
+ hca_profile.fmr_reserved_mtts);
+ printk(KERN_WARNING PFX "(Must be smaller than num_mtt %d)\n",
+ hca_profile.num_mtt);
+ hca_profile.fmr_reserved_mtts = hca_profile.num_mtt / 2;
+ printk(KERN_WARNING PFX "Corrected fmr_reserved_mtts to %d.\n",
+ hca_profile.fmr_reserved_mtts);
+ }
+ if (log_mtts_per_seg == 0)
+ log_mtts_per_seg = ilog2(MTHCA_MTT_SEG_SIZE / 8);
+ if ((log_mtts_per_seg < 1) || (log_mtts_per_seg > 5)) {
+ printk(KERN_WARNING PFX "bad log_mtts_per_seg (%d). Using default - %ld\n",
+ log_mtts_per_seg, ilog2(MTHCA_MTT_SEG_SIZE / 8));
+ log_mtts_per_seg = ilog2(MTHCA_MTT_SEG_SIZE / 8);
+ }
+}
+
+static int __init mthca_init(void)
+{
+ int ret;
+
+ mthca_validate_profile();
+
+ ret = mthca_catas_init();
+ if (ret)
+ return ret;
+
+ ret = pci_register_driver(&mthca_driver);
+ if (ret < 0) {
+ mthca_catas_cleanup();
+ return ret;
+ }
+
+ return 0;
+}
+
+static void __exit mthca_cleanup(void)
+{
+ pci_unregister_driver(&mthca_driver);
+ mthca_catas_cleanup();
+}
+
+module_init_order(mthca_init, SI_ORDER_MIDDLE);
+module_exit(mthca_cleanup);
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_mcg.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_mcg.c
new file mode 100644
index 0000000..d4c8105
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_mcg.c
@@ -0,0 +1,372 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/string.h>
+#include <linux/slab.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+struct mthca_mgm {
+ __be32 next_gid_index;
+ u32 reserved[3];
+ u8 gid[16];
+ __be32 qp[MTHCA_QP_PER_MGM];
+};
+
+static const u8 zero_gid[16]; /* automatically initialized to 0 */
+
+/*
+ * Caller must hold MCG table semaphore. gid and mgm parameters must
+ * be properly aligned for command interface.
+ *
+ * Returns 0 unless a firmware command error occurs.
+ *
+ * If GID is found in MGM or MGM is empty, *index = *hash, *prev = -1
+ * and *mgm holds MGM entry.
+ *
+ * if GID is found in AMGM, *index = index in AMGM, *prev = index of
+ * previous entry in hash chain and *mgm holds AMGM entry.
+ *
+ * If no AMGM exists for given gid, *index = -1, *prev = index of last
+ * entry in hash chain and *mgm holds end of hash chain.
+ */
+static int find_mgm(struct mthca_dev *dev,
+ u8 *gid, struct mthca_mailbox *mgm_mailbox,
+ u16 *hash, int *prev, int *index)
+{
+ struct mthca_mailbox *mailbox;
+ struct mthca_mgm *mgm = mgm_mailbox->buf;
+ u8 *mgid;
+ int err;
+ u8 status;
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox))
+ return -ENOMEM;
+ mgid = mailbox->buf;
+
+ memcpy(mgid, gid, 16);
+
+ err = mthca_MGID_HASH(dev, mailbox, hash, &status);
+ if (err)
+ goto out;
+ if (status) {
+ mthca_err(dev, "MGID_HASH returned status %02x\n", status);
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (0)
+ mthca_dbg(dev, "Hash for %pI6 is %04x\n", gid, *hash);
+
+ *index = *hash;
+ *prev = -1;
+
+ do {
+ err = mthca_READ_MGM(dev, *index, mgm_mailbox, &status);
+ if (err)
+ goto out;
+ if (status) {
+ mthca_err(dev, "READ_MGM returned status %02x\n", status);
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (!memcmp(mgm->gid, zero_gid, 16)) {
+ if (*index != *hash) {
+ mthca_err(dev, "Found zero MGID in AMGM.\n");
+ err = -EINVAL;
+ }
+ goto out;
+ }
+
+ if (!memcmp(mgm->gid, gid, 16))
+ goto out;
+
+ *prev = *index;
+ *index = be32_to_cpu(mgm->next_gid_index) >> 6;
+ } while (*index);
+
+ *index = -1;
+
+ out:
+ mthca_free_mailbox(dev, mailbox);
+ return err;
+}
+
+int mthca_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ struct mthca_dev *dev = to_mdev(ibqp->device);
+ struct mthca_mailbox *mailbox;
+ struct mthca_mgm *mgm;
+ u16 hash;
+ int index, prev;
+ int link = 0;
+ int i;
+ int err;
+ u8 status;
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ mgm = mailbox->buf;
+
+ mutex_lock(&dev->mcg_table.mutex);
+
+ err = find_mgm(dev, gid->raw, mailbox, &hash, &prev, &index);
+ if (err)
+ goto out;
+
+ if (index != -1) {
+ if (!memcmp(mgm->gid, zero_gid, 16))
+ memcpy(mgm->gid, gid->raw, 16);
+ } else {
+ link = 1;
+
+ index = mthca_alloc(&dev->mcg_table.alloc);
+ if (index == -1) {
+ mthca_err(dev, "No AMGM entries left\n");
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = mthca_READ_MGM(dev, index, mailbox, &status);
+ if (err)
+ goto out;
+ if (status) {
+ mthca_err(dev, "READ_MGM returned status %02x\n", status);
+ err = -EINVAL;
+ goto out;
+ }
+ memset(mgm, 0, sizeof *mgm);
+ memcpy(mgm->gid, gid->raw, 16);
+ }
+
+ for (i = 0; i < MTHCA_QP_PER_MGM; ++i)
+ if (mgm->qp[i] == cpu_to_be32(ibqp->qp_num | (1 << 31))) {
+ mthca_dbg(dev, "QP %06x already a member of MGM\n",
+ ibqp->qp_num);
+ err = 0;
+ goto out;
+ } else if (!(mgm->qp[i] & cpu_to_be32(1 << 31))) {
+ mgm->qp[i] = cpu_to_be32(ibqp->qp_num | (1 << 31));
+ break;
+ }
+
+ if (i == MTHCA_QP_PER_MGM) {
+ mthca_err(dev, "MGM at index %x is full.\n", index);
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = mthca_WRITE_MGM(dev, index, mailbox, &status);
+ if (err)
+ goto out;
+ if (status) {
+ mthca_err(dev, "WRITE_MGM returned status %02x\n", status);
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (!link)
+ goto out;
+
+ err = mthca_READ_MGM(dev, prev, mailbox, &status);
+ if (err)
+ goto out;
+ if (status) {
+ mthca_err(dev, "READ_MGM returned status %02x\n", status);
+ err = -EINVAL;
+ goto out;
+ }
+
+ mgm->next_gid_index = cpu_to_be32(index << 6);
+
+ err = mthca_WRITE_MGM(dev, prev, mailbox, &status);
+ if (err)
+ goto out;
+ if (status) {
+ mthca_err(dev, "WRITE_MGM returned status %02x\n", status);
+ err = -EINVAL;
+ }
+
+ out:
+ if (err && link && index != -1) {
+ BUG_ON(index < dev->limits.num_mgms);
+ mthca_free(&dev->mcg_table.alloc, index);
+ }
+ mutex_unlock(&dev->mcg_table.mutex);
+
+ mthca_free_mailbox(dev, mailbox);
+ return err;
+}
+
+int mthca_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ struct mthca_dev *dev = to_mdev(ibqp->device);
+ struct mthca_mailbox *mailbox;
+ struct mthca_mgm *mgm;
+ u16 hash;
+ int prev, index;
+ int i, loc;
+ int err;
+ u8 status;
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ mgm = mailbox->buf;
+
+ mutex_lock(&dev->mcg_table.mutex);
+
+ err = find_mgm(dev, gid->raw, mailbox, &hash, &prev, &index);
+ if (err)
+ goto out;
+
+ if (index == -1) {
+ mthca_err(dev, "MGID %pI6 not found\n", gid->raw);
+ err = -EINVAL;
+ goto out;
+ }
+
+ for (loc = -1, i = 0; i < MTHCA_QP_PER_MGM; ++i) {
+ if (mgm->qp[i] == cpu_to_be32(ibqp->qp_num | (1 << 31)))
+ loc = i;
+ if (!(mgm->qp[i] & cpu_to_be32(1 << 31)))
+ break;
+ }
+
+ if (loc == -1) {
+ mthca_err(dev, "QP %06x not found in MGM\n", ibqp->qp_num);
+ err = -EINVAL;
+ goto out;
+ }
+
+ mgm->qp[loc] = mgm->qp[i - 1];
+ mgm->qp[i - 1] = 0;
+
+ err = mthca_WRITE_MGM(dev, index, mailbox, &status);
+ if (err)
+ goto out;
+ if (status) {
+ mthca_err(dev, "WRITE_MGM returned status %02x\n", status);
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (i != 1)
+ goto out;
+
+ if (prev == -1) {
+ /* Remove entry from MGM */
+ int amgm_index_to_free = be32_to_cpu(mgm->next_gid_index) >> 6;
+ if (amgm_index_to_free) {
+ err = mthca_READ_MGM(dev, amgm_index_to_free,
+ mailbox, &status);
+ if (err)
+ goto out;
+ if (status) {
+ mthca_err(dev, "READ_MGM returned status %02x\n",
+ status);
+ err = -EINVAL;
+ goto out;
+ }
+ } else
+ memset(mgm->gid, 0, 16);
+
+ err = mthca_WRITE_MGM(dev, index, mailbox, &status);
+ if (err)
+ goto out;
+ if (status) {
+ mthca_err(dev, "WRITE_MGM returned status %02x\n", status);
+ err = -EINVAL;
+ goto out;
+ }
+ if (amgm_index_to_free) {
+ BUG_ON(amgm_index_to_free < dev->limits.num_mgms);
+ mthca_free(&dev->mcg_table.alloc, amgm_index_to_free);
+ }
+ } else {
+ /* Remove entry from AMGM */
+ int curr_next_index = be32_to_cpu(mgm->next_gid_index) >> 6;
+ err = mthca_READ_MGM(dev, prev, mailbox, &status);
+ if (err)
+ goto out;
+ if (status) {
+ mthca_err(dev, "READ_MGM returned status %02x\n", status);
+ err = -EINVAL;
+ goto out;
+ }
+
+ mgm->next_gid_index = cpu_to_be32(curr_next_index << 6);
+
+ err = mthca_WRITE_MGM(dev, prev, mailbox, &status);
+ if (err)
+ goto out;
+ if (status) {
+ mthca_err(dev, "WRITE_MGM returned status %02x\n", status);
+ err = -EINVAL;
+ goto out;
+ }
+ BUG_ON(index < dev->limits.num_mgms);
+ mthca_free(&dev->mcg_table.alloc, index);
+ }
+
+ out:
+ mutex_unlock(&dev->mcg_table.mutex);
+
+ mthca_free_mailbox(dev, mailbox);
+ return err;
+}
+
+int mthca_init_mcg_table(struct mthca_dev *dev)
+{
+ int err;
+ int table_size = dev->limits.num_mgms + dev->limits.num_amgms;
+
+ err = mthca_alloc_init(&dev->mcg_table.alloc,
+ table_size,
+ table_size - 1,
+ dev->limits.num_mgms);
+ if (err)
+ return err;
+
+ mutex_init(&dev->mcg_table.mutex);
+
+ return 0;
+}
+
+void mthca_cleanup_mcg_table(struct mthca_dev *dev)
+{
+ mthca_alloc_cleanup(&dev->mcg_table.alloc);
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_memfree.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_memfree.c
new file mode 100644
index 0000000..783da4b
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_memfree.c
@@ -0,0 +1,879 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mm.h>
+#include <linux/scatterlist.h>
+#include <linux/sched.h>
+
+#include <asm/page.h>
+
+#include "mthca_memfree.h"
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+/*
+ * We allocate in as big chunks as we can, up to a maximum of 256 KB
+ * per chunk.
+ */
+enum {
+ MTHCA_ICM_ALLOC_SIZE = 1 << 18,
+ MTHCA_TABLE_CHUNK_SIZE = 1 << 18
+};
+
+struct mthca_user_db_table {
+ struct mutex mutex;
+ struct {
+ u64 uvirt;
+ struct scatterlist mem;
+ int refcount;
+ } page[0];
+};
+
+static void mthca_free_icm_pages(struct mthca_dev *dev, struct mthca_icm_chunk *chunk)
+{
+ int i;
+
+ if (chunk->nsg > 0)
+ pci_unmap_sg(dev->pdev, chunk->mem, chunk->npages,
+ PCI_DMA_BIDIRECTIONAL);
+
+ for (i = 0; i < chunk->npages; ++i)
+ __free_pages(sg_page(&chunk->mem[i]),
+ get_order(chunk->mem[i].length));
+}
+
+static void mthca_free_icm_coherent(struct mthca_dev *dev, struct mthca_icm_chunk *chunk)
+{
+ int i;
+
+ for (i = 0; i < chunk->npages; ++i) {
+ dma_free_coherent(&dev->pdev->dev, chunk->mem[i].length,
+ lowmem_page_address(sg_page(&chunk->mem[i])),
+ sg_dma_address(&chunk->mem[i]));
+ }
+}
+
+void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm, int coherent)
+{
+ struct mthca_icm_chunk *chunk, *tmp;
+
+ if (!icm)
+ return;
+
+ list_for_each_entry_safe(chunk, tmp, &icm->chunk_list, list) {
+ if (coherent)
+ mthca_free_icm_coherent(dev, chunk);
+ else
+ mthca_free_icm_pages(dev, chunk);
+
+ kfree(chunk);
+ }
+
+ kfree(icm);
+}
+
+static int mthca_alloc_icm_pages(struct scatterlist *mem, int order, gfp_t gfp_mask)
+{
+ struct page *page;
+
+ /*
+ * Use __GFP_ZERO because buggy firmware assumes ICM pages are
+ * cleared, and subtle failures are seen if they aren't.
+ */
+ page = alloc_pages(gfp_mask | __GFP_ZERO, order);
+ if (!page)
+ return -ENOMEM;
+
+ sg_set_page(mem, page, PAGE_SIZE << order, 0);
+ return 0;
+}
+
+static int mthca_alloc_icm_coherent(struct device *dev, struct scatterlist *mem,
+ int order, gfp_t gfp_mask)
+{
+ void *buf = dma_alloc_coherent(dev, PAGE_SIZE << order, &sg_dma_address(mem),
+ gfp_mask);
+ if (!buf)
+ return -ENOMEM;
+
+ sg_set_buf(mem, buf, PAGE_SIZE << order);
+ BUG_ON(mem->offset);
+ sg_dma_len(mem) = PAGE_SIZE << order;
+ return 0;
+}
+
+struct mthca_icm *mthca_alloc_icm(struct mthca_dev *dev, int npages,
+ gfp_t gfp_mask, int coherent)
+{
+ struct mthca_icm *icm;
+ struct mthca_icm_chunk *chunk = NULL;
+ int cur_order;
+ int ret;
+
+ /* We use sg_set_buf for coherent allocs, which assumes low memory */
+ BUG_ON(coherent && (gfp_mask & __GFP_HIGHMEM));
+
+ icm = kmalloc(sizeof *icm, gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN));
+ if (!icm)
+ return icm;
+
+ icm->refcount = 0;
+ INIT_LIST_HEAD(&icm->chunk_list);
+
+ cur_order = get_order(MTHCA_ICM_ALLOC_SIZE);
+
+ while (npages > 0) {
+ if (!chunk) {
+ chunk = kmalloc(sizeof *chunk,
+ gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN));
+ if (!chunk)
+ goto fail;
+
+ sg_init_table(chunk->mem, MTHCA_ICM_CHUNK_LEN);
+ chunk->npages = 0;
+ chunk->nsg = 0;
+ list_add_tail(&chunk->list, &icm->chunk_list);
+ }
+
+ while (1 << cur_order > npages)
+ --cur_order;
+
+ if (coherent)
+ ret = mthca_alloc_icm_coherent(&dev->pdev->dev,
+ &chunk->mem[chunk->npages],
+ cur_order, gfp_mask);
+ else
+ ret = mthca_alloc_icm_pages(&chunk->mem[chunk->npages],
+ cur_order, gfp_mask);
+
+ if (!ret) {
+ ++chunk->npages;
+
+ if (coherent)
+ ++chunk->nsg;
+ else if (chunk->npages == MTHCA_ICM_CHUNK_LEN) {
+ chunk->nsg = pci_map_sg(dev->pdev, chunk->mem,
+ chunk->npages,
+ PCI_DMA_BIDIRECTIONAL);
+
+ if (chunk->nsg <= 0)
+ goto fail;
+ }
+
+ if (chunk->npages == MTHCA_ICM_CHUNK_LEN)
+ chunk = NULL;
+
+ npages -= 1 << cur_order;
+ } else {
+ --cur_order;
+ if (cur_order < 0)
+ goto fail;
+ }
+ }
+
+ if (!coherent && chunk) {
+ chunk->nsg = pci_map_sg(dev->pdev, chunk->mem,
+ chunk->npages,
+ PCI_DMA_BIDIRECTIONAL);
+
+ if (chunk->nsg <= 0)
+ goto fail;
+ }
+
+ return icm;
+
+fail:
+ mthca_free_icm(dev, icm, coherent);
+ return NULL;
+}
+
+int mthca_table_get(struct mthca_dev *dev, struct mthca_icm_table *table, int obj)
+{
+ int i = (obj & (table->num_obj - 1)) * table->obj_size / MTHCA_TABLE_CHUNK_SIZE;
+ int ret = 0;
+ u8 status;
+
+ mutex_lock(&table->mutex);
+
+ if (table->icm[i]) {
+ ++table->icm[i]->refcount;
+ goto out;
+ }
+
+ table->icm[i] = mthca_alloc_icm(dev, MTHCA_TABLE_CHUNK_SIZE >> PAGE_SHIFT,
+ (table->lowmem ? GFP_KERNEL : GFP_HIGHUSER) |
+ __GFP_NOWARN, table->coherent);
+ if (!table->icm[i]) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (mthca_MAP_ICM(dev, table->icm[i], table->virt + i * MTHCA_TABLE_CHUNK_SIZE,
+ &status) || status) {
+ mthca_free_icm(dev, table->icm[i], table->coherent);
+ table->icm[i] = NULL;
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ++table->icm[i]->refcount;
+
+out:
+ mutex_unlock(&table->mutex);
+ return ret;
+}
+
+void mthca_table_put(struct mthca_dev *dev, struct mthca_icm_table *table, int obj)
+{
+ int i;
+ u8 status;
+
+ if (!mthca_is_memfree(dev))
+ return;
+
+ i = (obj & (table->num_obj - 1)) * table->obj_size / MTHCA_TABLE_CHUNK_SIZE;
+
+ mutex_lock(&table->mutex);
+
+ if (--table->icm[i]->refcount == 0) {
+ mthca_UNMAP_ICM(dev, table->virt + i * MTHCA_TABLE_CHUNK_SIZE,
+ MTHCA_TABLE_CHUNK_SIZE / MTHCA_ICM_PAGE_SIZE,
+ &status);
+ mthca_free_icm(dev, table->icm[i], table->coherent);
+ table->icm[i] = NULL;
+ }
+
+ mutex_unlock(&table->mutex);
+}
+
+void *mthca_table_find(struct mthca_icm_table *table, int obj, dma_addr_t *dma_handle)
+{
+ int idx, offset, dma_offset, i;
+ struct mthca_icm_chunk *chunk;
+ struct mthca_icm *icm;
+ struct page *page = NULL;
+
+ if (!table->lowmem)
+ return NULL;
+
+ mutex_lock(&table->mutex);
+
+ idx = (obj & (table->num_obj - 1)) * table->obj_size;
+ icm = table->icm[idx / MTHCA_TABLE_CHUNK_SIZE];
+ dma_offset = offset = idx % MTHCA_TABLE_CHUNK_SIZE;
+
+ if (!icm)
+ goto out;
+
+ list_for_each_entry(chunk, &icm->chunk_list, list) {
+ for (i = 0; i < chunk->npages; ++i) {
+ if (dma_handle && dma_offset >= 0) {
+ if (sg_dma_len(&chunk->mem[i]) > dma_offset)
+ *dma_handle = sg_dma_address(&chunk->mem[i]) +
+ dma_offset;
+ dma_offset -= sg_dma_len(&chunk->mem[i]);
+ }
+ /* DMA mapping can merge pages but not split them,
+ * so if we found the page, dma_handle has already
+ * been assigned to. */
+ if (chunk->mem[i].length > offset) {
+ page = sg_page(&chunk->mem[i]);
+ goto out;
+ }
+ offset -= chunk->mem[i].length;
+ }
+ }
+
+out:
+ mutex_unlock(&table->mutex);
+ return page ? lowmem_page_address(page) + offset : NULL;
+}
+
+int mthca_table_get_range(struct mthca_dev *dev, struct mthca_icm_table *table,
+ int start, int end)
+{
+ int inc = MTHCA_TABLE_CHUNK_SIZE / table->obj_size;
+ int i, err;
+
+ for (i = start; i <= end; i += inc) {
+ err = mthca_table_get(dev, table, i);
+ if (err)
+ goto fail;
+ }
+
+ return 0;
+
+fail:
+ while (i > start) {
+ i -= inc;
+ mthca_table_put(dev, table, i);
+ }
+
+ return err;
+}
+
+void mthca_table_put_range(struct mthca_dev *dev, struct mthca_icm_table *table,
+ int start, int end)
+{
+ int i;
+
+ if (!mthca_is_memfree(dev))
+ return;
+
+ for (i = start; i <= end; i += MTHCA_TABLE_CHUNK_SIZE / table->obj_size)
+ mthca_table_put(dev, table, i);
+}
+
+struct mthca_icm_table *mthca_alloc_icm_table(struct mthca_dev *dev,
+ u64 virt, int obj_size,
+ int nobj, int reserved,
+ int use_lowmem, int use_coherent)
+{
+ struct mthca_icm_table *table;
+ int obj_per_chunk;
+ int num_icm;
+ unsigned chunk_size;
+ int i;
+ u8 status;
+
+ obj_per_chunk = MTHCA_TABLE_CHUNK_SIZE / obj_size;
+ num_icm = DIV_ROUND_UP(nobj, obj_per_chunk);
+
+ table = kmalloc(sizeof *table + num_icm * sizeof *table->icm, GFP_KERNEL);
+ if (!table)
+ return NULL;
+
+ table->virt = virt;
+ table->num_icm = num_icm;
+ table->num_obj = nobj;
+ table->obj_size = obj_size;
+ table->lowmem = use_lowmem;
+ table->coherent = use_coherent;
+ mutex_init(&table->mutex);
+
+ for (i = 0; i < num_icm; ++i)
+ table->icm[i] = NULL;
+
+ for (i = 0; i * MTHCA_TABLE_CHUNK_SIZE < reserved * obj_size; ++i) {
+ chunk_size = MTHCA_TABLE_CHUNK_SIZE;
+ if ((i + 1) * MTHCA_TABLE_CHUNK_SIZE > nobj * obj_size)
+ chunk_size = nobj * obj_size - i * MTHCA_TABLE_CHUNK_SIZE;
+
+ table->icm[i] = mthca_alloc_icm(dev, chunk_size >> PAGE_SHIFT,
+ (use_lowmem ? GFP_KERNEL : GFP_HIGHUSER) |
+ __GFP_NOWARN, use_coherent);
+ if (!table->icm[i])
+ goto err;
+ if (mthca_MAP_ICM(dev, table->icm[i], virt + i * MTHCA_TABLE_CHUNK_SIZE,
+ &status) || status) {
+ mthca_free_icm(dev, table->icm[i], table->coherent);
+ table->icm[i] = NULL;
+ goto err;
+ }
+
+ /*
+ * Add a reference to this ICM chunk so that it never
+ * gets freed (since it contains reserved firmware objects).
+ */
+ ++table->icm[i]->refcount;
+ }
+
+ return table;
+
+err:
+ for (i = 0; i < num_icm; ++i)
+ if (table->icm[i]) {
+ mthca_UNMAP_ICM(dev, virt + i * MTHCA_TABLE_CHUNK_SIZE,
+ MTHCA_TABLE_CHUNK_SIZE / MTHCA_ICM_PAGE_SIZE,
+ &status);
+ mthca_free_icm(dev, table->icm[i], table->coherent);
+ }
+
+ kfree(table);
+
+ return NULL;
+}
+
+void mthca_free_icm_table(struct mthca_dev *dev, struct mthca_icm_table *table)
+{
+ int i;
+ u8 status;
+
+ for (i = 0; i < table->num_icm; ++i)
+ if (table->icm[i]) {
+ mthca_UNMAP_ICM(dev, table->virt + i * MTHCA_TABLE_CHUNK_SIZE,
+ MTHCA_TABLE_CHUNK_SIZE / MTHCA_ICM_PAGE_SIZE,
+ &status);
+ mthca_free_icm(dev, table->icm[i], table->coherent);
+ }
+
+ kfree(table);
+}
+
+static u64 mthca_uarc_virt(struct mthca_dev *dev, struct mthca_uar *uar, int page)
+{
+ return dev->uar_table.uarc_base +
+ uar->index * dev->uar_table.uarc_size +
+ page * MTHCA_ICM_PAGE_SIZE;
+}
+
+#include <vm/vm_map.h>
+#include <vm/vm_pageout.h>
+#include <vm/pmap.h>
+
+#include <sys/resource.h>
+#include <sys/resourcevar.h>
+
+int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar,
+ struct mthca_user_db_table *db_tab, int index, u64 uaddr)
+{
+#ifdef __linux__
+ struct page *pages[1];
+ int ret = 0;
+ u8 status;
+ int i;
+
+ if (!mthca_is_memfree(dev))
+ return 0;
+
+ if (index < 0 || index > dev->uar_table.uarc_size / 8)
+ return -EINVAL;
+
+ mutex_lock(&db_tab->mutex);
+
+ i = index / MTHCA_DB_REC_PER_PAGE;
+
+ if ((db_tab->page[i].refcount >= MTHCA_DB_REC_PER_PAGE) ||
+ (db_tab->page[i].uvirt && db_tab->page[i].uvirt != uaddr) ||
+ (uaddr & 4095)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (db_tab->page[i].refcount) {
+ ++db_tab->page[i].refcount;
+ goto out;
+ }
+
+ ret = get_user_pages(current, current->mm, uaddr & PAGE_MASK, 1, 1, 0,
+ pages, NULL);
+ if (ret < 0)
+ goto out;
+
+ sg_set_page(&db_tab->page[i].mem, pages[0], MTHCA_ICM_PAGE_SIZE,
+ uaddr & ~PAGE_MASK);
+
+ ret = pci_map_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE);
+ if (ret < 0) {
+ put_page(pages[0]);
+ goto out;
+ }
+
+ ret = mthca_MAP_ICM_page(dev, sg_dma_address(&db_tab->page[i].mem),
+ mthca_uarc_virt(dev, uar, i), &status);
+ if (!ret && status)
+ ret = -EINVAL;
+ if (ret) {
+ pci_unmap_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE);
+ put_page(sg_page(&db_tab->page[i].mem));
+ goto out;
+ }
+
+ db_tab->page[i].uvirt = uaddr;
+ db_tab->page[i].refcount = 1;
+
+out:
+ mutex_unlock(&db_tab->mutex);
+ return ret;
+#else
+ struct proc *proc;
+ vm_offset_t start;
+ vm_paddr_t paddr;
+ pmap_t pmap;
+ vm_page_t m;
+ int ret = 0;
+ u8 status;
+ int i;
+
+ if (!mthca_is_memfree(dev))
+ return 0;
+
+ if (index < 0 || index > dev->uar_table.uarc_size / 8)
+ return -EINVAL;
+
+ mutex_lock(&db_tab->mutex);
+
+ i = index / MTHCA_DB_REC_PER_PAGE;
+ start = 0;
+
+ if ((db_tab->page[i].refcount >= MTHCA_DB_REC_PER_PAGE) ||
+ (db_tab->page[i].uvirt && db_tab->page[i].uvirt != uaddr) ||
+ (uaddr & 4095)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (db_tab->page[i].refcount) {
+ ++db_tab->page[i].refcount;
+ goto out;
+ }
+
+ proc = curproc;
+ pmap = vm_map_pmap(&proc->p_vmspace->vm_map);
+ PROC_LOCK(proc);
+ if (ptoa(pmap_wired_count(pmap) + 1) > lim_cur(proc, RLIMIT_MEMLOCK)) {
+ PROC_UNLOCK(proc);
+ ret = -ENOMEM;
+ goto out;
+ }
+ PROC_UNLOCK(proc);
+ if (cnt.v_wire_count + 1 > vm_page_max_wired) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ start = uaddr & PAGE_MASK;
+ ret = vm_map_wire(&proc->p_vmspace->vm_map, start, start + PAGE_SIZE,
+ VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES | VM_MAP_WIRE_WRITE);
+ if (ret != KERN_SUCCESS) {
+ start = 0;
+ ret = -ENOMEM;
+ goto out;
+ }
+ paddr = pmap_extract(pmap, uaddr);
+ if (paddr == 0) {
+ ret = -EFAULT;
+ goto out;
+ }
+ m = PHYS_TO_VM_PAGE(paddr);
+
+ sg_set_page(&db_tab->page[i].mem, m, MTHCA_ICM_PAGE_SIZE,
+ uaddr & ~PAGE_MASK);
+
+ ret = pci_map_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE);
+ if (ret < 0)
+ goto out;
+
+ ret = mthca_MAP_ICM_page(dev, sg_dma_address(&db_tab->page[i].mem),
+ mthca_uarc_virt(dev, uar, i), &status);
+ if (!ret && status)
+ ret = -EINVAL;
+ if (ret) {
+ pci_unmap_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE);
+ goto out;
+ }
+
+ db_tab->page[i].uvirt = uaddr;
+ db_tab->page[i].refcount = 1;
+
+out:
+ if (ret < 0 && start)
+ vm_map_unwire(&curthread->td_proc->p_vmspace->vm_map,
+ start, start + PAGE_SIZE,
+ VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
+ mutex_unlock(&db_tab->mutex);
+ return ret;
+#endif
+}
+
+void mthca_unmap_user_db(struct mthca_dev *dev, struct mthca_uar *uar,
+ struct mthca_user_db_table *db_tab, int index)
+{
+ if (!mthca_is_memfree(dev))
+ return;
+
+ /*
+ * To make our bookkeeping simpler, we don't unmap DB
+ * pages until we clean up the whole db table.
+ */
+
+ mutex_lock(&db_tab->mutex);
+
+ --db_tab->page[index / MTHCA_DB_REC_PER_PAGE].refcount;
+
+ mutex_unlock(&db_tab->mutex);
+}
+
+struct mthca_user_db_table *mthca_init_user_db_tab(struct mthca_dev *dev)
+{
+ struct mthca_user_db_table *db_tab;
+ int npages;
+ int i;
+
+ if (!mthca_is_memfree(dev))
+ return NULL;
+
+ npages = dev->uar_table.uarc_size / MTHCA_ICM_PAGE_SIZE;
+ db_tab = kmalloc(sizeof *db_tab + npages * sizeof *db_tab->page, GFP_KERNEL);
+ if (!db_tab)
+ return ERR_PTR(-ENOMEM);
+
+ mutex_init(&db_tab->mutex);
+ for (i = 0; i < npages; ++i) {
+ db_tab->page[i].refcount = 0;
+ db_tab->page[i].uvirt = 0;
+ sg_init_table(&db_tab->page[i].mem, 1);
+ }
+
+ return db_tab;
+}
+
+void mthca_cleanup_user_db_tab(struct mthca_dev *dev, struct mthca_uar *uar,
+ struct mthca_user_db_table *db_tab)
+{
+ int i;
+ u8 status;
+
+ if (!mthca_is_memfree(dev))
+ return;
+
+ for (i = 0; i < dev->uar_table.uarc_size / MTHCA_ICM_PAGE_SIZE; ++i) {
+ if (db_tab->page[i].uvirt) {
+ mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, uar, i), 1, &status);
+ pci_unmap_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE);
+#ifdef __linux__
+ put_page(sg_page(&db_tab->page[i].mem));
+#else
+ vm_offset_t start;
+
+ start = db_tab->page[i].uvirt & PAGE_MASK;
+ vm_map_unwire(&curthread->td_proc->p_vmspace->vm_map,
+ start, start + PAGE_SIZE,
+ VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
+#endif
+ }
+ }
+
+ kfree(db_tab);
+}
+
+int mthca_alloc_db(struct mthca_dev *dev, enum mthca_db_type type,
+ u32 qn, __be32 **db)
+{
+ int group;
+ int start, end, dir;
+ int i, j;
+ struct mthca_db_page *page;
+ int ret = 0;
+ u8 status;
+
+ mutex_lock(&dev->db_tab->mutex);
+
+ switch (type) {
+ case MTHCA_DB_TYPE_CQ_ARM:
+ case MTHCA_DB_TYPE_SQ:
+ group = 0;
+ start = 0;
+ end = dev->db_tab->max_group1;
+ dir = 1;
+ break;
+
+ case MTHCA_DB_TYPE_CQ_SET_CI:
+ case MTHCA_DB_TYPE_RQ:
+ case MTHCA_DB_TYPE_SRQ:
+ group = 1;
+ start = dev->db_tab->npages - 1;
+ end = dev->db_tab->min_group2;
+ dir = -1;
+ break;
+
+ default:
+ ret = -EINVAL;
+ goto out;
+ }
+
+ for (i = start; i != end; i += dir)
+ if (dev->db_tab->page[i].db_rec &&
+ !bitmap_full(dev->db_tab->page[i].used,
+ MTHCA_DB_REC_PER_PAGE)) {
+ page = dev->db_tab->page + i;
+ goto found;
+ }
+
+ for (i = start; i != end; i += dir)
+ if (!dev->db_tab->page[i].db_rec) {
+ page = dev->db_tab->page + i;
+ goto alloc;
+ }
+
+ if (dev->db_tab->max_group1 >= dev->db_tab->min_group2 - 1) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (group == 0)
+ ++dev->db_tab->max_group1;
+ else
+ --dev->db_tab->min_group2;
+
+ page = dev->db_tab->page + end;
+
+alloc:
+ page->db_rec = dma_alloc_coherent(&dev->pdev->dev, MTHCA_ICM_PAGE_SIZE,
+ &page->mapping, GFP_KERNEL);
+ if (!page->db_rec) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ memset(page->db_rec, 0, MTHCA_ICM_PAGE_SIZE);
+
+ ret = mthca_MAP_ICM_page(dev, page->mapping,
+ mthca_uarc_virt(dev, &dev->driver_uar, i), &status);
+ if (!ret && status)
+ ret = -EINVAL;
+ if (ret) {
+ dma_free_coherent(&dev->pdev->dev, MTHCA_ICM_PAGE_SIZE,
+ page->db_rec, page->mapping);
+ goto out;
+ }
+
+ bitmap_zero(page->used, MTHCA_DB_REC_PER_PAGE);
+
+found:
+ j = find_first_zero_bit(page->used, MTHCA_DB_REC_PER_PAGE);
+ set_bit(j, page->used);
+
+ if (group == 1)
+ j = MTHCA_DB_REC_PER_PAGE - 1 - j;
+
+ ret = i * MTHCA_DB_REC_PER_PAGE + j;
+
+ page->db_rec[j] = cpu_to_be64((qn << 8) | (type << 5));
+
+ *db = (__be32 *) &page->db_rec[j];
+
+out:
+ mutex_unlock(&dev->db_tab->mutex);
+
+ return ret;
+}
+
+void mthca_free_db(struct mthca_dev *dev, int type, int db_index)
+{
+ int i, j;
+ struct mthca_db_page *page;
+ u8 status;
+
+ i = db_index / MTHCA_DB_REC_PER_PAGE;
+ j = db_index % MTHCA_DB_REC_PER_PAGE;
+
+ page = dev->db_tab->page + i;
+
+ mutex_lock(&dev->db_tab->mutex);
+
+ page->db_rec[j] = 0;
+ if (i >= dev->db_tab->min_group2)
+ j = MTHCA_DB_REC_PER_PAGE - 1 - j;
+ clear_bit(j, page->used);
+
+ if (bitmap_empty(page->used, MTHCA_DB_REC_PER_PAGE) &&
+ i >= dev->db_tab->max_group1 - 1) {
+ mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, &dev->driver_uar, i), 1, &status);
+
+ dma_free_coherent(&dev->pdev->dev, MTHCA_ICM_PAGE_SIZE,
+ page->db_rec, page->mapping);
+ page->db_rec = NULL;
+
+ if (i == dev->db_tab->max_group1) {
+ --dev->db_tab->max_group1;
+ /* XXX may be able to unmap more pages now */
+ }
+ if (i == dev->db_tab->min_group2)
+ ++dev->db_tab->min_group2;
+ }
+
+ mutex_unlock(&dev->db_tab->mutex);
+}
+
+int mthca_init_db_tab(struct mthca_dev *dev)
+{
+ int i;
+
+ if (!mthca_is_memfree(dev))
+ return 0;
+
+ dev->db_tab = kmalloc(sizeof *dev->db_tab, GFP_KERNEL);
+ if (!dev->db_tab)
+ return -ENOMEM;
+
+ mutex_init(&dev->db_tab->mutex);
+
+ dev->db_tab->npages = dev->uar_table.uarc_size / MTHCA_ICM_PAGE_SIZE;
+ dev->db_tab->max_group1 = 0;
+ dev->db_tab->min_group2 = dev->db_tab->npages - 1;
+
+ dev->db_tab->page = kmalloc(dev->db_tab->npages *
+ sizeof *dev->db_tab->page,
+ GFP_KERNEL);
+ if (!dev->db_tab->page) {
+ kfree(dev->db_tab);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < dev->db_tab->npages; ++i)
+ dev->db_tab->page[i].db_rec = NULL;
+
+ return 0;
+}
+
+void mthca_cleanup_db_tab(struct mthca_dev *dev)
+{
+ int i;
+ u8 status;
+
+ if (!mthca_is_memfree(dev))
+ return;
+
+ /*
+ * Because we don't always free our UARC pages when they
+ * become empty to make mthca_free_db() simpler we need to
+ * make a sweep through the doorbell pages and free any
+ * leftover pages now.
+ */
+ for (i = 0; i < dev->db_tab->npages; ++i) {
+ if (!dev->db_tab->page[i].db_rec)
+ continue;
+
+ if (!bitmap_empty(dev->db_tab->page[i].used, MTHCA_DB_REC_PER_PAGE))
+ mthca_warn(dev, "Kernel UARC page %d not empty\n", i);
+
+ mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, &dev->driver_uar, i), 1, &status);
+
+ dma_free_coherent(&dev->pdev->dev, MTHCA_ICM_PAGE_SIZE,
+ dev->db_tab->page[i].db_rec,
+ dev->db_tab->page[i].mapping);
+ }
+
+ kfree(dev->db_tab->page);
+ kfree(dev->db_tab);
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_memfree.h b/sys/ofed/drivers/infiniband/hw/mthca/mthca_memfree.h
new file mode 100644
index 0000000..da9b8f9
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_memfree.h
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MTHCA_MEMFREE_H
+#define MTHCA_MEMFREE_H
+
+#include <linux/list.h>
+#include <linux/mutex.h>
+
+#define MTHCA_ICM_CHUNK_LEN \
+ ((256 - sizeof (struct list_head) - 2 * sizeof (int)) / \
+ (sizeof (struct scatterlist)))
+
+enum {
+ MTHCA_ICM_PAGE_SHIFT = 12,
+ MTHCA_ICM_PAGE_SIZE = 1 << MTHCA_ICM_PAGE_SHIFT,
+ MTHCA_DB_REC_PER_PAGE = MTHCA_ICM_PAGE_SIZE / 8
+};
+
+struct mthca_icm_chunk {
+ struct list_head list;
+ int npages;
+ int nsg;
+ struct scatterlist mem[MTHCA_ICM_CHUNK_LEN];
+};
+
+struct mthca_icm {
+ struct list_head chunk_list;
+ int refcount;
+};
+
+struct mthca_icm_table {
+ u64 virt;
+ int num_icm;
+ int num_obj;
+ int obj_size;
+ int lowmem;
+ int coherent;
+ struct mutex mutex;
+ struct mthca_icm *icm[0];
+};
+
+struct mthca_icm_iter {
+ struct mthca_icm *icm;
+ struct mthca_icm_chunk *chunk;
+ int page_idx;
+};
+
+struct mthca_dev;
+
+struct mthca_icm *mthca_alloc_icm(struct mthca_dev *dev, int npages,
+ gfp_t gfp_mask, int coherent);
+void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm, int coherent);
+
+struct mthca_icm_table *mthca_alloc_icm_table(struct mthca_dev *dev,
+ u64 virt, int obj_size,
+ int nobj, int reserved,
+ int use_lowmem, int use_coherent);
+void mthca_free_icm_table(struct mthca_dev *dev, struct mthca_icm_table *table);
+int mthca_table_get(struct mthca_dev *dev, struct mthca_icm_table *table, int obj);
+void mthca_table_put(struct mthca_dev *dev, struct mthca_icm_table *table, int obj);
+void *mthca_table_find(struct mthca_icm_table *table, int obj, dma_addr_t *dma_handle);
+int mthca_table_get_range(struct mthca_dev *dev, struct mthca_icm_table *table,
+ int start, int end);
+void mthca_table_put_range(struct mthca_dev *dev, struct mthca_icm_table *table,
+ int start, int end);
+
+static inline void mthca_icm_first(struct mthca_icm *icm,
+ struct mthca_icm_iter *iter)
+{
+ iter->icm = icm;
+ iter->chunk = list_empty(&icm->chunk_list) ?
+ NULL : list_entry(icm->chunk_list.next,
+ struct mthca_icm_chunk, list);
+ iter->page_idx = 0;
+}
+
+static inline int mthca_icm_last(struct mthca_icm_iter *iter)
+{
+ return !iter->chunk;
+}
+
+static inline void mthca_icm_next(struct mthca_icm_iter *iter)
+{
+ if (++iter->page_idx >= iter->chunk->nsg) {
+ if (iter->chunk->list.next == &iter->icm->chunk_list) {
+ iter->chunk = NULL;
+ return;
+ }
+
+ iter->chunk = list_entry(iter->chunk->list.next,
+ struct mthca_icm_chunk, list);
+ iter->page_idx = 0;
+ }
+}
+
+static inline dma_addr_t mthca_icm_addr(struct mthca_icm_iter *iter)
+{
+ return sg_dma_address(&iter->chunk->mem[iter->page_idx]);
+}
+
+static inline unsigned long mthca_icm_size(struct mthca_icm_iter *iter)
+{
+ return sg_dma_len(&iter->chunk->mem[iter->page_idx]);
+}
+
+struct mthca_db_page {
+ DECLARE_BITMAP(used, MTHCA_DB_REC_PER_PAGE);
+ __be64 *db_rec;
+ dma_addr_t mapping;
+};
+
+struct mthca_db_table {
+ int npages;
+ int max_group1;
+ int min_group2;
+ struct mthca_db_page *page;
+ struct mutex mutex;
+};
+
+enum mthca_db_type {
+ MTHCA_DB_TYPE_INVALID = 0x0,
+ MTHCA_DB_TYPE_CQ_SET_CI = 0x1,
+ MTHCA_DB_TYPE_CQ_ARM = 0x2,
+ MTHCA_DB_TYPE_SQ = 0x3,
+ MTHCA_DB_TYPE_RQ = 0x4,
+ MTHCA_DB_TYPE_SRQ = 0x5,
+ MTHCA_DB_TYPE_GROUP_SEP = 0x7
+};
+
+struct mthca_user_db_table;
+struct mthca_uar;
+
+int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar,
+ struct mthca_user_db_table *db_tab, int index, u64 uaddr);
+void mthca_unmap_user_db(struct mthca_dev *dev, struct mthca_uar *uar,
+ struct mthca_user_db_table *db_tab, int index);
+struct mthca_user_db_table *mthca_init_user_db_tab(struct mthca_dev *dev);
+void mthca_cleanup_user_db_tab(struct mthca_dev *dev, struct mthca_uar *uar,
+ struct mthca_user_db_table *db_tab);
+
+int mthca_init_db_tab(struct mthca_dev *dev);
+void mthca_cleanup_db_tab(struct mthca_dev *dev);
+int mthca_alloc_db(struct mthca_dev *dev, enum mthca_db_type type,
+ u32 qn, __be32 **db);
+void mthca_free_db(struct mthca_dev *dev, int type, int db_index);
+
+#endif /* MTHCA_MEMFREE_H */
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_mr.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_mr.c
new file mode 100644
index 0000000..d606edf
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_mr.c
@@ -0,0 +1,985 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/slab.h>
+#include <linux/errno.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+#include "mthca_memfree.h"
+
+struct mthca_mtt {
+ struct mthca_buddy *buddy;
+ int order;
+ u32 first_seg;
+};
+
+/*
+ * Must be packed because mtt_seg is 64 bits but only aligned to 32 bits.
+ */
+struct mthca_mpt_entry {
+ __be32 flags;
+ __be32 page_size;
+ __be32 key;
+ __be32 pd;
+ __be64 start;
+ __be64 length;
+ __be32 lkey;
+ __be32 window_count;
+ __be32 window_count_limit;
+ __be64 mtt_seg;
+ __be32 mtt_sz; /* Arbel only */
+ u32 reserved[2];
+} __attribute__((packed));
+
+#define MTHCA_MPT_FLAG_SW_OWNS (0xfUL << 28)
+#define MTHCA_MPT_FLAG_MIO (1 << 17)
+#define MTHCA_MPT_FLAG_BIND_ENABLE (1 << 15)
+#define MTHCA_MPT_FLAG_PHYSICAL (1 << 9)
+#define MTHCA_MPT_FLAG_REGION (1 << 8)
+
+#define MTHCA_MTT_FLAG_PRESENT 1
+
+#define MTHCA_MPT_STATUS_SW 0xF0
+#define MTHCA_MPT_STATUS_HW 0x00
+
+#define SINAI_FMR_KEY_INC 0x1000000
+
+/*
+ * Buddy allocator for MTT segments (currently not very efficient
+ * since it doesn't keep a free list and just searches linearly
+ * through the bitmaps)
+ */
+
+static u32 mthca_buddy_alloc(struct mthca_buddy *buddy, int order)
+{
+ int o;
+ int m;
+ u32 seg;
+
+ spin_lock(&buddy->lock);
+
+ for (o = order; o <= buddy->max_order; ++o)
+ if (buddy->num_free[o]) {
+ m = 1 << (buddy->max_order - o);
+ seg = find_first_bit(buddy->bits[o], m);
+ if (seg < m)
+ goto found;
+ }
+
+ spin_unlock(&buddy->lock);
+ return -1;
+
+ found:
+ clear_bit(seg, buddy->bits[o]);
+ --buddy->num_free[o];
+
+ while (o > order) {
+ --o;
+ seg <<= 1;
+ set_bit(seg ^ 1, buddy->bits[o]);
+ ++buddy->num_free[o];
+ }
+
+ spin_unlock(&buddy->lock);
+
+ seg <<= order;
+
+ return seg;
+}
+
+static void mthca_buddy_free(struct mthca_buddy *buddy, u32 seg, int order)
+{
+ seg >>= order;
+
+ spin_lock(&buddy->lock);
+
+ while (test_bit(seg ^ 1, buddy->bits[order])) {
+ clear_bit(seg ^ 1, buddy->bits[order]);
+ --buddy->num_free[order];
+ seg >>= 1;
+ ++order;
+ }
+
+ set_bit(seg, buddy->bits[order]);
+ ++buddy->num_free[order];
+
+ spin_unlock(&buddy->lock);
+}
+
+static int mthca_buddy_init(struct mthca_buddy *buddy, int max_order)
+{
+ int i, s;
+
+ buddy->max_order = max_order;
+ spin_lock_init(&buddy->lock);
+
+ buddy->bits = kzalloc((buddy->max_order + 1) * sizeof (long *),
+ GFP_KERNEL);
+ buddy->num_free = kzalloc((buddy->max_order + 1) * sizeof (int *),
+ GFP_KERNEL);
+ if (!buddy->bits || !buddy->num_free)
+ goto err_out;
+
+ for (i = 0; i <= buddy->max_order; ++i) {
+ s = BITS_TO_LONGS(1 << (buddy->max_order - i));
+ buddy->bits[i] = kmalloc(s * sizeof (long), GFP_KERNEL);
+ if (!buddy->bits[i])
+ goto err_out_free;
+ bitmap_zero(buddy->bits[i],
+ 1 << (buddy->max_order - i));
+ }
+
+ set_bit(0, buddy->bits[buddy->max_order]);
+ buddy->num_free[buddy->max_order] = 1;
+
+ return 0;
+
+err_out_free:
+ for (i = 0; i <= buddy->max_order; ++i)
+ kfree(buddy->bits[i]);
+
+err_out:
+ kfree(buddy->bits);
+ kfree(buddy->num_free);
+
+ return -ENOMEM;
+}
+
+static void mthca_buddy_cleanup(struct mthca_buddy *buddy)
+{
+ int i;
+
+ for (i = 0; i <= buddy->max_order; ++i)
+ kfree(buddy->bits[i]);
+
+ kfree(buddy->bits);
+ kfree(buddy->num_free);
+}
+
+static u32 mthca_alloc_mtt_range(struct mthca_dev *dev, int order,
+ struct mthca_buddy *buddy)
+{
+ u32 seg = mthca_buddy_alloc(buddy, order);
+
+ if (seg == -1)
+ return -1;
+
+ if (mthca_is_memfree(dev))
+ if (mthca_table_get_range(dev, dev->mr_table.mtt_table, seg,
+ seg + (1 << order) - 1)) {
+ mthca_buddy_free(buddy, seg, order);
+ seg = -1;
+ }
+
+ return seg;
+}
+
+static struct mthca_mtt *__mthca_alloc_mtt(struct mthca_dev *dev, int size,
+ struct mthca_buddy *buddy)
+{
+ struct mthca_mtt *mtt;
+ int i;
+
+ if (size <= 0)
+ return ERR_PTR(-EINVAL);
+
+ mtt = kmalloc(sizeof *mtt, GFP_KERNEL);
+ if (!mtt)
+ return ERR_PTR(-ENOMEM);
+
+ mtt->buddy = buddy;
+ mtt->order = 0;
+ for (i = dev->limits.mtt_seg_size / 8; i < size; i <<= 1)
+ ++mtt->order;
+
+ mtt->first_seg = mthca_alloc_mtt_range(dev, mtt->order, buddy);
+ if (mtt->first_seg == -1) {
+ kfree(mtt);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return mtt;
+}
+
+struct mthca_mtt *mthca_alloc_mtt(struct mthca_dev *dev, int size)
+{
+ return __mthca_alloc_mtt(dev, size, &dev->mr_table.mtt_buddy);
+}
+
+void mthca_free_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt)
+{
+ if (!mtt)
+ return;
+
+ mthca_buddy_free(mtt->buddy, mtt->first_seg, mtt->order);
+
+ mthca_table_put_range(dev, dev->mr_table.mtt_table,
+ mtt->first_seg,
+ mtt->first_seg + (1 << mtt->order) - 1);
+
+ kfree(mtt);
+}
+
+static int __mthca_write_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt,
+ int start_index, u64 *buffer_list, int list_len)
+{
+ struct mthca_mailbox *mailbox;
+ __be64 *mtt_entry;
+ int err = 0;
+ u8 status;
+ int i;
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ mtt_entry = mailbox->buf;
+
+ while (list_len > 0) {
+ mtt_entry[0] = cpu_to_be64(dev->mr_table.mtt_base +
+ mtt->first_seg * dev->limits.mtt_seg_size +
+ start_index * 8);
+ mtt_entry[1] = 0;
+ for (i = 0; i < list_len && i < MTHCA_MAILBOX_SIZE / 8 - 2; ++i)
+ mtt_entry[i + 2] = cpu_to_be64(buffer_list[i] |
+ MTHCA_MTT_FLAG_PRESENT);
+
+ /*
+ * If we have an odd number of entries to write, add
+ * one more dummy entry for firmware efficiency.
+ */
+ if (i & 1)
+ mtt_entry[i + 2] = 0;
+
+ err = mthca_WRITE_MTT(dev, mailbox, (i + 1) & ~1, &status);
+ if (err) {
+ mthca_warn(dev, "WRITE_MTT failed (%d)\n", err);
+ goto out;
+ }
+ if (status) {
+ mthca_warn(dev, "WRITE_MTT returned status 0x%02x\n",
+ status);
+ err = -EINVAL;
+ goto out;
+ }
+
+ list_len -= i;
+ start_index += i;
+ buffer_list += i;
+ }
+
+out:
+ mthca_free_mailbox(dev, mailbox);
+ return err;
+}
+
+int mthca_write_mtt_size(struct mthca_dev *dev)
+{
+ if (dev->mr_table.fmr_mtt_buddy != &dev->mr_table.mtt_buddy ||
+ !(dev->mthca_flags & MTHCA_FLAG_FMR))
+ /*
+ * Be friendly to WRITE_MTT command
+ * and leave two empty slots for the
+ * index and reserved fields of the
+ * mailbox.
+ */
+ return PAGE_SIZE / sizeof (u64) - 2;
+
+ /* For Arbel, all MTTs must fit in the same page. */
+ return mthca_is_memfree(dev) ? (PAGE_SIZE / sizeof (u64)) : 0x7ffffff;
+}
+
+static void mthca_tavor_write_mtt_seg(struct mthca_dev *dev,
+ struct mthca_mtt *mtt, int start_index,
+ u64 *buffer_list, int list_len)
+{
+ u64 __iomem *mtts;
+ int i;
+
+ mtts = dev->mr_table.tavor_fmr.mtt_base + mtt->first_seg * dev->limits.mtt_seg_size +
+ start_index * sizeof (u64);
+ for (i = 0; i < list_len; ++i)
+ mthca_write64_raw(cpu_to_be64(buffer_list[i] | MTHCA_MTT_FLAG_PRESENT),
+ mtts + i);
+}
+
+static void mthca_arbel_write_mtt_seg(struct mthca_dev *dev,
+ struct mthca_mtt *mtt, int start_index,
+ u64 *buffer_list, int list_len)
+{
+ __be64 *mtts;
+ dma_addr_t dma_handle;
+ int i;
+ int s = start_index * sizeof (u64);
+
+ /* For Arbel, all MTTs must fit in the same page. */
+ BUG_ON(s / PAGE_SIZE != (s + list_len * sizeof(u64) - 1) / PAGE_SIZE);
+ /* Require full segments */
+ BUG_ON(s % dev->limits.mtt_seg_size);
+
+ mtts = mthca_table_find(dev->mr_table.mtt_table, mtt->first_seg +
+ s / dev->limits.mtt_seg_size, &dma_handle);
+
+ BUG_ON(!mtts);
+
+ for (i = 0; i < list_len; ++i)
+ mtts[i] = cpu_to_be64(buffer_list[i] | MTHCA_MTT_FLAG_PRESENT);
+
+ dma_sync_single(&dev->pdev->dev, dma_handle, list_len * sizeof (u64), DMA_TO_DEVICE);
+}
+
+int mthca_write_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt,
+ int start_index, u64 *buffer_list, int list_len)
+{
+ int size = mthca_write_mtt_size(dev);
+ int chunk;
+
+ if (dev->mr_table.fmr_mtt_buddy != &dev->mr_table.mtt_buddy ||
+ !(dev->mthca_flags & MTHCA_FLAG_FMR))
+ return __mthca_write_mtt(dev, mtt, start_index, buffer_list, list_len);
+
+ while (list_len > 0) {
+ chunk = min(size, list_len);
+ if (mthca_is_memfree(dev))
+ mthca_arbel_write_mtt_seg(dev, mtt, start_index,
+ buffer_list, chunk);
+ else
+ mthca_tavor_write_mtt_seg(dev, mtt, start_index,
+ buffer_list, chunk);
+
+ list_len -= chunk;
+ start_index += chunk;
+ buffer_list += chunk;
+ }
+
+ return 0;
+}
+
+static inline u32 tavor_hw_index_to_key(u32 ind)
+{
+ return ind;
+}
+
+static inline u32 tavor_key_to_hw_index(u32 key)
+{
+ return key;
+}
+
+static inline u32 arbel_hw_index_to_key(u32 ind)
+{
+ return (ind >> 24) | (ind << 8);
+}
+
+static inline u32 arbel_key_to_hw_index(u32 key)
+{
+ return (key << 24) | (key >> 8);
+}
+
+static inline u32 hw_index_to_key(struct mthca_dev *dev, u32 ind)
+{
+ if (mthca_is_memfree(dev))
+ return arbel_hw_index_to_key(ind);
+ else
+ return tavor_hw_index_to_key(ind);
+}
+
+static inline u32 key_to_hw_index(struct mthca_dev *dev, u32 key)
+{
+ if (mthca_is_memfree(dev))
+ return arbel_key_to_hw_index(key);
+ else
+ return tavor_key_to_hw_index(key);
+}
+
+static inline u32 adjust_key(struct mthca_dev *dev, u32 key)
+{
+ if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT)
+ return ((key << 20) & 0x800000) | (key & 0x7fffff);
+ else
+ return key;
+}
+
+int mthca_mr_alloc(struct mthca_dev *dev, u32 pd, int buffer_size_shift,
+ u64 iova, u64 total_size, u32 access, struct mthca_mr *mr)
+{
+ struct mthca_mailbox *mailbox;
+ struct mthca_mpt_entry *mpt_entry;
+ u32 key;
+ int i;
+ int err;
+ u8 status;
+
+ WARN_ON(buffer_size_shift >= 32);
+
+ key = mthca_alloc(&dev->mr_table.mpt_alloc);
+ if (key == -1)
+ return -ENOMEM;
+ key = adjust_key(dev, key);
+ mr->ibmr.rkey = mr->ibmr.lkey = hw_index_to_key(dev, key);
+
+ if (mthca_is_memfree(dev)) {
+ err = mthca_table_get(dev, dev->mr_table.mpt_table, key);
+ if (err)
+ goto err_out_mpt_free;
+ }
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox)) {
+ err = PTR_ERR(mailbox);
+ goto err_out_table;
+ }
+ mpt_entry = mailbox->buf;
+
+ mpt_entry->flags = cpu_to_be32(MTHCA_MPT_FLAG_SW_OWNS |
+ MTHCA_MPT_FLAG_MIO |
+ MTHCA_MPT_FLAG_REGION |
+ access);
+ if (!mr->mtt)
+ mpt_entry->flags |= cpu_to_be32(MTHCA_MPT_FLAG_PHYSICAL);
+
+ mpt_entry->page_size = cpu_to_be32(buffer_size_shift - 12);
+ mpt_entry->key = cpu_to_be32(key);
+ mpt_entry->pd = cpu_to_be32(pd);
+ mpt_entry->start = cpu_to_be64(iova);
+ mpt_entry->length = cpu_to_be64(total_size);
+
+ memset(&mpt_entry->lkey, 0,
+ sizeof *mpt_entry - offsetof(struct mthca_mpt_entry, lkey));
+
+ if (mr->mtt)
+ mpt_entry->mtt_seg =
+ cpu_to_be64(dev->mr_table.mtt_base +
+ mr->mtt->first_seg * dev->limits.mtt_seg_size);
+
+ if (0) {
+ mthca_dbg(dev, "Dumping MPT entry %08x:\n", mr->ibmr.lkey);
+ for (i = 0; i < sizeof (struct mthca_mpt_entry) / 4; ++i) {
+ if (i % 4 == 0)
+ printk("[%02x] ", i * 4);
+ printk(" %08x", be32_to_cpu(((__be32 *) mpt_entry)[i]));
+ if ((i + 1) % 4 == 0)
+ printk("\n");
+ }
+ }
+
+ err = mthca_SW2HW_MPT(dev, mailbox,
+ key & (dev->limits.num_mpts - 1),
+ &status);
+ if (err) {
+ mthca_warn(dev, "SW2HW_MPT failed (%d)\n", err);
+ goto err_out_mailbox;
+ } else if (status) {
+ mthca_warn(dev, "SW2HW_MPT returned status 0x%02x\n",
+ status);
+ err = -EINVAL;
+ goto err_out_mailbox;
+ }
+
+ mthca_free_mailbox(dev, mailbox);
+ return err;
+
+err_out_mailbox:
+ mthca_free_mailbox(dev, mailbox);
+
+err_out_table:
+ mthca_table_put(dev, dev->mr_table.mpt_table, key);
+
+err_out_mpt_free:
+ mthca_free(&dev->mr_table.mpt_alloc, key);
+ return err;
+}
+
+int mthca_mr_alloc_notrans(struct mthca_dev *dev, u32 pd,
+ u32 access, struct mthca_mr *mr)
+{
+ mr->mtt = NULL;
+ return mthca_mr_alloc(dev, pd, 12, 0, ~0ULL, access, mr);
+}
+
+int mthca_mr_alloc_phys(struct mthca_dev *dev, u32 pd,
+ u64 *buffer_list, int buffer_size_shift,
+ int list_len, u64 iova, u64 total_size,
+ u32 access, struct mthca_mr *mr)
+{
+ int err;
+
+ mr->mtt = mthca_alloc_mtt(dev, list_len);
+ if (IS_ERR(mr->mtt))
+ return PTR_ERR(mr->mtt);
+
+ err = mthca_write_mtt(dev, mr->mtt, 0, buffer_list, list_len);
+ if (err) {
+ mthca_free_mtt(dev, mr->mtt);
+ return err;
+ }
+
+ err = mthca_mr_alloc(dev, pd, buffer_size_shift, iova,
+ total_size, access, mr);
+ if (err)
+ mthca_free_mtt(dev, mr->mtt);
+
+ return err;
+}
+
+/* Free mr or fmr */
+static void mthca_free_region(struct mthca_dev *dev, u32 lkey)
+{
+ mthca_table_put(dev, dev->mr_table.mpt_table,
+ key_to_hw_index(dev, lkey));
+
+ mthca_free(&dev->mr_table.mpt_alloc, key_to_hw_index(dev, lkey));
+}
+
+void mthca_free_mr(struct mthca_dev *dev, struct mthca_mr *mr)
+{
+ int err;
+ u8 status;
+
+ err = mthca_HW2SW_MPT(dev, NULL,
+ key_to_hw_index(dev, mr->ibmr.lkey) &
+ (dev->limits.num_mpts - 1),
+ &status);
+ if (err)
+ mthca_warn(dev, "HW2SW_MPT failed (%d)\n", err);
+ else if (status)
+ mthca_warn(dev, "HW2SW_MPT returned status 0x%02x\n",
+ status);
+
+ mthca_free_region(dev, mr->ibmr.lkey);
+ mthca_free_mtt(dev, mr->mtt);
+}
+
+int mthca_fmr_alloc(struct mthca_dev *dev, u32 pd,
+ u32 access, struct mthca_fmr *mr)
+{
+ struct mthca_mpt_entry *mpt_entry;
+ struct mthca_mailbox *mailbox;
+ u64 mtt_seg;
+ u32 key, idx;
+ u8 status;
+ int list_len = mr->attr.max_pages;
+ int err = -ENOMEM;
+ int i;
+
+ if (mr->attr.page_shift < 12 || mr->attr.page_shift >= 32)
+ return -EINVAL;
+
+ /* For Arbel, all MTTs must fit in the same page. */
+ if (mthca_is_memfree(dev) &&
+ mr->attr.max_pages * sizeof *mr->mem.arbel.mtts > PAGE_SIZE)
+ return -EINVAL;
+
+ mr->maps = 0;
+
+ key = mthca_alloc(&dev->mr_table.mpt_alloc);
+ if (key == -1)
+ return -ENOMEM;
+ key = adjust_key(dev, key);
+
+ idx = key & (dev->limits.num_mpts - 1);
+ mr->ibmr.rkey = mr->ibmr.lkey = hw_index_to_key(dev, key);
+
+ if (mthca_is_memfree(dev)) {
+ err = mthca_table_get(dev, dev->mr_table.mpt_table, key);
+ if (err)
+ goto err_out_mpt_free;
+
+ mr->mem.arbel.mpt = mthca_table_find(dev->mr_table.mpt_table, key, NULL);
+ BUG_ON(!mr->mem.arbel.mpt);
+ } else
+ mr->mem.tavor.mpt = dev->mr_table.tavor_fmr.mpt_base +
+ sizeof *(mr->mem.tavor.mpt) * idx;
+
+ mr->mtt = __mthca_alloc_mtt(dev, list_len, dev->mr_table.fmr_mtt_buddy);
+ if (IS_ERR(mr->mtt)) {
+ err = PTR_ERR(mr->mtt);
+ goto err_out_table;
+ }
+
+ mtt_seg = mr->mtt->first_seg * dev->limits.mtt_seg_size;
+
+ if (mthca_is_memfree(dev)) {
+ mr->mem.arbel.mtts = mthca_table_find(dev->mr_table.mtt_table,
+ mr->mtt->first_seg,
+ &mr->mem.arbel.dma_handle);
+ BUG_ON(!mr->mem.arbel.mtts);
+ } else
+ mr->mem.tavor.mtts = dev->mr_table.tavor_fmr.mtt_base + mtt_seg;
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox)) {
+ err = PTR_ERR(mailbox);
+ goto err_out_free_mtt;
+ }
+
+ mpt_entry = mailbox->buf;
+
+ mpt_entry->flags = cpu_to_be32(MTHCA_MPT_FLAG_SW_OWNS |
+ MTHCA_MPT_FLAG_MIO |
+ MTHCA_MPT_FLAG_REGION |
+ access);
+
+ mpt_entry->page_size = cpu_to_be32(mr->attr.page_shift - 12);
+ mpt_entry->key = cpu_to_be32(key);
+ mpt_entry->pd = cpu_to_be32(pd);
+ memset(&mpt_entry->start, 0,
+ sizeof *mpt_entry - offsetof(struct mthca_mpt_entry, start));
+ mpt_entry->mtt_seg = cpu_to_be64(dev->mr_table.mtt_base + mtt_seg);
+
+ if (0) {
+ mthca_dbg(dev, "Dumping MPT entry %08x:\n", mr->ibmr.lkey);
+ for (i = 0; i < sizeof (struct mthca_mpt_entry) / 4; ++i) {
+ if (i % 4 == 0)
+ printk("[%02x] ", i * 4);
+ printk(" %08x", be32_to_cpu(((__be32 *) mpt_entry)[i]));
+ if ((i + 1) % 4 == 0)
+ printk("\n");
+ }
+ }
+
+ err = mthca_SW2HW_MPT(dev, mailbox,
+ key & (dev->limits.num_mpts - 1),
+ &status);
+ if (err) {
+ mthca_warn(dev, "SW2HW_MPT failed (%d)\n", err);
+ goto err_out_mailbox_free;
+ }
+ if (status) {
+ mthca_warn(dev, "SW2HW_MPT returned status 0x%02x\n",
+ status);
+ err = -EINVAL;
+ goto err_out_mailbox_free;
+ }
+
+ mthca_free_mailbox(dev, mailbox);
+ return 0;
+
+err_out_mailbox_free:
+ mthca_free_mailbox(dev, mailbox);
+
+err_out_free_mtt:
+ mthca_free_mtt(dev, mr->mtt);
+
+err_out_table:
+ mthca_table_put(dev, dev->mr_table.mpt_table, key);
+
+err_out_mpt_free:
+ mthca_free(&dev->mr_table.mpt_alloc, key);
+ return err;
+}
+
+int mthca_free_fmr(struct mthca_dev *dev, struct mthca_fmr *fmr)
+{
+ if (fmr->maps)
+ return -EBUSY;
+
+ mthca_free_region(dev, fmr->ibmr.lkey);
+ mthca_free_mtt(dev, fmr->mtt);
+
+ return 0;
+}
+
+static inline int mthca_check_fmr(struct mthca_fmr *fmr, u64 *page_list,
+ int list_len, u64 iova)
+{
+ int i, page_mask;
+
+ if (list_len > fmr->attr.max_pages)
+ return -EINVAL;
+
+ page_mask = (1 << fmr->attr.page_shift) - 1;
+
+ /* We are getting page lists, so va must be page aligned. */
+ if (iova & page_mask)
+ return -EINVAL;
+
+ /* Trust the user not to pass misaligned data in page_list */
+ if (0)
+ for (i = 0; i < list_len; ++i) {
+ if (page_list[i] & ~page_mask)
+ return -EINVAL;
+ }
+
+ if (fmr->maps >= fmr->attr.max_maps)
+ return -EINVAL;
+
+ return 0;
+}
+
+
+int mthca_tavor_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+ int list_len, u64 iova)
+{
+ struct mthca_fmr *fmr = to_mfmr(ibfmr);
+ struct mthca_dev *dev = to_mdev(ibfmr->device);
+ struct mthca_mpt_entry mpt_entry;
+ u32 key;
+ int i, err;
+
+ err = mthca_check_fmr(fmr, page_list, list_len, iova);
+ if (err)
+ return err;
+
+ ++fmr->maps;
+
+ key = tavor_key_to_hw_index(fmr->ibmr.lkey);
+ key += dev->limits.num_mpts;
+ fmr->ibmr.lkey = fmr->ibmr.rkey = tavor_hw_index_to_key(key);
+
+ writeb(MTHCA_MPT_STATUS_SW, fmr->mem.tavor.mpt);
+
+ for (i = 0; i < list_len; ++i) {
+ __be64 mtt_entry = cpu_to_be64(page_list[i] |
+ MTHCA_MTT_FLAG_PRESENT);
+ mthca_write64_raw(mtt_entry, fmr->mem.tavor.mtts + i);
+ }
+
+ mpt_entry.lkey = cpu_to_be32(key);
+ mpt_entry.length = cpu_to_be64(list_len * (1ull << fmr->attr.page_shift));
+ mpt_entry.start = cpu_to_be64(iova);
+
+ __raw_writel((__force u32) mpt_entry.lkey, &fmr->mem.tavor.mpt->key);
+ memcpy_toio(&fmr->mem.tavor.mpt->start, &mpt_entry.start,
+ offsetof(struct mthca_mpt_entry, window_count) -
+ offsetof(struct mthca_mpt_entry, start));
+
+ writeb(MTHCA_MPT_STATUS_HW, fmr->mem.tavor.mpt);
+
+ return 0;
+}
+
+int mthca_arbel_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+ int list_len, u64 iova)
+{
+ struct mthca_fmr *fmr = to_mfmr(ibfmr);
+ struct mthca_dev *dev = to_mdev(ibfmr->device);
+ u32 key;
+ int i, err;
+
+ err = mthca_check_fmr(fmr, page_list, list_len, iova);
+ if (err)
+ return err;
+
+ ++fmr->maps;
+
+ key = arbel_key_to_hw_index(fmr->ibmr.lkey);
+ if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT)
+ key += SINAI_FMR_KEY_INC;
+ else
+ key += dev->limits.num_mpts;
+ fmr->ibmr.lkey = fmr->ibmr.rkey = arbel_hw_index_to_key(key);
+
+ *(u8 *) fmr->mem.arbel.mpt = MTHCA_MPT_STATUS_SW;
+
+ wmb();
+
+ for (i = 0; i < list_len; ++i)
+ fmr->mem.arbel.mtts[i] = cpu_to_be64(page_list[i] |
+ MTHCA_MTT_FLAG_PRESENT);
+
+ dma_sync_single(&dev->pdev->dev, fmr->mem.arbel.dma_handle,
+ list_len * sizeof(u64), DMA_TO_DEVICE);
+
+ fmr->mem.arbel.mpt->key = cpu_to_be32(key);
+ fmr->mem.arbel.mpt->lkey = cpu_to_be32(key);
+ fmr->mem.arbel.mpt->length = cpu_to_be64(list_len * (1ull << fmr->attr.page_shift));
+ fmr->mem.arbel.mpt->start = cpu_to_be64(iova);
+
+ wmb();
+
+ *(u8 *) fmr->mem.arbel.mpt = MTHCA_MPT_STATUS_HW;
+
+ wmb();
+
+ return 0;
+}
+
+void mthca_tavor_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr)
+{
+ if (!fmr->maps)
+ return;
+
+ fmr->maps = 0;
+
+ writeb(MTHCA_MPT_STATUS_SW, fmr->mem.tavor.mpt);
+}
+
+void mthca_arbel_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr)
+{
+ if (!fmr->maps)
+ return;
+
+ fmr->maps = 0;
+
+ *(u8 *) fmr->mem.arbel.mpt = MTHCA_MPT_STATUS_SW;
+}
+
+int mthca_init_mr_table(struct mthca_dev *dev)
+{
+ unsigned long addr;
+ int mpts, mtts, err, i;
+
+ err = mthca_alloc_init(&dev->mr_table.mpt_alloc,
+ dev->limits.num_mpts,
+ ~0, dev->limits.reserved_mrws);
+ if (err)
+ return err;
+
+ if (!mthca_is_memfree(dev) &&
+ (dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN))
+ dev->limits.fmr_reserved_mtts = 0;
+ else
+ dev->mthca_flags |= MTHCA_FLAG_FMR;
+
+ if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT)
+ mthca_dbg(dev, "Memory key throughput optimization activated.\n");
+
+ err = mthca_buddy_init(&dev->mr_table.mtt_buddy,
+ fls(dev->limits.num_mtt_segs - 1));
+
+ if (err)
+ goto err_mtt_buddy;
+
+ dev->mr_table.tavor_fmr.mpt_base = NULL;
+ dev->mr_table.tavor_fmr.mtt_base = NULL;
+
+ if (dev->limits.fmr_reserved_mtts) {
+ i = fls(dev->limits.fmr_reserved_mtts - 1);
+
+ if (i >= 31) {
+ mthca_warn(dev, "Unable to reserve 2^31 FMR MTTs.\n");
+ err = -EINVAL;
+ goto err_fmr_mpt;
+ }
+ mpts = mtts = 1 << i;
+ } else {
+ mtts = dev->limits.num_mtt_segs;
+ mpts = dev->limits.num_mpts;
+ }
+
+ if (!mthca_is_memfree(dev) &&
+ (dev->mthca_flags & MTHCA_FLAG_FMR)) {
+
+ addr = pci_resource_start(dev->pdev, 4) +
+ ((pci_resource_len(dev->pdev, 4) - 1) &
+ dev->mr_table.mpt_base);
+
+ dev->mr_table.tavor_fmr.mpt_base =
+ ioremap(addr, mpts * sizeof(struct mthca_mpt_entry));
+
+ if (!dev->mr_table.tavor_fmr.mpt_base) {
+ mthca_warn(dev, "MPT ioremap for FMR failed.\n");
+ err = -ENOMEM;
+ goto err_fmr_mpt;
+ }
+
+ addr = pci_resource_start(dev->pdev, 4) +
+ ((pci_resource_len(dev->pdev, 4) - 1) &
+ dev->mr_table.mtt_base);
+
+ dev->mr_table.tavor_fmr.mtt_base =
+ ioremap(addr, mtts * dev->limits.mtt_seg_size);
+ if (!dev->mr_table.tavor_fmr.mtt_base) {
+ mthca_warn(dev, "MTT ioremap for FMR failed.\n");
+ err = -ENOMEM;
+ goto err_fmr_mtt;
+ }
+ }
+
+ if (dev->limits.fmr_reserved_mtts) {
+ err = mthca_buddy_init(&dev->mr_table.tavor_fmr.mtt_buddy, fls(mtts - 1));
+ if (err)
+ goto err_fmr_mtt_buddy;
+
+ /* Prevent regular MRs from using FMR keys */
+ err = mthca_buddy_alloc(&dev->mr_table.mtt_buddy, fls(mtts - 1));
+ if (err)
+ goto err_reserve_fmr;
+
+ dev->mr_table.fmr_mtt_buddy =
+ &dev->mr_table.tavor_fmr.mtt_buddy;
+ } else
+ dev->mr_table.fmr_mtt_buddy = &dev->mr_table.mtt_buddy;
+
+ /* FMR table is always the first, take reserved MTTs out of there */
+ if (dev->limits.reserved_mtts) {
+ i = fls(dev->limits.reserved_mtts - 1);
+
+ if (mthca_alloc_mtt_range(dev, i,
+ dev->mr_table.fmr_mtt_buddy) == -1) {
+ mthca_warn(dev, "MTT table of order %d is too small.\n",
+ dev->mr_table.fmr_mtt_buddy->max_order);
+ err = -ENOMEM;
+ goto err_reserve_mtts;
+ }
+ }
+
+ return 0;
+
+err_reserve_mtts:
+err_reserve_fmr:
+ if (dev->limits.fmr_reserved_mtts)
+ mthca_buddy_cleanup(&dev->mr_table.tavor_fmr.mtt_buddy);
+
+err_fmr_mtt_buddy:
+ if (dev->mr_table.tavor_fmr.mtt_base)
+ iounmap(dev->mr_table.tavor_fmr.mtt_base);
+
+err_fmr_mtt:
+ if (dev->mr_table.tavor_fmr.mpt_base)
+ iounmap(dev->mr_table.tavor_fmr.mpt_base);
+
+err_fmr_mpt:
+ mthca_buddy_cleanup(&dev->mr_table.mtt_buddy);
+
+err_mtt_buddy:
+ mthca_alloc_cleanup(&dev->mr_table.mpt_alloc);
+
+ return err;
+}
+
+void mthca_cleanup_mr_table(struct mthca_dev *dev)
+{
+ /* XXX check if any MRs are still allocated? */
+ if (dev->limits.fmr_reserved_mtts)
+ mthca_buddy_cleanup(&dev->mr_table.tavor_fmr.mtt_buddy);
+
+ mthca_buddy_cleanup(&dev->mr_table.mtt_buddy);
+
+ if (dev->mr_table.tavor_fmr.mtt_base)
+ iounmap(dev->mr_table.tavor_fmr.mtt_base);
+ if (dev->mr_table.tavor_fmr.mpt_base)
+ iounmap(dev->mr_table.tavor_fmr.mpt_base);
+
+ mthca_alloc_cleanup(&dev->mr_table.mpt_alloc);
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_pd.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_pd.c
new file mode 100644
index 0000000..266f14e
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_pd.c
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+
+#include "mthca_dev.h"
+
+int mthca_pd_alloc(struct mthca_dev *dev, int privileged, struct mthca_pd *pd)
+{
+ int err = 0;
+
+ pd->privileged = privileged;
+
+ atomic_set(&pd->sqp_count, 0);
+ pd->pd_num = mthca_alloc(&dev->pd_table.alloc);
+ if (pd->pd_num == -1)
+ return -ENOMEM;
+
+ if (privileged) {
+ err = mthca_mr_alloc_notrans(dev, pd->pd_num,
+ MTHCA_MPT_FLAG_LOCAL_READ |
+ MTHCA_MPT_FLAG_LOCAL_WRITE,
+ &pd->ntmr);
+ if (err)
+ mthca_free(&dev->pd_table.alloc, pd->pd_num);
+ }
+
+ return err;
+}
+
+void mthca_pd_free(struct mthca_dev *dev, struct mthca_pd *pd)
+{
+ if (pd->privileged)
+ mthca_free_mr(dev, &pd->ntmr);
+ mthca_free(&dev->pd_table.alloc, pd->pd_num);
+}
+
+int mthca_init_pd_table(struct mthca_dev *dev)
+{
+ return mthca_alloc_init(&dev->pd_table.alloc,
+ dev->limits.num_pds,
+ (1 << 24) - 1,
+ dev->limits.reserved_pds);
+}
+
+void mthca_cleanup_pd_table(struct mthca_dev *dev)
+{
+ /* XXX check if any PDs are still allocated? */
+ mthca_alloc_cleanup(&dev->pd_table.alloc);
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_profile.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_profile.c
new file mode 100644
index 0000000..8edb28a
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_profile.c
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+
+#include "mthca_profile.h"
+
+enum {
+ MTHCA_RES_QP,
+ MTHCA_RES_EEC,
+ MTHCA_RES_SRQ,
+ MTHCA_RES_CQ,
+ MTHCA_RES_EQP,
+ MTHCA_RES_EEEC,
+ MTHCA_RES_EQ,
+ MTHCA_RES_RDB,
+ MTHCA_RES_MCG,
+ MTHCA_RES_MPT,
+ MTHCA_RES_MTT,
+ MTHCA_RES_UAR,
+ MTHCA_RES_UDAV,
+ MTHCA_RES_UARC,
+ MTHCA_RES_NUM
+};
+
+enum {
+ MTHCA_NUM_EQS = 32,
+ MTHCA_NUM_PDS = 1 << 15
+};
+
+s64 mthca_make_profile(struct mthca_dev *dev,
+ struct mthca_profile *request,
+ struct mthca_dev_lim *dev_lim,
+ struct mthca_init_hca_param *init_hca)
+{
+ struct mthca_resource {
+ u64 size;
+ u64 start;
+ int type;
+ int num;
+ int log_num;
+ };
+
+ u64 mem_base, mem_avail;
+ s64 total_size = 0;
+ struct mthca_resource *profile;
+ struct mthca_resource tmp;
+ int i, j;
+
+ profile = kzalloc(MTHCA_RES_NUM * sizeof *profile, GFP_KERNEL);
+ if (!profile)
+ return -ENOMEM;
+
+ profile[MTHCA_RES_QP].size = dev_lim->qpc_entry_sz;
+ profile[MTHCA_RES_EEC].size = dev_lim->eec_entry_sz;
+ profile[MTHCA_RES_SRQ].size = dev_lim->srq_entry_sz;
+ profile[MTHCA_RES_CQ].size = dev_lim->cqc_entry_sz;
+ profile[MTHCA_RES_EQP].size = dev_lim->eqpc_entry_sz;
+ profile[MTHCA_RES_EEEC].size = dev_lim->eeec_entry_sz;
+ profile[MTHCA_RES_EQ].size = dev_lim->eqc_entry_sz;
+ profile[MTHCA_RES_RDB].size = MTHCA_RDB_ENTRY_SIZE;
+ profile[MTHCA_RES_MCG].size = MTHCA_MGM_ENTRY_SIZE;
+ profile[MTHCA_RES_MPT].size = dev_lim->mpt_entry_sz;
+ profile[MTHCA_RES_MTT].size = dev->limits.mtt_seg_size;
+ profile[MTHCA_RES_UAR].size = dev_lim->uar_scratch_entry_sz;
+ profile[MTHCA_RES_UDAV].size = MTHCA_AV_SIZE;
+ profile[MTHCA_RES_UARC].size = request->uarc_size;
+
+ profile[MTHCA_RES_QP].num = request->num_qp;
+ profile[MTHCA_RES_SRQ].num = request->num_srq;
+ profile[MTHCA_RES_EQP].num = request->num_qp;
+ profile[MTHCA_RES_RDB].num = request->num_qp * request->rdb_per_qp;
+ profile[MTHCA_RES_CQ].num = request->num_cq;
+ profile[MTHCA_RES_EQ].num = MTHCA_NUM_EQS;
+ profile[MTHCA_RES_MCG].num = request->num_mcg;
+ profile[MTHCA_RES_MPT].num = request->num_mpt;
+ profile[MTHCA_RES_MTT].num = request->num_mtt;
+ profile[MTHCA_RES_UAR].num = request->num_uar;
+ profile[MTHCA_RES_UARC].num = request->num_uar;
+ profile[MTHCA_RES_UDAV].num = request->num_udav;
+
+ for (i = 0; i < MTHCA_RES_NUM; ++i) {
+ profile[i].type = i;
+ profile[i].log_num = max(ffs(profile[i].num) - 1, 0);
+ profile[i].size *= profile[i].num;
+ if (mthca_is_memfree(dev))
+ profile[i].size = max(profile[i].size, (u64) PAGE_SIZE);
+ }
+
+ if (mthca_is_memfree(dev)) {
+ mem_base = 0;
+ mem_avail = dev_lim->hca.arbel.max_icm_sz;
+ } else {
+ mem_base = dev->ddr_start;
+ mem_avail = dev->fw.tavor.fw_start - dev->ddr_start;
+ }
+
+ /*
+ * Sort the resources in decreasing order of size. Since they
+ * all have sizes that are powers of 2, we'll be able to keep
+ * resources aligned to their size and pack them without gaps
+ * using the sorted order.
+ */
+ for (i = MTHCA_RES_NUM; i > 0; --i)
+ for (j = 1; j < i; ++j) {
+ if (profile[j].size > profile[j - 1].size) {
+ tmp = profile[j];
+ profile[j] = profile[j - 1];
+ profile[j - 1] = tmp;
+ }
+ }
+
+ for (i = 0; i < MTHCA_RES_NUM; ++i) {
+ if (profile[i].size) {
+ profile[i].start = mem_base + total_size;
+ total_size += profile[i].size;
+ }
+ if (total_size > mem_avail) {
+ mthca_err(dev, "Profile requires 0x%llx bytes; "
+ "won't fit in 0x%llx bytes of context memory.\n",
+ (unsigned long long) total_size,
+ (unsigned long long) mem_avail);
+ kfree(profile);
+ return -ENOMEM;
+ }
+
+ if (profile[i].size)
+ mthca_dbg(dev, "profile[%2d]--%2d/%2d @ 0x%16llx "
+ "(size 0x%8llx)\n",
+ i, profile[i].type, profile[i].log_num,
+ (unsigned long long) profile[i].start,
+ (unsigned long long) profile[i].size);
+ }
+
+ if (mthca_is_memfree(dev))
+ mthca_dbg(dev, "HCA context memory: reserving %d KB\n",
+ (int) (total_size >> 10));
+ else
+ mthca_dbg(dev, "HCA memory: allocated %d KB/%d KB (%d KB free)\n",
+ (int) (total_size >> 10), (int) (mem_avail >> 10),
+ (int) ((mem_avail - total_size) >> 10));
+
+ for (i = 0; i < MTHCA_RES_NUM; ++i) {
+ switch (profile[i].type) {
+ case MTHCA_RES_QP:
+ dev->limits.num_qps = profile[i].num;
+ init_hca->qpc_base = profile[i].start;
+ init_hca->log_num_qps = profile[i].log_num;
+ break;
+ case MTHCA_RES_EEC:
+ dev->limits.num_eecs = profile[i].num;
+ init_hca->eec_base = profile[i].start;
+ init_hca->log_num_eecs = profile[i].log_num;
+ break;
+ case MTHCA_RES_SRQ:
+ dev->limits.num_srqs = profile[i].num;
+ init_hca->srqc_base = profile[i].start;
+ init_hca->log_num_srqs = profile[i].log_num;
+ break;
+ case MTHCA_RES_CQ:
+ dev->limits.num_cqs = profile[i].num;
+ init_hca->cqc_base = profile[i].start;
+ init_hca->log_num_cqs = profile[i].log_num;
+ break;
+ case MTHCA_RES_EQP:
+ init_hca->eqpc_base = profile[i].start;
+ break;
+ case MTHCA_RES_EEEC:
+ init_hca->eeec_base = profile[i].start;
+ break;
+ case MTHCA_RES_EQ:
+ dev->limits.num_eqs = profile[i].num;
+ init_hca->eqc_base = profile[i].start;
+ init_hca->log_num_eqs = profile[i].log_num;
+ break;
+ case MTHCA_RES_RDB:
+ for (dev->qp_table.rdb_shift = 0;
+ request->num_qp << dev->qp_table.rdb_shift < profile[i].num;
+ ++dev->qp_table.rdb_shift)
+ ; /* nothing */
+ dev->qp_table.rdb_base = (u32) profile[i].start;
+ init_hca->rdb_base = profile[i].start;
+ break;
+ case MTHCA_RES_MCG:
+ dev->limits.num_mgms = profile[i].num >> 1;
+ dev->limits.num_amgms = profile[i].num >> 1;
+ init_hca->mc_base = profile[i].start;
+ init_hca->log_mc_entry_sz = ffs(MTHCA_MGM_ENTRY_SIZE) - 1;
+ init_hca->log_mc_table_sz = profile[i].log_num;
+ init_hca->mc_hash_sz = 1 << (profile[i].log_num - 1);
+ break;
+ case MTHCA_RES_MPT:
+ dev->limits.num_mpts = profile[i].num;
+ dev->mr_table.mpt_base = profile[i].start;
+ init_hca->mpt_base = profile[i].start;
+ init_hca->log_mpt_sz = profile[i].log_num;
+ break;
+ case MTHCA_RES_MTT:
+ dev->limits.num_mtt_segs = profile[i].num;
+ dev->mr_table.mtt_base = profile[i].start;
+ init_hca->mtt_base = profile[i].start;
+ init_hca->mtt_seg_sz = ffs(dev->limits.mtt_seg_size) - 7;
+ break;
+ case MTHCA_RES_UAR:
+ dev->limits.num_uars = profile[i].num;
+ init_hca->uar_scratch_base = profile[i].start;
+ break;
+ case MTHCA_RES_UDAV:
+ dev->av_table.ddr_av_base = profile[i].start;
+ dev->av_table.num_ddr_avs = profile[i].num;
+ break;
+ case MTHCA_RES_UARC:
+ dev->uar_table.uarc_size = request->uarc_size;
+ dev->uar_table.uarc_base = profile[i].start;
+ init_hca->uarc_base = profile[i].start;
+ init_hca->log_uarc_sz = ffs(request->uarc_size) - 13;
+ init_hca->log_uar_sz = ffs(request->num_uar) - 1;
+ break;
+ default:
+ break;
+ }
+ }
+
+ /*
+ * PDs don't take any HCA memory, but we assign them as part
+ * of the HCA profile anyway.
+ */
+ dev->limits.num_pds = MTHCA_NUM_PDS;
+
+ if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT &&
+ init_hca->log_mpt_sz > 23) {
+ mthca_warn(dev, "MPT table too large (requested size 2^%d >= 2^24)\n",
+ init_hca->log_mpt_sz);
+ mthca_warn(dev, "Disabling memory key throughput optimization.\n");
+ dev->mthca_flags &= ~MTHCA_FLAG_SINAI_OPT;
+ }
+
+ /*
+ * For Tavor, FMRs use ioremapped PCI memory. For 32 bit
+ * systems it may use too much vmalloc space to map all MTT
+ * memory, so we reserve some MTTs for FMR access, taking them
+ * out of the MR pool. They don't use additional memory, but
+ * we assign them as part of the HCA profile anyway.
+ */
+ if (mthca_is_memfree(dev) || BITS_PER_LONG == 64)
+ dev->limits.fmr_reserved_mtts = 0;
+ else
+ dev->limits.fmr_reserved_mtts = request->fmr_reserved_mtts;
+
+ kfree(profile);
+ return total_size;
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_profile.h b/sys/ofed/drivers/infiniband/hw/mthca/mthca_profile.h
new file mode 100644
index 0000000..62b009c
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_profile.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MTHCA_PROFILE_H
+#define MTHCA_PROFILE_H
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+struct mthca_profile {
+ int num_qp;
+ int rdb_per_qp;
+ int num_srq;
+ int num_cq;
+ int num_mcg;
+ int num_mpt;
+ int num_mtt;
+ int num_udav;
+ int num_uar;
+ int uarc_size;
+ int fmr_reserved_mtts;
+};
+
+s64 mthca_make_profile(struct mthca_dev *mdev,
+ struct mthca_profile *request,
+ struct mthca_dev_lim *dev_lim,
+ struct mthca_init_hca_param *init_hca);
+
+#endif /* MTHCA_PROFILE_H */
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.c
new file mode 100644
index 0000000..e547739
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.c
@@ -0,0 +1,1427 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_smi.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_user_verbs.h>
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+#include "mthca_user.h"
+#include "mthca_memfree.h"
+
+static void init_query_mad(struct ib_smp *mad)
+{
+ mad->base_version = 1;
+ mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+ mad->class_version = 1;
+ mad->method = IB_MGMT_METHOD_GET;
+}
+
+static int mthca_query_device(struct ib_device *ibdev,
+ struct ib_device_attr *props)
+{
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ int err = -ENOMEM;
+ struct mthca_dev *mdev = to_mdev(ibdev);
+
+ u8 status;
+
+ in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ memset(props, 0, sizeof *props);
+
+ props->fw_ver = mdev->fw_ver;
+
+ init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_NODE_INFO;
+
+ err = mthca_MAD_IFC(mdev, 1, 1,
+ 1, NULL, NULL, in_mad, out_mad,
+ &status);
+ if (err)
+ goto out;
+ if (status) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ props->device_cap_flags = mdev->device_cap_flags;
+ props->vendor_id = be32_to_cpup((__be32 *) (out_mad->data + 36)) &
+ 0xffffff;
+ props->vendor_part_id = be16_to_cpup((__be16 *) (out_mad->data + 30));
+ props->hw_ver = be32_to_cpup((__be32 *) (out_mad->data + 32));
+ memcpy(&props->sys_image_guid, out_mad->data + 4, 8);
+
+ props->max_mr_size = ~0ull;
+ props->page_size_cap = mdev->limits.page_size_cap;
+ props->max_qp = mdev->limits.num_qps - mdev->limits.reserved_qps;
+ props->max_qp_wr = mdev->limits.max_wqes;
+ props->max_sge = mdev->limits.max_sg;
+ props->max_cq = mdev->limits.num_cqs - mdev->limits.reserved_cqs;
+ props->max_cqe = mdev->limits.max_cqes;
+ props->max_mr = mdev->limits.num_mpts - mdev->limits.reserved_mrws;
+ props->max_pd = mdev->limits.num_pds - mdev->limits.reserved_pds;
+ props->max_qp_rd_atom = 1 << mdev->qp_table.rdb_shift;
+ props->max_qp_init_rd_atom = mdev->limits.max_qp_init_rdma;
+ props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp;
+ props->max_srq = mdev->limits.num_srqs - mdev->limits.reserved_srqs;
+ props->max_srq_wr = mdev->limits.max_srq_wqes;
+ props->max_srq_sge = mdev->limits.max_srq_sge;
+ props->local_ca_ack_delay = mdev->limits.local_ca_ack_delay;
+ props->atomic_cap = mdev->limits.flags & DEV_LIM_FLAG_ATOMIC ?
+ IB_ATOMIC_HCA : IB_ATOMIC_NONE;
+ props->max_pkeys = mdev->limits.pkey_table_len;
+ props->max_mcast_grp = mdev->limits.num_mgms + mdev->limits.num_amgms;
+ props->max_mcast_qp_attach = MTHCA_QP_PER_MGM;
+ props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
+ props->max_mcast_grp;
+ /*
+ * If Sinai memory key optimization is being used, then only
+ * the 8-bit key portion will change. For other HCAs, the
+ * unused index bits will also be used for FMR remapping.
+ */
+ if (mdev->mthca_flags & MTHCA_FLAG_SINAI_OPT)
+ props->max_map_per_fmr = 255;
+ else
+ props->max_map_per_fmr =
+ (1 << (32 - ilog2(mdev->limits.num_mpts))) - 1;
+
+ err = 0;
+ out:
+ kfree(in_mad);
+ kfree(out_mad);
+ return err;
+}
+
+static int mthca_query_port(struct ib_device *ibdev,
+ u8 port, struct ib_port_attr *props)
+{
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ int err = -ENOMEM;
+ u8 status;
+
+ in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ memset(props, 0, sizeof *props);
+
+ init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_PORT_INFO;
+ in_mad->attr_mod = cpu_to_be32(port);
+
+ err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1,
+ port, NULL, NULL, in_mad, out_mad,
+ &status);
+ if (err)
+ goto out;
+ if (status) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ props->lid = be16_to_cpup((__be16 *) (out_mad->data + 16));
+ props->lmc = out_mad->data[34] & 0x7;
+ props->sm_lid = be16_to_cpup((__be16 *) (out_mad->data + 18));
+ props->sm_sl = out_mad->data[36] & 0xf;
+ props->state = out_mad->data[32] & 0xf;
+ props->phys_state = out_mad->data[33] >> 4;
+ props->port_cap_flags = be32_to_cpup((__be32 *) (out_mad->data + 20));
+ props->gid_tbl_len = to_mdev(ibdev)->limits.gid_table_len;
+ props->max_msg_sz = 0x80000000;
+ props->pkey_tbl_len = to_mdev(ibdev)->limits.pkey_table_len;
+ props->bad_pkey_cntr = be16_to_cpup((__be16 *) (out_mad->data + 46));
+ props->qkey_viol_cntr = be16_to_cpup((__be16 *) (out_mad->data + 48));
+ props->active_width = out_mad->data[31] & 0xf;
+ props->active_speed = out_mad->data[35] >> 4;
+ props->max_mtu = out_mad->data[41] & 0xf;
+ props->active_mtu = out_mad->data[36] >> 4;
+ props->subnet_timeout = out_mad->data[51] & 0x1f;
+ props->max_vl_num = out_mad->data[37] >> 4;
+ props->init_type_reply = out_mad->data[41] >> 4;
+
+ out:
+ kfree(in_mad);
+ kfree(out_mad);
+ return err;
+}
+
+static int mthca_modify_device(struct ib_device *ibdev,
+ int mask,
+ struct ib_device_modify *props)
+{
+ if (mask & ~IB_DEVICE_MODIFY_NODE_DESC)
+ return -EOPNOTSUPP;
+
+ if (mask & IB_DEVICE_MODIFY_NODE_DESC) {
+ if (mutex_lock_interruptible(&to_mdev(ibdev)->cap_mask_mutex))
+ return -ERESTARTSYS;
+ memcpy(ibdev->node_desc, props->node_desc, 64);
+ mutex_unlock(&to_mdev(ibdev)->cap_mask_mutex);
+ }
+
+ return 0;
+}
+
+static int mthca_modify_port(struct ib_device *ibdev,
+ u8 port, int port_modify_mask,
+ struct ib_port_modify *props)
+{
+ struct mthca_set_ib_param set_ib;
+ struct ib_port_attr attr;
+ int err;
+ u8 status;
+
+ if (mutex_lock_interruptible(&to_mdev(ibdev)->cap_mask_mutex))
+ return -ERESTARTSYS;
+
+ err = mthca_query_port(ibdev, port, &attr);
+ if (err)
+ goto out;
+
+ set_ib.set_si_guid = 0;
+ set_ib.reset_qkey_viol = !!(port_modify_mask & IB_PORT_RESET_QKEY_CNTR);
+
+ set_ib.cap_mask = (attr.port_cap_flags | props->set_port_cap_mask) &
+ ~props->clr_port_cap_mask;
+
+ err = mthca_SET_IB(to_mdev(ibdev), &set_ib, port, &status);
+ if (err)
+ goto out;
+ if (status) {
+ err = -EINVAL;
+ goto out;
+ }
+
+out:
+ mutex_unlock(&to_mdev(ibdev)->cap_mask_mutex);
+ return err;
+}
+
+static int mthca_query_pkey(struct ib_device *ibdev,
+ u8 port, u16 index, u16 *pkey)
+{
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ int err = -ENOMEM;
+ u8 status;
+
+ in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_PKEY_TABLE;
+ in_mad->attr_mod = cpu_to_be32(index / 32);
+
+ err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1,
+ port, NULL, NULL, in_mad, out_mad,
+ &status);
+ if (err)
+ goto out;
+ if (status) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ *pkey = be16_to_cpu(((__be16 *) out_mad->data)[index % 32]);
+
+ out:
+ kfree(in_mad);
+ kfree(out_mad);
+ return err;
+}
+
+static int mthca_query_gid(struct ib_device *ibdev, u8 port,
+ int index, union ib_gid *gid)
+{
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ int err = -ENOMEM;
+ u8 status;
+
+ in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_PORT_INFO;
+ in_mad->attr_mod = cpu_to_be32(port);
+
+ err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1,
+ port, NULL, NULL, in_mad, out_mad,
+ &status);
+ if (err)
+ goto out;
+ if (status) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ memcpy(gid->raw, out_mad->data + 8, 8);
+
+ init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_GUID_INFO;
+ in_mad->attr_mod = cpu_to_be32(index / 8);
+
+ err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1,
+ port, NULL, NULL, in_mad, out_mad,
+ &status);
+ if (err)
+ goto out;
+ if (status) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8);
+
+ out:
+ kfree(in_mad);
+ kfree(out_mad);
+ return err;
+}
+
+static struct ib_ucontext *mthca_alloc_ucontext(struct ib_device *ibdev,
+ struct ib_udata *udata)
+{
+ struct mthca_alloc_ucontext_resp uresp;
+ struct mthca_ucontext *context;
+ int err;
+
+ if (!(to_mdev(ibdev)->active))
+ return ERR_PTR(-EAGAIN);
+
+ memset(&uresp, 0, sizeof uresp);
+
+ uresp.qp_tab_size = to_mdev(ibdev)->limits.num_qps;
+ if (mthca_is_memfree(to_mdev(ibdev)))
+ uresp.uarc_size = to_mdev(ibdev)->uar_table.uarc_size;
+ else
+ uresp.uarc_size = 0;
+
+ context = kmalloc(sizeof *context, GFP_KERNEL);
+ if (!context)
+ return ERR_PTR(-ENOMEM);
+
+ err = mthca_uar_alloc(to_mdev(ibdev), &context->uar);
+ if (err) {
+ kfree(context);
+ return ERR_PTR(err);
+ }
+
+ context->db_tab = mthca_init_user_db_tab(to_mdev(ibdev));
+ if (IS_ERR(context->db_tab)) {
+ err = PTR_ERR(context->db_tab);
+ mthca_uar_free(to_mdev(ibdev), &context->uar);
+ kfree(context);
+ return ERR_PTR(err);
+ }
+
+ if (ib_copy_to_udata(udata, &uresp, sizeof uresp)) {
+ mthca_cleanup_user_db_tab(to_mdev(ibdev), &context->uar, context->db_tab);
+ mthca_uar_free(to_mdev(ibdev), &context->uar);
+ kfree(context);
+ return ERR_PTR(-EFAULT);
+ }
+
+ context->reg_mr_warned = 0;
+
+ return &context->ibucontext;
+}
+
+static int mthca_dealloc_ucontext(struct ib_ucontext *context)
+{
+ mthca_cleanup_user_db_tab(to_mdev(context->device), &to_mucontext(context)->uar,
+ to_mucontext(context)->db_tab);
+ mthca_uar_free(to_mdev(context->device), &to_mucontext(context)->uar);
+ kfree(to_mucontext(context));
+
+ return 0;
+}
+
+static int mthca_mmap_uar(struct ib_ucontext *context,
+ struct vm_area_struct *vma)
+{
+ if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+ return -EINVAL;
+
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+ if (io_remap_pfn_range(vma, vma->vm_start,
+ to_mucontext(context)->uar.pfn,
+ PAGE_SIZE, vma->vm_page_prot))
+ return -EAGAIN;
+
+ return 0;
+}
+
+static struct ib_pd *mthca_alloc_pd(struct ib_device *ibdev,
+ struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ struct mthca_pd *pd;
+ int err;
+
+ pd = kmalloc(sizeof *pd, GFP_KERNEL);
+ if (!pd)
+ return ERR_PTR(-ENOMEM);
+
+ err = mthca_pd_alloc(to_mdev(ibdev), !context, pd);
+ if (err) {
+ kfree(pd);
+ return ERR_PTR(err);
+ }
+
+ if (context) {
+ if (ib_copy_to_udata(udata, &pd->pd_num, sizeof (__u32))) {
+ mthca_pd_free(to_mdev(ibdev), pd);
+ kfree(pd);
+ return ERR_PTR(-EFAULT);
+ }
+ }
+
+ return &pd->ibpd;
+}
+
+static int mthca_dealloc_pd(struct ib_pd *pd)
+{
+ mthca_pd_free(to_mdev(pd->device), to_mpd(pd));
+ kfree(pd);
+
+ return 0;
+}
+
+static struct ib_ah *mthca_ah_create(struct ib_pd *pd,
+ struct ib_ah_attr *ah_attr)
+{
+ int err;
+ struct mthca_ah *ah;
+
+ ah = kmalloc(sizeof *ah, GFP_ATOMIC);
+ if (!ah)
+ return ERR_PTR(-ENOMEM);
+
+ err = mthca_create_ah(to_mdev(pd->device), to_mpd(pd), ah_attr, ah);
+ if (err) {
+ kfree(ah);
+ return ERR_PTR(err);
+ }
+
+ return &ah->ibah;
+}
+
+static int mthca_ah_destroy(struct ib_ah *ah)
+{
+ mthca_destroy_ah(to_mdev(ah->device), to_mah(ah));
+ kfree(ah);
+
+ return 0;
+}
+
+static struct ib_srq *mthca_create_srq(struct ib_pd *pd,
+ struct ib_srq_init_attr *init_attr,
+ struct ib_udata *udata)
+{
+ struct mthca_create_srq ucmd;
+ struct mthca_ucontext *context = NULL;
+ struct mthca_srq *srq;
+ int err;
+
+ srq = kmalloc(sizeof *srq, GFP_KERNEL);
+ if (!srq)
+ return ERR_PTR(-ENOMEM);
+
+ if (pd->uobject) {
+ context = to_mucontext(pd->uobject->context);
+
+ if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
+ err = -EFAULT;
+ goto err_free;
+ }
+
+ err = mthca_map_user_db(to_mdev(pd->device), &context->uar,
+ context->db_tab, ucmd.db_index,
+ ucmd.db_page);
+
+ if (err)
+ goto err_free;
+
+ srq->mr.ibmr.lkey = ucmd.lkey;
+ srq->db_index = ucmd.db_index;
+ }
+
+ err = mthca_alloc_srq(to_mdev(pd->device), to_mpd(pd),
+ &init_attr->attr, srq);
+
+ if (err && pd->uobject)
+ mthca_unmap_user_db(to_mdev(pd->device), &context->uar,
+ context->db_tab, ucmd.db_index);
+
+ if (err)
+ goto err_free;
+
+ if (context && ib_copy_to_udata(udata, &srq->srqn, sizeof (__u32))) {
+ mthca_free_srq(to_mdev(pd->device), srq);
+ err = -EFAULT;
+ goto err_free;
+ }
+
+ return &srq->ibsrq;
+
+err_free:
+ kfree(srq);
+
+ return ERR_PTR(err);
+}
+
+static int mthca_destroy_srq(struct ib_srq *srq)
+{
+ struct mthca_ucontext *context;
+
+ if (srq->uobject) {
+ context = to_mucontext(srq->uobject->context);
+
+ mthca_unmap_user_db(to_mdev(srq->device), &context->uar,
+ context->db_tab, to_msrq(srq)->db_index);
+ }
+
+ mthca_free_srq(to_mdev(srq->device), to_msrq(srq));
+ kfree(srq);
+
+ return 0;
+}
+
+static struct ib_qp *mthca_create_qp(struct ib_pd *pd,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata)
+{
+ struct mthca_create_qp ucmd;
+ struct mthca_qp *qp;
+ int err;
+
+ if (init_attr->create_flags)
+ return ERR_PTR(-EINVAL);
+
+ switch (init_attr->qp_type) {
+ case IB_QPT_RC:
+ case IB_QPT_UC:
+ case IB_QPT_UD:
+ {
+ struct mthca_ucontext *context;
+
+ qp = kmalloc(sizeof *qp, GFP_KERNEL);
+ if (!qp)
+ return ERR_PTR(-ENOMEM);
+
+ if (pd->uobject) {
+ context = to_mucontext(pd->uobject->context);
+
+ if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
+ kfree(qp);
+ return ERR_PTR(-EFAULT);
+ }
+
+ err = mthca_map_user_db(to_mdev(pd->device), &context->uar,
+ context->db_tab,
+ ucmd.sq_db_index, ucmd.sq_db_page);
+ if (err) {
+ kfree(qp);
+ return ERR_PTR(err);
+ }
+
+ err = mthca_map_user_db(to_mdev(pd->device), &context->uar,
+ context->db_tab,
+ ucmd.rq_db_index, ucmd.rq_db_page);
+ if (err) {
+ mthca_unmap_user_db(to_mdev(pd->device),
+ &context->uar,
+ context->db_tab,
+ ucmd.sq_db_index);
+ kfree(qp);
+ return ERR_PTR(err);
+ }
+
+ qp->mr.ibmr.lkey = ucmd.lkey;
+ qp->sq.db_index = ucmd.sq_db_index;
+ qp->rq.db_index = ucmd.rq_db_index;
+ }
+
+ err = mthca_alloc_qp(to_mdev(pd->device), to_mpd(pd),
+ to_mcq(init_attr->send_cq),
+ to_mcq(init_attr->recv_cq),
+ init_attr->qp_type, init_attr->sq_sig_type,
+ &init_attr->cap, qp);
+
+ if (err && pd->uobject) {
+ context = to_mucontext(pd->uobject->context);
+
+ mthca_unmap_user_db(to_mdev(pd->device),
+ &context->uar,
+ context->db_tab,
+ ucmd.sq_db_index);
+ mthca_unmap_user_db(to_mdev(pd->device),
+ &context->uar,
+ context->db_tab,
+ ucmd.rq_db_index);
+ }
+
+ qp->ibqp.qp_num = qp->qpn;
+ break;
+ }
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ {
+ /* Don't allow userspace to create special QPs */
+ if (pd->uobject)
+ return ERR_PTR(-EINVAL);
+
+ qp = kmalloc(sizeof (struct mthca_sqp), GFP_KERNEL);
+ if (!qp)
+ return ERR_PTR(-ENOMEM);
+
+ qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 : 1;
+
+ err = mthca_alloc_sqp(to_mdev(pd->device), to_mpd(pd),
+ to_mcq(init_attr->send_cq),
+ to_mcq(init_attr->recv_cq),
+ init_attr->sq_sig_type, &init_attr->cap,
+ qp->ibqp.qp_num, init_attr->port_num,
+ to_msqp(qp));
+ break;
+ }
+ default:
+ /* Don't support raw QPs */
+ return ERR_PTR(-ENOSYS);
+ }
+
+ if (err) {
+ kfree(qp);
+ return ERR_PTR(err);
+ }
+
+ init_attr->cap.max_send_wr = qp->sq.max;
+ init_attr->cap.max_recv_wr = qp->rq.max;
+ init_attr->cap.max_send_sge = qp->sq.max_gs;
+ init_attr->cap.max_recv_sge = qp->rq.max_gs;
+ init_attr->cap.max_inline_data = qp->max_inline_data;
+
+ return &qp->ibqp;
+}
+
+static int mthca_destroy_qp(struct ib_qp *qp)
+{
+ if (qp->uobject) {
+ mthca_unmap_user_db(to_mdev(qp->device),
+ &to_mucontext(qp->uobject->context)->uar,
+ to_mucontext(qp->uobject->context)->db_tab,
+ to_mqp(qp)->sq.db_index);
+ mthca_unmap_user_db(to_mdev(qp->device),
+ &to_mucontext(qp->uobject->context)->uar,
+ to_mucontext(qp->uobject->context)->db_tab,
+ to_mqp(qp)->rq.db_index);
+ }
+ mthca_free_qp(to_mdev(qp->device), to_mqp(qp));
+ kfree(qp);
+ return 0;
+}
+
+static struct ib_cq *mthca_create_cq(struct ib_device *ibdev, int entries,
+ int comp_vector,
+ struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ struct mthca_create_cq ucmd;
+ struct mthca_cq *cq;
+ int nent;
+ int err;
+
+ if (entries < 1 || entries > to_mdev(ibdev)->limits.max_cqes)
+ return ERR_PTR(-EINVAL);
+
+ if (context) {
+ if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd))
+ return ERR_PTR(-EFAULT);
+
+ err = mthca_map_user_db(to_mdev(ibdev), &to_mucontext(context)->uar,
+ to_mucontext(context)->db_tab,
+ ucmd.set_db_index, ucmd.set_db_page);
+ if (err)
+ return ERR_PTR(err);
+
+ err = mthca_map_user_db(to_mdev(ibdev), &to_mucontext(context)->uar,
+ to_mucontext(context)->db_tab,
+ ucmd.arm_db_index, ucmd.arm_db_page);
+ if (err)
+ goto err_unmap_set;
+ }
+
+ cq = kmalloc(sizeof *cq, GFP_KERNEL);
+ if (!cq) {
+ err = -ENOMEM;
+ goto err_unmap_arm;
+ }
+
+ if (context) {
+ cq->buf.mr.ibmr.lkey = ucmd.lkey;
+ cq->set_ci_db_index = ucmd.set_db_index;
+ cq->arm_db_index = ucmd.arm_db_index;
+ }
+
+ for (nent = 1; nent <= entries; nent <<= 1)
+ ; /* nothing */
+
+ err = mthca_init_cq(to_mdev(ibdev), nent,
+ context ? to_mucontext(context) : NULL,
+ context ? ucmd.pdn : to_mdev(ibdev)->driver_pd.pd_num,
+ cq);
+ if (err)
+ goto err_free;
+
+ if (context && ib_copy_to_udata(udata, &cq->cqn, sizeof (__u32))) {
+ mthca_free_cq(to_mdev(ibdev), cq);
+ goto err_free;
+ }
+
+ cq->resize_buf = NULL;
+
+ return &cq->ibcq;
+
+err_free:
+ kfree(cq);
+
+err_unmap_arm:
+ if (context)
+ mthca_unmap_user_db(to_mdev(ibdev), &to_mucontext(context)->uar,
+ to_mucontext(context)->db_tab, ucmd.arm_db_index);
+
+err_unmap_set:
+ if (context)
+ mthca_unmap_user_db(to_mdev(ibdev), &to_mucontext(context)->uar,
+ to_mucontext(context)->db_tab, ucmd.set_db_index);
+
+ return ERR_PTR(err);
+}
+
+static int mthca_alloc_resize_buf(struct mthca_dev *dev, struct mthca_cq *cq,
+ int entries)
+{
+ int ret;
+
+ spin_lock_irq(&cq->lock);
+ if (cq->resize_buf) {
+ ret = -EBUSY;
+ goto unlock;
+ }
+
+ cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_ATOMIC);
+ if (!cq->resize_buf) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+
+ cq->resize_buf->state = CQ_RESIZE_ALLOC;
+
+ ret = 0;
+
+unlock:
+ spin_unlock_irq(&cq->lock);
+
+ if (ret)
+ return ret;
+
+ ret = mthca_alloc_cq_buf(dev, &cq->resize_buf->buf, entries);
+ if (ret) {
+ spin_lock_irq(&cq->lock);
+ kfree(cq->resize_buf);
+ cq->resize_buf = NULL;
+ spin_unlock_irq(&cq->lock);
+ return ret;
+ }
+
+ cq->resize_buf->cqe = entries - 1;
+
+ spin_lock_irq(&cq->lock);
+ cq->resize_buf->state = CQ_RESIZE_READY;
+ spin_unlock_irq(&cq->lock);
+
+ return 0;
+}
+
+static int mthca_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata)
+{
+ struct mthca_dev *dev = to_mdev(ibcq->device);
+ struct mthca_cq *cq = to_mcq(ibcq);
+ struct mthca_resize_cq ucmd;
+ u32 lkey;
+ u8 status;
+ int ret;
+
+ if (entries < 1 || entries > dev->limits.max_cqes)
+ return -EINVAL;
+
+ mutex_lock(&cq->mutex);
+
+ entries = roundup_pow_of_two(entries + 1);
+ if (entries == ibcq->cqe + 1) {
+ ret = 0;
+ goto out;
+ }
+
+ if (cq->is_kernel) {
+ ret = mthca_alloc_resize_buf(dev, cq, entries);
+ if (ret)
+ goto out;
+ lkey = cq->resize_buf->buf.mr.ibmr.lkey;
+ } else {
+ if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ lkey = ucmd.lkey;
+ }
+
+ ret = mthca_RESIZE_CQ(dev, cq->cqn, lkey, ilog2(entries), &status);
+ if (status)
+ ret = -EINVAL;
+
+ if (ret) {
+ if (cq->resize_buf) {
+ mthca_free_cq_buf(dev, &cq->resize_buf->buf,
+ cq->resize_buf->cqe);
+ kfree(cq->resize_buf);
+ spin_lock_irq(&cq->lock);
+ cq->resize_buf = NULL;
+ spin_unlock_irq(&cq->lock);
+ }
+ goto out;
+ }
+
+ if (cq->is_kernel) {
+ struct mthca_cq_buf tbuf;
+ int tcqe;
+
+ spin_lock_irq(&cq->lock);
+ if (cq->resize_buf->state == CQ_RESIZE_READY) {
+ mthca_cq_resize_copy_cqes(cq);
+ tbuf = cq->buf;
+ tcqe = cq->ibcq.cqe;
+ cq->buf = cq->resize_buf->buf;
+ cq->ibcq.cqe = cq->resize_buf->cqe;
+ } else {
+ tbuf = cq->resize_buf->buf;
+ tcqe = cq->resize_buf->cqe;
+ }
+
+ kfree(cq->resize_buf);
+ cq->resize_buf = NULL;
+ spin_unlock_irq(&cq->lock);
+
+ mthca_free_cq_buf(dev, &tbuf, tcqe);
+ } else
+ ibcq->cqe = entries - 1;
+
+out:
+ mutex_unlock(&cq->mutex);
+
+ return ret;
+}
+
+static int mthca_destroy_cq(struct ib_cq *cq)
+{
+ if (cq->uobject) {
+ mthca_unmap_user_db(to_mdev(cq->device),
+ &to_mucontext(cq->uobject->context)->uar,
+ to_mucontext(cq->uobject->context)->db_tab,
+ to_mcq(cq)->arm_db_index);
+ mthca_unmap_user_db(to_mdev(cq->device),
+ &to_mucontext(cq->uobject->context)->uar,
+ to_mucontext(cq->uobject->context)->db_tab,
+ to_mcq(cq)->set_ci_db_index);
+ }
+ mthca_free_cq(to_mdev(cq->device), to_mcq(cq));
+ kfree(cq);
+
+ return 0;
+}
+
+static inline u32 convert_access(int acc)
+{
+ return (acc & IB_ACCESS_REMOTE_ATOMIC ? MTHCA_MPT_FLAG_ATOMIC : 0) |
+ (acc & IB_ACCESS_REMOTE_WRITE ? MTHCA_MPT_FLAG_REMOTE_WRITE : 0) |
+ (acc & IB_ACCESS_REMOTE_READ ? MTHCA_MPT_FLAG_REMOTE_READ : 0) |
+ (acc & IB_ACCESS_LOCAL_WRITE ? MTHCA_MPT_FLAG_LOCAL_WRITE : 0) |
+ MTHCA_MPT_FLAG_LOCAL_READ;
+}
+
+static struct ib_mr *mthca_get_dma_mr(struct ib_pd *pd, int acc)
+{
+ struct mthca_mr *mr;
+ int err;
+
+ mr = kmalloc(sizeof *mr, GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+ err = mthca_mr_alloc_notrans(to_mdev(pd->device),
+ to_mpd(pd)->pd_num,
+ convert_access(acc), mr);
+
+ if (err) {
+ kfree(mr);
+ return ERR_PTR(err);
+ }
+
+ mr->umem = NULL;
+
+ return &mr->ibmr;
+}
+
+static struct ib_mr *mthca_reg_phys_mr(struct ib_pd *pd,
+ struct ib_phys_buf *buffer_list,
+ int num_phys_buf,
+ int acc,
+ u64 *iova_start)
+{
+ struct mthca_mr *mr;
+ u64 *page_list;
+ u64 total_size;
+ unsigned long mask;
+ int shift;
+ int npages;
+ int err;
+ int i, j, n;
+
+ mask = buffer_list[0].addr ^ *iova_start;
+ total_size = 0;
+ for (i = 0; i < num_phys_buf; ++i) {
+ if (i != 0)
+ mask |= buffer_list[i].addr;
+ if (i != num_phys_buf - 1)
+ mask |= buffer_list[i].addr + buffer_list[i].size;
+
+ total_size += buffer_list[i].size;
+ }
+
+ if (mask & ~PAGE_MASK)
+ return ERR_PTR(-EINVAL);
+
+ shift = __ffs(mask | 1 << 31);
+
+ buffer_list[0].size += buffer_list[0].addr & ((1ULL << shift) - 1);
+ buffer_list[0].addr &= ~0ull << shift;
+
+ mr = kmalloc(sizeof *mr, GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+ npages = 0;
+ for (i = 0; i < num_phys_buf; ++i)
+ npages += (buffer_list[i].size + (1ULL << shift) - 1) >> shift;
+
+ if (!npages)
+ return &mr->ibmr;
+
+ page_list = kmalloc(npages * sizeof *page_list, GFP_KERNEL);
+ if (!page_list) {
+ kfree(mr);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ n = 0;
+ for (i = 0; i < num_phys_buf; ++i)
+ for (j = 0;
+ j < (buffer_list[i].size + (1ULL << shift) - 1) >> shift;
+ ++j)
+ page_list[n++] = buffer_list[i].addr + ((u64) j << shift);
+
+ mthca_dbg(to_mdev(pd->device), "Registering memory at %llx (iova %llx) "
+ "in PD %x; shift %d, npages %d.\n",
+ (unsigned long long) buffer_list[0].addr,
+ (unsigned long long) *iova_start,
+ to_mpd(pd)->pd_num,
+ shift, npages);
+
+ err = mthca_mr_alloc_phys(to_mdev(pd->device),
+ to_mpd(pd)->pd_num,
+ page_list, shift, npages,
+ *iova_start, total_size,
+ convert_access(acc), mr);
+
+ if (err) {
+ kfree(page_list);
+ kfree(mr);
+ return ERR_PTR(err);
+ }
+
+ kfree(page_list);
+ mr->umem = NULL;
+
+ return &mr->ibmr;
+}
+
+static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt, int acc, struct ib_udata *udata)
+{
+ struct mthca_dev *dev = to_mdev(pd->device);
+ struct ib_umem_chunk *chunk;
+ struct mthca_mr *mr;
+ struct mthca_reg_mr ucmd;
+ u64 *pages;
+ int shift, n, len;
+ int i, j, k;
+ int err = 0;
+ int write_mtt_size;
+
+ if (udata->inlen - sizeof (struct ib_uverbs_cmd_hdr) < sizeof ucmd) {
+ if (!to_mucontext(pd->uobject->context)->reg_mr_warned) {
+ mthca_warn(dev, "Process '%s' did not pass in MR attrs.\n",
+ curproc->p_comm);
+ mthca_warn(dev, " Update libmthca to fix this.\n");
+ }
+ ++to_mucontext(pd->uobject->context)->reg_mr_warned;
+ ucmd.mr_attrs = 0;
+ } else if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd))
+ return ERR_PTR(-EFAULT);
+
+ mr = kmalloc(sizeof *mr, GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+ mr->umem = ib_umem_get(pd->uobject->context, start, length, acc,
+ ucmd.mr_attrs & MTHCA_MR_DMASYNC);
+
+ if (IS_ERR(mr->umem)) {
+ err = PTR_ERR(mr->umem);
+ goto err;
+ }
+
+ shift = ffs(mr->umem->page_size) - 1;
+
+ n = 0;
+ list_for_each_entry(chunk, &mr->umem->chunk_list, list)
+ n += chunk->nents;
+
+ mr->mtt = mthca_alloc_mtt(dev, n);
+ if (IS_ERR(mr->mtt)) {
+ err = PTR_ERR(mr->mtt);
+ goto err_umem;
+ }
+
+ pages = (u64 *) __get_free_page(GFP_KERNEL);
+ if (!pages) {
+ err = -ENOMEM;
+ goto err_mtt;
+ }
+
+ i = n = 0;
+
+ write_mtt_size = min(mthca_write_mtt_size(dev), (int) (PAGE_SIZE / sizeof *pages));
+
+ list_for_each_entry(chunk, &mr->umem->chunk_list, list)
+ for (j = 0; j < chunk->nmap; ++j) {
+ len = sg_dma_len(&chunk->page_list[j]) >> shift;
+ for (k = 0; k < len; ++k) {
+ pages[i++] = sg_dma_address(&chunk->page_list[j]) +
+ mr->umem->page_size * k;
+ /*
+ * Be friendly to write_mtt and pass it chunks
+ * of appropriate size.
+ */
+ if (i == write_mtt_size) {
+ err = mthca_write_mtt(dev, mr->mtt, n, pages, i);
+ if (err)
+ goto mtt_done;
+ n += i;
+ i = 0;
+ }
+ }
+ }
+
+ if (i)
+ err = mthca_write_mtt(dev, mr->mtt, n, pages, i);
+mtt_done:
+ free_page((unsigned long) pages);
+ if (err)
+ goto err_mtt;
+
+ err = mthca_mr_alloc(dev, to_mpd(pd)->pd_num, shift, virt, length,
+ convert_access(acc), mr);
+
+ if (err)
+ goto err_mtt;
+
+ return &mr->ibmr;
+
+err_mtt:
+ mthca_free_mtt(dev, mr->mtt);
+
+err_umem:
+ ib_umem_release(mr->umem);
+
+err:
+ kfree(mr);
+ return ERR_PTR(err);
+}
+
+static int mthca_dereg_mr(struct ib_mr *mr)
+{
+ struct mthca_mr *mmr = to_mmr(mr);
+
+ mthca_free_mr(to_mdev(mr->device), mmr);
+ if (mmr->umem)
+ ib_umem_release(mmr->umem);
+ kfree(mmr);
+
+ return 0;
+}
+
+static struct ib_fmr *mthca_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
+ struct ib_fmr_attr *fmr_attr)
+{
+ struct mthca_fmr *fmr;
+ int err;
+
+ fmr = kmalloc(sizeof *fmr, GFP_KERNEL);
+ if (!fmr)
+ return ERR_PTR(-ENOMEM);
+
+ memcpy(&fmr->attr, fmr_attr, sizeof *fmr_attr);
+ err = mthca_fmr_alloc(to_mdev(pd->device), to_mpd(pd)->pd_num,
+ convert_access(mr_access_flags), fmr);
+
+ if (err) {
+ kfree(fmr);
+ return ERR_PTR(err);
+ }
+
+ return &fmr->ibmr;
+}
+
+static int mthca_dealloc_fmr(struct ib_fmr *fmr)
+{
+ struct mthca_fmr *mfmr = to_mfmr(fmr);
+ int err;
+
+ err = mthca_free_fmr(to_mdev(fmr->device), mfmr);
+ if (err)
+ return err;
+
+ kfree(mfmr);
+ return 0;
+}
+
+static int mthca_unmap_fmr(struct list_head *fmr_list)
+{
+ struct ib_fmr *fmr;
+ int err;
+ u8 status;
+ struct mthca_dev *mdev = NULL;
+
+ list_for_each_entry(fmr, fmr_list, list) {
+ if (mdev && to_mdev(fmr->device) != mdev)
+ return -EINVAL;
+ mdev = to_mdev(fmr->device);
+ }
+
+ if (!mdev)
+ return 0;
+
+ if (mthca_is_memfree(mdev)) {
+ list_for_each_entry(fmr, fmr_list, list)
+ mthca_arbel_fmr_unmap(mdev, to_mfmr(fmr));
+
+ wmb();
+ } else
+ list_for_each_entry(fmr, fmr_list, list)
+ mthca_tavor_fmr_unmap(mdev, to_mfmr(fmr));
+
+ err = mthca_SYNC_TPT(mdev, &status);
+ if (err)
+ return err;
+ if (status)
+ return -EINVAL;
+ return 0;
+}
+
+static ssize_t show_rev(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct mthca_dev *dev =
+ container_of(device, struct mthca_dev, ib_dev.dev);
+ return sprintf(buf, "%x\n", dev->rev_id);
+}
+
+static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct mthca_dev *dev =
+ container_of(device, struct mthca_dev, ib_dev.dev);
+ return sprintf(buf, "%d.%d.%d\n", (int) (dev->fw_ver >> 32),
+ (int) (dev->fw_ver >> 16) & 0xffff,
+ (int) dev->fw_ver & 0xffff);
+}
+
+static ssize_t show_hca(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct mthca_dev *dev =
+ container_of(device, struct mthca_dev, ib_dev.dev);
+ switch (dev->pdev->device) {
+ case PCI_DEVICE_ID_MELLANOX_TAVOR:
+ return sprintf(buf, "MT23108\n");
+ case PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT:
+ return sprintf(buf, "MT25208 (MT23108 compat mode)\n");
+ case PCI_DEVICE_ID_MELLANOX_ARBEL:
+ return sprintf(buf, "MT25208\n");
+ case PCI_DEVICE_ID_MELLANOX_SINAI:
+ case PCI_DEVICE_ID_MELLANOX_SINAI_OLD:
+ return sprintf(buf, "MT25204\n");
+ default:
+ return sprintf(buf, "unknown\n");
+ }
+}
+
+static ssize_t show_board(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct mthca_dev *dev =
+ container_of(device, struct mthca_dev, ib_dev.dev);
+ return sprintf(buf, "%.*s\n", MTHCA_BOARD_ID_LEN, dev->board_id);
+}
+
+static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
+static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
+static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
+static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
+
+static struct device_attribute *mthca_dev_attributes[] = {
+ &dev_attr_hw_rev,
+ &dev_attr_fw_ver,
+ &dev_attr_hca_type,
+ &dev_attr_board_id
+};
+
+static int mthca_init_node_data(struct mthca_dev *dev)
+{
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ int err = -ENOMEM;
+ u8 status;
+
+ in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_NODE_DESC;
+
+ err = mthca_MAD_IFC(dev, 1, 1,
+ 1, NULL, NULL, in_mad, out_mad,
+ &status);
+ if (err)
+ goto out;
+ if (status) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ memcpy(dev->ib_dev.node_desc, out_mad->data, 64);
+
+ in_mad->attr_id = IB_SMP_ATTR_NODE_INFO;
+
+ err = mthca_MAD_IFC(dev, 1, 1,
+ 1, NULL, NULL, in_mad, out_mad,
+ &status);
+ if (err)
+ goto out;
+ if (status) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (mthca_is_memfree(dev))
+ dev->rev_id = be32_to_cpup((__be32 *) (out_mad->data + 32));
+ memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8);
+
+out:
+ kfree(in_mad);
+ kfree(out_mad);
+ return err;
+}
+
+int mthca_register_device(struct mthca_dev *dev)
+{
+ int ret;
+ int i;
+
+ ret = mthca_init_node_data(dev);
+ if (ret)
+ return ret;
+
+ strlcpy(dev->ib_dev.name, "mthca%d", IB_DEVICE_NAME_MAX);
+ dev->ib_dev.owner = THIS_MODULE;
+
+ dev->ib_dev.uverbs_abi_ver = MTHCA_UVERBS_ABI_VERSION;
+ dev->ib_dev.uverbs_cmd_mask =
+ (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
+ (1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_REG_MR) |
+ (1ull << IB_USER_VERBS_CMD_DEREG_MR) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
+ (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_QP) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_QP) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
+ (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) |
+ (1ull << IB_USER_VERBS_CMD_DETACH_MCAST);
+ dev->ib_dev.node_type = RDMA_NODE_IB_CA;
+ dev->ib_dev.phys_port_cnt = dev->limits.num_ports;
+ dev->ib_dev.num_comp_vectors = 1;
+ dev->ib_dev.dma_device = &dev->pdev->dev;
+ dev->ib_dev.query_device = mthca_query_device;
+ dev->ib_dev.query_port = mthca_query_port;
+ dev->ib_dev.modify_device = mthca_modify_device;
+ dev->ib_dev.modify_port = mthca_modify_port;
+ dev->ib_dev.query_pkey = mthca_query_pkey;
+ dev->ib_dev.query_gid = mthca_query_gid;
+ dev->ib_dev.alloc_ucontext = mthca_alloc_ucontext;
+ dev->ib_dev.dealloc_ucontext = mthca_dealloc_ucontext;
+ dev->ib_dev.mmap = mthca_mmap_uar;
+ dev->ib_dev.alloc_pd = mthca_alloc_pd;
+ dev->ib_dev.dealloc_pd = mthca_dealloc_pd;
+ dev->ib_dev.create_ah = mthca_ah_create;
+ dev->ib_dev.query_ah = mthca_ah_query;
+ dev->ib_dev.destroy_ah = mthca_ah_destroy;
+
+ if (dev->mthca_flags & MTHCA_FLAG_SRQ) {
+ dev->ib_dev.create_srq = mthca_create_srq;
+ dev->ib_dev.modify_srq = mthca_modify_srq;
+ dev->ib_dev.query_srq = mthca_query_srq;
+ dev->ib_dev.destroy_srq = mthca_destroy_srq;
+ dev->ib_dev.uverbs_cmd_mask |=
+ (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ);
+
+ if (mthca_is_memfree(dev))
+ dev->ib_dev.post_srq_recv = mthca_arbel_post_srq_recv;
+ else
+ dev->ib_dev.post_srq_recv = mthca_tavor_post_srq_recv;
+ }
+
+ dev->ib_dev.create_qp = mthca_create_qp;
+ dev->ib_dev.modify_qp = mthca_modify_qp;
+ dev->ib_dev.query_qp = mthca_query_qp;
+ dev->ib_dev.destroy_qp = mthca_destroy_qp;
+ dev->ib_dev.create_cq = mthca_create_cq;
+ dev->ib_dev.resize_cq = mthca_resize_cq;
+ dev->ib_dev.destroy_cq = mthca_destroy_cq;
+ dev->ib_dev.poll_cq = mthca_poll_cq;
+ dev->ib_dev.get_dma_mr = mthca_get_dma_mr;
+ dev->ib_dev.reg_phys_mr = mthca_reg_phys_mr;
+ dev->ib_dev.reg_user_mr = mthca_reg_user_mr;
+ dev->ib_dev.dereg_mr = mthca_dereg_mr;
+
+ if (dev->mthca_flags & MTHCA_FLAG_FMR) {
+ dev->ib_dev.alloc_fmr = mthca_alloc_fmr;
+ dev->ib_dev.unmap_fmr = mthca_unmap_fmr;
+ dev->ib_dev.dealloc_fmr = mthca_dealloc_fmr;
+ if (mthca_is_memfree(dev))
+ dev->ib_dev.map_phys_fmr = mthca_arbel_map_phys_fmr;
+ else
+ dev->ib_dev.map_phys_fmr = mthca_tavor_map_phys_fmr;
+ }
+
+ dev->ib_dev.attach_mcast = mthca_multicast_attach;
+ dev->ib_dev.detach_mcast = mthca_multicast_detach;
+ dev->ib_dev.process_mad = mthca_process_mad;
+
+ if (mthca_is_memfree(dev)) {
+ dev->ib_dev.req_notify_cq = mthca_arbel_arm_cq;
+ dev->ib_dev.post_send = mthca_arbel_post_send;
+ dev->ib_dev.post_recv = mthca_arbel_post_receive;
+ } else {
+ dev->ib_dev.req_notify_cq = mthca_tavor_arm_cq;
+ dev->ib_dev.post_send = mthca_tavor_post_send;
+ dev->ib_dev.post_recv = mthca_tavor_post_receive;
+ }
+
+ mutex_init(&dev->cap_mask_mutex);
+
+ ret = ib_register_device(&dev->ib_dev);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < ARRAY_SIZE(mthca_dev_attributes); ++i) {
+ ret = device_create_file(&dev->ib_dev.dev,
+ mthca_dev_attributes[i]);
+ if (ret) {
+ ib_unregister_device(&dev->ib_dev);
+ return ret;
+ }
+ }
+
+ mthca_start_catas_poll(dev);
+
+ return 0;
+}
+
+void mthca_unregister_device(struct mthca_dev *dev)
+{
+ mthca_stop_catas_poll(dev);
+ ib_unregister_device(&dev->ib_dev);
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.h b/sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.h
new file mode 100644
index 0000000..c621f87
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_provider.h
@@ -0,0 +1,343 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MTHCA_PROVIDER_H
+#define MTHCA_PROVIDER_H
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_pack.h>
+
+#define MTHCA_MPT_FLAG_ATOMIC (1 << 14)
+#define MTHCA_MPT_FLAG_REMOTE_WRITE (1 << 13)
+#define MTHCA_MPT_FLAG_REMOTE_READ (1 << 12)
+#define MTHCA_MPT_FLAG_LOCAL_WRITE (1 << 11)
+#define MTHCA_MPT_FLAG_LOCAL_READ (1 << 10)
+
+struct mthca_buf_list {
+ void *buf;
+ DECLARE_PCI_UNMAP_ADDR(mapping)
+};
+
+union mthca_buf {
+ struct mthca_buf_list direct;
+ struct mthca_buf_list *page_list;
+};
+
+struct mthca_uar {
+ unsigned long pfn;
+ int index;
+};
+
+struct mthca_user_db_table;
+
+struct mthca_ucontext {
+ struct ib_ucontext ibucontext;
+ struct mthca_uar uar;
+ struct mthca_user_db_table *db_tab;
+ int reg_mr_warned;
+};
+
+struct mthca_mtt;
+
+struct mthca_mr {
+ struct ib_mr ibmr;
+ struct ib_umem *umem;
+ struct mthca_mtt *mtt;
+};
+
+struct mthca_fmr {
+ struct ib_fmr ibmr;
+ struct ib_fmr_attr attr;
+ struct mthca_mtt *mtt;
+ int maps;
+ union {
+ struct {
+ struct mthca_mpt_entry __iomem *mpt;
+ u64 __iomem *mtts;
+ } tavor;
+ struct {
+ struct mthca_mpt_entry *mpt;
+ __be64 *mtts;
+ dma_addr_t dma_handle;
+ } arbel;
+ } mem;
+};
+
+struct mthca_pd {
+ struct ib_pd ibpd;
+ u32 pd_num;
+ atomic_t sqp_count;
+ struct mthca_mr ntmr;
+ int privileged;
+};
+
+struct mthca_eq {
+ struct mthca_dev *dev;
+ int eqn;
+ u32 eqn_mask;
+ u32 cons_index;
+ u16 msi_x_vector;
+ u16 msi_x_entry;
+ int have_irq;
+ int nent;
+ struct mthca_buf_list *page_list;
+ struct mthca_mr mr;
+};
+
+struct mthca_av;
+
+enum mthca_ah_type {
+ MTHCA_AH_ON_HCA,
+ MTHCA_AH_PCI_POOL,
+ MTHCA_AH_KMALLOC
+};
+
+struct mthca_ah {
+ struct ib_ah ibah;
+ enum mthca_ah_type type;
+ u32 key;
+ struct mthca_av *av;
+ dma_addr_t avdma;
+};
+
+/*
+ * Quick description of our CQ/QP locking scheme:
+ *
+ * We have one global lock that protects dev->cq/qp_table. Each
+ * struct mthca_cq/qp also has its own lock. An individual qp lock
+ * may be taken inside of an individual cq lock. Both cqs attached to
+ * a qp may be locked, with the cq with the lower cqn locked first.
+ * No other nesting should be done.
+ *
+ * Each struct mthca_cq/qp also has an ref count, protected by the
+ * corresponding table lock. The pointer from the cq/qp_table to the
+ * struct counts as one reference. This reference also is good for
+ * access through the consumer API, so modifying the CQ/QP etc doesn't
+ * need to take another reference. Access to a QP because of a
+ * completion being polled does not need a reference either.
+ *
+ * Finally, each struct mthca_cq/qp has a wait_queue_head_t for the
+ * destroy function to sleep on.
+ *
+ * This means that access from the consumer API requires nothing but
+ * taking the struct's lock.
+ *
+ * Access because of a completion event should go as follows:
+ * - lock cq/qp_table and look up struct
+ * - increment ref count in struct
+ * - drop cq/qp_table lock
+ * - lock struct, do your thing, and unlock struct
+ * - decrement ref count; if zero, wake up waiters
+ *
+ * To destroy a CQ/QP, we can do the following:
+ * - lock cq/qp_table
+ * - remove pointer and decrement ref count
+ * - unlock cq/qp_table lock
+ * - wait_event until ref count is zero
+ *
+ * It is the consumer's responsibilty to make sure that no QP
+ * operations (WQE posting or state modification) are pending when a
+ * QP is destroyed. Also, the consumer must make sure that calls to
+ * qp_modify are serialized. Similarly, the consumer is responsible
+ * for ensuring that no CQ resize operations are pending when a CQ
+ * is destroyed.
+ *
+ * Possible optimizations (wait for profile data to see if/where we
+ * have locks bouncing between CPUs):
+ * - split cq/qp table lock into n separate (cache-aligned) locks,
+ * indexed (say) by the page in the table
+ * - split QP struct lock into three (one for common info, one for the
+ * send queue and one for the receive queue)
+ */
+
+struct mthca_cq_buf {
+ union mthca_buf queue;
+ struct mthca_mr mr;
+ int is_direct;
+};
+
+struct mthca_cq_resize {
+ struct mthca_cq_buf buf;
+ int cqe;
+ enum {
+ CQ_RESIZE_ALLOC,
+ CQ_RESIZE_READY,
+ CQ_RESIZE_SWAPPED
+ } state;
+};
+
+struct mthca_cq {
+ struct ib_cq ibcq;
+ spinlock_t lock;
+ int refcount;
+ int cqn;
+ u32 cons_index;
+ struct mthca_cq_buf buf;
+ struct mthca_cq_resize *resize_buf;
+ int is_kernel;
+
+ /* Next fields are Arbel only */
+ int set_ci_db_index;
+ __be32 *set_ci_db;
+ int arm_db_index;
+ __be32 *arm_db;
+ int arm_sn;
+
+ wait_queue_head_t wait;
+ struct mutex mutex;
+};
+
+struct mthca_srq {
+ struct ib_srq ibsrq;
+ spinlock_t lock;
+ int refcount;
+ int srqn;
+ int max;
+ int max_gs;
+ int wqe_shift;
+ int first_free;
+ int last_free;
+ u16 counter; /* Arbel only */
+ int db_index; /* Arbel only */
+ __be32 *db; /* Arbel only */
+ void *last;
+
+ int is_direct;
+ u64 *wrid;
+ union mthca_buf queue;
+ struct mthca_mr mr;
+
+ wait_queue_head_t wait;
+ struct mutex mutex;
+};
+
+struct mthca_wq {
+ spinlock_t lock;
+ int max;
+ unsigned next_ind;
+ unsigned last_comp;
+ unsigned head;
+ unsigned tail;
+ void *last;
+ int max_gs;
+ int wqe_shift;
+
+ int db_index; /* Arbel only */
+ __be32 *db;
+};
+
+struct mthca_qp {
+ struct ib_qp ibqp;
+ int refcount;
+ u32 qpn;
+ int is_direct;
+ u8 port; /* for SQP and memfree use only */
+ u8 alt_port; /* for memfree use only */
+ u8 transport;
+ u8 state;
+ u8 atomic_rd_en;
+ u8 resp_depth;
+
+ struct mthca_mr mr;
+
+ struct mthca_wq rq;
+ struct mthca_wq sq;
+ enum ib_sig_type sq_policy;
+ int send_wqe_offset;
+ int max_inline_data;
+
+ u64 *wrid;
+ union mthca_buf queue;
+
+ wait_queue_head_t wait;
+ struct mutex mutex;
+};
+
+struct mthca_sqp {
+ struct mthca_qp qp;
+ int pkey_index;
+ u32 qkey;
+ u32 send_psn;
+ struct ib_ud_header ud_header;
+ int header_buf_size;
+ void *header_buf;
+ dma_addr_t header_dma;
+};
+
+static inline struct mthca_ucontext *to_mucontext(struct ib_ucontext *ibucontext)
+{
+ return container_of(ibucontext, struct mthca_ucontext, ibucontext);
+}
+
+static inline struct mthca_fmr *to_mfmr(struct ib_fmr *ibmr)
+{
+ return container_of(ibmr, struct mthca_fmr, ibmr);
+}
+
+static inline struct mthca_mr *to_mmr(struct ib_mr *ibmr)
+{
+ return container_of(ibmr, struct mthca_mr, ibmr);
+}
+
+static inline struct mthca_pd *to_mpd(struct ib_pd *ibpd)
+{
+ return container_of(ibpd, struct mthca_pd, ibpd);
+}
+
+static inline struct mthca_ah *to_mah(struct ib_ah *ibah)
+{
+ return container_of(ibah, struct mthca_ah, ibah);
+}
+
+static inline struct mthca_cq *to_mcq(struct ib_cq *ibcq)
+{
+ return container_of(ibcq, struct mthca_cq, ibcq);
+}
+
+static inline struct mthca_srq *to_msrq(struct ib_srq *ibsrq)
+{
+ return container_of(ibsrq, struct mthca_srq, ibsrq);
+}
+
+static inline struct mthca_qp *to_mqp(struct ib_qp *ibqp)
+{
+ return container_of(ibqp, struct mthca_qp, ibqp);
+}
+
+static inline struct mthca_sqp *to_msqp(struct mthca_qp *qp)
+{
+ return container_of(qp, struct mthca_sqp, qp);
+}
+
+#endif /* MTHCA_PROVIDER_H */
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_qp.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_qp.c
new file mode 100644
index 0000000..4a4d133
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_qp.c
@@ -0,0 +1,2332 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+
+#include <asm/io.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_pack.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+#include "mthca_memfree.h"
+#include "mthca_wqe.h"
+
+enum {
+ MTHCA_MAX_DIRECT_QP_SIZE = 4 * PAGE_SIZE,
+ MTHCA_ACK_REQ_FREQ = 10,
+ MTHCA_FLIGHT_LIMIT = 9,
+ MTHCA_UD_HEADER_SIZE = 72, /* largest UD header possible */
+ MTHCA_INLINE_HEADER_SIZE = 4, /* data segment overhead for inline */
+ MTHCA_INLINE_CHUNK_SIZE = 16 /* inline data segment chunk */
+};
+
+enum {
+ MTHCA_QP_STATE_RST = 0,
+ MTHCA_QP_STATE_INIT = 1,
+ MTHCA_QP_STATE_RTR = 2,
+ MTHCA_QP_STATE_RTS = 3,
+ MTHCA_QP_STATE_SQE = 4,
+ MTHCA_QP_STATE_SQD = 5,
+ MTHCA_QP_STATE_ERR = 6,
+ MTHCA_QP_STATE_DRAINING = 7
+};
+
+enum {
+ MTHCA_QP_ST_RC = 0x0,
+ MTHCA_QP_ST_UC = 0x1,
+ MTHCA_QP_ST_RD = 0x2,
+ MTHCA_QP_ST_UD = 0x3,
+ MTHCA_QP_ST_MLX = 0x7
+};
+
+enum {
+ MTHCA_QP_PM_MIGRATED = 0x3,
+ MTHCA_QP_PM_ARMED = 0x0,
+ MTHCA_QP_PM_REARM = 0x1
+};
+
+enum {
+ /* qp_context flags */
+ MTHCA_QP_BIT_DE = 1 << 8,
+ /* params1 */
+ MTHCA_QP_BIT_SRE = 1 << 15,
+ MTHCA_QP_BIT_SWE = 1 << 14,
+ MTHCA_QP_BIT_SAE = 1 << 13,
+ MTHCA_QP_BIT_SIC = 1 << 4,
+ MTHCA_QP_BIT_SSC = 1 << 3,
+ /* params2 */
+ MTHCA_QP_BIT_RRE = 1 << 15,
+ MTHCA_QP_BIT_RWE = 1 << 14,
+ MTHCA_QP_BIT_RAE = 1 << 13,
+ MTHCA_QP_BIT_RIC = 1 << 4,
+ MTHCA_QP_BIT_RSC = 1 << 3
+};
+
+enum {
+ MTHCA_SEND_DOORBELL_FENCE = 1 << 5
+};
+
+struct mthca_qp_path {
+ __be32 port_pkey;
+ u8 rnr_retry;
+ u8 g_mylmc;
+ __be16 rlid;
+ u8 ackto;
+ u8 mgid_index;
+ u8 static_rate;
+ u8 hop_limit;
+ __be32 sl_tclass_flowlabel;
+ u8 rgid[16];
+} __attribute__((packed));
+
+struct mthca_qp_context {
+ __be32 flags;
+ __be32 tavor_sched_queue; /* Reserved on Arbel */
+ u8 mtu_msgmax;
+ u8 rq_size_stride; /* Reserved on Tavor */
+ u8 sq_size_stride; /* Reserved on Tavor */
+ u8 rlkey_arbel_sched_queue; /* Reserved on Tavor */
+ __be32 usr_page;
+ __be32 local_qpn;
+ __be32 remote_qpn;
+ u32 reserved1[2];
+ struct mthca_qp_path pri_path;
+ struct mthca_qp_path alt_path;
+ __be32 rdd;
+ __be32 pd;
+ __be32 wqe_base;
+ __be32 wqe_lkey;
+ __be32 params1;
+ __be32 reserved2;
+ __be32 next_send_psn;
+ __be32 cqn_snd;
+ __be32 snd_wqe_base_l; /* Next send WQE on Tavor */
+ __be32 snd_db_index; /* (debugging only entries) */
+ __be32 last_acked_psn;
+ __be32 ssn;
+ __be32 params2;
+ __be32 rnr_nextrecvpsn;
+ __be32 ra_buff_indx;
+ __be32 cqn_rcv;
+ __be32 rcv_wqe_base_l; /* Next recv WQE on Tavor */
+ __be32 rcv_db_index; /* (debugging only entries) */
+ __be32 qkey;
+ __be32 srqn;
+ __be32 rmsn;
+ __be16 rq_wqe_counter; /* reserved on Tavor */
+ __be16 sq_wqe_counter; /* reserved on Tavor */
+ u32 reserved3[18];
+} __attribute__((packed));
+
+struct mthca_qp_param {
+ __be32 opt_param_mask;
+ u32 reserved1;
+ struct mthca_qp_context context;
+ u32 reserved2[62];
+} __attribute__((packed));
+
+enum {
+ MTHCA_QP_OPTPAR_ALT_ADDR_PATH = 1 << 0,
+ MTHCA_QP_OPTPAR_RRE = 1 << 1,
+ MTHCA_QP_OPTPAR_RAE = 1 << 2,
+ MTHCA_QP_OPTPAR_RWE = 1 << 3,
+ MTHCA_QP_OPTPAR_PKEY_INDEX = 1 << 4,
+ MTHCA_QP_OPTPAR_Q_KEY = 1 << 5,
+ MTHCA_QP_OPTPAR_RNR_TIMEOUT = 1 << 6,
+ MTHCA_QP_OPTPAR_PRIMARY_ADDR_PATH = 1 << 7,
+ MTHCA_QP_OPTPAR_SRA_MAX = 1 << 8,
+ MTHCA_QP_OPTPAR_RRA_MAX = 1 << 9,
+ MTHCA_QP_OPTPAR_PM_STATE = 1 << 10,
+ MTHCA_QP_OPTPAR_PORT_NUM = 1 << 11,
+ MTHCA_QP_OPTPAR_RETRY_COUNT = 1 << 12,
+ MTHCA_QP_OPTPAR_ALT_RNR_RETRY = 1 << 13,
+ MTHCA_QP_OPTPAR_ACK_TIMEOUT = 1 << 14,
+ MTHCA_QP_OPTPAR_RNR_RETRY = 1 << 15,
+ MTHCA_QP_OPTPAR_SCHED_QUEUE = 1 << 16
+};
+
+static const u8 mthca_opcode[] = {
+ [IB_WR_SEND] = MTHCA_OPCODE_SEND,
+ [IB_WR_SEND_WITH_IMM] = MTHCA_OPCODE_SEND_IMM,
+ [IB_WR_RDMA_WRITE] = MTHCA_OPCODE_RDMA_WRITE,
+ [IB_WR_RDMA_WRITE_WITH_IMM] = MTHCA_OPCODE_RDMA_WRITE_IMM,
+ [IB_WR_RDMA_READ] = MTHCA_OPCODE_RDMA_READ,
+ [IB_WR_ATOMIC_CMP_AND_SWP] = MTHCA_OPCODE_ATOMIC_CS,
+ [IB_WR_ATOMIC_FETCH_AND_ADD] = MTHCA_OPCODE_ATOMIC_FA,
+};
+
+static int is_sqp(struct mthca_dev *dev, struct mthca_qp *qp)
+{
+ return qp->qpn >= dev->qp_table.sqp_start &&
+ qp->qpn <= dev->qp_table.sqp_start + 3;
+}
+
+static int is_qp0(struct mthca_dev *dev, struct mthca_qp *qp)
+{
+ return qp->qpn >= dev->qp_table.sqp_start &&
+ qp->qpn <= dev->qp_table.sqp_start + 1;
+}
+
+static void *get_recv_wqe(struct mthca_qp *qp, int n)
+{
+ if (qp->is_direct)
+ return qp->queue.direct.buf + (n << qp->rq.wqe_shift);
+ else
+ return qp->queue.page_list[(n << qp->rq.wqe_shift) >> PAGE_SHIFT].buf +
+ ((n << qp->rq.wqe_shift) & (PAGE_SIZE - 1));
+}
+
+static void *get_send_wqe(struct mthca_qp *qp, int n)
+{
+ if (qp->is_direct)
+ return qp->queue.direct.buf + qp->send_wqe_offset +
+ (n << qp->sq.wqe_shift);
+ else
+ return qp->queue.page_list[(qp->send_wqe_offset +
+ (n << qp->sq.wqe_shift)) >>
+ PAGE_SHIFT].buf +
+ ((qp->send_wqe_offset + (n << qp->sq.wqe_shift)) &
+ (PAGE_SIZE - 1));
+}
+
+static void mthca_wq_reset(struct mthca_wq *wq)
+{
+ wq->next_ind = 0;
+ wq->last_comp = wq->max - 1;
+ wq->head = 0;
+ wq->tail = 0;
+}
+
+void mthca_qp_event(struct mthca_dev *dev, u32 qpn,
+ enum ib_event_type event_type)
+{
+ struct mthca_qp *qp;
+ struct ib_event event;
+
+ spin_lock(&dev->qp_table.lock);
+ qp = mthca_array_get(&dev->qp_table.qp, qpn & (dev->limits.num_qps - 1));
+ if (qp)
+ ++qp->refcount;
+ spin_unlock(&dev->qp_table.lock);
+
+ if (!qp) {
+ mthca_warn(dev, "Async event %d for bogus QP %08x\n",
+ (int) event_type, qpn);
+ return;
+ }
+
+ if (event_type == IB_EVENT_PATH_MIG)
+ qp->port = qp->alt_port;
+
+ event.device = &dev->ib_dev;
+ event.event = event_type;
+ event.element.qp = &qp->ibqp;
+ if (qp->ibqp.event_handler)
+ qp->ibqp.event_handler(&event, qp->ibqp.qp_context);
+
+ spin_lock(&dev->qp_table.lock);
+ if (!--qp->refcount)
+ wake_up(&qp->wait);
+ spin_unlock(&dev->qp_table.lock);
+}
+
+static int to_mthca_state(enum ib_qp_state ib_state)
+{
+ switch (ib_state) {
+ case IB_QPS_RESET: return MTHCA_QP_STATE_RST;
+ case IB_QPS_INIT: return MTHCA_QP_STATE_INIT;
+ case IB_QPS_RTR: return MTHCA_QP_STATE_RTR;
+ case IB_QPS_RTS: return MTHCA_QP_STATE_RTS;
+ case IB_QPS_SQD: return MTHCA_QP_STATE_SQD;
+ case IB_QPS_SQE: return MTHCA_QP_STATE_SQE;
+ case IB_QPS_ERR: return MTHCA_QP_STATE_ERR;
+ default: return -1;
+ }
+}
+
+enum { RC, UC, UD, RD, RDEE, MLX, NUM_TRANS };
+
+static int to_mthca_st(int transport)
+{
+ switch (transport) {
+ case RC: return MTHCA_QP_ST_RC;
+ case UC: return MTHCA_QP_ST_UC;
+ case UD: return MTHCA_QP_ST_UD;
+ case RD: return MTHCA_QP_ST_RD;
+ case MLX: return MTHCA_QP_ST_MLX;
+ default: return -1;
+ }
+}
+
+static void store_attrs(struct mthca_sqp *sqp, const struct ib_qp_attr *attr,
+ int attr_mask)
+{
+ if (attr_mask & IB_QP_PKEY_INDEX)
+ sqp->pkey_index = attr->pkey_index;
+ if (attr_mask & IB_QP_QKEY)
+ sqp->qkey = attr->qkey;
+ if (attr_mask & IB_QP_SQ_PSN)
+ sqp->send_psn = attr->sq_psn;
+}
+
+static void init_port(struct mthca_dev *dev, int port)
+{
+ int err;
+ u8 status;
+ struct mthca_init_ib_param param;
+
+ memset(&param, 0, sizeof param);
+
+ param.port_width = dev->limits.port_width_cap;
+ param.vl_cap = dev->limits.vl_cap;
+ param.mtu_cap = dev->limits.mtu_cap;
+ param.gid_cap = dev->limits.gid_table_len;
+ param.pkey_cap = dev->limits.pkey_table_len;
+
+ err = mthca_INIT_IB(dev, &param, port, &status);
+ if (err)
+ mthca_warn(dev, "INIT_IB failed, return code %d.\n", err);
+ if (status)
+ mthca_warn(dev, "INIT_IB returned status %02x.\n", status);
+}
+
+static __be32 get_hw_access_flags(struct mthca_qp *qp, const struct ib_qp_attr *attr,
+ int attr_mask)
+{
+ u8 dest_rd_atomic;
+ u32 access_flags;
+ u32 hw_access_flags = 0;
+
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+ dest_rd_atomic = attr->max_dest_rd_atomic;
+ else
+ dest_rd_atomic = qp->resp_depth;
+
+ if (attr_mask & IB_QP_ACCESS_FLAGS)
+ access_flags = attr->qp_access_flags;
+ else
+ access_flags = qp->atomic_rd_en;
+
+ if (!dest_rd_atomic)
+ access_flags &= IB_ACCESS_REMOTE_WRITE;
+
+ if (access_flags & IB_ACCESS_REMOTE_READ)
+ hw_access_flags |= MTHCA_QP_BIT_RRE;
+ if (access_flags & IB_ACCESS_REMOTE_ATOMIC)
+ hw_access_flags |= MTHCA_QP_BIT_RAE;
+ if (access_flags & IB_ACCESS_REMOTE_WRITE)
+ hw_access_flags |= MTHCA_QP_BIT_RWE;
+
+ return cpu_to_be32(hw_access_flags);
+}
+
+static inline enum ib_qp_state to_ib_qp_state(int mthca_state)
+{
+ switch (mthca_state) {
+ case MTHCA_QP_STATE_RST: return IB_QPS_RESET;
+ case MTHCA_QP_STATE_INIT: return IB_QPS_INIT;
+ case MTHCA_QP_STATE_RTR: return IB_QPS_RTR;
+ case MTHCA_QP_STATE_RTS: return IB_QPS_RTS;
+ case MTHCA_QP_STATE_DRAINING:
+ case MTHCA_QP_STATE_SQD: return IB_QPS_SQD;
+ case MTHCA_QP_STATE_SQE: return IB_QPS_SQE;
+ case MTHCA_QP_STATE_ERR: return IB_QPS_ERR;
+ default: return -1;
+ }
+}
+
+static inline enum ib_mig_state to_ib_mig_state(int mthca_mig_state)
+{
+ switch (mthca_mig_state) {
+ case 0: return IB_MIG_ARMED;
+ case 1: return IB_MIG_REARM;
+ case 3: return IB_MIG_MIGRATED;
+ default: return -1;
+ }
+}
+
+static int to_ib_qp_access_flags(int mthca_flags)
+{
+ int ib_flags = 0;
+
+ if (mthca_flags & MTHCA_QP_BIT_RRE)
+ ib_flags |= IB_ACCESS_REMOTE_READ;
+ if (mthca_flags & MTHCA_QP_BIT_RWE)
+ ib_flags |= IB_ACCESS_REMOTE_WRITE;
+ if (mthca_flags & MTHCA_QP_BIT_RAE)
+ ib_flags |= IB_ACCESS_REMOTE_ATOMIC;
+
+ return ib_flags;
+}
+
+static void to_ib_ah_attr(struct mthca_dev *dev, struct ib_ah_attr *ib_ah_attr,
+ struct mthca_qp_path *path)
+{
+ memset(ib_ah_attr, 0, sizeof *ib_ah_attr);
+ ib_ah_attr->port_num = (be32_to_cpu(path->port_pkey) >> 24) & 0x3;
+
+ if (ib_ah_attr->port_num == 0 || ib_ah_attr->port_num > dev->limits.num_ports)
+ return;
+
+ ib_ah_attr->dlid = be16_to_cpu(path->rlid);
+ ib_ah_attr->sl = be32_to_cpu(path->sl_tclass_flowlabel) >> 28;
+ ib_ah_attr->src_path_bits = path->g_mylmc & 0x7f;
+ ib_ah_attr->static_rate = mthca_rate_to_ib(dev,
+ path->static_rate & 0xf,
+ ib_ah_attr->port_num);
+ ib_ah_attr->ah_flags = (path->g_mylmc & (1 << 7)) ? IB_AH_GRH : 0;
+ if (ib_ah_attr->ah_flags) {
+ ib_ah_attr->grh.sgid_index = path->mgid_index & (dev->limits.gid_table_len - 1);
+ ib_ah_attr->grh.hop_limit = path->hop_limit;
+ ib_ah_attr->grh.traffic_class =
+ (be32_to_cpu(path->sl_tclass_flowlabel) >> 20) & 0xff;
+ ib_ah_attr->grh.flow_label =
+ be32_to_cpu(path->sl_tclass_flowlabel) & 0xfffff;
+ memcpy(ib_ah_attr->grh.dgid.raw,
+ path->rgid, sizeof ib_ah_attr->grh.dgid.raw);
+ }
+}
+
+int mthca_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask,
+ struct ib_qp_init_attr *qp_init_attr)
+{
+ struct mthca_dev *dev = to_mdev(ibqp->device);
+ struct mthca_qp *qp = to_mqp(ibqp);
+ int err = 0;
+ struct mthca_mailbox *mailbox = NULL;
+ struct mthca_qp_param *qp_param;
+ struct mthca_qp_context *context;
+ int mthca_state;
+ u8 status;
+
+ mutex_lock(&qp->mutex);
+
+ if (qp->state == IB_QPS_RESET) {
+ qp_attr->qp_state = IB_QPS_RESET;
+ goto done;
+ }
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox)) {
+ err = PTR_ERR(mailbox);
+ goto out;
+ }
+
+ err = mthca_QUERY_QP(dev, qp->qpn, 0, mailbox, &status);
+ if (err)
+ goto out_mailbox;
+ if (status) {
+ mthca_warn(dev, "QUERY_QP returned status %02x\n", status);
+ err = -EINVAL;
+ goto out_mailbox;
+ }
+
+ qp_param = mailbox->buf;
+ context = &qp_param->context;
+ mthca_state = be32_to_cpu(context->flags) >> 28;
+
+ qp->state = to_ib_qp_state(mthca_state);
+ qp_attr->qp_state = qp->state;
+ qp_attr->path_mtu = context->mtu_msgmax >> 5;
+ qp_attr->path_mig_state =
+ to_ib_mig_state((be32_to_cpu(context->flags) >> 11) & 0x3);
+ qp_attr->qkey = be32_to_cpu(context->qkey);
+ qp_attr->rq_psn = be32_to_cpu(context->rnr_nextrecvpsn) & 0xffffff;
+ qp_attr->sq_psn = be32_to_cpu(context->next_send_psn) & 0xffffff;
+ qp_attr->dest_qp_num = be32_to_cpu(context->remote_qpn) & 0xffffff;
+ qp_attr->qp_access_flags =
+ to_ib_qp_access_flags(be32_to_cpu(context->params2));
+
+ if (qp->transport == RC || qp->transport == UC) {
+ to_ib_ah_attr(dev, &qp_attr->ah_attr, &context->pri_path);
+ to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context->alt_path);
+ qp_attr->alt_pkey_index =
+ be32_to_cpu(context->alt_path.port_pkey) & 0x7f;
+ qp_attr->alt_port_num = qp_attr->alt_ah_attr.port_num;
+ }
+
+ qp_attr->pkey_index = be32_to_cpu(context->pri_path.port_pkey) & 0x7f;
+ qp_attr->port_num =
+ (be32_to_cpu(context->pri_path.port_pkey) >> 24) & 0x3;
+
+ /* qp_attr->en_sqd_async_notify is only applicable in modify qp */
+ qp_attr->sq_draining = mthca_state == MTHCA_QP_STATE_DRAINING;
+
+ qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context->params1) >> 21) & 0x7);
+
+ qp_attr->max_dest_rd_atomic =
+ 1 << ((be32_to_cpu(context->params2) >> 21) & 0x7);
+ qp_attr->min_rnr_timer =
+ (be32_to_cpu(context->rnr_nextrecvpsn) >> 24) & 0x1f;
+ qp_attr->timeout = context->pri_path.ackto >> 3;
+ qp_attr->retry_cnt = (be32_to_cpu(context->params1) >> 16) & 0x7;
+ qp_attr->rnr_retry = context->pri_path.rnr_retry >> 5;
+ qp_attr->alt_timeout = context->alt_path.ackto >> 3;
+
+done:
+ qp_attr->cur_qp_state = qp_attr->qp_state;
+ qp_attr->cap.max_send_wr = qp->sq.max;
+ qp_attr->cap.max_recv_wr = qp->rq.max;
+ qp_attr->cap.max_send_sge = qp->sq.max_gs;
+ qp_attr->cap.max_recv_sge = qp->rq.max_gs;
+ qp_attr->cap.max_inline_data = qp->max_inline_data;
+
+ qp_init_attr->cap = qp_attr->cap;
+
+out_mailbox:
+ mthca_free_mailbox(dev, mailbox);
+
+out:
+ mutex_unlock(&qp->mutex);
+ return err;
+}
+
+static int mthca_path_set(struct mthca_dev *dev, const struct ib_ah_attr *ah,
+ struct mthca_qp_path *path, u8 port)
+{
+ path->g_mylmc = ah->src_path_bits & 0x7f;
+ path->rlid = cpu_to_be16(ah->dlid);
+ path->static_rate = mthca_get_rate(dev, ah->static_rate, port);
+
+ if (ah->ah_flags & IB_AH_GRH) {
+ if (ah->grh.sgid_index >= dev->limits.gid_table_len) {
+ mthca_dbg(dev, "sgid_index (%u) too large. max is %d\n",
+ ah->grh.sgid_index, dev->limits.gid_table_len-1);
+ return -1;
+ }
+
+ path->g_mylmc |= 1 << 7;
+ path->mgid_index = ah->grh.sgid_index;
+ path->hop_limit = ah->grh.hop_limit;
+ path->sl_tclass_flowlabel =
+ cpu_to_be32((ah->sl << 28) |
+ (ah->grh.traffic_class << 20) |
+ (ah->grh.flow_label));
+ memcpy(path->rgid, ah->grh.dgid.raw, 16);
+ } else
+ path->sl_tclass_flowlabel = cpu_to_be32(ah->sl << 28);
+
+ return 0;
+}
+
+static int __mthca_modify_qp(struct ib_qp *ibqp,
+ const struct ib_qp_attr *attr, int attr_mask,
+ enum ib_qp_state cur_state, enum ib_qp_state new_state)
+{
+ struct mthca_dev *dev = to_mdev(ibqp->device);
+ struct mthca_qp *qp = to_mqp(ibqp);
+ struct mthca_mailbox *mailbox;
+ struct mthca_qp_param *qp_param;
+ struct mthca_qp_context *qp_context;
+ u32 sqd_event = 0;
+ u8 status;
+ int err = -EINVAL;
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox)) {
+ err = PTR_ERR(mailbox);
+ goto out;
+ }
+ qp_param = mailbox->buf;
+ qp_context = &qp_param->context;
+ memset(qp_param, 0, sizeof *qp_param);
+
+ qp_context->flags = cpu_to_be32((to_mthca_state(new_state) << 28) |
+ (to_mthca_st(qp->transport) << 16));
+ qp_context->flags |= cpu_to_be32(MTHCA_QP_BIT_DE);
+ if (!(attr_mask & IB_QP_PATH_MIG_STATE))
+ qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_MIGRATED << 11);
+ else {
+ qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PM_STATE);
+ switch (attr->path_mig_state) {
+ case IB_MIG_MIGRATED:
+ qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_MIGRATED << 11);
+ break;
+ case IB_MIG_REARM:
+ qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_REARM << 11);
+ break;
+ case IB_MIG_ARMED:
+ qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_ARMED << 11);
+ break;
+ }
+ }
+
+ /* leave tavor_sched_queue as 0 */
+
+ if (qp->transport == MLX || qp->transport == UD)
+ qp_context->mtu_msgmax = (IB_MTU_2048 << 5) | 11;
+ else if (attr_mask & IB_QP_PATH_MTU) {
+ if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_2048) {
+ mthca_dbg(dev, "path MTU (%u) is invalid\n",
+ attr->path_mtu);
+ goto out_mailbox;
+ }
+ qp_context->mtu_msgmax = (attr->path_mtu << 5) | 31;
+ }
+
+ if (mthca_is_memfree(dev)) {
+ if (qp->rq.max)
+ qp_context->rq_size_stride = ilog2(qp->rq.max) << 3;
+ qp_context->rq_size_stride |= qp->rq.wqe_shift - 4;
+
+ if (qp->sq.max)
+ qp_context->sq_size_stride = ilog2(qp->sq.max) << 3;
+ qp_context->sq_size_stride |= qp->sq.wqe_shift - 4;
+ }
+
+ /* leave arbel_sched_queue as 0 */
+
+ if (qp->ibqp.uobject)
+ qp_context->usr_page =
+ cpu_to_be32(to_mucontext(qp->ibqp.uobject->context)->uar.index);
+ else
+ qp_context->usr_page = cpu_to_be32(dev->driver_uar.index);
+ qp_context->local_qpn = cpu_to_be32(qp->qpn);
+ if (attr_mask & IB_QP_DEST_QPN) {
+ qp_context->remote_qpn = cpu_to_be32(attr->dest_qp_num);
+ }
+
+ if (qp->transport == MLX)
+ qp_context->pri_path.port_pkey |=
+ cpu_to_be32(qp->port << 24);
+ else {
+ if (attr_mask & IB_QP_PORT) {
+ qp_context->pri_path.port_pkey |=
+ cpu_to_be32(attr->port_num << 24);
+ qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PORT_NUM);
+ }
+ }
+
+ if (attr_mask & IB_QP_PKEY_INDEX) {
+ qp_context->pri_path.port_pkey |=
+ cpu_to_be32(attr->pkey_index);
+ qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PKEY_INDEX);
+ }
+
+ if (attr_mask & IB_QP_RNR_RETRY) {
+ qp_context->alt_path.rnr_retry = qp_context->pri_path.rnr_retry =
+ attr->rnr_retry << 5;
+ qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RNR_RETRY |
+ MTHCA_QP_OPTPAR_ALT_RNR_RETRY);
+ }
+
+ if (attr_mask & IB_QP_AV) {
+ if (mthca_path_set(dev, &attr->ah_attr, &qp_context->pri_path,
+ attr_mask & IB_QP_PORT ? attr->port_num : qp->port))
+ goto out_mailbox;
+
+ qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PRIMARY_ADDR_PATH);
+ }
+
+ if (ibqp->qp_type == IB_QPT_RC &&
+ cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) {
+ u8 sched_queue = ibqp->uobject ? 0x2 : 0x1;
+
+ if (mthca_is_memfree(dev))
+ qp_context->rlkey_arbel_sched_queue |= sched_queue;
+ else
+ qp_context->tavor_sched_queue |= cpu_to_be32(sched_queue);
+
+ qp_param->opt_param_mask |=
+ cpu_to_be32(MTHCA_QP_OPTPAR_SCHED_QUEUE);
+ }
+
+ if (attr_mask & IB_QP_TIMEOUT) {
+ qp_context->pri_path.ackto = attr->timeout << 3;
+ qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_ACK_TIMEOUT);
+ }
+
+ if (attr_mask & IB_QP_ALT_PATH) {
+ if (attr->alt_pkey_index >= dev->limits.pkey_table_len) {
+ mthca_dbg(dev, "Alternate P_Key index (%u) too large. max is %d\n",
+ attr->alt_pkey_index, dev->limits.pkey_table_len-1);
+ goto out_mailbox;
+ }
+
+ if (attr->alt_port_num == 0 || attr->alt_port_num > dev->limits.num_ports) {
+ mthca_dbg(dev, "Alternate port number (%u) is invalid\n",
+ attr->alt_port_num);
+ goto out_mailbox;
+ }
+
+ if (mthca_path_set(dev, &attr->alt_ah_attr, &qp_context->alt_path,
+ attr->alt_ah_attr.port_num))
+ goto out_mailbox;
+
+ qp_context->alt_path.port_pkey |= cpu_to_be32(attr->alt_pkey_index |
+ attr->alt_port_num << 24);
+ qp_context->alt_path.ackto = attr->alt_timeout << 3;
+ qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_ALT_ADDR_PATH);
+ }
+
+ /* leave rdd as 0 */
+ qp_context->pd = cpu_to_be32(to_mpd(ibqp->pd)->pd_num);
+ /* leave wqe_base as 0 (we always create an MR based at 0 for WQs) */
+ qp_context->wqe_lkey = cpu_to_be32(qp->mr.ibmr.lkey);
+ qp_context->params1 = cpu_to_be32((MTHCA_ACK_REQ_FREQ << 28) |
+ (MTHCA_FLIGHT_LIMIT << 24) |
+ MTHCA_QP_BIT_SWE);
+ if (qp->sq_policy == IB_SIGNAL_ALL_WR)
+ qp_context->params1 |= cpu_to_be32(MTHCA_QP_BIT_SSC);
+ if (attr_mask & IB_QP_RETRY_CNT) {
+ qp_context->params1 |= cpu_to_be32(attr->retry_cnt << 16);
+ qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RETRY_COUNT);
+ }
+
+ if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) {
+ if (attr->max_rd_atomic) {
+ qp_context->params1 |=
+ cpu_to_be32(MTHCA_QP_BIT_SRE |
+ MTHCA_QP_BIT_SAE);
+ qp_context->params1 |=
+ cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21);
+ }
+ qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_SRA_MAX);
+ }
+
+ if (attr_mask & IB_QP_SQ_PSN)
+ qp_context->next_send_psn = cpu_to_be32(attr->sq_psn);
+ qp_context->cqn_snd = cpu_to_be32(to_mcq(ibqp->send_cq)->cqn);
+
+ if (mthca_is_memfree(dev)) {
+ qp_context->snd_wqe_base_l = cpu_to_be32(qp->send_wqe_offset);
+ qp_context->snd_db_index = cpu_to_be32(qp->sq.db_index);
+ }
+
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
+ if (attr->max_dest_rd_atomic)
+ qp_context->params2 |=
+ cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21);
+
+ qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RRA_MAX);
+ }
+
+ if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) {
+ qp_context->params2 |= get_hw_access_flags(qp, attr, attr_mask);
+ qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RWE |
+ MTHCA_QP_OPTPAR_RRE |
+ MTHCA_QP_OPTPAR_RAE);
+ }
+
+ qp_context->params2 |= cpu_to_be32(MTHCA_QP_BIT_RSC);
+
+ if (ibqp->srq)
+ qp_context->params2 |= cpu_to_be32(MTHCA_QP_BIT_RIC);
+
+ if (attr_mask & IB_QP_MIN_RNR_TIMER) {
+ qp_context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24);
+ qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RNR_TIMEOUT);
+ }
+ if (attr_mask & IB_QP_RQ_PSN)
+ qp_context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn);
+
+ qp_context->ra_buff_indx =
+ cpu_to_be32(dev->qp_table.rdb_base +
+ ((qp->qpn & (dev->limits.num_qps - 1)) * MTHCA_RDB_ENTRY_SIZE <<
+ dev->qp_table.rdb_shift));
+
+ qp_context->cqn_rcv = cpu_to_be32(to_mcq(ibqp->recv_cq)->cqn);
+
+ if (mthca_is_memfree(dev))
+ qp_context->rcv_db_index = cpu_to_be32(qp->rq.db_index);
+
+ if (attr_mask & IB_QP_QKEY) {
+ qp_context->qkey = cpu_to_be32(attr->qkey);
+ qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_Q_KEY);
+ }
+
+ if (ibqp->srq)
+ qp_context->srqn = cpu_to_be32(1 << 24 |
+ to_msrq(ibqp->srq)->srqn);
+
+ if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD &&
+ attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY &&
+ attr->en_sqd_async_notify)
+ sqd_event = 1 << 31;
+
+ err = mthca_MODIFY_QP(dev, cur_state, new_state, qp->qpn, 0,
+ mailbox, sqd_event, &status);
+ if (err)
+ goto out_mailbox;
+ if (status) {
+ mthca_warn(dev, "modify QP %d->%d returned status %02x.\n",
+ cur_state, new_state, status);
+ err = -EINVAL;
+ goto out_mailbox;
+ }
+
+ qp->state = new_state;
+ if (attr_mask & IB_QP_ACCESS_FLAGS)
+ qp->atomic_rd_en = attr->qp_access_flags;
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+ qp->resp_depth = attr->max_dest_rd_atomic;
+ if (attr_mask & IB_QP_PORT)
+ qp->port = attr->port_num;
+ if (attr_mask & IB_QP_ALT_PATH)
+ qp->alt_port = attr->alt_port_num;
+
+ if (is_sqp(dev, qp))
+ store_attrs(to_msqp(qp), attr, attr_mask);
+
+ /*
+ * If we moved QP0 to RTR, bring the IB link up; if we moved
+ * QP0 to RESET or ERROR, bring the link back down.
+ */
+ if (is_qp0(dev, qp)) {
+ if (cur_state != IB_QPS_RTR &&
+ new_state == IB_QPS_RTR)
+ init_port(dev, qp->port);
+
+ if (cur_state != IB_QPS_RESET &&
+ cur_state != IB_QPS_ERR &&
+ (new_state == IB_QPS_RESET ||
+ new_state == IB_QPS_ERR))
+ mthca_CLOSE_IB(dev, qp->port, &status);
+ }
+
+ /*
+ * If we moved a kernel QP to RESET, clean up all old CQ
+ * entries and reinitialize the QP.
+ */
+ if (new_state == IB_QPS_RESET && !qp->ibqp.uobject) {
+ mthca_cq_clean(dev, to_mcq(qp->ibqp.recv_cq), qp->qpn,
+ qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL);
+ if (qp->ibqp.send_cq != qp->ibqp.recv_cq)
+ mthca_cq_clean(dev, to_mcq(qp->ibqp.send_cq), qp->qpn, NULL);
+
+ mthca_wq_reset(&qp->sq);
+ qp->sq.last = get_send_wqe(qp, qp->sq.max - 1);
+
+ mthca_wq_reset(&qp->rq);
+ qp->rq.last = get_recv_wqe(qp, qp->rq.max - 1);
+
+ if (mthca_is_memfree(dev)) {
+ *qp->sq.db = 0;
+ *qp->rq.db = 0;
+ }
+ }
+
+out_mailbox:
+ mthca_free_mailbox(dev, mailbox);
+out:
+ return err;
+}
+
+int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
+ struct ib_udata *udata)
+{
+ struct mthca_dev *dev = to_mdev(ibqp->device);
+ struct mthca_qp *qp = to_mqp(ibqp);
+ enum ib_qp_state cur_state, new_state;
+ int err = -EINVAL;
+
+ mutex_lock(&qp->mutex);
+ if (attr_mask & IB_QP_CUR_STATE) {
+ cur_state = attr->cur_qp_state;
+ } else {
+ spin_lock_irq(&qp->sq.lock);
+ spin_lock(&qp->rq.lock);
+ cur_state = qp->state;
+ spin_unlock(&qp->rq.lock);
+ spin_unlock_irq(&qp->sq.lock);
+ }
+
+ new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
+
+ if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask)) {
+ mthca_dbg(dev, "Bad QP transition (transport %d) "
+ "%d->%d with attr 0x%08x\n",
+ qp->transport, cur_state, new_state,
+ attr_mask);
+ goto out;
+ }
+
+ if ((attr_mask & IB_QP_PKEY_INDEX) &&
+ attr->pkey_index >= dev->limits.pkey_table_len) {
+ mthca_dbg(dev, "P_Key index (%u) too large. max is %d\n",
+ attr->pkey_index, dev->limits.pkey_table_len-1);
+ goto out;
+ }
+
+ if ((attr_mask & IB_QP_PORT) &&
+ (attr->port_num == 0 || attr->port_num > dev->limits.num_ports)) {
+ mthca_dbg(dev, "Port number (%u) is invalid\n", attr->port_num);
+ goto out;
+ }
+
+ if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
+ attr->max_rd_atomic > dev->limits.max_qp_init_rdma) {
+ mthca_dbg(dev, "Max rdma_atomic as initiator %u too large (max is %d)\n",
+ attr->max_rd_atomic, dev->limits.max_qp_init_rdma);
+ goto out;
+ }
+
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
+ attr->max_dest_rd_atomic > 1 << dev->qp_table.rdb_shift) {
+ mthca_dbg(dev, "Max rdma_atomic as responder %u too large (max %d)\n",
+ attr->max_dest_rd_atomic, 1 << dev->qp_table.rdb_shift);
+ goto out;
+ }
+
+ if (cur_state == new_state && cur_state == IB_QPS_RESET) {
+ err = 0;
+ goto out;
+ }
+
+ err = __mthca_modify_qp(ibqp, attr, attr_mask, cur_state, new_state);
+
+out:
+ mutex_unlock(&qp->mutex);
+ return err;
+}
+
+static int mthca_max_data_size(struct mthca_dev *dev, struct mthca_qp *qp, int desc_sz)
+{
+ /*
+ * Calculate the maximum size of WQE s/g segments, excluding
+ * the next segment and other non-data segments.
+ */
+ int max_data_size = desc_sz - sizeof (struct mthca_next_seg);
+
+ switch (qp->transport) {
+ case MLX:
+ max_data_size -= 2 * sizeof (struct mthca_data_seg);
+ break;
+
+ case UD:
+ if (mthca_is_memfree(dev))
+ max_data_size -= sizeof (struct mthca_arbel_ud_seg);
+ else
+ max_data_size -= sizeof (struct mthca_tavor_ud_seg);
+ break;
+
+ default:
+ max_data_size -= sizeof (struct mthca_raddr_seg);
+ break;
+ }
+
+ return max_data_size;
+}
+
+static inline int mthca_max_inline_data(struct mthca_pd *pd, int max_data_size)
+{
+ /* We don't support inline data for kernel QPs (yet). */
+ return pd->ibpd.uobject ? max_data_size - MTHCA_INLINE_HEADER_SIZE : 0;
+}
+
+static void mthca_adjust_qp_caps(struct mthca_dev *dev,
+ struct mthca_pd *pd,
+ struct mthca_qp *qp)
+{
+ int max_data_size = mthca_max_data_size(dev, qp,
+ min(dev->limits.max_desc_sz,
+ 1 << qp->sq.wqe_shift));
+
+ qp->max_inline_data = mthca_max_inline_data(pd, max_data_size);
+
+ qp->sq.max_gs = min_t(int, dev->limits.max_sg,
+ max_data_size / sizeof (struct mthca_data_seg));
+ qp->rq.max_gs = min_t(int, dev->limits.max_sg,
+ (min(dev->limits.max_desc_sz, 1 << qp->rq.wqe_shift) -
+ sizeof (struct mthca_next_seg)) /
+ sizeof (struct mthca_data_seg));
+}
+
+/*
+ * Allocate and register buffer for WQEs. qp->rq.max, sq.max,
+ * rq.max_gs and sq.max_gs must all be assigned.
+ * mthca_alloc_wqe_buf will calculate rq.wqe_shift and
+ * sq.wqe_shift (as well as send_wqe_offset, is_direct, and
+ * queue)
+ */
+static int mthca_alloc_wqe_buf(struct mthca_dev *dev,
+ struct mthca_pd *pd,
+ struct mthca_qp *qp)
+{
+ int size;
+ int err = -ENOMEM;
+
+ size = sizeof (struct mthca_next_seg) +
+ qp->rq.max_gs * sizeof (struct mthca_data_seg);
+
+ if (size > dev->limits.max_desc_sz)
+ return -EINVAL;
+
+ for (qp->rq.wqe_shift = 6; 1 << qp->rq.wqe_shift < size;
+ qp->rq.wqe_shift++)
+ ; /* nothing */
+
+ size = qp->sq.max_gs * sizeof (struct mthca_data_seg);
+ switch (qp->transport) {
+ case MLX:
+ size += 2 * sizeof (struct mthca_data_seg);
+ break;
+
+ case UD:
+ size += mthca_is_memfree(dev) ?
+ sizeof (struct mthca_arbel_ud_seg) :
+ sizeof (struct mthca_tavor_ud_seg);
+ break;
+
+ case UC:
+ size += sizeof (struct mthca_raddr_seg);
+ break;
+
+ case RC:
+ size += sizeof (struct mthca_raddr_seg);
+ /*
+ * An atomic op will require an atomic segment, a
+ * remote address segment and one scatter entry.
+ */
+ size = max_t(int, size,
+ sizeof (struct mthca_atomic_seg) +
+ sizeof (struct mthca_raddr_seg) +
+ sizeof (struct mthca_data_seg));
+ break;
+
+ default:
+ break;
+ }
+
+ /* Make sure that we have enough space for a bind request */
+ size = max_t(int, size, sizeof (struct mthca_bind_seg));
+
+ size += sizeof (struct mthca_next_seg);
+
+ if (size > dev->limits.max_desc_sz)
+ return -EINVAL;
+
+ for (qp->sq.wqe_shift = 6; 1 << qp->sq.wqe_shift < size;
+ qp->sq.wqe_shift++)
+ ; /* nothing */
+
+ qp->send_wqe_offset = ALIGN(qp->rq.max << qp->rq.wqe_shift,
+ 1 << qp->sq.wqe_shift);
+
+ /*
+ * If this is a userspace QP, we don't actually have to
+ * allocate anything. All we need is to calculate the WQE
+ * sizes and the send_wqe_offset, so we're done now.
+ */
+ if (pd->ibpd.uobject)
+ return 0;
+
+ size = PAGE_ALIGN(qp->send_wqe_offset +
+ (qp->sq.max << qp->sq.wqe_shift));
+
+ qp->wrid = kmalloc((qp->rq.max + qp->sq.max) * sizeof (u64),
+ GFP_KERNEL);
+ if (!qp->wrid)
+ goto err_out;
+
+ err = mthca_buf_alloc(dev, size, MTHCA_MAX_DIRECT_QP_SIZE,
+ &qp->queue, &qp->is_direct, pd, 0, &qp->mr);
+ if (err)
+ goto err_out;
+
+ return 0;
+
+err_out:
+ kfree(qp->wrid);
+ return err;
+}
+
+static void mthca_free_wqe_buf(struct mthca_dev *dev,
+ struct mthca_qp *qp)
+{
+ mthca_buf_free(dev, PAGE_ALIGN(qp->send_wqe_offset +
+ (qp->sq.max << qp->sq.wqe_shift)),
+ &qp->queue, qp->is_direct, &qp->mr);
+ kfree(qp->wrid);
+}
+
+static int mthca_map_memfree(struct mthca_dev *dev,
+ struct mthca_qp *qp)
+{
+ int ret;
+
+ if (mthca_is_memfree(dev)) {
+ ret = mthca_table_get(dev, dev->qp_table.qp_table, qp->qpn);
+ if (ret)
+ return ret;
+
+ ret = mthca_table_get(dev, dev->qp_table.eqp_table, qp->qpn);
+ if (ret)
+ goto err_qpc;
+
+ ret = mthca_table_get(dev, dev->qp_table.rdb_table,
+ qp->qpn << dev->qp_table.rdb_shift);
+ if (ret)
+ goto err_eqpc;
+
+ }
+
+ return 0;
+
+err_eqpc:
+ mthca_table_put(dev, dev->qp_table.eqp_table, qp->qpn);
+
+err_qpc:
+ mthca_table_put(dev, dev->qp_table.qp_table, qp->qpn);
+
+ return ret;
+}
+
+static void mthca_unmap_memfree(struct mthca_dev *dev,
+ struct mthca_qp *qp)
+{
+ mthca_table_put(dev, dev->qp_table.rdb_table,
+ qp->qpn << dev->qp_table.rdb_shift);
+ mthca_table_put(dev, dev->qp_table.eqp_table, qp->qpn);
+ mthca_table_put(dev, dev->qp_table.qp_table, qp->qpn);
+}
+
+static int mthca_alloc_memfree(struct mthca_dev *dev,
+ struct mthca_qp *qp)
+{
+ if (mthca_is_memfree(dev)) {
+ qp->rq.db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_RQ,
+ qp->qpn, &qp->rq.db);
+ if (qp->rq.db_index < 0)
+ return -ENOMEM;
+
+ qp->sq.db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_SQ,
+ qp->qpn, &qp->sq.db);
+ if (qp->sq.db_index < 0) {
+ mthca_free_db(dev, MTHCA_DB_TYPE_RQ, qp->rq.db_index);
+ return -ENOMEM;
+ }
+ }
+
+ return 0;
+}
+
+static void mthca_free_memfree(struct mthca_dev *dev,
+ struct mthca_qp *qp)
+{
+ if (mthca_is_memfree(dev)) {
+ mthca_free_db(dev, MTHCA_DB_TYPE_SQ, qp->sq.db_index);
+ mthca_free_db(dev, MTHCA_DB_TYPE_RQ, qp->rq.db_index);
+ }
+}
+
+static int mthca_alloc_qp_common(struct mthca_dev *dev,
+ struct mthca_pd *pd,
+ struct mthca_cq *send_cq,
+ struct mthca_cq *recv_cq,
+ enum ib_sig_type send_policy,
+ struct mthca_qp *qp)
+{
+ int ret;
+ int i;
+ struct mthca_next_seg *next;
+
+ qp->refcount = 1;
+ init_waitqueue_head(&qp->wait);
+ mutex_init(&qp->mutex);
+ qp->state = IB_QPS_RESET;
+ qp->atomic_rd_en = 0;
+ qp->resp_depth = 0;
+ qp->sq_policy = send_policy;
+ mthca_wq_reset(&qp->sq);
+ mthca_wq_reset(&qp->rq);
+
+ spin_lock_init(&qp->sq.lock);
+ spin_lock_init(&qp->rq.lock);
+
+ ret = mthca_map_memfree(dev, qp);
+ if (ret)
+ return ret;
+
+ ret = mthca_alloc_wqe_buf(dev, pd, qp);
+ if (ret) {
+ mthca_unmap_memfree(dev, qp);
+ return ret;
+ }
+
+ mthca_adjust_qp_caps(dev, pd, qp);
+
+ /*
+ * If this is a userspace QP, we're done now. The doorbells
+ * will be allocated and buffers will be initialized in
+ * userspace.
+ */
+ if (pd->ibpd.uobject)
+ return 0;
+
+ ret = mthca_alloc_memfree(dev, qp);
+ if (ret) {
+ mthca_free_wqe_buf(dev, qp);
+ mthca_unmap_memfree(dev, qp);
+ return ret;
+ }
+
+ if (mthca_is_memfree(dev)) {
+ struct mthca_data_seg *scatter;
+ int size = (sizeof (struct mthca_next_seg) +
+ qp->rq.max_gs * sizeof (struct mthca_data_seg)) / 16;
+
+ for (i = 0; i < qp->rq.max; ++i) {
+ next = get_recv_wqe(qp, i);
+ next->nda_op = cpu_to_be32(((i + 1) & (qp->rq.max - 1)) <<
+ qp->rq.wqe_shift);
+ next->ee_nds = cpu_to_be32(size);
+
+ for (scatter = (void *) (next + 1);
+ (void *) scatter < (void *) next + (1 << qp->rq.wqe_shift);
+ ++scatter)
+ scatter->lkey = cpu_to_be32(MTHCA_INVAL_LKEY);
+ }
+
+ for (i = 0; i < qp->sq.max; ++i) {
+ next = get_send_wqe(qp, i);
+ next->nda_op = cpu_to_be32((((i + 1) & (qp->sq.max - 1)) <<
+ qp->sq.wqe_shift) +
+ qp->send_wqe_offset);
+ }
+ } else {
+ for (i = 0; i < qp->rq.max; ++i) {
+ next = get_recv_wqe(qp, i);
+ next->nda_op = htonl((((i + 1) % qp->rq.max) <<
+ qp->rq.wqe_shift) | 1);
+ }
+
+ }
+
+ qp->sq.last = get_send_wqe(qp, qp->sq.max - 1);
+ qp->rq.last = get_recv_wqe(qp, qp->rq.max - 1);
+
+ return 0;
+}
+
+static int mthca_set_qp_size(struct mthca_dev *dev, struct ib_qp_cap *cap,
+ struct mthca_pd *pd, struct mthca_qp *qp)
+{
+ int max_data_size = mthca_max_data_size(dev, qp, dev->limits.max_desc_sz);
+ u32 max_inline_data;
+
+ /* Sanity check QP size before proceeding */
+ if (cap->max_send_wr > dev->limits.max_wqes ||
+ cap->max_recv_wr > dev->limits.max_wqes ||
+ cap->max_send_sge > dev->limits.max_sg ||
+ cap->max_recv_sge > dev->limits.max_sg)
+ return -EINVAL;
+
+ if (pd->ibpd.uobject &&
+ cap->max_inline_data > mthca_max_inline_data(pd, max_data_size))
+ return -EINVAL;
+
+ max_inline_data = pd->ibpd.uobject ? cap->max_inline_data : 0;
+
+ /*
+ * For MLX transport we need 2 extra send gather entries:
+ * one for the header and one for the checksum at the end
+ */
+ if (qp->transport == MLX && cap->max_send_sge + 2 > dev->limits.max_sg)
+ return -EINVAL;
+
+ if (mthca_is_memfree(dev)) {
+ qp->rq.max = cap->max_recv_wr ?
+ roundup_pow_of_two(cap->max_recv_wr) : 0;
+ qp->sq.max = cap->max_send_wr ?
+ roundup_pow_of_two(cap->max_send_wr) : 0;
+ } else {
+ qp->rq.max = cap->max_recv_wr;
+ qp->sq.max = cap->max_send_wr;
+ }
+
+ qp->rq.max_gs = cap->max_recv_sge;
+ qp->sq.max_gs = max_t(int, cap->max_send_sge,
+ ALIGN(max_inline_data + MTHCA_INLINE_HEADER_SIZE,
+ MTHCA_INLINE_CHUNK_SIZE) /
+ sizeof (struct mthca_data_seg));
+
+ return 0;
+}
+
+int mthca_alloc_qp(struct mthca_dev *dev,
+ struct mthca_pd *pd,
+ struct mthca_cq *send_cq,
+ struct mthca_cq *recv_cq,
+ enum ib_qp_type type,
+ enum ib_sig_type send_policy,
+ struct ib_qp_cap *cap,
+ struct mthca_qp *qp)
+{
+ int err;
+
+ switch (type) {
+ case IB_QPT_RC: qp->transport = RC; break;
+ case IB_QPT_UC: qp->transport = UC; break;
+ case IB_QPT_UD: qp->transport = UD; break;
+ default: return -EINVAL;
+ }
+
+ err = mthca_set_qp_size(dev, cap, pd, qp);
+ if (err)
+ return err;
+
+ qp->qpn = mthca_alloc(&dev->qp_table.alloc);
+ if (qp->qpn == -1)
+ return -ENOMEM;
+
+ /* initialize port to zero for error-catching. */
+ qp->port = 0;
+
+ err = mthca_alloc_qp_common(dev, pd, send_cq, recv_cq,
+ send_policy, qp);
+ if (err) {
+ mthca_free(&dev->qp_table.alloc, qp->qpn);
+ return err;
+ }
+
+ spin_lock_irq(&dev->qp_table.lock);
+ mthca_array_set(&dev->qp_table.qp,
+ qp->qpn & (dev->limits.num_qps - 1), qp);
+ spin_unlock_irq(&dev->qp_table.lock);
+
+ return 0;
+}
+
+static void mthca_lock_cqs(struct mthca_cq *send_cq, struct mthca_cq *recv_cq)
+{
+ if (send_cq == recv_cq)
+ spin_lock_irq(&send_cq->lock);
+ else if (send_cq->cqn < recv_cq->cqn) {
+ spin_lock_irq(&send_cq->lock);
+ spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING);
+ } else {
+ spin_lock_irq(&recv_cq->lock);
+ spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING);
+ }
+}
+
+static void mthca_unlock_cqs(struct mthca_cq *send_cq, struct mthca_cq *recv_cq)
+{
+ if (send_cq == recv_cq)
+ spin_unlock_irq(&send_cq->lock);
+ else if (send_cq->cqn < recv_cq->cqn) {
+ spin_unlock(&recv_cq->lock);
+ spin_unlock_irq(&send_cq->lock);
+ } else {
+ spin_unlock(&send_cq->lock);
+ spin_unlock_irq(&recv_cq->lock);
+ }
+}
+
+int mthca_alloc_sqp(struct mthca_dev *dev,
+ struct mthca_pd *pd,
+ struct mthca_cq *send_cq,
+ struct mthca_cq *recv_cq,
+ enum ib_sig_type send_policy,
+ struct ib_qp_cap *cap,
+ int qpn,
+ int port,
+ struct mthca_sqp *sqp)
+{
+ u32 mqpn = qpn * 2 + dev->qp_table.sqp_start + port - 1;
+ int err;
+
+ sqp->qp.transport = MLX;
+ err = mthca_set_qp_size(dev, cap, pd, &sqp->qp);
+ if (err)
+ return err;
+
+ sqp->header_buf_size = sqp->qp.sq.max * MTHCA_UD_HEADER_SIZE;
+ sqp->header_buf = dma_alloc_coherent(&dev->pdev->dev, sqp->header_buf_size,
+ &sqp->header_dma, GFP_KERNEL);
+ if (!sqp->header_buf)
+ return -ENOMEM;
+
+ spin_lock_irq(&dev->qp_table.lock);
+ if (mthca_array_get(&dev->qp_table.qp, mqpn))
+ err = -EBUSY;
+ else
+ mthca_array_set(&dev->qp_table.qp, mqpn, sqp);
+ spin_unlock_irq(&dev->qp_table.lock);
+
+ if (err)
+ goto err_out;
+
+ sqp->qp.port = port;
+ sqp->qp.qpn = mqpn;
+ sqp->qp.transport = MLX;
+
+ err = mthca_alloc_qp_common(dev, pd, send_cq, recv_cq,
+ send_policy, &sqp->qp);
+ if (err)
+ goto err_out_free;
+
+ atomic_inc(&pd->sqp_count);
+
+ return 0;
+
+ err_out_free:
+ /*
+ * Lock CQs here, so that CQ polling code can do QP lookup
+ * without taking a lock.
+ */
+ mthca_lock_cqs(send_cq, recv_cq);
+
+ spin_lock(&dev->qp_table.lock);
+ mthca_array_clear(&dev->qp_table.qp, mqpn);
+ spin_unlock(&dev->qp_table.lock);
+
+ mthca_unlock_cqs(send_cq, recv_cq);
+
+ err_out:
+ dma_free_coherent(&dev->pdev->dev, sqp->header_buf_size,
+ sqp->header_buf, sqp->header_dma);
+
+ return err;
+}
+
+static inline int get_qp_refcount(struct mthca_dev *dev, struct mthca_qp *qp)
+{
+ int c;
+
+ spin_lock_irq(&dev->qp_table.lock);
+ c = qp->refcount;
+ spin_unlock_irq(&dev->qp_table.lock);
+
+ return c;
+}
+
+void mthca_free_qp(struct mthca_dev *dev,
+ struct mthca_qp *qp)
+{
+ u8 status;
+ struct mthca_cq *send_cq;
+ struct mthca_cq *recv_cq;
+
+ send_cq = to_mcq(qp->ibqp.send_cq);
+ recv_cq = to_mcq(qp->ibqp.recv_cq);
+
+ /*
+ * Lock CQs here, so that CQ polling code can do QP lookup
+ * without taking a lock.
+ */
+ mthca_lock_cqs(send_cq, recv_cq);
+
+ spin_lock(&dev->qp_table.lock);
+ mthca_array_clear(&dev->qp_table.qp,
+ qp->qpn & (dev->limits.num_qps - 1));
+ --qp->refcount;
+ spin_unlock(&dev->qp_table.lock);
+
+ mthca_unlock_cqs(send_cq, recv_cq);
+
+ wait_event(qp->wait, !get_qp_refcount(dev, qp));
+
+ if (qp->state != IB_QPS_RESET)
+ mthca_MODIFY_QP(dev, qp->state, IB_QPS_RESET, qp->qpn, 0,
+ NULL, 0, &status);
+
+ /*
+ * If this is a userspace QP, the buffers, MR, CQs and so on
+ * will be cleaned up in userspace, so all we have to do is
+ * unref the mem-free tables and free the QPN in our table.
+ */
+ if (!qp->ibqp.uobject) {
+ mthca_cq_clean(dev, recv_cq, qp->qpn,
+ qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL);
+ if (send_cq != recv_cq)
+ mthca_cq_clean(dev, send_cq, qp->qpn, NULL);
+
+ mthca_free_memfree(dev, qp);
+ mthca_free_wqe_buf(dev, qp);
+ }
+
+ mthca_unmap_memfree(dev, qp);
+
+ if (is_sqp(dev, qp)) {
+ atomic_dec(&(to_mpd(qp->ibqp.pd)->sqp_count));
+ dma_free_coherent(&dev->pdev->dev,
+ to_msqp(qp)->header_buf_size,
+ to_msqp(qp)->header_buf,
+ to_msqp(qp)->header_dma);
+ } else
+ mthca_free(&dev->qp_table.alloc, qp->qpn);
+}
+
+/* Create UD header for an MLX send and build a data segment for it */
+static int build_mlx_header(struct mthca_dev *dev, struct mthca_sqp *sqp,
+ int ind, struct ib_send_wr *wr,
+ struct mthca_mlx_seg *mlx,
+ struct mthca_data_seg *data)
+{
+ int header_size;
+ int err;
+ u16 pkey;
+
+ ib_ud_header_init(256, /* assume a MAD */
+ 1, 0, 0,
+ mthca_ah_grh_present(to_mah(wr->wr.ud.ah)),
+ 0,
+ &sqp->ud_header);
+
+ err = mthca_read_ah(dev, to_mah(wr->wr.ud.ah), &sqp->ud_header);
+ if (err)
+ return err;
+ mlx->flags &= ~cpu_to_be32(MTHCA_NEXT_SOLICIT | 1);
+ mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MTHCA_MLX_VL15 : 0) |
+ (sqp->ud_header.lrh.destination_lid ==
+ IB_LID_PERMISSIVE ? MTHCA_MLX_SLR : 0) |
+ (sqp->ud_header.lrh.service_level << 8));
+ mlx->rlid = sqp->ud_header.lrh.destination_lid;
+ mlx->vcrc = 0;
+
+ switch (wr->opcode) {
+ case IB_WR_SEND:
+ sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY;
+ sqp->ud_header.immediate_present = 0;
+ break;
+ case IB_WR_SEND_WITH_IMM:
+ sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
+ sqp->ud_header.immediate_present = 1;
+ sqp->ud_header.immediate_data = wr->ex.imm_data;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ sqp->ud_header.lrh.virtual_lane = !sqp->qp.ibqp.qp_num ? 15 : 0;
+ if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE)
+ sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE;
+ sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED);
+ if (!sqp->qp.ibqp.qp_num)
+ ib_get_cached_pkey(&dev->ib_dev, sqp->qp.port,
+ sqp->pkey_index, &pkey);
+ else
+ ib_get_cached_pkey(&dev->ib_dev, sqp->qp.port,
+ wr->wr.ud.pkey_index, &pkey);
+ sqp->ud_header.bth.pkey = cpu_to_be16(pkey);
+ sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+ sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
+ sqp->ud_header.deth.qkey = cpu_to_be32(wr->wr.ud.remote_qkey & 0x80000000 ?
+ sqp->qkey : wr->wr.ud.remote_qkey);
+ sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num);
+
+ header_size = ib_ud_header_pack(&sqp->ud_header,
+ sqp->header_buf +
+ ind * MTHCA_UD_HEADER_SIZE);
+
+ data->byte_count = cpu_to_be32(header_size);
+ data->lkey = cpu_to_be32(to_mpd(sqp->qp.ibqp.pd)->ntmr.ibmr.lkey);
+ data->addr = cpu_to_be64(sqp->header_dma +
+ ind * MTHCA_UD_HEADER_SIZE);
+
+ return 0;
+}
+
+static inline int mthca_wq_overflow(struct mthca_wq *wq, int nreq,
+ struct ib_cq *ib_cq)
+{
+ unsigned cur;
+ struct mthca_cq *cq;
+
+ cur = wq->head - wq->tail;
+ if (likely(cur + nreq < wq->max))
+ return 0;
+
+ cq = to_mcq(ib_cq);
+ spin_lock(&cq->lock);
+ cur = wq->head - wq->tail;
+ spin_unlock(&cq->lock);
+
+ return cur + nreq >= wq->max;
+}
+
+static __always_inline void set_raddr_seg(struct mthca_raddr_seg *rseg,
+ u64 remote_addr, u32 rkey)
+{
+ rseg->raddr = cpu_to_be64(remote_addr);
+ rseg->rkey = cpu_to_be32(rkey);
+ rseg->reserved = 0;
+}
+
+static __always_inline void set_atomic_seg(struct mthca_atomic_seg *aseg,
+ struct ib_send_wr *wr)
+{
+ if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
+ aseg->swap_add = cpu_to_be64(wr->wr.atomic.swap);
+ aseg->compare = cpu_to_be64(wr->wr.atomic.compare_add);
+ } else {
+ aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add);
+ aseg->compare = 0;
+ }
+
+}
+
+static void set_tavor_ud_seg(struct mthca_tavor_ud_seg *useg,
+ struct ib_send_wr *wr)
+{
+ useg->lkey = cpu_to_be32(to_mah(wr->wr.ud.ah)->key);
+ useg->av_addr = cpu_to_be64(to_mah(wr->wr.ud.ah)->avdma);
+ useg->dqpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+ useg->qkey = cpu_to_be32(wr->wr.ud.remote_qkey);
+
+}
+
+static void set_arbel_ud_seg(struct mthca_arbel_ud_seg *useg,
+ struct ib_send_wr *wr)
+{
+ memcpy(useg->av, to_mah(wr->wr.ud.ah)->av, MTHCA_AV_SIZE);
+ useg->dqpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+ useg->qkey = cpu_to_be32(wr->wr.ud.remote_qkey);
+}
+
+int mthca_tavor_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr)
+{
+ struct mthca_dev *dev = to_mdev(ibqp->device);
+ struct mthca_qp *qp = to_mqp(ibqp);
+ void *wqe;
+ void *prev_wqe;
+ unsigned long flags;
+ int err = 0;
+ int nreq;
+ int i;
+ int size;
+ /*
+ * f0 and size0 are only used if nreq != 0, and they will
+ * always be initialized the first time through the main loop
+ * before nreq is incremented. So nreq cannot become non-zero
+ * without initializing f0 and size0, and they are in fact
+ * never used uninitialized.
+ */
+ int uninitialized_var(size0);
+ u32 uninitialized_var(f0);
+ int ind;
+ u8 op0 = 0;
+
+ spin_lock_irqsave(&qp->sq.lock, flags);
+
+ /* XXX check that state is OK to post send */
+
+ ind = qp->sq.next_ind;
+
+ for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ if (mthca_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
+ mthca_err(dev, "SQ %06x full (%u head, %u tail,"
+ " %d max, %d nreq)\n", qp->qpn,
+ qp->sq.head, qp->sq.tail,
+ qp->sq.max, nreq);
+ err = -ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ wqe = get_send_wqe(qp, ind);
+ prev_wqe = qp->sq.last;
+ qp->sq.last = wqe;
+
+ ((struct mthca_next_seg *) wqe)->nda_op = 0;
+ ((struct mthca_next_seg *) wqe)->ee_nds = 0;
+ ((struct mthca_next_seg *) wqe)->flags =
+ ((wr->send_flags & IB_SEND_SIGNALED) ?
+ cpu_to_be32(MTHCA_NEXT_CQ_UPDATE) : 0) |
+ ((wr->send_flags & IB_SEND_SOLICITED) ?
+ cpu_to_be32(MTHCA_NEXT_SOLICIT) : 0) |
+ cpu_to_be32(1);
+ if (wr->opcode == IB_WR_SEND_WITH_IMM ||
+ wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM)
+ ((struct mthca_next_seg *) wqe)->imm = wr->ex.imm_data;
+
+ wqe += sizeof (struct mthca_next_seg);
+ size = sizeof (struct mthca_next_seg) / 16;
+
+ switch (qp->transport) {
+ case RC:
+ switch (wr->opcode) {
+ case IB_WR_ATOMIC_CMP_AND_SWP:
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
+ set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
+ wr->wr.atomic.rkey);
+ wqe += sizeof (struct mthca_raddr_seg);
+
+ set_atomic_seg(wqe, wr);
+ wqe += sizeof (struct mthca_atomic_seg);
+ size += (sizeof (struct mthca_raddr_seg) +
+ sizeof (struct mthca_atomic_seg)) / 16;
+ break;
+
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ case IB_WR_RDMA_READ:
+ set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
+ wr->wr.rdma.rkey);
+ wqe += sizeof (struct mthca_raddr_seg);
+ size += sizeof (struct mthca_raddr_seg) / 16;
+ break;
+
+ default:
+ /* No extra segments required for sends */
+ break;
+ }
+
+ break;
+
+ case UC:
+ switch (wr->opcode) {
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
+ wr->wr.rdma.rkey);
+ wqe += sizeof (struct mthca_raddr_seg);
+ size += sizeof (struct mthca_raddr_seg) / 16;
+ break;
+
+ default:
+ /* No extra segments required for sends */
+ break;
+ }
+
+ break;
+
+ case UD:
+ set_tavor_ud_seg(wqe, wr);
+ wqe += sizeof (struct mthca_tavor_ud_seg);
+ size += sizeof (struct mthca_tavor_ud_seg) / 16;
+ break;
+
+ case MLX:
+ err = build_mlx_header(dev, to_msqp(qp), ind, wr,
+ wqe - sizeof (struct mthca_next_seg),
+ wqe);
+ if (err) {
+ *bad_wr = wr;
+ goto out;
+ }
+ wqe += sizeof (struct mthca_data_seg);
+ size += sizeof (struct mthca_data_seg) / 16;
+ break;
+ }
+
+ if (wr->num_sge > qp->sq.max_gs) {
+ mthca_err(dev, "too many gathers\n");
+ err = -EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ for (i = 0; i < wr->num_sge; ++i) {
+ mthca_set_data_seg(wqe, wr->sg_list + i);
+ wqe += sizeof (struct mthca_data_seg);
+ size += sizeof (struct mthca_data_seg) / 16;
+ }
+
+ /* Add one more inline data segment for ICRC */
+ if (qp->transport == MLX) {
+ ((struct mthca_data_seg *) wqe)->byte_count =
+ cpu_to_be32((1 << 31) | 4);
+ ((u32 *) wqe)[1] = 0;
+ wqe += sizeof (struct mthca_data_seg);
+ size += sizeof (struct mthca_data_seg) / 16;
+ }
+
+ qp->wrid[ind] = wr->wr_id;
+
+ if (wr->opcode >= ARRAY_SIZE(mthca_opcode)) {
+ mthca_err(dev, "opcode invalid\n");
+ err = -EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ ((struct mthca_next_seg *) prev_wqe)->nda_op =
+ cpu_to_be32(((ind << qp->sq.wqe_shift) +
+ qp->send_wqe_offset) |
+ mthca_opcode[wr->opcode]);
+ wmb();
+ ((struct mthca_next_seg *) prev_wqe)->ee_nds =
+ cpu_to_be32((nreq ? 0 : MTHCA_NEXT_DBD) | size |
+ ((wr->send_flags & IB_SEND_FENCE) ?
+ MTHCA_NEXT_FENCE : 0));
+
+ if (!nreq) {
+ size0 = size;
+ op0 = mthca_opcode[wr->opcode];
+ f0 = wr->send_flags & IB_SEND_FENCE ?
+ MTHCA_SEND_DOORBELL_FENCE : 0;
+ }
+
+ ++ind;
+ if (unlikely(ind >= qp->sq.max))
+ ind -= qp->sq.max;
+ }
+
+out:
+ if (likely(nreq)) {
+ wmb();
+
+ mthca_write64(((qp->sq.next_ind << qp->sq.wqe_shift) +
+ qp->send_wqe_offset) | f0 | op0,
+ (qp->qpn << 8) | size0,
+ dev->kar + MTHCA_SEND_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+ /*
+ * Make sure doorbells don't leak out of SQ spinlock
+ * and reach the HCA out of order:
+ */
+ mmiowb();
+ }
+
+ qp->sq.next_ind = ind;
+ qp->sq.head += nreq;
+
+ spin_unlock_irqrestore(&qp->sq.lock, flags);
+ return err;
+}
+
+int mthca_tavor_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr)
+{
+ struct mthca_dev *dev = to_mdev(ibqp->device);
+ struct mthca_qp *qp = to_mqp(ibqp);
+ unsigned long flags;
+ int err = 0;
+ int nreq;
+ int i;
+ int size;
+ /*
+ * size0 is only used if nreq != 0, and it will always be
+ * initialized the first time through the main loop before
+ * nreq is incremented. So nreq cannot become non-zero
+ * without initializing size0, and it is in fact never used
+ * uninitialized.
+ */
+ int uninitialized_var(size0);
+ int ind;
+ void *wqe;
+ void *prev_wqe;
+
+ spin_lock_irqsave(&qp->rq.lock, flags);
+
+ /* XXX check that state is OK to post receive */
+
+ ind = qp->rq.next_ind;
+
+ for (nreq = 0; wr; wr = wr->next) {
+ if (mthca_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) {
+ mthca_err(dev, "RQ %06x full (%u head, %u tail,"
+ " %d max, %d nreq)\n", qp->qpn,
+ qp->rq.head, qp->rq.tail,
+ qp->rq.max, nreq);
+ err = -ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ wqe = get_recv_wqe(qp, ind);
+ prev_wqe = qp->rq.last;
+ qp->rq.last = wqe;
+
+ ((struct mthca_next_seg *) wqe)->ee_nds =
+ cpu_to_be32(MTHCA_NEXT_DBD);
+ ((struct mthca_next_seg *) wqe)->flags = 0;
+
+ wqe += sizeof (struct mthca_next_seg);
+ size = sizeof (struct mthca_next_seg) / 16;
+
+ if (unlikely(wr->num_sge > qp->rq.max_gs)) {
+ err = -EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ for (i = 0; i < wr->num_sge; ++i) {
+ mthca_set_data_seg(wqe, wr->sg_list + i);
+ wqe += sizeof (struct mthca_data_seg);
+ size += sizeof (struct mthca_data_seg) / 16;
+ }
+
+ qp->wrid[ind + qp->sq.max] = wr->wr_id;
+
+ ((struct mthca_next_seg *) prev_wqe)->ee_nds =
+ cpu_to_be32(MTHCA_NEXT_DBD | size);
+
+ if (!nreq)
+ size0 = size;
+
+ ++ind;
+ if (unlikely(ind >= qp->rq.max))
+ ind -= qp->rq.max;
+
+ ++nreq;
+ if (unlikely(nreq == MTHCA_TAVOR_MAX_WQES_PER_RECV_DB)) {
+ nreq = 0;
+
+ wmb();
+
+ mthca_write64((qp->rq.next_ind << qp->rq.wqe_shift) | size0,
+ qp->qpn << 8, dev->kar + MTHCA_RECEIVE_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+
+ qp->rq.next_ind = ind;
+ qp->rq.head += MTHCA_TAVOR_MAX_WQES_PER_RECV_DB;
+ }
+ }
+
+out:
+ if (likely(nreq)) {
+ wmb();
+
+ mthca_write64((qp->rq.next_ind << qp->rq.wqe_shift) | size0,
+ qp->qpn << 8 | nreq, dev->kar + MTHCA_RECEIVE_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+ }
+
+ qp->rq.next_ind = ind;
+ qp->rq.head += nreq;
+
+ /*
+ * Make sure doorbells don't leak out of RQ spinlock and reach
+ * the HCA out of order:
+ */
+ mmiowb();
+
+ spin_unlock_irqrestore(&qp->rq.lock, flags);
+ return err;
+}
+
+int mthca_arbel_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr)
+{
+ struct mthca_dev *dev = to_mdev(ibqp->device);
+ struct mthca_qp *qp = to_mqp(ibqp);
+ u32 dbhi;
+ void *wqe;
+ void *prev_wqe;
+ unsigned long flags;
+ int err = 0;
+ int nreq;
+ int i;
+ int size;
+ /*
+ * f0 and size0 are only used if nreq != 0, and they will
+ * always be initialized the first time through the main loop
+ * before nreq is incremented. So nreq cannot become non-zero
+ * without initializing f0 and size0, and they are in fact
+ * never used uninitialized.
+ */
+ int uninitialized_var(size0);
+ u32 uninitialized_var(f0);
+ int ind;
+ u8 op0 = 0;
+
+ spin_lock_irqsave(&qp->sq.lock, flags);
+
+ /* XXX check that state is OK to post send */
+
+ ind = qp->sq.head & (qp->sq.max - 1);
+
+ for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ if (unlikely(nreq == MTHCA_ARBEL_MAX_WQES_PER_SEND_DB)) {
+ nreq = 0;
+
+ dbhi = (MTHCA_ARBEL_MAX_WQES_PER_SEND_DB << 24) |
+ ((qp->sq.head & 0xffff) << 8) | f0 | op0;
+
+ qp->sq.head += MTHCA_ARBEL_MAX_WQES_PER_SEND_DB;
+
+ /*
+ * Make sure that descriptors are written before
+ * doorbell record.
+ */
+ wmb();
+ *qp->sq.db = cpu_to_be32(qp->sq.head & 0xffff);
+
+ /*
+ * Make sure doorbell record is written before we
+ * write MMIO send doorbell.
+ */
+ wmb();
+
+ mthca_write64(dbhi, (qp->qpn << 8) | size0,
+ dev->kar + MTHCA_SEND_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+ }
+
+ if (mthca_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
+ mthca_err(dev, "SQ %06x full (%u head, %u tail,"
+ " %d max, %d nreq)\n", qp->qpn,
+ qp->sq.head, qp->sq.tail,
+ qp->sq.max, nreq);
+ err = -ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ wqe = get_send_wqe(qp, ind);
+ prev_wqe = qp->sq.last;
+ qp->sq.last = wqe;
+
+ ((struct mthca_next_seg *) wqe)->flags =
+ ((wr->send_flags & IB_SEND_SIGNALED) ?
+ cpu_to_be32(MTHCA_NEXT_CQ_UPDATE) : 0) |
+ ((wr->send_flags & IB_SEND_SOLICITED) ?
+ cpu_to_be32(MTHCA_NEXT_SOLICIT) : 0) |
+ ((wr->send_flags & IB_SEND_IP_CSUM) ?
+ cpu_to_be32(MTHCA_NEXT_IP_CSUM | MTHCA_NEXT_TCP_UDP_CSUM) : 0) |
+ cpu_to_be32(1);
+ if (wr->opcode == IB_WR_SEND_WITH_IMM ||
+ wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM)
+ ((struct mthca_next_seg *) wqe)->imm = wr->ex.imm_data;
+
+ wqe += sizeof (struct mthca_next_seg);
+ size = sizeof (struct mthca_next_seg) / 16;
+
+ switch (qp->transport) {
+ case RC:
+ switch (wr->opcode) {
+ case IB_WR_ATOMIC_CMP_AND_SWP:
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
+ set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
+ wr->wr.atomic.rkey);
+ wqe += sizeof (struct mthca_raddr_seg);
+
+ set_atomic_seg(wqe, wr);
+ wqe += sizeof (struct mthca_atomic_seg);
+ size += (sizeof (struct mthca_raddr_seg) +
+ sizeof (struct mthca_atomic_seg)) / 16;
+ break;
+
+ case IB_WR_RDMA_READ:
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
+ wr->wr.rdma.rkey);
+ wqe += sizeof (struct mthca_raddr_seg);
+ size += sizeof (struct mthca_raddr_seg) / 16;
+ break;
+
+ default:
+ /* No extra segments required for sends */
+ break;
+ }
+
+ break;
+
+ case UC:
+ switch (wr->opcode) {
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
+ wr->wr.rdma.rkey);
+ wqe += sizeof (struct mthca_raddr_seg);
+ size += sizeof (struct mthca_raddr_seg) / 16;
+ break;
+
+ default:
+ /* No extra segments required for sends */
+ break;
+ }
+
+ break;
+
+ case UD:
+ set_arbel_ud_seg(wqe, wr);
+ wqe += sizeof (struct mthca_arbel_ud_seg);
+ size += sizeof (struct mthca_arbel_ud_seg) / 16;
+ break;
+
+ case MLX:
+ err = build_mlx_header(dev, to_msqp(qp), ind, wr,
+ wqe - sizeof (struct mthca_next_seg),
+ wqe);
+ if (err) {
+ *bad_wr = wr;
+ goto out;
+ }
+ wqe += sizeof (struct mthca_data_seg);
+ size += sizeof (struct mthca_data_seg) / 16;
+ break;
+ }
+
+ if (wr->num_sge > qp->sq.max_gs) {
+ mthca_err(dev, "too many gathers\n");
+ err = -EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ for (i = 0; i < wr->num_sge; ++i) {
+ mthca_set_data_seg(wqe, wr->sg_list + i);
+ wqe += sizeof (struct mthca_data_seg);
+ size += sizeof (struct mthca_data_seg) / 16;
+ }
+
+ /* Add one more inline data segment for ICRC */
+ if (qp->transport == MLX) {
+ ((struct mthca_data_seg *) wqe)->byte_count =
+ cpu_to_be32((1 << 31) | 4);
+ ((u32 *) wqe)[1] = 0;
+ wqe += sizeof (struct mthca_data_seg);
+ size += sizeof (struct mthca_data_seg) / 16;
+ }
+
+ qp->wrid[ind] = wr->wr_id;
+
+ if (wr->opcode >= ARRAY_SIZE(mthca_opcode)) {
+ mthca_err(dev, "opcode invalid\n");
+ err = -EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ ((struct mthca_next_seg *) prev_wqe)->nda_op =
+ cpu_to_be32(((ind << qp->sq.wqe_shift) +
+ qp->send_wqe_offset) |
+ mthca_opcode[wr->opcode]);
+ wmb();
+ ((struct mthca_next_seg *) prev_wqe)->ee_nds =
+ cpu_to_be32(MTHCA_NEXT_DBD | size |
+ ((wr->send_flags & IB_SEND_FENCE) ?
+ MTHCA_NEXT_FENCE : 0));
+
+ if (!nreq) {
+ size0 = size;
+ op0 = mthca_opcode[wr->opcode];
+ f0 = wr->send_flags & IB_SEND_FENCE ?
+ MTHCA_SEND_DOORBELL_FENCE : 0;
+ }
+
+ ++ind;
+ if (unlikely(ind >= qp->sq.max))
+ ind -= qp->sq.max;
+ }
+
+out:
+ if (likely(nreq)) {
+ dbhi = (nreq << 24) | ((qp->sq.head & 0xffff) << 8) | f0 | op0;
+
+ qp->sq.head += nreq;
+
+ /*
+ * Make sure that descriptors are written before
+ * doorbell record.
+ */
+ wmb();
+ *qp->sq.db = cpu_to_be32(qp->sq.head & 0xffff);
+
+ /*
+ * Make sure doorbell record is written before we
+ * write MMIO send doorbell.
+ */
+ wmb();
+
+ mthca_write64(dbhi, (qp->qpn << 8) | size0, dev->kar + MTHCA_SEND_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+ }
+
+ /*
+ * Make sure doorbells don't leak out of SQ spinlock and reach
+ * the HCA out of order:
+ */
+ mmiowb();
+
+ spin_unlock_irqrestore(&qp->sq.lock, flags);
+ return err;
+}
+
+int mthca_arbel_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr)
+{
+ struct mthca_dev *dev = to_mdev(ibqp->device);
+ struct mthca_qp *qp = to_mqp(ibqp);
+ unsigned long flags;
+ int err = 0;
+ int nreq;
+ int ind;
+ int i;
+ void *wqe;
+
+ spin_lock_irqsave(&qp->rq.lock, flags);
+
+ /* XXX check that state is OK to post receive */
+
+ ind = qp->rq.head & (qp->rq.max - 1);
+
+ for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ if (mthca_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) {
+ mthca_err(dev, "RQ %06x full (%u head, %u tail,"
+ " %d max, %d nreq)\n", qp->qpn,
+ qp->rq.head, qp->rq.tail,
+ qp->rq.max, nreq);
+ err = -ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ wqe = get_recv_wqe(qp, ind);
+
+ ((struct mthca_next_seg *) wqe)->flags = 0;
+
+ wqe += sizeof (struct mthca_next_seg);
+
+ if (unlikely(wr->num_sge > qp->rq.max_gs)) {
+ err = -EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ for (i = 0; i < wr->num_sge; ++i) {
+ mthca_set_data_seg(wqe, wr->sg_list + i);
+ wqe += sizeof (struct mthca_data_seg);
+ }
+
+ if (i < qp->rq.max_gs)
+ mthca_set_data_seg_inval(wqe);
+
+ qp->wrid[ind + qp->sq.max] = wr->wr_id;
+
+ ++ind;
+ if (unlikely(ind >= qp->rq.max))
+ ind -= qp->rq.max;
+ }
+out:
+ if (likely(nreq)) {
+ qp->rq.head += nreq;
+
+ /*
+ * Make sure that descriptors are written before
+ * doorbell record.
+ */
+ wmb();
+ *qp->rq.db = cpu_to_be32(qp->rq.head & 0xffff);
+ }
+
+ spin_unlock_irqrestore(&qp->rq.lock, flags);
+ return err;
+}
+
+void mthca_free_err_wqe(struct mthca_dev *dev, struct mthca_qp *qp, int is_send,
+ int index, int *dbd, __be32 *new_wqe)
+{
+ struct mthca_next_seg *next;
+
+ /*
+ * For SRQs, all receive WQEs generate a CQE, so we're always
+ * at the end of the doorbell chain.
+ */
+ if (qp->ibqp.srq && !is_send) {
+ *new_wqe = 0;
+ return;
+ }
+
+ if (is_send)
+ next = get_send_wqe(qp, index);
+ else
+ next = get_recv_wqe(qp, index);
+
+ *dbd = !!(next->ee_nds & cpu_to_be32(MTHCA_NEXT_DBD));
+ if (next->ee_nds & cpu_to_be32(0x3f))
+ *new_wqe = (next->nda_op & cpu_to_be32(~0x3f)) |
+ (next->ee_nds & cpu_to_be32(0x3f));
+ else
+ *new_wqe = 0;
+}
+
+int mthca_init_qp_table(struct mthca_dev *dev)
+{
+ int err;
+ u8 status;
+ int i;
+
+ spin_lock_init(&dev->qp_table.lock);
+
+ /*
+ * We reserve 2 extra QPs per port for the special QPs. The
+ * special QP for port 1 has to be even, so round up.
+ */
+ dev->qp_table.sqp_start = (dev->limits.reserved_qps + 1) & ~1UL;
+ err = mthca_alloc_init(&dev->qp_table.alloc,
+ dev->limits.num_qps,
+ (1 << 24) - 1,
+ dev->qp_table.sqp_start +
+ MTHCA_MAX_PORTS * 2);
+ if (err)
+ return err;
+
+ err = mthca_array_init(&dev->qp_table.qp,
+ dev->limits.num_qps);
+ if (err) {
+ mthca_alloc_cleanup(&dev->qp_table.alloc);
+ return err;
+ }
+
+ for (i = 0; i < 2; ++i) {
+ err = mthca_CONF_SPECIAL_QP(dev, i ? IB_QPT_GSI : IB_QPT_SMI,
+ dev->qp_table.sqp_start + i * 2,
+ &status);
+ if (err)
+ goto err_out;
+ if (status) {
+ mthca_warn(dev, "CONF_SPECIAL_QP returned "
+ "status %02x, aborting.\n",
+ status);
+ err = -EINVAL;
+ goto err_out;
+ }
+ }
+ return 0;
+
+ err_out:
+ for (i = 0; i < 2; ++i)
+ mthca_CONF_SPECIAL_QP(dev, i, 0, &status);
+
+ mthca_array_cleanup(&dev->qp_table.qp, dev->limits.num_qps);
+ mthca_alloc_cleanup(&dev->qp_table.alloc);
+
+ return err;
+}
+
+void mthca_cleanup_qp_table(struct mthca_dev *dev)
+{
+ int i;
+ u8 status;
+
+ for (i = 0; i < 2; ++i)
+ mthca_CONF_SPECIAL_QP(dev, i, 0, &status);
+
+ mthca_array_cleanup(&dev->qp_table.qp, dev->limits.num_qps);
+ mthca_alloc_cleanup(&dev->qp_table.alloc);
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_reset.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_reset.c
new file mode 100644
index 0000000..3c12461
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_reset.c
@@ -0,0 +1,298 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+int mthca_reset(struct mthca_dev *mdev)
+{
+ int i;
+ int err = 0;
+ u32 *hca_header = NULL;
+ u32 *bridge_header = NULL;
+ struct pci_dev *bridge = NULL;
+ int bridge_pcix_cap = 0;
+ int hca_pcie_cap = 0;
+ int hca_pcix_cap = 0;
+
+ u16 devctl;
+ u16 linkctl;
+
+#define MTHCA_RESET_OFFSET 0xf0010
+#define MTHCA_RESET_VALUE swab32(1)
+
+ /*
+ * Reset the chip. This is somewhat ugly because we have to
+ * save off the PCI header before reset and then restore it
+ * after the chip reboots. We skip config space offsets 22
+ * and 23 since those have a special meaning.
+ *
+ * To make matters worse, for Tavor (PCI-X HCA) we have to
+ * find the associated bridge device and save off its PCI
+ * header as well.
+ */
+
+ if (!(mdev->mthca_flags & MTHCA_FLAG_PCIE)) {
+ /* Look for the bridge -- its device ID will be 2 more
+ than HCA's device ID. */
+#ifdef __linux__
+ while ((bridge = pci_get_device(mdev->pdev->vendor,
+ mdev->pdev->device + 2,
+ bridge)) != NULL) {
+ if (bridge->hdr_type == PCI_HEADER_TYPE_BRIDGE &&
+ bridge->subordinate == mdev->pdev->bus) {
+ mthca_dbg(mdev, "Found bridge: %s\n",
+ pci_name(bridge));
+ break;
+ }
+ }
+
+ if (!bridge) {
+ /*
+ * Didn't find a bridge for a Tavor device --
+ * assume we're in no-bridge mode and hope for
+ * the best.
+ */
+ mthca_warn(mdev, "No bridge found for %s\n",
+ pci_name(mdev->pdev));
+ }
+#else
+ mthca_warn(mdev, "Reset on PCI-X is not supported.\n");
+ goto out;
+
+#endif
+ }
+
+ /* For Arbel do we need to save off the full 4K PCI Express header?? */
+ hca_header = kmalloc(256, GFP_KERNEL);
+ if (!hca_header) {
+ err = -ENOMEM;
+ mthca_err(mdev, "Couldn't allocate memory to save HCA "
+ "PCI header, aborting.\n");
+ goto out;
+ }
+
+ for (i = 0; i < 64; ++i) {
+ if (i == 22 || i == 23)
+ continue;
+ if (pci_read_config_dword(mdev->pdev, i * 4, hca_header + i)) {
+ err = -ENODEV;
+ mthca_err(mdev, "Couldn't save HCA "
+ "PCI header, aborting.\n");
+ goto out;
+ }
+ }
+
+ hca_pcix_cap = pci_find_capability(mdev->pdev, PCI_CAP_ID_PCIX);
+ hca_pcie_cap = pci_find_capability(mdev->pdev, PCI_CAP_ID_EXP);
+
+#ifdef __linux__
+ if (bridge) {
+ bridge_header = kmalloc(256, GFP_KERNEL);
+ if (!bridge_header) {
+ err = -ENOMEM;
+ mthca_err(mdev, "Couldn't allocate memory to save HCA "
+ "bridge PCI header, aborting.\n");
+ goto out;
+ }
+
+ for (i = 0; i < 64; ++i) {
+ if (i == 22 || i == 23)
+ continue;
+ if (pci_read_config_dword(bridge, i * 4, bridge_header + i)) {
+ err = -ENODEV;
+ mthca_err(mdev, "Couldn't save HCA bridge "
+ "PCI header, aborting.\n");
+ goto out;
+ }
+ }
+ bridge_pcix_cap = pci_find_capability(bridge, PCI_CAP_ID_PCIX);
+ if (!bridge_pcix_cap) {
+ err = -ENODEV;
+ mthca_err(mdev, "Couldn't locate HCA bridge "
+ "PCI-X capability, aborting.\n");
+ goto out;
+ }
+ }
+#endif
+
+ /* actually hit reset */
+ {
+ void __iomem *reset = ioremap(pci_resource_start(mdev->pdev, 0) +
+ MTHCA_RESET_OFFSET, 4);
+
+ if (!reset) {
+ err = -ENOMEM;
+ mthca_err(mdev, "Couldn't map HCA reset register, "
+ "aborting.\n");
+ goto out;
+ }
+
+ writel(MTHCA_RESET_VALUE, reset);
+ iounmap(reset);
+ }
+
+ /* Docs say to wait one second before accessing device */
+ msleep(1000);
+
+ /* Now wait for PCI device to start responding again */
+ {
+ u32 v;
+ int c = 0;
+
+ for (c = 0; c < 100; ++c) {
+ if (pci_read_config_dword(bridge ? bridge : mdev->pdev, 0, &v)) {
+ err = -ENODEV;
+ mthca_err(mdev, "Couldn't access HCA after reset, "
+ "aborting.\n");
+ goto out;
+ }
+
+ if (v != 0xffffffff)
+ goto good;
+
+ msleep(100);
+ }
+
+ err = -ENODEV;
+ mthca_err(mdev, "PCI device did not come back after reset, "
+ "aborting.\n");
+ goto out;
+ }
+
+good:
+ /* Now restore the PCI headers */
+ if (bridge) {
+ if (pci_write_config_dword(bridge, bridge_pcix_cap + 0x8,
+ bridge_header[(bridge_pcix_cap + 0x8) / 4])) {
+ err = -ENODEV;
+ mthca_err(mdev, "Couldn't restore HCA bridge Upstream "
+ "split transaction control, aborting.\n");
+ goto out;
+ }
+ if (pci_write_config_dword(bridge, bridge_pcix_cap + 0xc,
+ bridge_header[(bridge_pcix_cap + 0xc) / 4])) {
+ err = -ENODEV;
+ mthca_err(mdev, "Couldn't restore HCA bridge Downstream "
+ "split transaction control, aborting.\n");
+ goto out;
+ }
+ /*
+ * Bridge control register is at 0x3e, so we'll
+ * naturally restore it last in this loop.
+ */
+ for (i = 0; i < 16; ++i) {
+ if (i * 4 == PCI_COMMAND)
+ continue;
+
+ if (pci_write_config_dword(bridge, i * 4, bridge_header[i])) {
+ err = -ENODEV;
+ mthca_err(mdev, "Couldn't restore HCA bridge reg %x, "
+ "aborting.\n", i);
+ goto out;
+ }
+ }
+
+ if (pci_write_config_dword(bridge, PCI_COMMAND,
+ bridge_header[PCI_COMMAND / 4])) {
+ err = -ENODEV;
+ mthca_err(mdev, "Couldn't restore HCA bridge COMMAND, "
+ "aborting.\n");
+ goto out;
+ }
+ }
+
+ if (hca_pcix_cap) {
+ if (pci_write_config_dword(mdev->pdev, hca_pcix_cap,
+ hca_header[hca_pcix_cap / 4])) {
+ err = -ENODEV;
+ mthca_err(mdev, "Couldn't restore HCA PCI-X "
+ "command register, aborting.\n");
+ goto out;
+ }
+ }
+
+ if (hca_pcie_cap) {
+ devctl = hca_header[(hca_pcie_cap + PCI_EXP_DEVCTL) / 4];
+ if (pci_write_config_word(mdev->pdev, hca_pcie_cap + PCI_EXP_DEVCTL,
+ devctl)) {
+ err = -ENODEV;
+ mthca_err(mdev, "Couldn't restore HCA PCI Express "
+ "Device Control register, aborting.\n");
+ goto out;
+ }
+ linkctl = hca_header[(hca_pcie_cap + PCI_EXP_LNKCTL) / 4];
+ if (pci_write_config_word(mdev->pdev, hca_pcie_cap + PCI_EXP_LNKCTL,
+ linkctl)) {
+ err = -ENODEV;
+ mthca_err(mdev, "Couldn't restore HCA PCI Express "
+ "Link control register, aborting.\n");
+ goto out;
+ }
+ }
+
+ for (i = 0; i < 16; ++i) {
+ if (i * 4 == PCI_COMMAND)
+ continue;
+
+ if (pci_write_config_dword(mdev->pdev, i * 4, hca_header[i])) {
+ err = -ENODEV;
+ mthca_err(mdev, "Couldn't restore HCA reg %x, "
+ "aborting.\n", i);
+ goto out;
+ }
+ }
+
+ if (pci_write_config_dword(mdev->pdev, PCI_COMMAND,
+ hca_header[PCI_COMMAND / 4])) {
+ err = -ENODEV;
+ mthca_err(mdev, "Couldn't restore HCA COMMAND, "
+ "aborting.\n");
+ goto out;
+ }
+
+out:
+#ifdef __linux__
+ if (bridge)
+ pci_dev_put(bridge);
+#endif
+ kfree(bridge_header);
+ kfree(hca_header);
+
+ return err;
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_srq.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_srq.c
new file mode 100644
index 0000000..4fabe62
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_srq.c
@@ -0,0 +1,715 @@
+/*
+ * Copyright (c) 2005 Cisco Systems. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/sched.h>
+
+#include <asm/io.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+#include "mthca_memfree.h"
+#include "mthca_wqe.h"
+
+enum {
+ MTHCA_MAX_DIRECT_SRQ_SIZE = 4 * PAGE_SIZE
+};
+
+struct mthca_tavor_srq_context {
+ __be64 wqe_base_ds; /* low 6 bits is descriptor size */
+ __be32 state_pd;
+ __be32 lkey;
+ __be32 uar;
+ __be16 limit_watermark;
+ __be16 wqe_cnt;
+ u32 reserved[2];
+};
+
+struct mthca_arbel_srq_context {
+ __be32 state_logsize_srqn;
+ __be32 lkey;
+ __be32 db_index;
+ __be32 logstride_usrpage;
+ __be64 wqe_base;
+ __be32 eq_pd;
+ __be16 limit_watermark;
+ __be16 wqe_cnt;
+ u16 reserved1;
+ __be16 wqe_counter;
+ u32 reserved2[3];
+};
+
+static void *get_wqe(struct mthca_srq *srq, int n)
+{
+ if (srq->is_direct)
+ return srq->queue.direct.buf + (n << srq->wqe_shift);
+ else
+ return srq->queue.page_list[(n << srq->wqe_shift) >> PAGE_SHIFT].buf +
+ ((n << srq->wqe_shift) & (PAGE_SIZE - 1));
+}
+
+/*
+ * Return a pointer to the location within a WQE that we're using as a
+ * link when the WQE is in the free list. We use the imm field
+ * because in the Tavor case, posting a WQE may overwrite the next
+ * segment of the previous WQE, but a receive WQE will never touch the
+ * imm field. This avoids corrupting our free list if the previous
+ * WQE has already completed and been put on the free list when we
+ * post the next WQE.
+ */
+static inline int *wqe_to_link(void *wqe)
+{
+ return (int *) (wqe + offsetof(struct mthca_next_seg, imm));
+}
+
+static void mthca_tavor_init_srq_context(struct mthca_dev *dev,
+ struct mthca_pd *pd,
+ struct mthca_srq *srq,
+ struct mthca_tavor_srq_context *context)
+{
+ memset(context, 0, sizeof *context);
+
+ context->wqe_base_ds = cpu_to_be64(1 << (srq->wqe_shift - 4));
+ context->state_pd = cpu_to_be32(pd->pd_num);
+ context->lkey = cpu_to_be32(srq->mr.ibmr.lkey);
+
+ if (pd->ibpd.uobject)
+ context->uar =
+ cpu_to_be32(to_mucontext(pd->ibpd.uobject->context)->uar.index);
+ else
+ context->uar = cpu_to_be32(dev->driver_uar.index);
+}
+
+static void mthca_arbel_init_srq_context(struct mthca_dev *dev,
+ struct mthca_pd *pd,
+ struct mthca_srq *srq,
+ struct mthca_arbel_srq_context *context)
+{
+ int logsize, max;
+
+ memset(context, 0, sizeof *context);
+
+ /*
+ * Put max in a temporary variable to work around gcc bug
+ * triggered by ilog2() on sparc64.
+ */
+ max = srq->max;
+ logsize = ilog2(max);
+ context->state_logsize_srqn = cpu_to_be32(logsize << 24 | srq->srqn);
+ context->lkey = cpu_to_be32(srq->mr.ibmr.lkey);
+ context->db_index = cpu_to_be32(srq->db_index);
+ context->logstride_usrpage = cpu_to_be32((srq->wqe_shift - 4) << 29);
+ if (pd->ibpd.uobject)
+ context->logstride_usrpage |=
+ cpu_to_be32(to_mucontext(pd->ibpd.uobject->context)->uar.index);
+ else
+ context->logstride_usrpage |= cpu_to_be32(dev->driver_uar.index);
+ context->eq_pd = cpu_to_be32(MTHCA_EQ_ASYNC << 24 | pd->pd_num);
+}
+
+static void mthca_free_srq_buf(struct mthca_dev *dev, struct mthca_srq *srq)
+{
+ mthca_buf_free(dev, srq->max << srq->wqe_shift, &srq->queue,
+ srq->is_direct, &srq->mr);
+ kfree(srq->wrid);
+}
+
+static int mthca_alloc_srq_buf(struct mthca_dev *dev, struct mthca_pd *pd,
+ struct mthca_srq *srq)
+{
+ struct mthca_data_seg *scatter;
+ void *wqe;
+ int err;
+ int i;
+
+ if (pd->ibpd.uobject)
+ return 0;
+
+ srq->wrid = kmalloc(srq->max * sizeof (u64), GFP_KERNEL);
+ if (!srq->wrid)
+ return -ENOMEM;
+
+ err = mthca_buf_alloc(dev, srq->max << srq->wqe_shift,
+ MTHCA_MAX_DIRECT_SRQ_SIZE,
+ &srq->queue, &srq->is_direct, pd, 1, &srq->mr);
+ if (err) {
+ kfree(srq->wrid);
+ return err;
+ }
+
+ /*
+ * Now initialize the SRQ buffer so that all of the WQEs are
+ * linked into the list of free WQEs. In addition, set the
+ * scatter list L_Keys to the sentry value of 0x100.
+ */
+ for (i = 0; i < srq->max; ++i) {
+ struct mthca_next_seg *next;
+
+ next = wqe = get_wqe(srq, i);
+
+ if (i < srq->max - 1) {
+ *wqe_to_link(wqe) = i + 1;
+ next->nda_op = htonl(((i + 1) << srq->wqe_shift) | 1);
+ } else {
+ *wqe_to_link(wqe) = -1;
+ next->nda_op = 0;
+ }
+
+ for (scatter = wqe + sizeof (struct mthca_next_seg);
+ (void *) scatter < wqe + (1 << srq->wqe_shift);
+ ++scatter)
+ scatter->lkey = cpu_to_be32(MTHCA_INVAL_LKEY);
+ }
+
+ srq->last = get_wqe(srq, srq->max - 1);
+
+ return 0;
+}
+
+int mthca_alloc_srq(struct mthca_dev *dev, struct mthca_pd *pd,
+ struct ib_srq_attr *attr, struct mthca_srq *srq)
+{
+ struct mthca_mailbox *mailbox;
+ u8 status;
+ int ds;
+ int err;
+
+ /* Sanity check SRQ size before proceeding */
+ if (attr->max_wr > dev->limits.max_srq_wqes ||
+ attr->max_sge > dev->limits.max_srq_sge)
+ return -EINVAL;
+
+ srq->max = attr->max_wr;
+ srq->max_gs = attr->max_sge;
+ srq->counter = 0;
+
+ if (mthca_is_memfree(dev))
+ srq->max = roundup_pow_of_two(srq->max + 1);
+ else
+ srq->max = srq->max + 1;
+
+ ds = max(64UL,
+ roundup_pow_of_two(sizeof (struct mthca_next_seg) +
+ srq->max_gs * sizeof (struct mthca_data_seg)));
+
+ if (!mthca_is_memfree(dev) && (ds > dev->limits.max_desc_sz))
+ return -EINVAL;
+
+ srq->wqe_shift = ilog2(ds);
+
+ srq->srqn = mthca_alloc(&dev->srq_table.alloc);
+ if (srq->srqn == -1)
+ return -ENOMEM;
+
+ if (mthca_is_memfree(dev)) {
+ err = mthca_table_get(dev, dev->srq_table.table, srq->srqn);
+ if (err)
+ goto err_out;
+
+ if (!pd->ibpd.uobject) {
+ srq->db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_SRQ,
+ srq->srqn, &srq->db);
+ if (srq->db_index < 0) {
+ err = -ENOMEM;
+ goto err_out_icm;
+ }
+ }
+ }
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox)) {
+ err = PTR_ERR(mailbox);
+ goto err_out_db;
+ }
+
+ err = mthca_alloc_srq_buf(dev, pd, srq);
+ if (err)
+ goto err_out_mailbox;
+
+ spin_lock_init(&srq->lock);
+ srq->refcount = 1;
+ init_waitqueue_head(&srq->wait);
+ mutex_init(&srq->mutex);
+
+ if (mthca_is_memfree(dev))
+ mthca_arbel_init_srq_context(dev, pd, srq, mailbox->buf);
+ else
+ mthca_tavor_init_srq_context(dev, pd, srq, mailbox->buf);
+
+ err = mthca_SW2HW_SRQ(dev, mailbox, srq->srqn, &status);
+
+ if (err) {
+ mthca_warn(dev, "SW2HW_SRQ failed (%d)\n", err);
+ goto err_out_free_buf;
+ }
+ if (status) {
+ mthca_warn(dev, "SW2HW_SRQ returned status 0x%02x\n",
+ status);
+ err = -EINVAL;
+ goto err_out_free_buf;
+ }
+
+ spin_lock_irq(&dev->srq_table.lock);
+ if (mthca_array_set(&dev->srq_table.srq,
+ srq->srqn & (dev->limits.num_srqs - 1),
+ srq)) {
+ spin_unlock_irq(&dev->srq_table.lock);
+ goto err_out_free_srq;
+ }
+ spin_unlock_irq(&dev->srq_table.lock);
+
+ mthca_free_mailbox(dev, mailbox);
+
+ srq->first_free = 0;
+ srq->last_free = srq->max - 1;
+
+ attr->max_wr = srq->max - 1;
+ attr->max_sge = srq->max_gs;
+
+ return 0;
+
+err_out_free_srq:
+ err = mthca_HW2SW_SRQ(dev, mailbox, srq->srqn, &status);
+ if (err)
+ mthca_warn(dev, "HW2SW_SRQ failed (%d)\n", err);
+ else if (status)
+ mthca_warn(dev, "HW2SW_SRQ returned status 0x%02x\n", status);
+
+err_out_free_buf:
+ if (!pd->ibpd.uobject)
+ mthca_free_srq_buf(dev, srq);
+
+err_out_mailbox:
+ mthca_free_mailbox(dev, mailbox);
+
+err_out_db:
+ if (!pd->ibpd.uobject && mthca_is_memfree(dev))
+ mthca_free_db(dev, MTHCA_DB_TYPE_SRQ, srq->db_index);
+
+err_out_icm:
+ mthca_table_put(dev, dev->srq_table.table, srq->srqn);
+
+err_out:
+ mthca_free(&dev->srq_table.alloc, srq->srqn);
+
+ return err;
+}
+
+static inline int get_srq_refcount(struct mthca_dev *dev, struct mthca_srq *srq)
+{
+ int c;
+
+ spin_lock_irq(&dev->srq_table.lock);
+ c = srq->refcount;
+ spin_unlock_irq(&dev->srq_table.lock);
+
+ return c;
+}
+
+void mthca_free_srq(struct mthca_dev *dev, struct mthca_srq *srq)
+{
+ struct mthca_mailbox *mailbox;
+ int err;
+ u8 status;
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox)) {
+ mthca_warn(dev, "No memory for mailbox to free SRQ.\n");
+ return;
+ }
+
+ err = mthca_HW2SW_SRQ(dev, mailbox, srq->srqn, &status);
+ if (err)
+ mthca_warn(dev, "HW2SW_SRQ failed (%d)\n", err);
+ else if (status)
+ mthca_warn(dev, "HW2SW_SRQ returned status 0x%02x\n", status);
+
+ spin_lock_irq(&dev->srq_table.lock);
+ mthca_array_clear(&dev->srq_table.srq,
+ srq->srqn & (dev->limits.num_srqs - 1));
+ --srq->refcount;
+ spin_unlock_irq(&dev->srq_table.lock);
+
+ wait_event(srq->wait, !get_srq_refcount(dev, srq));
+
+ if (!srq->ibsrq.uobject) {
+ mthca_free_srq_buf(dev, srq);
+ if (mthca_is_memfree(dev))
+ mthca_free_db(dev, MTHCA_DB_TYPE_SRQ, srq->db_index);
+ }
+
+ mthca_table_put(dev, dev->srq_table.table, srq->srqn);
+ mthca_free(&dev->srq_table.alloc, srq->srqn);
+ mthca_free_mailbox(dev, mailbox);
+}
+
+int mthca_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+ enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
+{
+ struct mthca_dev *dev = to_mdev(ibsrq->device);
+ struct mthca_srq *srq = to_msrq(ibsrq);
+ int ret;
+ u8 status;
+
+ /* We don't support resizing SRQs (yet?) */
+ if (attr_mask & IB_SRQ_MAX_WR)
+ return -EINVAL;
+
+ if (attr_mask & IB_SRQ_LIMIT) {
+ u32 max_wr = mthca_is_memfree(dev) ? srq->max - 1 : srq->max;
+ if (attr->srq_limit > max_wr)
+ return -EINVAL;
+
+ mutex_lock(&srq->mutex);
+ ret = mthca_ARM_SRQ(dev, srq->srqn, attr->srq_limit, &status);
+ mutex_unlock(&srq->mutex);
+
+ if (ret)
+ return ret;
+ if (status)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int mthca_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr)
+{
+ struct mthca_dev *dev = to_mdev(ibsrq->device);
+ struct mthca_srq *srq = to_msrq(ibsrq);
+ struct mthca_mailbox *mailbox;
+ struct mthca_arbel_srq_context *arbel_ctx;
+ struct mthca_tavor_srq_context *tavor_ctx;
+ u8 status;
+ int err;
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+
+ err = mthca_QUERY_SRQ(dev, srq->srqn, mailbox, &status);
+ if (err)
+ goto out;
+
+ if (mthca_is_memfree(dev)) {
+ arbel_ctx = mailbox->buf;
+ srq_attr->srq_limit = be16_to_cpu(arbel_ctx->limit_watermark);
+ } else {
+ tavor_ctx = mailbox->buf;
+ srq_attr->srq_limit = be16_to_cpu(tavor_ctx->limit_watermark);
+ }
+
+ srq_attr->max_wr = srq->max - 1;
+ srq_attr->max_sge = srq->max_gs;
+
+out:
+ mthca_free_mailbox(dev, mailbox);
+
+ return err;
+}
+
+void mthca_srq_event(struct mthca_dev *dev, u32 srqn,
+ enum ib_event_type event_type)
+{
+ struct mthca_srq *srq;
+ struct ib_event event;
+
+ spin_lock(&dev->srq_table.lock);
+ srq = mthca_array_get(&dev->srq_table.srq, srqn & (dev->limits.num_srqs - 1));
+ if (srq)
+ ++srq->refcount;
+ spin_unlock(&dev->srq_table.lock);
+
+ if (!srq) {
+ mthca_warn(dev, "Async event for bogus SRQ %08x\n", srqn);
+ return;
+ }
+
+ if (!srq->ibsrq.event_handler)
+ goto out;
+
+ event.device = &dev->ib_dev;
+ event.event = event_type;
+ event.element.srq = &srq->ibsrq;
+ srq->ibsrq.event_handler(&event, srq->ibsrq.srq_context);
+
+out:
+ spin_lock(&dev->srq_table.lock);
+ if (!--srq->refcount)
+ wake_up(&srq->wait);
+ spin_unlock(&dev->srq_table.lock);
+}
+
+/*
+ * This function must be called with IRQs disabled.
+ */
+void mthca_free_srq_wqe(struct mthca_srq *srq, u32 wqe_addr)
+{
+ int ind;
+ struct mthca_next_seg *last_free;
+
+ ind = wqe_addr >> srq->wqe_shift;
+
+ spin_lock(&srq->lock);
+
+ last_free = get_wqe(srq, srq->last_free);
+ *wqe_to_link(last_free) = ind;
+ last_free->nda_op = htonl((ind << srq->wqe_shift) | 1);
+ *wqe_to_link(get_wqe(srq, ind)) = -1;
+ srq->last_free = ind;
+
+ spin_unlock(&srq->lock);
+}
+
+int mthca_tavor_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr)
+{
+ struct mthca_dev *dev = to_mdev(ibsrq->device);
+ struct mthca_srq *srq = to_msrq(ibsrq);
+ unsigned long flags;
+ int err = 0;
+ int first_ind;
+ int ind;
+ int next_ind;
+ int nreq;
+ int i;
+ void *wqe;
+ void *prev_wqe;
+
+ spin_lock_irqsave(&srq->lock, flags);
+
+ first_ind = srq->first_free;
+
+ for (nreq = 0; wr; wr = wr->next) {
+ ind = srq->first_free;
+ wqe = get_wqe(srq, ind);
+ next_ind = *wqe_to_link(wqe);
+
+ if (unlikely(next_ind < 0)) {
+ mthca_err(dev, "SRQ %06x full\n", srq->srqn);
+ err = -ENOMEM;
+ *bad_wr = wr;
+ break;
+ }
+
+ prev_wqe = srq->last;
+ srq->last = wqe;
+
+ ((struct mthca_next_seg *) wqe)->ee_nds = 0;
+ /* flags field will always remain 0 */
+
+ wqe += sizeof (struct mthca_next_seg);
+
+ if (unlikely(wr->num_sge > srq->max_gs)) {
+ err = -EINVAL;
+ *bad_wr = wr;
+ srq->last = prev_wqe;
+ break;
+ }
+
+ for (i = 0; i < wr->num_sge; ++i) {
+ mthca_set_data_seg(wqe, wr->sg_list + i);
+ wqe += sizeof (struct mthca_data_seg);
+ }
+
+ if (i < srq->max_gs)
+ mthca_set_data_seg_inval(wqe);
+
+ ((struct mthca_next_seg *) prev_wqe)->ee_nds =
+ cpu_to_be32(MTHCA_NEXT_DBD);
+
+ srq->wrid[ind] = wr->wr_id;
+ srq->first_free = next_ind;
+
+ ++nreq;
+ if (unlikely(nreq == MTHCA_TAVOR_MAX_WQES_PER_RECV_DB)) {
+ nreq = 0;
+
+ /*
+ * Make sure that descriptors are written
+ * before doorbell is rung.
+ */
+ wmb();
+
+ mthca_write64(first_ind << srq->wqe_shift, srq->srqn << 8,
+ dev->kar + MTHCA_RECEIVE_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+
+ first_ind = srq->first_free;
+ }
+ }
+
+ if (likely(nreq)) {
+ /*
+ * Make sure that descriptors are written before
+ * doorbell is rung.
+ */
+ wmb();
+
+ mthca_write64(first_ind << srq->wqe_shift, (srq->srqn << 8) | nreq,
+ dev->kar + MTHCA_RECEIVE_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+ }
+
+ /*
+ * Make sure doorbells don't leak out of SRQ spinlock and
+ * reach the HCA out of order:
+ */
+ mmiowb();
+
+ spin_unlock_irqrestore(&srq->lock, flags);
+ return err;
+}
+
+int mthca_arbel_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr)
+{
+ struct mthca_dev *dev = to_mdev(ibsrq->device);
+ struct mthca_srq *srq = to_msrq(ibsrq);
+ unsigned long flags;
+ int err = 0;
+ int ind;
+ int next_ind;
+ int nreq;
+ int i;
+ void *wqe;
+
+ spin_lock_irqsave(&srq->lock, flags);
+
+ for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ ind = srq->first_free;
+ wqe = get_wqe(srq, ind);
+ next_ind = *wqe_to_link(wqe);
+
+ if (unlikely(next_ind < 0)) {
+ mthca_err(dev, "SRQ %06x full\n", srq->srqn);
+ err = -ENOMEM;
+ *bad_wr = wr;
+ break;
+ }
+
+ ((struct mthca_next_seg *) wqe)->ee_nds = 0;
+ /* flags field will always remain 0 */
+
+ wqe += sizeof (struct mthca_next_seg);
+
+ if (unlikely(wr->num_sge > srq->max_gs)) {
+ err = -EINVAL;
+ *bad_wr = wr;
+ break;
+ }
+
+ for (i = 0; i < wr->num_sge; ++i) {
+ mthca_set_data_seg(wqe, wr->sg_list + i);
+ wqe += sizeof (struct mthca_data_seg);
+ }
+
+ if (i < srq->max_gs)
+ mthca_set_data_seg_inval(wqe);
+
+ srq->wrid[ind] = wr->wr_id;
+ srq->first_free = next_ind;
+ }
+
+ if (likely(nreq)) {
+ srq->counter += nreq;
+
+ /*
+ * Make sure that descriptors are written before
+ * we write doorbell record.
+ */
+ wmb();
+ *srq->db = cpu_to_be32(srq->counter);
+ }
+
+ spin_unlock_irqrestore(&srq->lock, flags);
+ return err;
+}
+
+int mthca_max_srq_sge(struct mthca_dev *dev)
+{
+ if (mthca_is_memfree(dev))
+ return dev->limits.max_sg;
+
+ /*
+ * SRQ allocations are based on powers of 2 for Tavor,
+ * (although they only need to be multiples of 16 bytes).
+ *
+ * Therefore, we need to base the max number of sg entries on
+ * the largest power of 2 descriptor size that is <= to the
+ * actual max WQE descriptor size, rather than return the
+ * max_sg value given by the firmware (which is based on WQE
+ * sizes as multiples of 16, not powers of 2).
+ *
+ * If SRQ implementation is changed for Tavor to be based on
+ * multiples of 16, the calculation below can be deleted and
+ * the FW max_sg value returned.
+ */
+ return min_t(int, dev->limits.max_sg,
+ ((1 << (fls(dev->limits.max_desc_sz) - 1)) -
+ sizeof (struct mthca_next_seg)) /
+ sizeof (struct mthca_data_seg));
+}
+
+int mthca_init_srq_table(struct mthca_dev *dev)
+{
+ int err;
+
+ if (!(dev->mthca_flags & MTHCA_FLAG_SRQ))
+ return 0;
+
+ spin_lock_init(&dev->srq_table.lock);
+
+ err = mthca_alloc_init(&dev->srq_table.alloc,
+ dev->limits.num_srqs,
+ dev->limits.num_srqs - 1,
+ dev->limits.reserved_srqs);
+ if (err)
+ return err;
+
+ err = mthca_array_init(&dev->srq_table.srq,
+ dev->limits.num_srqs);
+ if (err)
+ mthca_alloc_cleanup(&dev->srq_table.alloc);
+
+ return err;
+}
+
+void mthca_cleanup_srq_table(struct mthca_dev *dev)
+{
+ if (!(dev->mthca_flags & MTHCA_FLAG_SRQ))
+ return;
+
+ mthca_array_cleanup(&dev->srq_table.srq, dev->limits.num_srqs);
+ mthca_alloc_cleanup(&dev->srq_table.alloc);
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_uar.c b/sys/ofed/drivers/infiniband/hw/mthca/mthca_uar.c
new file mode 100644
index 0000000..ca5900c
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_uar.c
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <asm/page.h> /* PAGE_SHIFT */
+
+#include "mthca_dev.h"
+#include "mthca_memfree.h"
+
+int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar)
+{
+ uar->index = mthca_alloc(&dev->uar_table.alloc);
+ if (uar->index == -1)
+ return -ENOMEM;
+
+ uar->pfn = (pci_resource_start(dev->pdev, 2) >> PAGE_SHIFT) + uar->index;
+
+ return 0;
+}
+
+void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar)
+{
+ mthca_free(&dev->uar_table.alloc, uar->index);
+}
+
+int mthca_init_uar_table(struct mthca_dev *dev)
+{
+ int ret;
+
+ ret = mthca_alloc_init(&dev->uar_table.alloc,
+ dev->limits.num_uars,
+ dev->limits.num_uars - 1,
+ dev->limits.reserved_uars + 1);
+ if (ret)
+ return ret;
+
+ ret = mthca_init_db_tab(dev);
+ if (ret)
+ mthca_alloc_cleanup(&dev->uar_table.alloc);
+
+ return ret;
+}
+
+void mthca_cleanup_uar_table(struct mthca_dev *dev)
+{
+ mthca_cleanup_db_tab(dev);
+
+ /* XXX check if any UARs are still allocated? */
+ mthca_alloc_cleanup(&dev->uar_table.alloc);
+}
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_user.h b/sys/ofed/drivers/infiniband/hw/mthca/mthca_user.h
new file mode 100644
index 0000000..5fe56e8
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_user.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MTHCA_USER_H
+#define MTHCA_USER_H
+
+#include <linux/types.h>
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define MTHCA_UVERBS_ABI_VERSION 1
+
+/*
+ * Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ * In particular do not use pointer types -- pass pointers in __u64
+ * instead.
+ */
+
+struct mthca_alloc_ucontext_resp {
+ __u32 qp_tab_size;
+ __u32 uarc_size;
+};
+
+struct mthca_alloc_pd_resp {
+ __u32 pdn;
+ __u32 reserved;
+};
+
+struct mthca_reg_mr {
+/*
+ * Mark the memory region with a DMA attribute that causes
+ * in-flight DMA to be flushed when the region is written to:
+ */
+#define MTHCA_MR_DMASYNC 0x1
+ __u32 mr_attrs;
+ __u32 reserved;
+};
+
+struct mthca_create_cq {
+ __u32 lkey;
+ __u32 pdn;
+ __u64 arm_db_page;
+ __u64 set_db_page;
+ __u32 arm_db_index;
+ __u32 set_db_index;
+};
+
+struct mthca_create_cq_resp {
+ __u32 cqn;
+ __u32 reserved;
+};
+
+struct mthca_resize_cq {
+ __u32 lkey;
+ __u32 reserved;
+};
+
+struct mthca_create_srq {
+ __u32 lkey;
+ __u32 db_index;
+ __u64 db_page;
+};
+
+struct mthca_create_srq_resp {
+ __u32 srqn;
+ __u32 reserved;
+};
+
+struct mthca_create_qp {
+ __u32 lkey;
+ __u32 reserved;
+ __u64 sq_db_page;
+ __u64 rq_db_page;
+ __u32 sq_db_index;
+ __u32 rq_db_index;
+};
+
+#endif /* MTHCA_USER_H */
diff --git a/sys/ofed/drivers/infiniband/hw/mthca/mthca_wqe.h b/sys/ofed/drivers/infiniband/hw/mthca/mthca_wqe.h
new file mode 100644
index 0000000..341a5ae
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/hw/mthca/mthca_wqe.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2005 Cisco Systems. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MTHCA_WQE_H
+#define MTHCA_WQE_H
+
+#include <linux/types.h>
+
+enum {
+ MTHCA_NEXT_DBD = 1 << 7,
+ MTHCA_NEXT_FENCE = 1 << 6,
+ MTHCA_NEXT_CQ_UPDATE = 1 << 3,
+ MTHCA_NEXT_EVENT_GEN = 1 << 2,
+ MTHCA_NEXT_SOLICIT = 1 << 1,
+ MTHCA_NEXT_IP_CSUM = 1 << 4,
+ MTHCA_NEXT_TCP_UDP_CSUM = 1 << 5,
+
+ MTHCA_MLX_VL15 = 1 << 17,
+ MTHCA_MLX_SLR = 1 << 16
+};
+
+enum {
+ MTHCA_INVAL_LKEY = 0x100,
+ MTHCA_TAVOR_MAX_WQES_PER_RECV_DB = 256,
+ MTHCA_ARBEL_MAX_WQES_PER_SEND_DB = 255
+};
+
+struct mthca_next_seg {
+ __be32 nda_op; /* [31:6] next WQE [4:0] next opcode */
+ __be32 ee_nds; /* [31:8] next EE [7] DBD [6] F [5:0] next WQE size */
+ __be32 flags; /* [3] CQ [2] Event [1] Solicit */
+ __be32 imm; /* immediate data */
+};
+
+struct mthca_tavor_ud_seg {
+ u32 reserved1;
+ __be32 lkey;
+ __be64 av_addr;
+ u32 reserved2[4];
+ __be32 dqpn;
+ __be32 qkey;
+ u32 reserved3[2];
+};
+
+struct mthca_arbel_ud_seg {
+ __be32 av[8];
+ __be32 dqpn;
+ __be32 qkey;
+ u32 reserved[2];
+};
+
+struct mthca_bind_seg {
+ __be32 flags; /* [31] Atomic [30] rem write [29] rem read */
+ u32 reserved;
+ __be32 new_rkey;
+ __be32 lkey;
+ __be64 addr;
+ __be64 length;
+};
+
+struct mthca_raddr_seg {
+ __be64 raddr;
+ __be32 rkey;
+ u32 reserved;
+};
+
+struct mthca_atomic_seg {
+ __be64 swap_add;
+ __be64 compare;
+};
+
+struct mthca_data_seg {
+ __be32 byte_count;
+ __be32 lkey;
+ __be64 addr;
+};
+
+struct mthca_mlx_seg {
+ __be32 nda_op;
+ __be32 nds;
+ __be32 flags; /* [17] VL15 [16] SLR [14:12] static rate
+ [11:8] SL [3] C [2] E */
+ __be16 rlid;
+ __be16 vcrc;
+};
+
+static __always_inline void mthca_set_data_seg(struct mthca_data_seg *dseg,
+ struct ib_sge *sg)
+{
+ dseg->byte_count = cpu_to_be32(sg->length);
+ dseg->lkey = cpu_to_be32(sg->lkey);
+ dseg->addr = cpu_to_be64(sg->addr);
+}
+
+static __always_inline void mthca_set_data_seg_inval(struct mthca_data_seg *dseg)
+{
+ dseg->byte_count = 0;
+ dseg->lkey = cpu_to_be32(MTHCA_INVAL_LKEY);
+ dseg->addr = 0;
+}
+
+#endif /* MTHCA_WQE_H */
diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/Kconfig b/sys/ofed/drivers/infiniband/ulp/ipoib/Kconfig
new file mode 100644
index 0000000..9d9a9dc
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/ipoib/Kconfig
@@ -0,0 +1,50 @@
+config INFINIBAND_IPOIB
+ tristate "IP-over-InfiniBand"
+ depends on NETDEVICES && INET && (IPV6 || IPV6=n)
+ select INET_LRO
+ ---help---
+ Support for the IP-over-InfiniBand protocol (IPoIB). This
+ transports IP packets over InfiniBand so you can use your IB
+ device as a fancy NIC.
+
+ See Documentation/infiniband/ipoib.txt for more information
+
+config INFINIBAND_IPOIB_CM
+ bool "IP-over-InfiniBand Connected Mode support"
+ depends on INFINIBAND_IPOIB
+ default n
+ ---help---
+ This option enables support for IPoIB connected mode. After
+ enabling this option, you need to switch to connected mode
+ through /sys/class/net/ibXXX/mode to actually create
+ connections, and then increase the interface MTU with
+ e.g. ifconfig ib0 mtu 65520.
+
+ WARNING: Enabling connected mode will trigger some packet
+ drops for multicast and UD mode traffic from this interface,
+ unless you limit mtu for these destinations to 2044.
+
+config INFINIBAND_IPOIB_DEBUG
+ bool "IP-over-InfiniBand debugging" if EMBEDDED
+ depends on INFINIBAND_IPOIB
+ default y
+ ---help---
+ This option causes debugging code to be compiled into the
+ IPoIB driver. The output can be turned on via the
+ debug_level and mcast_debug_level module parameters (which
+ can also be set after the driver is loaded through sysfs).
+
+ This option also creates a directory tree under ipoib/ in
+ debugfs, which contains files that expose debugging
+ information about IB multicast groups used by the IPoIB
+ driver.
+
+config INFINIBAND_IPOIB_DEBUG_DATA
+ bool "IP-over-InfiniBand data path debugging"
+ depends on INFINIBAND_IPOIB_DEBUG
+ ---help---
+ This option compiles debugging code into the data path
+ of the IPoIB driver. The output can be turned on via the
+ data_debug_level module parameter; however, even with output
+ turned off, this debugging code will have some performance
+ impact.
diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/Makefile b/sys/ofed/drivers/infiniband/ulp/ipoib/Makefile
new file mode 100644
index 0000000..3090100
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/ipoib/Makefile
@@ -0,0 +1,11 @@
+obj-$(CONFIG_INFINIBAND_IPOIB) += ib_ipoib.o
+
+ib_ipoib-y := ipoib_main.o \
+ ipoib_ib.o \
+ ipoib_multicast.o \
+ ipoib_verbs.o \
+ ipoib_vlan.o \
+ ipoib_ethtool.o
+ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_CM) += ipoib_cm.o
+ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_DEBUG) += ipoib_fs.o
+
diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h
new file mode 100644
index 0000000..1d6ae84
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -0,0 +1,757 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IPOIB_H
+#define _IPOIB_H
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_ofed.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/random.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/sockio.h>
+#include <sys/sysctl.h>
+
+#include <net/if.h>
+#include <net/if_arp.h>
+#include <net/netisr.h>
+#include <net/route.h>
+#include <net/if_llc.h>
+#include <net/if_dl.h>
+#include <net/if_types.h>
+#include <net/bpf.h>
+#include <net/if_llatbl.h>
+#include <net/vnet.h>
+
+#if defined(INET) || defined(INET6)
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/if_ether.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_fw.h>
+#include <netinet/ipfw/ip_fw_private.h>
+#endif
+#ifdef INET6
+#include <netinet6/nd6.h>
+#endif
+
+#include <security/mac/mac_framework.h>
+
+#include <linux/list.h>
+
+#include <linux/workqueue.h>
+#include <linux/kref.h>
+#include <linux/mutex.h>
+
+#include <asm/atomic.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_sa.h>
+
+/* constants */
+
+#define INFINIBAND_ALEN 20 /* Octets in IPoIB HW addr */
+
+#ifdef IPOIB_CM
+#define CONFIG_INFINIBAND_IPOIB_CM
+#endif
+
+#ifdef IPOIB_DEBUG
+#define CONFIG_INFINIBAND_IPOIB_DEBUG
+#define CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
+#endif
+
+enum ipoib_flush_level {
+ IPOIB_FLUSH_LIGHT,
+ IPOIB_FLUSH_NORMAL,
+ IPOIB_FLUSH_HEAVY
+};
+
+enum {
+ IPOIB_ENCAP_LEN = 4,
+ IPOIB_HEADER_LEN = IPOIB_ENCAP_LEN + INFINIBAND_ALEN,
+ IPOIB_UD_MAX_MTU = 4 * 1024,
+ IPOIB_UD_RX_SG = (IPOIB_UD_MAX_MTU / MJUMPAGESIZE),
+ IPOIB_UD_TX_SG = (IPOIB_UD_MAX_MTU / MCLBYTES) + 2,
+ IPOIB_CM_MAX_MTU = (64 * 1024),
+ IPOIB_CM_TX_SG = (IPOIB_CM_MAX_MTU / MCLBYTES) + 2,
+ IPOIB_CM_RX_SG = (IPOIB_CM_MAX_MTU / MJUMPAGESIZE),
+ IPOIB_RX_RING_SIZE = 256,
+ IPOIB_TX_RING_SIZE = 128,
+ IPOIB_MAX_RX_SG = MAX(IPOIB_CM_RX_SG, IPOIB_UD_RX_SG),
+ IPOIB_MAX_TX_SG = MAX(IPOIB_CM_TX_SG, IPOIB_UD_TX_SG),
+ IPOIB_MAX_QUEUE_SIZE = 8192,
+ IPOIB_MIN_QUEUE_SIZE = 2,
+ IPOIB_CM_MAX_CONN_QP = 4096,
+
+ IPOIB_NUM_WC = 4,
+
+ IPOIB_MAX_PATH_REC_QUEUE = 3,
+ IPOIB_MAX_MCAST_QUEUE = 3,
+
+ IPOIB_FLAG_OPER_UP = 0,
+ IPOIB_FLAG_INITIALIZED = 1,
+ IPOIB_FLAG_ADMIN_UP = 2,
+ IPOIB_PKEY_ASSIGNED = 3,
+ IPOIB_PKEY_STOP = 4,
+ IPOIB_FLAG_SUBINTERFACE = 5,
+ IPOIB_MCAST_RUN = 6,
+ IPOIB_STOP_REAPER = 7,
+ IPOIB_FLAG_UMCAST = 10,
+ IPOIB_FLAG_CSUM = 11,
+
+ IPOIB_MAX_BACKOFF_SECONDS = 16,
+
+ IPOIB_MCAST_FLAG_FOUND = 0, /* used in set_multicast_list */
+ IPOIB_MCAST_FLAG_SENDONLY = 1,
+ IPOIB_MCAST_FLAG_BUSY = 2, /* joining or already joined */
+ IPOIB_MCAST_FLAG_ATTACHED = 3,
+
+ IPOIB_MAX_LRO_DESCRIPTORS = 8,
+ IPOIB_LRO_MAX_AGGR = 64,
+
+ MAX_SEND_CQE = 16,
+ IPOIB_CM_COPYBREAK = 256,
+};
+
+#define IPOIB_OP_RECV (1ul << 31)
+#ifdef CONFIG_INFINIBAND_IPOIB_CM
+#define IPOIB_OP_CM (1ul << 30)
+#else
+#define IPOIB_OP_CM (0)
+#endif
+
+/* structs */
+
+struct ipoib_header {
+ u8 hwaddr[INFINIBAND_ALEN];
+ __be16 proto;
+ u16 reserved;
+};
+
+struct ipoib_pseudoheader {
+ u8 hwaddr[INFINIBAND_ALEN];
+};
+
+/* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */
+struct ipoib_mcast {
+ struct ib_sa_mcmember_rec mcmember;
+ struct ib_sa_multicast *mc;
+ struct ipoib_ah *ah;
+
+ struct rb_node rb_node;
+ struct list_head list;
+
+ unsigned long created;
+ unsigned long backoff;
+
+ unsigned long flags;
+ unsigned char logcount;
+
+ struct ifqueue pkt_queue;
+
+ struct ipoib_dev_priv *priv;
+};
+
+struct ipoib_cm_rx_buf {
+ struct mbuf *mb;
+ u64 mapping[IPOIB_CM_RX_SG];
+};
+
+struct ipoib_cm_tx_buf {
+ struct mbuf *mb;
+ u64 mapping[IPOIB_CM_TX_SG];
+};
+
+struct ipoib_rx_buf {
+ struct mbuf *mb;
+ u64 mapping[IPOIB_UD_RX_SG];
+};
+
+struct ipoib_tx_buf {
+ struct mbuf *mb;
+ u64 mapping[IPOIB_UD_TX_SG];
+};
+
+struct ib_cm_id;
+
+struct ipoib_cm_data {
+ __be32 qpn; /* High byte MUST be ignored on receive */
+ __be32 mtu;
+};
+
+/*
+ * Quoting 10.3.1 Queue Pair and EE Context States:
+ *
+ * Note, for QPs that are associated with an SRQ, the Consumer should take the
+ * QP through the Error State before invoking a Destroy QP or a Modify QP to the
+ * Reset State. The Consumer may invoke the Destroy QP without first performing
+ * a Modify QP to the Error State and waiting for the Affiliated Asynchronous
+ * Last WQE Reached Event. However, if the Consumer does not wait for the
+ * Affiliated Asynchronous Last WQE Reached Event, then WQE and Data Segment
+ * leakage may occur. Therefore, it is good programming practice to tear down a
+ * QP that is associated with an SRQ by using the following process:
+ *
+ * - Put the QP in the Error State
+ * - Wait for the Affiliated Asynchronous Last WQE Reached Event;
+ * - either:
+ * drain the CQ by invoking the Poll CQ verb and either wait for CQ
+ * to be empty or the number of Poll CQ operations has exceeded
+ * CQ capacity size;
+ * - or
+ * post another WR that completes on the same CQ and wait for this
+ * WR to return as a WC;
+ * - and then invoke a Destroy QP or Reset QP.
+ *
+ * We use the second option and wait for a completion on the
+ * same CQ before destroying QPs attached to our SRQ.
+ */
+
+enum ipoib_cm_state {
+ IPOIB_CM_RX_LIVE,
+ IPOIB_CM_RX_ERROR, /* Ignored by stale task */
+ IPOIB_CM_RX_FLUSH /* Last WQE Reached event observed */
+};
+
+struct ipoib_cm_rx {
+ struct ib_cm_id *id;
+ struct ib_qp *qp;
+ struct ipoib_cm_rx_buf *rx_ring;
+ struct list_head list;
+ struct ipoib_dev_priv *priv;
+ unsigned long jiffies;
+ enum ipoib_cm_state state;
+ int recv_count;
+};
+
+struct ipoib_cm_tx {
+ struct ib_cm_id *id;
+ struct ib_qp *qp;
+ struct list_head list;
+ struct ipoib_dev_priv *priv;
+ struct ipoib_path *path;
+ struct ipoib_cm_tx_buf *tx_ring;
+ unsigned tx_head;
+ unsigned tx_tail;
+ unsigned long flags;
+ u32 mtu; /* remote specified mtu, with grh. */
+};
+
+struct ipoib_cm_dev_priv {
+ struct ib_srq *srq;
+ struct ipoib_cm_rx_buf *srq_ring;
+ struct ib_cm_id *id;
+ struct list_head passive_ids; /* state: LIVE */
+ struct list_head rx_error_list; /* state: ERROR */
+ struct list_head rx_flush_list; /* state: FLUSH, drain not started */
+ struct list_head rx_drain_list; /* state: FLUSH, drain started */
+ struct list_head rx_reap_list; /* state: FLUSH, drain done */
+ struct work_struct start_task;
+ struct work_struct reap_task;
+ struct work_struct mb_task;
+ struct work_struct rx_reap_task;
+ struct delayed_work stale_task;
+ struct ifqueue mb_queue;
+ struct list_head start_list;
+ struct list_head reap_list;
+ struct ib_sge rx_sge[IPOIB_CM_RX_SG];
+ struct ib_recv_wr rx_wr;
+ int nonsrq_conn_qp;
+ int max_cm_mtu; /* Actual buf size. */
+ int num_frags;
+};
+
+struct ipoib_ethtool_st {
+ u16 coalesce_usecs;
+ u16 max_coalesced_frames;
+};
+
+/*
+ * Device private locking: network stack tx_lock protects members used
+ * in TX fast path, lock protects everything else. lock nests inside
+ * of tx_lock (ie tx_lock must be acquired first if needed).
+ */
+struct ipoib_dev_priv {
+ spinlock_t lock;
+
+ struct ifnet *dev;
+
+ u8 broadcastaddr[INFINIBAND_ALEN];
+
+ unsigned long flags;
+
+ struct mutex vlan_mutex;
+
+ struct rb_root path_tree;
+ struct list_head path_list;
+
+ struct ipoib_mcast *broadcast;
+ struct list_head multicast_list;
+ struct rb_root multicast_tree;
+
+ struct delayed_work pkey_poll_task;
+ struct delayed_work mcast_task;
+ struct work_struct carrier_on_task;
+ struct work_struct flush_light;
+ struct work_struct flush_normal;
+ struct work_struct flush_heavy;
+ struct work_struct restart_task;
+ struct delayed_work ah_reap_task;
+
+ struct ib_device *ca;
+ u8 port;
+ u16 pkey;
+ u16 pkey_index;
+ struct ib_pd *pd;
+ struct ib_mr *mr;
+ struct ib_cq *recv_cq;
+ struct ib_cq *send_cq;
+ struct ib_qp *qp;
+ u32 qkey;
+
+ union ib_gid local_gid;
+ u16 local_lid;
+
+ unsigned int admin_mtu; /* User selected MTU, no GRH. */
+ unsigned int mcast_mtu; /* Minus GRH bytes, from mcast group. */
+ unsigned int max_ib_mtu; /* Without header, actual buf size. */
+
+ struct ipoib_rx_buf *rx_ring;
+
+ struct ipoib_tx_buf *tx_ring;
+ unsigned tx_head;
+ unsigned tx_tail;
+ struct ib_sge tx_sge[IPOIB_MAX_TX_SG];
+ struct ib_send_wr tx_wr;
+ unsigned tx_outstanding;
+ struct ib_wc send_wc[MAX_SEND_CQE];
+
+ struct ib_recv_wr rx_wr;
+ struct ib_sge rx_sge[IPOIB_MAX_RX_SG];
+
+ struct ib_wc ibwc[IPOIB_NUM_WC];
+
+ struct list_head dead_ahs;
+
+ struct ib_event_handler event_handler;
+
+ struct ifnet *parent;
+ struct list_head child_intfs;
+ struct list_head list;
+
+#ifdef CONFIG_INFINIBAND_IPOIB_CM
+ struct ipoib_cm_dev_priv cm;
+#endif
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+ struct list_head fs_list;
+ struct dentry *mcg_dentry;
+ struct dentry *path_dentry;
+#endif
+ int hca_caps;
+ struct ipoib_ethtool_st ethtool;
+ struct timer_list poll_timer;
+};
+
+struct ipoib_ah {
+ struct ipoib_dev_priv *priv;
+ struct ib_ah *ah;
+ struct list_head list;
+ struct kref ref;
+ unsigned last_send;
+};
+
+struct ipoib_path {
+ struct ipoib_dev_priv *priv;
+ struct rb_node rb_node;
+ struct list_head list;
+#ifdef CONFIG_INFINIBAND_IPOIB_CM
+ uint8_t hwaddr[INFINIBAND_ALEN];
+ struct ipoib_cm_tx *cm;
+#endif
+ struct ipoib_ah *ah;
+ struct ib_sa_path_rec pathrec;
+ struct ifqueue queue;
+
+ int query_id;
+ struct ib_sa_query *query;
+ struct completion done;
+
+ int valid;
+};
+
+/* UD Only transmits encap len but we want the two sizes to be symmetrical. */
+#define IPOIB_UD_MTU(ib_mtu) (ib_mtu - IPOIB_ENCAP_LEN)
+#define IPOIB_CM_MTU(ib_mtu) (ib_mtu - 0x10)
+
+#define IPOIB_IS_MULTICAST(addr) ((addr)[4] == 0xff)
+
+extern struct workqueue_struct *ipoib_workqueue;
+
+#define IPOIB_MTAP_PROTO(_ifp, _m, _proto) \
+do { \
+ if (bpf_peers_present((_ifp)->if_bpf)) { \
+ M_ASSERTVALID(_m); \
+ ipoib_mtap_proto((_ifp), (_m), (_proto)); \
+ } \
+} while (0)
+
+/* functions */
+void ipoib_mtap_proto(struct ifnet *ifp, struct mbuf *mb, uint16_t proto);
+void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr);
+void ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr);
+
+struct ipoib_ah *ipoib_create_ah(struct ipoib_dev_priv *,
+ struct ib_pd *pd, struct ib_ah_attr *attr);
+void ipoib_free_ah(struct kref *kref);
+static inline void ipoib_put_ah(struct ipoib_ah *ah)
+{
+ kref_put(&ah->ref, ipoib_free_ah);
+}
+
+int ipoib_open(struct ipoib_dev_priv *priv);
+int ipoib_add_pkey_attr(struct ipoib_dev_priv *priv);
+int ipoib_add_umcast_attr(struct ipoib_dev_priv *priv);
+
+void ipoib_demux(struct ifnet *ifp, struct mbuf *m, u_short proto);
+
+void ipoib_send(struct ipoib_dev_priv *priv, struct mbuf *mb,
+ struct ipoib_ah *address, u32 qpn);
+void ipoib_reap_ah(struct work_struct *work);
+
+void ipoib_mark_paths_invalid(struct ipoib_dev_priv *priv);
+void ipoib_flush_paths(struct ipoib_dev_priv *priv);
+struct ipoib_dev_priv *ipoib_intf_alloc(const char *format);
+
+int ipoib_ib_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca,
+ int port);
+void ipoib_ib_dev_flush_light(struct work_struct *work);
+void ipoib_ib_dev_flush_normal(struct work_struct *work);
+void ipoib_ib_dev_flush_heavy(struct work_struct *work);
+void ipoib_pkey_event(struct work_struct *work);
+void ipoib_ib_dev_cleanup(struct ipoib_dev_priv *priv);
+
+int ipoib_ib_dev_open(struct ipoib_dev_priv *priv);
+int ipoib_ib_dev_up(struct ipoib_dev_priv *priv);
+int ipoib_ib_dev_down(struct ipoib_dev_priv *priv, int flush);
+int ipoib_ib_dev_stop(struct ipoib_dev_priv *priv, int flush);
+
+int ipoib_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca, int port);
+void ipoib_dev_cleanup(struct ipoib_dev_priv *priv);
+
+void ipoib_mcast_join_task(struct work_struct *work);
+void ipoib_mcast_carrier_on_task(struct work_struct *work);
+void ipoib_mcast_send(struct ipoib_dev_priv *priv, void *mgid, struct mbuf *mb);
+
+void ipoib_mcast_restart_task(struct work_struct *work);
+void ipoib_mcast_restart(struct ipoib_dev_priv *);
+int ipoib_mcast_start_thread(struct ipoib_dev_priv *priv);
+int ipoib_mcast_stop_thread(struct ipoib_dev_priv *priv, int flush);
+
+void ipoib_mcast_dev_down(struct ipoib_dev_priv *priv);
+void ipoib_mcast_dev_flush(struct ipoib_dev_priv *priv);
+
+void ipoib_path_free(struct ipoib_dev_priv *priv, struct ipoib_path *path);
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct ipoib_dev_priv *priv);
+int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter);
+void ipoib_mcast_iter_read(struct ipoib_mcast_iter *iter,
+ union ib_gid *gid,
+ unsigned long *created,
+ unsigned int *queuelen,
+ unsigned int *complete,
+ unsigned int *send_only);
+
+struct ipoib_path_iter *ipoib_path_iter_init(struct ipoib_dev_priv *priv);
+int ipoib_path_iter_next(struct ipoib_path_iter *iter);
+void ipoib_path_iter_read(struct ipoib_path_iter *iter,
+ struct ipoib_path *path);
+#endif
+
+int ipoib_change_mtu(struct ipoib_dev_priv *priv, int new_mtu);
+
+int ipoib_mcast_attach(struct ipoib_dev_priv *priv, u16 mlid,
+ union ib_gid *mgid, int set_qkey);
+
+int ipoib_init_qp(struct ipoib_dev_priv *priv);
+int ipoib_transport_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca);
+void ipoib_transport_dev_cleanup(struct ipoib_dev_priv *priv);
+
+void ipoib_event(struct ib_event_handler *handler,
+ struct ib_event *record);
+
+void ipoib_pkey_poll(struct work_struct *work);
+int ipoib_pkey_dev_delay_open(struct ipoib_dev_priv *priv);
+void ipoib_drain_cq(struct ipoib_dev_priv *priv);
+
+int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req, int max);
+void ipoib_dma_unmap_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req);
+int ipoib_poll_tx(struct ipoib_dev_priv *priv);
+
+void ipoib_dma_unmap_rx(struct ipoib_dev_priv *priv, struct ipoib_rx_buf *rx_req);
+void ipoib_dma_mb(struct ipoib_dev_priv *priv, struct mbuf *mb, unsigned int length);
+struct mbuf *ipoib_alloc_map_mb(struct ipoib_dev_priv *priv, struct ipoib_rx_buf *rx_req, int size);
+
+
+void ipoib_set_ethtool_ops(struct ifnet *dev);
+int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca);
+
+#ifdef CONFIG_INFINIBAND_IPOIB_CM
+
+#define IPOIB_FLAGS_RC 0x80
+#define IPOIB_FLAGS_UC 0x40
+
+/* We don't support UC connections at the moment */
+#define IPOIB_CM_SUPPORTED(ha) (ha[0] & (IPOIB_FLAGS_RC))
+
+extern int ipoib_max_conn_qp;
+
+static inline int ipoib_cm_admin_enabled(struct ipoib_dev_priv *priv)
+{
+ return IPOIB_CM_SUPPORTED(IF_LLADDR(priv->dev));
+}
+
+static inline int ipoib_cm_enabled(struct ipoib_dev_priv *priv, uint8_t *hwaddr)
+{
+ return IPOIB_CM_SUPPORTED(hwaddr);
+}
+
+static inline int ipoib_cm_up(struct ipoib_path *path)
+
+{
+ return test_bit(IPOIB_FLAG_OPER_UP, &path->cm->flags);
+}
+
+static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_path *path)
+{
+ return path->cm;
+}
+
+static inline void ipoib_cm_set(struct ipoib_path *path, struct ipoib_cm_tx *tx)
+{
+ path->cm = tx;
+}
+
+static inline int ipoib_cm_has_srq(struct ipoib_dev_priv *priv)
+{
+ return !!priv->cm.srq;
+}
+
+static inline unsigned int ipoib_cm_max_mtu(struct ipoib_dev_priv *priv)
+{
+ return priv->cm.max_cm_mtu;
+}
+
+void ipoib_cm_send(struct ipoib_dev_priv *priv, struct mbuf *mb, struct ipoib_cm_tx *tx);
+int ipoib_cm_dev_open(struct ipoib_dev_priv *priv);
+void ipoib_cm_dev_stop(struct ipoib_dev_priv *priv);
+int ipoib_cm_dev_init(struct ipoib_dev_priv *priv);
+int ipoib_cm_add_mode_attr(struct ipoib_dev_priv *priv);
+void ipoib_cm_dev_cleanup(struct ipoib_dev_priv *priv);
+struct ipoib_cm_tx *ipoib_cm_create_tx(struct ipoib_dev_priv *priv,
+ struct ipoib_path *path);
+void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx);
+void ipoib_cm_mb_too_long(struct ipoib_dev_priv *priv, struct mbuf *mb,
+ unsigned int mtu);
+void ipoib_cm_handle_rx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc);
+void ipoib_cm_handle_tx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc);
+#else
+
+struct ipoib_cm_tx;
+
+#define ipoib_max_conn_qp 0
+
+static inline int ipoib_cm_admin_enabled(struct ipoib_dev_priv *priv)
+{
+ return 0;
+}
+static inline int ipoib_cm_enabled(struct ipoib_dev_priv *priv, uint8_t *hwaddr)
+
+{
+ return 0;
+}
+
+static inline int ipoib_cm_up(struct ipoib_path *path)
+
+{
+ return 0;
+}
+
+static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_path *path)
+{
+ return NULL;
+}
+
+static inline void ipoib_cm_set(struct ipoib_path *path, struct ipoib_cm_tx *tx)
+{
+}
+
+static inline int ipoib_cm_has_srq(struct ipoib_dev_priv *priv)
+{
+ return 0;
+}
+
+static inline unsigned int ipoib_cm_max_mtu(struct ipoib_dev_priv *priv)
+{
+ return 0;
+}
+
+static inline
+void ipoib_cm_send(struct ipoib_dev_priv *priv, struct mbuf *mb, struct ipoib_cm_tx *tx)
+{
+ return;
+}
+
+static inline
+int ipoib_cm_dev_open(struct ipoib_dev_priv *priv)
+{
+ return 0;
+}
+
+static inline
+void ipoib_cm_dev_stop(struct ipoib_dev_priv *priv)
+{
+ return;
+}
+
+static inline
+int ipoib_cm_dev_init(struct ipoib_dev_priv *priv)
+{
+ return -ENOSYS;
+}
+
+static inline
+void ipoib_cm_dev_cleanup(struct ipoib_dev_priv *priv)
+{
+ return;
+}
+
+static inline
+struct ipoib_cm_tx *ipoib_cm_create_tx(struct ipoib_dev_priv *priv, struct ipoib_path *path)
+{
+ return NULL;
+}
+
+static inline
+void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx)
+{
+ return;
+}
+
+static inline
+int ipoib_cm_add_mode_attr(struct ipoib_dev_priv *priv)
+{
+ return 0;
+}
+
+static inline void ipoib_cm_mb_too_long(struct ipoib_dev_priv *priv, struct mbuf *mb,
+ unsigned int mtu)
+{
+ m_freem(mb);
+}
+
+static inline void ipoib_cm_handle_rx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc)
+{
+}
+
+static inline void ipoib_cm_handle_tx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc)
+{
+}
+#endif
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+void ipoib_create_debug_files(struct ipoib_dev_priv *priv);
+void ipoib_delete_debug_files(struct ipoib_dev_priv *priv);
+int ipoib_register_debugfs(void);
+void ipoib_unregister_debugfs(void);
+#else
+static inline void ipoib_create_debug_files(struct ipoib_dev_priv *priv) { }
+static inline void ipoib_delete_debug_files(struct ipoib_dev_priv *priv) { }
+static inline int ipoib_register_debugfs(void) { return 0; }
+static inline void ipoib_unregister_debugfs(void) { }
+#endif
+
+#define ipoib_printk(level, priv, format, arg...) \
+ printk(level "%s: " format, if_name(((struct ipoib_dev_priv *) priv)->dev), ## arg)
+#define ipoib_warn(priv, format, arg...) \
+ ipoib_printk(KERN_WARNING, priv, format , ## arg)
+
+extern int ipoib_sendq_size;
+extern int ipoib_recvq_size;
+
+extern struct ib_sa_client ipoib_sa_client;
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+extern int ipoib_debug_level;
+
+#define ipoib_dbg(priv, format, arg...) \
+ do { \
+ if (ipoib_debug_level > 0) \
+ ipoib_printk(KERN_DEBUG, priv, format , ## arg); \
+ } while (0)
+#define ipoib_dbg_mcast(priv, format, arg...) \
+ do { \
+ if (mcast_debug_level > 0) \
+ ipoib_printk(KERN_DEBUG, priv, format , ## arg); \
+ } while (0)
+#else /* CONFIG_INFINIBAND_IPOIB_DEBUG */
+#define ipoib_dbg(priv, format, arg...) \
+ do { (void) (priv); } while (0)
+#define ipoib_dbg_mcast(priv, format, arg...) \
+ do { (void) (priv); } while (0)
+#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
+#define ipoib_dbg_data(priv, format, arg...) \
+ do { \
+ if (data_debug_level > 0) \
+ ipoib_printk(KERN_DEBUG, priv, format , ## arg); \
+ } while (0)
+#else /* CONFIG_INFINIBAND_IPOIB_DEBUG_DATA */
+#define ipoib_dbg_data(priv, format, arg...) \
+ do { (void) (priv); } while (0)
+#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG_DATA */
+
+#define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff)
+
+#endif /* _IPOIB_H */
diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_cm.c
new file mode 100644
index 0000000..2d0fd61
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -0,0 +1,1445 @@
+/*
+ * Copyright (c) 2006 Mellanox Technologies. All rights reserved
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ipoib.h"
+
+#ifdef CONFIG_INFINIBAND_IPOIB_CM
+
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h>
+#include <netinet/icmp6.h>
+
+#include <rdma/ib_cm.h>
+#include <rdma/ib_cache.h>
+#include <linux/delay.h>
+
+int ipoib_max_conn_qp = 128;
+
+module_param_named(max_nonsrq_conn_qp, ipoib_max_conn_qp, int, 0444);
+MODULE_PARM_DESC(max_nonsrq_conn_qp,
+ "Max number of connected-mode QPs per interface "
+ "(applied only if shared receive queue is not available)");
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
+static int data_debug_level;
+
+module_param_named(cm_data_debug_level, data_debug_level, int, 0644);
+MODULE_PARM_DESC(cm_data_debug_level,
+ "Enable data path debug tracing for connected mode if > 0");
+#endif
+
+#define IPOIB_CM_IETF_ID 0x1000000000000000ULL
+
+#define IPOIB_CM_RX_UPDATE_TIME (256 * HZ)
+#define IPOIB_CM_RX_TIMEOUT (2 * 256 * HZ)
+#define IPOIB_CM_RX_DELAY (3 * 256 * HZ)
+#define IPOIB_CM_RX_UPDATE_MASK (0x3)
+
+static struct ib_qp_attr ipoib_cm_err_attr = {
+ .qp_state = IB_QPS_ERR
+};
+
+#define IPOIB_CM_RX_DRAIN_WRID 0xffffffff
+
+static struct ib_send_wr ipoib_cm_rx_drain_wr = {
+ .wr_id = IPOIB_CM_RX_DRAIN_WRID,
+ .opcode = IB_WR_SEND,
+};
+
+static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
+ struct ib_cm_event *event);
+
+static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, struct ipoib_cm_rx_buf *rx_req)
+{
+
+ ipoib_dma_unmap_rx(priv, (struct ipoib_rx_buf *)rx_req);
+
+}
+
+static int ipoib_cm_post_receive_srq(struct ipoib_dev_priv *priv, int id)
+{
+ struct ib_recv_wr *bad_wr;
+ struct ipoib_rx_buf *rx_req;
+ struct mbuf *m;
+ int ret;
+ int i;
+
+ rx_req = (struct ipoib_rx_buf *)&priv->cm.srq_ring[id];
+ for (m = rx_req->mb, i = 0; m != NULL; m = m->m_next, i++) {
+ priv->cm.rx_sge[i].addr = rx_req->mapping[i];
+ priv->cm.rx_sge[i].length = m->m_len;
+ }
+
+ priv->cm.rx_wr.num_sge = i;
+ priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;
+
+ ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, &bad_wr);
+ if (unlikely(ret)) {
+ ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret);
+ ipoib_dma_unmap_rx(priv, rx_req);
+ m_freem(priv->cm.srq_ring[id].mb);
+ priv->cm.srq_ring[id].mb = NULL;
+ }
+
+ return ret;
+}
+
+static int ipoib_cm_post_receive_nonsrq(struct ipoib_dev_priv *priv,
+ struct ipoib_cm_rx *rx,
+ struct ib_recv_wr *wr,
+ struct ib_sge *sge, int id)
+{
+ struct ipoib_rx_buf *rx_req;
+ struct ib_recv_wr *bad_wr;
+ struct mbuf *m;
+ int ret;
+ int i;
+
+ rx_req = (struct ipoib_rx_buf *)&rx->rx_ring[id];
+ for (m = rx_req->mb, i = 0; m != NULL; m = m->m_next, i++) {
+ sge[i].addr = rx_req->mapping[i];
+ sge[i].length = m->m_len;
+ }
+
+ wr->num_sge = i;
+ wr->wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;
+
+ ret = ib_post_recv(rx->qp, wr, &bad_wr);
+ if (unlikely(ret)) {
+ ipoib_warn(priv, "post recv failed for buf %d (%d)\n", id, ret);
+ ipoib_dma_unmap_rx(priv, rx_req);
+ m_freem(rx->rx_ring[id].mb);
+ rx->rx_ring[id].mb = NULL;
+ }
+
+ return ret;
+}
+
+static struct mbuf *
+ipoib_cm_alloc_rx_mb(struct ipoib_dev_priv *priv, struct ipoib_cm_rx_buf *rx_req)
+{
+ return ipoib_alloc_map_mb(priv, (struct ipoib_rx_buf *)rx_req,
+ priv->cm.max_cm_mtu);
+}
+
+static void ipoib_cm_free_rx_ring(struct ipoib_dev_priv *priv,
+ struct ipoib_cm_rx_buf *rx_ring)
+{
+ int i;
+
+ for (i = 0; i < ipoib_recvq_size; ++i)
+ if (rx_ring[i].mb) {
+ ipoib_cm_dma_unmap_rx(priv, &rx_ring[i]);
+ m_freem(rx_ring[i].mb);
+ }
+
+ kfree(rx_ring);
+}
+
+static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv)
+{
+ struct ib_send_wr *bad_wr;
+ struct ipoib_cm_rx *p;
+
+ /* We only reserved 1 extra slot in CQ for drain WRs, so
+ * make sure we have at most 1 outstanding WR. */
+ if (list_empty(&priv->cm.rx_flush_list) ||
+ !list_empty(&priv->cm.rx_drain_list))
+ return;
+
+ /*
+ * QPs on flush list are error state. This way, a "flush
+ * error" WC will be immediately generated for each WR we post.
+ */
+ p = list_entry(priv->cm.rx_flush_list.next, typeof(*p), list);
+ if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, &bad_wr))
+ ipoib_warn(priv, "failed to post drain wr\n");
+
+ list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_drain_list);
+}
+
+static void ipoib_cm_rx_event_handler(struct ib_event *event, void *ctx)
+{
+ struct ipoib_cm_rx *p = ctx;
+ struct ipoib_dev_priv *priv = p->priv;
+ unsigned long flags;
+
+ if (event->event != IB_EVENT_QP_LAST_WQE_REACHED)
+ return;
+
+ spin_lock_irqsave(&priv->lock, flags);
+ list_move(&p->list, &priv->cm.rx_flush_list);
+ p->state = IPOIB_CM_RX_FLUSH;
+ ipoib_cm_start_rx_drain(priv);
+ spin_unlock_irqrestore(&priv->lock, flags);
+}
+
+static struct ib_qp *ipoib_cm_create_rx_qp(struct ipoib_dev_priv *priv,
+ struct ipoib_cm_rx *p)
+{
+ struct ib_qp_init_attr attr = {
+ .event_handler = ipoib_cm_rx_event_handler,
+ .send_cq = priv->recv_cq, /* For drain WR */
+ .recv_cq = priv->recv_cq,
+ .srq = priv->cm.srq,
+ .cap.max_send_wr = 1, /* For drain WR */
+ .cap.max_send_sge = 1,
+ .sq_sig_type = IB_SIGNAL_ALL_WR,
+ .qp_type = IB_QPT_RC,
+ .qp_context = p,
+ };
+
+ if (!ipoib_cm_has_srq(priv)) {
+ attr.cap.max_recv_wr = ipoib_recvq_size;
+ attr.cap.max_recv_sge = priv->cm.num_frags;
+ }
+
+ return ib_create_qp(priv->pd, &attr);
+}
+
+static int ipoib_cm_modify_rx_qp(struct ipoib_dev_priv *priv,
+ struct ib_cm_id *cm_id, struct ib_qp *qp,
+ unsigned psn)
+{
+ struct ib_qp_attr qp_attr;
+ int qp_attr_mask, ret;
+
+ qp_attr.qp_state = IB_QPS_INIT;
+ ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to init QP attr for INIT: %d\n", ret);
+ return ret;
+ }
+ ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to modify QP to INIT: %d\n", ret);
+ return ret;
+ }
+ qp_attr.qp_state = IB_QPS_RTR;
+ ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret);
+ return ret;
+ }
+ qp_attr.rq_psn = psn;
+ ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
+ return ret;
+ }
+
+ /*
+ * Current Mellanox HCA firmware won't generate completions
+ * with error for drain WRs unless the QP has been moved to
+ * RTS first. This work-around leaves a window where a QP has
+ * moved to error asynchronously, but this will eventually get
+ * fixed in firmware, so let's not error out if modify QP
+ * fails.
+ */
+ qp_attr.qp_state = IB_QPS_RTS;
+ ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret);
+ return 0;
+ }
+ ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret);
+ return 0;
+ }
+
+ return 0;
+}
+
+static void ipoib_cm_init_rx_wr(struct ipoib_dev_priv *priv,
+ struct ib_recv_wr *wr,
+ struct ib_sge *sge)
+{
+ int i;
+
+ for (i = 0; i < IPOIB_CM_RX_SG; i++)
+ sge[i].lkey = priv->mr->lkey;
+
+ wr->next = NULL;
+ wr->sg_list = sge;
+ wr->num_sge = 1;
+}
+
+static int ipoib_cm_nonsrq_init_rx(struct ipoib_dev_priv *priv,
+ struct ib_cm_id *cm_id, struct ipoib_cm_rx *rx)
+{
+ struct {
+ struct ib_recv_wr wr;
+ struct ib_sge sge[IPOIB_CM_RX_SG];
+ } *t;
+ int ret;
+ int i;
+
+ rx->rx_ring = kzalloc(ipoib_recvq_size * sizeof *rx->rx_ring, GFP_KERNEL);
+ if (!rx->rx_ring) {
+ printk(KERN_WARNING "%s: failed to allocate CM non-SRQ ring (%d entries)\n",
+ priv->ca->name, ipoib_recvq_size);
+ return -ENOMEM;
+ }
+
+ memset(rx->rx_ring, 0, ipoib_recvq_size * sizeof *rx->rx_ring);
+
+ t = kmalloc(sizeof *t, GFP_KERNEL);
+ if (!t) {
+ ret = -ENOMEM;
+ goto err_free;
+ }
+
+ ipoib_cm_init_rx_wr(priv, &t->wr, t->sge);
+
+ spin_lock_irq(&priv->lock);
+
+ if (priv->cm.nonsrq_conn_qp >= ipoib_max_conn_qp) {
+ spin_unlock_irq(&priv->lock);
+ ib_send_cm_rej(cm_id, IB_CM_REJ_NO_QP, NULL, 0, NULL, 0);
+ ret = -EINVAL;
+ goto err_free;
+ } else
+ ++priv->cm.nonsrq_conn_qp;
+
+ spin_unlock_irq(&priv->lock);
+
+ for (i = 0; i < ipoib_recvq_size; ++i) {
+ if (!ipoib_cm_alloc_rx_mb(priv, &rx->rx_ring[i])) {
+ ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
+ ret = -ENOMEM;
+ goto err_count;
+ }
+ ret = ipoib_cm_post_receive_nonsrq(priv, rx, &t->wr, t->sge, i);
+ if (ret) {
+ ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq "
+ "failed for buf %d\n", i);
+ ret = -EIO;
+ goto err_count;
+ }
+ }
+
+ rx->recv_count = ipoib_recvq_size;
+
+ kfree(t);
+
+ return 0;
+
+err_count:
+ spin_lock_irq(&priv->lock);
+ --priv->cm.nonsrq_conn_qp;
+ spin_unlock_irq(&priv->lock);
+
+err_free:
+ kfree(t);
+ ipoib_cm_free_rx_ring(priv, rx->rx_ring);
+
+ return ret;
+}
+
+static int ipoib_cm_send_rep(struct ipoib_dev_priv *priv, struct ib_cm_id *cm_id,
+ struct ib_qp *qp, struct ib_cm_req_event_param *req,
+ unsigned psn)
+{
+ struct ipoib_cm_data data = {};
+ struct ib_cm_rep_param rep = {};
+
+ data.qpn = cpu_to_be32(priv->qp->qp_num);
+ data.mtu = cpu_to_be32(priv->cm.max_cm_mtu);
+
+ rep.private_data = &data;
+ rep.private_data_len = sizeof data;
+ rep.flow_control = 0;
+ rep.rnr_retry_count = req->rnr_retry_count;
+ rep.srq = ipoib_cm_has_srq(priv);
+ rep.qp_num = qp->qp_num;
+ rep.starting_psn = psn;
+ return ib_send_cm_rep(cm_id, &rep);
+}
+
+static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
+{
+ struct ipoib_dev_priv *priv = cm_id->context;
+ struct ipoib_cm_rx *p;
+ unsigned psn;
+ int ret;
+
+ ipoib_dbg(priv, "REQ arrived\n");
+ p = kzalloc(sizeof *p, GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+ p->priv = priv;
+ p->id = cm_id;
+ cm_id->context = p;
+ p->state = IPOIB_CM_RX_LIVE;
+ p->jiffies = jiffies;
+ INIT_LIST_HEAD(&p->list);
+
+ p->qp = ipoib_cm_create_rx_qp(priv, p);
+ if (IS_ERR(p->qp)) {
+ ret = PTR_ERR(p->qp);
+ goto err_qp;
+ }
+
+ psn = random() & 0xffffff;
+ ret = ipoib_cm_modify_rx_qp(priv, cm_id, p->qp, psn);
+ if (ret)
+ goto err_modify;
+
+ if (!ipoib_cm_has_srq(priv)) {
+ ret = ipoib_cm_nonsrq_init_rx(priv, cm_id, p);
+ if (ret)
+ goto err_modify;
+ }
+
+ spin_lock_irq(&priv->lock);
+ queue_delayed_work(ipoib_workqueue,
+ &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
+ /* Add this entry to passive ids list head, but do not re-add it
+ * if IB_EVENT_QP_LAST_WQE_REACHED has moved it to flush list. */
+ p->jiffies = jiffies;
+ if (p->state == IPOIB_CM_RX_LIVE)
+ list_move(&p->list, &priv->cm.passive_ids);
+ spin_unlock_irq(&priv->lock);
+
+ ret = ipoib_cm_send_rep(priv, cm_id, p->qp, &event->param.req_rcvd, psn);
+ if (ret) {
+ ipoib_warn(priv, "failed to send REP: %d\n", ret);
+ if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE))
+ ipoib_warn(priv, "unable to move qp to error state\n");
+ }
+ return 0;
+
+err_modify:
+ ib_destroy_qp(p->qp);
+err_qp:
+ kfree(p);
+ return ret;
+}
+
+static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id,
+ struct ib_cm_event *event)
+{
+ struct ipoib_cm_rx *p;
+ struct ipoib_dev_priv *priv;
+
+ switch (event->event) {
+ case IB_CM_REQ_RECEIVED:
+ return ipoib_cm_req_handler(cm_id, event);
+ case IB_CM_DREQ_RECEIVED:
+ p = cm_id->context;
+ ib_send_cm_drep(cm_id, NULL, 0);
+ /* Fall through */
+ case IB_CM_REJ_RECEIVED:
+ p = cm_id->context;
+ priv = p->priv;
+ if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE))
+ ipoib_warn(priv, "unable to move qp to error state\n");
+ /* Fall through */
+ default:
+ return 0;
+ }
+}
+
+void ipoib_cm_handle_rx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc)
+{
+ struct ipoib_cm_rx_buf saverx;
+ struct ipoib_cm_rx_buf *rx_ring;
+ unsigned int wr_id = wc->wr_id & ~(IPOIB_OP_CM | IPOIB_OP_RECV);
+ struct ifnet *dev = priv->dev;
+ struct mbuf *mb, *newmb;
+ struct ipoib_cm_rx *p;
+ int has_srq;
+ u_short proto;
+
+ ipoib_dbg_data(priv, "cm recv completion: id %d, status: %d\n",
+ wr_id, wc->status);
+
+ if (unlikely(wr_id >= ipoib_recvq_size)) {
+ if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~(IPOIB_OP_CM | IPOIB_OP_RECV))) {
+ spin_lock(&priv->lock);
+ list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list);
+ ipoib_cm_start_rx_drain(priv);
+ if (priv->cm.id != NULL)
+ queue_work(ipoib_workqueue,
+ &priv->cm.rx_reap_task);
+ spin_unlock(&priv->lock);
+ } else
+ ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
+ wr_id, ipoib_recvq_size);
+ return;
+ }
+
+ p = wc->qp->qp_context;
+
+ has_srq = ipoib_cm_has_srq(priv);
+ rx_ring = has_srq ? priv->cm.srq_ring : p->rx_ring;
+
+ mb = rx_ring[wr_id].mb;
+
+ if (unlikely(wc->status != IB_WC_SUCCESS)) {
+ ipoib_dbg(priv, "cm recv error "
+ "(status=%d, wrid=%d vend_err %x)\n",
+ wc->status, wr_id, wc->vendor_err);
+ ++dev->if_ierrors;
+ if (has_srq)
+ goto repost;
+ else {
+ if (!--p->recv_count) {
+ spin_lock(&priv->lock);
+ list_move(&p->list, &priv->cm.rx_reap_list);
+ queue_work(ipoib_workqueue, &priv->cm.rx_reap_task);
+ spin_unlock(&priv->lock);
+ }
+ return;
+ }
+ }
+
+ if (unlikely(!(wr_id & IPOIB_CM_RX_UPDATE_MASK))) {
+ if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) {
+ p->jiffies = jiffies;
+ /* Move this entry to list head, but do not re-add it
+ * if it has been moved out of list. */
+ if (p->state == IPOIB_CM_RX_LIVE)
+ list_move(&p->list, &priv->cm.passive_ids);
+ }
+ }
+
+ memcpy(&saverx, &rx_ring[wr_id], sizeof(saverx));
+ newmb = ipoib_cm_alloc_rx_mb(priv, &rx_ring[wr_id]);
+ if (unlikely(!newmb)) {
+ /*
+ * If we can't allocate a new RX buffer, dump
+ * this packet and reuse the old buffer.
+ */
+ ipoib_dbg(priv, "failed to allocate receive buffer %d\n", wr_id);
+ ++dev->if_ierrors;
+ memcpy(&rx_ring[wr_id], &saverx, sizeof(saverx));
+ goto repost;
+ }
+
+ ipoib_cm_dma_unmap_rx(priv, &saverx);
+
+ ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
+ wc->byte_len, wc->slid);
+
+ ipoib_dma_mb(priv, mb, wc->byte_len);
+
+ ++dev->if_opackets;
+ dev->if_obytes += mb->m_pkthdr.len;
+
+ mb->m_pkthdr.rcvif = dev;
+ proto = *mtod(mb, uint16_t *);
+ m_adj(mb, IPOIB_ENCAP_LEN);
+
+ IPOIB_MTAP_PROTO(dev, mb, proto);
+ ipoib_demux(dev, mb, ntohs(proto));
+
+repost:
+ if (has_srq) {
+ if (unlikely(ipoib_cm_post_receive_srq(priv, wr_id)))
+ ipoib_warn(priv, "ipoib_cm_post_receive_srq failed "
+ "for buf %d\n", wr_id);
+ } else {
+ if (unlikely(ipoib_cm_post_receive_nonsrq(priv, p,
+ &priv->cm.rx_wr,
+ priv->cm.rx_sge,
+ wr_id))) {
+ --p->recv_count;
+ ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq failed "
+ "for buf %d\n", wr_id);
+ }
+ }
+}
+
+static inline int post_send(struct ipoib_dev_priv *priv,
+ struct ipoib_cm_tx *tx,
+ struct ipoib_cm_tx_buf *tx_req,
+ unsigned int wr_id)
+{
+ struct ib_send_wr *bad_wr;
+ struct mbuf *mb = tx_req->mb;
+ u64 *mapping = tx_req->mapping;
+ struct mbuf *m;
+ int i;
+
+ for (m = mb, i = 0; m != NULL; m = m->m_next, i++) {
+ priv->tx_sge[i].addr = mapping[i];
+ priv->tx_sge[i].length = m->m_len;
+ }
+ priv->tx_wr.num_sge = i;
+ priv->tx_wr.wr_id = wr_id | IPOIB_OP_CM;
+ priv->tx_wr.opcode = IB_WR_SEND;
+
+ return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
+}
+
+void ipoib_cm_send(struct ipoib_dev_priv *priv, struct mbuf *mb, struct ipoib_cm_tx *tx)
+{
+ struct ipoib_cm_tx_buf *tx_req;
+ struct ifnet *dev = priv->dev;
+
+ if (unlikely(priv->tx_outstanding > MAX_SEND_CQE))
+ while (ipoib_poll_tx(priv)); /* nothing */
+
+ m_adj(mb, sizeof(struct ipoib_pseudoheader));
+ if (unlikely(mb->m_pkthdr.len > tx->mtu)) {
+ ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
+ mb->m_pkthdr.len, tx->mtu);
+ ++dev->if_oerrors;
+ ipoib_cm_mb_too_long(priv, mb, IPOIB_CM_MTU(tx->mtu));
+ return;
+ }
+
+ ipoib_dbg_data(priv, "sending packet: head 0x%x length %d connection 0x%x\n",
+ tx->tx_head, mb->m_pkthdr.len, tx->qp->qp_num);
+
+
+ /*
+ * We put the mb into the tx_ring _before_ we call post_send()
+ * because it's entirely possible that the completion handler will
+ * run before we execute anything after the post_send(). That
+ * means we have to make sure everything is properly recorded and
+ * our state is consistent before we call post_send().
+ */
+ tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)];
+ tx_req->mb = mb;
+ if (unlikely(ipoib_dma_map_tx(priv->ca, (struct ipoib_tx_buf *)tx_req,
+ priv->cm.num_frags))) {
+ ++dev->if_oerrors;
+ if (tx_req->mb)
+ m_freem(tx_req->mb);
+ return;
+ }
+
+ if (unlikely(post_send(priv, tx, tx_req, tx->tx_head & (ipoib_sendq_size - 1)))) {
+ ipoib_warn(priv, "post_send failed\n");
+ ++dev->if_oerrors;
+ ipoib_dma_unmap_tx(priv->ca, (struct ipoib_tx_buf *)tx_req);
+ m_freem(mb);
+ } else {
+ ++tx->tx_head;
+
+ if (++priv->tx_outstanding == ipoib_sendq_size) {
+ ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n",
+ tx->qp->qp_num);
+ if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP))
+ ipoib_warn(priv, "request notify on send CQ failed\n");
+ dev->if_drv_flags |= IFF_DRV_OACTIVE;
+ }
+ }
+
+}
+
+void ipoib_cm_handle_tx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc)
+{
+ struct ipoib_cm_tx *tx = wc->qp->qp_context;
+ unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM;
+ struct ifnet *dev = priv->dev;
+ struct ipoib_cm_tx_buf *tx_req;
+
+ ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n",
+ wr_id, wc->status);
+
+ if (unlikely(wr_id >= ipoib_sendq_size)) {
+ ipoib_warn(priv, "cm send completion event with wrid %d (> %d)\n",
+ wr_id, ipoib_sendq_size);
+ return;
+ }
+
+ tx_req = &tx->tx_ring[wr_id];
+
+ ipoib_dma_unmap_tx(priv->ca, (struct ipoib_tx_buf *)tx_req);
+
+ /* FIXME: is this right? Shouldn't we only increment on success? */
+ ++dev->if_opackets;
+ dev->if_obytes += tx_req->mb->m_pkthdr.len;
+
+ m_freem(tx_req->mb);
+
+ ++tx->tx_tail;
+ if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
+ (dev->if_drv_flags & IFF_DRV_OACTIVE) != 0 &&
+ test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
+ dev->if_drv_flags &= ~IFF_DRV_OACTIVE;
+
+ if (wc->status != IB_WC_SUCCESS &&
+ wc->status != IB_WC_WR_FLUSH_ERR) {
+ struct ipoib_path *path;
+
+ ipoib_dbg(priv, "failed cm send event "
+ "(status=%d, wrid=%d vend_err %x)\n",
+ wc->status, wr_id, wc->vendor_err);
+
+ path = tx->path;
+
+ if (path) {
+ path->cm = NULL;
+ rb_erase(&path->rb_node, &priv->path_tree);
+ list_del(&path->list);
+ }
+
+ if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
+ list_move(&tx->list, &priv->cm.reap_list);
+ queue_work(ipoib_workqueue, &priv->cm.reap_task);
+ }
+
+ clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags);
+ }
+
+}
+
+int ipoib_cm_dev_open(struct ipoib_dev_priv *priv)
+{
+ int ret;
+
+ if (!IPOIB_CM_SUPPORTED(IF_LLADDR(priv->dev)))
+ return 0;
+
+ priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, priv);
+ if (IS_ERR(priv->cm.id)) {
+ printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name);
+ ret = PTR_ERR(priv->cm.id);
+ goto err_cm;
+ }
+
+ ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num),
+ 0, NULL);
+ if (ret) {
+ printk(KERN_WARNING "%s: failed to listen on ID 0x%llx\n", priv->ca->name,
+ IPOIB_CM_IETF_ID | priv->qp->qp_num);
+ goto err_listen;
+ }
+
+ return 0;
+
+err_listen:
+ ib_destroy_cm_id(priv->cm.id);
+err_cm:
+ priv->cm.id = NULL;
+ return ret;
+}
+
+static void ipoib_cm_free_rx_reap_list(struct ipoib_dev_priv *priv)
+{
+ struct ipoib_cm_rx *rx, *n;
+ LIST_HEAD(list);
+
+ spin_lock_irq(&priv->lock);
+ list_splice_init(&priv->cm.rx_reap_list, &list);
+ spin_unlock_irq(&priv->lock);
+
+ list_for_each_entry_safe(rx, n, &list, list) {
+ ib_destroy_cm_id(rx->id);
+ ib_destroy_qp(rx->qp);
+ if (!ipoib_cm_has_srq(priv)) {
+ ipoib_cm_free_rx_ring(priv, rx->rx_ring);
+ spin_lock_irq(&priv->lock);
+ --priv->cm.nonsrq_conn_qp;
+ spin_unlock_irq(&priv->lock);
+ }
+ kfree(rx);
+ }
+}
+
+void ipoib_cm_dev_stop(struct ipoib_dev_priv *priv)
+{
+ struct ipoib_cm_rx *p;
+ unsigned long begin;
+ int ret;
+
+ if (!IPOIB_CM_SUPPORTED(IF_LLADDR(priv->dev)) || !priv->cm.id)
+ return;
+
+ ib_destroy_cm_id(priv->cm.id);
+ priv->cm.id = NULL;
+
+ cancel_work_sync(&priv->cm.rx_reap_task);
+
+ spin_lock_irq(&priv->lock);
+ while (!list_empty(&priv->cm.passive_ids)) {
+ p = list_entry(priv->cm.passive_ids.next, typeof(*p), list);
+ list_move(&p->list, &priv->cm.rx_error_list);
+ p->state = IPOIB_CM_RX_ERROR;
+ spin_unlock_irq(&priv->lock);
+ ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
+ if (ret)
+ ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
+ spin_lock_irq(&priv->lock);
+ }
+
+ /* Wait for all RX to be drained */
+ begin = jiffies;
+
+ while (!list_empty(&priv->cm.rx_error_list) ||
+ !list_empty(&priv->cm.rx_flush_list) ||
+ !list_empty(&priv->cm.rx_drain_list)) {
+ if (time_after(jiffies, begin + 5 * HZ)) {
+ ipoib_warn(priv, "RX drain timing out\n");
+
+ /*
+ * assume the HW is wedged and just free up everything.
+ */
+ list_splice_init(&priv->cm.rx_flush_list,
+ &priv->cm.rx_reap_list);
+ list_splice_init(&priv->cm.rx_error_list,
+ &priv->cm.rx_reap_list);
+ list_splice_init(&priv->cm.rx_drain_list,
+ &priv->cm.rx_reap_list);
+ break;
+ }
+ spin_unlock_irq(&priv->lock);
+ msleep(1);
+ ipoib_drain_cq(priv);
+ spin_lock_irq(&priv->lock);
+ }
+
+ spin_unlock_irq(&priv->lock);
+
+ ipoib_cm_free_rx_reap_list(priv);
+
+ cancel_delayed_work(&priv->cm.stale_task);
+}
+
+static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
+{
+ struct ipoib_cm_tx *p = cm_id->context;
+ struct ipoib_dev_priv *priv = p->priv;
+ struct ipoib_cm_data *data = event->private_data;
+ struct ifqueue mbqueue;
+ struct ib_qp_attr qp_attr;
+ int qp_attr_mask, ret;
+ struct mbuf *mb;
+
+ ipoib_dbg(priv, "cm rep handler\n");
+ p->mtu = be32_to_cpu(data->mtu);
+
+ if (p->mtu <= IPOIB_ENCAP_LEN) {
+ ipoib_warn(priv, "Rejecting connection: mtu %d <= %d\n",
+ p->mtu, IPOIB_ENCAP_LEN);
+ return -EINVAL;
+ }
+
+ qp_attr.qp_state = IB_QPS_RTR;
+ ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret);
+ return ret;
+ }
+
+ qp_attr.rq_psn = 0 /* FIXME */;
+ ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
+ return ret;
+ }
+
+ qp_attr.qp_state = IB_QPS_RTS;
+ ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret);
+ return ret;
+ }
+ ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret);
+ return ret;
+ }
+
+ bzero(&mbqueue, sizeof(mbqueue));
+
+ spin_lock_irq(&priv->lock);
+ set_bit(IPOIB_FLAG_OPER_UP, &p->flags);
+ if (p->path)
+ for (;;) {
+ _IF_DEQUEUE(&p->path->queue, mb);
+ if (mb == NULL)
+ break;
+ _IF_ENQUEUE(&mbqueue, mb);
+ }
+ spin_unlock_irq(&priv->lock);
+
+ for (;;) {
+ struct ifnet *dev = p->priv->dev;
+ _IF_DEQUEUE(&mbqueue, mb);
+ if (mb == NULL)
+ break;
+ mb->m_pkthdr.rcvif = dev;
+ if (dev->if_transmit(dev, mb))
+ ipoib_warn(priv, "dev_queue_xmit failed "
+ "to requeue packet\n");
+ }
+
+ ret = ib_send_cm_rtu(cm_id, NULL, 0);
+ if (ret) {
+ ipoib_warn(priv, "failed to send RTU: %d\n", ret);
+ return ret;
+ }
+ return 0;
+}
+
+static struct ib_qp *ipoib_cm_create_tx_qp(struct ipoib_dev_priv *priv,
+ struct ipoib_cm_tx *tx)
+{
+ struct ib_qp_init_attr attr = {
+ .send_cq = priv->send_cq,
+ .recv_cq = priv->recv_cq,
+ .srq = priv->cm.srq,
+ .cap.max_send_wr = ipoib_sendq_size,
+ .cap.max_send_sge = priv->cm.num_frags,
+ .sq_sig_type = IB_SIGNAL_ALL_WR,
+ .qp_type = IB_QPT_RC,
+ .qp_context = tx
+ };
+
+ return ib_create_qp(priv->pd, &attr);
+}
+
+static int ipoib_cm_send_req(struct ipoib_dev_priv *priv,
+ struct ib_cm_id *id, struct ib_qp *qp,
+ u32 qpn,
+ struct ib_sa_path_rec *pathrec)
+{
+ struct ipoib_cm_data data = {};
+ struct ib_cm_req_param req = {};
+
+ ipoib_dbg(priv, "cm send req\n");
+
+ data.qpn = cpu_to_be32(priv->qp->qp_num);
+ data.mtu = cpu_to_be32(priv->cm.max_cm_mtu);
+
+ req.primary_path = pathrec;
+ req.alternate_path = NULL;
+ req.service_id = cpu_to_be64(IPOIB_CM_IETF_ID | qpn);
+ req.qp_num = qp->qp_num;
+ req.qp_type = qp->qp_type;
+ req.private_data = &data;
+ req.private_data_len = sizeof data;
+ req.flow_control = 0;
+
+ req.starting_psn = 0; /* FIXME */
+
+ /*
+ * Pick some arbitrary defaults here; we could make these
+ * module parameters if anyone cared about setting them.
+ */
+ req.responder_resources = 4;
+ req.remote_cm_response_timeout = 20;
+ req.local_cm_response_timeout = 20;
+ req.retry_count = 0; /* RFC draft warns against retries */
+ req.rnr_retry_count = 0; /* RFC draft warns against retries */
+ req.max_cm_retries = 15;
+ req.srq = ipoib_cm_has_srq(priv);
+ return ib_send_cm_req(id, &req);
+}
+
+static int ipoib_cm_modify_tx_init(struct ipoib_dev_priv *priv,
+ struct ib_cm_id *cm_id, struct ib_qp *qp)
+{
+ struct ib_qp_attr qp_attr;
+ int qp_attr_mask, ret;
+ ret = ib_find_pkey(priv->ca, priv->port, priv->pkey, &qp_attr.pkey_index);
+ if (ret) {
+ ipoib_warn(priv, "pkey 0x%x not found: %d\n", priv->pkey, ret);
+ return ret;
+ }
+
+ qp_attr.qp_state = IB_QPS_INIT;
+ qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE;
+ qp_attr.port_num = priv->port;
+ qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT;
+
+ ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to modify tx QP to INIT: %d\n", ret);
+ return ret;
+ }
+ return 0;
+}
+
+static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn,
+ struct ib_sa_path_rec *pathrec)
+{
+ struct ipoib_dev_priv *priv = p->priv;
+ int ret;
+
+ p->tx_ring = kzalloc(ipoib_sendq_size * sizeof *p->tx_ring, GFP_KERNEL);
+ if (!p->tx_ring) {
+ ipoib_warn(priv, "failed to allocate tx ring\n");
+ ret = -ENOMEM;
+ goto err_tx;
+ }
+ memset(p->tx_ring, 0, ipoib_sendq_size * sizeof *p->tx_ring);
+
+ p->qp = ipoib_cm_create_tx_qp(p->priv, p);
+ if (IS_ERR(p->qp)) {
+ ret = PTR_ERR(p->qp);
+ ipoib_warn(priv, "failed to allocate tx qp: %d\n", ret);
+ goto err_qp;
+ }
+
+ p->id = ib_create_cm_id(priv->ca, ipoib_cm_tx_handler, p);
+ if (IS_ERR(p->id)) {
+ ret = PTR_ERR(p->id);
+ ipoib_warn(priv, "failed to create tx cm id: %d\n", ret);
+ goto err_id;
+ }
+
+ ret = ipoib_cm_modify_tx_init(p->priv, p->id, p->qp);
+ if (ret) {
+ ipoib_warn(priv, "failed to modify tx qp to rtr: %d\n", ret);
+ goto err_modify;
+ }
+
+ ret = ipoib_cm_send_req(p->priv, p->id, p->qp, qpn, pathrec);
+ if (ret) {
+ ipoib_warn(priv, "failed to send cm req: %d\n", ret);
+ goto err_send_cm;
+ }
+
+ ipoib_dbg(priv, "Request connection 0x%x for gid %pI6 qpn 0x%x\n",
+ p->qp->qp_num, pathrec->dgid.raw, qpn);
+
+ return 0;
+
+err_send_cm:
+err_modify:
+ ib_destroy_cm_id(p->id);
+err_id:
+ p->id = NULL;
+ ib_destroy_qp(p->qp);
+err_qp:
+ p->qp = NULL;
+ kfree(p->tx_ring);
+err_tx:
+ return ret;
+}
+
+static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
+{
+ struct ipoib_dev_priv *priv = p->priv;
+ struct ifnet *dev = priv->dev;
+ struct ipoib_cm_tx_buf *tx_req;
+ unsigned long begin;
+
+ ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n",
+ p->qp ? p->qp->qp_num : 0, p->tx_head, p->tx_tail);
+
+ if (p->path)
+ ipoib_path_free(priv, p->path);
+
+ if (p->id)
+ ib_destroy_cm_id(p->id);
+
+ if (p->tx_ring) {
+ /* Wait for all sends to complete */
+ begin = jiffies;
+ while ((int) p->tx_tail - (int) p->tx_head < 0) {
+ if (time_after(jiffies, begin + 5 * HZ)) {
+ ipoib_warn(priv, "timing out; %d sends not completed\n",
+ p->tx_head - p->tx_tail);
+ goto timeout;
+ }
+
+ msleep(1);
+ }
+ }
+
+timeout:
+
+ while ((int) p->tx_tail - (int) p->tx_head < 0) {
+ tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
+ ipoib_dma_unmap_tx(priv->ca, (struct ipoib_tx_buf *)tx_req);
+ m_freem(tx_req->mb);
+ ++p->tx_tail;
+ if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
+ (dev->if_drv_flags & IFF_DRV_OACTIVE) != 0 &&
+ test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
+ dev->if_drv_flags &= ~IFF_DRV_OACTIVE;
+ }
+
+ if (p->qp)
+ ib_destroy_qp(p->qp);
+
+ kfree(p->tx_ring);
+ kfree(p);
+}
+
+static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
+ struct ib_cm_event *event)
+{
+ struct ipoib_cm_tx *tx = cm_id->context;
+ struct ipoib_dev_priv *priv = tx->priv;
+ struct ipoib_path *path;
+ unsigned long flags;
+ int ret;
+
+ switch (event->event) {
+ case IB_CM_DREQ_RECEIVED:
+ ipoib_dbg(priv, "DREQ received.\n");
+ ib_send_cm_drep(cm_id, NULL, 0);
+ break;
+ case IB_CM_REP_RECEIVED:
+ ipoib_dbg(priv, "REP received.\n");
+ ret = ipoib_cm_rep_handler(cm_id, event);
+ if (ret)
+ ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED,
+ NULL, 0, NULL, 0);
+ break;
+ case IB_CM_REQ_ERROR:
+ case IB_CM_REJ_RECEIVED:
+ case IB_CM_TIMEWAIT_EXIT:
+ ipoib_dbg(priv, "CM error %d.\n", event->event);
+ spin_lock_irqsave(&priv->lock, flags);
+ path = tx->path;
+
+ if (path) {
+ path->cm = NULL;
+ tx->path = NULL;
+ rb_erase(&path->rb_node, &priv->path_tree);
+ list_del(&path->list);
+ }
+
+ if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
+ list_move(&tx->list, &priv->cm.reap_list);
+ queue_work(ipoib_workqueue, &priv->cm.reap_task);
+ }
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+ if (path)
+ ipoib_path_free(tx->priv, path);
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+struct ipoib_cm_tx *ipoib_cm_create_tx(struct ipoib_dev_priv *priv,
+ struct ipoib_path *path)
+{
+ struct ipoib_cm_tx *tx;
+
+ tx = kzalloc(sizeof *tx, GFP_ATOMIC);
+ if (!tx)
+ return NULL;
+
+ ipoib_dbg(priv, "Creating cm tx\n");
+ path->cm = tx;
+ tx->path = path;
+ tx->priv = priv;
+ list_add(&tx->list, &priv->cm.start_list);
+ set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags);
+ queue_work(ipoib_workqueue, &priv->cm.start_task);
+ return tx;
+}
+
+void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx)
+{
+ struct ipoib_dev_priv *priv = tx->priv;
+ if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
+ spin_lock(&priv->lock);
+ list_move(&tx->list, &priv->cm.reap_list);
+ spin_unlock(&priv->lock);
+ queue_work(ipoib_workqueue, &priv->cm.reap_task);
+ ipoib_dbg(priv, "Reap connection for gid %pI6\n",
+ tx->path->pathrec.dgid.raw);
+ tx->path = NULL;
+ }
+}
+
+static void ipoib_cm_tx_start(struct work_struct *work)
+{
+ struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
+ cm.start_task);
+ struct ipoib_path *path;
+ struct ipoib_cm_tx *p;
+ unsigned long flags;
+ int ret;
+
+ struct ib_sa_path_rec pathrec;
+ u32 qpn;
+
+ ipoib_dbg(priv, "cm start task\n");
+ spin_lock_irqsave(&priv->lock, flags);
+
+ while (!list_empty(&priv->cm.start_list)) {
+ p = list_entry(priv->cm.start_list.next, typeof(*p), list);
+ list_del_init(&p->list);
+ path = p->path;
+ qpn = IPOIB_QPN(path->hwaddr);
+ memcpy(&pathrec, &p->path->pathrec, sizeof pathrec);
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+
+ ret = ipoib_cm_tx_init(p, qpn, &pathrec);
+
+ spin_lock_irqsave(&priv->lock, flags);
+
+ if (ret) {
+ path = p->path;
+ if (path) {
+ path->cm = NULL;
+ rb_erase(&path->rb_node, &priv->path_tree);
+ list_del(&path->list);
+ ipoib_path_free(priv, path);
+ }
+ list_del(&p->list);
+ kfree(p);
+ }
+ }
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+}
+
+static void ipoib_cm_tx_reap(struct work_struct *work)
+{
+ struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
+ cm.reap_task);
+ struct ipoib_cm_tx *p;
+ unsigned long flags;
+
+ spin_lock_irqsave(&priv->lock, flags);
+
+ while (!list_empty(&priv->cm.reap_list)) {
+ p = list_entry(priv->cm.reap_list.next, typeof(*p), list);
+ list_del(&p->list);
+ spin_unlock_irqrestore(&priv->lock, flags);
+ ipoib_cm_tx_destroy(p);
+ spin_lock_irqsave(&priv->lock, flags);
+ }
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+}
+
+static void ipoib_cm_mb_reap(struct work_struct *work)
+{
+ struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
+ cm.mb_task);
+ struct mbuf *mb;
+ unsigned long flags;
+ unsigned mtu = priv->mcast_mtu;
+ uint16_t proto;
+
+ spin_lock_irqsave(&priv->lock, flags);
+
+ for (;;) {
+ IF_DEQUEUE(&priv->cm.mb_queue, mb);
+ if (mb == NULL)
+ break;
+ spin_unlock_irqrestore(&priv->lock, flags);
+
+ proto = htons(*mtod(mb, uint16_t *));
+ m_adj(mb, IPOIB_ENCAP_LEN);
+ if (proto == ETHERTYPE_IP)
+ icmp_error(mb, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0, mtu);
+#if defined(INET6)
+ else if (proto == ETHERTYPE_IPV6)
+ icmp6_error(mb, ICMP6_PACKET_TOO_BIG, 0, mtu);
+#endif
+ else
+ m_freem(mb);
+
+ spin_lock_irqsave(&priv->lock, flags);
+ }
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+}
+
+void
+ipoib_cm_mb_too_long(struct ipoib_dev_priv *priv, struct mbuf *mb, unsigned int mtu)
+{
+ int e = priv->cm.mb_queue.ifq_len;
+
+ IF_ENQUEUE(&priv->cm.mb_queue, mb);
+ if (e == 0)
+ queue_work(ipoib_workqueue, &priv->cm.mb_task);
+}
+
+static void ipoib_cm_rx_reap(struct work_struct *work)
+{
+ ipoib_cm_free_rx_reap_list(container_of(work, struct ipoib_dev_priv,
+ cm.rx_reap_task));
+}
+
+static void ipoib_cm_stale_task(struct work_struct *work)
+{
+ struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
+ cm.stale_task.work);
+ struct ipoib_cm_rx *p;
+ int ret;
+
+ spin_lock_irq(&priv->lock);
+ while (!list_empty(&priv->cm.passive_ids)) {
+ /* List is sorted by LRU, start from tail,
+ * stop when we see a recently used entry */
+ p = list_entry(priv->cm.passive_ids.prev, typeof(*p), list);
+ if (time_before_eq(jiffies, p->jiffies + IPOIB_CM_RX_TIMEOUT))
+ break;
+ list_move(&p->list, &priv->cm.rx_error_list);
+ p->state = IPOIB_CM_RX_ERROR;
+ spin_unlock_irq(&priv->lock);
+ ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
+ if (ret)
+ ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
+ spin_lock_irq(&priv->lock);
+ }
+
+ if (!list_empty(&priv->cm.passive_ids))
+ queue_delayed_work(ipoib_workqueue,
+ &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
+ spin_unlock_irq(&priv->lock);
+}
+
+
+static void ipoib_cm_create_srq(struct ipoib_dev_priv *priv, int max_sge)
+{
+ struct ib_srq_init_attr srq_init_attr = {
+ .attr = {
+ .max_wr = ipoib_recvq_size,
+ .max_sge = max_sge
+ }
+ };
+
+ priv->cm.srq = ib_create_srq(priv->pd, &srq_init_attr);
+ if (IS_ERR(priv->cm.srq)) {
+ if (PTR_ERR(priv->cm.srq) != -ENOSYS)
+ printk(KERN_WARNING "%s: failed to allocate SRQ, error %ld\n",
+ priv->ca->name, PTR_ERR(priv->cm.srq));
+ priv->cm.srq = NULL;
+ return;
+ }
+
+ priv->cm.srq_ring = kzalloc(ipoib_recvq_size * sizeof *priv->cm.srq_ring, GFP_KERNEL);
+ if (!priv->cm.srq_ring) {
+ printk(KERN_WARNING "%s: failed to allocate CM SRQ ring (%d entries)\n",
+ priv->ca->name, ipoib_recvq_size);
+ ib_destroy_srq(priv->cm.srq);
+ priv->cm.srq = NULL;
+ return;
+ }
+
+ memset(priv->cm.srq_ring, 0, ipoib_recvq_size * sizeof *priv->cm.srq_ring);
+}
+
+int ipoib_cm_dev_init(struct ipoib_dev_priv *priv)
+{
+ struct ifnet *dev = priv->dev;
+ int i, ret;
+ struct ib_device_attr attr;
+
+ INIT_LIST_HEAD(&priv->cm.passive_ids);
+ INIT_LIST_HEAD(&priv->cm.reap_list);
+ INIT_LIST_HEAD(&priv->cm.start_list);
+ INIT_LIST_HEAD(&priv->cm.rx_error_list);
+ INIT_LIST_HEAD(&priv->cm.rx_flush_list);
+ INIT_LIST_HEAD(&priv->cm.rx_drain_list);
+ INIT_LIST_HEAD(&priv->cm.rx_reap_list);
+ INIT_WORK(&priv->cm.start_task, ipoib_cm_tx_start);
+ INIT_WORK(&priv->cm.reap_task, ipoib_cm_tx_reap);
+ INIT_WORK(&priv->cm.mb_task, ipoib_cm_mb_reap);
+ INIT_WORK(&priv->cm.rx_reap_task, ipoib_cm_rx_reap);
+ INIT_DELAYED_WORK(&priv->cm.stale_task, ipoib_cm_stale_task);
+
+ bzero(&priv->cm.mb_queue, sizeof(priv->cm.mb_queue));
+ mtx_init(&priv->cm.mb_queue.ifq_mtx,
+ dev->if_xname, "if send queue", MTX_DEF);
+
+ ret = ib_query_device(priv->ca, &attr);
+ if (ret) {
+ printk(KERN_WARNING "ib_query_device() failed with %d\n", ret);
+ return ret;
+ }
+
+ ipoib_dbg(priv, "max_srq_sge=%d\n", attr.max_srq_sge);
+
+ attr.max_srq_sge = min_t(int, IPOIB_CM_RX_SG, attr.max_srq_sge);
+ ipoib_cm_create_srq(priv, attr.max_srq_sge);
+ if (ipoib_cm_has_srq(priv)) {
+ priv->cm.max_cm_mtu = attr.max_srq_sge * MJUMPAGESIZE;
+ priv->cm.num_frags = attr.max_srq_sge;
+ ipoib_dbg(priv, "max_cm_mtu = 0x%x, num_frags=%d\n",
+ priv->cm.max_cm_mtu, priv->cm.num_frags);
+ } else {
+ priv->cm.max_cm_mtu = IPOIB_CM_MAX_MTU;
+ priv->cm.num_frags = IPOIB_CM_RX_SG;
+ }
+
+ ipoib_cm_init_rx_wr(priv, &priv->cm.rx_wr, priv->cm.rx_sge);
+
+ if (ipoib_cm_has_srq(priv)) {
+ for (i = 0; i < ipoib_recvq_size; ++i) {
+ if (!ipoib_cm_alloc_rx_mb(priv, &priv->cm.srq_ring[i])) {
+ ipoib_warn(priv, "failed to allocate "
+ "receive buffer %d\n", i);
+ ipoib_cm_dev_cleanup(priv);
+ return -ENOMEM;
+ }
+
+ if (ipoib_cm_post_receive_srq(priv, i)) {
+ ipoib_warn(priv, "ipoib_cm_post_receive_srq "
+ "failed for buf %d\n", i);
+ ipoib_cm_dev_cleanup(priv);
+ return -EIO;
+ }
+ }
+ }
+
+ IF_LLADDR(priv->dev)[0] = IPOIB_FLAGS_RC;
+ return 0;
+}
+
+void ipoib_cm_dev_cleanup(struct ipoib_dev_priv *priv)
+{
+ int ret;
+
+ if (!priv->cm.srq)
+ return;
+
+ ipoib_dbg(priv, "Cleanup ipoib connected mode.\n");
+
+ ret = ib_destroy_srq(priv->cm.srq);
+ if (ret)
+ ipoib_warn(priv, "ib_destroy_srq failed: %d\n", ret);
+
+ priv->cm.srq = NULL;
+ if (!priv->cm.srq_ring)
+ return;
+
+ ipoib_cm_free_rx_ring(priv, priv->cm.srq_ring);
+ priv->cm.srq_ring = NULL;
+
+ mtx_destroy(&priv->cm.mb_queue.ifq_mtx);
+}
+
+#endif /* CONFIG_INFINIBAND_IPOIB_CM */
diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
new file mode 100644
index 0000000..ec52712
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/ethtool.h>
+#include <linux/netdevice.h>
+
+#include "ipoib.h"
+
+static void ipoib_get_drvinfo(struct ifnet *netdev,
+ struct ethtool_drvinfo *drvinfo)
+{
+ strncpy(drvinfo->driver, "ipoib", sizeof(drvinfo->driver) - 1);
+}
+
+static u32 ipoib_get_rx_csum(struct ifnet *dev)
+{
+ struct ipoib_dev_priv *priv = dev->if_softc;
+ return test_bit(IPOIB_FLAG_CSUM, &priv->flags) &&
+ !test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
+}
+
+static int ipoib_get_coalesce(struct ifnet *dev,
+ struct ethtool_coalesce *coal)
+{
+ struct ipoib_dev_priv *priv = dev->if_softc;
+
+ coal->rx_coalesce_usecs = priv->ethtool.coalesce_usecs;
+ coal->tx_coalesce_usecs = priv->ethtool.coalesce_usecs;
+ coal->rx_max_coalesced_frames = priv->ethtool.max_coalesced_frames;
+ coal->tx_max_coalesced_frames = priv->ethtool.max_coalesced_frames;
+
+ return 0;
+}
+
+static int ipoib_set_coalesce(struct ifnet *dev,
+ struct ethtool_coalesce *coal)
+{
+ struct ipoib_dev_priv *priv = dev->if_softc;
+ int ret;
+
+ /*
+ * Since IPoIB uses a single CQ for both rx and tx, we assume
+ * that rx params dictate the configuration. These values are
+ * saved in the private data and returned when ipoib_get_coalesce()
+ * is called.
+ */
+ if (coal->rx_coalesce_usecs > 0xffff ||
+ coal->rx_max_coalesced_frames > 0xffff)
+ return -EINVAL;
+
+ if (coal->rx_max_coalesced_frames | coal->rx_coalesce_usecs) {
+ if (!coal->rx_max_coalesced_frames)
+ coal->rx_max_coalesced_frames = 0xffff;
+ else if (!coal->rx_coalesce_usecs)
+ coal->rx_coalesce_usecs = 0xffff;
+ }
+
+ ret = ib_modify_cq(priv->recv_cq, coal->rx_max_coalesced_frames,
+ coal->rx_coalesce_usecs);
+ if (ret && ret != -ENOSYS) {
+ ipoib_warn(priv, "failed modifying CQ (%d)\n", ret);
+ return ret;
+ }
+
+ coal->tx_coalesce_usecs = coal->rx_coalesce_usecs;
+ coal->tx_max_coalesced_frames = coal->rx_max_coalesced_frames;
+ priv->ethtool.coalesce_usecs = coal->rx_coalesce_usecs;
+ priv->ethtool.max_coalesced_frames = coal->rx_max_coalesced_frames;
+
+ return 0;
+}
+
+static const char ipoib_stats_keys[][ETH_GSTRING_LEN] = {
+ "LRO aggregated", "LRO flushed",
+ "LRO avg aggr", "LRO no desc"
+};
+
+static void ipoib_get_strings(struct ifnet *netdev, u32 stringset, u8 *data)
+{
+ switch (stringset) {
+ case ETH_SS_STATS:
+ memcpy(data, *ipoib_stats_keys, sizeof(ipoib_stats_keys));
+ break;
+ }
+}
+
+static int ipoib_get_sset_count(struct ifnet *dev, int sset)
+{
+ switch (sset) {
+ case ETH_SS_STATS:
+ return ARRAY_SIZE(ipoib_stats_keys);
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+static void ipoib_get_ethtool_stats(struct ifnet *dev,
+ struct ethtool_stats *stats, uint64_t *data)
+{
+ struct ipoib_dev_priv *priv = dev->if_softc;
+ int index = 0;
+
+ /* Get LRO statistics */
+ data[index++] = priv->lro.lro_mgr.stats.aggregated;
+ data[index++] = priv->lro.lro_mgr.stats.flushed;
+ if (priv->lro.lro_mgr.stats.flushed)
+ data[index++] = priv->lro.lro_mgr.stats.aggregated /
+ priv->lro.lro_mgr.stats.flushed;
+ else
+ data[index++] = 0;
+ data[index++] = priv->lro.lro_mgr.stats.no_desc;
+}
+
+static const struct ethtool_ops ipoib_ethtool_ops = {
+ .get_drvinfo = ipoib_get_drvinfo,
+ .get_rx_csum = ipoib_get_rx_csum,
+ .get_coalesce = ipoib_get_coalesce,
+ .set_coalesce = ipoib_set_coalesce,
+ .get_flags = ethtool_op_get_flags,
+ .set_flags = ethtool_op_set_flags,
+ .get_strings = ipoib_get_strings,
+ .get_sset_count = ipoib_get_sset_count,
+ .get_ethtool_stats = ipoib_get_ethtool_stats,
+};
+
+void ipoib_set_ethtool_ops(struct ifnet *dev)
+{
+ SET_ETHTOOL_OPS(dev, &ipoib_ethtool_ops);
+}
diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_fs.c b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_fs.c
new file mode 100644
index 0000000..0f85f28
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_fs.c
@@ -0,0 +1,298 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/err.h>
+#include <linux/seq_file.h>
+
+struct file_operations;
+
+#include <linux/debugfs.h>
+
+#include "ipoib.h"
+
+static struct dentry *ipoib_root;
+
+static void format_gid(union ib_gid *gid, char *buf)
+{
+ int i, n;
+
+ for (n = 0, i = 0; i < 8; ++i) {
+ n += sprintf(buf + n, "%x",
+ be16_to_cpu(((__be16 *) gid->raw)[i]));
+ if (i < 7)
+ buf[n++] = ':';
+ }
+}
+
+static void *ipoib_mcg_seq_start(struct seq_file *file, loff_t *pos)
+{
+ struct ipoib_mcast_iter *iter;
+ loff_t n = *pos;
+
+ iter = ipoib_mcast_iter_init(file->private);
+ if (!iter)
+ return NULL;
+
+ while (n--) {
+ if (ipoib_mcast_iter_next(iter)) {
+ kfree(iter);
+ return NULL;
+ }
+ }
+
+ return iter;
+}
+
+static void *ipoib_mcg_seq_next(struct seq_file *file, void *iter_ptr,
+ loff_t *pos)
+{
+ struct ipoib_mcast_iter *iter = iter_ptr;
+
+ (*pos)++;
+
+ if (ipoib_mcast_iter_next(iter)) {
+ kfree(iter);
+ return NULL;
+ }
+
+ return iter;
+}
+
+static void ipoib_mcg_seq_stop(struct seq_file *file, void *iter_ptr)
+{
+ /* nothing for now */
+}
+
+static int ipoib_mcg_seq_show(struct seq_file *file, void *iter_ptr)
+{
+ struct ipoib_mcast_iter *iter = iter_ptr;
+ char gid_buf[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff"];
+ union ib_gid mgid;
+ unsigned long created;
+ unsigned int queuelen, complete, send_only;
+
+ if (!iter)
+ return 0;
+
+ ipoib_mcast_iter_read(iter, &mgid, &created, &queuelen,
+ &complete, &send_only);
+
+ format_gid(&mgid, gid_buf);
+
+ seq_printf(file,
+ "GID: %s\n"
+ " created: %10ld\n"
+ " queuelen: %9d\n"
+ " complete: %9s\n"
+ " send_only: %8s\n"
+ "\n",
+ gid_buf, created, queuelen,
+ complete ? "yes" : "no",
+ send_only ? "yes" : "no");
+
+ return 0;
+}
+
+static const struct seq_operations ipoib_mcg_seq_ops = {
+ .start = ipoib_mcg_seq_start,
+ .next = ipoib_mcg_seq_next,
+ .stop = ipoib_mcg_seq_stop,
+ .show = ipoib_mcg_seq_show,
+};
+
+static int ipoib_mcg_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ int ret;
+
+ ret = seq_open(file, &ipoib_mcg_seq_ops);
+ if (ret)
+ return ret;
+
+ seq = file->private_data;
+ seq->private = inode->i_private;
+
+ return 0;
+}
+
+static const struct file_operations ipoib_mcg_fops = {
+ .owner = THIS_MODULE,
+ .open = ipoib_mcg_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release
+};
+
+static void *ipoib_path_seq_start(struct seq_file *file, loff_t *pos)
+{
+ struct ipoib_path_iter *iter;
+ loff_t n = *pos;
+
+ iter = ipoib_path_iter_init(file->private);
+ if (!iter)
+ return NULL;
+
+ while (n--) {
+ if (ipoib_path_iter_next(iter)) {
+ kfree(iter);
+ return NULL;
+ }
+ }
+
+ return iter;
+}
+
+static void *ipoib_path_seq_next(struct seq_file *file, void *iter_ptr,
+ loff_t *pos)
+{
+ struct ipoib_path_iter *iter = iter_ptr;
+
+ (*pos)++;
+
+ if (ipoib_path_iter_next(iter)) {
+ kfree(iter);
+ return NULL;
+ }
+
+ return iter;
+}
+
+static void ipoib_path_seq_stop(struct seq_file *file, void *iter_ptr)
+{
+ /* nothing for now */
+}
+
+static int ipoib_path_seq_show(struct seq_file *file, void *iter_ptr)
+{
+ struct ipoib_path_iter *iter = iter_ptr;
+ char gid_buf[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff"];
+ struct ipoib_path path;
+ int rate;
+
+ if (!iter)
+ return 0;
+
+ ipoib_path_iter_read(iter, &path);
+
+ format_gid(&path.pathrec.dgid, gid_buf);
+
+ seq_printf(file,
+ "GID: %s\n"
+ " complete: %6s\n",
+ gid_buf, path.pathrec.dlid ? "yes" : "no");
+
+ if (path.pathrec.dlid) {
+ rate = ib_rate_to_mult(path.pathrec.rate) * 25;
+
+ seq_printf(file,
+ " DLID: 0x%04x\n"
+ " SL: %12d\n"
+ " rate: %*d%s Gb/sec\n",
+ be16_to_cpu(path.pathrec.dlid),
+ path.pathrec.sl,
+ 10 - ((rate % 10) ? 2 : 0),
+ rate / 10, rate % 10 ? ".5" : "");
+ }
+
+ seq_putc(file, '\n');
+
+ return 0;
+}
+
+static const struct seq_operations ipoib_path_seq_ops = {
+ .start = ipoib_path_seq_start,
+ .next = ipoib_path_seq_next,
+ .stop = ipoib_path_seq_stop,
+ .show = ipoib_path_seq_show,
+};
+
+static int ipoib_path_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ int ret;
+
+ ret = seq_open(file, &ipoib_path_seq_ops);
+ if (ret)
+ return ret;
+
+ seq = file->private_data;
+ seq->private = inode->i_private;
+
+ return 0;
+}
+
+static const struct file_operations ipoib_path_fops = {
+ .owner = THIS_MODULE,
+ .open = ipoib_path_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release
+};
+
+void ipoib_create_debug_files(struct ifnet *dev)
+{
+ struct ipoib_dev_priv *priv = dev->if_softc;
+ char name[IFNAMSIZ + sizeof "_path"];
+
+ snprintf(name, sizeof name, "%s_mcg", if_name(dev));
+ priv->mcg_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO,
+ ipoib_root, dev, &ipoib_mcg_fops);
+ if (!priv->mcg_dentry)
+ ipoib_warn(priv, "failed to create mcg debug file\n");
+
+ snprintf(name, sizeof name, "%s_path", if_name(dev));
+ priv->path_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO,
+ ipoib_root, dev, &ipoib_path_fops);
+ if (!priv->path_dentry)
+ ipoib_warn(priv, "failed to create path debug file\n");
+}
+
+void ipoib_delete_debug_files(struct ifnet *dev)
+{
+ struct ipoib_dev_priv *priv = dev->if_softc;
+
+ if (priv->mcg_dentry)
+ debugfs_remove(priv->mcg_dentry);
+ if (priv->path_dentry)
+ debugfs_remove(priv->path_dentry);
+}
+
+int ipoib_register_debugfs(void)
+{
+ ipoib_root = debugfs_create_dir("ipoib", NULL);
+ return ipoib_root ? 0 : -ENOMEM;
+}
+
+void ipoib_unregister_debugfs(void)
+{
+ debugfs_remove(ipoib_root);
+}
diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c
new file mode 100644
index 0000000..d3b68bc
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -0,0 +1,997 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004, 2005 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ipoib.h"
+
+#include <rdma/ib_cache.h>
+
+#include <security/mac/mac_framework.h>
+
+#include <linux/delay.h>
+#include <linux/dma-mapping.h>
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
+static int data_debug_level;
+
+module_param(data_debug_level, int, 0644);
+MODULE_PARM_DESC(data_debug_level,
+ "Enable data path debug tracing if > 0");
+#endif
+
+static DEFINE_MUTEX(pkey_mutex);
+
+struct ipoib_ah *ipoib_create_ah(struct ipoib_dev_priv *priv,
+ struct ib_pd *pd, struct ib_ah_attr *attr)
+{
+ struct ipoib_ah *ah;
+
+ ah = kmalloc(sizeof *ah, GFP_KERNEL);
+ if (!ah)
+ return NULL;
+
+ ah->priv = priv;
+ ah->last_send = 0;
+ kref_init(&ah->ref);
+
+ ah->ah = ib_create_ah(pd, attr);
+ if (IS_ERR(ah->ah)) {
+ kfree(ah);
+ ah = NULL;
+ } else
+ ipoib_dbg(priv, "Created ah %p\n", ah->ah);
+
+ return ah;
+}
+
+void ipoib_free_ah(struct kref *kref)
+{
+ struct ipoib_ah *ah = container_of(kref, struct ipoib_ah, ref);
+ struct ipoib_dev_priv *priv = ah->priv;
+
+ unsigned long flags;
+
+ spin_lock_irqsave(&priv->lock, flags);
+ list_add_tail(&ah->list, &priv->dead_ahs);
+ spin_unlock_irqrestore(&priv->lock, flags);
+}
+
+void
+ipoib_dma_unmap_rx(struct ipoib_dev_priv *priv, struct ipoib_rx_buf *rx_req)
+{
+ struct mbuf *m;
+ int i;
+
+ for (i = 0, m = rx_req->mb; m != NULL; m = m->m_next, i++)
+ ib_dma_unmap_single(priv->ca, rx_req->mapping[i], m->m_len,
+ DMA_FROM_DEVICE);
+}
+
+void
+ipoib_dma_mb(struct ipoib_dev_priv *priv, struct mbuf *mb, unsigned int length)
+{
+
+ m_adj(mb, -(mb->m_pkthdr.len - length));
+}
+
+struct mbuf *
+ipoib_alloc_map_mb(struct ipoib_dev_priv *priv, struct ipoib_rx_buf *rx_req,
+ int size)
+{
+ struct mbuf *mb, *m;
+ int i, j;
+
+ rx_req->mb = NULL;
+ mb = m_getm2(NULL, size, M_NOWAIT, MT_DATA, M_PKTHDR);
+ if (mb == NULL)
+ return (NULL);
+ for (i = 0, m = mb; m != NULL; m = m->m_next, i++) {
+ m->m_len = (m->m_flags & M_EXT) ? m->m_ext.ext_size :
+ ((m->m_flags & M_PKTHDR) ? MHLEN : MLEN);
+ mb->m_pkthdr.len += m->m_len;
+ rx_req->mapping[i] = ib_dma_map_single(priv->ca,
+ mtod(m, void *), m->m_len, DMA_FROM_DEVICE);
+ if (unlikely(ib_dma_mapping_error(priv->ca,
+ rx_req->mapping[i])))
+ goto error;
+
+ }
+ rx_req->mb = mb;
+ return (mb);
+error:
+ for (j = 0, m = mb; j < i; m = m->m_next, j++)
+ ib_dma_unmap_single(priv->ca, rx_req->mapping[j], m->m_len,
+ DMA_FROM_DEVICE);
+ m_freem(mb);
+ return (NULL);
+
+}
+
+static int ipoib_ib_post_receive(struct ipoib_dev_priv *priv, int id)
+{
+ struct ipoib_rx_buf *rx_req;
+ struct ib_recv_wr *bad_wr;
+ struct mbuf *m;
+ int ret;
+ int i;
+
+ rx_req = &priv->rx_ring[id];
+ for (m = rx_req->mb, i = 0; m != NULL; m = m->m_next, i++) {
+ priv->rx_sge[i].addr = rx_req->mapping[i];
+ priv->rx_sge[i].length = m->m_len;
+ }
+ priv->rx_wr.num_sge = i;
+ priv->rx_wr.wr_id = id | IPOIB_OP_RECV;
+
+ ret = ib_post_recv(priv->qp, &priv->rx_wr, &bad_wr);
+ if (unlikely(ret)) {
+ ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret);
+ ipoib_dma_unmap_rx(priv, &priv->rx_ring[id]);
+ m_freem(priv->rx_ring[id].mb);
+ priv->rx_ring[id].mb = NULL;
+ }
+
+ return ret;
+}
+
+static struct mbuf *
+ipoib_alloc_rx_mb(struct ipoib_dev_priv *priv, int id)
+{
+
+ return ipoib_alloc_map_mb(priv, &priv->rx_ring[id],
+ priv->max_ib_mtu + IB_GRH_BYTES);
+}
+
+static int ipoib_ib_post_receives(struct ipoib_dev_priv *priv)
+{
+ int i;
+
+ for (i = 0; i < ipoib_recvq_size; ++i) {
+ if (!ipoib_alloc_rx_mb(priv, i)) {
+ ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
+ return -ENOMEM;
+ }
+ if (ipoib_ib_post_receive(priv, i)) {
+ ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n", i);
+ return -EIO;
+ }
+ }
+
+ return 0;
+}
+
+static void
+ipoib_ib_handle_rx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc)
+{
+ struct ipoib_rx_buf saverx;
+ unsigned int wr_id = wc->wr_id & ~IPOIB_OP_RECV;
+ struct ifnet *dev = priv->dev;
+ struct ipoib_header *eh;
+ struct mbuf *mb;
+
+ ipoib_dbg_data(priv, "recv completion: id %d, status: %d\n",
+ wr_id, wc->status);
+
+ if (unlikely(wr_id >= ipoib_recvq_size)) {
+ ipoib_warn(priv, "recv completion event with wrid %d (> %d)\n",
+ wr_id, ipoib_recvq_size);
+ return;
+ }
+
+ mb = priv->rx_ring[wr_id].mb;
+
+ if (unlikely(wc->status != IB_WC_SUCCESS)) {
+ if (wc->status != IB_WC_WR_FLUSH_ERR) {
+ ipoib_warn(priv, "failed recv event "
+ "(status=%d, wrid=%d vend_err %x)\n",
+ wc->status, wr_id, wc->vendor_err);
+ goto repost;
+ }
+ if (mb) {
+ ipoib_dma_unmap_rx(priv, &priv->rx_ring[wr_id]);
+ m_freem(mb);
+ priv->rx_ring[wr_id].mb = NULL;
+ }
+ return;
+ }
+
+ /*
+ * Drop packets that this interface sent, ie multicast packets
+ * that the HCA has replicated.
+ */
+ if (wc->slid == priv->local_lid && wc->src_qp == priv->qp->qp_num)
+ goto repost;
+
+ memcpy(&saverx, &priv->rx_ring[wr_id], sizeof(saverx));
+ /*
+ * If we can't allocate a new RX buffer, dump
+ * this packet and reuse the old buffer.
+ */
+ if (unlikely(!ipoib_alloc_rx_mb(priv, wr_id))) {
+ memcpy(&priv->rx_ring[wr_id], &saverx, sizeof(saverx));
+ dev->if_iqdrops++;
+ goto repost;
+ }
+
+ ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
+ wc->byte_len, wc->slid);
+
+ ipoib_dma_unmap_rx(priv, &saverx);
+ ipoib_dma_mb(priv, mb, wc->byte_len);
+
+ ++dev->if_ipackets;
+ dev->if_ibytes += mb->m_pkthdr.len;
+ mb->m_pkthdr.rcvif = dev;
+ m_adj(mb, sizeof(struct ib_grh) - INFINIBAND_ALEN);
+ eh = mtod(mb, struct ipoib_header *);
+ bzero(eh->hwaddr, 4); /* Zero the queue pair, only dgid is in grh */
+
+ if (test_bit(IPOIB_FLAG_CSUM, &priv->flags) && likely(wc->csum_ok))
+ mb->m_pkthdr.csum_flags = CSUM_IP_CHECKED | CSUM_IP_VALID;
+
+ dev->if_input(dev, mb);
+
+repost:
+ if (unlikely(ipoib_ib_post_receive(priv, wr_id)))
+ ipoib_warn(priv, "ipoib_ib_post_receive failed "
+ "for buf %d\n", wr_id);
+}
+
+int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req, int max)
+{
+ struct mbuf *mb = tx_req->mb;
+ u64 *mapping = tx_req->mapping;
+ struct mbuf *m, *p;
+ int error;
+ int i;
+
+ for (m = mb, p = NULL, i = 0; m != NULL; p = m, m = m->m_next, i++) {
+ if (m->m_len != 0)
+ continue;
+ if (p == NULL)
+ panic("ipoib_dma_map_tx: First mbuf empty\n");
+ p->m_next = m_free(m);
+ m = p;
+ i--;
+ }
+ i--;
+ if (i >= max) {
+ tx_req->mb = mb = m_defrag(mb, M_DONTWAIT);
+ if (mb == NULL)
+ return -EIO;
+ for (m = mb, i = 0; m != NULL; m = m->m_next, i++);
+ if (i >= max)
+ return -EIO;
+ }
+ error = 0;
+ for (m = mb, i = 0; m != NULL; m = m->m_next, i++) {
+ mapping[i] = ib_dma_map_single(ca, mtod(m, void *),
+ m->m_len, DMA_TO_DEVICE);
+ if (unlikely(ib_dma_mapping_error(ca, mapping[i]))) {
+ error = -EIO;
+ break;
+ }
+ }
+ if (error) {
+ int end;
+
+ end = i;
+ for (m = mb, i = 0; i < end; m = m->m_next, i++)
+ ib_dma_unmap_single(ca, mapping[i], m->m_len,
+ DMA_TO_DEVICE);
+ }
+ return error;
+}
+
+void ipoib_dma_unmap_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req)
+{
+ struct mbuf *mb = tx_req->mb;
+ u64 *mapping = tx_req->mapping;
+ struct mbuf *m;
+ int i;
+
+ for (m = mb, i = 0; m != NULL; m = m->m_next, i++)
+ ib_dma_unmap_single(ca, mapping[i], m->m_len, DMA_TO_DEVICE);
+}
+
+static void ipoib_ib_handle_tx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc)
+{
+ struct ifnet *dev = priv->dev;
+ unsigned int wr_id = wc->wr_id;
+ struct ipoib_tx_buf *tx_req;
+
+ ipoib_dbg_data(priv, "send completion: id %d, status: %d\n",
+ wr_id, wc->status);
+
+ if (unlikely(wr_id >= ipoib_sendq_size)) {
+ ipoib_warn(priv, "send completion event with wrid %d (> %d)\n",
+ wr_id, ipoib_sendq_size);
+ return;
+ }
+
+ tx_req = &priv->tx_ring[wr_id];
+
+ ipoib_dma_unmap_tx(priv->ca, tx_req);
+
+ ++dev->if_opackets;
+ dev->if_obytes += tx_req->mb->m_pkthdr.len;
+
+ m_freem(tx_req->mb);
+
+ ++priv->tx_tail;
+ if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
+ (dev->if_drv_flags & IFF_DRV_OACTIVE) &&
+ test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
+ dev->if_drv_flags &= ~IFF_DRV_OACTIVE;
+
+ if (wc->status != IB_WC_SUCCESS &&
+ wc->status != IB_WC_WR_FLUSH_ERR)
+ ipoib_warn(priv, "failed send event "
+ "(status=%d, wrid=%d vend_err %x)\n",
+ wc->status, wr_id, wc->vendor_err);
+}
+
+int
+ipoib_poll_tx(struct ipoib_dev_priv *priv)
+{
+ int n, i;
+
+ n = ib_poll_cq(priv->send_cq, MAX_SEND_CQE, priv->send_wc);
+ for (i = 0; i < n; ++i) {
+ struct ib_wc *wc = priv->send_wc + i;
+ if (wc->wr_id & IPOIB_OP_CM)
+ ipoib_cm_handle_tx_wc(priv, wc);
+ else
+ ipoib_ib_handle_tx_wc(priv, wc);
+ }
+
+ return n == MAX_SEND_CQE;
+}
+
+static void
+ipoib_poll(struct ipoib_dev_priv *priv)
+{
+ int n, i;
+
+poll_more:
+ for (;;) {
+ n = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC, priv->ibwc);
+
+ for (i = 0; i < n; i++) {
+ struct ib_wc *wc = priv->ibwc + i;
+
+ if ((wc->wr_id & IPOIB_OP_RECV) == 0)
+ panic("ipoib_poll: Bad wr_id 0x%jX\n",
+ (intmax_t)wc->wr_id);
+ if (wc->wr_id & IPOIB_OP_CM)
+ ipoib_cm_handle_rx_wc(priv, wc);
+ else
+ ipoib_ib_handle_rx_wc(priv, wc);
+ }
+
+ if (n != IPOIB_NUM_WC)
+ break;
+ }
+
+ if (ib_req_notify_cq(priv->recv_cq,
+ IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS))
+ goto poll_more;
+}
+
+void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr)
+{
+ struct ipoib_dev_priv *priv = dev_ptr;
+
+ ipoib_poll(priv);
+}
+
+static void drain_tx_cq(struct ipoib_dev_priv *priv)
+{
+ struct ifnet *dev = priv->dev;
+
+ spin_lock(&priv->lock);
+ while (ipoib_poll_tx(priv))
+ ; /* nothing */
+
+ if (dev->if_drv_flags & IFF_DRV_OACTIVE)
+ mod_timer(&priv->poll_timer, jiffies + 1);
+
+ spin_unlock(&priv->lock);
+}
+
+void ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr)
+{
+ struct ipoib_dev_priv *priv = dev_ptr;
+
+ mod_timer(&priv->poll_timer, jiffies);
+}
+
+static inline int
+post_send(struct ipoib_dev_priv *priv, unsigned int wr_id,
+ struct ib_ah *address, u32 qpn, struct ipoib_tx_buf *tx_req, void *head,
+ int hlen)
+{
+ struct ib_send_wr *bad_wr;
+ struct mbuf *mb = tx_req->mb;
+ u64 *mapping = tx_req->mapping;
+ struct mbuf *m;
+ int i;
+
+ for (m = mb, i = 0; m != NULL; m = m->m_next, i++) {
+ priv->tx_sge[i].addr = mapping[i];
+ priv->tx_sge[i].length = m->m_len;
+ }
+ priv->tx_wr.num_sge = i;
+ priv->tx_wr.wr_id = wr_id;
+ priv->tx_wr.wr.ud.remote_qpn = qpn;
+ priv->tx_wr.wr.ud.ah = address;
+
+
+ if (head) {
+ priv->tx_wr.wr.ud.mss = 0; /* XXX mb_shinfo(mb)->gso_size; */
+ priv->tx_wr.wr.ud.header = head;
+ priv->tx_wr.wr.ud.hlen = hlen;
+ priv->tx_wr.opcode = IB_WR_LSO;
+ } else
+ priv->tx_wr.opcode = IB_WR_SEND;
+
+ return ib_post_send(priv->qp, &priv->tx_wr, &bad_wr);
+}
+
+void
+ipoib_send(struct ipoib_dev_priv *priv, struct mbuf *mb,
+ struct ipoib_ah *address, u32 qpn)
+{
+ struct ifnet *dev = priv->dev;
+ struct ipoib_tx_buf *tx_req;
+ int hlen;
+ void *phead;
+
+ if (unlikely(priv->tx_outstanding > MAX_SEND_CQE))
+ while (ipoib_poll_tx(priv))
+ ; /* nothing */
+
+ m_adj(mb, sizeof (struct ipoib_pseudoheader));
+ if (0 /* XXX segment offload mb_is_gso(mb) */) {
+ /* XXX hlen = mb_transport_offset(mb) + tcp_hdrlen(mb); */
+ phead = mtod(mb, void *);
+ if (mb->m_len < hlen) {
+ ipoib_warn(priv, "linear data too small\n");
+ ++dev->if_oerrors;
+ m_freem(mb);
+ return;
+ }
+ m_adj(mb, hlen);
+ } else {
+ if (unlikely(mb->m_pkthdr.len - IPOIB_ENCAP_LEN > priv->mcast_mtu)) {
+ ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
+ mb->m_pkthdr.len, priv->mcast_mtu);
+ ++dev->if_oerrors;
+ ipoib_cm_mb_too_long(priv, mb, priv->mcast_mtu);
+ return;
+ }
+ phead = NULL;
+ hlen = 0;
+ }
+
+ ipoib_dbg_data(priv, "sending packet, length=%d address=%p qpn=0x%06x\n",
+ mb->m_pkthdr.len, address, qpn);
+
+ /*
+ * We put the mb into the tx_ring _before_ we call post_send()
+ * because it's entirely possible that the completion handler will
+ * run before we execute anything after the post_send(). That
+ * means we have to make sure everything is properly recorded and
+ * our state is consistent before we call post_send().
+ */
+ tx_req = &priv->tx_ring[priv->tx_head & (ipoib_sendq_size - 1)];
+ tx_req->mb = mb;
+ if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req, IPOIB_UD_TX_SG))) {
+ ++dev->if_oerrors;
+ if (tx_req->mb)
+ m_freem(tx_req->mb);
+ return;
+ }
+
+ if (mb->m_pkthdr.csum_flags & (CSUM_IP|CSUM_TCP|CSUM_UDP))
+ priv->tx_wr.send_flags |= IB_SEND_IP_CSUM;
+ else
+ priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM;
+
+ if (++priv->tx_outstanding == ipoib_sendq_size) {
+ ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
+ if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP))
+ ipoib_warn(priv, "request notify on send CQ failed\n");
+ dev->if_drv_flags |= IFF_DRV_OACTIVE;
+ }
+
+ if (unlikely(post_send(priv,
+ priv->tx_head & (ipoib_sendq_size - 1), address->ah, qpn,
+ tx_req, phead, hlen))) {
+ ipoib_warn(priv, "post_send failed\n");
+ ++dev->if_oerrors;
+ --priv->tx_outstanding;
+ ipoib_dma_unmap_tx(priv->ca, tx_req);
+ m_freem(mb);
+ if (dev->if_drv_flags & IFF_DRV_OACTIVE)
+ dev->if_drv_flags &= ~IFF_DRV_OACTIVE;
+ } else {
+ address->last_send = priv->tx_head;
+ ++priv->tx_head;
+ }
+}
+
+static void __ipoib_reap_ah(struct ipoib_dev_priv *priv)
+{
+ struct ipoib_ah *ah, *tah;
+ LIST_HEAD(remove_list);
+ unsigned long flags;
+
+ spin_lock_irqsave(&priv->lock, flags);
+
+ list_for_each_entry_safe(ah, tah, &priv->dead_ahs, list)
+ if ((int) priv->tx_tail - (int) ah->last_send >= 0) {
+ list_del(&ah->list);
+ ib_destroy_ah(ah->ah);
+ kfree(ah);
+ }
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+}
+
+void ipoib_reap_ah(struct work_struct *work)
+{
+ struct ipoib_dev_priv *priv =
+ container_of(work, struct ipoib_dev_priv, ah_reap_task.work);
+
+ __ipoib_reap_ah(priv);
+
+ if (!test_bit(IPOIB_STOP_REAPER, &priv->flags))
+ queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task,
+ HZ);
+}
+
+static void ipoib_ah_dev_cleanup(struct ipoib_dev_priv *priv)
+{
+ unsigned long begin;
+
+ begin = jiffies;
+
+ while (!list_empty(&priv->dead_ahs)) {
+ __ipoib_reap_ah(priv);
+
+ if (time_after(jiffies, begin + HZ)) {
+ ipoib_warn(priv, "timing out; will leak address handles\n");
+ break;
+ }
+
+ msleep(1);
+ }
+}
+
+static void ipoib_ib_tx_timer_func(unsigned long ctx)
+{
+ drain_tx_cq((struct ipoib_dev_priv *)ctx);
+}
+
+int ipoib_ib_dev_open(struct ipoib_dev_priv *priv)
+{
+ int ret;
+
+ if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &priv->pkey_index)) {
+ ipoib_warn(priv, "P_Key 0x%04x not found\n", priv->pkey);
+ clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+ return -1;
+ }
+ set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+
+ ret = ipoib_init_qp(priv);
+ if (ret) {
+ ipoib_warn(priv, "ipoib_init_qp returned %d\n", ret);
+ return -1;
+ }
+
+ ret = ipoib_ib_post_receives(priv);
+ if (ret) {
+ ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret);
+ ipoib_ib_dev_stop(priv, 1);
+ return -1;
+ }
+
+ ret = ipoib_cm_dev_open(priv);
+ if (ret) {
+ ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret);
+ ipoib_ib_dev_stop(priv, 1);
+ return -1;
+ }
+
+ clear_bit(IPOIB_STOP_REAPER, &priv->flags);
+ queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ);
+
+ return 0;
+}
+
+static void ipoib_pkey_dev_check_presence(struct ipoib_dev_priv *priv)
+{
+ u16 pkey_index = 0;
+
+ if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index))
+ clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+ else
+ set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+}
+
+int ipoib_ib_dev_up(struct ipoib_dev_priv *priv)
+{
+
+ ipoib_pkey_dev_check_presence(priv);
+
+ if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
+ ipoib_dbg(priv, "PKEY is not assigned.\n");
+ return 0;
+ }
+
+ set_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
+
+ return ipoib_mcast_start_thread(priv);
+}
+
+int ipoib_ib_dev_down(struct ipoib_dev_priv *priv, int flush)
+{
+
+ ipoib_dbg(priv, "downing ib_dev\n");
+
+ clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
+ if_link_state_change(priv->dev, LINK_STATE_DOWN);
+
+ /* Shutdown the P_Key thread if still active */
+ if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
+ mutex_lock(&pkey_mutex);
+ set_bit(IPOIB_PKEY_STOP, &priv->flags);
+ cancel_delayed_work(&priv->pkey_poll_task);
+ mutex_unlock(&pkey_mutex);
+ if (flush)
+ flush_workqueue(ipoib_workqueue);
+ }
+
+ ipoib_mcast_stop_thread(priv, flush);
+ ipoib_mcast_dev_flush(priv);
+
+ ipoib_flush_paths(priv);
+
+ return 0;
+}
+
+static int recvs_pending(struct ipoib_dev_priv *priv)
+{
+ int pending = 0;
+ int i;
+
+ for (i = 0; i < ipoib_recvq_size; ++i)
+ if (priv->rx_ring[i].mb)
+ ++pending;
+
+ return pending;
+}
+
+void ipoib_drain_cq(struct ipoib_dev_priv *priv)
+{
+ int i, n;
+
+ do {
+ n = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC, priv->ibwc);
+ for (i = 0; i < n; ++i) {
+ /*
+ * Convert any successful completions to flush
+ * errors to avoid passing packets up the
+ * stack after bringing the device down.
+ */
+ if (priv->ibwc[i].status == IB_WC_SUCCESS)
+ priv->ibwc[i].status = IB_WC_WR_FLUSH_ERR;
+
+ if ((priv->ibwc[i].wr_id & IPOIB_OP_RECV) == 0)
+ panic("ipoib_drain_cq: Bad wrid 0x%jX\n",
+ (intmax_t)priv->ibwc[i].wr_id);
+ if (priv->ibwc[i].wr_id & IPOIB_OP_CM)
+ ipoib_cm_handle_rx_wc(priv, priv->ibwc + i);
+ else
+ ipoib_ib_handle_rx_wc(priv, priv->ibwc + i);
+ }
+ } while (n == IPOIB_NUM_WC);
+
+ spin_lock(&priv->lock);
+ while (ipoib_poll_tx(priv))
+ ; /* nothing */
+
+ spin_unlock(&priv->lock);
+}
+
+int ipoib_ib_dev_stop(struct ipoib_dev_priv *priv, int flush)
+{
+ struct ib_qp_attr qp_attr;
+ unsigned long begin;
+ struct ipoib_tx_buf *tx_req;
+ int i;
+
+ ipoib_cm_dev_stop(priv);
+
+ /*
+ * Move our QP to the error state and then reinitialize in
+ * when all work requests have completed or have been flushed.
+ */
+ qp_attr.qp_state = IB_QPS_ERR;
+ if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
+ ipoib_warn(priv, "Failed to modify QP to ERROR state\n");
+
+ /* Wait for all sends and receives to complete */
+ begin = jiffies;
+
+ while (priv->tx_head != priv->tx_tail || recvs_pending(priv)) {
+ if (time_after(jiffies, begin + 5 * HZ)) {
+ ipoib_warn(priv, "timing out; %d sends %d receives not completed\n",
+ priv->tx_head - priv->tx_tail, recvs_pending(priv));
+
+ /*
+ * assume the HW is wedged and just free up
+ * all our pending work requests.
+ */
+ while ((int) priv->tx_tail - (int) priv->tx_head < 0) {
+ tx_req = &priv->tx_ring[priv->tx_tail &
+ (ipoib_sendq_size - 1)];
+ ipoib_dma_unmap_tx(priv->ca, tx_req);
+ m_freem(tx_req->mb);
+ ++priv->tx_tail;
+ --priv->tx_outstanding;
+ }
+
+ for (i = 0; i < ipoib_recvq_size; ++i) {
+ struct ipoib_rx_buf *rx_req;
+
+ rx_req = &priv->rx_ring[i];
+ if (!rx_req->mb)
+ continue;
+ ipoib_dma_unmap_rx(priv, &priv->rx_ring[i]);
+ m_freem(rx_req->mb);
+ rx_req->mb = NULL;
+ }
+
+ goto timeout;
+ }
+
+ ipoib_drain_cq(priv);
+
+ msleep(1);
+ }
+
+ ipoib_dbg(priv, "All sends and receives done.\n");
+
+timeout:
+ del_timer_sync(&priv->poll_timer);
+ qp_attr.qp_state = IB_QPS_RESET;
+ if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
+ ipoib_warn(priv, "Failed to modify QP to RESET state\n");
+
+ /* Wait for all AHs to be reaped */
+ set_bit(IPOIB_STOP_REAPER, &priv->flags);
+ cancel_delayed_work(&priv->ah_reap_task);
+ if (flush)
+ flush_workqueue(ipoib_workqueue);
+
+ ipoib_ah_dev_cleanup(priv);
+
+ ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP);
+
+ return 0;
+}
+
+int ipoib_ib_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca, int port)
+{
+ struct ifnet *dev = priv->dev;
+
+ priv->ca = ca;
+ priv->port = port;
+ priv->qp = NULL;
+
+ if (ipoib_transport_dev_init(priv, ca)) {
+ printk(KERN_WARNING "%s: ipoib_transport_dev_init failed\n", ca->name);
+ return -ENODEV;
+ }
+
+ setup_timer(&priv->poll_timer, ipoib_ib_tx_timer_func,
+ (unsigned long) priv);
+
+ if (dev->if_flags & IFF_UP) {
+ if (ipoib_ib_dev_open(priv)) {
+ ipoib_transport_dev_cleanup(priv);
+ return -ENODEV;
+ }
+ }
+
+ return 0;
+}
+
+static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
+ enum ipoib_flush_level level)
+{
+ struct ipoib_dev_priv *cpriv;
+ u16 new_index;
+
+ mutex_lock(&priv->vlan_mutex);
+
+ /*
+ * Flush any child interfaces too -- they might be up even if
+ * the parent is down.
+ */
+ list_for_each_entry(cpriv, &priv->child_intfs, list)
+ __ipoib_ib_dev_flush(cpriv, level);
+
+ mutex_unlock(&priv->vlan_mutex);
+
+ if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) {
+ ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n");
+ return;
+ }
+
+ if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
+ ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_ADMIN_UP not set.\n");
+ return;
+ }
+
+ if (level == IPOIB_FLUSH_HEAVY) {
+ if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) {
+ clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+ ipoib_ib_dev_down(priv, 0);
+ ipoib_ib_dev_stop(priv, 0);
+ if (ipoib_pkey_dev_delay_open(priv))
+ return;
+ }
+
+ /* restart QP only if P_Key index is changed */
+ if (test_and_set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) &&
+ new_index == priv->pkey_index) {
+ ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n");
+ return;
+ }
+ priv->pkey_index = new_index;
+ }
+
+ if (level == IPOIB_FLUSH_LIGHT) {
+ ipoib_mark_paths_invalid(priv);
+ ipoib_mcast_dev_flush(priv);
+ }
+
+ if (level >= IPOIB_FLUSH_NORMAL)
+ ipoib_ib_dev_down(priv, 0);
+
+ if (level == IPOIB_FLUSH_HEAVY) {
+ ipoib_ib_dev_stop(priv, 0);
+ ipoib_ib_dev_open(priv);
+ }
+
+ /*
+ * The device could have been brought down between the start and when
+ * we get here, don't bring it back up if it's not configured up
+ */
+ if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
+ if (level >= IPOIB_FLUSH_NORMAL)
+ ipoib_ib_dev_up(priv);
+ ipoib_mcast_restart_task(&priv->restart_task);
+ }
+}
+
+void ipoib_ib_dev_flush_light(struct work_struct *work)
+{
+ struct ipoib_dev_priv *priv =
+ container_of(work, struct ipoib_dev_priv, flush_light);
+
+ __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_LIGHT);
+}
+
+void ipoib_ib_dev_flush_normal(struct work_struct *work)
+{
+ struct ipoib_dev_priv *priv =
+ container_of(work, struct ipoib_dev_priv, flush_normal);
+
+ __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_NORMAL);
+}
+
+void ipoib_ib_dev_flush_heavy(struct work_struct *work)
+{
+ struct ipoib_dev_priv *priv =
+ container_of(work, struct ipoib_dev_priv, flush_heavy);
+
+ __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_HEAVY);
+}
+
+void ipoib_ib_dev_cleanup(struct ipoib_dev_priv *priv)
+{
+
+ ipoib_dbg(priv, "cleaning up ib_dev\n");
+
+ ipoib_mcast_stop_thread(priv, 1);
+ ipoib_mcast_dev_flush(priv);
+
+ ipoib_ah_dev_cleanup(priv);
+ ipoib_transport_dev_cleanup(priv);
+}
+
+/*
+ * Delayed P_Key Assigment Interim Support
+ *
+ * The following is initial implementation of delayed P_Key assigment
+ * mechanism. It is using the same approach implemented for the multicast
+ * group join. The single goal of this implementation is to quickly address
+ * Bug #2507. This implementation will probably be removed when the P_Key
+ * change async notification is available.
+ */
+
+void ipoib_pkey_poll(struct work_struct *work)
+{
+ struct ipoib_dev_priv *priv =
+ container_of(work, struct ipoib_dev_priv, pkey_poll_task.work);
+
+ ipoib_pkey_dev_check_presence(priv);
+
+ if (test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
+ ipoib_open(priv);
+ else {
+ mutex_lock(&pkey_mutex);
+ if (!test_bit(IPOIB_PKEY_STOP, &priv->flags))
+ queue_delayed_work(ipoib_workqueue,
+ &priv->pkey_poll_task,
+ HZ);
+ mutex_unlock(&pkey_mutex);
+ }
+}
+
+int ipoib_pkey_dev_delay_open(struct ipoib_dev_priv *priv)
+{
+
+ /* Look for the interface pkey value in the IB Port P_Key table and */
+ /* set the interface pkey assigment flag */
+ ipoib_pkey_dev_check_presence(priv);
+
+ /* P_Key value not assigned yet - start polling */
+ if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
+ mutex_lock(&pkey_mutex);
+ clear_bit(IPOIB_PKEY_STOP, &priv->flags);
+ queue_delayed_work(ipoib_workqueue,
+ &priv->pkey_poll_task,
+ HZ);
+ mutex_unlock(&pkey_mutex);
+ return 1;
+ }
+
+ return 0;
+}
diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c
new file mode 100644
index 0000000..99c9cc2
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -0,0 +1,1537 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ipoib.h"
+
+static int ipoib_resolvemulti(struct ifnet *, struct sockaddr **,
+ struct sockaddr *);
+
+
+#include <linux/module.h>
+
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/vmalloc.h>
+
+#include <linux/if_arp.h> /* For ARPHRD_xxx */
+#include <linux/if_vlan.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
+MODULE_LICENSE("Dual BSD/GPL");
+
+int ipoib_sendq_size = IPOIB_TX_RING_SIZE;
+int ipoib_recvq_size = IPOIB_RX_RING_SIZE;
+
+module_param_named(send_queue_size, ipoib_sendq_size, int, 0444);
+MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");
+module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);
+MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue");
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+int ipoib_debug_level = 1;
+
+module_param_named(debug_level, ipoib_debug_level, int, 0644);
+MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
+#endif
+
+struct ipoib_path_iter {
+ struct ipoib_dev_priv *priv;
+ struct ipoib_path path;
+};
+
+static const u8 ipv4_bcast_addr[] = {
+ 0x00, 0xff, 0xff, 0xff,
+ 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff
+};
+
+struct workqueue_struct *ipoib_workqueue;
+
+struct ib_sa_client ipoib_sa_client;
+
+static void ipoib_add_one(struct ib_device *device);
+static void ipoib_remove_one(struct ib_device *device);
+static void ipoib_start(struct ifnet *dev);
+static int ipoib_output(struct ifnet *ifp, struct mbuf *m,
+ struct sockaddr *dst, struct route *ro);
+static int ipoib_ioctl(struct ifnet *ifp, u_long command, caddr_t data);
+static void ipoib_input(struct ifnet *ifp, struct mbuf *m);
+
+#define IPOIB_MTAP(_ifp, _m) \
+do { \
+ if (bpf_peers_present((_ifp)->if_bpf)) { \
+ M_ASSERTVALID(_m); \
+ ipoib_mtap_mb((_ifp), (_m)); \
+ } \
+} while (0)
+
+/*
+ * This is for clients that have an ipoib_header in the mbuf.
+ */
+static void
+ipoib_mtap_mb(struct ifnet *ifp, struct mbuf *mb)
+{
+ struct ipoib_header *ih;
+ struct ether_header eh;
+
+ ih = mtod(mb, struct ipoib_header *);
+ eh.ether_type = ih->proto;
+ bcopy(ih->hwaddr, &eh.ether_dhost, ETHER_ADDR_LEN);
+ bzero(&eh.ether_shost, ETHER_ADDR_LEN);
+ mb->m_data += sizeof(struct ipoib_header);
+ mb->m_len -= sizeof(struct ipoib_header);
+ bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb);
+ mb->m_data -= sizeof(struct ipoib_header);
+ mb->m_len += sizeof(struct ipoib_header);
+}
+
+void
+ipoib_mtap_proto(struct ifnet *ifp, struct mbuf *mb, uint16_t proto)
+{
+ struct ether_header eh;
+
+ eh.ether_type = proto;
+ bzero(&eh.ether_shost, ETHER_ADDR_LEN);
+ bzero(&eh.ether_dhost, ETHER_ADDR_LEN);
+ bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb);
+}
+
+static struct ib_client ipoib_client = {
+ .name = "ipoib",
+ .add = ipoib_add_one,
+ .remove = ipoib_remove_one
+};
+
+int
+ipoib_open(struct ipoib_dev_priv *priv)
+{
+ struct ifnet *dev = priv->dev;
+
+ ipoib_dbg(priv, "bringing up interface\n");
+
+ set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
+
+ if (ipoib_pkey_dev_delay_open(priv))
+ return 0;
+
+ if (ipoib_ib_dev_open(priv))
+ goto err_disable;
+
+ if (ipoib_ib_dev_up(priv))
+ goto err_stop;
+
+ if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
+ struct ipoib_dev_priv *cpriv;
+
+ /* Bring up any child interfaces too */
+ mutex_lock(&priv->vlan_mutex);
+ list_for_each_entry(cpriv, &priv->child_intfs, list)
+ if ((cpriv->dev->if_drv_flags & IFF_DRV_RUNNING) == 0)
+ ipoib_open(cpriv);
+ mutex_unlock(&priv->vlan_mutex);
+ }
+ dev->if_drv_flags |= IFF_DRV_RUNNING;
+ dev->if_drv_flags &= ~IFF_DRV_OACTIVE;
+
+ return 0;
+
+err_stop:
+ ipoib_ib_dev_stop(priv, 1);
+
+err_disable:
+ clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
+
+ return -EINVAL;
+}
+
+static void
+ipoib_init(void *arg)
+{
+ struct ifnet *dev;
+ struct ipoib_dev_priv *priv;
+
+ priv = arg;
+ dev = priv->dev;
+ if ((dev->if_drv_flags & IFF_DRV_RUNNING) == 0)
+ ipoib_open(priv);
+ queue_work(ipoib_workqueue, &priv->flush_light);
+}
+
+
+static int
+ipoib_stop(struct ipoib_dev_priv *priv)
+{
+ struct ifnet *dev = priv->dev;
+
+ ipoib_dbg(priv, "stopping interface\n");
+
+ clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
+
+ dev->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
+
+ ipoib_ib_dev_down(priv, 0);
+ ipoib_ib_dev_stop(priv, 0);
+
+ if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
+ struct ipoib_dev_priv *cpriv;
+
+ /* Bring down any child interfaces too */
+ mutex_lock(&priv->vlan_mutex);
+ list_for_each_entry(cpriv, &priv->child_intfs, list)
+ if ((cpriv->dev->if_drv_flags & IFF_DRV_RUNNING) != 0)
+ ipoib_stop(cpriv);
+ mutex_unlock(&priv->vlan_mutex);
+ }
+
+ return 0;
+}
+
+int
+ipoib_change_mtu(struct ipoib_dev_priv *priv, int new_mtu)
+{
+ struct ifnet *dev = priv->dev;
+
+ /* dev->if_mtu > 2K ==> connected mode */
+ if (ipoib_cm_admin_enabled(priv)) {
+ if (new_mtu > IPOIB_CM_MTU(ipoib_cm_max_mtu(priv)))
+ return -EINVAL;
+
+ if (new_mtu > priv->mcast_mtu)
+ ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n",
+ priv->mcast_mtu);
+
+ dev->if_mtu = new_mtu;
+ return 0;
+ }
+
+ if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu))
+ return -EINVAL;
+
+ priv->admin_mtu = new_mtu;
+
+ dev->if_mtu = min(priv->mcast_mtu, priv->admin_mtu);
+
+ queue_work(ipoib_workqueue, &priv->flush_light);
+
+ return 0;
+}
+
+static int
+ipoib_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
+{
+ struct ipoib_dev_priv *priv = ifp->if_softc;
+ struct ifaddr *ifa = (struct ifaddr *) data;
+ struct ifreq *ifr = (struct ifreq *) data;
+ int error = 0;
+
+ switch (command) {
+ case SIOCSIFFLAGS:
+ if (ifp->if_flags & IFF_UP) {
+ if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
+ error = -ipoib_open(priv);
+ } else
+ if (ifp->if_drv_flags & IFF_DRV_RUNNING)
+ ipoib_stop(priv);
+ break;
+ case SIOCADDMULTI:
+ case SIOCDELMULTI:
+ if (ifp->if_drv_flags & IFF_DRV_RUNNING)
+ queue_work(ipoib_workqueue, &priv->restart_task);
+ break;
+ case SIOCSIFADDR:
+ ifp->if_flags |= IFF_UP;
+
+ switch (ifa->ifa_addr->sa_family) {
+#ifdef INET
+ case AF_INET:
+ ifp->if_init(ifp->if_softc); /* before arpwhohas */
+ arp_ifinit(ifp, ifa);
+ break;
+#endif
+ default:
+ ifp->if_init(ifp->if_softc);
+ break;
+ }
+ break;
+
+ case SIOCGIFADDR:
+ {
+ struct sockaddr *sa;
+
+ sa = (struct sockaddr *) & ifr->ifr_data;
+ bcopy(IF_LLADDR(ifp),
+ (caddr_t) sa->sa_data, INFINIBAND_ALEN);
+ }
+ break;
+
+ case SIOCSIFMTU:
+ /*
+ * Set the interface MTU.
+ */
+ error = -ipoib_change_mtu(priv, ifr->ifr_mtu);
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ return (error);
+}
+
+
+static struct ipoib_path *
+__path_find(struct ipoib_dev_priv *priv, void *gid)
+{
+ struct rb_node *n = priv->path_tree.rb_node;
+ struct ipoib_path *path;
+ int ret;
+
+ while (n) {
+ path = rb_entry(n, struct ipoib_path, rb_node);
+
+ ret = memcmp(gid, path->pathrec.dgid.raw,
+ sizeof (union ib_gid));
+
+ if (ret < 0)
+ n = n->rb_left;
+ else if (ret > 0)
+ n = n->rb_right;
+ else
+ return path;
+ }
+
+ return NULL;
+}
+
+static int
+__path_add(struct ipoib_dev_priv *priv, struct ipoib_path *path)
+{
+ struct rb_node **n = &priv->path_tree.rb_node;
+ struct rb_node *pn = NULL;
+ struct ipoib_path *tpath;
+ int ret;
+
+ while (*n) {
+ pn = *n;
+ tpath = rb_entry(pn, struct ipoib_path, rb_node);
+
+ ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw,
+ sizeof (union ib_gid));
+ if (ret < 0)
+ n = &pn->rb_left;
+ else if (ret > 0)
+ n = &pn->rb_right;
+ else
+ return -EEXIST;
+ }
+
+ rb_link_node(&path->rb_node, pn, n);
+ rb_insert_color(&path->rb_node, &priv->path_tree);
+
+ list_add_tail(&path->list, &priv->path_list);
+
+ return 0;
+}
+
+void
+ipoib_path_free(struct ipoib_dev_priv *priv, struct ipoib_path *path)
+{
+
+ _IF_DRAIN(&path->queue);
+
+ if (path->ah)
+ ipoib_put_ah(path->ah);
+ if (ipoib_cm_get(path))
+ ipoib_cm_destroy_tx(ipoib_cm_get(path));
+
+ kfree(path);
+}
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+
+struct ipoib_path_iter *
+ipoib_path_iter_init(struct ipoib_dev_priv *priv)
+{
+ struct ipoib_path_iter *iter;
+
+ iter = kmalloc(sizeof *iter, GFP_KERNEL);
+ if (!iter)
+ return NULL;
+
+ iter->priv = priv;
+ memset(iter->path.pathrec.dgid.raw, 0, 16);
+
+ if (ipoib_path_iter_next(iter)) {
+ kfree(iter);
+ return NULL;
+ }
+
+ return iter;
+}
+
+int
+ipoib_path_iter_next(struct ipoib_path_iter *iter)
+{
+ struct ipoib_dev_priv *priv = iter->priv;
+ struct rb_node *n;
+ struct ipoib_path *path;
+ int ret = 1;
+
+ spin_lock_irq(&priv->lock);
+
+ n = rb_first(&priv->path_tree);
+
+ while (n) {
+ path = rb_entry(n, struct ipoib_path, rb_node);
+
+ if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw,
+ sizeof (union ib_gid)) < 0) {
+ iter->path = *path;
+ ret = 0;
+ break;
+ }
+
+ n = rb_next(n);
+ }
+
+ spin_unlock_irq(&priv->lock);
+
+ return ret;
+}
+
+void
+ipoib_path_iter_read(struct ipoib_path_iter *iter, struct ipoib_path *path)
+{
+ *path = iter->path;
+}
+
+#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
+
+void
+ipoib_mark_paths_invalid(struct ipoib_dev_priv *priv)
+{
+ struct ipoib_path *path, *tp;
+
+ spin_lock_irq(&priv->lock);
+
+ list_for_each_entry_safe(path, tp, &priv->path_list, list) {
+ ipoib_dbg(priv, "mark path LID 0x%04x GID %16D invalid\n",
+ be16_to_cpu(path->pathrec.dlid),
+ path->pathrec.dgid.raw, ":");
+ path->valid = 0;
+ }
+
+ spin_unlock_irq(&priv->lock);
+}
+
+void
+ipoib_flush_paths(struct ipoib_dev_priv *priv)
+{
+ struct ipoib_path *path, *tp;
+ LIST_HEAD(remove_list);
+ unsigned long flags;
+
+ spin_lock_irqsave(&priv->lock, flags);
+
+ list_splice_init(&priv->path_list, &remove_list);
+
+ list_for_each_entry(path, &remove_list, list)
+ rb_erase(&path->rb_node, &priv->path_tree);
+
+ list_for_each_entry_safe(path, tp, &remove_list, list) {
+ if (path->query)
+ ib_sa_cancel_query(path->query_id, path->query);
+ spin_unlock_irqrestore(&priv->lock, flags);
+ wait_for_completion(&path->done);
+ ipoib_path_free(priv, path);
+ spin_lock_irqsave(&priv->lock, flags);
+ }
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+}
+
+static void
+path_rec_completion(int status, struct ib_sa_path_rec *pathrec, void *path_ptr)
+{
+ struct ipoib_path *path = path_ptr;
+ struct ipoib_dev_priv *priv = path->priv;
+ struct ifnet *dev = priv->dev;
+ struct ipoib_ah *ah = NULL;
+ struct ipoib_ah *old_ah = NULL;
+ struct ifqueue mbqueue;
+ struct mbuf *mb;
+ unsigned long flags;
+
+ if (!status)
+ ipoib_dbg(priv, "PathRec LID 0x%04x for GID %16D\n",
+ be16_to_cpu(pathrec->dlid), pathrec->dgid.raw, ":");
+ else
+ ipoib_dbg(priv, "PathRec status %d for GID %16D\n",
+ status, path->pathrec.dgid.raw, ":");
+
+ bzero(&mbqueue, sizeof(mbqueue));
+
+ if (!status) {
+ struct ib_ah_attr av;
+
+ if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av))
+ ah = ipoib_create_ah(priv, priv->pd, &av);
+ }
+
+ spin_lock_irqsave(&priv->lock, flags);
+
+ if (ah) {
+ path->pathrec = *pathrec;
+
+ old_ah = path->ah;
+ path->ah = ah;
+
+ ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n",
+ ah, be16_to_cpu(pathrec->dlid), pathrec->sl);
+
+ for (;;) {
+ _IF_DEQUEUE(&path->queue, mb);
+ if (mb == NULL)
+ break;
+ _IF_ENQUEUE(&mbqueue, mb);
+ }
+
+#ifdef CONFIG_INFINIBAND_IPOIB_CM
+ if (ipoib_cm_enabled(priv, path->hwaddr) && !ipoib_cm_get(path))
+ ipoib_cm_set(path, ipoib_cm_create_tx(priv, path));
+#endif
+
+ path->valid = 1;
+ }
+
+ path->query = NULL;
+ complete(&path->done);
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+
+ if (old_ah)
+ ipoib_put_ah(old_ah);
+
+ for (;;) {
+ _IF_DEQUEUE(&mbqueue, mb);
+ if (mb == NULL)
+ break;
+ mb->m_pkthdr.rcvif = dev;
+ if (dev->if_transmit(dev, mb))
+ ipoib_warn(priv, "dev_queue_xmit failed "
+ "to requeue packet\n");
+ }
+}
+
+static struct ipoib_path *
+path_rec_create(struct ipoib_dev_priv *priv, uint8_t *hwaddr)
+{
+ struct ipoib_path *path;
+
+ if (!priv->broadcast)
+ return NULL;
+
+ path = kzalloc(sizeof *path, GFP_ATOMIC);
+ if (!path)
+ return NULL;
+
+ path->priv = priv;
+
+ bzero(&path->queue, sizeof(path->queue));
+
+#ifdef CONFIG_INFINIBAND_IPOIB_CM
+ memcpy(&path->hwaddr, hwaddr, INFINIBAND_ALEN);
+#endif
+ memcpy(path->pathrec.dgid.raw, &hwaddr[4], sizeof (union ib_gid));
+ path->pathrec.sgid = priv->local_gid;
+ path->pathrec.pkey = cpu_to_be16(priv->pkey);
+ path->pathrec.numb_path = 1;
+ path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class;
+
+ return path;
+}
+
+static int
+path_rec_start(struct ipoib_dev_priv *priv, struct ipoib_path *path)
+{
+ struct ifnet *dev = priv->dev;
+
+ ib_sa_comp_mask comp_mask = IB_SA_PATH_REC_MTU_SELECTOR | IB_SA_PATH_REC_MTU;
+ struct ib_sa_path_rec p_rec;
+
+ p_rec = path->pathrec;
+ p_rec.mtu_selector = IB_SA_GT;
+
+ switch (roundup_pow_of_two(dev->if_mtu + IPOIB_ENCAP_LEN)) {
+ case 512:
+ p_rec.mtu = IB_MTU_256;
+ break;
+ case 1024:
+ p_rec.mtu = IB_MTU_512;
+ break;
+ case 2048:
+ p_rec.mtu = IB_MTU_1024;
+ break;
+ case 4096:
+ p_rec.mtu = IB_MTU_2048;
+ break;
+ default:
+ /* Wildcard everything */
+ comp_mask = 0;
+ p_rec.mtu = 0;
+ p_rec.mtu_selector = 0;
+ }
+
+ ipoib_dbg(priv, "Start path record lookup for %16D MTU > %d\n",
+ p_rec.dgid.raw, ":",
+ comp_mask ? ib_mtu_enum_to_int(p_rec.mtu) : 0);
+
+ init_completion(&path->done);
+
+ path->query_id =
+ ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port,
+ &p_rec, comp_mask |
+ IB_SA_PATH_REC_DGID |
+ IB_SA_PATH_REC_SGID |
+ IB_SA_PATH_REC_NUMB_PATH |
+ IB_SA_PATH_REC_TRAFFIC_CLASS |
+ IB_SA_PATH_REC_PKEY,
+ 1000, GFP_ATOMIC,
+ path_rec_completion,
+ path, &path->query);
+ if (path->query_id < 0) {
+ ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id);
+ path->query = NULL;
+ complete(&path->done);
+ return path->query_id;
+ }
+
+ return 0;
+}
+
+static void
+ipoib_unicast_send(struct mbuf *mb, struct ipoib_dev_priv *priv, struct ipoib_header *eh)
+{
+ struct ipoib_path *path;
+
+ path = __path_find(priv, eh->hwaddr + 4);
+ if (!path || !path->valid) {
+ int new_path = 0;
+
+ if (!path) {
+ path = path_rec_create(priv, eh->hwaddr);
+ new_path = 1;
+ }
+ if (path) {
+ _IF_ENQUEUE(&path->queue, mb);
+ if (!path->query && path_rec_start(priv, path)) {
+ spin_unlock_irqrestore(&priv->lock, flags);
+ if (new_path)
+ ipoib_path_free(priv, path);
+ return;
+ } else
+ __path_add(priv, path);
+ } else {
+ ++priv->dev->if_oerrors;
+ m_freem(mb);
+ }
+
+ return;
+ }
+
+ if (ipoib_cm_get(path) && ipoib_cm_up(path)) {
+ ipoib_cm_send(priv, mb, ipoib_cm_get(path));
+ } else if (path->ah) {
+ ipoib_send(priv, mb, path->ah, IPOIB_QPN(eh->hwaddr));
+ } else if ((path->query || !path_rec_start(priv, path)) &&
+ path->queue.ifq_len < IPOIB_MAX_PATH_REC_QUEUE) {
+ _IF_ENQUEUE(&path->queue, mb);
+ } else {
+ ++priv->dev->if_oerrors;
+ m_freem(mb);
+ }
+}
+
+static int
+ipoib_send_one(struct ipoib_dev_priv *priv, struct mbuf *mb)
+{
+ struct ipoib_header *eh;
+
+ eh = mtod(mb, struct ipoib_header *);
+ if (IPOIB_IS_MULTICAST(eh->hwaddr)) {
+ /* Add in the P_Key for multicast*/
+ eh->hwaddr[8] = (priv->pkey >> 8) & 0xff;
+ eh->hwaddr[9] = priv->pkey & 0xff;
+
+ ipoib_mcast_send(priv, eh->hwaddr + 4, mb);
+ } else
+ ipoib_unicast_send(mb, priv, eh);
+
+ return 0;
+}
+
+
+static void
+_ipoib_start(struct ifnet *dev, struct ipoib_dev_priv *priv)
+{
+ struct mbuf *mb;
+
+ if ((dev->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
+ IFF_DRV_RUNNING)
+ return;
+
+ spin_lock(&priv->lock);
+ while (!IFQ_DRV_IS_EMPTY(&dev->if_snd) &&
+ (dev->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
+ IFQ_DRV_DEQUEUE(&dev->if_snd, mb);
+ if (mb == NULL)
+ break;
+ IPOIB_MTAP(dev, mb);
+ ipoib_send_one(priv, mb);
+ }
+ spin_unlock(&priv->lock);
+}
+
+static void
+ipoib_start(struct ifnet *dev)
+{
+ _ipoib_start(dev, dev->if_softc);
+}
+
+static void
+ipoib_vlan_start(struct ifnet *dev)
+{
+ struct ipoib_dev_priv *priv;
+ struct mbuf *mb;
+
+ priv = VLAN_COOKIE(dev);
+ if (priv != NULL)
+ return _ipoib_start(dev, priv);
+ while (!IFQ_DRV_IS_EMPTY(&dev->if_snd)) {
+ IFQ_DRV_DEQUEUE(&dev->if_snd, mb);
+ if (mb == NULL)
+ break;
+ m_freem(mb);
+ dev->if_oerrors++;
+ }
+}
+
+int
+ipoib_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca, int port)
+{
+
+ /* Allocate RX/TX "rings" to hold queued mbs */
+ priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring,
+ GFP_KERNEL);
+ if (!priv->rx_ring) {
+ printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n",
+ ca->name, ipoib_recvq_size);
+ goto out;
+ }
+
+ priv->tx_ring = kzalloc(ipoib_sendq_size * sizeof *priv->tx_ring, GFP_KERNEL);
+ if (!priv->tx_ring) {
+ printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n",
+ ca->name, ipoib_sendq_size);
+ goto out_rx_ring_cleanup;
+ }
+ memset(priv->tx_ring, 0, ipoib_sendq_size * sizeof *priv->tx_ring);
+
+ /* priv->tx_head, tx_tail & tx_outstanding are already 0 */
+
+ if (ipoib_ib_dev_init(priv, ca, port))
+ goto out_tx_ring_cleanup;
+
+ return 0;
+
+out_tx_ring_cleanup:
+ kfree(priv->tx_ring);
+
+out_rx_ring_cleanup:
+ kfree(priv->rx_ring);
+
+out:
+ return -ENOMEM;
+}
+
+static void
+ipoib_detach(struct ipoib_dev_priv *priv)
+{
+ struct ifnet *dev;
+
+ dev = priv->dev;
+ if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
+ bpfdetach(dev);
+ if_detach(dev);
+ if_free(dev);
+ } else
+ VLAN_SETCOOKIE(priv->dev, NULL);
+
+ free(priv, M_TEMP);
+}
+
+void
+ipoib_dev_cleanup(struct ipoib_dev_priv *priv)
+{
+ struct ipoib_dev_priv *cpriv, *tcpriv;
+
+ /* Delete any child interfaces first */
+ list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) {
+ ipoib_dev_cleanup(cpriv);
+ ipoib_detach(cpriv);
+ }
+
+ ipoib_ib_dev_cleanup(priv);
+
+ kfree(priv->rx_ring);
+ kfree(priv->tx_ring);
+
+ priv->rx_ring = NULL;
+ priv->tx_ring = NULL;
+}
+
+static volatile int ipoib_unit;
+
+static struct ipoib_dev_priv *
+ipoib_priv_alloc(void)
+{
+ struct ipoib_dev_priv *priv;
+
+ priv = malloc(sizeof(struct ipoib_dev_priv), M_TEMP, M_ZERO|M_WAITOK);
+ spin_lock_init(&priv->lock);
+ mutex_init(&priv->vlan_mutex);
+ INIT_LIST_HEAD(&priv->path_list);
+ INIT_LIST_HEAD(&priv->child_intfs);
+ INIT_LIST_HEAD(&priv->dead_ahs);
+ INIT_LIST_HEAD(&priv->multicast_list);
+ INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll);
+ INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task);
+ INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task);
+ INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light);
+ INIT_WORK(&priv->flush_normal, ipoib_ib_dev_flush_normal);
+ INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy);
+ INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
+ INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
+ memcpy(priv->broadcastaddr, ipv4_bcast_addr, INFINIBAND_ALEN);
+
+ return (priv);
+}
+
+struct ipoib_dev_priv *
+ipoib_intf_alloc(const char *name)
+{
+ struct ipoib_dev_priv *priv;
+ struct sockaddr_dl *sdl;
+ struct ifnet *dev;
+
+ priv = ipoib_priv_alloc();
+ dev = priv->dev = if_alloc(IFT_INFINIBAND);
+ if (!dev) {
+ free(priv, M_TEMP);
+ return NULL;
+ }
+ dev->if_softc = priv;
+ if_initname(dev, name, atomic_fetchadd_int(&ipoib_unit, 1));
+ dev->if_flags = IFF_BROADCAST | IFF_MULTICAST;
+ dev->if_addrlen = INFINIBAND_ALEN;
+ dev->if_hdrlen = IPOIB_HEADER_LEN;
+ if_attach(dev);
+ dev->if_init = ipoib_init;
+ dev->if_ioctl = ipoib_ioctl;
+ dev->if_start = ipoib_start;
+ dev->if_output = ipoib_output;
+ dev->if_input = ipoib_input;
+ dev->if_resolvemulti = ipoib_resolvemulti;
+ dev->if_baudrate = IF_Gbps(10LL);
+ dev->if_broadcastaddr = priv->broadcastaddr;
+ dev->if_snd.ifq_maxlen = ipoib_sendq_size * 2;
+ sdl = (struct sockaddr_dl *)dev->if_addr->ifa_addr;
+ sdl->sdl_type = IFT_INFINIBAND;
+ sdl->sdl_alen = dev->if_addrlen;
+ priv->dev = dev;
+ if_link_state_change(dev, LINK_STATE_DOWN);
+ bpfattach(dev, DLT_EN10MB, ETHER_HDR_LEN);
+
+ return dev->if_softc;
+}
+
+int
+ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca)
+{
+ struct ib_device_attr *device_attr;
+ int result = -ENOMEM;
+
+ device_attr = kmalloc(sizeof *device_attr, GFP_KERNEL);
+ if (!device_attr) {
+ printk(KERN_WARNING "%s: allocation of %zu bytes failed\n",
+ hca->name, sizeof *device_attr);
+ return result;
+ }
+
+ result = ib_query_device(hca, device_attr);
+ if (result) {
+ printk(KERN_WARNING "%s: ib_query_device failed (ret = %d)\n",
+ hca->name, result);
+ kfree(device_attr);
+ return result;
+ }
+ priv->hca_caps = device_attr->device_cap_flags;
+
+ kfree(device_attr);
+
+ priv->dev->if_hwassist = 0;
+ priv->dev->if_capabilities = 0;
+
+#ifndef CONFIG_INFINIBAND_IPOIB_CM
+ if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) {
+ set_bit(IPOIB_FLAG_CSUM, &priv->flags);
+ priv->dev->if_hwassist = CSUM_IP | CSUM_TCP | CSUM_UDP;
+ priv->dev->if_capabilities = IFCAP_HWCSUM | IFCAP_VLAN_HWCSUM;
+ }
+
+#if 0
+ if (priv->dev->features & NETIF_F_SG && priv->hca_caps & IB_DEVICE_UD_TSO)
+ priv->dev->if_capabilities |= IFCAP_TSO4 | CSUM_TSO;
+#endif
+#endif
+ priv->dev->if_capabilities |=
+ IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_LINKSTATE;
+ priv->dev->if_capenable = priv->dev->if_capabilities;
+
+ return 0;
+}
+
+
+static struct ifnet *
+ipoib_add_port(const char *format, struct ib_device *hca, u8 port)
+{
+ struct ipoib_dev_priv *priv;
+ struct ib_port_attr attr;
+ int result = -ENOMEM;
+
+ priv = ipoib_intf_alloc(format);
+ if (!priv)
+ goto alloc_mem_failed;
+
+ if (!ib_query_port(hca, port, &attr))
+ priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);
+ else {
+ printk(KERN_WARNING "%s: ib_query_port %d failed\n",
+ hca->name, port);
+ goto device_init_failed;
+ }
+
+ /* MTU will be reset when mcast join happens */
+ priv->dev->if_mtu = IPOIB_UD_MTU(priv->max_ib_mtu);
+ priv->mcast_mtu = priv->admin_mtu = priv->dev->if_mtu;
+
+ result = ib_query_pkey(hca, port, 0, &priv->pkey);
+ if (result) {
+ printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
+ hca->name, port, result);
+ goto device_init_failed;
+ }
+
+ if (ipoib_set_dev_features(priv, hca))
+ goto device_init_failed;
+
+ /*
+ * Set the full membership bit, so that we join the right
+ * broadcast group, etc.
+ */
+ priv->pkey |= 0x8000;
+
+ priv->broadcastaddr[8] = priv->pkey >> 8;
+ priv->broadcastaddr[9] = priv->pkey & 0xff;
+
+ result = ib_query_gid(hca, port, 0, &priv->local_gid);
+ if (result) {
+ printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n",
+ hca->name, port, result);
+ goto device_init_failed;
+ }
+ memcpy(IF_LLADDR(priv->dev) + 4, priv->local_gid.raw, sizeof (union ib_gid));
+
+ result = ipoib_dev_init(priv, hca, port);
+ if (result < 0) {
+ printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n",
+ hca->name, port, result);
+ goto device_init_failed;
+ }
+ if (ipoib_cm_admin_enabled(priv))
+ priv->dev->if_mtu = IPOIB_CM_MTU(ipoib_cm_max_mtu(priv));
+
+ INIT_IB_EVENT_HANDLER(&priv->event_handler,
+ priv->ca, ipoib_event);
+ result = ib_register_event_handler(&priv->event_handler);
+ if (result < 0) {
+ printk(KERN_WARNING "%s: ib_register_event_handler failed for "
+ "port %d (ret = %d)\n",
+ hca->name, port, result);
+ goto event_failed;
+ }
+ if_printf(priv->dev, "Attached to %s port %d\n", hca->name, port);
+
+ return priv->dev;
+
+event_failed:
+ ipoib_dev_cleanup(priv);
+
+device_init_failed:
+ ipoib_detach(priv);
+
+alloc_mem_failed:
+ return ERR_PTR(result);
+}
+
+static void
+ipoib_add_one(struct ib_device *device)
+{
+ struct list_head *dev_list;
+ struct ifnet *dev;
+ struct ipoib_dev_priv *priv;
+ int s, e, p;
+
+ if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+ return;
+
+ dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL);
+ if (!dev_list)
+ return;
+
+ INIT_LIST_HEAD(dev_list);
+
+ if (device->node_type == RDMA_NODE_IB_SWITCH) {
+ s = 0;
+ e = 0;
+ } else {
+ s = 1;
+ e = device->phys_port_cnt;
+ }
+
+ for (p = s; p <= e; ++p) {
+ if (rdma_port_get_link_layer(device, p) != IB_LINK_LAYER_INFINIBAND)
+ continue;
+ dev = ipoib_add_port("ib", device, p);
+ if (!IS_ERR(dev)) {
+ priv = dev->if_softc;
+ list_add_tail(&priv->list, dev_list);
+ }
+ }
+
+ ib_set_client_data(device, &ipoib_client, dev_list);
+}
+
+static void
+ipoib_remove_one(struct ib_device *device)
+{
+ struct ipoib_dev_priv *priv, *tmp;
+ struct list_head *dev_list;
+
+ if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+ return;
+
+ dev_list = ib_get_client_data(device, &ipoib_client);
+
+ list_for_each_entry_safe(priv, tmp, dev_list, list) {
+ if (rdma_port_get_link_layer(device, priv->port) != IB_LINK_LAYER_INFINIBAND)
+ continue;
+
+ ib_unregister_event_handler(&priv->event_handler);
+
+ /* dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP); */
+
+ flush_workqueue(ipoib_workqueue);
+
+ ipoib_dev_cleanup(priv);
+ ipoib_detach(priv);
+ }
+
+ kfree(dev_list);
+}
+
+static void
+ipoib_config_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag)
+{
+ struct ipoib_dev_priv *parent;
+ struct ipoib_dev_priv *priv;
+ struct ifnet *dev;
+ uint16_t pkey;
+ int error;
+
+ if (ifp->if_type != IFT_INFINIBAND)
+ return;
+ dev = VLAN_DEVAT(ifp, vtag);
+ if (dev == NULL)
+ return;
+ priv = NULL;
+ error = 0;
+ parent = ifp->if_softc;
+ /* We only support 15 bits of pkey. */
+ if (vtag & 0x8000)
+ return;
+ pkey = vtag | 0x8000; /* Set full membership bit. */
+ if (pkey == parent->pkey)
+ return;
+ /* Check for dups */
+ mutex_lock(&parent->vlan_mutex);
+ list_for_each_entry(priv, &parent->child_intfs, list) {
+ if (priv->pkey == pkey) {
+ priv = NULL;
+ error = EBUSY;
+ goto out;
+ }
+ }
+ priv = ipoib_priv_alloc();
+ priv->dev = dev;
+ priv->max_ib_mtu = parent->max_ib_mtu;
+ priv->mcast_mtu = priv->admin_mtu = parent->dev->if_mtu;
+ set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags);
+ error = ipoib_set_dev_features(priv, parent->ca);
+ if (error)
+ goto out;
+ priv->pkey = pkey;
+ priv->broadcastaddr[8] = pkey >> 8;
+ priv->broadcastaddr[9] = pkey & 0xff;
+ dev->if_broadcastaddr = priv->broadcastaddr;
+ error = ipoib_dev_init(priv, parent->ca, parent->port);
+ if (error)
+ goto out;
+ priv->parent = parent->dev;
+ list_add_tail(&priv->list, &parent->child_intfs);
+ VLAN_SETCOOKIE(dev, priv);
+ dev->if_start = ipoib_vlan_start;
+ dev->if_drv_flags &= ~IFF_DRV_RUNNING;
+ dev->if_hdrlen = IPOIB_HEADER_LEN;
+ if (ifp->if_drv_flags & IFF_DRV_RUNNING)
+ ipoib_open(priv);
+ mutex_unlock(&parent->vlan_mutex);
+ return;
+out:
+ mutex_unlock(&parent->vlan_mutex);
+ if (priv)
+ free(priv, M_TEMP);
+ if (error)
+ ipoib_warn(parent,
+ "failed to initialize subinterface: device %s, port %d vtag 0x%X",
+ parent->ca->name, parent->port, vtag);
+ return;
+}
+
+static void
+ipoib_unconfig_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag)
+{
+ struct ipoib_dev_priv *parent;
+ struct ipoib_dev_priv *priv;
+ struct ifnet *dev;
+ uint16_t pkey;
+
+ if (ifp->if_type != IFT_INFINIBAND)
+ return;
+
+ dev = VLAN_DEVAT(ifp, vtag);
+ if (dev)
+ VLAN_SETCOOKIE(dev, NULL);
+ pkey = vtag | 0x8000;
+ parent = ifp->if_softc;
+ mutex_lock(&parent->vlan_mutex);
+ list_for_each_entry(priv, &parent->child_intfs, list) {
+ if (priv->pkey == pkey) {
+ ipoib_dev_cleanup(priv);
+ list_del(&priv->list);
+ break;
+ }
+ }
+ mutex_unlock(&parent->vlan_mutex);
+}
+
+eventhandler_tag ipoib_vlan_attach;
+eventhandler_tag ipoib_vlan_detach;
+
+static int __init
+ipoib_init_module(void)
+{
+ int ret;
+
+ ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size);
+ ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE);
+ ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE);
+
+ ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size);
+ ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE);
+ ipoib_sendq_size = max(ipoib_sendq_size, max(2 * MAX_SEND_CQE,
+ IPOIB_MIN_QUEUE_SIZE));
+#ifdef CONFIG_INFINIBAND_IPOIB_CM
+ ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP);
+#endif
+
+ ipoib_vlan_attach = EVENTHANDLER_REGISTER(vlan_config,
+ ipoib_config_vlan, NULL, EVENTHANDLER_PRI_FIRST);
+ ipoib_vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig,
+ ipoib_unconfig_vlan, NULL, EVENTHANDLER_PRI_FIRST);
+
+ /*
+ * We create our own workqueue mainly because we want to be
+ * able to flush it when devices are being removed. We can't
+ * use schedule_work()/flush_scheduled_work() because both
+ * unregister_netdev() and linkwatch_event take the rtnl lock,
+ * so flush_scheduled_work() can deadlock during device
+ * removal.
+ */
+ ipoib_workqueue = create_singlethread_workqueue("ipoib");
+ if (!ipoib_workqueue) {
+ ret = -ENOMEM;
+ goto err_fs;
+ }
+
+ ib_sa_register_client(&ipoib_sa_client);
+
+ ret = ib_register_client(&ipoib_client);
+ if (ret)
+ goto err_sa;
+
+ return 0;
+
+err_sa:
+ ib_sa_unregister_client(&ipoib_sa_client);
+ destroy_workqueue(ipoib_workqueue);
+
+err_fs:
+ return ret;
+}
+
+static void __exit
+ipoib_cleanup_module(void)
+{
+
+ EVENTHANDLER_DEREGISTER(vlan_config, ipoib_vlan_attach);
+ EVENTHANDLER_DEREGISTER(vlan_unconfig, ipoib_vlan_detach);
+ ib_unregister_client(&ipoib_client);
+ ib_sa_unregister_client(&ipoib_sa_client);
+ destroy_workqueue(ipoib_workqueue);
+}
+
+/*
+ * Infiniband output routine.
+ */
+static int
+ipoib_output(struct ifnet *ifp, struct mbuf *m,
+ struct sockaddr *dst, struct route *ro)
+{
+ u_char edst[INFINIBAND_ALEN];
+ struct llentry *lle = NULL;
+ struct rtentry *rt0 = NULL;
+ struct ipoib_header *eh;
+ int error = 0;
+ short type;
+
+ if (ro != NULL) {
+ if (!(m->m_flags & (M_BCAST | M_MCAST)))
+ lle = ro->ro_lle;
+ rt0 = ro->ro_rt;
+ }
+#ifdef MAC
+ error = mac_ifnet_check_transmit(ifp, m);
+ if (error)
+ goto bad;
+#endif
+
+ M_PROFILE(m);
+ if (ifp->if_flags & IFF_MONITOR) {
+ error = ENETDOWN;
+ goto bad;
+ }
+ if (!((ifp->if_flags & IFF_UP) &&
+ (ifp->if_drv_flags & IFF_DRV_RUNNING))) {
+ error = ENETDOWN;
+ goto bad;
+ }
+
+ switch (dst->sa_family) {
+#ifdef INET
+ case AF_INET:
+ if (lle != NULL && (lle->la_flags & LLE_VALID))
+ memcpy(edst, &lle->ll_addr.mac8, sizeof(edst));
+ else if (m->m_flags & M_MCAST)
+ ip_ib_mc_map(((struct sockaddr_in *)dst)->sin_addr.s_addr, ifp->if_broadcastaddr, edst);
+ else
+ error = arpresolve(ifp, rt0, m, dst, edst, &lle);
+ if (error)
+ return (error == EWOULDBLOCK ? 0 : error);
+ type = htons(ETHERTYPE_IP);
+ break;
+ case AF_ARP:
+ {
+ struct arphdr *ah;
+ ah = mtod(m, struct arphdr *);
+ ah->ar_hrd = htons(ARPHRD_INFINIBAND);
+
+ switch(ntohs(ah->ar_op)) {
+ case ARPOP_REVREQUEST:
+ case ARPOP_REVREPLY:
+ type = htons(ETHERTYPE_REVARP);
+ break;
+ case ARPOP_REQUEST:
+ case ARPOP_REPLY:
+ default:
+ type = htons(ETHERTYPE_ARP);
+ break;
+ }
+
+ if (m->m_flags & M_BCAST)
+ bcopy(ifp->if_broadcastaddr, edst, INFINIBAND_ALEN);
+ else
+ bcopy(ar_tha(ah), edst, INFINIBAND_ALEN);
+
+ }
+ break;
+#endif
+#ifdef INET6
+ case AF_INET6:
+ if (lle != NULL && (lle->la_flags & LLE_VALID))
+ memcpy(edst, &lle->ll_addr.mac8, sizeof(edst));
+ else if (m->m_flags & M_MCAST)
+ ipv6_ib_mc_map(&((struct sockaddr_in6 *)dst)->sin6_addr, ifp->if_broadcastaddr, edst);
+ else
+ error = nd6_storelladdr(ifp, m, dst, (u_char *)edst, &lle);
+ if (error)
+ return error;
+ type = htons(ETHERTYPE_IPV6);
+ break;
+#endif
+
+ default:
+ if_printf(ifp, "can't handle af%d\n", dst->sa_family);
+ error = EAFNOSUPPORT;
+ goto bad;
+ }
+
+ /*
+ * Add local net header. If no space in first mbuf,
+ * allocate another.
+ */
+ M_PREPEND(m, IPOIB_HEADER_LEN, M_DONTWAIT);
+ if (m == NULL) {
+ error = ENOBUFS;
+ goto bad;
+ }
+ eh = mtod(m, struct ipoib_header *);
+ (void)memcpy(&eh->proto, &type, sizeof(eh->proto));
+ (void)memcpy(&eh->hwaddr, edst, sizeof (edst));
+
+ /*
+ * Queue message on interface, update output statistics if
+ * successful, and start output if interface not yet active.
+ */
+ return ((ifp->if_transmit)(ifp, m));
+bad:
+ if (m != NULL)
+ m_freem(m);
+ return (error);
+}
+
+/*
+ * Upper layer processing for a received Infiniband packet.
+ */
+void
+ipoib_demux(struct ifnet *ifp, struct mbuf *m, u_short proto)
+{
+ int isr;
+
+#ifdef MAC
+ /*
+ * Tag the mbuf with an appropriate MAC label before any other
+ * consumers can get to it.
+ */
+ mac_ifnet_create_mbuf(ifp, m);
+#endif
+ /* Allow monitor mode to claim this frame, after stats are updated. */
+ if (ifp->if_flags & IFF_MONITOR) {
+ if_printf(ifp, "discard frame at IFF_MONITOR\n");
+ m_freem(m);
+ return;
+ }
+ /*
+ * Dispatch frame to upper layer.
+ */
+ switch (proto) {
+#ifdef INET
+ case ETHERTYPE_IP:
+ isr = NETISR_IP;
+ break;
+
+ case ETHERTYPE_ARP:
+ if (ifp->if_flags & IFF_NOARP) {
+ /* Discard packet if ARP is disabled on interface */
+ m_freem(m);
+ return;
+ }
+ isr = NETISR_ARP;
+ break;
+#endif
+#ifdef INET6
+ case ETHERTYPE_IPV6:
+ isr = NETISR_IPV6;
+ break;
+#endif
+ default:
+ goto discard;
+ }
+ netisr_dispatch(isr, m);
+ return;
+
+discard:
+ m_freem(m);
+}
+
+/*
+ * Process a received Infiniband packet.
+ */
+static void
+ipoib_input(struct ifnet *ifp, struct mbuf *m)
+{
+ struct ipoib_header *eh;
+
+ if ((ifp->if_flags & IFF_UP) == 0) {
+ m_freem(m);
+ return;
+ }
+ CURVNET_SET_QUIET(ifp->if_vnet);
+
+ /* Let BPF have it before we strip the header. */
+ IPOIB_MTAP(ifp, m);
+ eh = mtod(m, struct ipoib_header *);
+ /*
+ * Reset layer specific mbuf flags to avoid confusing upper layers.
+ * Strip off Infiniband header.
+ */
+ m->m_flags &= ~M_VLANTAG;
+ m->m_flags &= ~(M_PROTOFLAGS);
+ m_adj(m, IPOIB_HEADER_LEN);
+
+ if (IPOIB_IS_MULTICAST(eh->hwaddr)) {
+ if (memcmp(eh->hwaddr, ifp->if_broadcastaddr,
+ ifp->if_addrlen) == 0)
+ m->m_flags |= M_BCAST;
+ else
+ m->m_flags |= M_MCAST;
+ ifp->if_imcasts++;
+ }
+
+ ipoib_demux(ifp, m, ntohs(eh->proto));
+ CURVNET_RESTORE();
+}
+
+static int
+ipoib_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa,
+ struct sockaddr *sa)
+{
+ struct sockaddr_dl *sdl;
+#ifdef INET
+ struct sockaddr_in *sin;
+#endif
+#ifdef INET6
+ struct sockaddr_in6 *sin6;
+#endif
+ u_char *e_addr;
+
+ switch(sa->sa_family) {
+ case AF_LINK:
+ /*
+ * No mapping needed. Just check that it's a valid MC address.
+ */
+ sdl = (struct sockaddr_dl *)sa;
+ e_addr = LLADDR(sdl);
+ if (!IPOIB_IS_MULTICAST(e_addr))
+ return EADDRNOTAVAIL;
+ *llsa = 0;
+ return 0;
+
+#ifdef INET
+ case AF_INET:
+ sin = (struct sockaddr_in *)sa;
+ if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
+ return EADDRNOTAVAIL;
+ sdl = malloc(sizeof *sdl, M_IFMADDR,
+ M_NOWAIT|M_ZERO);
+ if (sdl == NULL)
+ return ENOMEM;
+ sdl->sdl_len = sizeof *sdl;
+ sdl->sdl_family = AF_LINK;
+ sdl->sdl_index = ifp->if_index;
+ sdl->sdl_type = IFT_INFINIBAND;
+ sdl->sdl_alen = INFINIBAND_ALEN;
+ e_addr = LLADDR(sdl);
+ ip_ib_mc_map(sin->sin_addr.s_addr, ifp->if_broadcastaddr,
+ e_addr);
+ *llsa = (struct sockaddr *)sdl;
+ return 0;
+#endif
+#ifdef INET6
+ case AF_INET6:
+ sin6 = (struct sockaddr_in6 *)sa;
+ /*
+ * An IP6 address of 0 means listen to all
+ * of the multicast address used for IP6.
+ * This has no meaning in ipoib.
+ */
+ if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
+ return EADDRNOTAVAIL;
+ if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
+ return EADDRNOTAVAIL;
+ sdl = malloc(sizeof *sdl, M_IFMADDR,
+ M_NOWAIT|M_ZERO);
+ if (sdl == NULL)
+ return (ENOMEM);
+ sdl->sdl_len = sizeof *sdl;
+ sdl->sdl_family = AF_LINK;
+ sdl->sdl_index = ifp->if_index;
+ sdl->sdl_type = IFT_INFINIBAND;
+ sdl->sdl_alen = INFINIBAND_ALEN;
+ e_addr = LLADDR(sdl);
+ ipv6_ib_mc_map(&sin6->sin6_addr, ifp->if_broadcastaddr, e_addr);
+ *llsa = (struct sockaddr *)sdl;
+ return 0;
+#endif
+
+ default:
+ return EAFNOSUPPORT;
+ }
+}
+
+module_init(ipoib_init_module);
+module_exit(ipoib_cleanup_module);
diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
new file mode 100644
index 0000000..a5746e4
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -0,0 +1,907 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ipoib.h"
+
+#include <linux/delay.h>
+#include <linux/completion.h>
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+static int mcast_debug_level = 1;
+
+module_param(mcast_debug_level, int, 0644);
+MODULE_PARM_DESC(mcast_debug_level,
+ "Enable multicast debug tracing if > 0");
+#endif
+
+static DEFINE_MUTEX(mcast_mutex);
+
+struct ipoib_mcast_iter {
+ struct ipoib_dev_priv *priv;
+ union ib_gid mgid;
+ unsigned long created;
+ unsigned int queuelen;
+ unsigned int complete;
+ unsigned int send_only;
+};
+
+static void ipoib_mcast_free(struct ipoib_mcast *mcast)
+{
+ struct ifnet *dev = mcast->priv->dev;
+ int tx_dropped = 0;
+
+ ipoib_dbg_mcast(mcast->priv, "deleting multicast group %16D\n",
+ mcast->mcmember.mgid.raw, ":");
+
+ if (mcast->ah)
+ ipoib_put_ah(mcast->ah);
+
+ tx_dropped = mcast->pkt_queue.ifq_len;
+ _IF_DRAIN(&mcast->pkt_queue); /* XXX Locking. */
+
+ dev->if_oerrors += tx_dropped;
+
+ kfree(mcast);
+}
+
+static struct ipoib_mcast *ipoib_mcast_alloc(struct ipoib_dev_priv *priv,
+ int can_sleep)
+{
+ struct ipoib_mcast *mcast;
+
+ mcast = kzalloc(sizeof *mcast, can_sleep ? GFP_KERNEL : GFP_ATOMIC);
+ if (!mcast)
+ return NULL;
+
+ mcast->priv = priv;
+ mcast->created = jiffies;
+ mcast->backoff = 1;
+
+ INIT_LIST_HEAD(&mcast->list);
+ bzero(&mcast->pkt_queue, sizeof(mcast->pkt_queue));
+
+ return mcast;
+}
+
+static struct ipoib_mcast *__ipoib_mcast_find(struct ipoib_dev_priv *priv,
+ void *mgid)
+{
+ struct rb_node *n = priv->multicast_tree.rb_node;
+
+ while (n) {
+ struct ipoib_mcast *mcast;
+ int ret;
+
+ mcast = rb_entry(n, struct ipoib_mcast, rb_node);
+
+ ret = memcmp(mgid, mcast->mcmember.mgid.raw,
+ sizeof (union ib_gid));
+ if (ret < 0)
+ n = n->rb_left;
+ else if (ret > 0)
+ n = n->rb_right;
+ else
+ return mcast;
+ }
+
+ return NULL;
+}
+
+static int __ipoib_mcast_add(struct ipoib_dev_priv *priv,
+ struct ipoib_mcast *mcast)
+{
+ struct rb_node **n = &priv->multicast_tree.rb_node, *pn = NULL;
+
+ while (*n) {
+ struct ipoib_mcast *tmcast;
+ int ret;
+
+ pn = *n;
+ tmcast = rb_entry(pn, struct ipoib_mcast, rb_node);
+
+ ret = memcmp(mcast->mcmember.mgid.raw, tmcast->mcmember.mgid.raw,
+ sizeof (union ib_gid));
+ if (ret < 0)
+ n = &pn->rb_left;
+ else if (ret > 0)
+ n = &pn->rb_right;
+ else
+ return -EEXIST;
+ }
+
+ rb_link_node(&mcast->rb_node, pn, n);
+ rb_insert_color(&mcast->rb_node, &priv->multicast_tree);
+
+ return 0;
+}
+
+static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,
+ struct ib_sa_mcmember_rec *mcmember)
+{
+ struct ipoib_dev_priv *priv = mcast->priv;
+ struct ifnet *dev = priv->dev;
+ struct ipoib_ah *ah;
+ int ret;
+ int set_qkey = 0;
+
+ mcast->mcmember = *mcmember;
+
+ /* Set the cached Q_Key before we attach if it's the broadcast group */
+ if (!memcmp(mcast->mcmember.mgid.raw, dev->if_broadcastaddr + 4,
+ sizeof (union ib_gid))) {
+ spin_lock_irq(&priv->lock);
+ if (!priv->broadcast) {
+ spin_unlock_irq(&priv->lock);
+ return -EAGAIN;
+ }
+ priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey);
+ spin_unlock_irq(&priv->lock);
+ priv->tx_wr.wr.ud.remote_qkey = priv->qkey;
+ set_qkey = 1;
+ }
+
+ if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
+ if (test_and_set_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
+ ipoib_warn(priv, "multicast group %16D already attached\n",
+ mcast->mcmember.mgid.raw, ":");
+
+ return 0;
+ }
+
+ ret = ipoib_mcast_attach(priv, be16_to_cpu(mcast->mcmember.mlid),
+ &mcast->mcmember.mgid, set_qkey);
+ if (ret < 0) {
+ ipoib_warn(priv, "couldn't attach QP to multicast group %16D\n",
+ mcast->mcmember.mgid.raw, ":");
+
+ clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags);
+ return ret;
+ }
+ }
+
+ {
+ struct ib_ah_attr av = {
+ .dlid = be16_to_cpu(mcast->mcmember.mlid),
+ .port_num = priv->port,
+ .sl = mcast->mcmember.sl,
+ .ah_flags = IB_AH_GRH,
+ .static_rate = mcast->mcmember.rate,
+ .grh = {
+ .flow_label = be32_to_cpu(mcast->mcmember.flow_label),
+ .hop_limit = mcast->mcmember.hop_limit,
+ .sgid_index = 0,
+ .traffic_class = mcast->mcmember.traffic_class
+ }
+ };
+ av.grh.dgid = mcast->mcmember.mgid;
+
+ ah = ipoib_create_ah(priv, priv->pd, &av);
+ if (!ah) {
+ ipoib_warn(priv, "ib_address_create failed\n");
+ } else {
+ spin_lock_irq(&priv->lock);
+ mcast->ah = ah;
+ spin_unlock_irq(&priv->lock);
+
+ ipoib_dbg_mcast(priv, "MGID %16D AV %p, LID 0x%04x, SL %d\n",
+ mcast->mcmember.mgid.raw, ":",
+ mcast->ah->ah,
+ be16_to_cpu(mcast->mcmember.mlid),
+ mcast->mcmember.sl);
+ }
+ }
+
+ /* actually send any queued packets */
+ while (mcast->pkt_queue.ifq_len) {
+ struct mbuf *mb;
+ _IF_DEQUEUE(&mcast->pkt_queue, mb);
+ mb->m_pkthdr.rcvif = dev;
+
+ if (dev->if_transmit(dev, mb))
+ ipoib_warn(priv, "dev_queue_xmit failed to requeue packet\n");
+ }
+
+ return 0;
+}
+
+static int
+ipoib_mcast_sendonly_join_complete(int status,
+ struct ib_sa_multicast *multicast)
+{
+ struct ipoib_mcast *mcast = multicast->context;
+ struct ipoib_dev_priv *priv = mcast->priv;
+
+ /* We trap for port events ourselves. */
+ if (status == -ENETRESET)
+ return 0;
+
+ if (!status)
+ status = ipoib_mcast_join_finish(mcast, &multicast->rec);
+
+ if (status) {
+ if (mcast->logcount++ < 20)
+ ipoib_dbg_mcast(priv, "multicast join failed for %16D, status %d\n",
+ mcast->mcmember.mgid.raw, ":", status);
+
+ /* Flush out any queued packets */
+ priv->dev->if_oerrors += mcast->pkt_queue.ifq_len;
+ _IF_DRAIN(&mcast->pkt_queue);
+
+ /* Clear the busy flag so we try again */
+ status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY,
+ &mcast->flags);
+ }
+ return status;
+}
+
+static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast)
+{
+ struct ipoib_dev_priv *priv = mcast->priv;
+ struct ib_sa_mcmember_rec rec = {
+#if 0 /* Some SMs don't support send-only yet */
+ .join_state = 4
+#else
+ .join_state = 1
+#endif
+ };
+ int ret = 0;
+
+ if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
+ ipoib_dbg_mcast(priv, "device shutting down, no multicast joins\n");
+ return -ENODEV;
+ }
+
+ if (test_and_set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) {
+ ipoib_dbg_mcast(priv, "multicast entry busy, skipping\n");
+ return -EBUSY;
+ }
+
+ rec.mgid = mcast->mcmember.mgid;
+ rec.port_gid = priv->local_gid;
+ rec.pkey = cpu_to_be16(priv->pkey);
+
+ mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca,
+ priv->port, &rec,
+ IB_SA_MCMEMBER_REC_MGID |
+ IB_SA_MCMEMBER_REC_PORT_GID |
+ IB_SA_MCMEMBER_REC_PKEY |
+ IB_SA_MCMEMBER_REC_JOIN_STATE,
+ GFP_ATOMIC,
+ ipoib_mcast_sendonly_join_complete,
+ mcast);
+ if (IS_ERR(mcast->mc)) {
+ ret = PTR_ERR(mcast->mc);
+ clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
+ ipoib_warn(priv, "ib_sa_join_multicast failed (ret = %d)\n",
+ ret);
+ } else {
+ ipoib_dbg_mcast(priv, "no multicast record for %16D, starting join\n",
+ mcast->mcmember.mgid.raw, ":");
+ }
+
+ return ret;
+}
+
+void ipoib_mcast_carrier_on_task(struct work_struct *work)
+{
+ struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
+ carrier_on_task);
+ struct ib_port_attr attr;
+
+ /*
+ * Take rtnl_lock to avoid racing with ipoib_stop() and
+ * turning the carrier back on while a device is being
+ * removed.
+ */
+ if (ib_query_port(priv->ca, priv->port, &attr) ||
+ attr.state != IB_PORT_ACTIVE) {
+ ipoib_dbg(priv, "Keeping carrier off until IB port is active\n");
+ return;
+ }
+ if_link_state_change(priv->dev, LINK_STATE_UP);
+}
+
+static int ipoib_mcast_join_complete(int status,
+ struct ib_sa_multicast *multicast)
+{
+ struct ipoib_mcast *mcast = multicast->context;
+ struct ipoib_dev_priv *priv = mcast->priv;
+
+ ipoib_dbg_mcast(priv, "join completion for %16D (status %d)\n",
+ mcast->mcmember.mgid.raw, ":", status);
+
+ /* We trap for port events ourselves. */
+ if (status == -ENETRESET)
+ return 0;
+
+ if (!status)
+ status = ipoib_mcast_join_finish(mcast, &multicast->rec);
+
+ if (!status) {
+ mcast->backoff = 1;
+ mutex_lock(&mcast_mutex);
+ if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
+ queue_delayed_work(ipoib_workqueue,
+ &priv->mcast_task, 0);
+ mutex_unlock(&mcast_mutex);
+
+ /*
+ * Defer carrier on work to ipoib_workqueue to avoid a
+ * deadlock on rtnl_lock here.
+ */
+ if (mcast == priv->broadcast)
+ queue_work(ipoib_workqueue, &priv->carrier_on_task);
+
+ return 0;
+ }
+
+ if (mcast->logcount++ < 20) {
+ if (status == -ETIMEDOUT || status == -EAGAIN) {
+ ipoib_dbg_mcast(priv, "multicast join failed for %16D, status %d\n",
+ mcast->mcmember.mgid.raw, ":", status);
+ } else {
+ ipoib_warn(priv, "multicast join failed for %16D, status %d\n",
+ mcast->mcmember.mgid.raw, ":", status);
+ }
+ }
+
+ mcast->backoff *= 2;
+ if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
+ mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;
+
+ /* Clear the busy flag so we try again */
+ status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
+
+ mutex_lock(&mcast_mutex);
+ spin_lock_irq(&priv->lock);
+ if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
+ queue_delayed_work(ipoib_workqueue, &priv->mcast_task,
+ mcast->backoff * HZ);
+ spin_unlock_irq(&priv->lock);
+ mutex_unlock(&mcast_mutex);
+
+ return status;
+}
+
+static void ipoib_mcast_join(struct ipoib_dev_priv *priv,
+ struct ipoib_mcast *mcast, int create)
+{
+ struct ib_sa_mcmember_rec rec = {
+ .join_state = 1
+ };
+ ib_sa_comp_mask comp_mask;
+ int ret = 0;
+
+ ipoib_dbg_mcast(priv, "joining MGID %16D\n",
+ mcast->mcmember.mgid.raw, ":");
+
+ rec.mgid = mcast->mcmember.mgid;
+ rec.port_gid = priv->local_gid;
+ rec.pkey = cpu_to_be16(priv->pkey);
+
+ comp_mask =
+ IB_SA_MCMEMBER_REC_MGID |
+ IB_SA_MCMEMBER_REC_PORT_GID |
+ IB_SA_MCMEMBER_REC_PKEY |
+ IB_SA_MCMEMBER_REC_JOIN_STATE;
+
+ if (create) {
+ comp_mask |=
+ IB_SA_MCMEMBER_REC_QKEY |
+ IB_SA_MCMEMBER_REC_MTU_SELECTOR |
+ IB_SA_MCMEMBER_REC_MTU |
+ IB_SA_MCMEMBER_REC_TRAFFIC_CLASS |
+ IB_SA_MCMEMBER_REC_RATE_SELECTOR |
+ IB_SA_MCMEMBER_REC_RATE |
+ IB_SA_MCMEMBER_REC_SL |
+ IB_SA_MCMEMBER_REC_FLOW_LABEL |
+ IB_SA_MCMEMBER_REC_HOP_LIMIT;
+
+ rec.qkey = priv->broadcast->mcmember.qkey;
+ rec.mtu_selector = IB_SA_EQ;
+ rec.mtu = priv->broadcast->mcmember.mtu;
+ rec.traffic_class = priv->broadcast->mcmember.traffic_class;
+ rec.rate_selector = IB_SA_EQ;
+ rec.rate = priv->broadcast->mcmember.rate;
+ rec.sl = priv->broadcast->mcmember.sl;
+ rec.flow_label = priv->broadcast->mcmember.flow_label;
+ rec.hop_limit = priv->broadcast->mcmember.hop_limit;
+ }
+
+ set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
+ mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port,
+ &rec, comp_mask, GFP_KERNEL,
+ ipoib_mcast_join_complete, mcast);
+ if (IS_ERR(mcast->mc)) {
+ clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
+ ret = PTR_ERR(mcast->mc);
+ ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret);
+
+ mcast->backoff *= 2;
+ if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
+ mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;
+
+ mutex_lock(&mcast_mutex);
+ if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
+ queue_delayed_work(ipoib_workqueue,
+ &priv->mcast_task,
+ mcast->backoff * HZ);
+ mutex_unlock(&mcast_mutex);
+ }
+}
+
+void ipoib_mcast_join_task(struct work_struct *work)
+{
+ struct ipoib_dev_priv *priv =
+ container_of(work, struct ipoib_dev_priv, mcast_task.work);
+ struct ifnet *dev = priv->dev;
+
+ ipoib_dbg_mcast(priv, "Running join task. flags 0x%lX\n", priv->flags);
+
+ if (!test_bit(IPOIB_MCAST_RUN, &priv->flags))
+ return;
+
+ if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid))
+ ipoib_warn(priv, "ib_query_gid() failed\n");
+ else
+ memcpy(IF_LLADDR(dev) + 4, priv->local_gid.raw, sizeof (union ib_gid));
+
+ {
+ struct ib_port_attr attr;
+
+ if (!ib_query_port(priv->ca, priv->port, &attr))
+ priv->local_lid = attr.lid;
+ else
+ ipoib_warn(priv, "ib_query_port failed\n");
+ }
+
+ if (!priv->broadcast) {
+ struct ipoib_mcast *broadcast;
+
+ if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
+ return;
+
+ broadcast = ipoib_mcast_alloc(priv, 1);
+ if (!broadcast) {
+ ipoib_warn(priv, "failed to allocate broadcast group\n");
+ mutex_lock(&mcast_mutex);
+ if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
+ queue_delayed_work(ipoib_workqueue,
+ &priv->mcast_task, HZ);
+ mutex_unlock(&mcast_mutex);
+ return;
+ }
+
+ spin_lock_irq(&priv->lock);
+ memcpy(broadcast->mcmember.mgid.raw, dev->if_broadcastaddr + 4,
+ sizeof (union ib_gid));
+ priv->broadcast = broadcast;
+
+ __ipoib_mcast_add(priv, priv->broadcast);
+ spin_unlock_irq(&priv->lock);
+ }
+
+ if (priv->broadcast &&
+ !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) {
+ if (priv->broadcast &&
+ !test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags))
+ ipoib_mcast_join(priv, priv->broadcast, 0);
+ return;
+ }
+
+ while (1) {
+ struct ipoib_mcast *mcast = NULL;
+
+ spin_lock_irq(&priv->lock);
+ list_for_each_entry(mcast, &priv->multicast_list, list) {
+ if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)
+ && !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)
+ && !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
+ /* Found the next unjoined group */
+ break;
+ }
+ }
+ spin_unlock_irq(&priv->lock);
+
+ if (&mcast->list == &priv->multicast_list) {
+ /* All done */
+ break;
+ }
+
+ ipoib_mcast_join(priv, mcast, 1);
+ return;
+ }
+
+ spin_lock_irq(&priv->lock);
+ if (priv->broadcast)
+ priv->mcast_mtu = IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu));
+ else
+ priv->mcast_mtu = priv->admin_mtu;
+ spin_unlock_irq(&priv->lock);
+
+ if (!ipoib_cm_admin_enabled(priv))
+ ipoib_change_mtu(priv, min(priv->mcast_mtu, priv->admin_mtu));
+
+ ipoib_dbg_mcast(priv, "successfully joined all multicast groups\n");
+
+ clear_bit(IPOIB_MCAST_RUN, &priv->flags);
+}
+
+int ipoib_mcast_start_thread(struct ipoib_dev_priv *priv)
+{
+ ipoib_dbg_mcast(priv, "starting multicast thread flags 0x%lX\n",
+ priv->flags);
+
+ mutex_lock(&mcast_mutex);
+ if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags))
+ queue_delayed_work(ipoib_workqueue, &priv->mcast_task, 0);
+ mutex_unlock(&mcast_mutex);
+
+ return 0;
+}
+
+int ipoib_mcast_stop_thread(struct ipoib_dev_priv *priv, int flush)
+{
+
+ ipoib_dbg_mcast(priv, "stopping multicast thread\n");
+
+ mutex_lock(&mcast_mutex);
+ clear_bit(IPOIB_MCAST_RUN, &priv->flags);
+ cancel_delayed_work(&priv->mcast_task);
+ mutex_unlock(&mcast_mutex);
+
+ if (flush)
+ flush_workqueue(ipoib_workqueue);
+
+ return 0;
+}
+
+static int ipoib_mcast_leave(struct ipoib_dev_priv *priv, struct ipoib_mcast *mcast)
+{
+ int ret = 0;
+
+ if (test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
+ ib_sa_free_multicast(mcast->mc);
+
+ if (test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
+ ipoib_dbg_mcast(priv, "leaving MGID %16D\n",
+ mcast->mcmember.mgid.raw, ":");
+
+ /* Remove ourselves from the multicast group */
+ ret = ib_detach_mcast(priv->qp, &mcast->mcmember.mgid,
+ be16_to_cpu(mcast->mcmember.mlid));
+ if (ret)
+ ipoib_warn(priv, "ib_detach_mcast failed (result = %d)\n", ret);
+ }
+
+ return 0;
+}
+
+void
+ipoib_mcast_send(struct ipoib_dev_priv *priv, void *mgid, struct mbuf *mb)
+{
+ struct ifnet *dev = priv->dev;
+ struct ipoib_mcast *mcast;
+
+ if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags) ||
+ !priv->broadcast ||
+ !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) {
+ ++dev->if_oerrors;
+ m_freem(mb);
+ return;
+ }
+
+ mcast = __ipoib_mcast_find(priv, mgid);
+ if (!mcast) {
+ /* Let's create a new send only group now */
+ ipoib_dbg_mcast(priv, "setting up send only multicast group for %16D\n",
+ mgid, ":");
+
+ mcast = ipoib_mcast_alloc(priv, 0);
+ if (!mcast) {
+ ipoib_warn(priv, "unable to allocate memory for "
+ "multicast structure\n");
+ ++dev->if_oerrors;
+ m_freem(mb);
+ goto out;
+ }
+
+ set_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags);
+ memcpy(mcast->mcmember.mgid.raw, mgid, sizeof (union ib_gid));
+ __ipoib_mcast_add(priv, mcast);
+ list_add_tail(&mcast->list, &priv->multicast_list);
+ }
+
+ if (!mcast->ah) {
+ if (mcast->pkt_queue.ifq_len < IPOIB_MAX_MCAST_QUEUE) {
+ _IF_ENQUEUE(&mcast->pkt_queue, mb);
+ } else {
+ ++dev->if_oerrors;
+ m_freem(mb);
+ }
+
+ if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
+ ipoib_dbg_mcast(priv, "no address vector, "
+ "but multicast join already started\n");
+ else if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
+ ipoib_mcast_sendonly_join(mcast);
+
+ /*
+ * If lookup completes between here and out:, don't
+ * want to send packet twice.
+ */
+ mcast = NULL;
+ }
+
+out:
+ if (mcast && mcast->ah)
+ ipoib_send(priv, mb, mcast->ah, IB_MULTICAST_QPN);
+}
+
+void ipoib_mcast_dev_flush(struct ipoib_dev_priv *priv)
+{
+ LIST_HEAD(remove_list);
+ struct ipoib_mcast *mcast, *tmcast;
+ unsigned long flags;
+
+ ipoib_dbg_mcast(priv, "flushing multicast list\n");
+
+ spin_lock_irqsave(&priv->lock, flags);
+
+ list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) {
+ list_del(&mcast->list);
+ rb_erase(&mcast->rb_node, &priv->multicast_tree);
+ list_add_tail(&mcast->list, &remove_list);
+ }
+
+ if (priv->broadcast) {
+ rb_erase(&priv->broadcast->rb_node, &priv->multicast_tree);
+ list_add_tail(&priv->broadcast->list, &remove_list);
+ priv->broadcast = NULL;
+ }
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+
+ list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
+ ipoib_mcast_leave(priv, mcast);
+ ipoib_mcast_free(mcast);
+ }
+}
+
+static int ipoib_mcast_addr_is_valid(const u8 *addr, unsigned int addrlen,
+ const u8 *broadcast)
+{
+ if (addrlen != INFINIBAND_ALEN)
+ return 0;
+ /* reserved QPN, prefix, scope */
+ if (memcmp(addr, broadcast, 6))
+ return 0;
+ /* signature lower, pkey */
+ if (memcmp(addr + 7, broadcast + 7, 3))
+ return 0;
+ return 1;
+}
+
+void ipoib_mcast_restart_task(struct work_struct *work)
+{
+ struct ipoib_dev_priv *priv =
+ container_of(work, struct ipoib_dev_priv, restart_task);
+ ipoib_mcast_restart(priv);
+}
+
+void ipoib_mcast_restart(struct ipoib_dev_priv *priv)
+{
+ struct ifnet *dev = priv->dev;
+ struct ifmultiaddr *ifma;;
+ struct ipoib_mcast *mcast, *tmcast;
+ LIST_HEAD(remove_list);
+ struct ib_sa_mcmember_rec rec;
+ int addrlen;
+
+ ipoib_dbg_mcast(priv, "restarting multicast task flags 0x%lX\n",
+ priv->flags);
+
+ ipoib_mcast_stop_thread(priv, 0);
+
+ if_maddr_rlock(dev);
+ spin_lock(&priv->lock);
+
+ /*
+ * Unfortunately, the networking core only gives us a list of all of
+ * the multicast hardware addresses. We need to figure out which ones
+ * are new and which ones have been removed
+ */
+
+ /* Clear out the found flag */
+ list_for_each_entry(mcast, &priv->multicast_list, list)
+ clear_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags);
+
+ /* Mark all of the entries that are found or don't exist */
+
+
+ TAILQ_FOREACH(ifma, &dev->if_multiaddrs, ifma_link) {
+ union ib_gid mgid;
+ uint8_t *addr;
+
+ if (ifma->ifma_addr->sa_family != AF_LINK)
+ continue;
+ addr = LLADDR((struct sockaddr_dl *)ifma->ifma_addr);
+ addrlen = ((struct sockaddr_dl *)ifma->ifma_addr)->sdl_alen;
+ if (!ipoib_mcast_addr_is_valid(addr, addrlen,
+ dev->if_broadcastaddr))
+ continue;
+
+ memcpy(mgid.raw, addr + 4, sizeof mgid);
+
+ mcast = __ipoib_mcast_find(priv, &mgid);
+ if (!mcast || test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
+ struct ipoib_mcast *nmcast;
+
+ /* ignore group which is directly joined by userspace */
+ if (test_bit(IPOIB_FLAG_UMCAST, &priv->flags) &&
+ !ib_sa_get_mcmember_rec(priv->ca, priv->port, &mgid, &rec)) {
+ ipoib_dbg_mcast(priv, "ignoring multicast entry for mgid %16D\n",
+ mgid.raw, ":");
+ continue;
+ }
+
+ /* Not found or send-only group, let's add a new entry */
+ ipoib_dbg_mcast(priv, "adding multicast entry for mgid %16D\n",
+ mgid.raw, ":");
+
+ nmcast = ipoib_mcast_alloc(priv, 0);
+ if (!nmcast) {
+ ipoib_warn(priv, "unable to allocate memory for multicast structure\n");
+ continue;
+ }
+
+ set_bit(IPOIB_MCAST_FLAG_FOUND, &nmcast->flags);
+
+ nmcast->mcmember.mgid = mgid;
+
+ if (mcast) {
+ /* Destroy the send only entry */
+ list_move_tail(&mcast->list, &remove_list);
+
+ rb_replace_node(&mcast->rb_node,
+ &nmcast->rb_node,
+ &priv->multicast_tree);
+ } else
+ __ipoib_mcast_add(priv, nmcast);
+
+ list_add_tail(&nmcast->list, &priv->multicast_list);
+ }
+
+ if (mcast)
+ set_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags);
+ }
+
+ /* Remove all of the entries don't exist anymore */
+ list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) {
+ if (!test_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags) &&
+ !test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
+ ipoib_dbg_mcast(priv, "deleting multicast group %16D\n",
+ mcast->mcmember.mgid.raw, ":");
+
+ rb_erase(&mcast->rb_node, &priv->multicast_tree);
+
+ /* Move to the remove list */
+ list_move_tail(&mcast->list, &remove_list);
+ }
+ }
+
+ spin_unlock(&priv->lock);
+ if_maddr_runlock(dev);
+
+ /* We have to cancel outside of the spinlock */
+ list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
+ ipoib_mcast_leave(mcast->priv, mcast);
+ ipoib_mcast_free(mcast);
+ }
+
+ if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
+ ipoib_mcast_start_thread(priv);
+}
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+
+struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct ipoib_dev_priv *priv)
+{
+ struct ipoib_mcast_iter *iter;
+
+ iter = kmalloc(sizeof *iter, GFP_KERNEL);
+ if (!iter)
+ return NULL;
+
+ iter->priv = priv;
+ memset(iter->mgid.raw, 0, 16);
+
+ if (ipoib_mcast_iter_next(iter)) {
+ kfree(iter);
+ return NULL;
+ }
+
+ return iter;
+}
+
+int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter)
+{
+ struct ipoib_dev_priv *priv = iter->priv;
+ struct rb_node *n;
+ struct ipoib_mcast *mcast;
+ int ret = 1;
+
+ spin_lock_irq(&priv->lock);
+
+ n = rb_first(&priv->multicast_tree);
+
+ while (n) {
+ mcast = rb_entry(n, struct ipoib_mcast, rb_node);
+
+ if (memcmp(iter->mgid.raw, mcast->mcmember.mgid.raw,
+ sizeof (union ib_gid)) < 0) {
+ iter->mgid = mcast->mcmember.mgid;
+ iter->created = mcast->created;
+ iter->queuelen = mcast->pkt_queue.ifq_len;
+ iter->complete = !!mcast->ah;
+ iter->send_only = !!(mcast->flags & (1 << IPOIB_MCAST_FLAG_SENDONLY));
+
+ ret = 0;
+
+ break;
+ }
+
+ n = rb_next(n);
+ }
+
+ spin_unlock_irq(&priv->lock);
+
+ return ret;
+}
+
+void ipoib_mcast_iter_read(struct ipoib_mcast_iter *iter,
+ union ib_gid *mgid,
+ unsigned long *created,
+ unsigned int *queuelen,
+ unsigned int *complete,
+ unsigned int *send_only)
+{
+ *mgid = iter->mgid;
+ *created = iter->created;
+ *queuelen = iter->queuelen;
+ *complete = iter->complete;
+ *send_only = iter->send_only;
+}
+
+#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
new file mode 100644
index 0000000..fb9a27a
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -0,0 +1,294 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ipoib.h"
+#include <linux/ethtool.h>
+
+int ipoib_mcast_attach(struct ipoib_dev_priv *priv, u16 mlid, union ib_gid *mgid, int set_qkey)
+{
+ struct ib_qp_attr *qp_attr = NULL;
+ int ret;
+ u16 pkey_index;
+
+ if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) {
+ clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+ ret = -ENXIO;
+ goto out;
+ }
+ set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+
+ if (set_qkey) {
+ ret = -ENOMEM;
+ qp_attr = kmalloc(sizeof *qp_attr, GFP_KERNEL);
+ if (!qp_attr)
+ goto out;
+
+ /* set correct QKey for QP */
+ qp_attr->qkey = priv->qkey;
+ ret = ib_modify_qp(priv->qp, qp_attr, IB_QP_QKEY);
+ if (ret) {
+ ipoib_warn(priv, "failed to modify QP, ret = %d\n", ret);
+ goto out;
+ }
+ }
+
+ /* attach QP to multicast group */
+ ret = ib_attach_mcast(priv->qp, mgid, mlid);
+ if (ret)
+ ipoib_warn(priv, "failed to attach to multicast group, ret = %d\n", ret);
+
+out:
+ kfree(qp_attr);
+ return ret;
+}
+
+int ipoib_init_qp(struct ipoib_dev_priv *priv)
+{
+ int ret;
+ struct ib_qp_attr qp_attr;
+ int attr_mask;
+
+ if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
+ return -1;
+
+ qp_attr.qp_state = IB_QPS_INIT;
+ qp_attr.qkey = 0;
+ qp_attr.port_num = priv->port;
+ qp_attr.pkey_index = priv->pkey_index;
+ attr_mask =
+ IB_QP_QKEY |
+ IB_QP_PORT |
+ IB_QP_PKEY_INDEX |
+ IB_QP_STATE;
+ ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to modify QP to init, ret = %d\n", ret);
+ goto out_fail;
+ }
+
+ qp_attr.qp_state = IB_QPS_RTR;
+ /* Can't set this in a INIT->RTR transition */
+ attr_mask &= ~IB_QP_PORT;
+ ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to modify QP to RTR, ret = %d\n", ret);
+ goto out_fail;
+ }
+
+ qp_attr.qp_state = IB_QPS_RTS;
+ qp_attr.sq_psn = 0;
+ attr_mask |= IB_QP_SQ_PSN;
+ attr_mask &= ~IB_QP_PKEY_INDEX;
+ ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to modify QP to RTS, ret = %d\n", ret);
+ goto out_fail;
+ }
+
+ return 0;
+
+out_fail:
+ qp_attr.qp_state = IB_QPS_RESET;
+ if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
+ ipoib_warn(priv, "Failed to modify QP to RESET state\n");
+
+ return ret;
+}
+
+int ipoib_transport_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca)
+{
+ struct ib_qp_init_attr init_attr = {
+ .cap = {
+ .max_send_wr = ipoib_sendq_size,
+ .max_recv_wr = ipoib_recvq_size,
+ .max_send_sge = 1,
+ .max_recv_sge = IPOIB_UD_RX_SG
+ },
+ .sq_sig_type = IB_SIGNAL_ALL_WR,
+ .qp_type = IB_QPT_UD
+ };
+
+ int ret, size;
+ int i;
+ /* XXX struct ethtool_coalesce *coal; */
+
+ priv->pd = ib_alloc_pd(priv->ca);
+ if (IS_ERR(priv->pd)) {
+ printk(KERN_WARNING "%s: failed to allocate PD\n", ca->name);
+ return -ENODEV;
+ }
+
+ priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(priv->mr)) {
+ printk(KERN_WARNING "%s: ib_get_dma_mr failed\n", ca->name);
+ goto out_free_pd;
+ }
+
+ size = ipoib_recvq_size + 1;
+ ret = ipoib_cm_dev_init(priv);
+ if (!ret) {
+ size += ipoib_sendq_size;
+ if (ipoib_cm_has_srq(priv))
+ size += ipoib_recvq_size + 1; /* 1 extra for rx_drain_qp */
+ else
+ size += ipoib_recvq_size * ipoib_max_conn_qp;
+ }
+
+ priv->recv_cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, priv, size, 0);
+ if (IS_ERR(priv->recv_cq)) {
+ printk(KERN_WARNING "%s: failed to create receive CQ\n", ca->name);
+ goto out_free_mr;
+ }
+
+ priv->send_cq = ib_create_cq(priv->ca, ipoib_send_comp_handler, NULL,
+ priv, ipoib_sendq_size, 0);
+ if (IS_ERR(priv->send_cq)) {
+ printk(KERN_WARNING "%s: failed to create send CQ\n", ca->name);
+ goto out_free_recv_cq;
+ }
+
+ if (ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP))
+ goto out_free_send_cq;
+
+#if 0
+ /* XXX */
+ coal = kzalloc(sizeof *coal, GFP_KERNEL);
+ if (coal) {
+ coal->rx_coalesce_usecs = 10;
+ coal->tx_coalesce_usecs = 10;
+ coal->rx_max_coalesced_frames = 16;
+ coal->tx_max_coalesced_frames = 16;
+ dev->ethtool_ops->set_coalesce(dev, coal);
+ kfree(coal);
+ }
+#endif
+
+ init_attr.send_cq = priv->send_cq;
+ init_attr.recv_cq = priv->recv_cq;
+
+ if (priv->hca_caps & IB_DEVICE_UD_TSO)
+ init_attr.create_flags |= IB_QP_CREATE_IPOIB_UD_LSO;
+
+ if (priv->hca_caps & IB_DEVICE_BLOCK_MULTICAST_LOOPBACK)
+ init_attr.create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK;
+
+ init_attr.cap.max_send_sge = IPOIB_UD_TX_SG;
+
+ priv->qp = ib_create_qp(priv->pd, &init_attr);
+ if (IS_ERR(priv->qp)) {
+ printk(KERN_WARNING "%s: failed to create QP\n", ca->name);
+ goto out_free_send_cq;
+ }
+
+ IF_LLADDR(priv->dev)[1] = (priv->qp->qp_num >> 16) & 0xff;
+ IF_LLADDR(priv->dev)[2] = (priv->qp->qp_num >> 8) & 0xff;
+ IF_LLADDR(priv->dev)[3] = (priv->qp->qp_num ) & 0xff;
+
+ for (i = 0; i < IPOIB_MAX_TX_SG; ++i)
+ priv->tx_sge[i].lkey = priv->mr->lkey;
+
+ priv->tx_wr.opcode = IB_WR_SEND;
+ priv->tx_wr.sg_list = priv->tx_sge;
+ priv->tx_wr.send_flags = IB_SEND_SIGNALED;
+
+ for (i = 0; i < IPOIB_UD_RX_SG; ++i)
+ priv->rx_sge[i].lkey = priv->mr->lkey;
+ priv->rx_wr.next = NULL;
+ priv->rx_wr.sg_list = priv->rx_sge;
+
+ return 0;
+
+out_free_send_cq:
+ ib_destroy_cq(priv->send_cq);
+
+out_free_recv_cq:
+ ib_destroy_cq(priv->recv_cq);
+
+out_free_mr:
+ ib_dereg_mr(priv->mr);
+ ipoib_cm_dev_cleanup(priv);
+
+out_free_pd:
+ ib_dealloc_pd(priv->pd);
+ return -ENODEV;
+}
+
+void ipoib_transport_dev_cleanup(struct ipoib_dev_priv *priv)
+{
+
+ if (priv->qp) {
+ if (ib_destroy_qp(priv->qp))
+ ipoib_warn(priv, "ib_qp_destroy failed\n");
+
+ priv->qp = NULL;
+ clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+ }
+
+ if (ib_destroy_cq(priv->send_cq))
+ ipoib_warn(priv, "ib_cq_destroy (send) failed\n");
+
+ if (ib_destroy_cq(priv->recv_cq))
+ ipoib_warn(priv, "ib_cq_destroy (recv) failed\n");
+
+ ipoib_cm_dev_cleanup(priv);
+
+ if (ib_dereg_mr(priv->mr))
+ ipoib_warn(priv, "ib_dereg_mr failed\n");
+
+ if (ib_dealloc_pd(priv->pd))
+ ipoib_warn(priv, "ib_dealloc_pd failed\n");
+}
+
+void ipoib_event(struct ib_event_handler *handler,
+ struct ib_event *record)
+{
+ struct ipoib_dev_priv *priv =
+ container_of(handler, struct ipoib_dev_priv, event_handler);
+
+ if (record->element.port_num != priv->port)
+ return;
+
+ ipoib_dbg(priv, "Event %d on device %s port %d\n", record->event,
+ record->device->name, record->element.port_num);
+
+ if (record->event == IB_EVENT_SM_CHANGE ||
+ record->event == IB_EVENT_CLIENT_REREGISTER) {
+ queue_work(ipoib_workqueue, &priv->flush_light);
+ } else if (record->event == IB_EVENT_PORT_ERR ||
+ record->event == IB_EVENT_PORT_ACTIVE ||
+ record->event == IB_EVENT_LID_CHANGE) {
+ queue_work(ipoib_workqueue, &priv->flush_normal);
+ } else if (record->event == IB_EVENT_PKEY_CHANGE) {
+ queue_work(ipoib_workqueue, &priv->flush_heavy);
+ }
+}
diff --git a/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
new file mode 100644
index 0000000..18c761f
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+
+#include <asm/uaccess.h>
+
+#include "ipoib.h"
+
+static ssize_t show_parent(struct device *d, struct device_attribute *attr,
+ char *buf)
+{
+ struct ifnet *dev = to_net_dev(d);
+ struct ipoib_dev_priv *priv = dev->if_softc;
+
+ return sprintf(buf, "%s\n", priv->parent->name);
+}
+static DEVICE_ATTR(parent, S_IRUGO, show_parent, NULL);
+
+int ipoib_vlan_add(struct ifnet *pdev, unsigned short pkey)
+{
+ struct ipoib_dev_priv *ppriv, *priv;
+ char intf_name[IFNAMSIZ];
+ int result;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ ppriv = pdev->if_softc;
+
+ rtnl_lock();
+ mutex_lock(&ppriv->vlan_mutex);
+
+ /*
+ * First ensure this isn't a duplicate. We check the parent device and
+ * then all of the child interfaces to make sure the Pkey doesn't match.
+ */
+ if (ppriv->pkey == pkey) {
+ result = -ENOTUNIQ;
+ priv = NULL;
+ goto err;
+ }
+
+ list_for_each_entry(priv, &ppriv->child_intfs, list) {
+ if (priv->pkey == pkey) {
+ result = -ENOTUNIQ;
+ priv = NULL;
+ goto err;
+ }
+ }
+
+ snprintf(intf_name, sizeof intf_name, "%s.%04x",
+ ppriv->dev->name, pkey);
+ priv = ipoib_intf_alloc(intf_name);
+ if (!priv) {
+ result = -ENOMEM;
+ goto err;
+ }
+
+ priv->max_ib_mtu = ppriv->max_ib_mtu;
+ /* MTU will be reset when mcast join happens */
+ priv->dev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu);
+ priv->mcast_mtu = priv->admin_mtu = priv->dev->mtu;
+ set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags);
+
+ result = ipoib_set_dev_features(priv, ppriv->ca);
+ if (result)
+ goto err;
+
+ priv->pkey = pkey;
+
+ memcpy(IF_LLADDR(priv->dev), ppriv->dev->dev_addr, INFINIBAND_ALEN);
+ priv->broadcastaddr[8] = pkey >> 8;
+ priv->broadcastaddr[9] = pkey & 0xff;
+
+ result = ipoib_dev_init(priv->dev, ppriv->ca, ppriv->port);
+ if (result < 0) {
+ ipoib_warn(ppriv, "failed to initialize subinterface: "
+ "device %s, port %d",
+ ppriv->ca->name, ppriv->port);
+ goto err;
+ }
+
+ result = register_netdevice(priv->dev);
+ if (result) {
+ ipoib_warn(priv, "failed to initialize; error %i", result);
+ goto register_failed;
+ }
+
+ priv->parent = ppriv->dev;
+
+ ipoib_create_debug_files(priv->dev);
+
+ if (ipoib_cm_add_mode_attr(priv->dev))
+ goto sysfs_failed;
+ if (ipoib_add_pkey_attr(priv->dev))
+ goto sysfs_failed;
+ if (ipoib_add_umcast_attr(priv->dev))
+ goto sysfs_failed;
+
+ if (device_create_file(&priv->dev->dev, &dev_attr_parent))
+ goto sysfs_failed;
+
+ list_add_tail(&priv->list, &ppriv->child_intfs);
+
+ mutex_unlock(&ppriv->vlan_mutex);
+ rtnl_unlock();
+
+ return 0;
+
+sysfs_failed:
+ ipoib_delete_debug_files(priv->dev);
+ unregister_netdevice(priv->dev);
+
+register_failed:
+ ipoib_dev_cleanup(priv->dev);
+
+err:
+ mutex_unlock(&ppriv->vlan_mutex);
+ rtnl_unlock();
+ if (priv)
+ free_netdev(priv->dev);
+
+ return result;
+}
+
+int ipoib_vlan_delete(struct ifnet *pdev, unsigned short pkey)
+{
+ struct ipoib_dev_priv *ppriv, *priv, *tpriv;
+ struct ifnet *dev = NULL;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ ppriv = pdev->if_softc;
+
+ rtnl_lock();
+ mutex_lock(&ppriv->vlan_mutex);
+ list_for_each_entry_safe(priv, tpriv, &ppriv->child_intfs, list) {
+ if (priv->pkey == pkey) {
+ unregister_netdevice(priv->dev);
+ ipoib_dev_cleanup(priv->dev);
+ list_del(&priv->list);
+ dev = priv->dev;
+ break;
+ }
+ }
+ mutex_unlock(&ppriv->vlan_mutex);
+ rtnl_unlock();
+
+ if (dev) {
+ free_netdev(dev);
+ return 0;
+ }
+
+ return -ENODEV;
+}
diff --git a/sys/ofed/drivers/infiniband/ulp/sdp/Kconfig b/sys/ofed/drivers/infiniband/ulp/sdp/Kconfig
new file mode 100644
index 0000000..b5fadf4
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/sdp/Kconfig
@@ -0,0 +1,28 @@
+config INFINIBAND_SDP
+ tristate "Sockets Direct Protocol"
+ depends on INFINIBAND && INFINIBAND_IPOIB
+ ---help---
+ Support for Sockets Direct Protocol (SDP). This provides
+ sockets semantics over InfiniBand via address family
+ AF_INET_SDP (address family 27). You can also LD_PRELOAD the
+ libsdp library from <http://openib.org> to have standard
+ sockets applications use SDP.
+
+config INFINIBAND_SDP_DEBUG
+ bool "Sockets Direct Protocol debugging"
+ depends on INFINIBAND_SDP
+ ---help---
+ This option causes debugging code to be compiled into the
+ SDP driver. The output can be turned on via the debug_level
+ module parameter (which can also be set through sysfs after the
+ driver is loaded).
+
+config INFINIBAND_SDP_DEBUG_DATA
+ bool "Sockets Direct Protocol data path debugging"
+ depends on INFINIBAND_SDP_DEBUG
+ ---help---
+ This option compiles debugging code into the the data path
+ of the SDP driver. The output can be turned on via the
+ data_debug_level module parameter; however, even with output
+ turned off, this debugging code will have some performance
+ impact.
diff --git a/sys/ofed/drivers/infiniband/ulp/sdp/Makefile b/sys/ofed/drivers/infiniband/ulp/sdp/Makefile
new file mode 100644
index 0000000..5c250e9
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/sdp/Makefile
@@ -0,0 +1,6 @@
+EXTRA_CFLAGS += -Idrivers/infiniband/include
+EXTRA_CFLAGS += -ggdb
+
+obj-$(CONFIG_INFINIBAND_SDP) += ib_sdp.o
+
+ib_sdp-objs := sdp_main.o sdp_cma.o sdp_bcopy.o sdp_proc.o sdp_tx.o sdp_rx.o sdp_zcopy.o
diff --git a/sys/ofed/drivers/infiniband/ulp/sdp/sdp.h b/sys/ofed/drivers/infiniband/ulp/sdp/sdp.h
new file mode 100644
index 0000000..a1ccdff
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/sdp/sdp.h
@@ -0,0 +1,721 @@
+#ifndef _SDP_H_
+#define _SDP_H_
+
+#include "opt_ddb.h"
+#include "opt_inet.h"
+#include "opt_ofed.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+#include <sys/mbuf.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/protosw.h>
+#include <sys/proc.h>
+#include <sys/jail.h>
+#include <sys/domain.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+#include <net/if.h>
+#include <net/route.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/in_var.h>
+#include <netinet/in_pcb.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/workqueue.h>
+#include <linux/wait.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/pci.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include <rdma/ib_cm.h>
+#include <rdma/sdp_socket.h>
+#include <rdma/ib_fmr_pool.h>
+
+#ifdef SDP_DEBUG
+#define CONFIG_INFINIBAND_SDP_DEBUG
+#endif
+
+#include "sdp_dbg.h"
+
+#undef LIST_HEAD
+/* From sys/queue.h */
+#define LIST_HEAD(name, type) \
+struct name { \
+ struct type *lh_first; /* first element */ \
+}
+
+/* Interval between sucessive polls in the Tx routine when polling is used
+ instead of interrupts (in per-core Tx rings) - should be power of 2 */
+#define SDP_TX_POLL_MODER 16
+#define SDP_TX_POLL_TIMEOUT (HZ / 20)
+#define SDP_NAGLE_TIMEOUT (HZ / 10)
+
+#define SDP_SRCAVAIL_CANCEL_TIMEOUT (HZ * 5)
+#define SDP_SRCAVAIL_ADV_TIMEOUT (1 * HZ)
+#define SDP_SRCAVAIL_PAYLOAD_LEN 1
+
+#define SDP_RESOLVE_TIMEOUT 1000
+#define SDP_ROUTE_TIMEOUT 1000
+#define SDP_RETRY_COUNT 5
+#define SDP_KEEPALIVE_TIME (120 * 60 * HZ)
+#define SDP_FIN_WAIT_TIMEOUT (60 * HZ) /* like TCP_FIN_TIMEOUT */
+
+#define SDP_TX_SIZE 0x40
+#define SDP_RX_SIZE 0x40
+
+#define SDP_FMR_SIZE (MIN(0x1000, PAGE_SIZE) / sizeof(u64))
+#define SDP_FMR_POOL_SIZE 1024
+#define SDP_FMR_DIRTY_SIZE ( SDP_FMR_POOL_SIZE / 4 )
+
+#define SDP_MAX_RDMA_READ_LEN (PAGE_SIZE * (SDP_FMR_SIZE - 2))
+
+/* mb inlined data len - rest will be rx'ed into frags */
+#define SDP_HEAD_SIZE (sizeof(struct sdp_bsdh))
+
+/* limit tx payload len, if the sink supports bigger buffers than the source
+ * can handle.
+ * or rx fragment size (limited by sge->length size) */
+#define SDP_MAX_PACKET (1 << 16)
+#define SDP_MAX_PAYLOAD (SDP_MAX_PACKET - SDP_HEAD_SIZE)
+
+#define SDP_MAX_RECV_SGES (SDP_MAX_PACKET / MCLBYTES)
+#define SDP_MAX_SEND_SGES (SDP_MAX_PACKET / MCLBYTES) + 2
+
+#define SDP_NUM_WC 4
+
+#define SDP_DEF_ZCOPY_THRESH 64*1024
+#define SDP_MIN_ZCOPY_THRESH PAGE_SIZE
+#define SDP_MAX_ZCOPY_THRESH 1048576
+
+#define SDP_OP_RECV 0x800000000LL
+#define SDP_OP_SEND 0x400000000LL
+#define SDP_OP_RDMA 0x200000000LL
+#define SDP_OP_NOP 0x100000000LL
+
+/* how long (in jiffies) to block sender till tx completion*/
+#define SDP_BZCOPY_POLL_TIMEOUT (HZ / 10)
+
+#define SDP_AUTO_CONF 0xffff
+#define AUTO_MOD_DELAY (HZ / 4)
+
+struct sdp_mb_cb {
+ __u32 seq; /* Starting sequence number */
+ struct bzcopy_state *bz;
+ struct rx_srcavail_state *rx_sa;
+ struct tx_srcavail_state *tx_sa;
+};
+
+#define M_PUSH M_PROTO1 /* Do a 'push'. */
+#define M_URG M_PROTO2 /* Mark as urgent (oob). */
+
+#define SDP_SKB_CB(__mb) ((struct sdp_mb_cb *)&((__mb)->cb[0]))
+#define BZCOPY_STATE(mb) (SDP_SKB_CB(mb)->bz)
+#define RX_SRCAVAIL_STATE(mb) (SDP_SKB_CB(mb)->rx_sa)
+#define TX_SRCAVAIL_STATE(mb) (SDP_SKB_CB(mb)->tx_sa)
+
+#ifndef MIN
+#define MIN(a, b) (a < b ? a : b)
+#endif
+
+#define ring_head(ring) (atomic_read(&(ring).head))
+#define ring_tail(ring) (atomic_read(&(ring).tail))
+#define ring_posted(ring) (ring_head(ring) - ring_tail(ring))
+
+#define rx_ring_posted(ssk) ring_posted(ssk->rx_ring)
+#ifdef SDP_ZCOPY
+#define tx_ring_posted(ssk) (ring_posted(ssk->tx_ring) + \
+ (ssk->tx_ring.rdma_inflight ? ssk->tx_ring.rdma_inflight->busy : 0))
+#else
+#define tx_ring_posted(ssk) ring_posted(ssk->tx_ring)
+#endif
+
+extern int sdp_zcopy_thresh;
+extern int rcvbuf_initial_size;
+extern struct workqueue_struct *rx_comp_wq;
+extern struct ib_client sdp_client;
+
+enum sdp_mid {
+ SDP_MID_HELLO = 0x0,
+ SDP_MID_HELLO_ACK = 0x1,
+ SDP_MID_DISCONN = 0x2,
+ SDP_MID_ABORT = 0x3,
+ SDP_MID_SENDSM = 0x4,
+ SDP_MID_RDMARDCOMPL = 0x6,
+ SDP_MID_SRCAVAIL_CANCEL = 0x8,
+ SDP_MID_CHRCVBUF = 0xB,
+ SDP_MID_CHRCVBUF_ACK = 0xC,
+ SDP_MID_SINKAVAIL = 0xFD,
+ SDP_MID_SRCAVAIL = 0xFE,
+ SDP_MID_DATA = 0xFF,
+};
+
+enum sdp_flags {
+ SDP_OOB_PRES = 1 << 0,
+ SDP_OOB_PEND = 1 << 1,
+};
+
+enum {
+ SDP_MIN_TX_CREDITS = 2
+};
+
+enum {
+ SDP_ERR_ERROR = -4,
+ SDP_ERR_FAULT = -3,
+ SDP_NEW_SEG = -2,
+ SDP_DO_WAIT_MEM = -1
+};
+
+struct sdp_bsdh {
+ u8 mid;
+ u8 flags;
+ __u16 bufs;
+ __u32 len;
+ __u32 mseq;
+ __u32 mseq_ack;
+} __attribute__((__packed__));
+
+union cma_ip_addr {
+ struct in6_addr ip6;
+ struct {
+ __u32 pad[3];
+ __u32 addr;
+ } ip4;
+} __attribute__((__packed__));
+
+/* TODO: too much? Can I avoid having the src/dst and port here? */
+struct sdp_hh {
+ struct sdp_bsdh bsdh;
+ u8 majv_minv;
+ u8 ipv_cap;
+ u8 rsvd1;
+ u8 max_adverts;
+ __u32 desremrcvsz;
+ __u32 localrcvsz;
+ __u16 port;
+ __u16 rsvd2;
+ union cma_ip_addr src_addr;
+ union cma_ip_addr dst_addr;
+ u8 rsvd3[IB_CM_REQ_PRIVATE_DATA_SIZE - sizeof(struct sdp_bsdh) - 48];
+} __attribute__((__packed__));
+
+struct sdp_hah {
+ struct sdp_bsdh bsdh;
+ u8 majv_minv;
+ u8 ipv_cap;
+ u8 rsvd1;
+ u8 ext_max_adverts;
+ __u32 actrcvsz;
+ u8 rsvd2[IB_CM_REP_PRIVATE_DATA_SIZE - sizeof(struct sdp_bsdh) - 8];
+} __attribute__((__packed__));
+
+struct sdp_rrch {
+ __u32 len;
+} __attribute__((__packed__));
+
+struct sdp_srcah {
+ __u32 len;
+ __u32 rkey;
+ __u64 vaddr;
+} __attribute__((__packed__));
+
+struct sdp_buf {
+ struct mbuf *mb;
+ u64 mapping[SDP_MAX_SEND_SGES];
+} __attribute__((__packed__));
+
+struct sdp_chrecvbuf {
+ u32 size;
+} __attribute__((__packed__));
+
+/* Context used for synchronous zero copy bcopy (BZCOPY) */
+struct bzcopy_state {
+ unsigned char __user *u_base;
+ int u_len;
+ int left;
+ int page_cnt;
+ int cur_page;
+ int cur_offset;
+ int busy;
+ struct sdp_sock *ssk;
+ struct page **pages;
+};
+
+enum rx_sa_flag {
+ RX_SA_ABORTED = 2,
+};
+
+enum tx_sa_flag {
+ TX_SA_SENDSM = 0x01,
+ TX_SA_CROSS_SEND = 0x02,
+ TX_SA_INTRRUPTED = 0x04,
+ TX_SA_TIMEDOUT = 0x08,
+ TX_SA_ERROR = 0x10,
+};
+
+struct rx_srcavail_state {
+ /* Advertised buffer stuff */
+ u32 mseq;
+ u32 used;
+ u32 reported;
+ u32 len;
+ u32 rkey;
+ u64 vaddr;
+
+ /* Dest buff info */
+ struct ib_umem *umem;
+ struct ib_pool_fmr *fmr;
+
+ /* Utility */
+ u8 busy;
+ enum rx_sa_flag flags;
+};
+
+struct tx_srcavail_state {
+ /* Data below 'busy' will be reset */
+ u8 busy;
+
+ struct ib_umem *umem;
+ struct ib_pool_fmr *fmr;
+
+ u32 bytes_sent;
+ u32 bytes_acked;
+
+ enum tx_sa_flag abort_flags;
+ u8 posted;
+
+ u32 mseq;
+};
+
+struct sdp_tx_ring {
+#ifdef SDP_ZCOPY
+ struct rx_srcavail_state *rdma_inflight;
+#endif
+ struct sdp_buf *buffer;
+ atomic_t head;
+ atomic_t tail;
+ struct ib_cq *cq;
+
+ atomic_t credits;
+#define tx_credits(ssk) (atomic_read(&ssk->tx_ring.credits))
+
+ struct callout timer;
+ u16 poll_cnt;
+};
+
+struct sdp_rx_ring {
+ struct sdp_buf *buffer;
+ atomic_t head;
+ atomic_t tail;
+ struct ib_cq *cq;
+
+ int destroyed;
+ struct rwlock destroyed_lock;
+};
+
+struct sdp_device {
+ struct ib_pd *pd;
+ struct ib_mr *mr;
+ struct ib_fmr_pool *fmr_pool;
+};
+
+struct sdp_moderation {
+ unsigned long last_moder_packets;
+ unsigned long last_moder_tx_packets;
+ unsigned long last_moder_bytes;
+ unsigned long last_moder_jiffies;
+ int last_moder_time;
+ u16 rx_usecs;
+ u16 rx_frames;
+ u16 tx_usecs;
+ u32 pkt_rate_low;
+ u16 rx_usecs_low;
+ u32 pkt_rate_high;
+ u16 rx_usecs_high;
+ u16 sample_interval;
+ u16 adaptive_rx_coal;
+ u32 msg_enable;
+
+ int moder_cnt;
+ int moder_time;
+};
+
+/* These are flags fields. */
+#define SDP_TIMEWAIT 0x0001 /* In ssk timewait state. */
+#define SDP_DROPPED 0x0002 /* Socket has been dropped. */
+#define SDP_SOCKREF 0x0004 /* Holding a sockref for close. */
+#define SDP_NODELAY 0x0008 /* Disble nagle. */
+#define SDP_NEEDFIN 0x0010 /* Send a fin on the next tx. */
+#define SDP_DREQWAIT 0x0020 /* Waiting on DREQ. */
+#define SDP_DESTROY 0x0040 /* Being destroyed. */
+#define SDP_DISCON 0x0080 /* rdma_disconnect is owed. */
+
+/* These are oobflags */
+#define SDP_HADOOB 0x0001 /* Had OOB data. */
+#define SDP_HAVEOOB 0x0002 /* Have OOB data. */
+
+struct sdp_sock {
+ LIST_ENTRY(sdp_sock) list;
+ struct socket *socket;
+ struct rdma_cm_id *id;
+ struct ib_device *ib_device;
+ struct sdp_device *sdp_dev;
+ struct ib_qp *qp;
+ struct ucred *cred;
+ struct callout keep2msl; /* 2msl and keepalive timer. */
+ struct callout nagle_timer; /* timeout waiting for ack */
+ struct ib_ucontext context;
+ in_port_t lport;
+ in_addr_t laddr;
+ in_port_t fport;
+ in_addr_t faddr;
+ int flags;
+ int oobflags; /* protected by rx lock. */
+ int state;
+ int softerror;
+ int recv_bytes; /* Bytes per recv. buf including header */
+ int xmit_size_goal;
+ char iobc;
+
+ struct sdp_rx_ring rx_ring;
+ struct sdp_tx_ring tx_ring;
+ struct rwlock lock;
+ struct mbuf *rx_ctl_q;
+ struct mbuf *rx_ctl_tail;
+
+ int qp_active; /* XXX Flag. */
+ int max_sge;
+ struct work_struct rx_comp_work;
+#define rcv_nxt(ssk) atomic_read(&(ssk->rcv_nxt))
+ atomic_t rcv_nxt;
+
+ /* SDP specific */
+ atomic_t mseq_ack;
+#define mseq_ack(ssk) (atomic_read(&ssk->mseq_ack))
+ unsigned max_bufs; /* Initial buffers offered by other side */
+ unsigned min_bufs; /* Low water mark to wake senders */
+
+ unsigned long nagle_last_unacked; /* mseq of lastest unacked packet */
+
+ atomic_t remote_credits;
+#define remote_credits(ssk) (atomic_read(&ssk->remote_credits))
+ int poll_cq;
+
+ /* SDP slow start */
+ int recv_request_head; /* mark the rx_head when the resize request
+ was recieved */
+ int recv_request; /* XXX flag if request to resize was recieved */
+
+ unsigned long tx_packets;
+ unsigned long rx_packets;
+ unsigned long tx_bytes;
+ unsigned long rx_bytes;
+ struct sdp_moderation auto_mod;
+ struct task shutdown_task;
+#ifdef SDP_ZCOPY
+ struct tx_srcavail_state *tx_sa;
+ struct rx_srcavail_state *rx_sa;
+ spinlock_t tx_sa_lock;
+ struct delayed_work srcavail_cancel_work;
+ int srcavail_cancel_mseq;
+ /* ZCOPY data: -1:use global; 0:disable zcopy; >0: zcopy threshold */
+ int zcopy_thresh;
+#endif
+};
+
+#define sdp_sk(so) ((struct sdp_sock *)(so->so_pcb))
+
+#define SDP_RLOCK(ssk) rw_rlock(&(ssk)->lock)
+#define SDP_WLOCK(ssk) rw_wlock(&(ssk)->lock)
+#define SDP_RUNLOCK(ssk) rw_runlock(&(ssk)->lock)
+#define SDP_WUNLOCK(ssk) rw_wunlock(&(ssk)->lock)
+#define SDP_WLOCK_ASSERT(ssk) rw_assert(&(ssk)->lock, RA_WLOCKED)
+#define SDP_RLOCK_ASSERT(ssk) rw_assert(&(ssk)->lock, RA_RLOCKED)
+#define SDP_LOCK_ASSERT(ssk) rw_assert(&(ssk)->lock, RA_LOCKED)
+
+static inline void tx_sa_reset(struct tx_srcavail_state *tx_sa)
+{
+ memset((void *)&tx_sa->busy, 0,
+ sizeof(*tx_sa) - offsetof(typeof(*tx_sa), busy));
+}
+
+static inline void rx_ring_unlock(struct sdp_rx_ring *rx_ring)
+{
+ rw_runlock(&rx_ring->destroyed_lock);
+}
+
+static inline int rx_ring_trylock(struct sdp_rx_ring *rx_ring)
+{
+ rw_rlock(&rx_ring->destroyed_lock);
+ if (rx_ring->destroyed) {
+ rx_ring_unlock(rx_ring);
+ return 0;
+ }
+ return 1;
+}
+
+static inline void rx_ring_destroy_lock(struct sdp_rx_ring *rx_ring)
+{
+ rw_wlock(&rx_ring->destroyed_lock);
+ rx_ring->destroyed = 1;
+ rw_wunlock(&rx_ring->destroyed_lock);
+}
+
+static inline void sdp_arm_rx_cq(struct sdp_sock *ssk)
+{
+ sdp_prf(ssk->socket, NULL, "Arming RX cq");
+ sdp_dbg_data(ssk->socket, "Arming RX cq\n");
+
+ ib_req_notify_cq(ssk->rx_ring.cq, IB_CQ_NEXT_COMP);
+}
+
+static inline void sdp_arm_tx_cq(struct sdp_sock *ssk)
+{
+ sdp_prf(ssk->socket, NULL, "Arming TX cq");
+ sdp_dbg_data(ssk->socket, "Arming TX cq. credits: %d, posted: %d\n",
+ tx_credits(ssk), tx_ring_posted(ssk));
+
+ ib_req_notify_cq(ssk->tx_ring.cq, IB_CQ_NEXT_COMP);
+}
+
+/* return the min of:
+ * - tx credits
+ * - free slots in tx_ring (not including SDP_MIN_TX_CREDITS
+ */
+static inline int tx_slots_free(struct sdp_sock *ssk)
+{
+ int min_free;
+
+ min_free = MIN(tx_credits(ssk),
+ SDP_TX_SIZE - tx_ring_posted(ssk));
+ if (min_free < SDP_MIN_TX_CREDITS)
+ return 0;
+
+ return min_free - SDP_MIN_TX_CREDITS;
+};
+
+/* utilities */
+static inline char *mid2str(int mid)
+{
+#define ENUM2STR(e) [e] = #e
+ static char *mid2str[] = {
+ ENUM2STR(SDP_MID_HELLO),
+ ENUM2STR(SDP_MID_HELLO_ACK),
+ ENUM2STR(SDP_MID_ABORT),
+ ENUM2STR(SDP_MID_DISCONN),
+ ENUM2STR(SDP_MID_SENDSM),
+ ENUM2STR(SDP_MID_RDMARDCOMPL),
+ ENUM2STR(SDP_MID_SRCAVAIL_CANCEL),
+ ENUM2STR(SDP_MID_CHRCVBUF),
+ ENUM2STR(SDP_MID_CHRCVBUF_ACK),
+ ENUM2STR(SDP_MID_DATA),
+ ENUM2STR(SDP_MID_SRCAVAIL),
+ ENUM2STR(SDP_MID_SINKAVAIL),
+ };
+
+ if (mid >= ARRAY_SIZE(mid2str))
+ return NULL;
+
+ return mid2str[mid];
+}
+
+static inline struct mbuf *
+sdp_alloc_mb(struct socket *sk, u8 mid, int size, int wait)
+{
+ struct sdp_bsdh *h;
+ struct mbuf *mb;
+
+ MGETHDR(mb, wait, MT_DATA);
+ if (mb == NULL)
+ return (NULL);
+ mb->m_pkthdr.len = mb->m_len = sizeof(struct sdp_bsdh);
+ h = mtod(mb, struct sdp_bsdh *);
+ h->mid = mid;
+
+ return mb;
+}
+static inline struct mbuf *
+sdp_alloc_mb_data(struct socket *sk, int wait)
+{
+ return sdp_alloc_mb(sk, SDP_MID_DATA, 0, wait);
+}
+
+static inline struct mbuf *
+sdp_alloc_mb_disconnect(struct socket *sk, int wait)
+{
+ return sdp_alloc_mb(sk, SDP_MID_DISCONN, 0, wait);
+}
+
+static inline void *
+mb_put(struct mbuf *mb, int len)
+{
+ uint8_t *data;
+
+ data = mb->m_data;
+ data += mb->m_len;
+ mb->m_len += len;
+ return (void *)data;
+}
+
+static inline struct mbuf *
+sdp_alloc_mb_chrcvbuf_ack(struct socket *sk, int size, int wait)
+{
+ struct mbuf *mb;
+ struct sdp_chrecvbuf *resp_size;
+
+ mb = sdp_alloc_mb(sk, SDP_MID_CHRCVBUF_ACK, sizeof(*resp_size), wait);
+ if (mb == NULL)
+ return (NULL);
+ resp_size = (struct sdp_chrecvbuf *)mb_put(mb, sizeof *resp_size);
+ resp_size->size = htonl(size);
+
+ return mb;
+}
+
+static inline struct mbuf *
+sdp_alloc_mb_srcavail(struct socket *sk, u32 len, u32 rkey, u64 vaddr, int wait)
+{
+ struct mbuf *mb;
+ struct sdp_srcah *srcah;
+
+ mb = sdp_alloc_mb(sk, SDP_MID_SRCAVAIL, sizeof(*srcah), wait);
+ if (mb == NULL)
+ return (NULL);
+ srcah = (struct sdp_srcah *)mb_put(mb, sizeof(*srcah));
+ srcah->len = htonl(len);
+ srcah->rkey = htonl(rkey);
+ srcah->vaddr = cpu_to_be64(vaddr);
+
+ return mb;
+}
+
+static inline struct mbuf *
+sdp_alloc_mb_srcavail_cancel(struct socket *sk, int wait)
+{
+ return sdp_alloc_mb(sk, SDP_MID_SRCAVAIL_CANCEL, 0, wait);
+}
+
+static inline struct mbuf *
+sdp_alloc_mb_rdmardcompl(struct socket *sk, u32 len, int wait)
+{
+ struct mbuf *mb;
+ struct sdp_rrch *rrch;
+
+ mb = sdp_alloc_mb(sk, SDP_MID_RDMARDCOMPL, sizeof(*rrch), wait);
+ if (mb == NULL)
+ return (NULL);
+ rrch = (struct sdp_rrch *)mb_put(mb, sizeof(*rrch));
+ rrch->len = htonl(len);
+
+ return mb;
+}
+
+static inline struct mbuf *
+sdp_alloc_mb_sendsm(struct socket *sk, int wait)
+{
+ return sdp_alloc_mb(sk, SDP_MID_SENDSM, 0, wait);
+}
+static inline int sdp_tx_ring_slots_left(struct sdp_sock *ssk)
+{
+ return SDP_TX_SIZE - tx_ring_posted(ssk);
+}
+
+static inline int credit_update_needed(struct sdp_sock *ssk)
+{
+ int c;
+
+ c = remote_credits(ssk);
+ if (likely(c > SDP_MIN_TX_CREDITS))
+ c += c/2;
+ return unlikely(c < rx_ring_posted(ssk)) &&
+ likely(tx_credits(ssk) > 0) &&
+ likely(sdp_tx_ring_slots_left(ssk));
+}
+
+
+#define SDPSTATS_COUNTER_INC(stat)
+#define SDPSTATS_COUNTER_ADD(stat, val)
+#define SDPSTATS_COUNTER_MID_INC(stat, mid)
+#define SDPSTATS_HIST_LINEAR(stat, size)
+#define SDPSTATS_HIST(stat, size)
+
+static inline void
+sdp_cleanup_sdp_buf(struct sdp_sock *ssk, struct sdp_buf *sbuf,
+ enum dma_data_direction dir)
+{
+ struct ib_device *dev;
+ struct mbuf *mb;
+ int i;
+
+ dev = ssk->ib_device;
+ for (i = 0, mb = sbuf->mb; mb != NULL; mb = mb->m_next, i++)
+ ib_dma_unmap_single(dev, sbuf->mapping[i], mb->m_len, dir);
+}
+
+/* sdp_main.c */
+void sdp_set_default_moderation(struct sdp_sock *ssk);
+void sdp_start_keepalive_timer(struct socket *sk);
+void sdp_urg(struct sdp_sock *ssk, struct mbuf *mb);
+void sdp_cancel_dreq_wait_timeout(struct sdp_sock *ssk);
+void sdp_abort(struct socket *sk);
+struct sdp_sock *sdp_notify(struct sdp_sock *ssk, int error);
+
+
+/* sdp_cma.c */
+int sdp_cma_handler(struct rdma_cm_id *, struct rdma_cm_event *);
+
+/* sdp_tx.c */
+int sdp_tx_ring_create(struct sdp_sock *ssk, struct ib_device *device);
+void sdp_tx_ring_destroy(struct sdp_sock *ssk);
+int sdp_xmit_poll(struct sdp_sock *ssk, int force);
+void sdp_post_send(struct sdp_sock *ssk, struct mbuf *mb);
+void sdp_post_sends(struct sdp_sock *ssk, int wait);
+void sdp_post_keepalive(struct sdp_sock *ssk);
+
+/* sdp_rx.c */
+void sdp_rx_ring_init(struct sdp_sock *ssk);
+int sdp_rx_ring_create(struct sdp_sock *ssk, struct ib_device *device);
+void sdp_rx_ring_destroy(struct sdp_sock *ssk);
+int sdp_resize_buffers(struct sdp_sock *ssk, u32 new_size);
+int sdp_init_buffers(struct sdp_sock *ssk, u32 new_size);
+void sdp_do_posts(struct sdp_sock *ssk);
+void sdp_rx_comp_full(struct sdp_sock *ssk);
+
+/* sdp_zcopy.c */
+int sdp_sendmsg_zcopy(struct kiocb *iocb, struct socket *sk, struct iovec *iov);
+int sdp_handle_srcavail(struct sdp_sock *ssk, struct sdp_srcah *srcah);
+void sdp_handle_sendsm(struct sdp_sock *ssk, u32 mseq_ack);
+void sdp_handle_rdma_read_compl(struct sdp_sock *ssk, u32 mseq_ack,
+ u32 bytes_completed);
+int sdp_handle_rdma_read_cqe(struct sdp_sock *ssk);
+int sdp_rdma_to_iovec(struct socket *sk, struct iovec *iov, struct mbuf *mb,
+ unsigned long *used);
+int sdp_post_rdma_rd_compl(struct sdp_sock *ssk,
+ struct rx_srcavail_state *rx_sa);
+int sdp_post_sendsm(struct socket *sk);
+void srcavail_cancel_timeout(struct work_struct *work);
+void sdp_abort_srcavail(struct socket *sk);
+void sdp_abort_rdma_read(struct socket *sk);
+int sdp_process_rx(struct sdp_sock *ssk);
+
+#endif
diff --git a/sys/ofed/drivers/infiniband/ulp/sdp/sdp_bcopy.c b/sys/ofed/drivers/infiniband/ulp/sdp/sdp_bcopy.c
new file mode 100644
index 0000000..d068852
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/sdp/sdp_bcopy.c
@@ -0,0 +1,258 @@
+/*
+ * Copyright (c) 2006 Mellanox Technologies Ltd. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id$
+ */
+#include "sdp.h"
+
+static void sdp_nagle_timeout(void *data);
+
+#ifdef CONFIG_INFINIBAND_SDP_DEBUG_DATA
+void _dump_packet(const char *func, int line, struct socket *sk, char *str,
+ struct mbuf *mb, const struct sdp_bsdh *h)
+{
+ struct sdp_hh *hh;
+ struct sdp_hah *hah;
+ struct sdp_chrecvbuf *req_size;
+ struct sdp_rrch *rrch;
+ struct sdp_srcah *srcah;
+ int len = 0;
+ char buf[256];
+ len += snprintf(buf, 255-len, "%s mb: %p mid: %2x:%-20s flags: 0x%x "
+ "bufs: 0x%x len: 0x%x mseq: 0x%x mseq_ack: 0x%x | ",
+ str, mb, h->mid, mid2str(h->mid), h->flags,
+ ntohs(h->bufs), ntohl(h->len), ntohl(h->mseq),
+ ntohl(h->mseq_ack));
+
+ switch (h->mid) {
+ case SDP_MID_HELLO:
+ hh = (struct sdp_hh *)h;
+ len += snprintf(buf + len, 255-len,
+ "max_adverts: %d majv_minv: 0x%x "
+ "localrcvsz: 0x%x desremrcvsz: 0x%x |",
+ hh->max_adverts, hh->majv_minv,
+ ntohl(hh->localrcvsz),
+ ntohl(hh->desremrcvsz));
+ break;
+ case SDP_MID_HELLO_ACK:
+ hah = (struct sdp_hah *)h;
+ len += snprintf(buf + len, 255-len, "actrcvz: 0x%x |",
+ ntohl(hah->actrcvsz));
+ break;
+ case SDP_MID_CHRCVBUF:
+ case SDP_MID_CHRCVBUF_ACK:
+ req_size = (struct sdp_chrecvbuf *)(h+1);
+ len += snprintf(buf + len, 255-len, "req_size: 0x%x |",
+ ntohl(req_size->size));
+ break;
+ case SDP_MID_DATA:
+ len += snprintf(buf + len, 255-len, "data_len: 0x%lx |",
+ ntohl(h->len) - sizeof(struct sdp_bsdh));
+ break;
+ case SDP_MID_RDMARDCOMPL:
+ rrch = (struct sdp_rrch *)(h+1);
+
+ len += snprintf(buf + len, 255-len, " | len: 0x%x |",
+ ntohl(rrch->len));
+ break;
+ case SDP_MID_SRCAVAIL:
+ srcah = (struct sdp_srcah *)(h+1);
+
+ len += snprintf(buf + len, 255-len, " | payload: 0x%lx, "
+ "len: 0x%x, rkey: 0x%x, vaddr: 0x%jx |",
+ ntohl(h->len) - sizeof(struct sdp_bsdh) -
+ sizeof(struct sdp_srcah),
+ ntohl(srcah->len), ntohl(srcah->rkey),
+ be64_to_cpu(srcah->vaddr));
+ break;
+ default:
+ break;
+ }
+ buf[len] = 0;
+ _sdp_printk(func, line, KERN_WARNING, sk, "%s: %s\n", str, buf);
+}
+#endif
+
+static inline int
+sdp_nagle_off(struct sdp_sock *ssk, struct mbuf *mb)
+{
+
+ struct sdp_bsdh *h;
+
+ h = mtod(mb, struct sdp_bsdh *);
+ int send_now =
+#ifdef SDP_ZCOPY
+ BZCOPY_STATE(mb) ||
+#endif
+ unlikely(h->mid != SDP_MID_DATA) ||
+ (ssk->flags & SDP_NODELAY) ||
+ !ssk->nagle_last_unacked ||
+ mb->m_pkthdr.len >= ssk->xmit_size_goal / 4 ||
+ (mb->m_flags & M_PUSH);
+
+ if (send_now) {
+ unsigned long mseq = ring_head(ssk->tx_ring);
+ ssk->nagle_last_unacked = mseq;
+ } else {
+ if (!callout_pending(&ssk->nagle_timer)) {
+ callout_reset(&ssk->nagle_timer, SDP_NAGLE_TIMEOUT,
+ sdp_nagle_timeout, ssk);
+ sdp_dbg_data(ssk->socket, "Starting nagle timer\n");
+ }
+ }
+ sdp_dbg_data(ssk->socket, "send_now = %d last_unacked = %ld\n",
+ send_now, ssk->nagle_last_unacked);
+
+ return send_now;
+}
+
+static void
+sdp_nagle_timeout(void *data)
+{
+ struct sdp_sock *ssk = (struct sdp_sock *)data;
+ struct socket *sk = ssk->socket;
+
+ sdp_dbg_data(sk, "last_unacked = %ld\n", ssk->nagle_last_unacked);
+
+ if (!callout_active(&ssk->nagle_timer))
+ return;
+ callout_deactivate(&ssk->nagle_timer);
+
+ if (!ssk->nagle_last_unacked)
+ goto out;
+ if (ssk->state == TCPS_CLOSED)
+ return;
+ ssk->nagle_last_unacked = 0;
+ sdp_post_sends(ssk, M_DONTWAIT);
+
+ sowwakeup(ssk->socket);
+out:
+ if (sk->so_snd.sb_sndptr)
+ callout_reset(&ssk->nagle_timer, SDP_NAGLE_TIMEOUT,
+ sdp_nagle_timeout, ssk);
+}
+
+void
+sdp_post_sends(struct sdp_sock *ssk, int wait)
+{
+ struct mbuf *mb;
+ int post_count = 0;
+ struct socket *sk;
+ int low;
+
+ sk = ssk->socket;
+ if (unlikely(!ssk->id)) {
+ if (sk->so_snd.sb_sndptr) {
+ sdp_dbg(ssk->socket,
+ "Send on socket without cmid ECONNRESET.\n");
+ sdp_notify(ssk, ECONNRESET);
+ }
+ return;
+ }
+again:
+ if (sdp_tx_ring_slots_left(ssk) < SDP_TX_SIZE / 2)
+ sdp_xmit_poll(ssk, 1);
+
+ if (ssk->recv_request &&
+ ring_tail(ssk->rx_ring) >= ssk->recv_request_head &&
+ tx_credits(ssk) >= SDP_MIN_TX_CREDITS &&
+ sdp_tx_ring_slots_left(ssk)) {
+ mb = sdp_alloc_mb_chrcvbuf_ack(sk,
+ ssk->recv_bytes - SDP_HEAD_SIZE, wait);
+ if (mb == NULL)
+ goto allocfail;
+ ssk->recv_request = 0;
+ sdp_post_send(ssk, mb);
+ post_count++;
+ }
+
+ if (tx_credits(ssk) <= SDP_MIN_TX_CREDITS &&
+ sdp_tx_ring_slots_left(ssk) && sk->so_snd.sb_sndptr &&
+ sdp_nagle_off(ssk, sk->so_snd.sb_sndptr)) {
+ SDPSTATS_COUNTER_INC(send_miss_no_credits);
+ }
+
+ while (tx_credits(ssk) > SDP_MIN_TX_CREDITS &&
+ sdp_tx_ring_slots_left(ssk) && (mb = sk->so_snd.sb_sndptr) &&
+ sdp_nagle_off(ssk, mb)) {
+ struct mbuf *n;
+
+ SOCKBUF_LOCK(&sk->so_snd);
+ sk->so_snd.sb_sndptr = mb->m_nextpkt;
+ sk->so_snd.sb_mb = mb->m_nextpkt;
+ mb->m_nextpkt = NULL;
+ SB_EMPTY_FIXUP(&sk->so_snd);
+ for (n = mb; n != NULL; n = n->m_next)
+ sbfree(&sk->so_snd, n);
+ SOCKBUF_UNLOCK(&sk->so_snd);
+ sdp_post_send(ssk, mb);
+ post_count++;
+ }
+
+ if (credit_update_needed(ssk) && ssk->state >= TCPS_ESTABLISHED &&
+ ssk->state < TCPS_FIN_WAIT_2) {
+ mb = sdp_alloc_mb_data(ssk->socket, wait);
+ if (mb == NULL)
+ goto allocfail;
+ sdp_post_send(ssk, mb);
+
+ SDPSTATS_COUNTER_INC(post_send_credits);
+ post_count++;
+ }
+
+ /* send DisConn if needed
+ * Do not send DisConn if there is only 1 credit. Compliance with CA4-82
+ * If one credit is available, an implementation shall only send SDP
+ * messages that provide additional credits and also do not contain ULP
+ * payload. */
+ if ((ssk->flags & SDP_NEEDFIN) && !sk->so_snd.sb_sndptr &&
+ tx_credits(ssk) > 1) {
+ mb = sdp_alloc_mb_disconnect(sk, wait);
+ if (mb == NULL)
+ goto allocfail;
+ ssk->flags &= ~SDP_NEEDFIN;
+ sdp_post_send(ssk, mb);
+ post_count++;
+ }
+ low = (sdp_tx_ring_slots_left(ssk) <= SDP_MIN_TX_CREDITS);
+ if (post_count || low) {
+ if (low)
+ sdp_arm_tx_cq(ssk);
+ if (sdp_xmit_poll(ssk, low))
+ goto again;
+ }
+ return;
+
+allocfail:
+ ssk->nagle_last_unacked = -1;
+ callout_reset(&ssk->nagle_timer, 1, sdp_nagle_timeout, ssk);
+ return;
+}
diff --git a/sys/ofed/drivers/infiniband/ulp/sdp/sdp_cma.c b/sys/ofed/drivers/infiniband/ulp/sdp/sdp_cma.c
new file mode 100644
index 0000000..9350609
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/sdp/sdp_cma.c
@@ -0,0 +1,456 @@
+/*
+ * Copyright (c) 2006 Mellanox Technologies Ltd. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id$
+ */
+#include "sdp.h"
+
+#define SDP_MAJV_MINV 0x22
+
+SDP_MODPARAM_SINT(sdp_link_layer_ib_only, 1, "Support only link layer of "
+ "type Infiniband");
+
+enum {
+ SDP_HH_SIZE = 76,
+ SDP_HAH_SIZE = 180,
+};
+
+static void
+sdp_qp_event_handler(struct ib_event *event, void *data)
+{
+}
+
+static int
+sdp_get_max_dev_sge(struct ib_device *dev)
+{
+ struct ib_device_attr attr;
+ static int max_sges = -1;
+
+ if (max_sges > 0)
+ goto out;
+
+ ib_query_device(dev, &attr);
+
+ max_sges = attr.max_sge;
+
+out:
+ return max_sges;
+}
+
+static int
+sdp_init_qp(struct socket *sk, struct rdma_cm_id *id)
+{
+ struct ib_qp_init_attr qp_init_attr = {
+ .event_handler = sdp_qp_event_handler,
+ .cap.max_send_wr = SDP_TX_SIZE,
+ .cap.max_recv_wr = SDP_RX_SIZE,
+ .sq_sig_type = IB_SIGNAL_REQ_WR,
+ .qp_type = IB_QPT_RC,
+ };
+ struct ib_device *device = id->device;
+ struct sdp_sock *ssk;
+ int rc;
+
+ sdp_dbg(sk, "%s\n", __func__);
+
+ ssk = sdp_sk(sk);
+ ssk->max_sge = sdp_get_max_dev_sge(device);
+ sdp_dbg(sk, "Max sges: %d\n", ssk->max_sge);
+
+ qp_init_attr.cap.max_send_sge = MIN(ssk->max_sge, SDP_MAX_SEND_SGES);
+ sdp_dbg(sk, "Setting max send sge to: %d\n",
+ qp_init_attr.cap.max_send_sge);
+
+ qp_init_attr.cap.max_recv_sge = MIN(ssk->max_sge, SDP_MAX_RECV_SGES);
+ sdp_dbg(sk, "Setting max recv sge to: %d\n",
+ qp_init_attr.cap.max_recv_sge);
+
+ ssk->sdp_dev = ib_get_client_data(device, &sdp_client);
+ if (!ssk->sdp_dev) {
+ sdp_warn(sk, "SDP not available on device %s\n", device->name);
+ rc = -ENODEV;
+ goto err_rx;
+ }
+
+ rc = sdp_rx_ring_create(ssk, device);
+ if (rc)
+ goto err_rx;
+
+ rc = sdp_tx_ring_create(ssk, device);
+ if (rc)
+ goto err_tx;
+
+ qp_init_attr.recv_cq = ssk->rx_ring.cq;
+ qp_init_attr.send_cq = ssk->tx_ring.cq;
+
+ rc = rdma_create_qp(id, ssk->sdp_dev->pd, &qp_init_attr);
+ if (rc) {
+ sdp_warn(sk, "Unable to create QP: %d.\n", rc);
+ goto err_qp;
+ }
+ ssk->qp = id->qp;
+ ssk->ib_device = device;
+ ssk->qp_active = 1;
+ ssk->context.device = device;
+
+ sdp_dbg(sk, "%s done\n", __func__);
+ return 0;
+
+err_qp:
+ sdp_tx_ring_destroy(ssk);
+err_tx:
+ sdp_rx_ring_destroy(ssk);
+err_rx:
+ return rc;
+}
+
+static int
+sdp_connect_handler(struct socket *sk, struct rdma_cm_id *id,
+ struct rdma_cm_event *event)
+{
+ struct sockaddr_in *src_addr;
+ struct sockaddr_in *dst_addr;
+ struct socket *child;
+ const struct sdp_hh *h;
+ struct sdp_sock *ssk;
+ int rc;
+
+ sdp_dbg(sk, "%s %p -> %p\n", __func__, sdp_sk(sk)->id, id);
+
+ h = event->param.conn.private_data;
+ SDP_DUMP_PACKET(sk, "RX", NULL, &h->bsdh);
+
+ if (!h->max_adverts)
+ return -EINVAL;
+
+ child = sonewconn(sk, SS_ISCONNECTED);
+ if (!child)
+ return -ENOMEM;
+
+ ssk = sdp_sk(child);
+ rc = sdp_init_qp(child, id);
+ if (rc)
+ return rc;
+ SDP_WLOCK(ssk);
+ id->context = ssk;
+ ssk->id = id;
+ ssk->socket = child;
+ ssk->cred = crhold(child->so_cred);
+ dst_addr = (struct sockaddr_in *)&id->route.addr.dst_addr;
+ src_addr = (struct sockaddr_in *)&id->route.addr.src_addr;
+ ssk->fport = dst_addr->sin_port;
+ ssk->faddr = dst_addr->sin_addr.s_addr;
+ ssk->lport = src_addr->sin_port;
+ ssk->max_bufs = ntohs(h->bsdh.bufs);
+ atomic_set(&ssk->tx_ring.credits, ssk->max_bufs);
+ ssk->min_bufs = tx_credits(ssk) / 4;
+ ssk->xmit_size_goal = ntohl(h->localrcvsz) - sizeof(struct sdp_bsdh);
+ sdp_init_buffers(ssk, rcvbuf_initial_size);
+ ssk->state = TCPS_SYN_RECEIVED;
+ SDP_WUNLOCK(ssk);
+
+ return 0;
+}
+
+static int
+sdp_response_handler(struct socket *sk, struct rdma_cm_id *id,
+ struct rdma_cm_event *event)
+{
+ const struct sdp_hah *h;
+ struct sockaddr_in *dst_addr;
+ struct sdp_sock *ssk;
+ sdp_dbg(sk, "%s\n", __func__);
+
+ ssk = sdp_sk(sk);
+ SDP_WLOCK(ssk);
+ ssk->state = TCPS_ESTABLISHED;
+ sdp_set_default_moderation(ssk);
+ if (ssk->flags & SDP_DROPPED) {
+ SDP_WUNLOCK(ssk);
+ return 0;
+ }
+ if (sk->so_options & SO_KEEPALIVE)
+ sdp_start_keepalive_timer(sk);
+ h = event->param.conn.private_data;
+ SDP_DUMP_PACKET(sk, "RX", NULL, &h->bsdh);
+ ssk->max_bufs = ntohs(h->bsdh.bufs);
+ atomic_set(&ssk->tx_ring.credits, ssk->max_bufs);
+ ssk->min_bufs = tx_credits(ssk) / 4;
+ ssk->xmit_size_goal =
+ ntohl(h->actrcvsz) - sizeof(struct sdp_bsdh);
+ ssk->poll_cq = 1;
+
+ dst_addr = (struct sockaddr_in *)&id->route.addr.dst_addr;
+ ssk->fport = dst_addr->sin_port;
+ ssk->faddr = dst_addr->sin_addr.s_addr;
+ soisconnected(sk);
+ SDP_WUNLOCK(ssk);
+
+ return 0;
+}
+
+static int
+sdp_connected_handler(struct socket *sk, struct rdma_cm_event *event)
+{
+ struct sdp_sock *ssk;
+
+ sdp_dbg(sk, "%s\n", __func__);
+
+ ssk = sdp_sk(sk);
+ SDP_WLOCK(ssk);
+ ssk->state = TCPS_ESTABLISHED;
+
+ sdp_set_default_moderation(ssk);
+
+ if (sk->so_options & SO_KEEPALIVE)
+ sdp_start_keepalive_timer(sk);
+
+ if ((ssk->flags & SDP_DROPPED) == 0)
+ soisconnected(sk);
+ SDP_WUNLOCK(ssk);
+ return 0;
+}
+
+static int
+sdp_disconnected_handler(struct socket *sk)
+{
+ struct sdp_sock *ssk;
+
+ ssk = sdp_sk(sk);
+ sdp_dbg(sk, "%s\n", __func__);
+
+ SDP_WLOCK_ASSERT(ssk);
+ if (sdp_sk(sk)->state == TCPS_SYN_RECEIVED) {
+ sdp_connected_handler(sk, NULL);
+
+ if (rcv_nxt(ssk))
+ return 0;
+ }
+
+ return -ECONNRESET;
+}
+
+int
+sdp_cma_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
+{
+ struct rdma_conn_param conn_param;
+ struct socket *sk;
+ struct sdp_sock *ssk;
+ struct sdp_hah hah;
+ struct sdp_hh hh;
+
+ int rc = 0;
+
+ ssk = id->context;
+ sk = NULL;
+ if (ssk)
+ sk = ssk->socket;
+ if (!ssk || !sk || !ssk->id) {
+ sdp_dbg(sk,
+ "cm_id is being torn down, event %d, ssk %p, sk %p, id %p\n",
+ event->event, ssk, sk, id);
+ return event->event == RDMA_CM_EVENT_CONNECT_REQUEST ?
+ -EINVAL : 0;
+ }
+
+ sdp_dbg(sk, "%s event %d id %p\n", __func__, event->event, id);
+ switch (event->event) {
+ case RDMA_CM_EVENT_ADDR_RESOLVED:
+ sdp_dbg(sk, "RDMA_CM_EVENT_ADDR_RESOLVED\n");
+
+ if (sdp_link_layer_ib_only &&
+ rdma_node_get_transport(id->device->node_type) ==
+ RDMA_TRANSPORT_IB &&
+ rdma_port_get_link_layer(id->device, id->port_num) !=
+ IB_LINK_LAYER_INFINIBAND) {
+ sdp_dbg(sk, "Link layer is: %d. Only IB link layer "
+ "is allowed\n",
+ rdma_port_get_link_layer(id->device, id->port_num));
+ rc = -ENETUNREACH;
+ break;
+ }
+
+ rc = rdma_resolve_route(id, SDP_ROUTE_TIMEOUT);
+ break;
+ case RDMA_CM_EVENT_ADDR_ERROR:
+ sdp_dbg(sk, "RDMA_CM_EVENT_ADDR_ERROR\n");
+ rc = -ENETUNREACH;
+ break;
+ case RDMA_CM_EVENT_ROUTE_RESOLVED:
+ sdp_dbg(sk, "RDMA_CM_EVENT_ROUTE_RESOLVED : %p\n", id);
+ rc = sdp_init_qp(sk, id);
+ if (rc)
+ break;
+ atomic_set(&sdp_sk(sk)->remote_credits,
+ rx_ring_posted(sdp_sk(sk)));
+ memset(&hh, 0, sizeof hh);
+ hh.bsdh.mid = SDP_MID_HELLO;
+ hh.bsdh.len = htonl(sizeof(struct sdp_hh));
+ hh.max_adverts = 1;
+ hh.ipv_cap = 0x40;
+ hh.majv_minv = SDP_MAJV_MINV;
+ sdp_init_buffers(sdp_sk(sk), rcvbuf_initial_size);
+ hh.bsdh.bufs = htons(rx_ring_posted(sdp_sk(sk)));
+ hh.localrcvsz = hh.desremrcvsz = htonl(sdp_sk(sk)->recv_bytes);
+ hh.max_adverts = 0x1;
+ sdp_sk(sk)->laddr =
+ ((struct sockaddr_in *)&id->route.addr.src_addr)->sin_addr.s_addr;
+ memset(&conn_param, 0, sizeof conn_param);
+ conn_param.private_data_len = sizeof hh;
+ conn_param.private_data = &hh;
+ conn_param.responder_resources = 4 /* TODO */;
+ conn_param.initiator_depth = 4 /* TODO */;
+ conn_param.retry_count = SDP_RETRY_COUNT;
+ SDP_DUMP_PACKET(NULL, "TX", NULL, &hh.bsdh);
+ rc = rdma_connect(id, &conn_param);
+ break;
+ case RDMA_CM_EVENT_ROUTE_ERROR:
+ sdp_dbg(sk, "RDMA_CM_EVENT_ROUTE_ERROR : %p\n", id);
+ rc = -ETIMEDOUT;
+ break;
+ case RDMA_CM_EVENT_CONNECT_REQUEST:
+ sdp_dbg(sk, "RDMA_CM_EVENT_CONNECT_REQUEST\n");
+ rc = sdp_connect_handler(sk, id, event);
+ if (rc) {
+ sdp_dbg(sk, "Destroying qp\n");
+ rdma_reject(id, NULL, 0);
+ break;
+ }
+ ssk = id->context;
+ atomic_set(&ssk->remote_credits, rx_ring_posted(ssk));
+ memset(&hah, 0, sizeof hah);
+ hah.bsdh.mid = SDP_MID_HELLO_ACK;
+ hah.bsdh.bufs = htons(rx_ring_posted(ssk));
+ hah.bsdh.len = htonl(sizeof(struct sdp_hah));
+ hah.majv_minv = SDP_MAJV_MINV;
+ hah.ext_max_adverts = 1; /* Doesn't seem to be mandated by spec,
+ but just in case */
+ hah.actrcvsz = htonl(ssk->recv_bytes);
+ memset(&conn_param, 0, sizeof conn_param);
+ conn_param.private_data_len = sizeof hah;
+ conn_param.private_data = &hah;
+ conn_param.responder_resources = 4 /* TODO */;
+ conn_param.initiator_depth = 4 /* TODO */;
+ conn_param.retry_count = SDP_RETRY_COUNT;
+ SDP_DUMP_PACKET(sk, "TX", NULL, &hah.bsdh);
+ rc = rdma_accept(id, &conn_param);
+ if (rc) {
+ ssk->id = NULL;
+ id->qp = NULL;
+ id->context = NULL;
+ }
+ break;
+ case RDMA_CM_EVENT_CONNECT_RESPONSE:
+ sdp_dbg(sk, "RDMA_CM_EVENT_CONNECT_RESPONSE\n");
+ rc = sdp_response_handler(sk, id, event);
+ if (rc) {
+ sdp_dbg(sk, "Destroying qp\n");
+ rdma_reject(id, NULL, 0);
+ } else
+ rc = rdma_accept(id, NULL);
+ break;
+ case RDMA_CM_EVENT_CONNECT_ERROR:
+ sdp_dbg(sk, "RDMA_CM_EVENT_CONNECT_ERROR\n");
+ rc = -ETIMEDOUT;
+ break;
+ case RDMA_CM_EVENT_UNREACHABLE:
+ sdp_dbg(sk, "RDMA_CM_EVENT_UNREACHABLE\n");
+ rc = -ENETUNREACH;
+ break;
+ case RDMA_CM_EVENT_REJECTED:
+ sdp_dbg(sk, "RDMA_CM_EVENT_REJECTED\n");
+ rc = -ECONNREFUSED;
+ break;
+ case RDMA_CM_EVENT_ESTABLISHED:
+ sdp_dbg(sk, "RDMA_CM_EVENT_ESTABLISHED\n");
+ sdp_sk(sk)->laddr =
+ ((struct sockaddr_in *)&id->route.addr.src_addr)->sin_addr.s_addr;
+ rc = sdp_connected_handler(sk, event);
+ break;
+ case RDMA_CM_EVENT_DISCONNECTED: /* This means DREQ/DREP received */
+ sdp_dbg(sk, "RDMA_CM_EVENT_DISCONNECTED\n");
+
+ SDP_WLOCK(ssk);
+ if (ssk->state == TCPS_LAST_ACK) {
+ sdp_cancel_dreq_wait_timeout(ssk);
+
+ sdp_dbg(sk, "%s: waiting for Infiniband tear down\n",
+ __func__);
+ }
+ ssk->qp_active = 0;
+ SDP_WUNLOCK(ssk);
+ rdma_disconnect(id);
+ SDP_WLOCK(ssk);
+ if (ssk->state != TCPS_TIME_WAIT) {
+ if (ssk->state == TCPS_CLOSE_WAIT) {
+ sdp_dbg(sk, "IB teardown while in "
+ "TCPS_CLOSE_WAIT taking reference to "
+ "let close() finish the work\n");
+ }
+ rc = sdp_disconnected_handler(sk);
+ if (rc)
+ rc = -EPIPE;
+ }
+ SDP_WUNLOCK(ssk);
+ break;
+ case RDMA_CM_EVENT_TIMEWAIT_EXIT:
+ sdp_dbg(sk, "RDMA_CM_EVENT_TIMEWAIT_EXIT\n");
+ SDP_WLOCK(ssk);
+ rc = sdp_disconnected_handler(sk);
+ SDP_WUNLOCK(ssk);
+ break;
+ case RDMA_CM_EVENT_DEVICE_REMOVAL:
+ sdp_dbg(sk, "RDMA_CM_EVENT_DEVICE_REMOVAL\n");
+ rc = -ENETRESET;
+ break;
+ default:
+ printk(KERN_ERR "SDP: Unexpected CMA event: %d\n",
+ event->event);
+ rc = -ECONNABORTED;
+ break;
+ }
+
+ sdp_dbg(sk, "event %d done. status %d\n", event->event, rc);
+
+ if (rc) {
+ SDP_WLOCK(ssk);
+ if (ssk->id == id) {
+ ssk->id = NULL;
+ id->qp = NULL;
+ id->context = NULL;
+ if (sdp_notify(ssk, -rc))
+ SDP_WUNLOCK(ssk);
+ } else
+ SDP_WUNLOCK(ssk);
+ }
+
+ return rc;
+}
diff --git a/sys/ofed/drivers/infiniband/ulp/sdp/sdp_dbg.h b/sys/ofed/drivers/infiniband/ulp/sdp/sdp_dbg.h
new file mode 100644
index 0000000..188b58b
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/sdp/sdp_dbg.h
@@ -0,0 +1,167 @@
+#ifndef _SDP_DBG_H_
+#define _SDP_DBG_H_
+
+#define SDPSTATS_ON
+
+//#define GETNSTIMEODAY_SUPPORTED
+
+#define _sdp_printk(func, line, level, sk, format, arg...) \
+do { \
+ printk(level "%s:%d %p sdp_sock(%d:%d %d:%d): " format "\n", \
+ func, line, sk ? sdp_sk(sk) : NULL, \
+ curproc->p_pid, PCPU_GET(cpuid), \
+ (sk) && sdp_sk(sk) ? ntohs(sdp_sk(sk)->lport) : -1, \
+ (sk) && sdp_sk(sk) ? ntohs(sdp_sk(sk)->fport) : -1, ## arg); \
+} while (0)
+#define sdp_printk(level, sk, format, arg...) \
+ _sdp_printk(__func__, __LINE__, level, sk, format, ## arg)
+#define sdp_warn(sk, format, arg...) \
+ sdp_printk(KERN_WARNING, sk, format , ## arg)
+
+#define SDP_MODPARAM_SINT(var, def_val, msg) \
+ static int var = def_val; \
+ module_param_named(var, var, int, 0644); \
+ MODULE_PARM_DESC(var, msg " [" #def_val "]"); \
+
+#define SDP_MODPARAM_INT(var, def_val, msg) \
+ int var = def_val; \
+ module_param_named(var, var, int, 0644); \
+ MODULE_PARM_DESC(var, msg " [" #def_val "]"); \
+
+#ifdef SDP_PROFILING
+struct mbuf;
+struct sdpprf_log {
+ int idx;
+ int pid;
+ int cpu;
+ int sk_num;
+ int sk_dport;
+ struct mbuf *mb;
+ char msg[256];
+
+ unsigned long long time;
+
+ const char *func;
+ int line;
+};
+
+#define SDPPRF_LOG_SIZE 0x20000 /* must be a power of 2 */
+
+extern struct sdpprf_log sdpprf_log[SDPPRF_LOG_SIZE];
+extern int sdpprf_log_count;
+
+#ifdef GETNSTIMEODAY_SUPPORTED
+static inline unsigned long long current_nsec(void)
+{
+ struct timespec tv;
+ getnstimeofday(&tv);
+ return tv.tv_sec * NSEC_PER_SEC + tv.tv_nsec;
+}
+#else
+#define current_nsec() jiffies_to_usecs(jiffies)
+#endif
+
+#define sdp_prf1(sk, s, format, arg...) ({ \
+ struct sdpprf_log *l = \
+ &sdpprf_log[sdpprf_log_count++ & (SDPPRF_LOG_SIZE - 1)]; \
+ preempt_disable(); \
+ l->idx = sdpprf_log_count - 1; \
+ l->pid = current->pid; \
+ l->sk_num = (sk) ? inet_sk(sk)->num : -1; \
+ l->sk_dport = (sk) ? ntohs(inet_sk(sk)->dport) : -1; \
+ l->cpu = smp_processor_id(); \
+ l->mb = s; \
+ snprintf(l->msg, sizeof(l->msg) - 1, format, ## arg); \
+ l->time = current_nsec(); \
+ l->func = __func__; \
+ l->line = __LINE__; \
+ preempt_enable(); \
+ 1; \
+})
+//#define sdp_prf(sk, s, format, arg...)
+#define sdp_prf(sk, s, format, arg...) sdp_prf1(sk, s, format, ## arg)
+
+#else
+#define sdp_prf1(sk, s, format, arg...)
+#define sdp_prf(sk, s, format, arg...)
+#endif
+
+#ifdef CONFIG_INFINIBAND_SDP_DEBUG
+extern int sdp_debug_level;
+
+#define sdp_dbg(sk, format, arg...) \
+ do { \
+ if (sdp_debug_level > 0) \
+ sdp_printk(KERN_WARNING, sk, format , ## arg); \
+ } while (0)
+
+#else /* CONFIG_INFINIBAND_SDP_DEBUG */
+#define sdp_dbg(priv, format, arg...) \
+ do { (void) (priv); } while (0)
+#define sock_ref(sk, msg, sock_op) sock_op(sk)
+#endif /* CONFIG_INFINIBAND_SDP_DEBUG */
+
+#ifdef CONFIG_INFINIBAND_SDP_DEBUG_DATA
+
+extern int sdp_data_debug_level;
+#define sdp_dbg_data(sk, format, arg...) \
+ do { \
+ if (sdp_data_debug_level & 0x2) \
+ sdp_printk(KERN_WARNING, sk, format , ## arg); \
+ } while (0)
+#define SDP_DUMP_PACKET(sk, str, mb, h) \
+ do { \
+ if (sdp_data_debug_level & 0x1) \
+ dump_packet(sk, str, mb, h); \
+ } while (0)
+#else
+#define sdp_dbg_data(priv, format, arg...)
+#define SDP_DUMP_PACKET(sk, str, mb, h)
+#endif
+
+#define SOCK_REF_RESET "RESET"
+#define SOCK_REF_ALIVE "ALIVE" /* sock_alloc -> destruct_sock */
+#define SOCK_REF_CLONE "CLONE"
+#define SOCK_REF_CMA "CMA" /* sdp_cma_handler() is expected to be invoked */
+#define SOCK_REF_SEQ "SEQ" /* during proc read */
+#define SOCK_REF_DREQ_TO "DREQ_TO" /* dreq timeout is pending */
+#define SOCK_REF_ZCOPY "ZCOPY" /* zcopy send in process */
+#define SOCK_REF_RDMA_RD "RDMA_RD" /* RDMA read in process */
+
+#define sock_hold(sk, msg) sock_ref(sk, msg, sock_hold)
+#define sock_put(sk, msg) sock_ref(sk, msg, sock_put)
+#define __sock_put(sk, msg) sock_ref(sk, msg, __sock_put)
+
+#define ENUM2STR(e) [e] = #e
+
+static inline char *sdp_state_str(int state)
+{
+ static char *state2str[] = {
+ ENUM2STR(TCPS_ESTABLISHED),
+ ENUM2STR(TCPS_SYN_SENT),
+ ENUM2STR(TCPS_SYN_RECEIVED),
+ ENUM2STR(TCPS_FIN_WAIT_1),
+ ENUM2STR(TCPS_FIN_WAIT_2),
+ ENUM2STR(TCPS_TIME_WAIT),
+ ENUM2STR(TCPS_CLOSED),
+ ENUM2STR(TCPS_CLOSE_WAIT),
+ ENUM2STR(TCPS_LAST_ACK),
+ ENUM2STR(TCPS_LISTEN),
+ ENUM2STR(TCPS_CLOSING),
+ };
+
+ if (state < 0 || state >= ARRAY_SIZE(state2str))
+ return "unknown";
+
+ return state2str[state];
+}
+
+struct sdp_bsdh;
+#ifdef CONFIG_INFINIBAND_SDP_DEBUG_DATA
+void _dump_packet(const char *func, int line, struct socket *sk, char *str,
+ struct mbuf *mb, const struct sdp_bsdh *h);
+#define dump_packet(sk, str, mb, h) \
+ _dump_packet(__func__, __LINE__, sk, str, mb, h)
+#endif
+
+#endif
diff --git a/sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c b/sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c
new file mode 100644
index 0000000..fe747af
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c
@@ -0,0 +1,1962 @@
+
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
+ * The Regents of the University of California. All rights reserved.
+ * Copyright (c) 2004 The FreeBSD Foundation. All rights reserved.
+ * Copyright (c) 2004-2008 Robert N. M. Watson. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Excerpts taken from tcp_subr.c, tcp_usrreq.c, uipc_socket.c
+ */
+
+/*
+ *
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "sdp.h"
+
+#include <net/if.h>
+#include <net/route.h>
+#include <net/vnet.h>
+
+uma_zone_t sdp_zone;
+struct rwlock sdp_lock;
+LIST_HEAD(, sdp_sock) sdp_list;
+
+struct workqueue_struct *rx_comp_wq;
+
+RW_SYSINIT(sdplockinit, &sdp_lock, "SDP lock");
+#define SDP_LIST_WLOCK() rw_wlock(&sdp_lock)
+#define SDP_LIST_RLOCK() rw_rlock(&sdp_lock)
+#define SDP_LIST_WUNLOCK() rw_wunlock(&sdp_lock)
+#define SDP_LIST_RUNLOCK() rw_runlock(&sdp_lock)
+#define SDP_LIST_WLOCK_ASSERT() rw_assert(&sdp_lock, RW_WLOCKED)
+#define SDP_LIST_RLOCK_ASSERT() rw_assert(&sdp_lock, RW_RLOCKED)
+#define SDP_LIST_LOCK_ASSERT() rw_assert(&sdp_lock, RW_LOCKED)
+
+MALLOC_DEFINE(M_SDP, "sdp", "Socket Direct Protocol");
+
+static void sdp_stop_keepalive_timer(struct socket *so);
+
+/*
+ * SDP protocol interface to socket abstraction.
+ */
+/*
+ * sdp_sendspace and sdp_recvspace are the default send and receive window
+ * sizes, respectively.
+ */
+u_long sdp_sendspace = 1024*32;
+u_long sdp_recvspace = 1024*64;
+
+static int sdp_count;
+
+/*
+ * Disable async. CMA events for sockets which are being torn down.
+ */
+static void
+sdp_destroy_cma(struct sdp_sock *ssk)
+{
+
+ if (ssk->id == NULL)
+ return;
+ rdma_destroy_id(ssk->id);
+ ssk->id = NULL;
+}
+
+static int
+sdp_pcbbind(struct sdp_sock *ssk, struct sockaddr *nam, struct ucred *cred)
+{
+ struct sockaddr_in *sin;
+ struct sockaddr_in null;
+ int error;
+
+ SDP_WLOCK_ASSERT(ssk);
+
+ if (ssk->lport != 0 || ssk->laddr != INADDR_ANY)
+ return (EINVAL);
+ /* rdma_bind_addr handles bind races. */
+ SDP_WUNLOCK(ssk);
+ if (ssk->id == NULL)
+ ssk->id = rdma_create_id(sdp_cma_handler, ssk, RDMA_PS_SDP);
+ if (ssk->id == NULL) {
+ SDP_WLOCK(ssk);
+ return (ENOMEM);
+ }
+ if (nam == NULL) {
+ null.sin_family = AF_INET;
+ null.sin_len = sizeof(null);
+ null.sin_addr.s_addr = INADDR_ANY;
+ null.sin_port = 0;
+ bzero(&null.sin_zero, sizeof(null.sin_zero));
+ nam = (struct sockaddr *)&null;
+ }
+ error = -rdma_bind_addr(ssk->id, nam);
+ SDP_WLOCK(ssk);
+ if (error == 0) {
+ sin = (struct sockaddr_in *)&ssk->id->route.addr.src_addr;
+ ssk->laddr = sin->sin_addr.s_addr;
+ ssk->lport = sin->sin_port;
+ } else
+ sdp_destroy_cma(ssk);
+ return (error);
+}
+
+static void
+sdp_pcbfree(struct sdp_sock *ssk)
+{
+ KASSERT(ssk->socket == NULL, ("ssk %p socket still attached", ssk));
+
+ sdp_dbg(ssk->socket, "Freeing pcb");
+ SDP_WLOCK_ASSERT(ssk);
+ ssk->flags |= SDP_DESTROY;
+ SDP_WUNLOCK(ssk);
+ SDP_LIST_WLOCK();
+ sdp_count--;
+ LIST_REMOVE(ssk, list);
+ SDP_LIST_WUNLOCK();
+ crfree(ssk->cred);
+ sdp_destroy_cma(ssk);
+ ssk->qp_active = 0;
+ if (ssk->qp) {
+ ib_destroy_qp(ssk->qp);
+ ssk->qp = NULL;
+ }
+ sdp_tx_ring_destroy(ssk);
+ sdp_rx_ring_destroy(ssk);
+ rw_destroy(&ssk->rx_ring.destroyed_lock);
+ uma_zfree(sdp_zone, ssk);
+ rw_destroy(&ssk->lock);
+}
+
+/*
+ * Common routines to return a socket address.
+ */
+static struct sockaddr *
+sdp_sockaddr(in_port_t port, struct in_addr *addr_p)
+{
+ struct sockaddr_in *sin;
+
+ sin = malloc(sizeof *sin, M_SONAME,
+ M_WAITOK | M_ZERO);
+ sin->sin_family = AF_INET;
+ sin->sin_len = sizeof(*sin);
+ sin->sin_addr = *addr_p;
+ sin->sin_port = port;
+
+ return (struct sockaddr *)sin;
+}
+
+static int
+sdp_getsockaddr(struct socket *so, struct sockaddr **nam)
+{
+ struct sdp_sock *ssk;
+ struct in_addr addr;
+ in_port_t port;
+
+ ssk = sdp_sk(so);
+ SDP_RLOCK(ssk);
+ port = ssk->lport;
+ addr.s_addr = ssk->laddr;
+ SDP_RUNLOCK(ssk);
+
+ *nam = sdp_sockaddr(port, &addr);
+ return 0;
+}
+
+static int
+sdp_getpeeraddr(struct socket *so, struct sockaddr **nam)
+{
+ struct sdp_sock *ssk;
+ struct in_addr addr;
+ in_port_t port;
+
+ ssk = sdp_sk(so);
+ SDP_RLOCK(ssk);
+ port = ssk->fport;
+ addr.s_addr = ssk->faddr;
+ SDP_RUNLOCK(ssk);
+
+ *nam = sdp_sockaddr(port, &addr);
+ return 0;
+}
+
+static void
+sdp_pcbnotifyall(struct in_addr faddr, int errno,
+ struct sdp_sock *(*notify)(struct sdp_sock *, int))
+{
+ struct sdp_sock *ssk, *ssk_temp;
+
+ SDP_LIST_WLOCK();
+ LIST_FOREACH_SAFE(ssk, &sdp_list, list, ssk_temp) {
+ SDP_WLOCK(ssk);
+ if (ssk->faddr != faddr.s_addr || ssk->socket == NULL) {
+ SDP_WUNLOCK(ssk);
+ continue;
+ }
+ if ((ssk->flags & SDP_DESTROY) == 0)
+ if ((*notify)(ssk, errno))
+ SDP_WUNLOCK(ssk);
+ }
+ SDP_LIST_WUNLOCK();
+}
+
+#if 0
+static void
+sdp_apply_all(void (*func)(struct sdp_sock *, void *), void *arg)
+{
+ struct sdp_sock *ssk;
+
+ SDP_LIST_RLOCK();
+ LIST_FOREACH(ssk, &sdp_list, list) {
+ SDP_WLOCK(ssk);
+ func(ssk, arg);
+ SDP_WUNLOCK(ssk);
+ }
+ SDP_LIST_RUNLOCK();
+}
+#endif
+
+static void
+sdp_output_reset(struct sdp_sock *ssk)
+{
+ struct rdma_cm_id *id;
+
+ SDP_WLOCK_ASSERT(ssk);
+ if (ssk->id) {
+ id = ssk->id;
+ ssk->qp_active = 0;
+ SDP_WUNLOCK(ssk);
+ rdma_disconnect(id);
+ SDP_WLOCK(ssk);
+ }
+ ssk->state = TCPS_CLOSED;
+}
+
+/*
+ * Attempt to close a SDP socket, marking it as dropped, and freeing
+ * the socket if we hold the only reference.
+ */
+static struct sdp_sock *
+sdp_closed(struct sdp_sock *ssk)
+{
+ struct socket *so;
+
+ SDP_WLOCK_ASSERT(ssk);
+
+ ssk->flags |= SDP_DROPPED;
+ so = ssk->socket;
+ soisdisconnected(so);
+ if (ssk->flags & SDP_SOCKREF) {
+ KASSERT(so->so_state & SS_PROTOREF,
+ ("sdp_closed: !SS_PROTOREF"));
+ ssk->flags &= ~SDP_SOCKREF;
+ SDP_WUNLOCK(ssk);
+ ACCEPT_LOCK();
+ SOCK_LOCK(so);
+ so->so_state &= ~SS_PROTOREF;
+ sofree(so);
+ return (NULL);
+ }
+ return (ssk);
+}
+
+/*
+ * Perform timer based shutdowns which can not operate in
+ * callout context.
+ */
+static void
+sdp_shutdown_task(void *data, int pending)
+{
+ struct sdp_sock *ssk;
+
+ ssk = data;
+ SDP_WLOCK(ssk);
+ /*
+ * I don't think this can race with another call to pcbfree()
+ * because SDP_TIMEWAIT protects it. SDP_DESTROY may be redundant.
+ */
+ if (ssk->flags & SDP_DESTROY)
+ panic("sdp_shutdown_task: Racing with pcbfree for ssk %p",
+ ssk);
+ if (ssk->flags & SDP_DISCON)
+ sdp_output_reset(ssk);
+ /* We have to clear this so sdp_detach() will call pcbfree(). */
+ ssk->flags &= ~(SDP_TIMEWAIT | SDP_DREQWAIT);
+ if ((ssk->flags & SDP_DROPPED) == 0 &&
+ sdp_closed(ssk) == NULL)
+ return;
+ if (ssk->socket == NULL) {
+ sdp_pcbfree(ssk);
+ return;
+ }
+ SDP_WUNLOCK(ssk);
+}
+
+/*
+ * 2msl has expired, schedule the shutdown task.
+ */
+static void
+sdp_2msl_timeout(void *data)
+{
+ struct sdp_sock *ssk;
+
+ ssk = data;
+ /* Callout canceled. */
+ if (!callout_active(&ssk->keep2msl))
+ goto out;
+ callout_deactivate(&ssk->keep2msl);
+ /* Should be impossible, defensive programming. */
+ if ((ssk->flags & SDP_TIMEWAIT) == 0)
+ goto out;
+ taskqueue_enqueue(taskqueue_thread, &ssk->shutdown_task);
+out:
+ SDP_WUNLOCK(ssk);
+ return;
+}
+
+/*
+ * Schedule the 2msl wait timer.
+ */
+static void
+sdp_2msl_wait(struct sdp_sock *ssk)
+{
+
+ SDP_WLOCK_ASSERT(ssk);
+ ssk->flags |= SDP_TIMEWAIT;
+ ssk->state = TCPS_TIME_WAIT;
+ soisdisconnected(ssk->socket);
+ callout_reset(&ssk->keep2msl, TCPTV_MSL, sdp_2msl_timeout, ssk);
+}
+
+/*
+ * Timed out waiting for the final fin/ack from rdma_disconnect().
+ */
+static void
+sdp_dreq_timeout(void *data)
+{
+ struct sdp_sock *ssk;
+
+ ssk = data;
+ /* Callout canceled. */
+ if (!callout_active(&ssk->keep2msl))
+ goto out;
+ /* Callout rescheduled, probably as a different timer. */
+ if (callout_pending(&ssk->keep2msl))
+ goto out;
+ callout_deactivate(&ssk->keep2msl);
+ if (ssk->state != TCPS_FIN_WAIT_1 && ssk->state != TCPS_LAST_ACK)
+ goto out;
+ if ((ssk->flags & SDP_DREQWAIT) == 0)
+ goto out;
+ ssk->flags &= ~SDP_DREQWAIT;
+ ssk->flags |= SDP_DISCON;
+ sdp_2msl_wait(ssk);
+ ssk->qp_active = 0;
+out:
+ SDP_WUNLOCK(ssk);
+}
+
+/*
+ * Received the final fin/ack. Cancel the 2msl.
+ */
+void
+sdp_cancel_dreq_wait_timeout(struct sdp_sock *ssk)
+{
+ sdp_dbg(ssk->socket, "cancelling dreq wait timeout\n");
+ ssk->flags &= ~SDP_DREQWAIT;
+ sdp_2msl_wait(ssk);
+}
+
+static int
+sdp_init_sock(struct socket *sk)
+{
+ struct sdp_sock *ssk = sdp_sk(sk);
+
+ sdp_dbg(sk, "%s\n", __func__);
+
+ callout_init_rw(&ssk->keep2msl, &ssk->lock, CALLOUT_RETURNUNLOCKED);
+ TASK_INIT(&ssk->shutdown_task, 0, sdp_shutdown_task, ssk);
+#ifdef SDP_ZCOPY
+ INIT_DELAYED_WORK(&ssk->srcavail_cancel_work, srcavail_cancel_timeout);
+ ssk->zcopy_thresh = -1; /* use global sdp_zcopy_thresh */
+ ssk->tx_ring.rdma_inflight = NULL;
+#endif
+ atomic_set(&ssk->mseq_ack, 0);
+ sdp_rx_ring_init(ssk);
+ ssk->tx_ring.buffer = NULL;
+
+ return 0;
+}
+
+/*
+ * Allocate an sdp_sock for the socket and reserve socket buffer space.
+ */
+static int
+sdp_attach(struct socket *so, int proto, struct thread *td)
+{
+ struct sdp_sock *ssk;
+ int error;
+
+ ssk = sdp_sk(so);
+ KASSERT(ssk == NULL, ("sdp_attach: ssk already set on so %p", so));
+ if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
+ error = soreserve(so, sdp_sendspace, sdp_recvspace);
+ if (error)
+ return (error);
+ }
+ so->so_rcv.sb_flags |= SB_AUTOSIZE;
+ so->so_snd.sb_flags |= SB_AUTOSIZE;
+ ssk = uma_zalloc(sdp_zone, M_NOWAIT | M_ZERO);
+ if (ssk == NULL)
+ return (ENOBUFS);
+ rw_init(&ssk->lock, "sdpsock");
+ ssk->socket = so;
+ ssk->cred = crhold(so->so_cred);
+ so->so_pcb = (caddr_t)ssk;
+ sdp_init_sock(so);
+ ssk->flags = 0;
+ ssk->qp_active = 0;
+ ssk->state = TCPS_CLOSED;
+ SDP_LIST_WLOCK();
+ LIST_INSERT_HEAD(&sdp_list, ssk, list);
+ sdp_count++;
+ SDP_LIST_WUNLOCK();
+ if ((so->so_options & SO_LINGER) && so->so_linger == 0)
+ so->so_linger = TCP_LINGERTIME;
+
+ return (0);
+}
+
+/*
+ * Detach SDP from the socket, potentially leaving it around for the
+ * timewait to expire.
+ */
+static void
+sdp_detach(struct socket *so)
+{
+ struct sdp_sock *ssk;
+
+ ssk = sdp_sk(so);
+ SDP_WLOCK(ssk);
+ KASSERT(ssk->socket != NULL, ("sdp_detach: socket is NULL"));
+ ssk->socket->so_pcb = NULL;
+ ssk->socket = NULL;
+ if (ssk->flags & (SDP_TIMEWAIT | SDP_DREQWAIT))
+ SDP_WUNLOCK(ssk);
+ else if (ssk->flags & SDP_DROPPED || ssk->state < TCPS_SYN_SENT)
+ sdp_pcbfree(ssk);
+ else
+ panic("sdp_detach: Unexpected state, ssk %p.\n", ssk);
+}
+
+/*
+ * Allocate a local address for the socket.
+ */
+static int
+sdp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ int error = 0;
+ struct sdp_sock *ssk;
+ struct sockaddr_in *sin;
+
+ sin = (struct sockaddr_in *)nam;
+ if (nam->sa_len != sizeof (*sin))
+ return (EINVAL);
+ if (sin->sin_family != AF_INET)
+ return (EINVAL);
+ if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
+ return (EAFNOSUPPORT);
+
+ ssk = sdp_sk(so);
+ SDP_WLOCK(ssk);
+ if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
+ error = EINVAL;
+ goto out;
+ }
+ error = sdp_pcbbind(ssk, nam, td->td_ucred);
+out:
+ SDP_WUNLOCK(ssk);
+
+ return (error);
+}
+
+/*
+ * Prepare to accept connections.
+ */
+static int
+sdp_listen(struct socket *so, int backlog, struct thread *td)
+{
+ int error = 0;
+ struct sdp_sock *ssk;
+
+ ssk = sdp_sk(so);
+ SDP_WLOCK(ssk);
+ if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
+ error = EINVAL;
+ goto out;
+ }
+ if (error == 0 && ssk->lport == 0)
+ error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred);
+ SOCK_LOCK(so);
+ if (error == 0)
+ error = solisten_proto_check(so);
+ if (error == 0) {
+ solisten_proto(so, backlog);
+ ssk->state = TCPS_LISTEN;
+ }
+ SOCK_UNLOCK(so);
+
+out:
+ SDP_WUNLOCK(ssk);
+ if (error == 0)
+ error = -rdma_listen(ssk->id, backlog);
+ return (error);
+}
+
+/*
+ * Initiate a SDP connection to nam.
+ */
+static int
+sdp_start_connect(struct sdp_sock *ssk, struct sockaddr *nam, struct thread *td)
+{
+ struct sockaddr_in src;
+ struct socket *so;
+ int error;
+
+ so = ssk->socket;
+
+ SDP_WLOCK_ASSERT(ssk);
+ if (ssk->lport == 0) {
+ error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred);
+ if (error)
+ return error;
+ }
+ src.sin_family = AF_INET;
+ src.sin_len = sizeof(src);
+ bzero(&src.sin_zero, sizeof(src.sin_zero));
+ src.sin_port = ssk->lport;
+ src.sin_addr.s_addr = ssk->laddr;
+ soisconnecting(so);
+ SDP_WUNLOCK(ssk);
+ error = -rdma_resolve_addr(ssk->id, (struct sockaddr *)&src, nam,
+ SDP_RESOLVE_TIMEOUT);
+ SDP_WLOCK(ssk);
+ if (error == 0)
+ ssk->state = TCPS_SYN_SENT;
+
+ return 0;
+}
+
+/*
+ * Initiate SDP connection.
+ */
+static int
+sdp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ int error = 0;
+ struct sdp_sock *ssk;
+ struct sockaddr_in *sin;
+
+ sin = (struct sockaddr_in *)nam;
+ if (nam->sa_len != sizeof (*sin))
+ return (EINVAL);
+ if (sin->sin_family != AF_INET)
+ return (EINVAL);
+ if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
+ return (EAFNOSUPPORT);
+ if ((error = prison_remote_ip4(td->td_ucred, &sin->sin_addr)) != 0)
+ return (error);
+ ssk = sdp_sk(so);
+ SDP_WLOCK(ssk);
+ if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED))
+ error = EINVAL;
+ else
+ error = sdp_start_connect(ssk, nam, td);
+ SDP_WUNLOCK(ssk);
+ return (error);
+}
+
+/*
+ * Drop a SDP socket, reporting
+ * the specified error. If connection is synchronized,
+ * then send a RST to peer.
+ */
+static struct sdp_sock *
+sdp_drop(struct sdp_sock *ssk, int errno)
+{
+ struct socket *so;
+
+ SDP_WLOCK_ASSERT(ssk);
+ so = ssk->socket;
+ if (TCPS_HAVERCVDSYN(ssk->state))
+ sdp_output_reset(ssk);
+ if (errno == ETIMEDOUT && ssk->softerror)
+ errno = ssk->softerror;
+ so->so_error = errno;
+ return (sdp_closed(ssk));
+}
+
+/*
+ * User issued close, and wish to trail through shutdown states:
+ * if never received SYN, just forget it. If got a SYN from peer,
+ * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
+ * If already got a FIN from peer, then almost done; go to LAST_ACK
+ * state. In all other cases, have already sent FIN to peer (e.g.
+ * after PRU_SHUTDOWN), and just have to play tedious game waiting
+ * for peer to send FIN or not respond to keep-alives, etc.
+ * We can let the user exit from the close as soon as the FIN is acked.
+ */
+static void
+sdp_usrclosed(struct sdp_sock *ssk)
+{
+
+ SDP_WLOCK_ASSERT(ssk);
+
+ switch (ssk->state) {
+ case TCPS_LISTEN:
+ ssk->state = TCPS_CLOSED;
+ SDP_WUNLOCK(ssk);
+ sdp_destroy_cma(ssk);
+ SDP_WLOCK(ssk);
+ /* FALLTHROUGH */
+ case TCPS_CLOSED:
+ ssk = sdp_closed(ssk);
+ /*
+ * sdp_closed() should never return NULL here as the socket is
+ * still open.
+ */
+ KASSERT(ssk != NULL,
+ ("sdp_usrclosed: sdp_closed() returned NULL"));
+ break;
+
+ case TCPS_SYN_SENT:
+ /* FALLTHROUGH */
+ case TCPS_SYN_RECEIVED:
+ ssk->flags |= SDP_NEEDFIN;
+ break;
+
+ case TCPS_ESTABLISHED:
+ ssk->flags |= SDP_NEEDFIN;
+ ssk->state = TCPS_FIN_WAIT_1;
+ break;
+
+ case TCPS_CLOSE_WAIT:
+ ssk->state = TCPS_LAST_ACK;
+ break;
+ }
+ if (ssk->state >= TCPS_FIN_WAIT_2) {
+ /* Prevent the connection hanging in FIN_WAIT_2 forever. */
+ if (ssk->state == TCPS_FIN_WAIT_2)
+ sdp_2msl_wait(ssk);
+ else
+ soisdisconnected(ssk->socket);
+ }
+}
+
+static void
+sdp_output_disconnect(struct sdp_sock *ssk)
+{
+
+ SDP_WLOCK_ASSERT(ssk);
+ callout_reset(&ssk->keep2msl, SDP_FIN_WAIT_TIMEOUT,
+ sdp_dreq_timeout, ssk);
+ ssk->flags |= SDP_NEEDFIN | SDP_DREQWAIT;
+ sdp_post_sends(ssk, M_NOWAIT);
+}
+
+/*
+ * Initiate or continue a disconnect.
+ * If embryonic state, just send reset (once).
+ * If in ``let data drain'' option and linger null, just drop.
+ * Otherwise (hard), mark socket disconnecting and drop
+ * current input data; switch states based on user close, and
+ * send segment to peer (with FIN).
+ */
+static void
+sdp_start_disconnect(struct sdp_sock *ssk)
+{
+ struct socket *so;
+ int unread;
+
+ so = ssk->socket;
+ SDP_WLOCK_ASSERT(ssk);
+ sdp_stop_keepalive_timer(so);
+ /*
+ * Neither sdp_closed() nor sdp_drop() should return NULL, as the
+ * socket is still open.
+ */
+ if (ssk->state < TCPS_ESTABLISHED) {
+ ssk = sdp_closed(ssk);
+ KASSERT(ssk != NULL,
+ ("sdp_start_disconnect: sdp_close() returned NULL"));
+ } else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
+ ssk = sdp_drop(ssk, 0);
+ KASSERT(ssk != NULL,
+ ("sdp_start_disconnect: sdp_drop() returned NULL"));
+ } else {
+ soisdisconnecting(so);
+ unread = so->so_rcv.sb_cc;
+ sbflush(&so->so_rcv);
+ sdp_usrclosed(ssk);
+ if (!(ssk->flags & SDP_DROPPED)) {
+ if (unread)
+ sdp_output_reset(ssk);
+ else
+ sdp_output_disconnect(ssk);
+ }
+ }
+}
+
+/*
+ * User initiated disconnect.
+ */
+static int
+sdp_disconnect(struct socket *so)
+{
+ struct sdp_sock *ssk;
+ int error = 0;
+
+ ssk = sdp_sk(so);
+ SDP_WLOCK(ssk);
+ if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
+ error = ECONNRESET;
+ goto out;
+ }
+ sdp_start_disconnect(ssk);
+out:
+ SDP_WUNLOCK(ssk);
+ return (error);
+}
+
+/*
+ * Accept a connection. Essentially all the work is done at higher levels;
+ * just return the address of the peer, storing through addr.
+ *
+ *
+ * XXX This is broken XXX
+ *
+ * The rationale for acquiring the sdp lock here is somewhat complicated,
+ * and is described in detail in the commit log entry for r175612. Acquiring
+ * it delays an accept(2) racing with sonewconn(), which inserts the socket
+ * before the address/port fields are initialized. A better fix would
+ * prevent the socket from being placed in the listen queue until all fields
+ * are fully initialized.
+ */
+static int
+sdp_accept(struct socket *so, struct sockaddr **nam)
+{
+ struct sdp_sock *ssk = NULL;
+ struct in_addr addr;
+ in_port_t port;
+ int error;
+
+ if (so->so_state & SS_ISDISCONNECTED)
+ return (ECONNABORTED);
+
+ port = 0;
+ addr.s_addr = 0;
+ error = 0;
+ ssk = sdp_sk(so);
+ SDP_WLOCK(ssk);
+ if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
+ error = ECONNABORTED;
+ goto out;
+ }
+ port = ssk->fport;
+ addr.s_addr = ssk->faddr;
+out:
+ SDP_WUNLOCK(ssk);
+ if (error == 0)
+ *nam = sdp_sockaddr(port, &addr);
+ return error;
+}
+
+/*
+ * Mark the connection as being incapable of further output.
+ */
+static int
+sdp_shutdown(struct socket *so)
+{
+ int error = 0;
+ struct sdp_sock *ssk;
+
+ ssk = sdp_sk(so);
+ SDP_WLOCK(ssk);
+ if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
+ error = ECONNRESET;
+ goto out;
+ }
+ socantsendmore(so);
+ sdp_usrclosed(ssk);
+ if (!(ssk->flags & SDP_DROPPED))
+ sdp_output_disconnect(ssk);
+
+out:
+ SDP_WUNLOCK(ssk);
+
+ return (error);
+}
+
+static void
+sdp_append(struct sdp_sock *ssk, struct sockbuf *sb, struct mbuf *mb, int cnt)
+{
+ struct mbuf *n;
+ int ncnt;
+
+ SOCKBUF_LOCK_ASSERT(sb);
+ SBLASTRECORDCHK(sb)
+ KASSERT(mb->m_flags & M_PKTHDR,
+ ("sdp_append: %p Missing packet header.\n", mb));
+ n = sb->sb_lastrecord;
+ /*
+ * If the queue is empty just set all pointers and proceed.
+ */
+ if (n == NULL) {
+ sb->sb_lastrecord = sb->sb_mb = sb->sb_sndptr = mb;
+ for (; mb; mb = mb->m_next) {
+ sb->sb_mbtail = mb;
+ sballoc(sb, mb);
+ }
+ return;
+ }
+ /*
+ * Count the number of mbufs in the current tail.
+ */
+ for (ncnt = 0; n->m_next; n = n->m_next)
+ ncnt++;
+ n = sb->sb_lastrecord;
+ /*
+ * If the two chains can fit in a single sdp packet and
+ * the last record has not been sent yet (WRITABLE) coalesce
+ * them. The lastrecord remains the same but we must strip the
+ * packet header and then let sbcompress do the hard part.
+ */
+ if (M_WRITABLE(n) && ncnt + cnt < SDP_MAX_SEND_SGES &&
+ n->m_pkthdr.len + mb->m_pkthdr.len - SDP_HEAD_SIZE <
+ ssk->xmit_size_goal) {
+ m_adj(mb, SDP_HEAD_SIZE);
+ n->m_pkthdr.len += mb->m_pkthdr.len;
+ n->m_flags |= mb->m_flags & (M_PUSH | M_URG);
+ m_demote(mb, 1);
+ sbcompress(sb, mb, sb->sb_mbtail);
+ return;
+ }
+ /*
+ * Not compressible, just append to the end and adjust counters.
+ */
+ sb->sb_lastrecord->m_flags |= M_PUSH;
+ sb->sb_lastrecord->m_nextpkt = mb;
+ sb->sb_lastrecord = mb;
+ if (sb->sb_sndptr == NULL)
+ sb->sb_sndptr = mb;
+ for (; mb; mb = mb->m_next) {
+ sb->sb_mbtail = mb;
+ sballoc(sb, mb);
+ }
+}
+
+/*
+ * Do a send by putting data in output queue and updating urgent
+ * marker if URG set. Possibly send more data. Unlike the other
+ * pru_*() routines, the mbuf chains are our responsibility. We
+ * must either enqueue them or free them. The other pru_* routines
+ * generally are caller-frees.
+ *
+ * This comes from sendfile, normal sends will come from sdp_sosend().
+ */
+static int
+sdp_send(struct socket *so, int flags, struct mbuf *m,
+ struct sockaddr *nam, struct mbuf *control, struct thread *td)
+{
+ struct sdp_sock *ssk;
+ struct mbuf *n;
+ int error;
+ int cnt;
+
+ error = 0;
+ ssk = sdp_sk(so);
+ KASSERT(m->m_flags & M_PKTHDR,
+ ("sdp_send: %p no packet header", m));
+ M_PREPEND(m, SDP_HEAD_SIZE, M_WAIT);
+ mtod(m, struct sdp_bsdh *)->mid = SDP_MID_DATA;
+ for (n = m, cnt = 0; n->m_next; n = n->m_next)
+ cnt++;
+ if (cnt > SDP_MAX_SEND_SGES) {
+ n = m_collapse(m, M_WAIT, SDP_MAX_SEND_SGES);
+ if (n == NULL) {
+ m_freem(m);
+ return (EMSGSIZE);
+ }
+ m = n;
+ for (cnt = 0; n->m_next; n = n->m_next)
+ cnt++;
+ }
+ SDP_WLOCK(ssk);
+ if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
+ if (control)
+ m_freem(control);
+ if (m)
+ m_freem(m);
+ error = ECONNRESET;
+ goto out;
+ }
+ if (control) {
+ /* SDP doesn't support control messages. */
+ if (control->m_len) {
+ m_freem(control);
+ if (m)
+ m_freem(m);
+ error = EINVAL;
+ goto out;
+ }
+ m_freem(control); /* empty control, just free it */
+ }
+ if (!(flags & PRUS_OOB)) {
+ SOCKBUF_LOCK(&so->so_snd);
+ sdp_append(ssk, &so->so_snd, m, cnt);
+ SOCKBUF_UNLOCK(&so->so_snd);
+ if (nam && ssk->state < TCPS_SYN_SENT) {
+ /*
+ * Do implied connect if not yet connected.
+ */
+ error = sdp_start_connect(ssk, nam, td);
+ if (error)
+ goto out;
+ }
+ if (flags & PRUS_EOF) {
+ /*
+ * Close the send side of the connection after
+ * the data is sent.
+ */
+ socantsendmore(so);
+ sdp_usrclosed(ssk);
+ if (!(ssk->flags & SDP_DROPPED))
+ sdp_output_disconnect(ssk);
+ } else if (!(ssk->flags & SDP_DROPPED) &&
+ !(flags & PRUS_MORETOCOME))
+ sdp_post_sends(ssk, M_NOWAIT);
+ SDP_WUNLOCK(ssk);
+ return (0);
+ } else {
+ SOCKBUF_LOCK(&so->so_snd);
+ if (sbspace(&so->so_snd) < -512) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ m_freem(m);
+ error = ENOBUFS;
+ goto out;
+ }
+ /*
+ * According to RFC961 (Assigned Protocols),
+ * the urgent pointer points to the last octet
+ * of urgent data. We continue, however,
+ * to consider it to indicate the first octet
+ * of data past the urgent section.
+ * Otherwise, snd_up should be one lower.
+ */
+ m->m_flags |= M_URG | M_PUSH;
+ sdp_append(ssk, &so->so_snd, m, cnt);
+ SOCKBUF_UNLOCK(&so->so_snd);
+ if (nam && ssk->state < TCPS_SYN_SENT) {
+ /*
+ * Do implied connect if not yet connected.
+ */
+ error = sdp_start_connect(ssk, nam, td);
+ if (error)
+ goto out;
+ }
+ sdp_post_sends(ssk, M_NOWAIT);
+ SDP_WUNLOCK(ssk);
+ return (0);
+ }
+out:
+ SDP_WUNLOCK(ssk);
+ return (error);
+}
+
+#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
+
+/*
+ * Send on a socket. If send must go all at once and message is larger than
+ * send buffering, then hard error. Lock against other senders. If must go
+ * all at once and not enough room now, then inform user that this would
+ * block and do nothing. Otherwise, if nonblocking, send as much as
+ * possible. The data to be sent is described by "uio" if nonzero, otherwise
+ * by the mbuf chain "top" (which must be null if uio is not). Data provided
+ * in mbuf chain must be small enough to send all at once.
+ *
+ * Returns nonzero on error, timeout or signal; callers must check for short
+ * counts if EINTR/ERESTART are returned. Data and control buffers are freed
+ * on return.
+ */
+static int
+sdp_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
+ struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
+{
+ struct sdp_sock *ssk;
+ long space, resid;
+ int atomic;
+ int error;
+ int copy;
+
+ if (uio != NULL)
+ resid = uio->uio_resid;
+ else
+ resid = top->m_pkthdr.len;
+ atomic = top != NULL;
+ if (control != NULL) {
+ if (control->m_len) {
+ m_freem(control);
+ if (top)
+ m_freem(top);
+ return (EINVAL);
+ }
+ m_freem(control);
+ control = NULL;
+ }
+ /*
+ * In theory resid should be unsigned. However, space must be
+ * signed, as it might be less than 0 if we over-committed, and we
+ * must use a signed comparison of space and resid. On the other
+ * hand, a negative resid causes us to loop sending 0-length
+ * segments to the protocol.
+ *
+ * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
+ * type sockets since that's an error.
+ */
+ if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
+ error = EINVAL;
+ goto out;
+ }
+ if (td != NULL)
+ td->td_ru.ru_msgsnd++;
+
+ ssk = sdp_sk(so);
+ error = sblock(&so->so_snd, SBLOCKWAIT(flags));
+ if (error)
+ goto out;
+
+restart:
+ do {
+ SOCKBUF_LOCK(&so->so_snd);
+ if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ error = EPIPE;
+ goto release;
+ }
+ if (so->so_error) {
+ error = so->so_error;
+ so->so_error = 0;
+ SOCKBUF_UNLOCK(&so->so_snd);
+ goto release;
+ }
+ if ((so->so_state & SS_ISCONNECTED) == 0 && addr == NULL) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ error = ENOTCONN;
+ goto release;
+ }
+ space = sbspace(&so->so_snd);
+ if (flags & MSG_OOB)
+ space += 1024;
+ if (atomic && resid > ssk->xmit_size_goal - SDP_HEAD_SIZE) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ error = EMSGSIZE;
+ goto release;
+ }
+ if (space < resid &&
+ (atomic || space < so->so_snd.sb_lowat)) {
+ if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ error = EWOULDBLOCK;
+ goto release;
+ }
+ error = sbwait(&so->so_snd);
+ SOCKBUF_UNLOCK(&so->so_snd);
+ if (error)
+ goto release;
+ goto restart;
+ }
+ SOCKBUF_UNLOCK(&so->so_snd);
+ do {
+ if (uio == NULL) {
+ resid = 0;
+ if (flags & MSG_EOR)
+ top->m_flags |= M_EOR;
+ } else {
+ /*
+ * Copy the data from userland into a mbuf
+ * chain. If no data is to be copied in,
+ * a single empty mbuf is returned.
+ */
+ copy = min(space,
+ ssk->xmit_size_goal - SDP_HEAD_SIZE);
+ top = m_uiotombuf(uio, M_WAITOK, copy,
+ 0, M_PKTHDR |
+ ((flags & MSG_EOR) ? M_EOR : 0));
+ if (top == NULL) {
+ /* only possible error */
+ error = EFAULT;
+ goto release;
+ }
+ space -= resid - uio->uio_resid;
+ resid = uio->uio_resid;
+ }
+ /*
+ * XXX all the SBS_CANTSENDMORE checks previously
+ * done could be out of date after dropping the
+ * socket lock.
+ */
+ error = sdp_send(so, (flags & MSG_OOB) ? PRUS_OOB :
+ /*
+ * Set EOF on the last send if the user specified
+ * MSG_EOF.
+ */
+ ((flags & MSG_EOF) && (resid <= 0)) ? PRUS_EOF :
+ /* If there is more to send set PRUS_MORETOCOME. */
+ (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
+ top, addr, NULL, td);
+ top = NULL;
+ if (error)
+ goto release;
+ } while (resid && space > 0);
+ } while (resid);
+
+release:
+ sbunlock(&so->so_snd);
+out:
+ if (top != NULL)
+ m_freem(top);
+ return (error);
+}
+
+/*
+ * The part of soreceive() that implements reading non-inline out-of-band
+ * data from a socket. For more complete comments, see soreceive(), from
+ * which this code originated.
+ *
+ * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
+ * unable to return an mbuf chain to the caller.
+ */
+static int
+soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
+{
+ struct protosw *pr = so->so_proto;
+ struct mbuf *m;
+ int error;
+
+ KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
+
+ m = m_get(M_WAIT, MT_DATA);
+ error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
+ if (error)
+ goto bad;
+ do {
+ error = uiomove(mtod(m, void *),
+ (int) min(uio->uio_resid, m->m_len), uio);
+ m = m_free(m);
+ } while (uio->uio_resid && error == 0 && m);
+bad:
+ if (m != NULL)
+ m_freem(m);
+ return (error);
+}
+
+/*
+ * Optimized version of soreceive() for stream (TCP) sockets.
+ */
+static int
+sdp_sorecv(struct socket *so, struct sockaddr **psa, struct uio *uio,
+ struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
+{
+ int len = 0, error = 0, flags, oresid;
+ struct sockbuf *sb;
+ struct mbuf *m, *n = NULL;
+ struct sdp_sock *ssk;
+
+ /* We only do stream sockets. */
+ if (so->so_type != SOCK_STREAM)
+ return (EINVAL);
+ if (psa != NULL)
+ *psa = NULL;
+ if (controlp != NULL)
+ return (EINVAL);
+ if (flagsp != NULL)
+ flags = *flagsp &~ MSG_EOR;
+ else
+ flags = 0;
+ if (flags & MSG_OOB)
+ return (soreceive_rcvoob(so, uio, flags));
+ if (mp0 != NULL)
+ *mp0 = NULL;
+
+ sb = &so->so_rcv;
+ ssk = sdp_sk(so);
+
+ /* Prevent other readers from entering the socket. */
+ error = sblock(sb, SBLOCKWAIT(flags));
+ if (error)
+ goto out;
+ SOCKBUF_LOCK(sb);
+
+ /* Easy one, no space to copyout anything. */
+ if (uio->uio_resid == 0) {
+ error = EINVAL;
+ goto out;
+ }
+ oresid = uio->uio_resid;
+
+ /* We will never ever get anything unless we are connected. */
+ if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
+ /* When disconnecting there may be still some data left. */
+ if (sb->sb_cc > 0)
+ goto deliver;
+ if (!(so->so_state & SS_ISDISCONNECTED))
+ error = ENOTCONN;
+ goto out;
+ }
+
+ /* Socket buffer is empty and we shall not block. */
+ if (sb->sb_cc == 0 &&
+ ((sb->sb_flags & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
+ error = EAGAIN;
+ goto out;
+ }
+
+restart:
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+
+ /* Abort if socket has reported problems. */
+ if (so->so_error) {
+ if (sb->sb_cc > 0)
+ goto deliver;
+ if (oresid > uio->uio_resid)
+ goto out;
+ error = so->so_error;
+ if (!(flags & MSG_PEEK))
+ so->so_error = 0;
+ goto out;
+ }
+
+ /* Door is closed. Deliver what is left, if any. */
+ if (sb->sb_state & SBS_CANTRCVMORE) {
+ if (sb->sb_cc > 0)
+ goto deliver;
+ else
+ goto out;
+ }
+
+ /* Socket buffer got some data that we shall deliver now. */
+ if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) &&
+ ((sb->sb_flags & SS_NBIO) ||
+ (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
+ sb->sb_cc >= sb->sb_lowat ||
+ sb->sb_cc >= uio->uio_resid ||
+ sb->sb_cc >= sb->sb_hiwat) ) {
+ goto deliver;
+ }
+
+ /* On MSG_WAITALL we must wait until all data or error arrives. */
+ if ((flags & MSG_WAITALL) &&
+ (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_lowat))
+ goto deliver;
+
+ /*
+ * Wait and block until (more) data comes in.
+ * NB: Drops the sockbuf lock during wait.
+ */
+ error = sbwait(sb);
+ if (error)
+ goto out;
+ goto restart;
+
+deliver:
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+ KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__));
+ KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
+
+ /* Statistics. */
+ if (uio->uio_td)
+ uio->uio_td->td_ru.ru_msgrcv++;
+
+ /* Fill uio until full or current end of socket buffer is reached. */
+ len = min(uio->uio_resid, sb->sb_cc);
+ if (mp0 != NULL) {
+ /* Dequeue as many mbufs as possible. */
+ if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
+ for (*mp0 = m = sb->sb_mb;
+ m != NULL && m->m_len <= len;
+ m = m->m_next) {
+ len -= m->m_len;
+ uio->uio_resid -= m->m_len;
+ sbfree(sb, m);
+ n = m;
+ }
+ sb->sb_mb = m;
+ if (sb->sb_mb == NULL)
+ SB_EMPTY_FIXUP(sb);
+ n->m_next = NULL;
+ }
+ /* Copy the remainder. */
+ if (len > 0) {
+ KASSERT(sb->sb_mb != NULL,
+ ("%s: len > 0 && sb->sb_mb empty", __func__));
+
+ m = m_copym(sb->sb_mb, 0, len, M_DONTWAIT);
+ if (m == NULL)
+ len = 0; /* Don't flush data from sockbuf. */
+ else
+ uio->uio_resid -= m->m_len;
+ if (*mp0 != NULL)
+ n->m_next = m;
+ else
+ *mp0 = m;
+ if (*mp0 == NULL) {
+ error = ENOBUFS;
+ goto out;
+ }
+ }
+ } else {
+ /* NB: Must unlock socket buffer as uiomove may sleep. */
+ SOCKBUF_UNLOCK(sb);
+ error = m_mbuftouio(uio, sb->sb_mb, len);
+ SOCKBUF_LOCK(sb);
+ if (error)
+ goto out;
+ }
+ SBLASTRECORDCHK(sb);
+ SBLASTMBUFCHK(sb);
+
+ /*
+ * Remove the delivered data from the socket buffer unless we
+ * were only peeking.
+ */
+ if (!(flags & MSG_PEEK)) {
+ if (len > 0)
+ sbdrop_locked(sb, len);
+
+ /* Notify protocol that we drained some data. */
+ SOCKBUF_UNLOCK(sb);
+ SDP_WLOCK(ssk);
+ sdp_do_posts(ssk);
+ SDP_WUNLOCK(ssk);
+ SOCKBUF_LOCK(sb);
+ }
+
+ /*
+ * For MSG_WAITALL we may have to loop again and wait for
+ * more data to come in.
+ */
+ if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
+ goto restart;
+out:
+ SOCKBUF_LOCK_ASSERT(sb);
+ SBLASTRECORDCHK(sb);
+ SBLASTMBUFCHK(sb);
+ SOCKBUF_UNLOCK(sb);
+ sbunlock(sb);
+ return (error);
+}
+
+/*
+ * Abort is used to teardown a connection typically while sitting in
+ * the accept queue.
+ */
+void
+sdp_abort(struct socket *so)
+{
+ struct sdp_sock *ssk;
+
+ ssk = sdp_sk(so);
+ SDP_WLOCK(ssk);
+ /*
+ * If we have not yet dropped, do it now.
+ */
+ if (!(ssk->flags & SDP_TIMEWAIT) &&
+ !(ssk->flags & SDP_DROPPED))
+ sdp_drop(ssk, ECONNABORTED);
+ KASSERT(ssk->flags & SDP_DROPPED, ("sdp_abort: %p not dropped 0x%X",
+ ssk, ssk->flags));
+ SDP_WUNLOCK(ssk);
+}
+
+/*
+ * Close a SDP socket and initiate a friendly disconnect.
+ */
+static void
+sdp_close(struct socket *so)
+{
+ struct sdp_sock *ssk;
+
+ ssk = sdp_sk(so);
+ SDP_WLOCK(ssk);
+ /*
+ * If we have not yet dropped, do it now.
+ */
+ if (!(ssk->flags & SDP_TIMEWAIT) &&
+ !(ssk->flags & SDP_DROPPED))
+ sdp_start_disconnect(ssk);
+
+ /*
+ * If we've still not dropped let the socket layer know we're
+ * holding on to the socket and pcb for a while.
+ */
+ if (!(ssk->flags & SDP_DROPPED)) {
+ SOCK_LOCK(so);
+ so->so_state |= SS_PROTOREF;
+ SOCK_UNLOCK(so);
+ ssk->flags |= SDP_SOCKREF;
+ }
+ SDP_WUNLOCK(ssk);
+}
+
+/*
+ * User requests out-of-band data.
+ */
+static int
+sdp_rcvoob(struct socket *so, struct mbuf *m, int flags)
+{
+ int error = 0;
+ struct sdp_sock *ssk;
+
+ ssk = sdp_sk(so);
+ SDP_WLOCK(ssk);
+ if (!rx_ring_trylock(&ssk->rx_ring)) {
+ SDP_WUNLOCK(ssk);
+ return (ECONNRESET);
+ }
+ if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
+ error = ECONNRESET;
+ goto out;
+ }
+ if ((so->so_oobmark == 0 &&
+ (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) ||
+ so->so_options & SO_OOBINLINE ||
+ ssk->oobflags & SDP_HADOOB) {
+ error = EINVAL;
+ goto out;
+ }
+ if ((ssk->oobflags & SDP_HAVEOOB) == 0) {
+ error = EWOULDBLOCK;
+ goto out;
+ }
+ m->m_len = 1;
+ *mtod(m, caddr_t) = ssk->iobc;
+ if ((flags & MSG_PEEK) == 0)
+ ssk->oobflags ^= (SDP_HAVEOOB | SDP_HADOOB);
+out:
+ rx_ring_unlock(&ssk->rx_ring);
+ SDP_WUNLOCK(ssk);
+ return (error);
+}
+
+void
+sdp_urg(struct sdp_sock *ssk, struct mbuf *mb)
+{
+ struct mbuf *m;
+ struct socket *so;
+
+ so = ssk->socket;
+ if (so == NULL)
+ return;
+
+ so->so_oobmark = so->so_rcv.sb_cc + mb->m_pkthdr.len - 1;
+ sohasoutofband(so);
+ ssk->oobflags &= ~(SDP_HAVEOOB | SDP_HADOOB);
+ if (!(so->so_options & SO_OOBINLINE)) {
+ for (m = mb; m->m_next != NULL; m = m->m_next);
+ ssk->iobc = *(mtod(m, char *) + m->m_len - 1);
+ ssk->oobflags |= SDP_HAVEOOB;
+ m->m_len--;
+ mb->m_pkthdr.len--;
+ }
+}
+
+/*
+ * Notify a sdp socket of an asynchronous error.
+ *
+ * Do not wake up user since there currently is no mechanism for
+ * reporting soft errors (yet - a kqueue filter may be added).
+ */
+struct sdp_sock *
+sdp_notify(struct sdp_sock *ssk, int error)
+{
+
+ SDP_WLOCK_ASSERT(ssk);
+
+ if ((ssk->flags & SDP_TIMEWAIT) ||
+ (ssk->flags & SDP_DROPPED))
+ return (ssk);
+
+ /*
+ * Ignore some errors if we are hooked up.
+ */
+ if (ssk->state == TCPS_ESTABLISHED &&
+ (error == EHOSTUNREACH || error == ENETUNREACH ||
+ error == EHOSTDOWN))
+ return (ssk);
+ ssk->softerror = error;
+ return sdp_drop(ssk, error);
+}
+
+static void
+sdp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
+{
+ struct in_addr faddr;
+
+ faddr = ((struct sockaddr_in *)sa)->sin_addr;
+ if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
+ return;
+
+ sdp_pcbnotifyall(faddr, inetctlerrmap[cmd], sdp_notify);
+}
+
+static int
+sdp_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp,
+ struct thread *td)
+{
+ return (EOPNOTSUPP);
+}
+
+static void
+sdp_keepalive_timeout(void *data)
+{
+ struct sdp_sock *ssk;
+
+ ssk = data;
+ /* Callout canceled. */
+ if (!callout_active(&ssk->keep2msl))
+ return;
+ /* Callout rescheduled as a different kind of timer. */
+ if (callout_pending(&ssk->keep2msl))
+ goto out;
+ callout_deactivate(&ssk->keep2msl);
+ if (ssk->flags & SDP_DROPPED ||
+ (ssk->socket->so_options & SO_KEEPALIVE) == 0)
+ goto out;
+ sdp_post_keepalive(ssk);
+ callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
+ sdp_keepalive_timeout, ssk);
+out:
+ SDP_WUNLOCK(ssk);
+}
+
+
+void
+sdp_start_keepalive_timer(struct socket *so)
+{
+ struct sdp_sock *ssk;
+
+ ssk = sdp_sk(so);
+ if (!callout_pending(&ssk->keep2msl))
+ callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
+ sdp_keepalive_timeout, ssk);
+}
+
+static void
+sdp_stop_keepalive_timer(struct socket *so)
+{
+ struct sdp_sock *ssk;
+
+ ssk = sdp_sk(so);
+ callout_stop(&ssk->keep2msl);
+}
+
+/*
+ * sdp_ctloutput() must drop the inpcb lock before performing copyin on
+ * socket option arguments. When it re-acquires the lock after the copy, it
+ * has to revalidate that the connection is still valid for the socket
+ * option.
+ */
+#define SDP_WLOCK_RECHECK(inp) do { \
+ SDP_WLOCK(ssk); \
+ if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { \
+ SDP_WUNLOCK(ssk); \
+ return (ECONNRESET); \
+ } \
+} while(0)
+
+static int
+sdp_ctloutput(struct socket *so, struct sockopt *sopt)
+{
+ int error, opt, optval;
+ struct sdp_sock *ssk;
+
+ error = 0;
+ ssk = sdp_sk(so);
+ if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_KEEPALIVE) {
+ SDP_WLOCK(ssk);
+ if (so->so_options & SO_KEEPALIVE)
+ sdp_start_keepalive_timer(so);
+ else
+ sdp_stop_keepalive_timer(so);
+ SDP_WUNLOCK(ssk);
+ }
+ if (sopt->sopt_level != IPPROTO_TCP)
+ return (error);
+
+ SDP_WLOCK(ssk);
+ if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
+ SDP_WUNLOCK(ssk);
+ return (ECONNRESET);
+ }
+
+ switch (sopt->sopt_dir) {
+ case SOPT_SET:
+ switch (sopt->sopt_name) {
+ case TCP_NODELAY:
+ SDP_WUNLOCK(ssk);
+ error = sooptcopyin(sopt, &optval, sizeof optval,
+ sizeof optval);
+ if (error)
+ return (error);
+
+ SDP_WLOCK_RECHECK(ssk);
+ opt = SDP_NODELAY;
+ if (optval)
+ ssk->flags |= opt;
+ else
+ ssk->flags &= ~opt;
+ sdp_do_posts(ssk);
+ SDP_WUNLOCK(ssk);
+ break;
+
+ default:
+ SDP_WUNLOCK(ssk);
+ error = ENOPROTOOPT;
+ break;
+ }
+ break;
+
+ case SOPT_GET:
+ switch (sopt->sopt_name) {
+ case TCP_NODELAY:
+ optval = ssk->flags & SDP_NODELAY;
+ SDP_WUNLOCK(ssk);
+ error = sooptcopyout(sopt, &optval, sizeof optval);
+ break;
+ default:
+ SDP_WUNLOCK(ssk);
+ error = ENOPROTOOPT;
+ break;
+ }
+ break;
+ }
+ return (error);
+}
+#undef SDP_WLOCK_RECHECK
+
+int sdp_mod_count = 0;
+int sdp_mod_usec = 0;
+
+void
+sdp_set_default_moderation(struct sdp_sock *ssk)
+{
+ if (sdp_mod_count <= 0 || sdp_mod_usec <= 0)
+ return;
+ ib_modify_cq(ssk->rx_ring.cq, sdp_mod_count, sdp_mod_usec);
+}
+
+
+static void
+sdp_dev_add(struct ib_device *device)
+{
+ struct ib_fmr_pool_param param;
+ struct sdp_device *sdp_dev;
+
+ sdp_dev = malloc(sizeof(*sdp_dev), M_SDP, M_WAITOK | M_ZERO);
+ sdp_dev->pd = ib_alloc_pd(device);
+ if (IS_ERR(sdp_dev->pd))
+ goto out_pd;
+ sdp_dev->mr = ib_get_dma_mr(sdp_dev->pd, IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(sdp_dev->mr))
+ goto out_mr;
+ memset(&param, 0, sizeof param);
+ param.max_pages_per_fmr = SDP_FMR_SIZE;
+ param.page_shift = PAGE_SHIFT;
+ param.access = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ);
+ param.pool_size = SDP_FMR_POOL_SIZE;
+ param.dirty_watermark = SDP_FMR_DIRTY_SIZE;
+ param.cache = 1;
+ sdp_dev->fmr_pool = ib_create_fmr_pool(sdp_dev->pd, &param);
+ if (IS_ERR(sdp_dev->fmr_pool))
+ goto out_fmr;
+ ib_set_client_data(device, &sdp_client, sdp_dev);
+ return;
+
+out_fmr:
+ ib_dereg_mr(sdp_dev->mr);
+out_mr:
+ ib_dealloc_pd(sdp_dev->pd);
+out_pd:
+ free(sdp_dev, M_SDP);
+}
+
+static void
+sdp_dev_rem(struct ib_device *device)
+{
+ struct sdp_device *sdp_dev;
+ struct sdp_sock *ssk;
+
+ SDP_LIST_WLOCK();
+ LIST_FOREACH(ssk, &sdp_list, list) {
+ if (ssk->ib_device != device)
+ continue;
+ SDP_WLOCK(ssk);
+ if ((ssk->flags & SDP_DESTROY) == 0)
+ ssk = sdp_notify(ssk, ECONNRESET);
+ if (ssk)
+ SDP_WUNLOCK(ssk);
+ }
+ SDP_LIST_WUNLOCK();
+ /*
+ * XXX Do I need to wait between these two?
+ */
+ sdp_dev = ib_get_client_data(device, &sdp_client);
+ if (!sdp_dev)
+ return;
+ ib_flush_fmr_pool(sdp_dev->fmr_pool);
+ ib_destroy_fmr_pool(sdp_dev->fmr_pool);
+ ib_dereg_mr(sdp_dev->mr);
+ ib_dealloc_pd(sdp_dev->pd);
+ free(sdp_dev, M_SDP);
+}
+
+struct ib_client sdp_client =
+ { .name = "sdp", .add = sdp_dev_add, .remove = sdp_dev_rem };
+
+
+static int
+sdp_pcblist(SYSCTL_HANDLER_ARGS)
+{
+ int error, n, i;
+ struct sdp_sock *ssk;
+ struct xinpgen xig;
+
+ /*
+ * The process of preparing the TCB list is too time-consuming and
+ * resource-intensive to repeat twice on every request.
+ */
+ if (req->oldptr == NULL) {
+ n = sdp_count;
+ n += imax(n / 8, 10);
+ req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb);
+ return (0);
+ }
+
+ if (req->newptr != NULL)
+ return (EPERM);
+
+ /*
+ * OK, now we're committed to doing something.
+ */
+ SDP_LIST_RLOCK();
+ n = sdp_count;
+ SDP_LIST_RUNLOCK();
+
+ error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
+ + n * sizeof(struct xtcpcb));
+ if (error != 0)
+ return (error);
+
+ xig.xig_len = sizeof xig;
+ xig.xig_count = n;
+ xig.xig_gen = 0;
+ xig.xig_sogen = so_gencnt;
+ error = SYSCTL_OUT(req, &xig, sizeof xig);
+ if (error)
+ return (error);
+
+ SDP_LIST_RLOCK();
+ for (ssk = LIST_FIRST(&sdp_list), i = 0;
+ ssk != NULL && i < n; ssk = LIST_NEXT(ssk, list)) {
+ struct xtcpcb xt;
+
+ SDP_RLOCK(ssk);
+ if (ssk->flags & SDP_TIMEWAIT) {
+ if (ssk->cred != NULL)
+ error = cr_cansee(req->td->td_ucred,
+ ssk->cred);
+ else
+ error = EINVAL; /* Skip this inp. */
+ } else if (ssk->socket)
+ error = cr_canseesocket(req->td->td_ucred,
+ ssk->socket);
+ else
+ error = EINVAL;
+ if (error) {
+ error = 0;
+ goto next;
+ }
+
+ bzero(&xt, sizeof(xt));
+ xt.xt_len = sizeof xt;
+ xt.xt_inp.inp_gencnt = 0;
+ xt.xt_inp.inp_vflag = INP_IPV4;
+ memcpy(&xt.xt_inp.inp_laddr, &ssk->laddr, sizeof(ssk->laddr));
+ xt.xt_inp.inp_lport = ssk->lport;
+ memcpy(&xt.xt_inp.inp_faddr, &ssk->faddr, sizeof(ssk->faddr));
+ xt.xt_inp.inp_fport = ssk->fport;
+ xt.xt_tp.t_state = ssk->state;
+ if (ssk->socket != NULL)
+ sotoxsocket(ssk->socket, &xt.xt_socket);
+ else
+ bzero(&xt.xt_socket, sizeof xt.xt_socket);
+ xt.xt_socket.xso_protocol = IPPROTO_TCP;
+ SDP_RUNLOCK(ssk);
+ error = SYSCTL_OUT(req, &xt, sizeof xt);
+ if (error)
+ break;
+ i++;
+ continue;
+next:
+ SDP_RUNLOCK(ssk);
+ }
+ if (!error) {
+ /*
+ * Give the user an updated idea of our state.
+ * If the generation differs from what we told
+ * her before, she knows that something happened
+ * while we were processing this request, and it
+ * might be necessary to retry.
+ */
+ xig.xig_gen = 0;
+ xig.xig_sogen = so_gencnt;
+ xig.xig_count = sdp_count;
+ error = SYSCTL_OUT(req, &xig, sizeof xig);
+ }
+ SDP_LIST_RUNLOCK();
+ return (error);
+}
+
+SYSCTL_NODE(_net_inet, -1, sdp, CTLFLAG_RW, 0, "SDP");
+
+SYSCTL_PROC(_net_inet_sdp, TCPCTL_PCBLIST, pcblist,
+ CTLFLAG_RD | CTLTYPE_STRUCT, 0, 0, sdp_pcblist, "S,xtcpcb",
+ "List of active SDP connections");
+
+static void
+sdp_zone_change(void *tag)
+{
+
+ uma_zone_set_max(sdp_zone, maxsockets);
+}
+
+static void
+sdp_init(void)
+{
+
+ LIST_INIT(&sdp_list);
+ sdp_zone = uma_zcreate("sdp_sock", sizeof(struct sdp_sock),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ uma_zone_set_max(sdp_zone, maxsockets);
+ EVENTHANDLER_REGISTER(maxsockets_change, sdp_zone_change, NULL,
+ EVENTHANDLER_PRI_ANY);
+ rx_comp_wq = create_singlethread_workqueue("rx_comp_wq");
+ ib_register_client(&sdp_client);
+}
+
+extern struct domain sdpdomain;
+
+struct pr_usrreqs sdp_usrreqs = {
+ .pru_abort = sdp_abort,
+ .pru_accept = sdp_accept,
+ .pru_attach = sdp_attach,
+ .pru_bind = sdp_bind,
+ .pru_connect = sdp_connect,
+ .pru_control = sdp_control,
+ .pru_detach = sdp_detach,
+ .pru_disconnect = sdp_disconnect,
+ .pru_listen = sdp_listen,
+ .pru_peeraddr = sdp_getpeeraddr,
+ .pru_rcvoob = sdp_rcvoob,
+ .pru_send = sdp_send,
+ .pru_sosend = sdp_sosend,
+ .pru_soreceive = sdp_sorecv,
+ .pru_shutdown = sdp_shutdown,
+ .pru_sockaddr = sdp_getsockaddr,
+ .pru_close = sdp_close,
+};
+
+struct protosw sdpsw[] = {
+{
+ .pr_type = SOCK_STREAM,
+ .pr_domain = &sdpdomain,
+ .pr_protocol = IPPROTO_IP,
+ .pr_flags = PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,
+ .pr_ctlinput = sdp_ctlinput,
+ .pr_ctloutput = sdp_ctloutput,
+ .pr_usrreqs = &sdp_usrreqs
+},
+{
+ .pr_type = SOCK_STREAM,
+ .pr_domain = &sdpdomain,
+ .pr_protocol = IPPROTO_TCP,
+ .pr_flags = PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,
+ .pr_ctlinput = sdp_ctlinput,
+ .pr_ctloutput = sdp_ctloutput,
+ .pr_usrreqs = &sdp_usrreqs
+},
+};
+
+struct domain sdpdomain = {
+ .dom_family = AF_INET_SDP,
+ .dom_name = "SDP",
+ .dom_init = sdp_init,
+ .dom_protosw = sdpsw,
+ .dom_protoswNPROTOSW = &sdpsw[sizeof(sdpsw)/sizeof(sdpsw[0])],
+};
+
+DOMAIN_SET(sdp);
+
+int sdp_debug_level = 1;
+int sdp_data_debug_level = 0;
diff --git a/sys/ofed/drivers/infiniband/ulp/sdp/sdp_proc.c b/sys/ofed/drivers/infiniband/ulp/sdp/sdp_proc.c
new file mode 100644
index 0000000..74bc04a
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/sdp/sdp_proc.c
@@ -0,0 +1,533 @@
+/*
+ * Copyright (c) 2008 Mellanox Technologies Ltd. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/proc_fs.h>
+#include <rdma/sdp_socket.h>
+#include "sdp.h"
+
+#ifdef CONFIG_PROC_FS
+
+#define PROC_SDP_STATS "sdpstats"
+#define PROC_SDP_PERF "sdpprf"
+
+/* just like TCP fs */
+struct sdp_seq_afinfo {
+ struct module *owner;
+ char *name;
+ sa_family_t family;
+ int (*seq_show) (struct seq_file *m, void *v);
+ struct file_operations *seq_fops;
+};
+
+struct sdp_iter_state {
+ sa_family_t family;
+ int num;
+ struct seq_operations seq_ops;
+};
+
+static void *sdp_get_idx(struct seq_file *seq, loff_t pos)
+{
+ int i = 0;
+ struct sdp_sock *ssk;
+
+ if (!list_empty(&sock_list))
+ list_for_each_entry(ssk, &sock_list, sock_list) {
+ if (i == pos)
+ return ssk;
+ i++;
+ }
+
+ return NULL;
+}
+
+static void *sdp_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ void *start = NULL;
+ struct sdp_iter_state *st = seq->private;
+
+ st->num = 0;
+
+ if (!*pos)
+ return SEQ_START_TOKEN;
+
+ spin_lock_irq(&sock_list_lock);
+ start = sdp_get_idx(seq, *pos - 1);
+ if (start)
+ sock_hold((struct socket *)start, SOCK_REF_SEQ);
+ spin_unlock_irq(&sock_list_lock);
+
+ return start;
+}
+
+static void *sdp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct sdp_iter_state *st = seq->private;
+ void *next = NULL;
+
+ spin_lock_irq(&sock_list_lock);
+ if (v == SEQ_START_TOKEN)
+ next = sdp_get_idx(seq, 0);
+ else
+ next = sdp_get_idx(seq, *pos);
+ if (next)
+ sock_hold((struct socket *)next, SOCK_REF_SEQ);
+ spin_unlock_irq(&sock_list_lock);
+
+ *pos += 1;
+ st->num++;
+
+ return next;
+}
+
+static void sdp_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+#define TMPSZ 150
+
+static int sdp_seq_show(struct seq_file *seq, void *v)
+{
+ struct sdp_iter_state *st;
+ struct socket *sk = v;
+ char tmpbuf[TMPSZ + 1];
+ unsigned int dest;
+ unsigned int src;
+ int uid;
+ unsigned long inode;
+ __u16 destp;
+ __u16 srcp;
+ __u32 rx_queue, tx_queue;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(seq, "%-*s\n", TMPSZ - 1,
+ " sl local_address rem_address "
+ "uid inode rx_queue tx_queue state");
+ goto out;
+ }
+
+ st = seq->private;
+
+ dest = inet_sk(sk)->daddr;
+ src = inet_sk(sk)->rcv_saddr;
+ destp = ntohs(inet_sk(sk)->dport);
+ srcp = ntohs(inet_sk(sk)->sport);
+ uid = sock_i_uid(sk);
+ inode = sock_i_ino(sk);
+ rx_queue = rcv_nxt(sdp_sk(sk)) - sdp_sk(sk)->copied_seq;
+ tx_queue = sdp_sk(sk)->write_seq - sdp_sk(sk)->tx_ring.una_seq;
+
+ sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %5d %lu %08X:%08X %X",
+ st->num, src, srcp, dest, destp, uid, inode,
+ rx_queue, tx_queue, sk->sk_state);
+
+ seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
+
+ sock_put(sk, SOCK_REF_SEQ);
+out:
+ return 0;
+}
+
+static int sdp_seq_open(struct inode *inode, struct file *file)
+{
+ struct sdp_seq_afinfo *afinfo = PDE(inode)->data;
+ struct seq_file *seq;
+ struct sdp_iter_state *s;
+ int rc;
+
+ if (unlikely(afinfo == NULL))
+ return -EINVAL;
+
+/* Workaround bogus warning by memtrack */
+#define _kzalloc(size,flags) kzalloc(size,flags)
+#undef kzalloc
+ s = kzalloc(sizeof(*s), GFP_KERNEL);
+#define kzalloc(s,f) _kzalloc(s,f)
+ if (!s)
+ return -ENOMEM;
+ s->family = afinfo->family;
+ s->seq_ops.start = sdp_seq_start;
+ s->seq_ops.next = sdp_seq_next;
+ s->seq_ops.show = afinfo->seq_show;
+ s->seq_ops.stop = sdp_seq_stop;
+
+ rc = seq_open(file, &s->seq_ops);
+ if (rc)
+ goto out_kfree;
+ seq = file->private_data;
+ seq->private = s;
+out:
+ return rc;
+out_kfree:
+ kfree(s);
+ goto out;
+}
+
+
+static struct file_operations sdp_seq_fops;
+static struct sdp_seq_afinfo sdp_seq_afinfo = {
+ .owner = THIS_MODULE,
+ .name = "sdp",
+ .family = AF_INET_SDP,
+ .seq_show = sdp_seq_show,
+ .seq_fops = &sdp_seq_fops,
+};
+
+#ifdef SDPSTATS_ON
+DEFINE_PER_CPU(struct sdpstats, sdpstats);
+
+static void sdpstats_seq_hist(struct seq_file *seq, char *str, u32 *h, int n,
+ int is_log)
+{
+ int i;
+ u32 max = 0;
+
+ seq_printf(seq, "%s:\n", str);
+
+ for (i = 0; i < n; i++) {
+ if (h[i] > max)
+ max = h[i];
+ }
+
+ if (max == 0) {
+ seq_printf(seq, " - all values are 0\n");
+ return;
+ }
+
+ for (i = 0; i < n; i++) {
+ char s[51];
+ int j = 50 * h[i] / max;
+ int val = is_log ? (i == n-1 ? 0 : 1<<i) : i;
+ memset(s, '*', j);
+ s[j] = '\0';
+
+ seq_printf(seq, "%10d | %-50s - %d\n", val, s, h[i]);
+ }
+}
+
+#define SDPSTATS_COUNTER_GET(var) ({ \
+ u32 __val = 0; \
+ unsigned int __i; \
+ for_each_possible_cpu(__i) \
+ __val += per_cpu(sdpstats, __i).var; \
+ __val; \
+})
+
+#define SDPSTATS_HIST_GET(hist, hist_len, sum) ({ \
+ unsigned int __i; \
+ for_each_possible_cpu(__i) { \
+ unsigned int __j; \
+ u32 *h = per_cpu(sdpstats, __i).hist; \
+ for (__j = 0; __j < hist_len; __j++) { \
+ sum[__j] += h[__j]; \
+ } \
+ } \
+})
+
+#define __sdpstats_seq_hist(seq, msg, hist, is_log) ({ \
+ u32 tmp_hist[SDPSTATS_MAX_HIST_SIZE]; \
+ int hist_len = ARRAY_SIZE(__get_cpu_var(sdpstats).hist);\
+ memset(tmp_hist, 0, sizeof(tmp_hist)); \
+ SDPSTATS_HIST_GET(hist, hist_len, tmp_hist); \
+ sdpstats_seq_hist(seq, msg, tmp_hist, hist_len, is_log);\
+})
+
+static int sdpstats_seq_show(struct seq_file *seq, void *v)
+{
+ int i;
+
+ seq_printf(seq, "SDP statistics:\n");
+
+ __sdpstats_seq_hist(seq, "sendmsg_seglen", sendmsg_seglen, 1);
+ __sdpstats_seq_hist(seq, "send_size", send_size, 1);
+ __sdpstats_seq_hist(seq, "credits_before_update",
+ credits_before_update, 0);
+
+ seq_printf(seq, "sdp_sendmsg() calls\t\t: %d\n",
+ SDPSTATS_COUNTER_GET(sendmsg));
+ seq_printf(seq, "bcopy segments \t\t: %d\n",
+ SDPSTATS_COUNTER_GET(sendmsg_bcopy_segment));
+ seq_printf(seq, "bzcopy segments \t\t: %d\n",
+ SDPSTATS_COUNTER_GET(sendmsg_bzcopy_segment));
+ seq_printf(seq, "zcopy segments \t\t: %d\n",
+ SDPSTATS_COUNTER_GET(sendmsg_zcopy_segment));
+ seq_printf(seq, "post_send_credits \t\t: %d\n",
+ SDPSTATS_COUNTER_GET(post_send_credits));
+ seq_printf(seq, "memcpy_count \t\t: %u\n",
+ SDPSTATS_COUNTER_GET(memcpy_count));
+
+ for (i = 0; i < ARRAY_SIZE(__get_cpu_var(sdpstats).post_send); i++) {
+ if (mid2str(i)) {
+ seq_printf(seq, "post_send %-20s\t: %d\n",
+ mid2str(i),
+ SDPSTATS_COUNTER_GET(post_send[i]));
+ }
+ }
+
+ seq_printf(seq, "\n");
+ seq_printf(seq, "post_recv \t\t: %d\n",
+ SDPSTATS_COUNTER_GET(post_recv));
+ seq_printf(seq, "BZCopy poll miss \t\t: %d\n",
+ SDPSTATS_COUNTER_GET(bzcopy_poll_miss));
+ seq_printf(seq, "send_wait_for_mem \t\t: %d\n",
+ SDPSTATS_COUNTER_GET(send_wait_for_mem));
+ seq_printf(seq, "send_miss_no_credits\t\t: %d\n",
+ SDPSTATS_COUNTER_GET(send_miss_no_credits));
+
+ seq_printf(seq, "rx_poll_miss \t\t: %d\n", SDPSTATS_COUNTER_GET(rx_poll_miss));
+ seq_printf(seq, "tx_poll_miss \t\t: %d\n", SDPSTATS_COUNTER_GET(tx_poll_miss));
+ seq_printf(seq, "tx_poll_busy \t\t: %d\n", SDPSTATS_COUNTER_GET(tx_poll_busy));
+ seq_printf(seq, "tx_poll_hit \t\t: %d\n", SDPSTATS_COUNTER_GET(tx_poll_hit));
+
+ seq_printf(seq, "CQ stats:\n");
+ seq_printf(seq, "- RX interrupts\t\t: %d\n", SDPSTATS_COUNTER_GET(rx_int_count));
+ seq_printf(seq, "- TX interrupts\t\t: %d\n", SDPSTATS_COUNTER_GET(tx_int_count));
+
+ seq_printf(seq, "ZCopy stats:\n");
+ seq_printf(seq, "- TX timeout\t\t: %d\n", SDPSTATS_COUNTER_GET(zcopy_tx_timeout));
+ seq_printf(seq, "- TX cross send\t\t: %d\n", SDPSTATS_COUNTER_GET(zcopy_cross_send));
+ seq_printf(seq, "- TX aborted by peer\t: %d\n", SDPSTATS_COUNTER_GET(zcopy_tx_aborted));
+ seq_printf(seq, "- TX error\t\t: %d\n", SDPSTATS_COUNTER_GET(zcopy_tx_error));
+ return 0;
+}
+
+static ssize_t sdpstats_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *offs)
+{
+ int i;
+
+ for_each_possible_cpu(i)
+ memset(&per_cpu(sdpstats, i), 0, sizeof(struct sdpstats));
+ printk(KERN_WARNING "Cleared sdp statistics\n");
+
+ return count;
+}
+
+static int sdpstats_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, sdpstats_seq_show, NULL);
+}
+
+static struct file_operations sdpstats_fops = {
+ .owner = THIS_MODULE,
+ .open = sdpstats_seq_open,
+ .read = seq_read,
+ .write = sdpstats_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+#endif
+
+#ifdef SDP_PROFILING
+struct sdpprf_log sdpprf_log[SDPPRF_LOG_SIZE];
+int sdpprf_log_count;
+
+static unsigned long long start_t;
+
+static int sdpprf_show(struct seq_file *m, void *v)
+{
+ struct sdpprf_log *l = v;
+ unsigned long nsec_rem, t;
+
+ if (!sdpprf_log_count) {
+ seq_printf(m, "No performance logs\n");
+ goto out;
+ }
+
+ t = l->time - start_t;
+ nsec_rem = do_div(t, 1000000000);
+
+ seq_printf(m, "%-6d: [%5lu.%06lu] %-50s - [%d{%d} %d:%d] "
+ "mb: %p %s:%d\n",
+ l->idx, (unsigned long)t, nsec_rem/1000,
+ l->msg, l->pid, l->cpu, l->sk_num, l->sk_dport,
+ l->mb, l->func, l->line);
+out:
+ return 0;
+}
+
+static void *sdpprf_start(struct seq_file *p, loff_t *pos)
+{
+ int idx = *pos;
+
+ if (!*pos) {
+ if (!sdpprf_log_count)
+ return SEQ_START_TOKEN;
+ }
+
+ if (*pos >= MIN(sdpprf_log_count, SDPPRF_LOG_SIZE - 1))
+ return NULL;
+
+ if (sdpprf_log_count >= SDPPRF_LOG_SIZE - 1) {
+ int off = sdpprf_log_count & (SDPPRF_LOG_SIZE - 1);
+ idx = (idx + off) & (SDPPRF_LOG_SIZE - 1);
+
+ }
+
+ if (!start_t)
+ start_t = sdpprf_log[idx].time;
+ return &sdpprf_log[idx];
+}
+
+static void *sdpprf_next(struct seq_file *p, void *v, loff_t *pos)
+{
+ struct sdpprf_log *l = v;
+
+ if (++*pos >= MIN(sdpprf_log_count, SDPPRF_LOG_SIZE - 1))
+ return NULL;
+
+ ++l;
+ if (l - &sdpprf_log[0] >= SDPPRF_LOG_SIZE - 1)
+ return &sdpprf_log[0];
+
+ return l;
+}
+
+static void sdpprf_stop(struct seq_file *p, void *v)
+{
+}
+
+static struct seq_operations sdpprf_ops = {
+ .start = sdpprf_start,
+ .stop = sdpprf_stop,
+ .next = sdpprf_next,
+ .show = sdpprf_show,
+};
+
+static int sdpprf_open(struct inode *inode, struct file *file)
+{
+ int res;
+
+ res = seq_open(file, &sdpprf_ops);
+
+ return res;
+}
+
+static ssize_t sdpprf_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *offs)
+{
+ sdpprf_log_count = 0;
+ printk(KERN_INFO "Cleared sdpprf statistics\n");
+
+ return count;
+}
+
+static struct file_operations sdpprf_fops = {
+ .open = sdpprf_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+ .write = sdpprf_write,
+};
+#endif /* SDP_PROFILING */
+
+int __init sdp_proc_init(void)
+{
+ struct proc_dir_entry *p = NULL;
+#ifdef SDPSTATS_ON
+ struct proc_dir_entry *stats = NULL;
+#endif
+#ifdef SDP_PROFILING
+ struct proc_dir_entry *prof = NULL;
+#endif
+
+ sdp_seq_afinfo.seq_fops->owner = sdp_seq_afinfo.owner;
+ sdp_seq_afinfo.seq_fops->open = sdp_seq_open;
+ sdp_seq_afinfo.seq_fops->read = seq_read;
+ sdp_seq_afinfo.seq_fops->llseek = seq_lseek;
+ sdp_seq_afinfo.seq_fops->release = seq_release_private;
+
+ p = proc_net_fops_create(&init_net, sdp_seq_afinfo.name, S_IRUGO,
+ sdp_seq_afinfo.seq_fops);
+ if (p)
+ p->data = &sdp_seq_afinfo;
+ else
+ goto no_mem;
+
+#ifdef SDPSTATS_ON
+
+ stats = proc_net_fops_create(&init_net, PROC_SDP_STATS,
+ S_IRUGO | S_IWUGO, &sdpstats_fops);
+ if (!stats)
+ goto no_mem_stats;
+
+#endif
+
+#ifdef SDP_PROFILING
+ prof = proc_net_fops_create(&init_net, PROC_SDP_PERF,
+ S_IRUGO | S_IWUGO, &sdpprf_fops);
+ if (!prof)
+ goto no_mem_prof;
+#endif
+
+ return 0;
+
+#ifdef SDP_PROFILING
+no_mem_prof:
+#endif
+
+#ifdef SDPSTATS_ON
+ proc_net_remove(&init_net, PROC_SDP_STATS);
+
+no_mem_stats:
+#endif
+ proc_net_remove(&init_net, sdp_seq_afinfo.name);
+
+no_mem:
+ return -ENOMEM;
+}
+
+void sdp_proc_unregister(void)
+{
+ proc_net_remove(&init_net, sdp_seq_afinfo.name);
+ memset(sdp_seq_afinfo.seq_fops, 0, sizeof(*sdp_seq_afinfo.seq_fops));
+
+#ifdef SDPSTATS_ON
+ proc_net_remove(&init_net, PROC_SDP_STATS);
+#endif
+#ifdef SDP_PROFILING
+ proc_net_remove(&init_net, PROC_SDP_PERF);
+#endif
+}
+
+#else /* CONFIG_PROC_FS */
+
+int __init sdp_proc_init(void)
+{
+ return 0;
+}
+
+void sdp_proc_unregister(void)
+{
+
+}
+#endif /* CONFIG_PROC_FS */
diff --git a/sys/ofed/drivers/infiniband/ulp/sdp/sdp_rx.c b/sys/ofed/drivers/infiniband/ulp/sdp/sdp_rx.c
new file mode 100644
index 0000000..de4b80b
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/sdp/sdp_rx.c
@@ -0,0 +1,783 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies Ltd. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "sdp.h"
+
+SDP_MODPARAM_INT(rcvbuf_initial_size, 32 * 1024,
+ "Receive buffer initial size in bytes.");
+SDP_MODPARAM_SINT(rcvbuf_scale, 0x8,
+ "Receive buffer size scale factor.");
+
+/* Like tcp_fin - called when SDP_MID_DISCONNECT is received */
+static void
+sdp_handle_disconn(struct sdp_sock *ssk)
+{
+
+ sdp_dbg(ssk->socket, "%s\n", __func__);
+
+ SDP_WLOCK_ASSERT(ssk);
+ if (TCPS_HAVERCVDFIN(ssk->state) == 0)
+ socantrcvmore(ssk->socket);
+
+ switch (ssk->state) {
+ case TCPS_SYN_RECEIVED:
+ case TCPS_ESTABLISHED:
+ ssk->state = TCPS_CLOSE_WAIT;
+ break;
+
+ case TCPS_FIN_WAIT_1:
+ /* Received a reply FIN - start Infiniband tear down */
+ sdp_dbg(ssk->socket,
+ "%s: Starting Infiniband tear down sending DREQ\n",
+ __func__);
+
+ sdp_cancel_dreq_wait_timeout(ssk);
+ ssk->qp_active = 0;
+ if (ssk->id) {
+ struct rdma_cm_id *id;
+
+ id = ssk->id;
+ SDP_WUNLOCK(ssk);
+ rdma_disconnect(id);
+ SDP_WLOCK(ssk);
+ } else {
+ sdp_warn(ssk->socket,
+ "%s: ssk->id is NULL\n", __func__);
+ return;
+ }
+ break;
+ case TCPS_TIME_WAIT:
+ /* This is a mutual close situation and we've got the DREQ from
+ the peer before the SDP_MID_DISCONNECT */
+ break;
+ case TCPS_CLOSED:
+ /* FIN arrived after IB teardown started - do nothing */
+ sdp_dbg(ssk->socket, "%s: fin in state %s\n",
+ __func__, sdp_state_str(ssk->state));
+ return;
+ default:
+ sdp_warn(ssk->socket,
+ "%s: FIN in unexpected state. state=%d\n",
+ __func__, ssk->state);
+ break;
+ }
+}
+
+static int
+sdp_post_recv(struct sdp_sock *ssk)
+{
+ struct sdp_buf *rx_req;
+ int i, rc;
+ u64 addr;
+ struct ib_device *dev;
+ struct ib_recv_wr rx_wr = { NULL };
+ struct ib_sge ibsge[SDP_MAX_RECV_SGES];
+ struct ib_sge *sge = ibsge;
+ struct ib_recv_wr *bad_wr;
+ struct mbuf *mb, *m;
+ struct sdp_bsdh *h;
+ int id = ring_head(ssk->rx_ring);
+
+ /* Now, allocate and repost recv */
+ sdp_prf(ssk->socket, mb, "Posting mb");
+ mb = m_getm2(NULL, ssk->recv_bytes, M_NOWAIT, MT_DATA, M_PKTHDR);
+ if (mb == NULL) {
+ /* Retry so we can't stall out with no memory. */
+ if (!rx_ring_posted(ssk))
+ queue_work(rx_comp_wq, &ssk->rx_comp_work);
+ return -1;
+ }
+ for (m = mb; m != NULL; m = m->m_next) {
+ m->m_len = (m->m_flags & M_EXT) ? m->m_ext.ext_size :
+ ((m->m_flags & M_PKTHDR) ? MHLEN : MLEN);
+ mb->m_pkthdr.len += m->m_len;
+ }
+ h = mtod(mb, struct sdp_bsdh *);
+ rx_req = ssk->rx_ring.buffer + (id & (SDP_RX_SIZE - 1));
+ rx_req->mb = mb;
+ dev = ssk->ib_device;
+ for (i = 0; mb != NULL; i++, mb = mb->m_next, sge++) {
+ addr = ib_dma_map_single(dev, mb->m_data, mb->m_len,
+ DMA_TO_DEVICE);
+ /* TODO: proper error handling */
+ BUG_ON(ib_dma_mapping_error(dev, addr));
+ BUG_ON(i >= SDP_MAX_RECV_SGES);
+ rx_req->mapping[i] = addr;
+ sge->addr = addr;
+ sge->length = mb->m_len;
+ sge->lkey = ssk->sdp_dev->mr->lkey;
+ }
+
+ rx_wr.next = NULL;
+ rx_wr.wr_id = id | SDP_OP_RECV;
+ rx_wr.sg_list = ibsge;
+ rx_wr.num_sge = i;
+ rc = ib_post_recv(ssk->qp, &rx_wr, &bad_wr);
+ if (unlikely(rc)) {
+ sdp_warn(ssk->socket, "ib_post_recv failed. status %d\n", rc);
+
+ sdp_cleanup_sdp_buf(ssk, rx_req, DMA_FROM_DEVICE);
+ m_freem(mb);
+
+ sdp_notify(ssk, ECONNRESET);
+
+ return -1;
+ }
+
+ atomic_inc(&ssk->rx_ring.head);
+ SDPSTATS_COUNTER_INC(post_recv);
+
+ return 0;
+}
+
+static inline int
+sdp_post_recvs_needed(struct sdp_sock *ssk)
+{
+ unsigned long bytes_in_process;
+ unsigned long max_bytes;
+ int buffer_size;
+ int posted;
+
+ if (!ssk->qp_active || !ssk->socket)
+ return 0;
+
+ posted = rx_ring_posted(ssk);
+ if (posted >= SDP_RX_SIZE)
+ return 0;
+ if (posted < SDP_MIN_TX_CREDITS)
+ return 1;
+
+ buffer_size = ssk->recv_bytes;
+ max_bytes = max(ssk->socket->so_snd.sb_hiwat,
+ (1 + SDP_MIN_TX_CREDITS) * buffer_size);
+ max_bytes *= rcvbuf_scale;
+ /*
+ * Compute bytes in the receive queue and socket buffer.
+ */
+ bytes_in_process = (posted - SDP_MIN_TX_CREDITS) * buffer_size;
+ bytes_in_process += ssk->socket->so_rcv.sb_cc;
+
+ return bytes_in_process < max_bytes;
+}
+
+static inline void
+sdp_post_recvs(struct sdp_sock *ssk)
+{
+
+ while (sdp_post_recvs_needed(ssk))
+ if (sdp_post_recv(ssk))
+ return;
+}
+
+static inline struct mbuf *
+sdp_sock_queue_rcv_mb(struct socket *sk, struct mbuf *mb)
+{
+ struct sdp_sock *ssk = sdp_sk(sk);
+ struct sdp_bsdh *h;
+
+ h = mtod(mb, struct sdp_bsdh *);
+
+#ifdef SDP_ZCOPY
+ SDP_SKB_CB(mb)->seq = rcv_nxt(ssk);
+ if (h->mid == SDP_MID_SRCAVAIL) {
+ struct sdp_srcah *srcah = (struct sdp_srcah *)(h+1);
+ struct rx_srcavail_state *rx_sa;
+
+ ssk->srcavail_cancel_mseq = 0;
+
+ ssk->rx_sa = rx_sa = RX_SRCAVAIL_STATE(mb) = kzalloc(
+ sizeof(struct rx_srcavail_state), M_NOWAIT);
+
+ rx_sa->mseq = ntohl(h->mseq);
+ rx_sa->used = 0;
+ rx_sa->len = mb_len = ntohl(srcah->len);
+ rx_sa->rkey = ntohl(srcah->rkey);
+ rx_sa->vaddr = be64_to_cpu(srcah->vaddr);
+ rx_sa->flags = 0;
+
+ if (ssk->tx_sa) {
+ sdp_dbg_data(ssk->socket, "got RX SrcAvail while waiting "
+ "for TX SrcAvail. waking up TX SrcAvail"
+ "to be aborted\n");
+ wake_up(sk->sk_sleep);
+ }
+
+ atomic_add(mb->len, &ssk->rcv_nxt);
+ sdp_dbg_data(sk, "queueing SrcAvail. mb_len = %d vaddr = %lld\n",
+ mb_len, rx_sa->vaddr);
+ } else
+#endif
+ {
+ atomic_add(mb->m_pkthdr.len, &ssk->rcv_nxt);
+ }
+
+ m_adj(mb, SDP_HEAD_SIZE);
+ SOCKBUF_LOCK(&sk->so_rcv);
+ if (unlikely(h->flags & SDP_OOB_PRES))
+ sdp_urg(ssk, mb);
+ sbappend_locked(&sk->so_rcv, mb);
+ sorwakeup_locked(sk);
+ return mb;
+}
+
+static int
+sdp_get_recv_bytes(struct sdp_sock *ssk, u32 new_size)
+{
+
+ return MIN(new_size, SDP_MAX_PACKET);
+}
+
+int
+sdp_init_buffers(struct sdp_sock *ssk, u32 new_size)
+{
+
+ ssk->recv_bytes = sdp_get_recv_bytes(ssk, new_size);
+ sdp_post_recvs(ssk);
+
+ return 0;
+}
+
+int
+sdp_resize_buffers(struct sdp_sock *ssk, u32 new_size)
+{
+ u32 curr_size = ssk->recv_bytes;
+ u32 max_size = SDP_MAX_PACKET;
+
+ if (new_size > curr_size && new_size <= max_size) {
+ ssk->recv_bytes = sdp_get_recv_bytes(ssk, new_size);
+ return 0;
+ }
+ return -1;
+}
+
+static void
+sdp_handle_resize_request(struct sdp_sock *ssk, struct sdp_chrecvbuf *buf)
+{
+ if (sdp_resize_buffers(ssk, ntohl(buf->size)) == 0)
+ ssk->recv_request_head = ring_head(ssk->rx_ring) + 1;
+ else
+ ssk->recv_request_head = ring_tail(ssk->rx_ring);
+ ssk->recv_request = 1;
+}
+
+static void
+sdp_handle_resize_ack(struct sdp_sock *ssk, struct sdp_chrecvbuf *buf)
+{
+ u32 new_size = ntohl(buf->size);
+
+ if (new_size > ssk->xmit_size_goal)
+ ssk->xmit_size_goal = new_size;
+}
+
+static struct mbuf *
+sdp_recv_completion(struct sdp_sock *ssk, int id)
+{
+ struct sdp_buf *rx_req;
+ struct ib_device *dev;
+ struct mbuf *mb;
+
+ if (unlikely(id != ring_tail(ssk->rx_ring))) {
+ printk(KERN_WARNING "Bogus recv completion id %d tail %d\n",
+ id, ring_tail(ssk->rx_ring));
+ return NULL;
+ }
+
+ dev = ssk->ib_device;
+ rx_req = &ssk->rx_ring.buffer[id & (SDP_RX_SIZE - 1)];
+ mb = rx_req->mb;
+ sdp_cleanup_sdp_buf(ssk, rx_req, DMA_FROM_DEVICE);
+
+ atomic_inc(&ssk->rx_ring.tail);
+ atomic_dec(&ssk->remote_credits);
+ return mb;
+}
+
+/* socket lock should be taken before calling this */
+static int
+sdp_process_rx_ctl_mb(struct sdp_sock *ssk, struct mbuf *mb)
+{
+ struct sdp_bsdh *h;
+ struct socket *sk;
+
+ SDP_WLOCK_ASSERT(ssk);
+ sk = ssk->socket;
+ h = mtod(mb, struct sdp_bsdh *);
+ switch (h->mid) {
+ case SDP_MID_DATA:
+ case SDP_MID_SRCAVAIL:
+ sdp_dbg(sk, "DATA after socket rcv was shutdown\n");
+
+ /* got data in RCV_SHUTDOWN */
+ if (ssk->state == TCPS_FIN_WAIT_1) {
+ sdp_dbg(sk, "RX data when state = FIN_WAIT1\n");
+ sdp_notify(ssk, ECONNRESET);
+ }
+ m_freem(mb);
+
+ break;
+#ifdef SDP_ZCOPY
+ case SDP_MID_RDMARDCOMPL:
+ m_freem(mb);
+ break;
+ case SDP_MID_SENDSM:
+ sdp_handle_sendsm(ssk, ntohl(h->mseq_ack));
+ m_freem(mb);
+ break;
+ case SDP_MID_SRCAVAIL_CANCEL:
+ sdp_dbg_data(sk, "Handling SrcAvailCancel\n");
+ sdp_prf(sk, NULL, "Handling SrcAvailCancel");
+ if (ssk->rx_sa) {
+ ssk->srcavail_cancel_mseq = ntohl(h->mseq);
+ ssk->rx_sa->flags |= RX_SA_ABORTED;
+ ssk->rx_sa = NULL; /* TODO: change it into SDP_MID_DATA and get
+ the dirty logic from recvmsg */
+ } else {
+ sdp_dbg(sk, "Got SrcAvailCancel - "
+ "but no SrcAvail in process\n");
+ }
+ m_freem(mb);
+ break;
+ case SDP_MID_SINKAVAIL:
+ sdp_dbg_data(sk, "Got SinkAvail - not supported: ignored\n");
+ sdp_prf(sk, NULL, "Got SinkAvail - not supported: ignored");
+ /* FALLTHROUGH */
+#endif
+ case SDP_MID_ABORT:
+ sdp_dbg_data(sk, "Handling ABORT\n");
+ sdp_prf(sk, NULL, "Handling ABORT");
+ sdp_notify(ssk, ECONNRESET);
+ m_freem(mb);
+ break;
+ case SDP_MID_DISCONN:
+ sdp_dbg_data(sk, "Handling DISCONN\n");
+ sdp_prf(sk, NULL, "Handling DISCONN");
+ sdp_handle_disconn(ssk);
+ break;
+ case SDP_MID_CHRCVBUF:
+ sdp_dbg_data(sk, "Handling RX CHRCVBUF\n");
+ sdp_handle_resize_request(ssk, (struct sdp_chrecvbuf *)(h+1));
+ m_freem(mb);
+ break;
+ case SDP_MID_CHRCVBUF_ACK:
+ sdp_dbg_data(sk, "Handling RX CHRCVBUF_ACK\n");
+ sdp_handle_resize_ack(ssk, (struct sdp_chrecvbuf *)(h+1));
+ m_freem(mb);
+ break;
+ default:
+ /* TODO: Handle other messages */
+ sdp_warn(sk, "SDP: FIXME MID %d\n", h->mid);
+ m_freem(mb);
+ }
+
+ return 0;
+}
+
+static int
+sdp_process_rx_mb(struct sdp_sock *ssk, struct mbuf *mb)
+{
+ struct socket *sk;
+ struct sdp_bsdh *h;
+ unsigned long mseq_ack;
+ int credits_before;
+
+ h = mtod(mb, struct sdp_bsdh *);
+ sk = ssk->socket;
+ /*
+ * If another thread is in so_pcbfree this may be partially torn
+ * down but no further synchronization is required as the destroying
+ * thread will wait for receive to shutdown before discarding the
+ * socket.
+ */
+ if (sk == NULL) {
+ m_freem(mb);
+ return 0;
+ }
+
+ SDPSTATS_HIST_LINEAR(credits_before_update, tx_credits(ssk));
+
+ mseq_ack = ntohl(h->mseq_ack);
+ credits_before = tx_credits(ssk);
+ atomic_set(&ssk->tx_ring.credits, mseq_ack - ring_head(ssk->tx_ring) +
+ 1 + ntohs(h->bufs));
+ if (mseq_ack >= ssk->nagle_last_unacked)
+ ssk->nagle_last_unacked = 0;
+
+ sdp_prf1(ssk->socket, mb, "RX %s +%d c:%d->%d mseq:%d ack:%d\n",
+ mid2str(h->mid), ntohs(h->bufs), credits_before,
+ tx_credits(ssk), ntohl(h->mseq), ntohl(h->mseq_ack));
+
+ if (unlikely(h->mid == SDP_MID_DATA &&
+ mb->m_pkthdr.len == SDP_HEAD_SIZE)) {
+ /* Credit update is valid even after RCV_SHUTDOWN */
+ m_freem(mb);
+ return 0;
+ }
+
+ if ((h->mid != SDP_MID_DATA && h->mid != SDP_MID_SRCAVAIL) ||
+ TCPS_HAVERCVDFIN(ssk->state)) {
+ sdp_prf(sk, NULL, "Control mb - queing to control queue");
+#ifdef SDP_ZCOPY
+ if (h->mid == SDP_MID_SRCAVAIL_CANCEL) {
+ sdp_dbg_data(sk, "Got SrcAvailCancel. "
+ "seq: 0x%d seq_ack: 0x%d\n",
+ ntohl(h->mseq), ntohl(h->mseq_ack));
+ ssk->srcavail_cancel_mseq = ntohl(h->mseq);
+ }
+
+
+ if (h->mid == SDP_MID_RDMARDCOMPL) {
+ struct sdp_rrch *rrch = (struct sdp_rrch *)(h+1);
+ sdp_dbg_data(sk, "RdmaRdCompl message arrived\n");
+ sdp_handle_rdma_read_compl(ssk, ntohl(h->mseq_ack),
+ ntohl(rrch->len));
+ }
+#endif
+ mb->m_nextpkt = NULL;
+ if (ssk->rx_ctl_tail)
+ ssk->rx_ctl_tail->m_nextpkt = mb;
+ else
+ ssk->rx_ctl_q = mb;
+ ssk->rx_ctl_tail = mb;
+
+ return 0;
+ }
+
+ sdp_prf1(sk, NULL, "queueing %s mb\n", mid2str(h->mid));
+ mb = sdp_sock_queue_rcv_mb(sk, mb);
+
+
+ return 0;
+}
+
+/* called only from irq */
+static struct mbuf *
+sdp_process_rx_wc(struct sdp_sock *ssk, struct ib_wc *wc)
+{
+ struct mbuf *mb;
+ struct sdp_bsdh *h;
+ struct socket *sk = ssk->socket;
+ int mseq;
+
+ mb = sdp_recv_completion(ssk, wc->wr_id);
+ if (unlikely(!mb))
+ return NULL;
+
+ if (unlikely(wc->status)) {
+ if (ssk->qp_active && sk) {
+ sdp_dbg(sk, "Recv completion with error. "
+ "Status %d, vendor: %d\n",
+ wc->status, wc->vendor_err);
+ sdp_abort(sk);
+ ssk->qp_active = 0;
+ }
+ m_freem(mb);
+ return NULL;
+ }
+
+ sdp_dbg_data(sk, "Recv completion. ID %d Length %d\n",
+ (int)wc->wr_id, wc->byte_len);
+ if (unlikely(wc->byte_len < sizeof(struct sdp_bsdh))) {
+ sdp_warn(sk, "SDP BUG! byte_len %d < %zd\n",
+ wc->byte_len, sizeof(struct sdp_bsdh));
+ m_freem(mb);
+ return NULL;
+ }
+ /* Use m_adj to trim the tail of data we didn't use. */
+ m_adj(mb, -(mb->m_pkthdr.len - wc->byte_len));
+ h = mtod(mb, struct sdp_bsdh *);
+
+ SDP_DUMP_PACKET(ssk->socket, "RX", mb, h);
+
+ ssk->rx_packets++;
+ ssk->rx_bytes += mb->m_pkthdr.len;
+
+ mseq = ntohl(h->mseq);
+ atomic_set(&ssk->mseq_ack, mseq);
+ if (mseq != (int)wc->wr_id)
+ sdp_warn(sk, "SDP BUG! mseq %d != wrid %d\n",
+ mseq, (int)wc->wr_id);
+
+ return mb;
+}
+
+/* Wakeup writers if we now have credits. */
+static void
+sdp_bzcopy_write_space(struct sdp_sock *ssk)
+{
+ struct socket *sk = ssk->socket;
+
+ if (tx_credits(ssk) >= ssk->min_bufs && sk)
+ sowwakeup(sk);
+}
+
+/* only from interrupt. */
+static int
+sdp_poll_rx_cq(struct sdp_sock *ssk)
+{
+ struct ib_cq *cq = ssk->rx_ring.cq;
+ struct ib_wc ibwc[SDP_NUM_WC];
+ int n, i;
+ int wc_processed = 0;
+ struct mbuf *mb;
+
+ do {
+ n = ib_poll_cq(cq, SDP_NUM_WC, ibwc);
+ for (i = 0; i < n; ++i) {
+ struct ib_wc *wc = &ibwc[i];
+
+ BUG_ON(!(wc->wr_id & SDP_OP_RECV));
+ mb = sdp_process_rx_wc(ssk, wc);
+ if (!mb)
+ continue;
+
+ sdp_process_rx_mb(ssk, mb);
+ wc_processed++;
+ }
+ } while (n == SDP_NUM_WC);
+
+ if (wc_processed)
+ sdp_bzcopy_write_space(ssk);
+
+ return wc_processed;
+}
+
+static void
+sdp_rx_comp_work(struct work_struct *work)
+{
+ struct sdp_sock *ssk = container_of(work, struct sdp_sock,
+ rx_comp_work);
+
+ sdp_prf(ssk->socket, NULL, "%s", __func__);
+
+ SDP_WLOCK(ssk);
+ if (unlikely(!ssk->qp)) {
+ sdp_prf(ssk->socket, NULL, "qp was destroyed");
+ goto out;
+ }
+ if (unlikely(!ssk->rx_ring.cq)) {
+ sdp_prf(ssk->socket, NULL, "rx_ring.cq is NULL");
+ goto out;
+ }
+
+ if (unlikely(!ssk->poll_cq)) {
+ struct rdma_cm_id *id = ssk->id;
+ if (id && id->qp)
+ rdma_notify(id, RDMA_CM_EVENT_ESTABLISHED);
+ goto out;
+ }
+
+ sdp_do_posts(ssk);
+out:
+ SDP_WUNLOCK(ssk);
+}
+
+void
+sdp_do_posts(struct sdp_sock *ssk)
+{
+ struct socket *sk = ssk->socket;
+ int xmit_poll_force;
+ struct mbuf *mb;
+
+ SDP_WLOCK_ASSERT(ssk);
+ if (!ssk->qp_active) {
+ sdp_dbg(sk, "QP is deactivated\n");
+ return;
+ }
+
+ while ((mb = ssk->rx_ctl_q)) {
+ ssk->rx_ctl_q = mb->m_nextpkt;
+ mb->m_nextpkt = NULL;
+ sdp_process_rx_ctl_mb(ssk, mb);
+ }
+
+ if (ssk->state == TCPS_TIME_WAIT)
+ return;
+
+ if (!ssk->rx_ring.cq || !ssk->tx_ring.cq)
+ return;
+
+ sdp_post_recvs(ssk);
+
+ if (tx_ring_posted(ssk))
+ sdp_xmit_poll(ssk, 1);
+
+ sdp_post_sends(ssk, M_NOWAIT);
+
+ xmit_poll_force = tx_credits(ssk) < SDP_MIN_TX_CREDITS;
+
+ if (credit_update_needed(ssk) || xmit_poll_force) {
+ /* if has pending tx because run out of tx_credits - xmit it */
+ sdp_prf(sk, NULL, "Processing to free pending sends");
+ sdp_xmit_poll(ssk, xmit_poll_force);
+ sdp_prf(sk, NULL, "Sending credit update");
+ sdp_post_sends(ssk, M_NOWAIT);
+ }
+
+}
+
+int
+sdp_process_rx(struct sdp_sock *ssk)
+{
+ int wc_processed = 0;
+ int credits_before;
+
+ if (!rx_ring_trylock(&ssk->rx_ring)) {
+ sdp_dbg(ssk->socket, "ring destroyed. not polling it\n");
+ return 0;
+ }
+
+ credits_before = tx_credits(ssk);
+
+ wc_processed = sdp_poll_rx_cq(ssk);
+ sdp_prf(ssk->socket, NULL, "processed %d", wc_processed);
+
+ if (wc_processed) {
+ sdp_prf(ssk->socket, NULL, "credits: %d -> %d",
+ credits_before, tx_credits(ssk));
+ queue_work(rx_comp_wq, &ssk->rx_comp_work);
+ }
+ sdp_arm_rx_cq(ssk);
+
+ rx_ring_unlock(&ssk->rx_ring);
+
+ return (wc_processed);
+}
+
+static void
+sdp_rx_irq(struct ib_cq *cq, void *cq_context)
+{
+ struct socket *sk = cq_context;
+ struct sdp_sock *ssk = sdp_sk(sk);
+
+ if (cq != ssk->rx_ring.cq) {
+ sdp_dbg(sk, "cq = %p, ssk->cq = %p\n", cq, ssk->rx_ring.cq);
+ return;
+ }
+
+ SDPSTATS_COUNTER_INC(rx_int_count);
+
+ sdp_prf(sk, NULL, "rx irq");
+
+ sdp_process_rx(ssk);
+}
+
+static
+void sdp_rx_ring_purge(struct sdp_sock *ssk)
+{
+ while (rx_ring_posted(ssk) > 0) {
+ struct mbuf *mb;
+ mb = sdp_recv_completion(ssk, ring_tail(ssk->rx_ring));
+ if (!mb)
+ break;
+ m_freem(mb);
+ }
+}
+
+void
+sdp_rx_ring_init(struct sdp_sock *ssk)
+{
+ ssk->rx_ring.buffer = NULL;
+ ssk->rx_ring.destroyed = 0;
+ rw_init(&ssk->rx_ring.destroyed_lock, "sdp rx lock");
+}
+
+static void
+sdp_rx_cq_event_handler(struct ib_event *event, void *data)
+{
+}
+
+int
+sdp_rx_ring_create(struct sdp_sock *ssk, struct ib_device *device)
+{
+ struct ib_cq *rx_cq;
+ int rc = 0;
+
+
+ sdp_dbg(ssk->socket, "rx ring created");
+ INIT_WORK(&ssk->rx_comp_work, sdp_rx_comp_work);
+ atomic_set(&ssk->rx_ring.head, 1);
+ atomic_set(&ssk->rx_ring.tail, 1);
+
+ ssk->rx_ring.buffer = kmalloc(
+ sizeof *ssk->rx_ring.buffer * SDP_RX_SIZE, GFP_KERNEL);
+ if (!ssk->rx_ring.buffer) {
+ sdp_warn(ssk->socket,
+ "Unable to allocate RX Ring size %zd.\n",
+ sizeof(*ssk->rx_ring.buffer) * SDP_RX_SIZE);
+
+ return -ENOMEM;
+ }
+
+ rx_cq = ib_create_cq(device, sdp_rx_irq, sdp_rx_cq_event_handler,
+ ssk->socket, SDP_RX_SIZE, IB_CQ_VECTOR_LEAST_ATTACHED);
+
+ if (IS_ERR(rx_cq)) {
+ rc = PTR_ERR(rx_cq);
+ sdp_warn(ssk->socket, "Unable to allocate RX CQ: %d.\n", rc);
+ goto err_cq;
+ }
+
+ sdp_sk(ssk->socket)->rx_ring.cq = rx_cq;
+ sdp_arm_rx_cq(ssk);
+
+ return 0;
+
+err_cq:
+ kfree(ssk->rx_ring.buffer);
+ ssk->rx_ring.buffer = NULL;
+ return rc;
+}
+
+void
+sdp_rx_ring_destroy(struct sdp_sock *ssk)
+{
+
+ cancel_work_sync(&ssk->rx_comp_work);
+ rx_ring_destroy_lock(&ssk->rx_ring);
+
+ if (ssk->rx_ring.buffer) {
+ sdp_rx_ring_purge(ssk);
+
+ kfree(ssk->rx_ring.buffer);
+ ssk->rx_ring.buffer = NULL;
+ }
+
+ if (ssk->rx_ring.cq) {
+ if (ib_destroy_cq(ssk->rx_ring.cq)) {
+ sdp_warn(ssk->socket, "destroy cq(%p) failed\n",
+ ssk->rx_ring.cq);
+ } else {
+ ssk->rx_ring.cq = NULL;
+ }
+ }
+
+ WARN_ON(ring_head(ssk->rx_ring) != ring_tail(ssk->rx_ring));
+}
diff --git a/sys/ofed/drivers/infiniband/ulp/sdp/sdp_tx.c b/sys/ofed/drivers/infiniband/ulp/sdp/sdp_tx.c
new file mode 100644
index 0000000..b0c37e5
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/sdp/sdp_tx.c
@@ -0,0 +1,490 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies Ltd. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "sdp.h"
+
+#define sdp_cnt(var) do { (var)++; } while (0)
+
+SDP_MODPARAM_SINT(sdp_keepalive_probes_sent, 0,
+ "Total number of keepalive probes sent.");
+
+static int sdp_process_tx_cq(struct sdp_sock *ssk);
+static void sdp_poll_tx_timeout(void *data);
+
+int
+sdp_xmit_poll(struct sdp_sock *ssk, int force)
+{
+ int wc_processed = 0;
+
+ SDP_WLOCK_ASSERT(ssk);
+ sdp_prf(ssk->socket, NULL, "%s", __func__);
+
+ /* If we don't have a pending timer, set one up to catch our recent
+ post in case the interface becomes idle */
+ if (!callout_pending(&ssk->tx_ring.timer))
+ callout_reset(&ssk->tx_ring.timer, SDP_TX_POLL_TIMEOUT,
+ sdp_poll_tx_timeout, ssk);
+
+ /* Poll the CQ every SDP_TX_POLL_MODER packets */
+ if (force || (++ssk->tx_ring.poll_cnt & (SDP_TX_POLL_MODER - 1)) == 0)
+ wc_processed = sdp_process_tx_cq(ssk);
+
+ return wc_processed;
+}
+
+void
+sdp_post_send(struct sdp_sock *ssk, struct mbuf *mb)
+{
+ struct sdp_buf *tx_req;
+ struct sdp_bsdh *h;
+ unsigned long mseq;
+ struct ib_device *dev;
+ struct ib_send_wr *bad_wr;
+ struct ib_sge ibsge[SDP_MAX_SEND_SGES];
+ struct ib_sge *sge;
+ struct ib_send_wr tx_wr = { NULL };
+ int i, rc;
+ u64 addr;
+
+ SDPSTATS_COUNTER_MID_INC(post_send, h->mid);
+ SDPSTATS_HIST(send_size, mb->len);
+
+ if (!ssk->qp_active) {
+ m_freem(mb);
+ return;
+ }
+
+ mseq = ring_head(ssk->tx_ring);
+ h = mtod(mb, struct sdp_bsdh *);
+ ssk->tx_packets++;
+ ssk->tx_bytes += mb->m_pkthdr.len;
+
+#ifdef SDP_ZCOPY
+ if (unlikely(h->mid == SDP_MID_SRCAVAIL)) {
+ struct tx_srcavail_state *tx_sa = TX_SRCAVAIL_STATE(mb);
+ if (ssk->tx_sa != tx_sa) {
+ sdp_dbg_data(ssk->socket, "SrcAvail cancelled "
+ "before being sent!\n");
+ WARN_ON(1);
+ m_freem(mb);
+ return;
+ }
+ TX_SRCAVAIL_STATE(mb)->mseq = mseq;
+ }
+#endif
+
+ if (unlikely(mb->m_flags & M_URG))
+ h->flags = SDP_OOB_PRES | SDP_OOB_PEND;
+ else
+ h->flags = 0;
+
+ mb->m_flags |= M_RDONLY; /* Don't allow compression once sent. */
+ h->bufs = htons(rx_ring_posted(ssk));
+ h->len = htonl(mb->m_pkthdr.len);
+ h->mseq = htonl(mseq);
+ h->mseq_ack = htonl(mseq_ack(ssk));
+
+ sdp_prf1(ssk->socket, mb, "TX: %s bufs: %d mseq:%ld ack:%d",
+ mid2str(h->mid), rx_ring_posted(ssk), mseq,
+ ntohl(h->mseq_ack));
+
+ SDP_DUMP_PACKET(ssk->socket, "TX", mb, h);
+
+ tx_req = &ssk->tx_ring.buffer[mseq & (SDP_TX_SIZE - 1)];
+ tx_req->mb = mb;
+ dev = ssk->ib_device;
+ sge = &ibsge[0];
+ for (i = 0; mb != NULL; i++, mb = mb->m_next, sge++) {
+ addr = ib_dma_map_single(dev, mb->m_data, mb->m_len,
+ DMA_TO_DEVICE);
+ /* TODO: proper error handling */
+ BUG_ON(ib_dma_mapping_error(dev, addr));
+ BUG_ON(i >= SDP_MAX_SEND_SGES);
+ tx_req->mapping[i] = addr;
+ sge->addr = addr;
+ sge->length = mb->m_len;
+ sge->lkey = ssk->sdp_dev->mr->lkey;
+ }
+ tx_wr.next = NULL;
+ tx_wr.wr_id = mseq | SDP_OP_SEND;
+ tx_wr.sg_list = ibsge;
+ tx_wr.num_sge = i;
+ tx_wr.opcode = IB_WR_SEND;
+ tx_wr.send_flags = IB_SEND_SIGNALED;
+ if (unlikely(tx_req->mb->m_flags & M_URG))
+ tx_wr.send_flags |= IB_SEND_SOLICITED;
+
+ rc = ib_post_send(ssk->qp, &tx_wr, &bad_wr);
+ if (unlikely(rc)) {
+ sdp_dbg(ssk->socket,
+ "ib_post_send failed with status %d.\n", rc);
+
+ sdp_cleanup_sdp_buf(ssk, tx_req, DMA_TO_DEVICE);
+
+ sdp_notify(ssk, ECONNRESET);
+ m_freem(tx_req->mb);
+ return;
+ }
+
+ atomic_inc(&ssk->tx_ring.head);
+ atomic_dec(&ssk->tx_ring.credits);
+ atomic_set(&ssk->remote_credits, rx_ring_posted(ssk));
+
+ return;
+}
+
+static struct mbuf *
+sdp_send_completion(struct sdp_sock *ssk, int mseq)
+{
+ struct ib_device *dev;
+ struct sdp_buf *tx_req;
+ struct mbuf *mb = NULL;
+ struct sdp_tx_ring *tx_ring = &ssk->tx_ring;
+
+ if (unlikely(mseq != ring_tail(*tx_ring))) {
+ printk(KERN_WARNING "Bogus send completion id %d tail %d\n",
+ mseq, ring_tail(*tx_ring));
+ goto out;
+ }
+
+ dev = ssk->ib_device;
+ tx_req = &tx_ring->buffer[mseq & (SDP_TX_SIZE - 1)];
+ mb = tx_req->mb;
+ sdp_cleanup_sdp_buf(ssk, tx_req, DMA_TO_DEVICE);
+
+#ifdef SDP_ZCOPY
+ /* TODO: AIO and real zcopy code; add their context support here */
+ if (BZCOPY_STATE(mb))
+ BZCOPY_STATE(mb)->busy--;
+#endif
+
+ atomic_inc(&tx_ring->tail);
+
+out:
+ return mb;
+}
+
+static int
+sdp_handle_send_comp(struct sdp_sock *ssk, struct ib_wc *wc)
+{
+ struct mbuf *mb = NULL;
+ struct sdp_bsdh *h;
+
+ if (unlikely(wc->status)) {
+ if (wc->status != IB_WC_WR_FLUSH_ERR) {
+ sdp_prf(ssk->socket, mb, "Send completion with error. "
+ "Status %d", wc->status);
+ sdp_dbg_data(ssk->socket, "Send completion with error. "
+ "Status %d\n", wc->status);
+ sdp_notify(ssk, ECONNRESET);
+ }
+ }
+
+ mb = sdp_send_completion(ssk, wc->wr_id);
+ if (unlikely(!mb))
+ return -1;
+
+ h = mtod(mb, struct sdp_bsdh *);
+ sdp_prf1(ssk->socket, mb, "tx completion. mseq:%d", ntohl(h->mseq));
+ sdp_dbg(ssk->socket, "tx completion. %p %d mseq:%d",
+ mb, mb->m_pkthdr.len, ntohl(h->mseq));
+ m_freem(mb);
+
+ return 0;
+}
+
+static inline void
+sdp_process_tx_wc(struct sdp_sock *ssk, struct ib_wc *wc)
+{
+
+ if (likely(wc->wr_id & SDP_OP_SEND)) {
+ sdp_handle_send_comp(ssk, wc);
+ return;
+ }
+
+#ifdef SDP_ZCOPY
+ if (wc->wr_id & SDP_OP_RDMA) {
+ /* TODO: handle failed RDMA read cqe */
+
+ sdp_dbg_data(ssk->socket,
+ "TX comp: RDMA read. status: %d\n", wc->status);
+ sdp_prf1(sk, NULL, "TX comp: RDMA read");
+
+ if (!ssk->tx_ring.rdma_inflight) {
+ sdp_warn(ssk->socket, "ERROR: unexpected RDMA read\n");
+ return;
+ }
+
+ if (!ssk->tx_ring.rdma_inflight->busy) {
+ sdp_warn(ssk->socket,
+ "ERROR: too many RDMA read completions\n");
+ return;
+ }
+
+ /* Only last RDMA read WR is signalled. Order is guaranteed -
+ * therefore if Last RDMA read WR is completed - all other
+ * have, too */
+ ssk->tx_ring.rdma_inflight->busy = 0;
+ sowwakeup(ssk->socket);
+ sdp_dbg_data(ssk->socket, "woke up sleepers\n");
+ return;
+ }
+#endif
+
+ /* Keepalive probe sent cleanup */
+ sdp_cnt(sdp_keepalive_probes_sent);
+
+ if (likely(!wc->status))
+ return;
+
+ sdp_dbg(ssk->socket, " %s consumes KEEPALIVE status %d\n",
+ __func__, wc->status);
+
+ if (wc->status == IB_WC_WR_FLUSH_ERR)
+ return;
+
+ sdp_notify(ssk, ECONNRESET);
+}
+
+static int
+sdp_process_tx_cq(struct sdp_sock *ssk)
+{
+ struct ib_wc ibwc[SDP_NUM_WC];
+ int n, i;
+ int wc_processed = 0;
+
+ SDP_WLOCK_ASSERT(ssk);
+
+ if (!ssk->tx_ring.cq) {
+ sdp_dbg(ssk->socket, "tx irq on destroyed tx_cq\n");
+ return 0;
+ }
+
+ do {
+ n = ib_poll_cq(ssk->tx_ring.cq, SDP_NUM_WC, ibwc);
+ for (i = 0; i < n; ++i) {
+ sdp_process_tx_wc(ssk, ibwc + i);
+ wc_processed++;
+ }
+ } while (n == SDP_NUM_WC);
+
+ if (wc_processed) {
+ sdp_post_sends(ssk, M_DONTWAIT);
+ sdp_prf1(sk, NULL, "Waking sendmsg. inflight=%d",
+ (u32) tx_ring_posted(ssk));
+ sowwakeup(ssk->socket);
+ }
+
+ return wc_processed;
+}
+
+static void
+sdp_poll_tx(struct sdp_sock *ssk)
+{
+ struct socket *sk = ssk->socket;
+ u32 inflight, wc_processed;
+
+ sdp_prf1(ssk->socket, NULL, "TX timeout: inflight=%d, head=%d tail=%d",
+ (u32) tx_ring_posted(ssk),
+ ring_head(ssk->tx_ring), ring_tail(ssk->tx_ring));
+
+ if (unlikely(ssk->state == TCPS_CLOSED)) {
+ sdp_warn(sk, "Socket is closed\n");
+ goto out;
+ }
+
+ wc_processed = sdp_process_tx_cq(ssk);
+ if (!wc_processed)
+ SDPSTATS_COUNTER_INC(tx_poll_miss);
+ else
+ SDPSTATS_COUNTER_INC(tx_poll_hit);
+
+ inflight = (u32) tx_ring_posted(ssk);
+ sdp_prf1(ssk->socket, NULL, "finished tx proccessing. inflight = %d",
+ inflight);
+
+ /* If there are still packets in flight and the timer has not already
+ * been scheduled by the Tx routine then schedule it here to guarantee
+ * completion processing of these packets */
+ if (inflight)
+ callout_reset(&ssk->tx_ring.timer, SDP_TX_POLL_TIMEOUT,
+ sdp_poll_tx_timeout, ssk);
+out:
+#ifdef SDP_ZCOPY
+ if (ssk->tx_ring.rdma_inflight && ssk->tx_ring.rdma_inflight->busy) {
+ sdp_prf1(sk, NULL, "RDMA is inflight - arming irq");
+ sdp_arm_tx_cq(ssk);
+ }
+#endif
+ return;
+}
+
+static void
+sdp_poll_tx_timeout(void *data)
+{
+ struct sdp_sock *ssk = (struct sdp_sock *)data;
+
+ if (!callout_active(&ssk->tx_ring.timer))
+ return;
+ callout_deactivate(&ssk->tx_ring.timer);
+ sdp_poll_tx(ssk);
+}
+
+static void
+sdp_tx_irq(struct ib_cq *cq, void *cq_context)
+{
+ struct sdp_sock *ssk;
+
+ ssk = cq_context;
+ sdp_prf1(ssk->socket, NULL, "tx irq");
+ sdp_dbg_data(ssk->socket, "Got tx comp interrupt\n");
+ SDPSTATS_COUNTER_INC(tx_int_count);
+ SDP_WLOCK(ssk);
+ sdp_poll_tx(ssk);
+ SDP_WUNLOCK(ssk);
+}
+
+static
+void sdp_tx_ring_purge(struct sdp_sock *ssk)
+{
+ while (tx_ring_posted(ssk)) {
+ struct mbuf *mb;
+ mb = sdp_send_completion(ssk, ring_tail(ssk->tx_ring));
+ if (!mb)
+ break;
+ m_freem(mb);
+ }
+}
+
+void
+sdp_post_keepalive(struct sdp_sock *ssk)
+{
+ int rc;
+ struct ib_send_wr wr, *bad_wr;
+
+ sdp_dbg(ssk->socket, "%s\n", __func__);
+
+ memset(&wr, 0, sizeof(wr));
+
+ wr.next = NULL;
+ wr.wr_id = 0;
+ wr.sg_list = NULL;
+ wr.num_sge = 0;
+ wr.opcode = IB_WR_RDMA_WRITE;
+
+ rc = ib_post_send(ssk->qp, &wr, &bad_wr);
+ if (rc) {
+ sdp_dbg(ssk->socket,
+ "ib_post_keepalive failed with status %d.\n", rc);
+ sdp_notify(ssk, ECONNRESET);
+ }
+
+ sdp_cnt(sdp_keepalive_probes_sent);
+}
+
+static void
+sdp_tx_cq_event_handler(struct ib_event *event, void *data)
+{
+}
+
+int
+sdp_tx_ring_create(struct sdp_sock *ssk, struct ib_device *device)
+{
+ struct ib_cq *tx_cq;
+ int rc = 0;
+
+ sdp_dbg(ssk->socket, "tx ring create\n");
+ callout_init_rw(&ssk->tx_ring.timer, &ssk->lock, 0);
+ callout_init_rw(&ssk->nagle_timer, &ssk->lock, 0);
+ atomic_set(&ssk->tx_ring.head, 1);
+ atomic_set(&ssk->tx_ring.tail, 1);
+
+ ssk->tx_ring.buffer = kzalloc(
+ sizeof *ssk->tx_ring.buffer * SDP_TX_SIZE, GFP_KERNEL);
+ if (!ssk->tx_ring.buffer) {
+ rc = -ENOMEM;
+ sdp_warn(ssk->socket, "Can't allocate TX Ring size %zd.\n",
+ sizeof(*ssk->tx_ring.buffer) * SDP_TX_SIZE);
+
+ goto out;
+ }
+
+ tx_cq = ib_create_cq(device, sdp_tx_irq, sdp_tx_cq_event_handler,
+ ssk, SDP_TX_SIZE, IB_CQ_VECTOR_LEAST_ATTACHED);
+
+ if (IS_ERR(tx_cq)) {
+ rc = PTR_ERR(tx_cq);
+ sdp_warn(ssk->socket, "Unable to allocate TX CQ: %d.\n", rc);
+ goto err_cq;
+ }
+ ssk->tx_ring.cq = tx_cq;
+ ssk->tx_ring.poll_cnt = 0;
+ sdp_arm_tx_cq(ssk);
+
+ return 0;
+
+err_cq:
+ kfree(ssk->tx_ring.buffer);
+ ssk->tx_ring.buffer = NULL;
+out:
+ return rc;
+}
+
+void
+sdp_tx_ring_destroy(struct sdp_sock *ssk)
+{
+
+ sdp_dbg(ssk->socket, "tx ring destroy\n");
+ SDP_WLOCK(ssk);
+ callout_stop(&ssk->tx_ring.timer);
+ callout_stop(&ssk->nagle_timer);
+ SDP_WUNLOCK(ssk);
+ callout_drain(&ssk->tx_ring.timer);
+ callout_drain(&ssk->nagle_timer);
+
+ if (ssk->tx_ring.buffer) {
+ sdp_tx_ring_purge(ssk);
+
+ kfree(ssk->tx_ring.buffer);
+ ssk->tx_ring.buffer = NULL;
+ }
+
+ if (ssk->tx_ring.cq) {
+ if (ib_destroy_cq(ssk->tx_ring.cq)) {
+ sdp_warn(ssk->socket, "destroy cq(%p) failed\n",
+ ssk->tx_ring.cq);
+ } else {
+ ssk->tx_ring.cq = NULL;
+ }
+ }
+
+ WARN_ON(ring_head(ssk->tx_ring) != ring_tail(ssk->tx_ring));
+}
diff --git a/sys/ofed/drivers/infiniband/ulp/sdp/sdp_zcopy.c b/sys/ofed/drivers/infiniband/ulp/sdp/sdp_zcopy.c
new file mode 100644
index 0000000..0425f8e
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/ulp/sdp/sdp_zcopy.c
@@ -0,0 +1,804 @@
+/*
+ * Copyright (c) 2006 Mellanox Technologies Ltd. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/tcp.h>
+#include <asm/ioctls.h>
+#include <linux/workqueue.h>
+#include <linux/net.h>
+#include <linux/socket.h>
+#include <net/protocol.h>
+#include <net/inet_common.h>
+#include <rdma/rdma_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_fmr_pool.h>
+#include <rdma/ib_umem.h>
+#include <net/tcp.h> /* for memcpy_toiovec */
+#include <asm/io.h>
+#include <asm/uaccess.h>
+#include <linux/delay.h>
+#include "sdp.h"
+
+static int sdp_post_srcavail(struct socket *sk, struct tx_srcavail_state *tx_sa)
+{
+ struct sdp_sock *ssk = sdp_sk(sk);
+ struct mbuf *mb;
+ int payload_len;
+ struct page *payload_pg;
+ int off, len;
+ struct ib_umem_chunk *chunk;
+
+ WARN_ON(ssk->tx_sa);
+
+ BUG_ON(!tx_sa);
+ BUG_ON(!tx_sa->fmr || !tx_sa->fmr->fmr->lkey);
+ BUG_ON(!tx_sa->umem);
+ BUG_ON(!tx_sa->umem->chunk_list.next);
+
+ chunk = list_entry(tx_sa->umem->chunk_list.next, struct ib_umem_chunk, list);
+ BUG_ON(!chunk->nmap);
+
+ off = tx_sa->umem->offset;
+ len = tx_sa->umem->length;
+
+ tx_sa->bytes_sent = tx_sa->bytes_acked = 0;
+
+ mb = sdp_alloc_mb_srcavail(sk, len, tx_sa->fmr->fmr->lkey, off, 0);
+ if (!mb) {
+ return -ENOMEM;
+ }
+ sdp_dbg_data(sk, "sending SrcAvail\n");
+
+ TX_SRCAVAIL_STATE(mb) = tx_sa; /* tx_sa is hanged on the mb
+ * but continue to live after mb is freed */
+ ssk->tx_sa = tx_sa;
+
+ /* must have payload inlined in SrcAvail packet in combined mode */
+ payload_len = MIN(tx_sa->umem->page_size - off, len);
+ payload_len = MIN(payload_len, ssk->xmit_size_goal - sizeof(struct sdp_srcah));
+ payload_pg = sg_page(&chunk->page_list[0]);
+ get_page(payload_pg);
+
+ sdp_dbg_data(sk, "payload: off: 0x%x, pg: %p, len: 0x%x\n",
+ off, payload_pg, payload_len);
+
+ mb_fill_page_desc(mb, mb_shinfo(mb)->nr_frags,
+ payload_pg, off, payload_len);
+
+ mb->len += payload_len;
+ mb->data_len = payload_len;
+ mb->truesize += payload_len;
+// sk->sk_wmem_queued += payload_len;
+// sk->sk_forward_alloc -= payload_len;
+
+ mb_entail(sk, ssk, mb);
+
+ ssk->write_seq += payload_len;
+ SDP_SKB_CB(mb)->end_seq += payload_len;
+
+ tx_sa->bytes_sent = tx_sa->umem->length;
+ tx_sa->bytes_acked = payload_len;
+
+ /* TODO: pushing the mb into the tx_queue should be enough */
+
+ return 0;
+}
+
+static int sdp_post_srcavail_cancel(struct socket *sk)
+{
+ struct sdp_sock *ssk = sdp_sk(sk);
+ struct mbuf *mb;
+
+ sdp_dbg_data(ssk->socket, "Posting srcavail cancel\n");
+
+ mb = sdp_alloc_mb_srcavail_cancel(sk, 0);
+ mb_entail(sk, ssk, mb);
+
+ sdp_post_sends(ssk, 0);
+
+ schedule_delayed_work(&ssk->srcavail_cancel_work,
+ SDP_SRCAVAIL_CANCEL_TIMEOUT);
+
+ return 0;
+}
+
+void srcavail_cancel_timeout(struct work_struct *work)
+{
+ struct sdp_sock *ssk =
+ container_of(work, struct sdp_sock, srcavail_cancel_work.work);
+ struct socket *sk = ssk->socket;
+
+ lock_sock(sk);
+
+ sdp_dbg_data(sk, "both SrcAvail and SrcAvailCancel timedout."
+ " closing connection\n");
+ sdp_set_error(sk, -ECONNRESET);
+ wake_up(&ssk->wq);
+
+ release_sock(sk);
+}
+
+static int sdp_wait_rdmardcompl(struct sdp_sock *ssk, long *timeo_p,
+ int ignore_signals)
+{
+ struct socket *sk = ssk->socket;
+ int err = 0;
+ long vm_wait = 0;
+ long current_timeo = *timeo_p;
+ struct tx_srcavail_state *tx_sa = ssk->tx_sa;
+ DEFINE_WAIT(wait);
+
+ sdp_dbg_data(sk, "sleep till RdmaRdCompl. timeo = %ld.\n", *timeo_p);
+ sdp_prf1(sk, NULL, "Going to sleep");
+ while (ssk->qp_active) {
+ prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+
+ if (unlikely(!*timeo_p)) {
+ err = -ETIME;
+ tx_sa->abort_flags |= TX_SA_TIMEDOUT;
+ sdp_prf1(sk, NULL, "timeout");
+ SDPSTATS_COUNTER_INC(zcopy_tx_timeout);
+ break;
+ }
+
+ else if (tx_sa->bytes_acked > tx_sa->bytes_sent) {
+ err = -EINVAL;
+ sdp_dbg_data(sk, "acked bytes > sent bytes\n");
+ tx_sa->abort_flags |= TX_SA_ERROR;
+ break;
+ }
+
+ if (tx_sa->abort_flags & TX_SA_SENDSM) {
+ sdp_prf1(sk, NULL, "Aborting SrcAvail sending");
+ SDPSTATS_COUNTER_INC(zcopy_tx_aborted);
+ err = -EAGAIN;
+ break ;
+ }
+
+ if (!ignore_signals) {
+ if (signal_pending(current)) {
+ err = -EINTR;
+ sdp_prf1(sk, NULL, "signalled");
+ tx_sa->abort_flags |= TX_SA_INTRRUPTED;
+ break;
+ }
+
+ if (ssk->rx_sa && (tx_sa->bytes_acked < tx_sa->bytes_sent)) {
+ sdp_dbg_data(sk, "Crossing SrcAvail - aborting this\n");
+ tx_sa->abort_flags |= TX_SA_CROSS_SEND;
+ SDPSTATS_COUNTER_INC(zcopy_cross_send);
+ err = -ETIME;
+ break ;
+ }
+ }
+
+ posts_handler_put(ssk);
+
+ sk_wait_event(sk, &current_timeo,
+ tx_sa->abort_flags &&
+ ssk->rx_sa &&
+ (tx_sa->bytes_acked < tx_sa->bytes_sent) &&
+ vm_wait);
+ sdp_dbg_data(ssk->socket, "woke up sleepers\n");
+
+ posts_handler_get(ssk);
+
+ if (tx_sa->bytes_acked == tx_sa->bytes_sent)
+ break;
+
+ if (vm_wait) {
+ vm_wait -= current_timeo;
+ current_timeo = *timeo_p;
+ if (current_timeo != MAX_SCHEDULE_TIMEOUT &&
+ (current_timeo -= vm_wait) < 0)
+ current_timeo = 0;
+ vm_wait = 0;
+ }
+ *timeo_p = current_timeo;
+ }
+
+ finish_wait(sk->sk_sleep, &wait);
+
+ sdp_dbg_data(sk, "Finished waiting - RdmaRdCompl: %d/%d bytes, flags: 0x%x\n",
+ tx_sa->bytes_acked, tx_sa->bytes_sent, tx_sa->abort_flags);
+
+ if (!ssk->qp_active) {
+ sdp_dbg(sk, "QP destroyed while waiting\n");
+ return -EINVAL;
+ }
+ return err;
+}
+
+static void sdp_wait_rdma_wr_finished(struct sdp_sock *ssk)
+{
+ struct socket *sk = ssk->socket;
+ long timeo = HZ * 5; /* Timeout for for RDMA read */
+ DEFINE_WAIT(wait);
+
+ sdp_dbg_data(sk, "Sleep till RDMA wr finished.\n");
+ while (1) {
+ prepare_to_wait(sk->sk_sleep, &wait, TASK_UNINTERRUPTIBLE);
+
+ if (!ssk->tx_ring.rdma_inflight->busy) {
+ sdp_dbg_data(sk, "got rdma cqe\n");
+ break;
+ }
+
+ if (!ssk->qp_active) {
+ sdp_dbg_data(sk, "QP destroyed\n");
+ break;
+ }
+
+ if (!timeo) {
+ sdp_warn(sk, "Panic: Timed out waiting for RDMA read\n");
+ WARN_ON(1);
+ break;
+ }
+
+ posts_handler_put(ssk);
+
+ sdp_prf1(sk, NULL, "Going to sleep");
+ sk_wait_event(sk, &timeo,
+ !ssk->tx_ring.rdma_inflight->busy);
+ sdp_prf1(sk, NULL, "Woke up");
+ sdp_dbg_data(ssk->socket, "woke up sleepers\n");
+
+ posts_handler_get(ssk);
+ }
+
+ finish_wait(sk->sk_sleep, &wait);
+
+ sdp_dbg_data(sk, "Finished waiting\n");
+}
+
+int sdp_post_rdma_rd_compl(struct sdp_sock *ssk,
+ struct rx_srcavail_state *rx_sa)
+{
+ struct mbuf *mb;
+ int copied = rx_sa->used - rx_sa->reported;
+
+ if (rx_sa->used <= rx_sa->reported)
+ return 0;
+
+ mb = sdp_alloc_mb_rdmardcompl(ssk->socket, copied, 0);
+
+ rx_sa->reported += copied;
+
+ /* TODO: What if no tx_credits available? */
+ sdp_post_send(ssk, mb);
+
+ return 0;
+}
+
+int sdp_post_sendsm(struct socket *sk)
+{
+ struct mbuf *mb = sdp_alloc_mb_sendsm(sk, 0);
+
+ sdp_post_send(sdp_sk(sk), mb);
+
+ return 0;
+}
+
+static int sdp_update_iov_used(struct socket *sk, struct iovec *iov, int len)
+{
+ sdp_dbg_data(sk, "updating consumed 0x%x bytes from iov\n", len);
+ while (len > 0) {
+ if (iov->iov_len) {
+ int copy = min_t(unsigned int, iov->iov_len, len);
+ len -= copy;
+ iov->iov_len -= copy;
+ iov->iov_base += copy;
+ }
+ iov++;
+ }
+
+ return 0;
+}
+
+static inline int sge_bytes(struct ib_sge *sge, int sge_cnt)
+{
+ int bytes = 0;
+
+ while (sge_cnt > 0) {
+ bytes += sge->length;
+ sge++;
+ sge_cnt--;
+ }
+
+ return bytes;
+}
+void sdp_handle_sendsm(struct sdp_sock *ssk, u32 mseq_ack)
+{
+ struct socket *sk = ssk->socket;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ssk->tx_sa_lock, flags);
+
+ if (!ssk->tx_sa) {
+ sdp_prf1(sk, NULL, "SendSM for cancelled/finished SrcAvail");
+ goto out;
+ }
+
+ if (ssk->tx_sa->mseq > mseq_ack) {
+ sdp_dbg_data(sk, "SendSM arrived for old SrcAvail. "
+ "SendSM mseq_ack: 0x%x, SrcAvail mseq: 0x%x\n",
+ mseq_ack, ssk->tx_sa->mseq);
+ goto out;
+ }
+
+ sdp_dbg_data(sk, "Got SendSM - aborting SrcAvail\n");
+
+ ssk->tx_sa->abort_flags |= TX_SA_SENDSM;
+ cancel_delayed_work(&ssk->srcavail_cancel_work);
+
+ wake_up(sk->sk_sleep);
+ sdp_dbg_data(sk, "woke up sleepers\n");
+
+out:
+ spin_unlock_irqrestore(&ssk->tx_sa_lock, flags);
+}
+
+void sdp_handle_rdma_read_compl(struct sdp_sock *ssk, u32 mseq_ack,
+ u32 bytes_completed)
+{
+ struct socket *sk = ssk->socket;
+ unsigned long flags;
+
+ sdp_prf1(sk, NULL, "RdmaRdCompl ssk=%p tx_sa=%p", ssk, ssk->tx_sa);
+ sdp_dbg_data(sk, "RdmaRdCompl ssk=%p tx_sa=%p\n", ssk, ssk->tx_sa);
+
+ spin_lock_irqsave(&ssk->tx_sa_lock, flags);
+
+ BUG_ON(!ssk);
+
+ if (!ssk->tx_sa) {
+ sdp_dbg_data(sk, "Got RdmaRdCompl for aborted SrcAvail\n");
+ goto out;
+ }
+
+ if (ssk->tx_sa->mseq > mseq_ack) {
+ sdp_dbg_data(sk, "RdmaRdCompl arrived for old SrcAvail. "
+ "SendSM mseq_ack: 0x%x, SrcAvail mseq: 0x%x\n",
+ mseq_ack, ssk->tx_sa->mseq);
+ goto out;
+ }
+
+ ssk->tx_sa->bytes_acked += bytes_completed;
+
+ wake_up(sk->sk_sleep);
+ sdp_dbg_data(sk, "woke up sleepers\n");
+
+out:
+ spin_unlock_irqrestore(&ssk->tx_sa_lock, flags);
+ return;
+}
+
+static unsigned long sdp_get_max_memlockable_bytes(unsigned long offset)
+{
+ unsigned long avail;
+ unsigned long lock_limit;
+
+ if (capable(CAP_IPC_LOCK))
+ return ULONG_MAX;
+
+ lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+ avail = lock_limit - (current->mm->locked_vm << PAGE_SHIFT);
+
+ return avail - offset;
+}
+
+static int sdp_alloc_fmr(struct socket *sk, void *uaddr, size_t len,
+ struct ib_pool_fmr **_fmr, struct ib_umem **_umem)
+{
+ struct ib_pool_fmr *fmr;
+ struct ib_umem *umem;
+ struct ib_device *dev;
+ u64 *pages;
+ struct ib_umem_chunk *chunk;
+ int n, j, k;
+ int rc = 0;
+ unsigned long max_lockable_bytes;
+
+ if (unlikely(len > SDP_MAX_RDMA_READ_LEN)) {
+ sdp_dbg_data(sk, "len:0x%lx > FMR_SIZE: 0x%lx\n",
+ len, SDP_MAX_RDMA_READ_LEN);
+ len = SDP_MAX_RDMA_READ_LEN;
+ }
+
+ max_lockable_bytes = sdp_get_max_memlockable_bytes((unsigned long)uaddr & ~PAGE_MASK);
+ if (unlikely(len > max_lockable_bytes)) {
+ sdp_dbg_data(sk, "len:0x%lx > RLIMIT_MEMLOCK available: 0x%lx\n",
+ len, max_lockable_bytes);
+ len = max_lockable_bytes;
+ }
+
+ sdp_dbg_data(sk, "user buf: %p, len:0x%lx max_lockable_bytes: 0x%lx\n",
+ uaddr, len, max_lockable_bytes);
+
+ umem = ib_umem_get(&sdp_sk(sk)->context, (unsigned long)uaddr, len,
+ IB_ACCESS_REMOTE_WRITE, 0);
+
+ if (IS_ERR(umem)) {
+ rc = PTR_ERR(umem);
+ sdp_warn(sk, "Error doing umem_get 0x%lx bytes: %d\n", len, rc);
+ sdp_warn(sk, "RLIMIT_MEMLOCK: 0x%lx[cur] 0x%lx[max] CAP_IPC_LOCK: %d\n",
+ current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur,
+ current->signal->rlim[RLIMIT_MEMLOCK].rlim_max,
+ capable(CAP_IPC_LOCK));
+ goto err_umem_get;
+ }
+
+ sdp_dbg_data(sk, "umem->offset = 0x%x, length = 0x%lx\n",
+ umem->offset, umem->length);
+
+ pages = (u64 *) __get_free_page(GFP_KERNEL);
+ if (!pages)
+ goto err_pages_alloc;
+
+ n = 0;
+
+ dev = sdp_sk(sk)->ib_device;
+ list_for_each_entry(chunk, &umem->chunk_list, list) {
+ for (j = 0; j < chunk->nmap; ++j) {
+ len = ib_sg_dma_len(dev,
+ &chunk->page_list[j]) >> PAGE_SHIFT;
+
+ for (k = 0; k < len; ++k) {
+ pages[n++] = ib_sg_dma_address(dev,
+ &chunk->page_list[j]) +
+ umem->page_size * k;
+
+ }
+ }
+ }
+
+ fmr = ib_fmr_pool_map_phys(sdp_sk(sk)->sdp_dev->fmr_pool, pages, n, 0);
+ if (IS_ERR(fmr)) {
+ sdp_warn(sk, "Error allocating fmr: %ld\n", PTR_ERR(fmr));
+ goto err_fmr_alloc;
+ }
+
+ free_page((unsigned long) pages);
+
+ *_umem = umem;
+ *_fmr = fmr;
+
+ return 0;
+
+err_fmr_alloc:
+ free_page((unsigned long) pages);
+
+err_pages_alloc:
+ ib_umem_release(umem);
+
+err_umem_get:
+
+ return rc;
+}
+
+void sdp_free_fmr(struct socket *sk, struct ib_pool_fmr **_fmr, struct ib_umem **_umem)
+{
+ if (!sdp_sk(sk)->qp_active)
+ return;
+
+ ib_fmr_pool_unmap(*_fmr);
+ *_fmr = NULL;
+
+ ib_umem_release(*_umem);
+ *_umem = NULL;
+}
+
+static int sdp_post_rdma_read(struct socket *sk, struct rx_srcavail_state *rx_sa)
+{
+ struct sdp_sock *ssk = sdp_sk(sk);
+ struct ib_send_wr *bad_wr;
+ struct ib_send_wr wr = { NULL };
+ struct ib_sge sge;
+
+ wr.opcode = IB_WR_RDMA_READ;
+ wr.next = NULL;
+ wr.wr_id = SDP_OP_RDMA;
+ wr.wr.rdma.rkey = rx_sa->rkey;
+ wr.send_flags = 0;
+
+ ssk->tx_ring.rdma_inflight = rx_sa;
+
+ sge.addr = rx_sa->umem->offset;
+ sge.length = rx_sa->umem->length;
+ sge.lkey = rx_sa->fmr->fmr->lkey;
+
+ wr.wr.rdma.remote_addr = rx_sa->vaddr + rx_sa->used;
+ wr.num_sge = 1;
+ wr.sg_list = &sge;
+ rx_sa->busy++;
+
+ wr.send_flags = IB_SEND_SIGNALED;
+
+ return ib_post_send(ssk->qp, &wr, &bad_wr);
+}
+
+int sdp_rdma_to_iovec(struct socket *sk, struct iovec *iov, struct mbuf *mb,
+ unsigned long *used)
+{
+ struct sdp_sock *ssk = sdp_sk(sk);
+ struct rx_srcavail_state *rx_sa = RX_SRCAVAIL_STATE(mb);
+ int got_srcavail_cancel;
+ int rc = 0;
+ int len = *used;
+ int copied;
+
+ sdp_dbg_data(ssk->socket, "preparing RDMA read."
+ " len: 0x%x. buffer len: 0x%lx\n", len, iov->iov_len);
+
+ sock_hold(sk, SOCK_REF_RDMA_RD);
+
+ if (len > rx_sa->len) {
+ sdp_warn(sk, "len:0x%x > rx_sa->len: 0x%x\n", len, rx_sa->len);
+ WARN_ON(1);
+ len = rx_sa->len;
+ }
+
+ rc = sdp_alloc_fmr(sk, iov->iov_base, len, &rx_sa->fmr, &rx_sa->umem);
+ if (rc) {
+ sdp_warn(sk, "Error allocating fmr: %d\n", rc);
+ goto err_alloc_fmr;
+ }
+
+ rc = sdp_post_rdma_read(sk, rx_sa);
+ if (unlikely(rc)) {
+ sdp_warn(sk, "ib_post_send failed with status %d.\n", rc);
+ sdp_set_error(ssk->socket, -ECONNRESET);
+ wake_up(&ssk->wq);
+ goto err_post_send;
+ }
+
+ sdp_prf(sk, mb, "Finished posting(rc=%d), now to wait", rc);
+
+ got_srcavail_cancel = ssk->srcavail_cancel_mseq > rx_sa->mseq;
+
+ sdp_arm_tx_cq(sk);
+
+ sdp_wait_rdma_wr_finished(ssk);
+
+ sdp_prf(sk, mb, "Finished waiting(rc=%d)", rc);
+ if (!ssk->qp_active) {
+ sdp_dbg_data(sk, "QP destroyed during RDMA read\n");
+ rc = -EPIPE;
+ goto err_post_send;
+ }
+
+ copied = rx_sa->umem->length;
+
+ sdp_update_iov_used(sk, iov, copied);
+ rx_sa->used += copied;
+ atomic_add(copied, &ssk->rcv_nxt);
+ *used = copied;
+
+ ssk->tx_ring.rdma_inflight = NULL;
+
+err_post_send:
+ sdp_free_fmr(sk, &rx_sa->fmr, &rx_sa->umem);
+
+err_alloc_fmr:
+ if (rc && ssk->qp_active) {
+ sdp_warn(sk, "Couldn't do RDMA - post sendsm\n");
+ rx_sa->flags |= RX_SA_ABORTED;
+ }
+
+ sock_put(sk, SOCK_REF_RDMA_RD);
+
+ return rc;
+}
+
+static inline int wait_for_sndbuf(struct socket *sk, long *timeo_p)
+{
+ struct sdp_sock *ssk = sdp_sk(sk);
+ int ret = 0;
+ int credits_needed = 1;
+
+ sdp_dbg_data(sk, "Wait for mem\n");
+
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+
+ SDPSTATS_COUNTER_INC(send_wait_for_mem);
+
+ sdp_do_posts(ssk);
+
+ sdp_xmit_poll(ssk, 1);
+
+ ret = sdp_tx_wait_memory(ssk, timeo_p, &credits_needed);
+
+ return ret;
+}
+
+static int do_sdp_sendmsg_zcopy(struct socket *sk, struct tx_srcavail_state *tx_sa,
+ struct iovec *iov, long *timeo)
+{
+ struct sdp_sock *ssk = sdp_sk(sk);
+ int rc = 0;
+ unsigned long lock_flags;
+
+ rc = sdp_alloc_fmr(sk, iov->iov_base, iov->iov_len,
+ &tx_sa->fmr, &tx_sa->umem);
+ if (rc) {
+ sdp_warn(sk, "Error allocating fmr: %d\n", rc);
+ goto err_alloc_fmr;
+ }
+
+ if (tx_slots_free(ssk) == 0) {
+ rc = wait_for_sndbuf(sk, timeo);
+ if (rc) {
+ sdp_warn(sk, "Couldn't get send buffer\n");
+ goto err_no_tx_slots;
+ }
+ }
+
+ rc = sdp_post_srcavail(sk, tx_sa);
+ if (rc) {
+ sdp_dbg(sk, "Error posting SrcAvail\n");
+ goto err_abort_send;
+ }
+
+ rc = sdp_wait_rdmardcompl(ssk, timeo, 0);
+ if (unlikely(rc)) {
+ enum tx_sa_flag f = tx_sa->abort_flags;
+
+ if (f & TX_SA_SENDSM) {
+ sdp_dbg_data(sk, "Got SendSM. use SEND verb.\n");
+ } else if (f & TX_SA_ERROR) {
+ sdp_dbg_data(sk, "SrcAvail error completion\n");
+ sdp_reset(sk);
+ SDPSTATS_COUNTER_INC(zcopy_tx_error);
+ } else if (ssk->qp_active) {
+ sdp_post_srcavail_cancel(sk);
+
+ /* Wait for RdmaRdCompl/SendSM to
+ * finish the transaction */
+ *timeo = 2 * HZ;
+ sdp_dbg_data(sk, "Waiting for SendSM\n");
+ sdp_wait_rdmardcompl(ssk, timeo, 1);
+ sdp_dbg_data(sk, "finished waiting\n");
+
+ cancel_delayed_work(&ssk->srcavail_cancel_work);
+ } else {
+ sdp_dbg_data(sk, "QP was destroyed while waiting\n");
+ }
+ } else {
+ sdp_dbg_data(sk, "got RdmaRdCompl\n");
+ }
+
+ spin_lock_irqsave(&ssk->tx_sa_lock, lock_flags);
+ ssk->tx_sa = NULL;
+ spin_unlock_irqrestore(&ssk->tx_sa_lock, lock_flags);
+
+err_abort_send:
+ sdp_update_iov_used(sk, iov, tx_sa->bytes_acked);
+
+err_no_tx_slots:
+ sdp_free_fmr(sk, &tx_sa->fmr, &tx_sa->umem);
+
+err_alloc_fmr:
+ return rc;
+}
+
+int sdp_sendmsg_zcopy(struct kiocb *iocb, struct socket *sk, struct iovec *iov)
+{
+ struct sdp_sock *ssk = sdp_sk(sk);
+ int rc = 0;
+ long timeo;
+ struct tx_srcavail_state *tx_sa;
+ int offset;
+ size_t bytes_to_copy = 0;
+ int copied = 0;
+
+ sdp_dbg_data(sk, "Sending iov: %p, iov_len: 0x%lx\n",
+ iov->iov_base, iov->iov_len);
+ sdp_prf1(sk, NULL, "sdp_sendmsg_zcopy start");
+ if (ssk->rx_sa) {
+ sdp_dbg_data(sk, "Deadlock prevent: crossing SrcAvail\n");
+ return 0;
+ }
+
+ sock_hold(ssk->socket, SOCK_REF_ZCOPY);
+
+ SDPSTATS_COUNTER_INC(sendmsg_zcopy_segment);
+
+ timeo = SDP_SRCAVAIL_ADV_TIMEOUT ;
+
+ /* Ok commence sending. */
+ offset = (unsigned long)iov->iov_base & (PAGE_SIZE - 1);
+
+ tx_sa = kmalloc(sizeof(struct tx_srcavail_state), GFP_KERNEL);
+ if (!tx_sa) {
+ sdp_warn(sk, "Error allocating zcopy context\n");
+ rc = -EAGAIN; /* Buffer too big - fallback to bcopy */
+ goto err_alloc_tx_sa;
+ }
+
+ bytes_to_copy = iov->iov_len;
+ do {
+ tx_sa_reset(tx_sa);
+
+ rc = do_sdp_sendmsg_zcopy(sk, tx_sa, iov, &timeo);
+
+ if (iov->iov_len && iov->iov_len < sdp_zcopy_thresh) {
+ sdp_dbg_data(sk, "0x%lx bytes left, switching to bcopy\n",
+ iov->iov_len);
+ break;
+ }
+ } while (!rc && iov->iov_len > 0 && !tx_sa->abort_flags);
+
+ kfree(tx_sa);
+err_alloc_tx_sa:
+ copied = bytes_to_copy - iov->iov_len;
+
+ sdp_prf1(sk, NULL, "sdp_sendmsg_zcopy end rc: %d copied: %d", rc, copied);
+
+ sock_put(ssk->socket, SOCK_REF_ZCOPY);
+
+ if (rc < 0 && rc != -EAGAIN && rc != -ETIME)
+ return rc;
+
+ return copied;
+}
+
+void sdp_abort_srcavail(struct socket *sk)
+{
+ struct sdp_sock *ssk = sdp_sk(sk);
+ struct tx_srcavail_state *tx_sa = ssk->tx_sa;
+ unsigned long flags;
+
+ if (!tx_sa)
+ return;
+
+ cancel_delayed_work(&ssk->srcavail_cancel_work);
+ flush_scheduled_work();
+
+ spin_lock_irqsave(&ssk->tx_sa_lock, flags);
+
+ sdp_free_fmr(sk, &tx_sa->fmr, &tx_sa->umem);
+
+ ssk->tx_sa = NULL;
+
+ spin_unlock_irqrestore(&ssk->tx_sa_lock, flags);
+}
+
+void sdp_abort_rdma_read(struct socket *sk)
+{
+ struct sdp_sock *ssk = sdp_sk(sk);
+ struct rx_srcavail_state *rx_sa = ssk->rx_sa;
+
+ if (!rx_sa)
+ return;
+
+ sdp_free_fmr(sk, &rx_sa->fmr, &rx_sa->umem);
+
+ ssk->rx_sa = NULL;
+}
diff --git a/sys/ofed/drivers/infiniband/util/Kconfig b/sys/ofed/drivers/infiniband/util/Kconfig
new file mode 100644
index 0000000..5e98eaa
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/util/Kconfig
@@ -0,0 +1,6 @@
+config INFINIBAND_MADEYE
+ tristate "MAD debug viewer for InfiniBand"
+ depends on INFINIBAND
+ ---help---
+ Prints sent and received MADs on QP 0/1 for debugging.
+
diff --git a/sys/ofed/drivers/infiniband/util/Makefile b/sys/ofed/drivers/infiniband/util/Makefile
new file mode 100644
index 0000000..caf9471
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/util/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_INFINIBAND_MADEYE) += ib_madeye.o
+
+ib_madeye-y := madeye.o
diff --git a/sys/ofed/drivers/infiniband/util/madeye.c b/sys/ofed/drivers/infiniband/util/madeye.c
new file mode 100644
index 0000000..2c650a3
--- /dev/null
+++ b/sys/ofed/drivers/infiniband/util/madeye.c
@@ -0,0 +1,593 @@
+/*
+ * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 Voltaire Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directorY of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id$
+ */
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/err.h>
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_sa.h>
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("InfiniBand MAD viewer");
+MODULE_LICENSE("Dual BSD/GPL");
+
+static void madeye_remove_one(struct ib_device *device);
+static void madeye_add_one(struct ib_device *device);
+
+static struct ib_client madeye_client = {
+ .name = "madeye",
+ .add = madeye_add_one,
+ .remove = madeye_remove_one
+};
+
+struct madeye_port {
+ struct ib_mad_agent *smi_agent;
+ struct ib_mad_agent *gsi_agent;
+};
+
+static int smp = 1;
+static int gmp = 1;
+static int mgmt_class = 0;
+static int attr_id = 0;
+static int data = 0;
+
+module_param(smp, int, 0444);
+module_param(gmp, int, 0444);
+module_param(mgmt_class, int, 0444);
+module_param(attr_id, int, 0444);
+module_param(data, int, 0444);
+
+MODULE_PARM_DESC(smp, "Display all SMPs (default=1)");
+MODULE_PARM_DESC(gmp, "Display all GMPs (default=1)");
+MODULE_PARM_DESC(mgmt_class, "Display all MADs of specified class (default=0)");
+MODULE_PARM_DESC(attr_id, "Display add MADs of specified attribute ID (default=0)");
+MODULE_PARM_DESC(data, "Display data area of MADs (default=0)");
+
+static char * get_class_name(u8 mgmt_class)
+{
+ switch(mgmt_class) {
+ case IB_MGMT_CLASS_SUBN_LID_ROUTED:
+ return "LID routed SMP";
+ case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
+ return "Directed route SMP";
+ case IB_MGMT_CLASS_SUBN_ADM:
+ return "Subnet admin.";
+ case IB_MGMT_CLASS_PERF_MGMT:
+ return "Perf. mgmt.";
+ case IB_MGMT_CLASS_BM:
+ return "Baseboard mgmt.";
+ case IB_MGMT_CLASS_DEVICE_MGMT:
+ return "Device mgmt.";
+ case IB_MGMT_CLASS_CM:
+ return "Comm. mgmt.";
+ case IB_MGMT_CLASS_SNMP:
+ return "SNMP";
+ default:
+ return "Unknown vendor/application";
+ }
+}
+
+static char * get_method_name(u8 mgmt_class, u8 method)
+{
+ switch(method) {
+ case IB_MGMT_METHOD_GET:
+ return "Get";
+ case IB_MGMT_METHOD_SET:
+ return "Set";
+ case IB_MGMT_METHOD_GET_RESP:
+ return "Get response";
+ case IB_MGMT_METHOD_SEND:
+ return "Send";
+ case IB_MGMT_METHOD_SEND | IB_MGMT_METHOD_RESP:
+ return "Send response";
+ case IB_MGMT_METHOD_TRAP:
+ return "Trap";
+ case IB_MGMT_METHOD_REPORT:
+ return "Report";
+ case IB_MGMT_METHOD_REPORT_RESP:
+ return "Report response";
+ case IB_MGMT_METHOD_TRAP_REPRESS:
+ return "Trap repress";
+ default:
+ break;
+ }
+
+ switch (mgmt_class) {
+ case IB_MGMT_CLASS_SUBN_ADM:
+ switch (method) {
+ case IB_SA_METHOD_GET_TABLE:
+ return "Get table";
+ case IB_SA_METHOD_GET_TABLE_RESP:
+ return "Get table response";
+ case IB_SA_METHOD_DELETE:
+ return "Delete";
+ case IB_SA_METHOD_DELETE_RESP:
+ return "Delete response";
+ case IB_SA_METHOD_GET_MULTI:
+ return "Get Multi";
+ case IB_SA_METHOD_GET_MULTI_RESP:
+ return "Get Multi response";
+ case IB_SA_METHOD_GET_TRACE_TBL:
+ return "Get Trace Table response";
+ default:
+ break;
+ }
+ default:
+ break;
+ }
+
+ return "Unknown";
+}
+
+static void print_status_details(u16 status)
+{
+ if (status & 0x0001)
+ printk(" busy\n");
+ if (status & 0x0002)
+ printk(" redirection required\n");
+ switch((status & 0x001C) >> 2) {
+ case 1:
+ printk(" bad version\n");
+ break;
+ case 2:
+ printk(" method not supported\n");
+ break;
+ case 3:
+ printk(" method/attribute combo not supported\n");
+ break;
+ case 7:
+ printk(" invalid attribute/modifier value\n");
+ break;
+ }
+}
+
+static char * get_sa_attr(__be16 attr)
+{
+ switch(attr) {
+ case IB_SA_ATTR_CLASS_PORTINFO:
+ return "Class Port Info";
+ case IB_SA_ATTR_NOTICE:
+ return "Notice";
+ case IB_SA_ATTR_INFORM_INFO:
+ return "Inform Info";
+ case IB_SA_ATTR_NODE_REC:
+ return "Node Record";
+ case IB_SA_ATTR_PORT_INFO_REC:
+ return "PortInfo Record";
+ case IB_SA_ATTR_SL2VL_REC:
+ return "SL to VL Record";
+ case IB_SA_ATTR_SWITCH_REC:
+ return "Switch Record";
+ case IB_SA_ATTR_LINEAR_FDB_REC:
+ return "Linear FDB Record";
+ case IB_SA_ATTR_RANDOM_FDB_REC:
+ return "Random FDB Record";
+ case IB_SA_ATTR_MCAST_FDB_REC:
+ return "Multicast FDB Record";
+ case IB_SA_ATTR_SM_INFO_REC:
+ return "SM Info Record";
+ case IB_SA_ATTR_LINK_REC:
+ return "Link Record";
+ case IB_SA_ATTR_GUID_INFO_REC:
+ return "Guid Info Record";
+ case IB_SA_ATTR_SERVICE_REC:
+ return "Service Record";
+ case IB_SA_ATTR_PARTITION_REC:
+ return "Partition Record";
+ case IB_SA_ATTR_PATH_REC:
+ return "Path Record";
+ case IB_SA_ATTR_VL_ARB_REC:
+ return "VL Arb Record";
+ case IB_SA_ATTR_MC_MEMBER_REC:
+ return "MC Member Record";
+ case IB_SA_ATTR_TRACE_REC:
+ return "Trace Record";
+ case IB_SA_ATTR_MULTI_PATH_REC:
+ return "Multi Path Record";
+ case IB_SA_ATTR_SERVICE_ASSOC_REC:
+ return "Service Assoc Record";
+ case IB_SA_ATTR_INFORM_INFO_REC:
+ return "Inform Info Record";
+ default:
+ return "";
+ }
+}
+
+static void print_mad_hdr(struct ib_mad_hdr *mad_hdr)
+{
+ printk("MAD version....0x%01x\n", mad_hdr->base_version);
+ printk("Class..........0x%01x (%s)\n", mad_hdr->mgmt_class,
+ get_class_name(mad_hdr->mgmt_class));
+ printk("Class version..0x%01x\n", mad_hdr->class_version);
+ printk("Method.........0x%01x (%s)\n", mad_hdr->method,
+ get_method_name(mad_hdr->mgmt_class, mad_hdr->method));
+ printk("Status.........0x%02x\n", be16_to_cpu(mad_hdr->status));
+ if (mad_hdr->status)
+ print_status_details(be16_to_cpu(mad_hdr->status));
+ printk("Class specific.0x%02x\n", be16_to_cpu(mad_hdr->class_specific));
+ printk("Trans ID.......0x%llx\n",
+ (unsigned long long)be64_to_cpu(mad_hdr->tid));
+ if (mad_hdr->mgmt_class == IB_MGMT_CLASS_SUBN_ADM)
+ printk("Attr ID........0x%02x (%s)\n",
+ be16_to_cpu(mad_hdr->attr_id),
+ get_sa_attr(be16_to_cpu(mad_hdr->attr_id)));
+ else
+ printk("Attr ID........0x%02x\n",
+ be16_to_cpu(mad_hdr->attr_id));
+ printk("Attr modifier..0x%04x\n", be32_to_cpu(mad_hdr->attr_mod));
+}
+
+static char * get_rmpp_type(u8 rmpp_type)
+{
+ switch (rmpp_type) {
+ case IB_MGMT_RMPP_TYPE_DATA:
+ return "Data";
+ case IB_MGMT_RMPP_TYPE_ACK:
+ return "Ack";
+ case IB_MGMT_RMPP_TYPE_STOP:
+ return "Stop";
+ case IB_MGMT_RMPP_TYPE_ABORT:
+ return "Abort";
+ default:
+ return "Unknown";
+ }
+}
+
+static char * get_rmpp_flags(u8 rmpp_flags)
+{
+ if (rmpp_flags & IB_MGMT_RMPP_FLAG_ACTIVE)
+ if (rmpp_flags & IB_MGMT_RMPP_FLAG_FIRST)
+ if (rmpp_flags & IB_MGMT_RMPP_FLAG_LAST)
+ return "Active - First & Last";
+ else
+ return "Active - First";
+ else
+ if (rmpp_flags & IB_MGMT_RMPP_FLAG_LAST)
+ return "Active - Last";
+ else
+ return "Active";
+ else
+ return "Inactive";
+}
+
+static void print_rmpp_hdr(struct ib_rmpp_hdr *rmpp_hdr)
+{
+ printk("RMPP version...0x%01x\n", rmpp_hdr->rmpp_version);
+ printk("RMPP type......0x%01x (%s)\n", rmpp_hdr->rmpp_type,
+ get_rmpp_type(rmpp_hdr->rmpp_type));
+ printk("RMPP RRespTime.0x%01x\n", ib_get_rmpp_resptime(rmpp_hdr));
+ printk("RMPP flags.....0x%01x (%s)\n", ib_get_rmpp_flags(rmpp_hdr),
+ get_rmpp_flags(ib_get_rmpp_flags(rmpp_hdr)));
+ printk("RMPP status....0x%01x\n", rmpp_hdr->rmpp_status);
+ printk("Seg number.....0x%04x\n", be32_to_cpu(rmpp_hdr->seg_num));
+ switch (rmpp_hdr->rmpp_type) {
+ case IB_MGMT_RMPP_TYPE_DATA:
+ printk("Payload len....0x%04x\n",
+ be32_to_cpu(rmpp_hdr->paylen_newwin));
+ break;
+ case IB_MGMT_RMPP_TYPE_ACK:
+ printk("New window.....0x%04x\n",
+ be32_to_cpu(rmpp_hdr->paylen_newwin));
+ break;
+ default:
+ printk("Data 2.........0x%04x\n",
+ be32_to_cpu(rmpp_hdr->paylen_newwin));
+ break;
+ }
+}
+
+static char * get_smp_attr(__be16 attr)
+{
+ switch (attr) {
+ case IB_SMP_ATTR_NOTICE:
+ return "notice";
+ case IB_SMP_ATTR_NODE_DESC:
+ return "node description";
+ case IB_SMP_ATTR_NODE_INFO:
+ return "node info";
+ case IB_SMP_ATTR_SWITCH_INFO:
+ return "switch info";
+ case IB_SMP_ATTR_GUID_INFO:
+ return "GUID info";
+ case IB_SMP_ATTR_PORT_INFO:
+ return "port info";
+ case IB_SMP_ATTR_PKEY_TABLE:
+ return "pkey table";
+ case IB_SMP_ATTR_SL_TO_VL_TABLE:
+ return "SL to VL table";
+ case IB_SMP_ATTR_VL_ARB_TABLE:
+ return "VL arbitration table";
+ case IB_SMP_ATTR_LINEAR_FORWARD_TABLE:
+ return "linear forwarding table";
+ case IB_SMP_ATTR_RANDOM_FORWARD_TABLE:
+ return "random forward table";
+ case IB_SMP_ATTR_MCAST_FORWARD_TABLE:
+ return "multicast forward table";
+ case IB_SMP_ATTR_SM_INFO:
+ return "SM info";
+ case IB_SMP_ATTR_VENDOR_DIAG:
+ return "vendor diags";
+ case IB_SMP_ATTR_LED_INFO:
+ return "LED info";
+ default:
+ return "";
+ }
+}
+
+static void print_smp(struct ib_smp *smp)
+{
+ int i;
+
+ printk("MAD version....0x%01x\n", smp->base_version);
+ printk("Class..........0x%01x (%s)\n", smp->mgmt_class,
+ get_class_name(smp->mgmt_class));
+ printk("Class version..0x%01x\n", smp->class_version);
+ printk("Method.........0x%01x (%s)\n", smp->method,
+ get_method_name(smp->mgmt_class, smp->method));
+ printk("Status.........0x%02x\n", be16_to_cpu(smp->status));
+ if (smp->status)
+ print_status_details(be16_to_cpu(smp->status));
+ printk("Hop pointer....0x%01x\n", smp->hop_ptr);
+ printk("Hop counter....0x%01x\n", smp->hop_cnt);
+ printk("Trans ID.......0x%llx\n",
+ (unsigned long long)be64_to_cpu(smp->tid));
+ printk("Attr ID........0x%02x (%s)\n", be16_to_cpu(smp->attr_id),
+ get_smp_attr(smp->attr_id));
+ printk("Attr modifier..0x%04x\n", be32_to_cpu(smp->attr_mod));
+
+ printk("Mkey...........0x%llx\n",
+ (unsigned long long)be64_to_cpu(smp->mkey));
+ printk("DR SLID........0x%02x\n", be16_to_cpu(smp->dr_slid));
+ printk("DR DLID........0x%02x", be16_to_cpu(smp->dr_dlid));
+
+ if (data) {
+ for (i = 0; i < IB_SMP_DATA_SIZE; i++) {
+ if (i % 16 == 0)
+ printk("\nSMP Data.......");
+ printk("%01x ", smp->data[i]);
+ }
+ for (i = 0; i < IB_SMP_MAX_PATH_HOPS; i++) {
+ if (i % 16 == 0)
+ printk("\nInitial path...");
+ printk("%01x ", smp->initial_path[i]);
+ }
+ for (i = 0; i < IB_SMP_MAX_PATH_HOPS; i++) {
+ if (i % 16 == 0)
+ printk("\nReturn path....");
+ printk("%01x ", smp->return_path[i]);
+ }
+ }
+ printk("\n");
+}
+
+static void snoop_smi_handler(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_buf *send_buf,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ struct ib_mad_hdr *hdr = send_buf->mad;
+
+ if (!smp && hdr->mgmt_class != mgmt_class)
+ return;
+ if (attr_id && be16_to_cpu(hdr->attr_id) != attr_id)
+ return;
+
+ printk("Madeye:sent SMP\n");
+ print_smp(send_buf->mad);
+}
+
+static void recv_smi_handler(struct ib_mad_agent *mad_agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ if (!smp && mad_recv_wc->recv_buf.mad->mad_hdr.mgmt_class != mgmt_class)
+ return;
+ if (attr_id && be16_to_cpu(mad_recv_wc->recv_buf.mad->mad_hdr.attr_id) != attr_id)
+ return;
+
+ printk("Madeye:recv SMP\n");
+ print_smp((struct ib_smp *)&mad_recv_wc->recv_buf.mad->mad_hdr);
+}
+
+static int is_rmpp_mad(struct ib_mad_hdr *mad_hdr)
+{
+ if (mad_hdr->mgmt_class == IB_MGMT_CLASS_SUBN_ADM) {
+ switch (mad_hdr->method) {
+ case IB_SA_METHOD_GET_TABLE:
+ case IB_SA_METHOD_GET_TABLE_RESP:
+ case IB_SA_METHOD_GET_MULTI_RESP:
+ return 1;
+ default:
+ break;
+ }
+ } else if ((mad_hdr->mgmt_class >= IB_MGMT_CLASS_VENDOR_RANGE2_START) &&
+ (mad_hdr->mgmt_class <= IB_MGMT_CLASS_VENDOR_RANGE2_END))
+ return 1;
+
+ return 0;
+}
+
+static void snoop_gsi_handler(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_buf *send_buf,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ struct ib_mad_hdr *hdr = send_buf->mad;
+
+ if (!gmp && hdr->mgmt_class != mgmt_class)
+ return;
+ if (attr_id && be16_to_cpu(hdr->attr_id) != attr_id)
+ return;
+
+ printk("Madeye:sent GMP\n");
+ print_mad_hdr(hdr);
+
+ if (is_rmpp_mad(hdr))
+ print_rmpp_hdr(&((struct ib_rmpp_mad *) hdr)->rmpp_hdr);
+}
+
+static void recv_gsi_handler(struct ib_mad_agent *mad_agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_mad_hdr *hdr = &mad_recv_wc->recv_buf.mad->mad_hdr;
+ struct ib_rmpp_mad *mad = NULL;
+ struct ib_sa_mad *sa_mad;
+ struct ib_vendor_mad *vendor_mad;
+ u8 *mad_data;
+ int i, j;
+
+ if (!gmp && hdr->mgmt_class != mgmt_class)
+ return;
+ if (attr_id && be16_to_cpu(mad_recv_wc->recv_buf.mad->mad_hdr.attr_id) != attr_id)
+ return;
+
+ printk("Madeye:recv GMP\n");
+ print_mad_hdr(hdr);
+
+ if (is_rmpp_mad(hdr)) {
+ mad = (struct ib_rmpp_mad *) hdr;
+ print_rmpp_hdr(&mad->rmpp_hdr);
+ }
+
+ if (data) {
+ if (hdr->mgmt_class == IB_MGMT_CLASS_SUBN_ADM) {
+ j = IB_MGMT_SA_DATA;
+ /* Display SA header */
+ if (is_rmpp_mad(hdr) &&
+ mad->rmpp_hdr.rmpp_type != IB_MGMT_RMPP_TYPE_DATA)
+ return;
+ sa_mad = (struct ib_sa_mad *)
+ &mad_recv_wc->recv_buf.mad;
+ mad_data = sa_mad->data;
+ } else {
+ if (is_rmpp_mad(hdr)) {
+ j = IB_MGMT_VENDOR_DATA;
+ /* Display OUI */
+ vendor_mad = (struct ib_vendor_mad *)
+ &mad_recv_wc->recv_buf.mad;
+ printk("Vendor OUI......%01x %01x %01x\n",
+ vendor_mad->oui[0],
+ vendor_mad->oui[1],
+ vendor_mad->oui[2]);
+ mad_data = vendor_mad->data;
+ } else {
+ j = IB_MGMT_MAD_DATA;
+ mad_data = mad_recv_wc->recv_buf.mad->data;
+ }
+ }
+ for (i = 0; i < j; i++) {
+ if (i % 16 == 0)
+ printk("\nData...........");
+ printk("%01x ", mad_data[i]);
+ }
+ printk("\n");
+ }
+}
+
+static void madeye_add_one(struct ib_device *device)
+{
+ struct madeye_port *port;
+ int reg_flags;
+ u8 i, s, e;
+
+ if (device->node_type == RDMA_NODE_IB_SWITCH) {
+ s = 0;
+ e = 0;
+ } else {
+ s = 1;
+ e = device->phys_port_cnt;
+ }
+
+ port = kmalloc(sizeof *port * (e - s + 1), GFP_KERNEL);
+ if (!port)
+ goto out;
+
+ reg_flags = IB_MAD_SNOOP_SEND_COMPLETIONS | IB_MAD_SNOOP_RECVS;
+ for (i = 0; i <= e - s; i++) {
+ port[i].smi_agent = ib_register_mad_snoop(device, i + s,
+ IB_QPT_SMI,
+ reg_flags,
+ snoop_smi_handler,
+ recv_smi_handler,
+ &port[i]);
+ port[i].gsi_agent = ib_register_mad_snoop(device, i + s,
+ IB_QPT_GSI,
+ reg_flags,
+ snoop_gsi_handler,
+ recv_gsi_handler,
+ &port[i]);
+ }
+
+out:
+ ib_set_client_data(device, &madeye_client, port);
+}
+
+static void madeye_remove_one(struct ib_device *device)
+{
+ struct madeye_port *port;
+ int i, s, e;
+
+ port = (struct madeye_port *)
+ ib_get_client_data(device, &madeye_client);
+ if (!port)
+ return;
+
+ if (device->node_type == RDMA_NODE_IB_SWITCH) {
+ s = 0;
+ e = 0;
+ } else {
+ s = 1;
+ e = device->phys_port_cnt;
+ }
+
+ for (i = 0; i <= e - s; i++) {
+ if (!IS_ERR(port[i].smi_agent))
+ ib_unregister_mad_agent(port[i].smi_agent);
+ if (!IS_ERR(port[i].gsi_agent))
+ ib_unregister_mad_agent(port[i].gsi_agent);
+ }
+ kfree(port);
+}
+
+static int __init ib_madeye_init(void)
+{
+ return ib_register_client(&madeye_client);
+}
+
+static void __exit ib_madeye_cleanup(void)
+{
+ ib_unregister_client(&madeye_client);
+}
+
+module_init(ib_madeye_init);
+module_exit(ib_madeye_cleanup);
diff --git a/sys/ofed/drivers/net/mlx4/Makefile b/sys/ofed/drivers/net/mlx4/Makefile
new file mode 100644
index 0000000..b9d2e7e
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/Makefile
@@ -0,0 +1,9 @@
+obj-$(CONFIG_MLX4_CORE) += mlx4_core.o
+
+mlx4_core-y := alloc.o catas.o cmd.o cq.o eq.o fw.o icm.o intf.o main.o mcg.o \
+ mr.o pd.o port.o profile.o qp.o reset.o sense.o srq.o xrcd.o
+
+obj-$(CONFIG_MLX4_EN) += mlx4_en.o
+
+mlx4_en-y := en_main.o en_tx.o en_rx.o en_ethtool.o en_port.o en_cq.o \
+ en_resources.o en_netdev.o en_frag.o en_selftest.o
diff --git a/sys/ofed/drivers/net/mlx4/alloc.c b/sys/ofed/drivers/net/mlx4/alloc.c
new file mode 100644
index 0000000..c22791a
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/alloc.c
@@ -0,0 +1,453 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/bitmap.h>
+#include <linux/dma-mapping.h>
+#include <linux/vmalloc.h>
+
+#include "mlx4.h"
+
+u32 mlx4_bitmap_alloc(struct mlx4_bitmap *bitmap)
+{
+ u32 obj;
+
+ spin_lock(&bitmap->lock);
+
+ obj = find_next_zero_bit(bitmap->table, bitmap->max, bitmap->last);
+ if (obj >= bitmap->max) {
+ bitmap->top = (bitmap->top + bitmap->max + bitmap->reserved_top)
+ & bitmap->mask;
+ obj = find_first_zero_bit(bitmap->table, bitmap->max);
+ }
+
+ if (obj < bitmap->max) {
+ set_bit(obj, bitmap->table);
+ bitmap->last = (obj + 1);
+ if (bitmap->last == bitmap->max)
+ bitmap->last = 0;
+ obj |= bitmap->top;
+ } else
+ obj = -1;
+
+ if (obj != -1)
+ --bitmap->avail;
+
+ spin_unlock(&bitmap->lock);
+
+ return obj;
+}
+
+void mlx4_bitmap_free(struct mlx4_bitmap *bitmap, u32 obj)
+{
+ mlx4_bitmap_free_range(bitmap, obj, 1);
+}
+
+static unsigned long find_aligned_range(unsigned long *bitmap,
+ u32 start, u32 nbits,
+ int len, int align)
+{
+ unsigned long end, i;
+
+again:
+ start = ALIGN(start, align);
+
+ while ((start < nbits) && test_bit(start, bitmap))
+ start += align;
+
+ if (start >= nbits)
+ return -1;
+
+ end = start+len;
+ if (end > nbits)
+ return -1;
+
+ for (i = start + 1; i < end; i++) {
+ if (test_bit(i, bitmap)) {
+ start = i + 1;
+ goto again;
+ }
+ }
+
+ return start;
+}
+
+u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt, int align)
+{
+ u32 obj, i;
+
+ if (likely(cnt == 1 && align == 1))
+ return mlx4_bitmap_alloc(bitmap);
+
+ spin_lock(&bitmap->lock);
+
+ obj = find_aligned_range(bitmap->table, bitmap->last,
+ bitmap->max, cnt, align);
+ if (obj >= bitmap->max) {
+ bitmap->top = (bitmap->top + bitmap->max + bitmap->reserved_top)
+ & bitmap->mask;
+ obj = find_aligned_range(bitmap->table, 0, bitmap->max,
+ cnt, align);
+ }
+
+ if (obj < bitmap->max) {
+ for (i = 0; i < cnt; i++)
+ set_bit(obj + i, bitmap->table);
+ if (obj == bitmap->last) {
+ bitmap->last = (obj + cnt);
+ if (bitmap->last >= bitmap->max)
+ bitmap->last = 0;
+ }
+ obj |= bitmap->top;
+ } else
+ obj = -1;
+
+ if (obj != -1)
+ bitmap->avail -= cnt;
+
+ spin_unlock(&bitmap->lock);
+
+ return obj;
+}
+
+u32 mlx4_bitmap_avail(struct mlx4_bitmap *bitmap)
+{
+ return bitmap->avail;
+}
+
+void mlx4_bitmap_free_range(struct mlx4_bitmap *bitmap, u32 obj, int cnt)
+{
+ u32 i;
+
+ obj &= bitmap->max + bitmap->reserved_top - 1;
+
+ spin_lock(&bitmap->lock);
+ for (i = 0; i < cnt; i++)
+ clear_bit(obj + i, bitmap->table);
+ bitmap->last = min(bitmap->last, obj);
+ bitmap->top = (bitmap->top + bitmap->max + bitmap->reserved_top)
+ & bitmap->mask;
+ bitmap->avail += cnt;
+ spin_unlock(&bitmap->lock);
+}
+
+int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, u32 num, u32 mask,
+ u32 reserved_bot, u32 reserved_top)
+{
+ int i;
+
+ /* num must be a power of 2 */
+ if (num != roundup_pow_of_two(num))
+ return -EINVAL;
+
+ bitmap->last = 0;
+ bitmap->top = 0;
+ bitmap->max = num - reserved_top;
+ bitmap->mask = mask;
+ bitmap->reserved_top = reserved_top;
+ bitmap->avail = num - reserved_top - reserved_bot;
+ spin_lock_init(&bitmap->lock);
+ bitmap->table = kzalloc(BITS_TO_LONGS(bitmap->max) *
+ sizeof (long), GFP_KERNEL);
+ if (!bitmap->table)
+ return -ENOMEM;
+
+ for (i = 0; i < reserved_bot; ++i)
+ set_bit(i, bitmap->table);
+
+ return 0;
+}
+
+void mlx4_bitmap_cleanup(struct mlx4_bitmap *bitmap)
+{
+ kfree(bitmap->table);
+}
+
+/*
+ * Handling for queue buffers -- we allocate a bunch of memory and
+ * register it in a memory region at HCA virtual address 0. If the
+ * requested size is > max_direct, we split the allocation into
+ * multiple pages, so we don't require too much contiguous memory.
+ */
+
+int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct,
+ struct mlx4_buf *buf)
+{
+ dma_addr_t t;
+
+ buf->direct.buf = NULL;
+ if (size <= max_direct) {
+ buf->nbufs = 1;
+ buf->npages = 1;
+ buf->page_shift = get_order(size) + PAGE_SHIFT;
+ buf->direct.buf = dma_alloc_coherent(&dev->pdev->dev,
+ size, &t, GFP_KERNEL);
+ if (!buf->direct.buf)
+ return -ENOMEM;
+
+ buf->direct.map = t;
+
+ while (t & ((1 << buf->page_shift) - 1)) {
+ --buf->page_shift;
+ buf->npages *= 2;
+ }
+
+ memset(buf->direct.buf, 0, size);
+ } else {
+ int i;
+
+ buf->direct.buf = NULL;
+ buf->direct.map = 0;
+ buf->nbufs = (size + PAGE_SIZE - 1) / PAGE_SIZE;
+ buf->npages = buf->nbufs;
+ buf->page_shift = PAGE_SHIFT;
+ buf->page_list = kzalloc(buf->nbufs * sizeof *buf->page_list,
+ GFP_KERNEL);
+ if (!buf->page_list)
+ return -ENOMEM;
+
+ for (i = 0; i < buf->nbufs; ++i) {
+ buf->page_list[i].buf =
+ dma_alloc_coherent(&dev->pdev->dev, PAGE_SIZE,
+ &t, GFP_KERNEL);
+ if (!buf->page_list[i].buf)
+ goto err_free;
+
+ buf->page_list[i].map = t;
+
+ memset(buf->page_list[i].buf, 0, PAGE_SIZE);
+ }
+
+ if (BITS_PER_LONG == 64) {
+ struct page **pages;
+ pages = kmalloc(sizeof *pages * buf->nbufs, GFP_KERNEL);
+ if (!pages)
+ goto err_free;
+ for (i = 0; i < buf->nbufs; ++i)
+ pages[i] = virt_to_page(buf->page_list[i].buf);
+ buf->direct.buf = vmap(pages, buf->nbufs, VM_MAP, PAGE_KERNEL);
+ kfree(pages);
+ if (!buf->direct.buf)
+ goto err_free;
+ }
+ }
+
+ return 0;
+
+err_free:
+ mlx4_buf_free(dev, size, buf);
+
+ return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(mlx4_buf_alloc);
+
+void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf)
+{
+ int i;
+
+ if (buf->nbufs == 1)
+ dma_free_coherent(&dev->pdev->dev, size, buf->direct.buf,
+ buf->direct.map);
+ else {
+ if (BITS_PER_LONG == 64 && buf->direct.buf)
+ vunmap(buf->direct.buf);
+
+ for (i = 0; i < buf->nbufs; ++i)
+ if (buf->page_list[i].buf)
+ dma_free_coherent(&dev->pdev->dev, PAGE_SIZE,
+ buf->page_list[i].buf,
+ buf->page_list[i].map);
+ kfree(buf->page_list);
+ }
+ buf->direct.buf = NULL;
+}
+EXPORT_SYMBOL_GPL(mlx4_buf_free);
+
+static struct mlx4_db_pgdir *mlx4_alloc_db_pgdir(struct device *dma_device)
+{
+ struct mlx4_db_pgdir *pgdir;
+
+ pgdir = kzalloc(sizeof *pgdir, GFP_KERNEL);
+ if (!pgdir)
+ return NULL;
+
+ bitmap_fill(pgdir->order1, MLX4_DB_PER_PAGE / 2);
+ pgdir->bits[0] = pgdir->order0;
+ pgdir->bits[1] = pgdir->order1;
+ pgdir->db_page = dma_alloc_coherent(dma_device, PAGE_SIZE,
+ &pgdir->db_dma, GFP_KERNEL);
+ if (!pgdir->db_page) {
+ kfree(pgdir);
+ return NULL;
+ }
+
+ return pgdir;
+}
+
+static int mlx4_alloc_db_from_pgdir(struct mlx4_db_pgdir *pgdir,
+ struct mlx4_db *db, int order)
+{
+ int o;
+ int i;
+
+ for (o = order; o <= 1; ++o) {
+ i = find_first_bit(pgdir->bits[o], MLX4_DB_PER_PAGE >> o);
+ if (i < MLX4_DB_PER_PAGE >> o)
+ goto found;
+ }
+
+ return -ENOMEM;
+
+found:
+ clear_bit(i, pgdir->bits[o]);
+
+ i <<= o;
+
+ if (o > order)
+ set_bit(i ^ 1, pgdir->bits[order]);
+
+ db->u.pgdir = pgdir;
+ db->index = i;
+ db->db = pgdir->db_page + db->index;
+ db->dma = pgdir->db_dma + db->index * 4;
+ db->order = order;
+
+ return 0;
+}
+
+int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, int order)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ struct mlx4_db_pgdir *pgdir;
+ int ret = 0;
+
+ mutex_lock(&priv->pgdir_mutex);
+
+ list_for_each_entry(pgdir, &priv->pgdir_list, list)
+ if (!mlx4_alloc_db_from_pgdir(pgdir, db, order))
+ goto out;
+
+ pgdir = mlx4_alloc_db_pgdir(&(dev->pdev->dev));
+ if (!pgdir) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ list_add(&pgdir->list, &priv->pgdir_list);
+
+ /* This should never fail -- we just allocated an empty page: */
+ WARN_ON(mlx4_alloc_db_from_pgdir(pgdir, db, order));
+
+out:
+ mutex_unlock(&priv->pgdir_mutex);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(mlx4_db_alloc);
+
+void mlx4_db_free(struct mlx4_dev *dev, struct mlx4_db *db)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ int o;
+ int i;
+
+ mutex_lock(&priv->pgdir_mutex);
+
+ o = db->order;
+ i = db->index;
+
+ if (db->order == 0 && test_bit(i ^ 1, db->u.pgdir->order0)) {
+ clear_bit(i ^ 1, db->u.pgdir->order0);
+ ++o;
+ }
+ i >>= o;
+ set_bit(i, db->u.pgdir->bits[o]);
+
+ if (bitmap_full(db->u.pgdir->order1, MLX4_DB_PER_PAGE / 2)) {
+ dma_free_coherent(&(dev->pdev->dev), PAGE_SIZE,
+ db->u.pgdir->db_page, db->u.pgdir->db_dma);
+ list_del(&db->u.pgdir->list);
+ kfree(db->u.pgdir);
+ }
+
+ mutex_unlock(&priv->pgdir_mutex);
+}
+EXPORT_SYMBOL_GPL(mlx4_db_free);
+
+int mlx4_alloc_hwq_res(struct mlx4_dev *dev, struct mlx4_hwq_resources *wqres,
+ int size, int max_direct)
+{
+ int err;
+
+ err = mlx4_db_alloc(dev, &wqres->db, 1);
+ if (err)
+ return err;
+
+ *wqres->db.db = 0;
+
+ err = mlx4_buf_alloc(dev, size, max_direct, &wqres->buf);
+ if (err)
+ goto err_db;
+
+ err = mlx4_mtt_init(dev, wqres->buf.npages, wqres->buf.page_shift,
+ &wqres->mtt);
+ if (err)
+ goto err_buf;
+
+ err = mlx4_buf_write_mtt(dev, &wqres->mtt, &wqres->buf);
+ if (err)
+ goto err_mtt;
+
+ return 0;
+
+err_mtt:
+ mlx4_mtt_cleanup(dev, &wqres->mtt);
+err_buf:
+ mlx4_buf_free(dev, size, &wqres->buf);
+err_db:
+ mlx4_db_free(dev, &wqres->db);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_alloc_hwq_res);
+
+void mlx4_free_hwq_res(struct mlx4_dev *dev, struct mlx4_hwq_resources *wqres,
+ int size)
+{
+ mlx4_mtt_cleanup(dev, &wqres->mtt);
+ mlx4_buf_free(dev, size, &wqres->buf);
+ mlx4_db_free(dev, &wqres->db);
+}
+EXPORT_SYMBOL_GPL(mlx4_free_hwq_res);
diff --git a/sys/ofed/drivers/net/mlx4/catas.c b/sys/ofed/drivers/net/mlx4/catas.c
new file mode 100644
index 0000000..334aad9
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/catas.c
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/workqueue.h>
+
+#include "mlx4.h"
+
+#define MLX4_CATAS_POLL_INTERVAL (5 * HZ)
+
+static DEFINE_SPINLOCK(catas_lock);
+
+static LIST_HEAD(catas_list);
+static struct work_struct catas_work;
+
+static int internal_err_reset = 1;
+module_param(internal_err_reset, int, 0644);
+MODULE_PARM_DESC(internal_err_reset,
+ "Reset device on internal errors if non-zero (default 1)");
+
+static void dump_err_buf(struct mlx4_dev *dev)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+
+ int i;
+
+ mlx4_err(dev, "Internal error detected:\n");
+ for (i = 0; i < priv->fw.catas_size; ++i)
+ mlx4_err(dev, " buf[%02x]: %08x\n",
+ i, swab32(readl(priv->catas_err.map + i)));
+}
+
+static void poll_catas(unsigned long dev_ptr)
+{
+ struct mlx4_dev *dev = (struct mlx4_dev *) dev_ptr;
+ struct mlx4_priv *priv = mlx4_priv(dev);
+
+ if (readl(priv->catas_err.map)) {
+ dump_err_buf(dev);
+
+ mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0);
+
+ if (internal_err_reset) {
+ spin_lock(&catas_lock);
+ list_add(&priv->catas_err.list, &catas_list);
+ spin_unlock(&catas_lock);
+
+ queue_work(mlx4_wq, &catas_work);
+ }
+ } else
+ mod_timer(&priv->catas_err.timer,
+ round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL));
+}
+
+static void catas_reset(struct work_struct *work)
+{
+ struct mlx4_priv *priv, *tmppriv;
+ struct mlx4_dev *dev;
+
+ LIST_HEAD(tlist);
+ int ret;
+
+ if (!mutex_trylock(&drv_mutex))
+ return;
+
+ spin_lock_irq(&catas_lock);
+ list_splice_init(&catas_list, &tlist);
+ spin_unlock_irq(&catas_lock);
+
+ list_for_each_entry_safe(priv, tmppriv, &tlist, catas_err.list) {
+ struct pci_dev *pdev = priv->dev.pdev;
+
+ ret = mlx4_restart_one(priv->dev.pdev);
+ /* 'priv' now is not valid */
+ if (ret)
+ printk(KERN_ERR "mlx4 %s: Reset failed (%d)\n",
+ pci_name(pdev), ret);
+ else {
+ dev = pci_get_drvdata(pdev);
+ mlx4_dbg(dev, "Reset succeeded\n");
+ }
+ }
+ mutex_unlock(&drv_mutex);
+}
+
+void mlx4_start_catas_poll(struct mlx4_dev *dev)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ unsigned long addr;
+
+ INIT_LIST_HEAD(&priv->catas_err.list);
+ init_timer(&priv->catas_err.timer);
+ priv->catas_err.map = NULL;
+
+ addr = pci_resource_start(dev->pdev, priv->fw.catas_bar) +
+ priv->fw.catas_offset;
+
+ priv->catas_err.map = ioremap(addr, priv->fw.catas_size * 4);
+ if (!priv->catas_err.map) {
+ mlx4_warn(dev, "Failed to map internal error buffer at 0x%lx\n",
+ addr);
+ return;
+ }
+
+ priv->catas_err.timer.data = (unsigned long) dev;
+ priv->catas_err.timer.function = poll_catas;
+ priv->catas_err.timer.expires =
+ round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL);
+ add_timer(&priv->catas_err.timer);
+}
+
+void mlx4_stop_catas_poll(struct mlx4_dev *dev)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+
+ del_timer_sync(&priv->catas_err.timer);
+
+ if (priv->catas_err.map)
+ iounmap(priv->catas_err.map);
+
+ spin_lock_irq(&catas_lock);
+ list_del(&priv->catas_err.list);
+ spin_unlock_irq(&catas_lock);
+}
+
+void __init mlx4_catas_init(void)
+{
+ INIT_WORK(&catas_work, catas_reset);
+}
diff --git a/sys/ofed/drivers/net/mlx4/cmd.c b/sys/ofed/drivers/net/mlx4/cmd.c
new file mode 100644
index 0000000..bc4a618
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/cmd.c
@@ -0,0 +1,453 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/sched.h>
+#include <linux/pci.h>
+#include <linux/errno.h>
+
+#include <linux/mlx4/cmd.h>
+
+#include <asm/io.h>
+
+#include "mlx4.h"
+
+#define CMD_POLL_TOKEN 0xffff
+
+enum {
+ /* command completed successfully: */
+ CMD_STAT_OK = 0x00,
+ /* Internal error (such as a bus error) occurred while processing command: */
+ CMD_STAT_INTERNAL_ERR = 0x01,
+ /* Operation/command not supported or opcode modifier not supported: */
+ CMD_STAT_BAD_OP = 0x02,
+ /* Parameter not supported or parameter out of range: */
+ CMD_STAT_BAD_PARAM = 0x03,
+ /* System not enabled or bad system state: */
+ CMD_STAT_BAD_SYS_STATE = 0x04,
+ /* Attempt to access reserved or unallocaterd resource: */
+ CMD_STAT_BAD_RESOURCE = 0x05,
+ /* Requested resource is currently executing a command, or is otherwise busy: */
+ CMD_STAT_RESOURCE_BUSY = 0x06,
+ /* Required capability exceeds device limits: */
+ CMD_STAT_EXCEED_LIM = 0x08,
+ /* Resource is not in the appropriate state or ownership: */
+ CMD_STAT_BAD_RES_STATE = 0x09,
+ /* Index out of range: */
+ CMD_STAT_BAD_INDEX = 0x0a,
+ /* FW image corrupted: */
+ CMD_STAT_BAD_NVMEM = 0x0b,
+ /* Error in ICM mapping (e.g. not enough auxiliary ICM pages to execute command): */
+ CMD_STAT_ICM_ERROR = 0x0c,
+ /* Attempt to modify a QP/EE which is not in the presumed state: */
+ CMD_STAT_BAD_QP_STATE = 0x10,
+ /* Bad segment parameters (Address/Size): */
+ CMD_STAT_BAD_SEG_PARAM = 0x20,
+ /* Memory Region has Memory Windows bound to: */
+ CMD_STAT_REG_BOUND = 0x21,
+ /* HCA local attached memory not present: */
+ CMD_STAT_LAM_NOT_PRE = 0x22,
+ /* Bad management packet (silently discarded): */
+ CMD_STAT_BAD_PKT = 0x30,
+ /* More outstanding CQEs in CQ than new CQ size: */
+ CMD_STAT_BAD_SIZE = 0x40,
+ /* Multi Function device support required: */
+ CMD_STAT_MULTI_FUNC_REQ = 0x50,
+};
+
+enum {
+ HCR_IN_PARAM_OFFSET = 0x00,
+ HCR_IN_MODIFIER_OFFSET = 0x08,
+ HCR_OUT_PARAM_OFFSET = 0x0c,
+ HCR_TOKEN_OFFSET = 0x14,
+ HCR_STATUS_OFFSET = 0x18,
+
+ HCR_OPMOD_SHIFT = 12,
+ HCR_T_BIT = 21,
+ HCR_E_BIT = 22,
+ HCR_GO_BIT = 23
+};
+
+enum {
+ GO_BIT_TIMEOUT_MSECS = 10000
+};
+
+struct mlx4_cmd_context {
+ struct completion done;
+ int result;
+ int next;
+ u64 out_param;
+ u16 token;
+ u8 fw_status;
+};
+
+static int mlx4_status_to_errno(u8 status)
+{
+ static const int trans_table[] = {
+ [CMD_STAT_INTERNAL_ERR] = -EIO,
+ [CMD_STAT_BAD_OP] = -EPERM,
+ [CMD_STAT_BAD_PARAM] = -EINVAL,
+ [CMD_STAT_BAD_SYS_STATE] = -ENXIO,
+ [CMD_STAT_BAD_RESOURCE] = -EBADF,
+ [CMD_STAT_RESOURCE_BUSY] = -EBUSY,
+ [CMD_STAT_EXCEED_LIM] = -ENOMEM,
+ [CMD_STAT_BAD_RES_STATE] = -EBADF,
+ [CMD_STAT_BAD_INDEX] = -EBADF,
+ [CMD_STAT_BAD_NVMEM] = -EFAULT,
+ [CMD_STAT_ICM_ERROR] = -ENFILE,
+ [CMD_STAT_BAD_QP_STATE] = -EINVAL,
+ [CMD_STAT_BAD_SEG_PARAM] = -EFAULT,
+ [CMD_STAT_REG_BOUND] = -EBUSY,
+ [CMD_STAT_LAM_NOT_PRE] = -EAGAIN,
+ [CMD_STAT_BAD_PKT] = -EINVAL,
+ [CMD_STAT_BAD_SIZE] = -ENOMEM,
+ [CMD_STAT_MULTI_FUNC_REQ] = -EACCES,
+ };
+
+ if (status >= ARRAY_SIZE(trans_table) ||
+ (status != CMD_STAT_OK && trans_table[status] == 0))
+ return -EIO;
+
+ return trans_table[status];
+}
+
+static int cmd_pending(struct mlx4_dev *dev)
+{
+ u32 status = readl(mlx4_priv(dev)->cmd.hcr + HCR_STATUS_OFFSET);
+
+ return (status & swab32(1 << HCR_GO_BIT)) ||
+ (mlx4_priv(dev)->cmd.toggle ==
+ !!(status & swab32(1 << HCR_T_BIT)));
+}
+
+static int mlx4_cmd_post(struct mlx4_dev *dev, u64 in_param, u64 out_param,
+ u32 in_modifier, u8 op_modifier, u16 op, u16 token,
+ int event)
+{
+ struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd;
+ u32 __iomem *hcr = cmd->hcr;
+ int ret = -EAGAIN;
+ unsigned long end;
+
+ mutex_lock(&cmd->hcr_mutex);
+
+ end = jiffies;
+ if (event)
+ end += msecs_to_jiffies(GO_BIT_TIMEOUT_MSECS);
+
+ while (cmd_pending(dev)) {
+ if (time_after_eq(jiffies, end))
+ goto out;
+ cond_resched();
+ }
+
+ /*
+ * We use writel (instead of something like memcpy_toio)
+ * because writes of less than 32 bits to the HCR don't work
+ * (and some architectures such as ia64 implement memcpy_toio
+ * in terms of writeb).
+ */
+ __raw_writel((__force u32) cpu_to_be32(in_param >> 32), hcr + 0);
+ __raw_writel((__force u32) cpu_to_be32(in_param & 0xfffffffful), hcr + 1);
+ __raw_writel((__force u32) cpu_to_be32(in_modifier), hcr + 2);
+ __raw_writel((__force u32) cpu_to_be32(out_param >> 32), hcr + 3);
+ __raw_writel((__force u32) cpu_to_be32(out_param & 0xfffffffful), hcr + 4);
+ __raw_writel((__force u32) cpu_to_be32(token << 16), hcr + 5);
+
+ /* __raw_writel may not order writes. */
+ wmb();
+
+ __raw_writel((__force u32) cpu_to_be32((1 << HCR_GO_BIT) |
+ (cmd->toggle << HCR_T_BIT) |
+ (event ? (1 << HCR_E_BIT) : 0) |
+ (op_modifier << HCR_OPMOD_SHIFT) |
+ op), hcr + 6);
+
+ /*
+ * Make sure that our HCR writes don't get mixed in with
+ * writes from another CPU starting a FW command.
+ */
+ mmiowb();
+
+ cmd->toggle = cmd->toggle ^ 1;
+
+ ret = 0;
+
+out:
+ mutex_unlock(&cmd->hcr_mutex);
+ return ret;
+}
+
+static int mlx4_cmd_poll(struct mlx4_dev *dev, u64 in_param, u64 *out_param,
+ int out_is_imm, u32 in_modifier, u8 op_modifier,
+ u16 op, unsigned long timeout)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ void __iomem *hcr = priv->cmd.hcr;
+ int err = 0;
+ unsigned long end;
+ u32 stat;
+
+ down(&priv->cmd.poll_sem);
+
+ err = mlx4_cmd_post(dev, in_param, out_param ? *out_param : 0,
+ in_modifier, op_modifier, op, CMD_POLL_TOKEN, 0);
+ if (err)
+ goto out;
+
+ end = msecs_to_jiffies(timeout) + jiffies;
+ while (cmd_pending(dev) && time_before(jiffies, end))
+ cond_resched();
+
+ if (cmd_pending(dev)) {
+ err = -ETIMEDOUT;
+ goto out;
+ }
+
+ if (out_is_imm)
+ *out_param =
+ (u64) be32_to_cpu((__force __be32)
+ __raw_readl(hcr + HCR_OUT_PARAM_OFFSET)) << 32 |
+ (u64) be32_to_cpu((__force __be32)
+ __raw_readl(hcr + HCR_OUT_PARAM_OFFSET + 4));
+ stat = be32_to_cpu((__force __be32) __raw_readl(hcr + HCR_STATUS_OFFSET)) >> 24;
+ err = mlx4_status_to_errno(stat);
+ if (err) {
+ if (op != MLX4_CMD_SET_NODE || stat != CMD_STAT_BAD_OP)
+ mlx4_err(dev, "command 0x%x failed: fw status = 0x%x\n",
+ op, stat);
+ }
+
+out:
+ up(&priv->cmd.poll_sem);
+ return err;
+}
+
+void mlx4_cmd_event(struct mlx4_dev *dev, u16 token, u8 status, u64 out_param)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ struct mlx4_cmd_context *context =
+ &priv->cmd.context[token & priv->cmd.token_mask];
+
+ /* previously timed out command completing at long last */
+ if (token != context->token)
+ return;
+
+ context->fw_status = status;
+ context->result = mlx4_status_to_errno(status);
+ context->out_param = out_param;
+
+ complete(&context->done);
+}
+
+static int mlx4_cmd_wait(struct mlx4_dev *dev, u64 in_param, u64 *out_param,
+ int out_is_imm, u32 in_modifier, u8 op_modifier,
+ u16 op, unsigned long timeout)
+{
+ struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd;
+ struct mlx4_cmd_context *context;
+ int err = 0;
+
+ down(&cmd->event_sem);
+
+ spin_lock(&cmd->context_lock);
+ BUG_ON(cmd->free_head < 0);
+ context = &cmd->context[cmd->free_head];
+ context->token += cmd->token_mask + 1;
+ cmd->free_head = context->next;
+ spin_unlock(&cmd->context_lock);
+
+ init_completion(&context->done);
+
+ mlx4_cmd_post(dev, in_param, out_param ? *out_param : 0,
+ in_modifier, op_modifier, op, context->token, 1);
+
+ if (!wait_for_completion_timeout(&context->done, msecs_to_jiffies(timeout))) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ err = context->result;
+ if (err) {
+ if (op != MLX4_CMD_SET_NODE || context->fw_status != CMD_STAT_BAD_OP)
+ mlx4_err(dev, "command 0x%x failed: fw status = 0x%x\n",
+ op, context->fw_status);
+ goto out;
+ }
+
+ if (out_is_imm)
+ *out_param = context->out_param;
+
+out:
+ spin_lock(&cmd->context_lock);
+ context->next = cmd->free_head;
+ cmd->free_head = context - cmd->context;
+ spin_unlock(&cmd->context_lock);
+
+ up(&cmd->event_sem);
+ return err;
+}
+
+int __mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param,
+ int out_is_imm, u32 in_modifier, u8 op_modifier,
+ u16 op, unsigned long timeout)
+{
+ if (mlx4_priv(dev)->cmd.use_events && !cold)
+ return mlx4_cmd_wait(dev, in_param, out_param, out_is_imm,
+ in_modifier, op_modifier, op, timeout);
+ else
+ return mlx4_cmd_poll(dev, in_param, out_param, out_is_imm,
+ in_modifier, op_modifier, op, timeout);
+}
+EXPORT_SYMBOL_GPL(__mlx4_cmd);
+
+int mlx4_cmd_init(struct mlx4_dev *dev)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+
+ mutex_init(&priv->cmd.hcr_mutex);
+ sema_init(&priv->cmd.poll_sem, 1);
+ priv->cmd.use_events = 0;
+ priv->cmd.toggle = 1;
+
+ priv->cmd.hcr = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_HCR_BASE,
+ MLX4_HCR_SIZE);
+ if (!priv->cmd.hcr) {
+ mlx4_err(dev, "Couldn't map command register.");
+ return -ENOMEM;
+ }
+
+ priv->cmd.pool = pci_pool_create("mlx4_cmd", dev->pdev,
+ MLX4_MAILBOX_SIZE,
+ MLX4_MAILBOX_SIZE, 0);
+ if (!priv->cmd.pool) {
+ iounmap(priv->cmd.hcr);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+void mlx4_cmd_cleanup(struct mlx4_dev *dev)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+
+ pci_pool_destroy(priv->cmd.pool);
+ iounmap(priv->cmd.hcr);
+}
+
+/*
+ * Switch to using events to issue FW commands (can only be called
+ * after event queue for command events has been initialized).
+ */
+int mlx4_cmd_use_events(struct mlx4_dev *dev)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ int i;
+
+ priv->cmd.context = kmalloc(priv->cmd.max_cmds *
+ sizeof (struct mlx4_cmd_context),
+ GFP_KERNEL);
+ if (!priv->cmd.context)
+ return -ENOMEM;
+
+ for (i = 0; i < priv->cmd.max_cmds; ++i) {
+ priv->cmd.context[i].token = i;
+ priv->cmd.context[i].next = i + 1;
+ }
+
+ priv->cmd.context[priv->cmd.max_cmds - 1].next = -1;
+ priv->cmd.free_head = 0;
+
+ sema_init(&priv->cmd.event_sem, priv->cmd.max_cmds);
+ spin_lock_init(&priv->cmd.context_lock);
+
+ for (priv->cmd.token_mask = 1;
+ priv->cmd.token_mask < priv->cmd.max_cmds;
+ priv->cmd.token_mask <<= 1)
+ ; /* nothing */
+ --priv->cmd.token_mask;
+
+ priv->cmd.use_events = 1;
+
+ down(&priv->cmd.poll_sem);
+
+ return 0;
+}
+
+/*
+ * Switch back to polling (used when shutting down the device)
+ */
+void mlx4_cmd_use_polling(struct mlx4_dev *dev)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ int i;
+
+ priv->cmd.use_events = 0;
+
+ for (i = 0; i < priv->cmd.max_cmds; ++i)
+ down(&priv->cmd.event_sem);
+
+ kfree(priv->cmd.context);
+
+ up(&priv->cmd.poll_sem);
+}
+
+struct mlx4_cmd_mailbox *mlx4_alloc_cmd_mailbox(struct mlx4_dev *dev)
+{
+ struct mlx4_cmd_mailbox *mailbox;
+
+ mailbox = kmalloc(sizeof *mailbox, GFP_KERNEL);
+ if (!mailbox)
+ return ERR_PTR(-ENOMEM);
+
+ mailbox->buf = pci_pool_alloc(mlx4_priv(dev)->cmd.pool, GFP_KERNEL,
+ &mailbox->dma);
+ if (!mailbox->buf) {
+ kfree(mailbox);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return mailbox;
+}
+EXPORT_SYMBOL_GPL(mlx4_alloc_cmd_mailbox);
+
+void mlx4_free_cmd_mailbox(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox)
+{
+ if (!mailbox)
+ return;
+
+ pci_pool_free(mlx4_priv(dev)->cmd.pool, mailbox->buf, mailbox->dma);
+ kfree(mailbox);
+}
+EXPORT_SYMBOL_GPL(mlx4_free_cmd_mailbox);
diff --git a/sys/ofed/drivers/net/mlx4/cq.c b/sys/ofed/drivers/net/mlx4/cq.c
new file mode 100644
index 0000000..076c602
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/cq.c
@@ -0,0 +1,338 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+#include <linux/hardirq.h>
+
+#include <linux/mlx4/cmd.h>
+#include <linux/mlx4/cq.h>
+
+#include "mlx4.h"
+#include "icm.h"
+
+struct mlx4_cq_context {
+ __be32 flags;
+ u16 reserved1[3];
+ __be16 page_offset;
+ __be32 logsize_usrpage;
+ __be16 cq_period;
+ __be16 cq_max_count;
+ u8 reserved2[3];
+ u8 comp_eqn;
+ u8 log_page_size;
+ u8 reserved3[2];
+ u8 mtt_base_addr_h;
+ __be32 mtt_base_addr_l;
+ __be32 last_notified_index;
+ __be32 solicit_producer_index;
+ __be32 consumer_index;
+ __be32 producer_index;
+ u32 reserved4[2];
+ __be64 db_rec_addr;
+};
+
+#define MLX4_CQ_STATUS_OK ( 0 << 28)
+#define MLX4_CQ_STATUS_OVERFLOW ( 9 << 28)
+#define MLX4_CQ_STATUS_WRITE_FAIL (10 << 28)
+#define MLX4_CQ_FLAG_CC ( 1 << 18)
+#define MLX4_CQ_FLAG_OI ( 1 << 17)
+#define MLX4_CQ_STATE_ARMED ( 9 << 8)
+#define MLX4_CQ_STATE_ARMED_SOL ( 6 << 8)
+#define MLX4_EQ_STATE_FIRED (10 << 8)
+
+void mlx4_cq_completion(struct mlx4_dev *dev, u32 cqn)
+{
+ struct mlx4_cq *cq;
+
+ cq = radix_tree_lookup(&mlx4_priv(dev)->cq_table.tree,
+ cqn & (dev->caps.num_cqs - 1));
+ if (!cq) {
+ mlx4_dbg(dev, "Completion event for bogus CQ %08x\n", cqn);
+ return;
+ }
+
+ ++cq->arm_sn;
+
+ cq->comp(cq);
+}
+
+void mlx4_cq_event(struct mlx4_dev *dev, u32 cqn, int event_type)
+{
+ struct mlx4_cq_table *cq_table = &mlx4_priv(dev)->cq_table;
+ struct mlx4_cq *cq;
+
+ spin_lock(&cq_table->lock);
+
+ cq = radix_tree_lookup(&cq_table->tree, cqn & (dev->caps.num_cqs - 1));
+ if (cq)
+ atomic_inc(&cq->refcount);
+
+ spin_unlock(&cq_table->lock);
+
+ if (!cq) {
+ mlx4_warn(dev, "Async event for bogus CQ %08x\n", cqn);
+ return;
+ }
+
+ cq->event(cq, event_type);
+
+ if (atomic_dec_and_test(&cq->refcount))
+ complete(&cq->free);
+}
+
+static int mlx4_SW2HW_CQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+ int cq_num)
+{
+ return mlx4_cmd(dev, mailbox->dma, cq_num, 0, MLX4_CMD_SW2HW_CQ,
+ MLX4_CMD_TIME_CLASS_A);
+}
+
+static int mlx4_MODIFY_CQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+ int cq_num, u32 opmod)
+{
+ return mlx4_cmd(dev, mailbox->dma, cq_num, opmod, MLX4_CMD_MODIFY_CQ,
+ MLX4_CMD_TIME_CLASS_A);
+}
+
+static int mlx4_HW2SW_CQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+ int cq_num)
+{
+ return mlx4_cmd_box(dev, 0, mailbox ? mailbox->dma : 0, cq_num,
+ mailbox ? 0 : 1, MLX4_CMD_HW2SW_CQ,
+ MLX4_CMD_TIME_CLASS_A);
+}
+
+int mlx4_cq_modify(struct mlx4_dev *dev, struct mlx4_cq *cq,
+ u16 count, u16 period)
+{
+ struct mlx4_cmd_mailbox *mailbox;
+ struct mlx4_cq_context *cq_context;
+ int err;
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+
+ cq_context = mailbox->buf;
+ memset(cq_context, 0, sizeof *cq_context);
+
+ cq_context->cq_max_count = cpu_to_be16(count);
+ cq_context->cq_period = cpu_to_be16(period);
+
+ err = mlx4_MODIFY_CQ(dev, mailbox, cq->cqn, 1);
+
+ mlx4_free_cmd_mailbox(dev, mailbox);
+ return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_cq_modify);
+
+int mlx4_cq_resize(struct mlx4_dev *dev, struct mlx4_cq *cq,
+ int entries, struct mlx4_mtt *mtt)
+{
+ struct mlx4_cmd_mailbox *mailbox;
+ struct mlx4_cq_context *cq_context;
+ u64 mtt_addr;
+ int err;
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+
+ cq_context = mailbox->buf;
+ memset(cq_context, 0, sizeof *cq_context);
+
+ cq_context->logsize_usrpage = cpu_to_be32(ilog2(entries) << 24);
+ cq_context->log_page_size = mtt->page_shift - 12;
+ mtt_addr = mlx4_mtt_addr(dev, mtt);
+ cq_context->mtt_base_addr_h = mtt_addr >> 32;
+ cq_context->mtt_base_addr_l = cpu_to_be32(mtt_addr & 0xffffffff);
+
+ err = mlx4_MODIFY_CQ(dev, mailbox, cq->cqn, 0);
+
+ mlx4_free_cmd_mailbox(dev, mailbox);
+ return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_cq_resize);
+
+static int mlx4_find_least_loaded_vector(struct mlx4_priv *priv)
+{
+ int i;
+ int index = 0;
+ int min = priv->eq_table.eq[0].load;
+
+ for (i = 1; i < priv->dev.caps.num_comp_vectors; i++) {
+ if (priv->eq_table.eq[i].load < min) {
+ index = i;
+ min = priv->eq_table.eq[i].load;
+ }
+ }
+
+ return index;
+}
+
+int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt,
+ struct mlx4_uar *uar, u64 db_rec, struct mlx4_cq *cq,
+ unsigned vector, int collapsed)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ struct mlx4_cq_table *cq_table = &priv->cq_table;
+ struct mlx4_cmd_mailbox *mailbox;
+ struct mlx4_cq_context *cq_context;
+ u64 mtt_addr;
+ int err;
+
+ cq->vector = (vector == MLX4_LEAST_ATTACHED_VECTOR) ?
+ mlx4_find_least_loaded_vector(priv) : vector;
+
+ if (cq->vector >= dev->caps.num_comp_vectors)
+ return -EINVAL;
+
+ cq->cqn = mlx4_bitmap_alloc(&cq_table->bitmap);
+ if (cq->cqn == -1)
+ return -ENOMEM;
+
+ err = mlx4_table_get(dev, &cq_table->table, cq->cqn);
+ if (err)
+ goto err_out;
+
+ err = mlx4_table_get(dev, &cq_table->cmpt_table, cq->cqn);
+ if (err)
+ goto err_put;
+
+ spin_lock_irq(&cq_table->lock);
+ err = radix_tree_insert(&cq_table->tree, cq->cqn, cq);
+ spin_unlock_irq(&cq_table->lock);
+ if (err)
+ goto err_cmpt_put;
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev);
+ if (IS_ERR(mailbox)) {
+ err = PTR_ERR(mailbox);
+ goto err_radix;
+ }
+
+ cq_context = mailbox->buf;
+ memset(cq_context, 0, sizeof *cq_context);
+
+ cq_context->flags = cpu_to_be32(!!collapsed << 18);
+ cq_context->logsize_usrpage = cpu_to_be32((ilog2(nent) << 24) | uar->index);
+ cq_context->comp_eqn = priv->eq_table.eq[cq->vector].eqn;
+ cq_context->log_page_size = mtt->page_shift - MLX4_ICM_PAGE_SHIFT;
+
+ mtt_addr = mlx4_mtt_addr(dev, mtt);
+ cq_context->mtt_base_addr_h = mtt_addr >> 32;
+ cq_context->mtt_base_addr_l = cpu_to_be32(mtt_addr & 0xffffffff);
+ cq_context->db_rec_addr = cpu_to_be64(db_rec);
+
+ err = mlx4_SW2HW_CQ(dev, mailbox, cq->cqn);
+ mlx4_free_cmd_mailbox(dev, mailbox);
+ if (err)
+ goto err_radix;
+
+ priv->eq_table.eq[cq->vector].load++;
+ cq->cons_index = 0;
+ cq->arm_sn = 1;
+ cq->uar = uar;
+ atomic_set(&cq->refcount, 1);
+ init_completion(&cq->free);
+
+ return 0;
+
+err_radix:
+ spin_lock_irq(&cq_table->lock);
+ radix_tree_delete(&cq_table->tree, cq->cqn);
+ spin_unlock_irq(&cq_table->lock);
+
+err_cmpt_put:
+ mlx4_table_put(dev, &cq_table->cmpt_table, cq->cqn);
+
+err_put:
+ mlx4_table_put(dev, &cq_table->table, cq->cqn);
+
+err_out:
+ mlx4_bitmap_free(&cq_table->bitmap, cq->cqn);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_cq_alloc);
+
+void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ struct mlx4_cq_table *cq_table = &priv->cq_table;
+ int err;
+
+ err = mlx4_HW2SW_CQ(dev, NULL, cq->cqn);
+ if (err)
+ mlx4_warn(dev, "HW2SW_CQ failed (%d) for CQN %06x\n", err, cq->cqn);
+
+ synchronize_irq(priv->eq_table.eq[cq->vector].irq);
+ priv->eq_table.eq[cq->vector].load--;
+
+ spin_lock_irq(&cq_table->lock);
+ radix_tree_delete(&cq_table->tree, cq->cqn);
+ spin_unlock_irq(&cq_table->lock);
+
+ if (atomic_dec_and_test(&cq->refcount))
+ complete(&cq->free);
+ wait_for_completion(&cq->free);
+
+ mlx4_table_put(dev, &cq_table->table, cq->cqn);
+ mlx4_bitmap_free(&cq_table->bitmap, cq->cqn);
+}
+EXPORT_SYMBOL_GPL(mlx4_cq_free);
+
+int mlx4_init_cq_table(struct mlx4_dev *dev)
+{
+ struct mlx4_cq_table *cq_table = &mlx4_priv(dev)->cq_table;
+ int err;
+
+ spin_lock_init(&cq_table->lock);
+ INIT_RADIX_TREE(&cq_table->tree, GFP_ATOMIC);
+
+ err = mlx4_bitmap_init(&cq_table->bitmap, dev->caps.num_cqs,
+ dev->caps.num_cqs - 1, dev->caps.reserved_cqs, 0);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+void mlx4_cleanup_cq_table(struct mlx4_dev *dev)
+{
+ /* Nothing to do to clean up radix_tree */
+ mlx4_bitmap_cleanup(&mlx4_priv(dev)->cq_table.bitmap);
+}
diff --git a/sys/ofed/drivers/net/mlx4/en_cq.c b/sys/ofed/drivers/net/mlx4/en_cq.c
new file mode 100644
index 0000000..9f475ff
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/en_cq.c
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include "mlx4_en.h"
+
+#include <linux/mlx4/cq.h>
+#include <linux/mlx4/qp.h>
+#include <linux/mlx4/cmd.h>
+
+static void mlx4_en_cq_event(struct mlx4_cq *cq, enum mlx4_event event)
+{
+ return;
+}
+
+
+int mlx4_en_create_cq(struct mlx4_en_priv *priv,
+ struct mlx4_en_cq *cq,
+ int entries, int ring, enum cq_type mode)
+{
+ struct mlx4_en_dev *mdev = priv->mdev;
+ int err;
+
+ cq->size = entries;
+ if (mode == RX) {
+ cq->buf_size = cq->size * sizeof(struct mlx4_cqe);
+ cq->vector = (ring + priv->port) %
+ mdev->dev->caps.num_comp_vectors;
+ TASK_INIT(&cq->cq_task, 0, mlx4_en_rx_que, cq);
+ } else {
+ cq->buf_size = sizeof(struct mlx4_cqe);
+ cq->vector = MLX4_LEAST_ATTACHED_VECTOR;
+ TASK_INIT(&cq->cq_task, 0, mlx4_en_tx_que, cq);
+ }
+
+ cq->tq = taskqueue_create_fast("mlx4_en_que", M_NOWAIT,
+ taskqueue_thread_enqueue, &cq->tq);
+ taskqueue_start_threads(&cq->tq, 1, PI_NET, "%s cq",
+ if_name(priv->dev));
+ cq->ring = ring;
+ cq->is_tx = mode;
+ mtx_init(&cq->lock.m, "mlx4 cq", NULL, MTX_DEF);
+
+ err = mlx4_alloc_hwq_res(mdev->dev, &cq->wqres,
+ cq->buf_size, 2 * PAGE_SIZE);
+ if (err)
+ return err;
+
+ err = mlx4_en_map_buffer(&cq->wqres.buf);
+ if (err)
+ mlx4_free_hwq_res(mdev->dev, &cq->wqres, cq->buf_size);
+ else
+ cq->buf = (struct mlx4_cqe *) cq->wqres.buf.direct.buf;
+
+ return err;
+}
+
+int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq)
+{
+ struct mlx4_en_dev *mdev = priv->mdev;
+ int err;
+
+ cq->dev = mdev->pndev[priv->port];
+ cq->mcq.set_ci_db = cq->wqres.db.db;
+ cq->mcq.arm_db = cq->wqres.db.db + 1;
+ *cq->mcq.set_ci_db = 0;
+ *cq->mcq.arm_db = 0;
+ memset(cq->buf, 0, cq->buf_size);
+
+ if (!cq->is_tx)
+ cq->size = priv->rx_ring[cq->ring].actual_size;
+
+ err = mlx4_cq_alloc(mdev->dev, cq->size, &cq->wqres.mtt, &mdev->priv_uar,
+ cq->wqres.db.dma, &cq->mcq, cq->vector, cq->is_tx);
+ if (err)
+ return err;
+
+ cq->mcq.comp = cq->is_tx ? mlx4_en_tx_irq : mlx4_en_rx_irq;
+ cq->mcq.event = mlx4_en_cq_event;
+
+ if (cq->is_tx) {
+ init_timer(&cq->timer);
+ cq->timer.function = mlx4_en_poll_tx_cq;
+ cq->timer.data = (unsigned long) cq;
+ }
+
+ return 0;
+}
+
+void mlx4_en_destroy_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq)
+{
+ struct mlx4_en_dev *mdev = priv->mdev;
+
+ taskqueue_drain(cq->tq, &cq->cq_task);
+ taskqueue_free(cq->tq);
+ mlx4_en_unmap_buffer(&cq->wqres.buf);
+ mlx4_free_hwq_res(mdev->dev, &cq->wqres, cq->buf_size);
+ cq->buf_size = 0;
+ cq->buf = NULL;
+ mtx_destroy(&cq->lock.m);
+}
+
+void mlx4_en_deactivate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq)
+{
+ struct mlx4_en_dev *mdev = priv->mdev;
+
+ taskqueue_drain(cq->tq, &cq->cq_task);
+ if (cq->is_tx)
+ del_timer(&cq->timer);
+
+ mlx4_cq_free(mdev->dev, &cq->mcq);
+}
+
+/* Set rx cq moderation parameters */
+int mlx4_en_set_cq_moder(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq)
+{
+ return mlx4_cq_modify(priv->mdev->dev, &cq->mcq,
+ cq->moder_cnt, cq->moder_time);
+}
+
+int mlx4_en_arm_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq)
+{
+ mlx4_cq_arm(&cq->mcq, MLX4_CQ_DB_REQ_NOT, priv->mdev->uar_map,
+ &priv->mdev->uar_lock);
+
+ return 0;
+}
+
+
diff --git a/sys/ofed/drivers/net/mlx4/en_ethtool.c b/sys/ofed/drivers/net/mlx4/en_ethtool.c
new file mode 100644
index 0000000..9587fb3
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/en_ethtool.c
@@ -0,0 +1,512 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/ethtool.h>
+#include <linux/netdevice.h>
+#include <linux/if_vlan.h>
+
+#include "mlx4_en.h"
+#include "en_port.h"
+
+
+static void mlx4_en_update_lro_stats(struct mlx4_en_priv *priv)
+{
+ int i;
+
+ priv->port_stats.lro_aggregated = 0;
+ priv->port_stats.lro_flushed = 0;
+ priv->port_stats.lro_no_desc = 0;
+
+ for (i = 0; i < priv->rx_ring_num; i++) {
+ priv->port_stats.lro_aggregated += priv->rx_ring[i].lro.stats.aggregated;
+ priv->port_stats.lro_flushed += priv->rx_ring[i].lro.stats.flushed;
+ priv->port_stats.lro_no_desc += priv->rx_ring[i].lro.stats.no_desc;
+ }
+}
+
+static void
+mlx4_en_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo)
+{
+ struct mlx4_en_priv *priv = netdev_priv(dev);
+ struct mlx4_en_dev *mdev = priv->mdev;
+
+ sprintf(drvinfo->driver, DRV_NAME " (%s)", mdev->dev->board_id);
+ strncpy(drvinfo->version, DRV_VERSION " (" DRV_RELDATE ")", 32);
+ sprintf(drvinfo->fw_version, "%d.%d.%d",
+ (u16) (mdev->dev->caps.fw_ver >> 32),
+ (u16) ((mdev->dev->caps.fw_ver >> 16) & 0xffff),
+ (u16) (mdev->dev->caps.fw_ver & 0xffff));
+ strncpy(drvinfo->bus_info, pci_name(mdev->dev->pdev), 32);
+ drvinfo->n_stats = 0;
+ drvinfo->regdump_len = 0;
+ drvinfo->eedump_len = 0;
+}
+
+static u32 mlx4_en_get_tso(struct net_device *dev)
+{
+ return (dev->features & NETIF_F_TSO) != 0;
+}
+
+static int mlx4_en_set_tso(struct net_device *dev, u32 data)
+{
+ struct mlx4_en_priv *priv = netdev_priv(dev);
+
+ if (data) {
+ if (!priv->mdev->LSO_support)
+ return -EPERM;
+ dev->features |= (NETIF_F_TSO | NETIF_F_TSO6);
+#ifdef HAVE_NETDEV_VLAN_FEATURES
+ dev->vlan_features |= (NETIF_F_TSO | NETIF_F_TSO6);
+#else
+ if (priv->vlgrp) {
+ int i;
+ struct net_device *vdev;
+ for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) {
+ vdev = vlan_group_get_device(priv->vlgrp, i);
+ if (vdev) {
+ vdev->features |= (NETIF_F_TSO | NETIF_F_TSO6);
+ vlan_group_set_device(priv->vlgrp, i, vdev);
+ }
+ }
+ }
+#endif
+ } else {
+ dev->features &= ~(NETIF_F_TSO | NETIF_F_TSO6);
+#ifdef HAVE_NETDEV_VLAN_FEATURES
+ dev->vlan_features &= ~(NETIF_F_TSO | NETIF_F_TSO6);
+#else
+ if (priv->vlgrp) {
+ int i;
+ struct net_device *vdev;
+ for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) {
+ vdev = vlan_group_get_device(priv->vlgrp, i);
+ if (vdev) {
+ vdev->features &= ~(NETIF_F_TSO | NETIF_F_TSO6);
+ vlan_group_set_device(priv->vlgrp, i, vdev);
+ }
+ }
+ }
+#endif
+ }
+ return 0;
+}
+
+static u32 mlx4_en_get_rx_csum(struct net_device *dev)
+{
+ struct mlx4_en_priv *priv = netdev_priv(dev);
+ return priv->rx_csum;
+}
+
+static int mlx4_en_set_rx_csum(struct net_device *dev, u32 data)
+{
+ struct mlx4_en_priv *priv = netdev_priv(dev);
+ priv->rx_csum = (data != 0);
+ return 0;
+}
+
+static const char main_strings[][ETH_GSTRING_LEN] = {
+ "rx_packets", "tx_packets", "rx_bytes", "tx_bytes", "rx_errors",
+ "tx_errors", "rx_dropped", "tx_dropped", "multicast", "collisions",
+ "rx_length_errors", "rx_over_errors", "rx_crc_errors",
+ "rx_frame_errors", "rx_fifo_errors", "rx_missed_errors",
+ "tx_aborted_errors", "tx_carrier_errors", "tx_fifo_errors",
+ "tx_heartbeat_errors", "tx_window_errors",
+
+ /* port statistics */
+ "lro_aggregated", "lro_flushed", "lro_no_desc", "tso_packets",
+ "queue_stopped", "wake_queue", "tx_timeout", "rx_alloc_failed",
+ "rx_csum_good", "rx_csum_none", "tx_chksum_offload",
+
+ /* packet statistics */
+ "broadcast", "rx_prio_0", "rx_prio_1", "rx_prio_2", "rx_prio_3",
+ "rx_prio_4", "rx_prio_5", "rx_prio_6", "rx_prio_7", "tx_prio_0",
+ "tx_prio_1", "tx_prio_2", "tx_prio_3", "tx_prio_4", "tx_prio_5",
+ "tx_prio_6", "tx_prio_7",
+};
+#define NUM_MAIN_STATS 21
+#define NUM_ALL_STATS (NUM_MAIN_STATS + NUM_PORT_STATS + NUM_PKT_STATS + NUM_PERF_STATS)
+
+static const char mlx4_en_test_names[][ETH_GSTRING_LEN]= {
+ "Interupt Test",
+ "Link Test",
+ "Speed Test",
+ "Register Test",
+ "Loopback Test",
+};
+
+static u32 mlx4_en_get_msglevel(struct net_device *dev)
+{
+ return ((struct mlx4_en_priv *) netdev_priv(dev))->msg_enable;
+}
+
+static void mlx4_en_set_msglevel(struct net_device *dev, u32 val)
+{
+ ((struct mlx4_en_priv *) netdev_priv(dev))->msg_enable = val;
+}
+
+static void mlx4_en_get_wol(struct net_device *netdev,
+ struct ethtool_wolinfo *wol)
+{
+ wol->supported = 0;
+ wol->wolopts = 0;
+
+ return;
+}
+
+static int mlx4_en_get_sset_count(struct net_device *dev, int sset)
+{
+ struct mlx4_en_priv *priv = netdev_priv(dev);
+
+ switch (sset) {
+ case ETH_SS_STATS:
+ return NUM_ALL_STATS +
+ (priv->tx_ring_num + priv->rx_ring_num) * 2;
+ case ETH_SS_TEST:
+ return MLX4_EN_NUM_SELF_TEST - !(priv->mdev->dev->caps.loopback_support) * 2;
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+static void mlx4_en_get_ethtool_stats(struct net_device *dev,
+ struct ethtool_stats *stats, uint64_t *data)
+{
+ struct mlx4_en_priv *priv = netdev_priv(dev);
+ int index = 0;
+ int i;
+
+ spin_lock_bh(&priv->stats_lock);
+
+ mlx4_en_update_lro_stats(priv);
+
+ for (i = 0; i < NUM_MAIN_STATS; i++)
+ data[index++] = ((unsigned long *) &priv->stats)[i];
+ for (i = 0; i < NUM_PORT_STATS; i++)
+ data[index++] = ((unsigned long *) &priv->port_stats)[i];
+ for (i = 0; i < priv->tx_ring_num; i++) {
+ data[index++] = priv->tx_ring[i].packets;
+ data[index++] = priv->tx_ring[i].bytes;
+ }
+ for (i = 0; i < priv->rx_ring_num; i++) {
+ data[index++] = priv->rx_ring[i].packets;
+ data[index++] = priv->rx_ring[i].bytes;
+ }
+ for (i = 0; i < NUM_PKT_STATS; i++)
+ data[index++] = ((unsigned long *) &priv->pkstats)[i];
+ spin_unlock_bh(&priv->stats_lock);
+
+}
+
+static void mlx4_en_self_test(struct net_device *dev,
+ struct ethtool_test *etest, u64 *buf)
+{
+ mlx4_en_ex_selftest(dev, &etest->flags, buf);
+}
+
+static void mlx4_en_get_strings(struct net_device *dev,
+ uint32_t stringset, uint8_t *data)
+{
+ struct mlx4_en_priv *priv = netdev_priv(dev);
+ int index = 0;
+ int i;
+
+ switch (stringset) {
+ case ETH_SS_TEST:
+ for (i = 0; i < MLX4_EN_NUM_SELF_TEST - 2; i++)
+ strcpy(data + i * ETH_GSTRING_LEN, mlx4_en_test_names[i]);
+ if (priv->mdev->dev->caps.loopback_support)
+ for (; i < MLX4_EN_NUM_SELF_TEST; i++)
+ strcpy(data + i * ETH_GSTRING_LEN, mlx4_en_test_names[i]);
+ break;
+
+ case ETH_SS_STATS:
+ /* Add main counters */
+ for (i = 0; i < NUM_MAIN_STATS; i++)
+ strcpy(data + (index++) * ETH_GSTRING_LEN, main_strings[i]);
+ for (i = 0; i< NUM_PORT_STATS; i++)
+ strcpy(data + (index++) * ETH_GSTRING_LEN,
+ main_strings[i + NUM_MAIN_STATS]);
+ for (i = 0; i < priv->tx_ring_num; i++) {
+ sprintf(data + (index++) * ETH_GSTRING_LEN,
+ "tx%d_packets", i);
+ sprintf(data + (index++) * ETH_GSTRING_LEN,
+ "tx%d_bytes", i);
+ }
+ for (i = 0; i < priv->rx_ring_num; i++) {
+ sprintf(data + (index++) * ETH_GSTRING_LEN,
+ "rx%d_packets", i);
+ sprintf(data + (index++) * ETH_GSTRING_LEN,
+ "rx%d_bytes", i);
+ }
+ for (i = 0; i< NUM_PKT_STATS; i++)
+ strcpy(data + (index++) * ETH_GSTRING_LEN,
+ main_strings[i + NUM_MAIN_STATS + NUM_PORT_STATS]);
+ break;
+ }
+}
+
+static int mlx4_en_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+ struct mlx4_en_priv *priv = netdev_priv(dev);
+ int trans_type;
+
+ cmd->autoneg = AUTONEG_DISABLE;
+ cmd->supported = SUPPORTED_10000baseT_Full;
+ cmd->advertising = ADVERTISED_10000baseT_Full;
+
+ if (mlx4_en_QUERY_PORT(priv->mdev, priv->port))
+ return -ENOMEM;
+
+ trans_type = priv->port_state.transciver;
+ if (netif_carrier_ok(dev)) {
+ cmd->speed = priv->port_state.link_speed;
+ cmd->duplex = DUPLEX_FULL;
+ } else {
+ cmd->speed = -1;
+ cmd->duplex = -1;
+ }
+
+ if (trans_type > 0 && trans_type <= 0xC) {
+ cmd->port = PORT_FIBRE;
+ cmd->transceiver = XCVR_EXTERNAL;
+ cmd->supported |= SUPPORTED_FIBRE;
+ cmd->advertising |= ADVERTISED_FIBRE;
+ } else if (trans_type == 0x80 || trans_type == 0) {
+ cmd->port = PORT_TP;
+ cmd->transceiver = XCVR_INTERNAL;
+ cmd->supported |= SUPPORTED_TP;
+ cmd->advertising |= ADVERTISED_TP;
+ } else {
+ cmd->port = -1;
+ cmd->transceiver = -1;
+ }
+ return 0;
+}
+
+static int mlx4_en_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+ if ((cmd->autoneg == AUTONEG_ENABLE) ||
+ (cmd->speed != SPEED_10000) || (cmd->duplex != DUPLEX_FULL))
+ return -EINVAL;
+
+ /* Nothing to change */
+ return 0;
+}
+
+static int mlx4_en_get_coalesce(struct net_device *dev,
+ struct ethtool_coalesce *coal)
+{
+ struct mlx4_en_priv *priv = netdev_priv(dev);
+
+ coal->tx_coalesce_usecs = 0;
+ coal->tx_max_coalesced_frames = 0;
+ coal->rx_coalesce_usecs = priv->rx_usecs;
+ coal->rx_max_coalesced_frames = priv->rx_frames;
+
+ coal->pkt_rate_low = priv->pkt_rate_low;
+ coal->rx_coalesce_usecs_low = priv->rx_usecs_low;
+ coal->pkt_rate_high = priv->pkt_rate_high;
+ coal->rx_coalesce_usecs_high = priv->rx_usecs_high;
+ coal->rate_sample_interval = priv->sample_interval;
+ coal->use_adaptive_rx_coalesce = priv->adaptive_rx_coal;
+ return 0;
+}
+
+static int mlx4_en_set_coalesce(struct net_device *dev,
+ struct ethtool_coalesce *coal)
+{
+ struct mlx4_en_priv *priv = netdev_priv(dev);
+ int err, i;
+
+ priv->rx_frames = (coal->rx_max_coalesced_frames ==
+ MLX4_EN_AUTO_CONF) ?
+ MLX4_EN_RX_COAL_TARGET /
+ priv->dev->mtu + 1 :
+ coal->rx_max_coalesced_frames;
+ priv->rx_usecs = (coal->rx_coalesce_usecs ==
+ MLX4_EN_AUTO_CONF) ?
+ MLX4_EN_RX_COAL_TIME :
+ coal->rx_coalesce_usecs;
+
+ /* Set adaptive coalescing params */
+ priv->pkt_rate_low = coal->pkt_rate_low;
+ priv->rx_usecs_low = coal->rx_coalesce_usecs_low;
+ priv->pkt_rate_high = coal->pkt_rate_high;
+ priv->rx_usecs_high = coal->rx_coalesce_usecs_high;
+ priv->sample_interval = coal->rate_sample_interval;
+ priv->adaptive_rx_coal = coal->use_adaptive_rx_coalesce;
+ priv->last_moder_time = MLX4_EN_AUTO_CONF;
+ if (priv->adaptive_rx_coal)
+ return 0;
+
+ for (i = 0; i < priv->rx_ring_num; i++) {
+ priv->rx_cq[i].moder_cnt = priv->rx_frames;
+ priv->rx_cq[i].moder_time = priv->rx_usecs;
+ err = mlx4_en_set_cq_moder(priv, &priv->rx_cq[i]);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+static int mlx4_en_set_pauseparam(struct net_device *dev,
+ struct ethtool_pauseparam *pause)
+{
+ struct mlx4_en_priv *priv = netdev_priv(dev);
+ struct mlx4_en_dev *mdev = priv->mdev;
+ int err;
+
+ priv->prof->tx_pause = pause->tx_pause != 0;
+ priv->prof->rx_pause = pause->rx_pause != 0;
+ err = mlx4_SET_PORT_general(mdev->dev, priv->port,
+ priv->rx_mb_size + ETH_FCS_LEN,
+ priv->prof->tx_pause,
+ priv->prof->tx_ppp,
+ priv->prof->rx_pause,
+ priv->prof->rx_ppp);
+ if (err)
+ en_err(priv, "Failed setting pause params\n");
+
+ return err;
+}
+
+static void mlx4_en_get_pauseparam(struct net_device *dev,
+ struct ethtool_pauseparam *pause)
+{
+ struct mlx4_en_priv *priv = netdev_priv(dev);
+
+ pause->tx_pause = priv->prof->tx_pause;
+ pause->rx_pause = priv->prof->rx_pause;
+}
+
+static int mlx4_en_set_ringparam(struct net_device *dev,
+ struct ethtool_ringparam *param)
+{
+ struct mlx4_en_priv *priv = netdev_priv(dev);
+ struct mlx4_en_dev *mdev = priv->mdev;
+ u32 rx_size, tx_size;
+ int port_up = 0;
+ int err = 0;
+
+ if (param->rx_jumbo_pending || param->rx_mini_pending)
+ return -EINVAL;
+
+ rx_size = roundup_pow_of_two(param->rx_pending);
+ rx_size = max_t(u32, rx_size, MLX4_EN_MIN_RX_SIZE);
+ rx_size = min_t(u32, rx_size, MLX4_EN_MAX_RX_SIZE);
+ tx_size = roundup_pow_of_two(param->tx_pending);
+ tx_size = max_t(u32, tx_size, MLX4_EN_MIN_TX_SIZE);
+ tx_size = min_t(u32, tx_size, MLX4_EN_MAX_TX_SIZE);
+
+ if (rx_size == (priv->port_up ? priv->rx_ring[0].actual_size :
+ priv->rx_ring[0].size) &&
+ tx_size == priv->tx_ring[0].size)
+ return 0;
+
+ mutex_lock(&mdev->state_lock);
+ if (priv->port_up) {
+ port_up = 1;
+ mlx4_en_stop_port(dev);
+ }
+
+ mlx4_en_free_resources(priv);
+
+ priv->prof->tx_ring_size = tx_size;
+ priv->prof->rx_ring_size = rx_size;
+
+ err = mlx4_en_alloc_resources(priv);
+ if (err) {
+ en_err(priv, "Failed reallocating port resources\n");
+ goto out;
+ }
+ if (port_up) {
+ err = mlx4_en_start_port(dev);
+ if (err)
+ en_err(priv, "Failed starting port\n");
+ }
+
+out:
+ mutex_unlock(&mdev->state_lock);
+ return err;
+}
+
+static void mlx4_en_get_ringparam(struct net_device *dev,
+ struct ethtool_ringparam *param)
+{
+ struct mlx4_en_priv *priv = netdev_priv(dev);
+
+ memset(param, 0, sizeof(*param));
+ param->rx_max_pending = MLX4_EN_MAX_RX_SIZE;
+ param->tx_max_pending = MLX4_EN_MAX_TX_SIZE;
+ param->rx_pending = priv->port_up ?
+ priv->rx_ring[0].actual_size : priv->rx_ring[0].size;
+ param->tx_pending = priv->tx_ring[0].size;
+}
+
+const struct ethtool_ops mlx4_en_ethtool_ops = {
+ .get_drvinfo = mlx4_en_get_drvinfo,
+ .get_settings = mlx4_en_get_settings,
+ .set_settings = mlx4_en_set_settings,
+#ifdef NETIF_F_TSO
+ .get_tso = mlx4_en_get_tso,
+ .set_tso = mlx4_en_set_tso,
+#endif
+ .get_sg = ethtool_op_get_sg,
+ .set_sg = ethtool_op_set_sg,
+ .get_link = ethtool_op_get_link,
+ .get_rx_csum = mlx4_en_get_rx_csum,
+ .set_rx_csum = mlx4_en_set_rx_csum,
+ .get_tx_csum = ethtool_op_get_tx_csum,
+ .set_tx_csum = ethtool_op_set_tx_ipv6_csum,
+ .get_strings = mlx4_en_get_strings,
+ .get_sset_count = mlx4_en_get_sset_count,
+ .get_ethtool_stats = mlx4_en_get_ethtool_stats,
+ .self_test = mlx4_en_self_test,
+ .get_wol = mlx4_en_get_wol,
+ .get_msglevel = mlx4_en_get_msglevel,
+ .set_msglevel = mlx4_en_set_msglevel,
+ .get_coalesce = mlx4_en_get_coalesce,
+ .set_coalesce = mlx4_en_set_coalesce,
+ .get_pauseparam = mlx4_en_get_pauseparam,
+ .set_pauseparam = mlx4_en_set_pauseparam,
+ .get_ringparam = mlx4_en_get_ringparam,
+ .set_ringparam = mlx4_en_set_ringparam,
+ .get_flags = ethtool_op_get_flags,
+ .set_flags = ethtool_op_set_flags,
+};
+
+
+
+
+
diff --git a/sys/ofed/drivers/net/mlx4/en_frag.c b/sys/ofed/drivers/net/mlx4/en_frag.c
new file mode 100644
index 0000000..6c6bac4
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/en_frag.c
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include "mlx4_en.h"
+
+#include <net/ethernet.h>
+#include <netinet/ip.h>
+#include <machine/in_cksum.h>
+
+static struct mlx4_en_ipfrag *find_session(struct mlx4_en_rx_ring *ring,
+ struct ip *iph)
+{
+ struct mlx4_en_ipfrag *session;
+ int i;
+
+ for (i = 0; i < MLX4_EN_NUM_IPFRAG_SESSIONS; i++) {
+ session = &ring->ipfrag[i];
+ if (session->fragments == NULL)
+ continue;
+ if (session->daddr == iph->ip_dst.s_addr &&
+ session->saddr == iph->ip_src.s_addr &&
+ session->id == iph->ip_id &&
+ session->protocol == iph->ip_p) {
+ return session;
+ }
+ }
+ return NULL;
+}
+
+static struct mlx4_en_ipfrag *start_session(struct mlx4_en_rx_ring *ring,
+ struct ip *iph)
+{
+ struct mlx4_en_ipfrag *session;
+ int index = -1;
+ int i;
+
+ for (i = 0; i < MLX4_EN_NUM_IPFRAG_SESSIONS; i++) {
+ if (ring->ipfrag[i].fragments == NULL) {
+ index = i;
+ break;
+ }
+ }
+ if (index < 0)
+ return NULL;
+
+ session = &ring->ipfrag[index];
+
+ return session;
+}
+
+
+static void flush_session(struct mlx4_en_priv *priv,
+ struct mlx4_en_ipfrag *session,
+ u16 more)
+{
+ struct mbuf *mb = session->fragments;
+ struct ip *iph = mb->m_pkthdr.header;
+ struct net_device *dev = mb->m_pkthdr.rcvif;
+
+ /* Update IP length and checksum */
+ iph->ip_len = htons(session->total_len);
+ iph->ip_off = htons(more | (session->offset >> 3));
+ iph->ip_sum = 0;
+ iph->ip_sum = in_cksum_skip(mb, iph->ip_hl * 4,
+ (char *)iph - mb->m_data);
+
+ dev->if_input(dev, mb);
+ session->fragments = NULL;
+ session->last = NULL;
+}
+
+
+static inline void frag_append(struct mlx4_en_priv *priv,
+ struct mlx4_en_ipfrag *session,
+ struct mbuf *mb,
+ unsigned int data_len)
+{
+ struct mbuf *parent = session->fragments;
+
+ /* Update mb bookkeeping */
+ parent->m_pkthdr.len += data_len;
+ session->total_len += data_len;
+
+ m_adj(mb, mb->m_pkthdr.len - data_len);
+
+ session->last->m_next = mb;
+ for (; mb->m_next != NULL; mb = mb->m_next);
+ session->last = mb;
+}
+
+int mlx4_en_rx_frags(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring,
+ struct mbuf *mb, struct mlx4_cqe *cqe)
+{
+ struct mlx4_en_ipfrag *session;
+ struct ip *iph;
+ u16 ip_len;
+ u16 ip_hlen;
+ int data_len;
+ u16 offset;
+
+ iph = (struct ip *)(mtod(mb, char *) + ETHER_HDR_LEN);
+ mb->m_pkthdr.header = iph;
+ ip_len = ntohs(iph->ip_len);
+ ip_hlen = iph->ip_hl * 4;
+ data_len = ip_len - ip_hlen;
+ offset = ntohs(iph->ip_off);
+ offset &= IP_OFFMASK;
+ offset <<= 3;
+
+ session = find_session(ring, iph);
+ if (unlikely(in_cksum_skip(mb, ip_hlen, (char *)iph - mb->m_data))) {
+ if (session)
+ flush_session(priv, session, IP_MF);
+ return -EINVAL;
+ }
+ if (session) {
+ if (unlikely(session->offset + session->total_len !=
+ offset + ip_hlen ||
+ session->total_len + mb->m_pkthdr.len > 65536)) {
+ flush_session(priv, session, IP_MF);
+ goto new_session;
+ }
+ frag_append(priv, session, mb, data_len);
+ } else {
+new_session:
+ session = start_session(ring, iph);
+ if (unlikely(!session))
+ return -ENOSPC;
+
+ session->fragments = mb;
+ session->daddr = iph->ip_dst.s_addr;
+ session->saddr = iph->ip_src.s_addr;
+ session->id = iph->ip_id;
+ session->protocol = iph->ip_p;
+ session->total_len = ip_len;
+ session->offset = offset;
+ for (; mb->m_next != NULL; mb = mb->m_next);
+ session->last = mb;
+ }
+ if (!(ntohs(iph->ip_off) & IP_MF))
+ flush_session(priv, session, 0);
+
+ return 0;
+}
+
+
+void mlx4_en_flush_frags(struct mlx4_en_priv *priv,
+ struct mlx4_en_rx_ring *ring)
+{
+ struct mlx4_en_ipfrag *session;
+ int i;
+
+ for (i = 0; i < MLX4_EN_NUM_IPFRAG_SESSIONS; i++) {
+ session = &ring->ipfrag[i];
+ if (session->fragments)
+ flush_session(priv, session, IP_MF);
+ }
+}
diff --git a/sys/ofed/drivers/net/mlx4/en_main.c b/sys/ofed/drivers/net/mlx4/en_main.c
new file mode 100644
index 0000000..4d75a10
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/en_main.c
@@ -0,0 +1,384 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/netdevice.h>
+
+#include <linux/mlx4/driver.h>
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4_en.h"
+
+MODULE_AUTHOR("Liran Liss, Yevgeny Petrilin");
+MODULE_DESCRIPTION("Mellanox ConnectX HCA Ethernet driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION " ("DRV_RELDATE")");
+
+static const char mlx4_en_version[] =
+ DRV_NAME ": Mellanox ConnectX HCA Ethernet driver v"
+ DRV_VERSION " (" DRV_RELDATE ")\n";
+
+#define MLX4_EN_PARM_INT(X, def_val, desc) \
+ static unsigned int X = def_val;\
+ module_param(X , uint, 0444); \
+ MODULE_PARM_DESC(X, desc);
+
+
+/*
+ * Device scope module parameters
+ */
+
+
+/* Enable RSS TCP traffic */
+MLX4_EN_PARM_INT(tcp_rss, 1,
+ "Enable RSS for incomming TCP traffic or disabled (0)");
+/* Enable RSS UDP traffic */
+MLX4_EN_PARM_INT(udp_rss, 1,
+ "Enable RSS for incomming UDP traffic or disabled (0)");
+
+/* Number of LRO sessions per Rx ring (rounded up to a power of two) */
+MLX4_EN_PARM_INT(num_lro, MLX4_EN_MAX_LRO_DESCRIPTORS,
+ "Number of LRO sessions per ring or disabled (0)");
+
+/* Allow reassembly of fragmented IP packets */
+MLX4_EN_PARM_INT(ip_reasm, 1, "Allow reassembly of fragmented IP packets (!0)");
+
+/* Priority pausing */
+MLX4_EN_PARM_INT(pfctx, 0, "Priority based Flow Control policy on TX[7:0]."
+ " Per priority bit mask");
+MLX4_EN_PARM_INT(pfcrx, 0, "Priority based Flow Control policy on RX[7:0]."
+ " Per priority bit mask");
+
+static int mlx4_en_get_profile(struct mlx4_en_dev *mdev)
+{
+ struct mlx4_en_profile *params = &mdev->profile;
+ int i;
+
+ params->tcp_rss = tcp_rss;
+ params->udp_rss = udp_rss;
+ if (params->udp_rss && !mdev->dev->caps.udp_rss) {
+ mlx4_warn(mdev, "UDP RSS is not supported on this device.\n");
+ params->udp_rss = 0;
+ }
+ params->num_lro = min_t(int, num_lro , MLX4_EN_MAX_LRO_DESCRIPTORS);
+ params->ip_reasm = ip_reasm;
+ for (i = 1; i <= MLX4_MAX_PORTS; i++) {
+ params->prof[i].rx_pause = 1;
+ params->prof[i].rx_ppp = pfcrx;
+ params->prof[i].tx_pause = 1;
+ params->prof[i].tx_ppp = pfctx;
+ params->prof[i].tx_ring_size = MLX4_EN_DEF_TX_RING_SIZE;
+ params->prof[i].rx_ring_size = MLX4_EN_DEF_RX_RING_SIZE;
+ params->prof[i].tx_ring_num = MLX4_EN_NUM_HASH_RINGS + 1 +
+ (!!pfcrx) * MLX4_EN_NUM_PPP_RINGS;
+ }
+
+ return 0;
+}
+
+static void *get_netdev(struct mlx4_dev *dev, void *ctx, u8 port)
+{
+ struct mlx4_en_dev *endev = ctx;
+
+ return endev->pndev[port];
+}
+
+static void mlx4_en_event(struct mlx4_dev *dev, void *endev_ptr,
+ enum mlx4_dev_event event, int port)
+{
+ struct mlx4_en_dev *mdev = (struct mlx4_en_dev *) endev_ptr;
+ struct mlx4_en_priv *priv;
+
+ if (!mdev->pndev[port])
+ return;
+
+ priv = netdev_priv(mdev->pndev[port]);
+ switch (event) {
+ case MLX4_DEV_EVENT_PORT_UP:
+ case MLX4_DEV_EVENT_PORT_DOWN:
+ /* To prevent races, we poll the link state in a separate
+ task rather than changing it here */
+ priv->link_state = event;
+ queue_work(mdev->workqueue, &priv->linkstate_task);
+ break;
+
+ case MLX4_DEV_EVENT_CATASTROPHIC_ERROR:
+ mlx4_err(mdev, "Internal error detected, restarting device\n");
+ break;
+
+ default:
+ mlx4_warn(mdev, "Unhandled event: %d\n", event);
+ }
+}
+
+static void mlx4_en_remove(struct mlx4_dev *dev, void *endev_ptr)
+{
+ struct mlx4_en_dev *mdev = endev_ptr;
+ int i;
+
+ mutex_lock(&mdev->state_lock);
+ mdev->device_up = false;
+ mutex_unlock(&mdev->state_lock);
+
+ mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH)
+ if (mdev->pndev[i])
+ mlx4_en_destroy_netdev(mdev->pndev[i]);
+
+ flush_workqueue(mdev->workqueue);
+ destroy_workqueue(mdev->workqueue);
+ mlx4_mr_free(dev, &mdev->mr);
+ mlx4_uar_free(dev, &mdev->priv_uar);
+ mlx4_pd_free(dev, mdev->priv_pdn);
+ sx_destroy(&mdev->state_lock.sx);
+ mtx_destroy(&mdev->uar_lock.m);
+ kfree(mdev);
+}
+
+static void *mlx4_en_add(struct mlx4_dev *dev)
+{
+ static int mlx4_en_version_printed;
+ struct mlx4_en_dev *mdev;
+ int i;
+ int err;
+
+ if (!mlx4_en_version_printed) {
+ printk(KERN_INFO "%s", mlx4_en_version);
+ mlx4_en_version_printed++;
+ }
+
+ mdev = kzalloc(sizeof *mdev, GFP_KERNEL);
+ if (!mdev) {
+ dev_err(&dev->pdev->dev, "Device struct alloc failed, "
+ "aborting.\n");
+ err = -ENOMEM;
+ goto err_free_res;
+ }
+
+ if (mlx4_pd_alloc(dev, &mdev->priv_pdn))
+ goto err_free_dev;
+
+ if (mlx4_uar_alloc(dev, &mdev->priv_uar))
+ goto err_pd;
+
+ mtx_init(&mdev->uar_lock.m, "mlx4 uar", NULL, MTX_DEF);
+ mdev->uar_map = ioremap(mdev->priv_uar.pfn << PAGE_SHIFT, PAGE_SIZE);
+ if (!mdev->uar_map)
+ goto err_uar;
+
+ mdev->dev = dev;
+ mdev->dma_device = &(dev->pdev->dev);
+ mdev->pdev = dev->pdev;
+ mdev->device_up = false;
+
+ mdev->LSO_support = !!(dev->caps.flags & (1 << 15));
+ if (!mdev->LSO_support)
+ mlx4_warn(mdev, "LSO not supported, please upgrade to later "
+ "FW version to enable LSO\n");
+
+ if (mlx4_mr_alloc(mdev->dev, mdev->priv_pdn, 0, ~0ull,
+ MLX4_PERM_LOCAL_WRITE | MLX4_PERM_LOCAL_READ,
+ 0, 0, &mdev->mr)) {
+ mlx4_err(mdev, "Failed allocating memory region\n");
+ goto err_uar;
+ }
+ if (mlx4_mr_enable(mdev->dev, &mdev->mr)) {
+ mlx4_err(mdev, "Failed enabling memory region\n");
+ goto err_mr;
+ }
+
+ /* Build device profile according to supplied module parameters */
+ err = mlx4_en_get_profile(mdev);
+ if (err) {
+ mlx4_err(mdev, "Bad module parameters, aborting.\n");
+ goto err_mr;
+ }
+
+ /* Configure wich ports to start according to module parameters */
+ mdev->port_cnt = 0;
+ mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH)
+ mdev->port_cnt++;
+
+ /* If we did not receive an explicit number of Rx rings, default to
+ * the number of completion vectors populated by the mlx4_core */
+ mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) {
+ mlx4_info(mdev, "Using %d tx rings for port:%d\n",
+ mdev->profile.prof[i].tx_ring_num, i);
+ mdev->profile.prof[i].rx_ring_num = rounddown_pow_of_two(
+ min_t(int, dev->caps.num_comp_vectors, MAX_RX_RINGS/2)) +
+ (mdev->profile.udp_rss ? rounddown_pow_of_two(
+ min_t(int, dev->caps.num_comp_vectors, MAX_RX_RINGS/2)) : 1);
+ mlx4_info(mdev, "Defaulting to %d rx rings for port:%d\n",
+ mdev->profile.prof[i].rx_ring_num, i);
+ }
+
+ /* Create our own workqueue for reset/multicast tasks
+ * Note: we cannot use the shared workqueue because of deadlocks caused
+ * by the rtnl lock */
+ mdev->workqueue = create_singlethread_workqueue("mlx4_en");
+ if (!mdev->workqueue) {
+ err = -ENOMEM;
+ goto err_mr;
+ }
+
+ /* At this stage all non-port specific tasks are complete:
+ * mark the card state as up */
+ sx_init(&mdev->state_lock.sx, "mlxen state");
+ mdev->device_up = true;
+
+ /* Setup ports */
+
+ /* Create a netdev for each port */
+ mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) {
+ mlx4_info(mdev, "Activating port:%d\n", i);
+ if (mlx4_en_init_netdev(mdev, i, &mdev->profile.prof[i])) {
+ mdev->pndev[i] = NULL;
+ goto err_free_netdev;
+ }
+ }
+ return mdev;
+
+
+err_free_netdev:
+ mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) {
+ if (mdev->pndev[i])
+ mlx4_en_destroy_netdev(mdev->pndev[i]);
+ }
+
+ mutex_lock(&mdev->state_lock);
+ mdev->device_up = false;
+ mutex_unlock(&mdev->state_lock);
+ flush_workqueue(mdev->workqueue);
+
+ /* Stop event queue before we drop down to release shared SW state */
+ destroy_workqueue(mdev->workqueue);
+
+err_mr:
+ mlx4_mr_free(dev, &mdev->mr);
+err_uar:
+ mtx_destroy(&mdev->uar_lock.m);
+ mlx4_uar_free(dev, &mdev->priv_uar);
+err_pd:
+ mlx4_pd_free(dev, mdev->priv_pdn);
+err_free_dev:
+ kfree(mdev);
+err_free_res:
+ return NULL;
+}
+
+enum mlx4_query_reply mlx4_en_query(void *endev_ptr, void *int_dev)
+{
+ struct mlx4_en_dev *mdev = endev_ptr;
+ struct net_device *netdev = int_dev;
+ int p;
+
+ for (p = 1; p <= MLX4_MAX_PORTS; ++p)
+ if (mdev->pndev[p] == netdev)
+ return p;
+
+ return MLX4_QUERY_NOT_MINE;
+}
+
+#if 0
+static struct pci_device_id mlx4_en_pci_table[] = {
+ { PCI_VDEVICE(MELLANOX, 0x6340) }, /* MT25408 "Hermon" SDR */
+ { PCI_VDEVICE(MELLANOX, 0x634a) }, /* MT25408 "Hermon" DDR */
+ { PCI_VDEVICE(MELLANOX, 0x6354) }, /* MT25408 "Hermon" QDR */
+ { PCI_VDEVICE(MELLANOX, 0x6732) }, /* MT25408 "Hermon" DDR PCIe gen2 */
+ { PCI_VDEVICE(MELLANOX, 0x673c) }, /* MT25408 "Hermon" QDR PCIe gen2 */
+ { PCI_VDEVICE(MELLANOX, 0x6368) }, /* MT25408 "Hermon" EN 10GigE */
+ { PCI_VDEVICE(MELLANOX, 0x6750) }, /* MT25408 "Hermon" EN 10GigE PCIe gen2 */
+ { PCI_VDEVICE(MELLANOX, 0x6372) }, /* MT25458 ConnectX EN 10GBASE-T 10GigE */
+ { PCI_VDEVICE(MELLANOX, 0x675a) }, /* MT25458 ConnectX EN 10GBASE-T+Gen2 10GigE */
+ { PCI_VDEVICE(MELLANOX, 0x6764) }, /* MT26468 ConnectX EN 10GigE PCIe gen2 */
+ { PCI_VDEVICE(MELLANOX, 0x6746) }, /* MT26438 ConnectX VPI PCIe 2.0 5GT/s - IB QDR / 10GigE Virt+ */
+ { PCI_VDEVICE(MELLANOX, 0x676e) }, /* MT26478 ConnectX EN 40GigE PCIe 2.0 5GT/s */
+ { PCI_VDEVICE(MELLANOX, 0x6778) }, /* MT26488 ConnectX VPI PCIe 2.0 5GT/s - IB DDR / 10GigE Virt+ */
+ { PCI_VDEVICE(MELLANOX, 0x1000) },
+ { PCI_VDEVICE(MELLANOX, 0x1001) },
+ { PCI_VDEVICE(MELLANOX, 0x1002) },
+ { PCI_VDEVICE(MELLANOX, 0x1003) },
+ { PCI_VDEVICE(MELLANOX, 0x1004) },
+ { PCI_VDEVICE(MELLANOX, 0x1005) },
+ { PCI_VDEVICE(MELLANOX, 0x1006) },
+ { PCI_VDEVICE(MELLANOX, 0x1007) },
+ { PCI_VDEVICE(MELLANOX, 0x1008) },
+ { PCI_VDEVICE(MELLANOX, 0x1009) },
+ { PCI_VDEVICE(MELLANOX, 0x100a) },
+ { PCI_VDEVICE(MELLANOX, 0x100b) },
+ { PCI_VDEVICE(MELLANOX, 0x100c) },
+ { PCI_VDEVICE(MELLANOX, 0x100d) },
+ { PCI_VDEVICE(MELLANOX, 0x100e) },
+ { PCI_VDEVICE(MELLANOX, 0x100f) },
+ { 0, }
+};
+
+MODULE_DEVICE_TABLE(pci, mlx4_en_pci_table);
+#endif
+
+static struct mlx4_interface mlx4_en_interface = {
+ .add = mlx4_en_add,
+ .remove = mlx4_en_remove,
+ .event = mlx4_en_event,
+ .query = mlx4_en_query,
+ .get_prot_dev = get_netdev,
+ .protocol = MLX4_PROT_EN,
+};
+
+static int __init mlx4_en_init(void)
+{
+ return mlx4_register_interface(&mlx4_en_interface);
+}
+
+static void __exit mlx4_en_cleanup(void)
+{
+ mlx4_unregister_interface(&mlx4_en_interface);
+}
+
+module_init(mlx4_en_init);
+module_exit(mlx4_en_cleanup);
+
+#undef MODULE_VERSION
+#include <sys/module.h>
+static int
+mlxen_evhand(module_t mod, int event, void *arg)
+{
+ return (0);
+}
+static moduledata_t mlxen_mod = {
+ .name = "mlxen",
+ .evhand = mlxen_evhand,
+};
+DECLARE_MODULE(mlxen, mlxen_mod, SI_SUB_SMP, SI_ORDER_ANY);
+MODULE_DEPEND(mlxen, mlx4, 1, 1, 1);
diff --git a/sys/ofed/drivers/net/mlx4/en_netdev.c b/sys/ofed/drivers/net/mlx4/en_netdev.c
new file mode 100644
index 0000000..531f46f
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/en_netdev.c
@@ -0,0 +1,1511 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include "mlx4_en.h"
+
+#include <linux/mlx4/driver.h>
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/cmd.h>
+#include <linux/mlx4/cq.h>
+
+#include <linux/delay.h>
+#include <net/ethernet.h>
+#include <net/if_vlan_var.h>
+#include <sys/sockio.h>
+
+static void mlx4_en_sysctl_stat(struct mlx4_en_priv *priv);
+
+static void mlx4_en_vlan_rx_add_vid(void *arg, struct net_device *dev, u16 vid)
+{
+ struct mlx4_en_priv *priv = netdev_priv(dev);
+ int idx;
+ u8 field;
+
+ if ((vid == 0) || (vid > 4095)) /* Invalid */
+ return;
+
+ en_dbg(HW, priv, "adding VLAN:%d\n", vid);
+
+ spin_lock(&priv->vlan_lock);
+ priv->vlgrp_modified = true;
+ idx = vid >> 5;
+ field = 1 << (vid & 0x1f);
+ if (priv->vlan_unregister[idx] & field)
+ priv->vlan_unregister[idx] &= ~field;
+ else
+ priv->vlan_register[idx] |= field;
+ priv->vlans[idx] |= field;
+ spin_unlock(&priv->vlan_lock);
+}
+
+static void mlx4_en_vlan_rx_kill_vid(void *arg, struct net_device *dev, u16 vid)
+{
+ struct mlx4_en_priv *priv = netdev_priv(dev);
+ int idx;
+ u8 field;
+
+ if ((vid == 0) || (vid > 4095)) /* Invalid */
+ return;
+ en_dbg(HW, priv, "Killing VID:%d\n", vid);
+ spin_lock(&priv->vlan_lock);
+ priv->vlgrp_modified = true;
+ idx = vid >> 5;
+ field = 1 << (vid & 0x1f);
+ if (priv->vlan_register[idx] & field)
+ priv->vlan_register[idx] &= ~field;
+ else
+ priv->vlan_unregister[idx] |= field;
+ priv->vlans[idx] &= ~field;
+ spin_unlock(&priv->vlan_lock);
+}
+
+u64 mlx4_en_mac_to_u64(u8 *addr)
+{
+ u64 mac = 0;
+ int i;
+
+ for (i = 0; i < ETHER_ADDR_LEN; i++) {
+ mac <<= 8;
+ mac |= addr[i];
+ }
+ return mac;
+}
+
+static int mlx4_en_cache_mclist(struct net_device *dev, u64 **mcaddrp)
+{
+ struct ifmultiaddr *ifma;;
+ u64 *mcaddr;
+ int cnt;
+ int i;
+
+ *mcaddrp = NULL;
+restart:
+ cnt = 0;
+ if_maddr_rlock(dev);
+ TAILQ_FOREACH(ifma, &dev->if_multiaddrs, ifma_link) {
+ if (ifma->ifma_addr->sa_family != AF_LINK)
+ continue;
+ if (((struct sockaddr_dl *)ifma->ifma_addr)->sdl_alen !=
+ ETHER_ADDR_LEN)
+ continue;
+ cnt++;
+ }
+ if_maddr_runlock(dev);
+ if (cnt == 0)
+ return (0);
+ mcaddr = kmalloc(sizeof(u64) * cnt, GFP_KERNEL);
+ if (mcaddr == NULL)
+ return (0);
+ i = 0;
+ if_maddr_rlock(dev);
+ TAILQ_FOREACH(ifma, &dev->if_multiaddrs, ifma_link) {
+ if (ifma->ifma_addr->sa_family != AF_LINK)
+ continue;
+ if (((struct sockaddr_dl *)ifma->ifma_addr)->sdl_alen !=
+ ETHER_ADDR_LEN)
+ continue;
+ /* Make sure the list didn't grow. */
+ if (i == cnt) {
+ if_maddr_runlock(dev);
+ kfree(mcaddr);
+ goto restart;
+ }
+ mcaddr[i++] = mlx4_en_mac_to_u64(
+ LLADDR((struct sockaddr_dl *)ifma->ifma_addr));
+ }
+ if_maddr_runlock(dev);
+ *mcaddrp = mcaddr;
+ return (i);
+}
+
+
+static void mlx4_en_set_multicast(struct net_device *dev)
+{
+ struct mlx4_en_priv *priv = netdev_priv(dev);
+
+ if (!priv->port_up)
+ return;
+
+ queue_work(priv->mdev->workqueue, &priv->mcast_task);
+}
+
+static void mlx4_en_do_set_multicast(struct work_struct *work)
+{
+ struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv,
+ mcast_task);
+ struct net_device *dev = priv->dev;
+ struct mlx4_en_dev *mdev = priv->mdev;
+ int err;
+
+ mutex_lock(&mdev->state_lock);
+ if (!mdev->device_up) {
+ en_dbg(HW, priv, "Card is not up, "
+ "ignoring multicast change.\n");
+ goto out;
+ }
+ if (!priv->port_up) {
+ en_dbg(HW, priv, "Port is down, "
+ "ignoring multicast change.\n");
+ goto out;
+ }
+
+ /*
+ * Promsicuous mode: disable all filters
+ */
+
+ if (dev->if_flags & IFF_PROMISC) {
+ if (!(priv->flags & MLX4_EN_FLAG_PROMISC)) {
+ priv->flags |= MLX4_EN_FLAG_PROMISC;
+
+ /* Enable promiscouos mode */
+ err = mlx4_SET_PORT_qpn_calc(mdev->dev, priv->port,
+ priv->base_qpn, 1);
+ if (err)
+ en_err(priv, "Failed enabling "
+ "promiscous mode\n");
+
+ /* Disable port multicast filter (unconditionally) */
+ err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0,
+ 0, MLX4_MCAST_DISABLE);
+ if (err)
+ en_err(priv, "Failed disabling "
+ "multicast filter\n");
+
+ /* Disable port VLAN filter */
+ err = mlx4_SET_VLAN_FLTR(mdev->dev, priv->port, NULL);
+ if (err)
+ en_err(priv, "Failed disabling VLAN filter\n");
+ }
+ goto out;
+ }
+
+ /*
+ * Not in promiscous mode
+ */
+
+ if (priv->flags & MLX4_EN_FLAG_PROMISC) {
+ priv->flags &= ~MLX4_EN_FLAG_PROMISC;
+
+ /* Disable promiscouos mode */
+ err = mlx4_SET_PORT_qpn_calc(mdev->dev, priv->port,
+ priv->base_qpn, 0);
+ if (err)
+ en_err(priv, "Failed disabling promiscous mode\n");
+
+ /* Enable port VLAN filter */
+ err = mlx4_SET_VLAN_FLTR(mdev->dev, priv->port, priv->vlans);
+ if (err)
+ en_err(priv, "Failed enabling VLAN filter\n");
+ }
+
+ /* Enable/disable the multicast filter according to IFF_ALLMULTI */
+ if (dev->if_flags & IFF_ALLMULTI) {
+ err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0,
+ 0, MLX4_MCAST_DISABLE);
+ if (err)
+ en_err(priv, "Failed disabling multicast filter\n");
+ } else {
+ u64 *mcaddr;
+ int mccount;
+ int i;
+
+ err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0,
+ 0, MLX4_MCAST_DISABLE);
+ if (err)
+ en_err(priv, "Failed disabling multicast filter\n");
+
+ /* Flush mcast filter and init it with broadcast address */
+ mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, ETH_BCAST,
+ 1, MLX4_MCAST_CONFIG);
+
+ /* Update multicast list - we cache all addresses so they won't
+ * change while HW is updated holding the command semaphor */
+ mccount = mlx4_en_cache_mclist(dev, &mcaddr);
+ for (i = 0; i < mccount; i++)
+ mlx4_SET_MCAST_FLTR(mdev->dev, priv->port,
+ mcaddr[i], 0, MLX4_MCAST_CONFIG);
+ err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0,
+ 0, MLX4_MCAST_ENABLE);
+ if (err)
+ en_err(priv, "Failed enabling multicast filter\n");
+
+ kfree(mcaddr);
+ }
+out:
+ mutex_unlock(&mdev->state_lock);
+}
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+static void mlx4_en_netpoll(struct net_device *dev)
+{
+ struct mlx4_en_priv *priv = netdev_priv(dev);
+ struct mlx4_en_cq *cq;
+ unsigned long flags;
+ int i;
+
+ for (i = 0; i < priv->rx_ring_num; i++) {
+ cq = &priv->rx_cq[i];
+ spin_lock_irqsave(&cq->lock, flags);
+ napi_synchronize(&cq->napi);
+ if (priv->rx_ring[i].use_frags)
+ mlx4_en_process_rx_cq(dev, cq, 0);
+ else
+ mlx4_en_process_rx_cq_mb(dev, cq, 0);
+ spin_unlock_irqrestore(&cq->lock, flags);
+ }
+}
+#endif
+
+static void mlx4_en_watchdog_timeout(void *arg)
+{
+ struct mlx4_en_priv *priv = arg;
+ struct mlx4_en_dev *mdev = priv->mdev;
+
+ en_dbg(DRV, priv, "Scheduling watchdog\n");
+ queue_work(mdev->workqueue, &priv->watchdog_task);
+ if (priv->port_up)
+ callout_reset(&priv->watchdog_timer, MLX4_EN_WATCHDOG_TIMEOUT,
+ mlx4_en_watchdog_timeout, priv);
+}
+
+
+/* XXX This clears user settings in too many cases. */
+static void mlx4_en_set_default_moderation(struct mlx4_en_priv *priv)
+{
+ struct mlx4_en_cq *cq;
+ int i;
+
+ /* If we haven't received a specific coalescing setting
+ * (module param), we set the moderation paramters as follows:
+ * - moder_cnt is set to the number of mtu sized packets to
+ * satisfy our coelsing target.
+ * - moder_time is set to a fixed value.
+ */
+ priv->rx_frames = MLX4_EN_RX_COAL_TARGET / priv->dev->if_mtu + 1;
+ priv->rx_usecs = MLX4_EN_RX_COAL_TIME;
+ en_dbg(INTR, priv, "Default coalesing params for mtu:%ld - "
+ "rx_frames:%d rx_usecs:%d\n",
+ priv->dev->if_mtu, priv->rx_frames, priv->rx_usecs);
+
+ /* Setup cq moderation params */
+ for (i = 0; i < priv->rx_ring_num; i++) {
+ cq = &priv->rx_cq[i];
+ cq->moder_cnt = priv->rx_frames;
+ cq->moder_time = priv->rx_usecs;
+ }
+
+ for (i = 0; i < priv->tx_ring_num; i++) {
+ cq = &priv->tx_cq[i];
+ cq->moder_cnt = MLX4_EN_TX_COAL_PKTS;
+ cq->moder_time = MLX4_EN_TX_COAL_TIME;
+ }
+
+ /* Reset auto-moderation params */
+ priv->pkt_rate_low = MLX4_EN_RX_RATE_LOW;
+ priv->rx_usecs_low = MLX4_EN_RX_COAL_TIME_LOW;
+ priv->pkt_rate_high = MLX4_EN_RX_RATE_HIGH;
+ priv->rx_usecs_high = MLX4_EN_RX_COAL_TIME_HIGH;
+ priv->sample_interval = MLX4_EN_SAMPLE_INTERVAL;
+ priv->adaptive_rx_coal = 1;
+ priv->last_moder_time = MLX4_EN_AUTO_CONF;
+ priv->last_moder_jiffies = 0;
+ priv->last_moder_packets = 0;
+ priv->last_moder_tx_packets = 0;
+ priv->last_moder_bytes = 0;
+}
+
+static void mlx4_en_auto_moderation(struct mlx4_en_priv *priv)
+{
+ unsigned long period = (unsigned long) (jiffies - priv->last_moder_jiffies);
+ struct mlx4_en_cq *cq;
+ unsigned long packets;
+ unsigned long rate;
+ unsigned long avg_pkt_size;
+ unsigned long rx_packets;
+ unsigned long rx_bytes;
+ unsigned long tx_packets;
+ unsigned long tx_pkt_diff;
+ unsigned long rx_pkt_diff;
+ int moder_time;
+ int i, err;
+
+ if (!priv->adaptive_rx_coal || period < priv->sample_interval * HZ)
+ return;
+
+ spin_lock(&priv->stats_lock);
+ rx_packets = priv->dev->if_ipackets;
+ rx_bytes = priv->dev->if_ibytes;
+ tx_packets = priv->dev->if_opackets;
+ spin_unlock(&priv->stats_lock);
+
+ if (!priv->last_moder_jiffies || !period)
+ goto out;
+
+ tx_pkt_diff = ((unsigned long) (tx_packets -
+ priv->last_moder_tx_packets));
+ rx_pkt_diff = ((unsigned long) (rx_packets -
+ priv->last_moder_packets));
+ packets = max(tx_pkt_diff, rx_pkt_diff);
+ rate = packets * HZ / period;
+ avg_pkt_size = packets ? ((unsigned long) (rx_bytes -
+ priv->last_moder_bytes)) / packets : 0;
+
+ /* Apply auto-moderation only when packet rate exceeds a rate that
+ * it matters */
+ if (rate > MLX4_EN_RX_RATE_THRESH) {
+ /* If tx and rx packet rates are not balanced, assume that
+ * traffic is mainly BW bound and apply maximum moderation.
+ * Otherwise, moderate according to packet rate */
+ if (2 * tx_pkt_diff > 3 * rx_pkt_diff ||
+ 2 * rx_pkt_diff > 3 * tx_pkt_diff) {
+ moder_time = priv->rx_usecs_high;
+ } else {
+ if (rate < priv->pkt_rate_low ||
+ avg_pkt_size < MLX4_EN_AVG_PKT_SMALL)
+ moder_time = priv->rx_usecs_low;
+ else if (rate > priv->pkt_rate_high)
+ moder_time = priv->rx_usecs_high;
+ else
+ moder_time = (rate - priv->pkt_rate_low) *
+ (priv->rx_usecs_high - priv->rx_usecs_low) /
+ (priv->pkt_rate_high - priv->pkt_rate_low) +
+ priv->rx_usecs_low;
+ }
+ } else {
+ /* When packet rate is low, use default moderation rather than
+ * 0 to prevent interrupt storms if traffic suddenly increases */
+ moder_time = priv->rx_usecs;
+ }
+
+ en_dbg(INTR, priv, "tx rate:%lu rx_rate:%lu\n",
+ tx_pkt_diff * HZ / period, rx_pkt_diff * HZ / period);
+
+ en_dbg(INTR, priv, "Rx moder_time changed from:%d to %d period:%lu "
+ "[jiff] packets:%lu avg_pkt_size:%lu rate:%lu [p/s])\n",
+ priv->last_moder_time, moder_time, period, packets,
+ avg_pkt_size, rate);
+
+ if (moder_time != priv->last_moder_time) {
+ priv->last_moder_time = moder_time;
+ for (i = 0; i < priv->rx_ring_num; i++) {
+ cq = &priv->rx_cq[i];
+ cq->moder_time = moder_time;
+ err = mlx4_en_set_cq_moder(priv, cq);
+ if (err) {
+ en_err(priv, "Failed modifying moderation for cq:%d\n", i);
+ break;
+ }
+ }
+ }
+
+out:
+ priv->last_moder_packets = rx_packets;
+ priv->last_moder_tx_packets = tx_packets;
+ priv->last_moder_bytes = rx_bytes;
+ priv->last_moder_jiffies = jiffies;
+}
+
+static void mlx4_en_handle_vlans(struct mlx4_en_priv *priv)
+{
+ u8 vlan_register[VLAN_FLTR_SIZE];
+ u8 vlan_unregister[VLAN_FLTR_SIZE];
+ int i, j, idx;
+ u16 vid;
+
+ /* cache the vlan data for processing
+ * done under lock to avoid changes during work */
+ spin_lock(&priv->vlan_lock);
+ for (i = 0; i < VLAN_FLTR_SIZE; i++) {
+ vlan_register[i] = priv->vlan_register[i];
+ priv->vlan_register[i] = 0;
+ vlan_unregister[i] = priv->vlan_unregister[i];
+ priv->vlan_unregister[i] = 0;
+ }
+ priv->vlgrp_modified = false;
+ spin_unlock(&priv->vlan_lock);
+
+ /* Configure the vlan filter
+ * The vlgrp is updated with all the vids that need to be allowed */
+ if (mlx4_SET_VLAN_FLTR(priv->mdev->dev, priv->port, priv->vlans))
+ en_err(priv, "Failed configuring VLAN filter\n");
+
+ /* Configure the VLAN table */
+ for (i = 0; i < VLAN_FLTR_SIZE; i++) {
+ for (j = 0; j < 32; j++) {
+ vid = (i << 5) + j;
+ if (vlan_register[i] & (1 << j))
+ if (mlx4_register_vlan(priv->mdev->dev, priv->port, vid, &idx))
+ en_dbg(HW, priv, "failed registering vlan %d\n", vid);
+ if (vlan_unregister[i] & (1 << j)) {
+ if (!mlx4_find_cached_vlan(priv->mdev->dev, priv->port, vid, &idx))
+ mlx4_unregister_vlan(priv->mdev->dev, priv->port, idx);
+ else
+ en_dbg(HW, priv, "could not find vid %d in cache\n", vid);
+ }
+ }
+ }
+}
+
+static void mlx4_en_do_get_stats(struct work_struct *work)
+{
+ struct delayed_work *delay = to_delayed_work(work);
+ struct mlx4_en_priv *priv = container_of(delay, struct mlx4_en_priv,
+ stats_task);
+ struct mlx4_en_dev *mdev = priv->mdev;
+ int err;
+
+ err = mlx4_en_DUMP_ETH_STATS(mdev, priv->port, 0);
+ if (err)
+ en_dbg(HW, priv, "Could not update stats \n");
+
+
+ mutex_lock(&mdev->state_lock);
+ if (mdev->device_up) {
+ if (priv->port_up) {
+ if (priv->vlgrp_modified)
+ mlx4_en_handle_vlans(priv);
+
+ mlx4_en_auto_moderation(priv);
+ }
+
+ queue_delayed_work(mdev->workqueue, &priv->stats_task, STATS_DELAY);
+ }
+ if (mdev->mac_removed[MLX4_MAX_PORTS + 1 - priv->port]) {
+ panic("mlx4_en_do_get_stats: Unexpected mac removed for %d\n",
+ priv->port);
+ mdev->mac_removed[MLX4_MAX_PORTS + 1 - priv->port] = 0;
+ }
+ mutex_unlock(&mdev->state_lock);
+}
+
+static void mlx4_en_linkstate(struct work_struct *work)
+{
+ struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv,
+ linkstate_task);
+ struct mlx4_en_dev *mdev = priv->mdev;
+ int linkstate = priv->link_state;
+
+ mutex_lock(&mdev->state_lock);
+ /* If observable port state changed set carrier state and
+ * report to system log */
+ if (priv->last_link_state != linkstate) {
+ if (linkstate == MLX4_DEV_EVENT_PORT_DOWN) {
+ if_link_state_change(priv->dev, LINK_STATE_DOWN);
+ } else {
+ en_info(priv, "Link Up\n");
+ if_link_state_change(priv->dev, LINK_STATE_UP);
+ }
+ }
+ priv->last_link_state = linkstate;
+ mutex_unlock(&mdev->state_lock);
+}
+
+
+int mlx4_en_start_port(struct net_device *dev)
+{
+ struct mlx4_en_priv *priv = netdev_priv(dev);
+ struct mlx4_en_dev *mdev = priv->mdev;
+ struct mlx4_en_cq *cq;
+ struct mlx4_en_tx_ring *tx_ring;
+ int rx_index = 0;
+ int tx_index = 0;
+ int err = 0;
+ int i;
+ int j;
+
+ if (priv->port_up) {
+ en_dbg(DRV, priv, "start port called while port already up\n");
+ return 0;
+ }
+
+ /* Calculate Rx buf size */
+ dev->if_mtu = min(dev->if_mtu, priv->max_mtu);
+ mlx4_en_calc_rx_buf(dev);
+ en_dbg(DRV, priv, "Rx buf size:%d\n", priv->rx_mb_size);
+
+ /* Configure rx cq's and rings */
+ err = mlx4_en_activate_rx_rings(priv);
+ if (err) {
+ en_err(priv, "Failed to activate RX rings\n");
+ return err;
+ }
+
+ for (i = 0; i < priv->rx_ring_num; i++) {
+ cq = &priv->rx_cq[i];
+
+ err = mlx4_en_activate_cq(priv, cq);
+ if (err) {
+ en_err(priv, "Failed activating Rx CQ\n");
+ goto cq_err;
+ }
+ for (j = 0; j < cq->size; j++)
+ cq->buf[j].owner_sr_opcode = MLX4_CQE_OWNER_MASK;
+ err = mlx4_en_set_cq_moder(priv, cq);
+ if (err) {
+ en_err(priv, "Failed setting cq moderation parameters");
+ mlx4_en_deactivate_cq(priv, cq);
+ goto cq_err;
+ }
+ mlx4_en_arm_cq(priv, cq);
+ priv->rx_ring[i].cqn = cq->mcq.cqn;
+ ++rx_index;
+ }
+
+ err = mlx4_en_config_rss_steer(priv);
+ if (err) {
+ en_err(priv, "Failed configuring rss steering\n");
+ goto cq_err;
+ }
+
+ /* Configure tx cq's and rings */
+ for (i = 0; i < priv->tx_ring_num; i++) {
+ /* Configure cq */
+ cq = &priv->tx_cq[i];
+ err = mlx4_en_activate_cq(priv, cq);
+ if (err) {
+ en_err(priv, "Failed allocating Tx CQ\n");
+ goto tx_err;
+ }
+ err = mlx4_en_set_cq_moder(priv, cq);
+ if (err) {
+ en_err(priv, "Failed setting cq moderation parameters");
+ mlx4_en_deactivate_cq(priv, cq);
+ goto tx_err;
+ }
+ en_dbg(DRV, priv, "Resetting index of collapsed CQ:%d to -1\n", i);
+ cq->buf->wqe_index = cpu_to_be16(0xffff);
+
+ /* Configure ring */
+ tx_ring = &priv->tx_ring[i];
+ err = mlx4_en_activate_tx_ring(priv, tx_ring, cq->mcq.cqn);
+ if (err) {
+ en_err(priv, "Failed allocating Tx ring\n");
+ mlx4_en_deactivate_cq(priv, cq);
+ goto tx_err;
+ }
+ /* Set initial ownership of all Tx TXBBs to SW (1) */
+ for (j = 0; j < tx_ring->buf_size; j += STAMP_STRIDE)
+ *((u32 *) (tx_ring->buf + j)) = 0xffffffff;
+ ++tx_index;
+ }
+
+ /* Configure port */
+ err = mlx4_SET_PORT_general(mdev->dev, priv->port,
+ priv->rx_mb_size + ETHER_CRC_LEN,
+ priv->prof->tx_pause,
+ priv->prof->tx_ppp,
+ priv->prof->rx_pause,
+ priv->prof->rx_ppp);
+ if (err) {
+ en_err(priv, "Failed setting port general configurations "
+ "for port %d, with error %d\n", priv->port, err);
+ goto tx_err;
+ }
+ /* Set default qp number */
+ err = mlx4_SET_PORT_qpn_calc(mdev->dev, priv->port, priv->base_qpn, 0);
+ if (err) {
+ en_err(priv, "Failed setting default qp numbers\n");
+ goto tx_err;
+ }
+ /* Set port mac number */
+ en_dbg(DRV, priv, "Setting mac for port %d\n", priv->port);
+ err = mlx4_register_mac(mdev->dev, priv->port,
+ mlx4_en_mac_to_u64(IF_LLADDR(dev)),
+ &priv->mac_index);
+ if (err) {
+ en_err(priv, "Failed setting port mac\n");
+ goto tx_err;
+ }
+ mdev->mac_removed[priv->port] = 0;
+
+ /* Init port */
+ en_dbg(HW, priv, "Initializing port\n");
+ err = mlx4_INIT_PORT(mdev->dev, priv->port);
+ if (err) {
+ en_err(priv, "Failed Initializing port\n");
+ goto mac_err;
+ }
+
+ /* Set the various hardware offload abilities */
+ dev->if_hwassist = 0;
+ if (dev->if_capenable & IFCAP_TSO4)
+ dev->if_hwassist |= CSUM_TSO;
+ if (dev->if_capenable & IFCAP_TXCSUM)
+ dev->if_hwassist |= (CSUM_TCP | CSUM_UDP | CSUM_IP);
+ if (dev->if_capenable & IFCAP_RXCSUM)
+ priv->rx_csum = 1;
+ else
+ priv->rx_csum = 0;
+
+ priv->port_up = true;
+
+ /* Populate multicast list */
+ mlx4_en_set_multicast(dev);
+
+ /* Enable the queues. */
+ atomic_clear_int(&dev->if_drv_flags, IFF_DRV_OACTIVE);
+ atomic_set_int(&dev->if_drv_flags, IFF_DRV_RUNNING);
+
+ callout_reset(&priv->watchdog_timer, MLX4_EN_WATCHDOG_TIMEOUT,
+ mlx4_en_watchdog_timeout, priv);
+
+ return 0;
+
+mac_err:
+ mlx4_unregister_mac(mdev->dev, priv->port, priv->mac_index);
+tx_err:
+ while (tx_index--) {
+ mlx4_en_deactivate_tx_ring(priv, &priv->tx_ring[tx_index]);
+ mlx4_en_deactivate_cq(priv, &priv->tx_cq[tx_index]);
+ }
+
+ mlx4_en_release_rss_steer(priv);
+cq_err:
+ while (rx_index--)
+ mlx4_en_deactivate_cq(priv, &priv->rx_cq[rx_index]);
+ for (i = 0; i < priv->rx_ring_num; i++)
+ mlx4_en_deactivate_rx_ring(priv, &priv->rx_ring[i]);
+
+ return err; /* need to close devices */
+}
+
+
+void mlx4_en_stop_port(struct net_device *dev)
+{
+ struct mlx4_en_priv *priv = netdev_priv(dev);
+ struct mlx4_en_dev *mdev = priv->mdev;
+ int i;
+
+ if (!priv->port_up) {
+ en_dbg(DRV, priv, "stop port called while port already down\n");
+ return;
+ }
+
+ /* Set port as not active */
+ priv->port_up = false;
+
+ /* Unregister Mac address for the port */
+ mlx4_unregister_mac(mdev->dev, priv->port, priv->mac_index);
+ mdev->mac_removed[priv->port] = 1;
+
+ /* Free TX Rings */
+ for (i = 0; i < priv->tx_ring_num; i++) {
+ mlx4_en_deactivate_tx_ring(priv, &priv->tx_ring[i]);
+ mlx4_en_deactivate_cq(priv, &priv->tx_cq[i]);
+ }
+ msleep(10);
+
+ for (i = 0; i < priv->tx_ring_num; i++)
+ mlx4_en_free_tx_buf(dev, &priv->tx_ring[i]);
+
+ /* Free RSS qps */
+ mlx4_en_release_rss_steer(priv);
+
+ /* Free RX Rings */
+ for (i = 0; i < priv->rx_ring_num; i++) {
+ mlx4_en_deactivate_rx_ring(priv, &priv->rx_ring[i]);
+ mlx4_en_deactivate_cq(priv, &priv->rx_cq[i]);
+ }
+
+ /* close port*/
+ mlx4_CLOSE_PORT(mdev->dev, priv->port);
+
+ callout_stop(&priv->watchdog_timer);
+
+ atomic_clear_int(&dev->if_drv_flags, IFF_DRV_RUNNING);
+}
+
+static void mlx4_en_restart(struct work_struct *work)
+{
+ struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv,
+ watchdog_task);
+ struct mlx4_en_dev *mdev = priv->mdev;
+ struct net_device *dev = priv->dev;
+ struct mlx4_en_tx_ring *ring;
+ int i;
+
+ if (priv->blocked == 0 || priv->port_up == 0)
+ return;
+ for (i = 0; i < priv->tx_ring_num; i++) {
+ ring = &priv->tx_ring[i];
+ if (ring->blocked &&
+ ring->watchdog_time + MLX4_EN_WATCHDOG_TIMEOUT < ticks)
+ goto reset;
+ }
+ return;
+
+reset:
+ priv->port_stats.tx_timeout++;
+ en_dbg(DRV, priv, "Watchdog task called for port %d\n", priv->port);
+
+ mutex_lock(&mdev->state_lock);
+ if (priv->port_up) {
+ mlx4_en_stop_port(dev);
+ if (mlx4_en_start_port(dev))
+ en_err(priv, "Failed restarting port %d\n", priv->port);
+ }
+ mutex_unlock(&mdev->state_lock);
+}
+
+
+static void
+mlx4_en_init(void *arg)
+{
+ struct mlx4_en_priv *priv;
+ struct mlx4_en_dev *mdev;
+ struct ifnet *dev;
+ int i;
+
+ priv = arg;
+ dev = priv->dev;
+ mdev = priv->mdev;
+ mutex_lock(&mdev->state_lock);
+ if (dev->if_drv_flags & IFF_DRV_RUNNING)
+ mlx4_en_stop_port(dev);
+
+ if (!mdev->device_up) {
+ en_err(priv, "Cannot open - device down/disabled\n");
+ goto out;
+ }
+
+ /* Reset HW statistics and performance counters */
+ if (mlx4_en_DUMP_ETH_STATS(mdev, priv->port, 1))
+ en_dbg(HW, priv, "Failed dumping statistics\n");
+
+ memset(&priv->pstats, 0, sizeof(priv->pstats));
+
+ for (i = 0; i < priv->tx_ring_num; i++) {
+ priv->tx_ring[i].bytes = 0;
+ priv->tx_ring[i].packets = 0;
+ }
+ for (i = 0; i < priv->rx_ring_num; i++) {
+ priv->rx_ring[i].bytes = 0;
+ priv->rx_ring[i].packets = 0;
+ }
+
+ mlx4_en_set_default_moderation(priv);
+ if (mlx4_en_start_port(dev))
+ en_err(priv, "Failed starting port:%d\n", priv->port);
+
+out:
+ mutex_unlock(&mdev->state_lock);
+}
+
+void mlx4_en_free_resources(struct mlx4_en_priv *priv)
+{
+ int i;
+
+ for (i = 0; i < priv->tx_ring_num; i++) {
+ if (priv->tx_ring[i].tx_info)
+ mlx4_en_destroy_tx_ring(priv, &priv->tx_ring[i]);
+ if (priv->tx_cq[i].buf)
+ mlx4_en_destroy_cq(priv, &priv->tx_cq[i]);
+ }
+
+ for (i = 0; i < priv->rx_ring_num; i++) {
+ if (priv->rx_ring[i].rx_info)
+ mlx4_en_destroy_rx_ring(priv, &priv->rx_ring[i]);
+ if (priv->rx_cq[i].buf)
+ mlx4_en_destroy_cq(priv, &priv->rx_cq[i]);
+ }
+ /* Free the stats tree when we resize the rings. */
+ if (priv->sysctl)
+ sysctl_ctx_free(&priv->stat_ctx);
+
+}
+
+int mlx4_en_alloc_resources(struct mlx4_en_priv *priv)
+{
+ struct mlx4_en_port_profile *prof = priv->prof;
+ int i;
+
+ /* Create tx Rings */
+ for (i = 0; i < priv->tx_ring_num; i++) {
+ if (mlx4_en_create_cq(priv, &priv->tx_cq[i],
+ prof->tx_ring_size, i, TX))
+ goto err;
+
+ if (mlx4_en_create_tx_ring(priv, &priv->tx_ring[i],
+ prof->tx_ring_size, TXBB_SIZE))
+ goto err;
+ }
+
+ /* Create rx Rings */
+ for (i = 0; i < priv->rx_ring_num; i++) {
+ if (mlx4_en_create_cq(priv, &priv->rx_cq[i],
+ prof->rx_ring_size, i, RX))
+ goto err;
+
+ if (i > priv->rx_ring_num - priv->udp_rings - 1)
+ priv->rx_ring[i].use_frags = 0;
+ else
+ priv->rx_ring[i].use_frags = 1;
+ if (mlx4_en_create_rx_ring(priv, &priv->rx_ring[i],
+ prof->rx_ring_size))
+ goto err;
+ }
+
+ /* Re-create stat sysctls in case the number of rings changed. */
+ mlx4_en_sysctl_stat(priv);
+
+ /* Populate Tx priority mappings */
+ mlx4_en_set_prio_map(priv, priv->tx_prio_map,
+ prof->tx_ring_num - MLX4_EN_NUM_HASH_RINGS);
+
+ return 0;
+
+err:
+ en_err(priv, "Failed to allocate NIC resources\n");
+ return -ENOMEM;
+}
+
+
+void mlx4_en_destroy_netdev(struct net_device *dev)
+{
+ struct mlx4_en_priv *priv = netdev_priv(dev);
+ struct mlx4_en_dev *mdev = priv->mdev;
+
+ en_dbg(DRV, priv, "Destroying netdev on port:%d\n", priv->port);
+
+ if (priv->vlan_attach != NULL)
+ EVENTHANDLER_DEREGISTER(vlan_config, priv->vlan_attach);
+ if (priv->vlan_detach != NULL)
+ EVENTHANDLER_DEREGISTER(vlan_unconfig, priv->vlan_detach);
+
+ /* Unregister device - this will close the port if it was up */
+ if (priv->registered)
+ ether_ifdetach(dev);
+
+ if (priv->allocated)
+ mlx4_free_hwq_res(mdev->dev, &priv->res, MLX4_EN_PAGE_SIZE);
+
+ if (priv->sysctl)
+ sysctl_ctx_free(&priv->conf_ctx);
+
+ cancel_delayed_work(&priv->stats_task);
+ /* flush any pending task for this netdev */
+ flush_workqueue(mdev->workqueue);
+
+ /* Detach the netdev so tasks would not attempt to access it */
+ mutex_lock(&mdev->state_lock);
+ mdev->pndev[priv->port] = NULL;
+ mutex_unlock(&mdev->state_lock);
+
+ mlx4_en_free_resources(priv);
+ mtx_destroy(&priv->stats_lock.m);
+ mtx_destroy(&priv->vlan_lock.m);
+ kfree(priv);
+ if_free(dev);
+}
+
+static int mlx4_en_change_mtu(struct net_device *dev, int new_mtu)
+{
+ struct mlx4_en_priv *priv = netdev_priv(dev);
+ struct mlx4_en_dev *mdev = priv->mdev;
+ int err = 0;
+
+ en_dbg(DRV, priv, "Change MTU called - current:%ld new:%d\n",
+ dev->if_mtu, new_mtu);
+
+ if ((new_mtu < MLX4_EN_MIN_MTU) || (new_mtu > priv->max_mtu)) {
+ en_err(priv, "Bad MTU size:%d.\n", new_mtu);
+ return -EPERM;
+ }
+ mutex_lock(&mdev->state_lock);
+ dev->if_mtu = new_mtu;
+ if (dev->if_drv_flags & IFF_DRV_RUNNING) {
+ if (!mdev->device_up) {
+ /* NIC is probably restarting - let watchdog task reset
+ * the port */
+ en_dbg(DRV, priv, "Change MTU called with card down!?\n");
+ } else {
+ mlx4_en_stop_port(dev);
+ mlx4_en_set_default_moderation(priv);
+ err = mlx4_en_start_port(dev);
+ if (err) {
+ en_err(priv, "Failed restarting port:%d\n",
+ priv->port);
+ queue_work(mdev->workqueue, &priv->watchdog_task);
+ }
+ }
+ }
+ mutex_unlock(&mdev->state_lock);
+ return 0;
+}
+
+static int mlx4_en_calc_media(struct mlx4_en_priv *priv)
+{
+ int trans_type;
+ int active;
+
+ active = IFM_ETHER;
+ if (priv->last_link_state == MLX4_DEV_EVENT_PORT_DOWN)
+ return (active);
+ if (mlx4_en_QUERY_PORT(priv->mdev, priv->port))
+ return (active);
+ active |= IFM_FDX;
+ trans_type = priv->port_state.transciver;
+ /* XXX I don't know all of the transceiver values. */
+ if (priv->port_state.link_speed == 1000)
+ active |= IFM_1000_T;
+ else if (trans_type > 0 && trans_type <= 0xC)
+ active |= IFM_10G_SR;
+ else if (trans_type == 0x80 || trans_type == 0)
+ active |= IFM_10G_CX4;
+ if (priv->prof->tx_pause)
+ active |= IFM_ETH_TXPAUSE;
+ if (priv->prof->rx_pause)
+ active |= IFM_ETH_RXPAUSE;
+
+ return (active);
+}
+
+
+static void mlx4_en_media_status(struct ifnet *dev, struct ifmediareq *ifmr)
+{
+ struct mlx4_en_priv *priv;
+
+ priv = dev->if_softc;
+ ifmr->ifm_status = IFM_AVALID;
+ if (priv->last_link_state != MLX4_DEV_EVENT_PORT_DOWN)
+ ifmr->ifm_status |= IFM_ACTIVE;
+ ifmr->ifm_active = mlx4_en_calc_media(priv);
+
+ return;
+}
+
+static int mlx4_en_media_change(struct ifnet *dev)
+{
+ struct mlx4_en_priv *priv;
+ struct ifmedia *ifm;
+ int rxpause;
+ int txpause;
+ int error;
+
+ priv = dev->if_softc;
+ ifm = &priv->media;
+ rxpause = txpause = 0;
+ error = 0;
+
+ if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER)
+ return (EINVAL);
+ switch (IFM_SUBTYPE(ifm->ifm_media)) {
+ case IFM_AUTO:
+ break;
+ case IFM_10G_SR:
+ case IFM_10G_CX4:
+ case IFM_1000_T:
+ if (IFM_SUBTYPE(ifm->ifm_media) ==
+ IFM_SUBTYPE(mlx4_en_calc_media(priv)) &&
+ (ifm->ifm_media & IFM_FDX))
+ break;
+ /* Fallthrough */
+ default:
+ printf("%s: Only auto media type\n", if_name(dev));
+ return (EINVAL);
+ }
+ /* Allow user to set/clear pause */
+ if (IFM_OPTIONS(ifm->ifm_media) & IFM_ETH_RXPAUSE)
+ rxpause = 1;
+ if (IFM_OPTIONS(ifm->ifm_media) & IFM_ETH_TXPAUSE)
+ txpause = 1;
+ if (priv->prof->tx_pause != txpause || priv->prof->rx_pause != rxpause) {
+ priv->prof->tx_pause = txpause;
+ priv->prof->rx_pause = rxpause;
+ error = -mlx4_SET_PORT_general(priv->mdev->dev, priv->port,
+ priv->rx_mb_size + ETHER_CRC_LEN, priv->prof->tx_pause,
+ priv->prof->tx_ppp, priv->prof->rx_pause,
+ priv->prof->rx_ppp);
+ }
+ return (error);
+}
+
+static int mlx4_en_ioctl(struct ifnet *dev, u_long command, caddr_t data)
+{
+ struct mlx4_en_priv *priv;
+ struct mlx4_en_dev *mdev;
+ struct ifreq *ifr;
+ int error;
+ int mask;
+
+ error = 0;
+ mask = 0;
+ priv = dev->if_softc;
+ mdev = priv->mdev;
+ ifr = (struct ifreq *) data;
+ switch (command) {
+ case SIOCSIFMTU:
+ error = -mlx4_en_change_mtu(dev, ifr->ifr_mtu);
+ break;
+ case SIOCSIFFLAGS:
+ if (dev->if_flags & IFF_UP) {
+ if ((dev->if_drv_flags & IFF_DRV_RUNNING) == 0) {
+ mutex_lock(&mdev->state_lock);
+ mlx4_en_start_port(dev);
+ mutex_unlock(&mdev->state_lock);
+ } else
+ mlx4_en_set_multicast(dev);
+ } else {
+ mutex_lock(&mdev->state_lock);
+ if (dev->if_drv_flags & IFF_DRV_RUNNING) {
+ mlx4_en_stop_port(dev);
+ if_link_state_change(dev, LINK_STATE_DOWN);
+ }
+ mutex_unlock(&mdev->state_lock);
+ }
+ break;
+ case SIOCADDMULTI:
+ case SIOCDELMULTI:
+ mlx4_en_set_multicast(dev);
+ break;
+ case SIOCSIFMEDIA:
+ case SIOCGIFMEDIA:
+ error = ifmedia_ioctl(dev, ifr, &priv->media, command);
+ break;
+ case SIOCSIFCAP:
+ mask = ifr->ifr_reqcap ^ dev->if_capenable;
+ if (mask & IFCAP_HWCSUM)
+ dev->if_capenable ^= IFCAP_HWCSUM;
+ if (mask & IFCAP_TSO4)
+ dev->if_capenable ^= IFCAP_TSO4;
+ if (mask & IFCAP_LRO)
+ dev->if_capenable ^= IFCAP_LRO;
+ if (mask & IFCAP_VLAN_HWTAGGING)
+ dev->if_capenable ^= IFCAP_VLAN_HWTAGGING;
+ if (mask & IFCAP_VLAN_HWFILTER)
+ dev->if_capenable ^= IFCAP_VLAN_HWFILTER;
+ if (dev->if_drv_flags & IFF_DRV_RUNNING)
+ mlx4_en_init(priv);
+ VLAN_CAPABILITIES(dev);
+ break;
+ default:
+ error = ether_ioctl(dev, command, data);
+ break;
+ }
+
+ return (error);
+}
+
+static int mlx4_en_set_ring_size(struct net_device *dev,
+ int rx_size, int tx_size)
+{
+ struct mlx4_en_priv *priv = netdev_priv(dev);
+ struct mlx4_en_dev *mdev = priv->mdev;
+ int port_up = 0;
+ int err = 0;
+
+ rx_size = roundup_pow_of_two(rx_size);
+ rx_size = max_t(u32, rx_size, MLX4_EN_MIN_RX_SIZE);
+ rx_size = min_t(u32, rx_size, MLX4_EN_MAX_RX_SIZE);
+ tx_size = roundup_pow_of_two(tx_size);
+ tx_size = max_t(u32, tx_size, MLX4_EN_MIN_TX_SIZE);
+ tx_size = min_t(u32, tx_size, MLX4_EN_MAX_TX_SIZE);
+
+ if (rx_size == (priv->port_up ?
+ priv->rx_ring[0].actual_size : priv->rx_ring[0].size) &&
+ tx_size == priv->tx_ring[0].size)
+ return 0;
+
+ mutex_lock(&mdev->state_lock);
+ if (priv->port_up) {
+ port_up = 1;
+ mlx4_en_stop_port(dev);
+ }
+ mlx4_en_free_resources(priv);
+ priv->prof->tx_ring_size = tx_size;
+ priv->prof->rx_ring_size = rx_size;
+ err = mlx4_en_alloc_resources(priv);
+ if (err) {
+ en_err(priv, "Failed reallocating port resources\n");
+ goto out;
+ }
+ if (port_up) {
+ err = mlx4_en_start_port(dev);
+ if (err)
+ en_err(priv, "Failed starting port\n");
+ }
+out:
+ mutex_unlock(&mdev->state_lock);
+ return err;
+}
+
+static int mlx4_en_set_rx_ring_size(SYSCTL_HANDLER_ARGS)
+{
+ struct mlx4_en_priv *priv;
+ int size;
+ int error;
+
+ priv = arg1;
+ size = priv->prof->rx_ring_size;
+ error = sysctl_handle_int(oidp, &size, 0, req);
+ if (error || !req->newptr)
+ return (error);
+ error = -mlx4_en_set_ring_size(priv->dev, size,
+ priv->prof->tx_ring_size);
+
+ return (error);
+}
+
+static int mlx4_en_set_tx_ring_size(SYSCTL_HANDLER_ARGS)
+{
+ struct mlx4_en_priv *priv;
+ int size;
+ int error;
+
+ priv = arg1;
+ size = priv->prof->tx_ring_size;
+ error = sysctl_handle_int(oidp, &size, 0, req);
+ if (error || !req->newptr)
+ return (error);
+ error = -mlx4_en_set_ring_size(priv->dev, priv->prof->rx_ring_size,
+ size);
+
+ return (error);
+}
+
+static void mlx4_en_sysctl_conf(struct mlx4_en_priv *priv)
+{
+ struct net_device *dev;
+ struct sysctl_ctx_list *ctx;
+ struct sysctl_oid *node;
+ struct sysctl_oid_list *node_list;
+ struct sysctl_oid *coal;
+ struct sysctl_oid_list *coal_list;
+
+ dev = priv->dev;
+ ctx = &priv->conf_ctx;
+
+ sysctl_ctx_init(ctx);
+ priv->sysctl = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_hw),
+ OID_AUTO, dev->if_xname, CTLFLAG_RD, 0, "mlx4 10gig ethernet");
+ node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(priv->sysctl), OID_AUTO,
+ "conf", CTLFLAG_RD, NULL, "Configuration");
+ node_list = SYSCTL_CHILDREN(node);
+
+ SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "msg_enable",
+ CTLFLAG_RW, &priv->msg_enable, 0,
+ "Driver message enable bitfield");
+ SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "rx_rings",
+ CTLTYPE_INT | CTLFLAG_RD, &priv->rx_ring_num, 0,
+ "Number of receive rings");
+ SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "tx_rings",
+ CTLTYPE_INT | CTLFLAG_RD, &priv->tx_ring_num, 0,
+ "Number of transmit rings");
+ SYSCTL_ADD_PROC(ctx, node_list, OID_AUTO, "rx_size",
+ CTLTYPE_INT | CTLFLAG_RW, priv, 0, mlx4_en_set_rx_ring_size, "I",
+ "Receive ring size");
+ SYSCTL_ADD_PROC(ctx, node_list, OID_AUTO, "tx_size",
+ CTLTYPE_INT | CTLFLAG_RW, priv, 0, mlx4_en_set_tx_ring_size, "I",
+ "Transmit ring size");
+ SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "ip_reasm",
+ CTLFLAG_RD, &priv->mdev->profile.ip_reasm, 0,
+ "Allow reassembly of IP fragments.");
+
+ /* Add coalescer configuration. */
+ coal = SYSCTL_ADD_NODE(ctx, node_list, OID_AUTO,
+ "coalesce", CTLFLAG_RD, NULL, "Interrupt coalesce configuration");
+ coal_list = SYSCTL_CHILDREN(node);
+ SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "pkt_rate_low",
+ CTLFLAG_RW, &priv->pkt_rate_low, 0,
+ "Packets per-second for minimum delay");
+ SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "rx_usecs_low",
+ CTLFLAG_RW, &priv->rx_usecs_low, 0,
+ "Minimum RX delay in micro-seconds");
+ SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "pkt_rate_high",
+ CTLFLAG_RW, &priv->pkt_rate_high, 0,
+ "Packets per-second for maximum delay");
+ SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "rx_usecs_high",
+ CTLFLAG_RW, &priv->rx_usecs_high, 0,
+ "Maximum RX delay in micro-seconds");
+ SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "sample_interval",
+ CTLFLAG_RW, &priv->sample_interval, 0,
+ "adaptive frequency in units of HZ ticks");
+ SYSCTL_ADD_UINT(ctx, coal_list, OID_AUTO, "adaptive_rx_coal",
+ CTLFLAG_RW, &priv->adaptive_rx_coal, 0,
+ "Enable adaptive rx coalescing");
+}
+
+static void mlx4_en_sysctl_stat(struct mlx4_en_priv *priv)
+{
+ struct net_device *dev;
+ struct sysctl_ctx_list *ctx;
+ struct sysctl_oid *node;
+ struct sysctl_oid_list *node_list;
+ struct sysctl_oid *ring_node;
+ struct sysctl_oid_list *ring_list;
+ struct mlx4_en_tx_ring *tx_ring;
+ struct mlx4_en_rx_ring *rx_ring;
+ char namebuf[128];
+ int i;
+
+ dev = priv->dev;
+
+ ctx = &priv->stat_ctx;
+ sysctl_ctx_init(ctx);
+ node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(priv->sysctl), OID_AUTO,
+ "stat", CTLFLAG_RD, NULL, "Statistics");
+ node_list = SYSCTL_CHILDREN(node);
+
+#ifdef MLX4_EN_PERF_STAT
+ SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "tx_poll", CTLFLAG_RD,
+ &priv->pstats.tx_poll, "TX Poll calls");
+ SYSCTL_ADD_QUAD(ctx, node_list, OID_AUTO, "tx_pktsz_avg", CTLFLAG_RD,
+ &priv->pstats.tx_pktsz_avg, "TX average packet size");
+ SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "inflight_avg", CTLFLAG_RD,
+ &priv->pstats.inflight_avg, "TX average packets in-flight");
+ SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "tx_coal_avg", CTLFLAG_RD,
+ &priv->pstats.tx_coal_avg, "TX average coalesced completions");
+ SYSCTL_ADD_UINT(ctx, node_list, OID_AUTO, "rx_coal_avg", CTLFLAG_RD,
+ &priv->pstats.rx_coal_avg, "RX average coalesced completions");
+#endif
+
+ SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tso_packets", CTLFLAG_RD,
+ &priv->port_stats.tso_packets, "TSO packets sent");
+ SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "queue_stopped", CTLFLAG_RD,
+ &priv->port_stats.queue_stopped, "Queue full");
+ SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "wake_queue", CTLFLAG_RD,
+ &priv->port_stats.wake_queue, "Queue resumed after full");
+ SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_timeout", CTLFLAG_RD,
+ &priv->port_stats.tx_timeout, "Transmit timeouts");
+ SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_alloc_failed", CTLFLAG_RD,
+ &priv->port_stats.rx_alloc_failed, "RX failed to allocate mbuf");
+ SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_chksum_good", CTLFLAG_RD,
+ &priv->port_stats.rx_chksum_good, "RX checksum offload success");
+ SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_chksum_none", CTLFLAG_RD,
+ &priv->port_stats.rx_chksum_none, "RX without checksum offload");
+ SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_chksum_offload",
+ CTLFLAG_RD, &priv->port_stats.tx_chksum_offload,
+ "TX checksum offloads");
+
+ /* Could strdup the names and add in a loop. This is simpler. */
+ SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "broadcast", CTLFLAG_RD,
+ &priv->pkstats.broadcast, "Broadcast packets");
+ SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_prio0", CTLFLAG_RD,
+ &priv->pkstats.tx_prio[0], "TX Priority 0 packets");
+ SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_prio1", CTLFLAG_RD,
+ &priv->pkstats.tx_prio[1], "TX Priority 1 packets");
+ SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_prio2", CTLFLAG_RD,
+ &priv->pkstats.tx_prio[2], "TX Priority 2 packets");
+ SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_prio3", CTLFLAG_RD,
+ &priv->pkstats.tx_prio[3], "TX Priority 3 packets");
+ SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_prio4", CTLFLAG_RD,
+ &priv->pkstats.tx_prio[4], "TX Priority 4 packets");
+ SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_prio5", CTLFLAG_RD,
+ &priv->pkstats.tx_prio[5], "TX Priority 5 packets");
+ SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_prio6", CTLFLAG_RD,
+ &priv->pkstats.tx_prio[6], "TX Priority 6 packets");
+ SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "tx_prio7", CTLFLAG_RD,
+ &priv->pkstats.tx_prio[7], "TX Priority 7 packets");
+ SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_prio0", CTLFLAG_RD,
+ &priv->pkstats.rx_prio[0], "RX Priority 0 packets");
+ SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_prio1", CTLFLAG_RD,
+ &priv->pkstats.rx_prio[1], "RX Priority 1 packets");
+ SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_prio2", CTLFLAG_RD,
+ &priv->pkstats.rx_prio[2], "RX Priority 2 packets");
+ SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_prio3", CTLFLAG_RD,
+ &priv->pkstats.rx_prio[3], "RX Priority 3 packets");
+ SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_prio4", CTLFLAG_RD,
+ &priv->pkstats.rx_prio[4], "RX Priority 4 packets");
+ SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_prio5", CTLFLAG_RD,
+ &priv->pkstats.rx_prio[5], "RX Priority 5 packets");
+ SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_prio6", CTLFLAG_RD,
+ &priv->pkstats.rx_prio[6], "RX Priority 6 packets");
+ SYSCTL_ADD_ULONG(ctx, node_list, OID_AUTO, "rx_prio7", CTLFLAG_RD,
+ &priv->pkstats.rx_prio[7], "RX Priority 7 packets");
+
+ for (i = 0; i < priv->tx_ring_num; i++) {
+ tx_ring = &priv->tx_ring[i];
+ snprintf(namebuf, sizeof(namebuf), "tx_ring%d", i);
+ ring_node = SYSCTL_ADD_NODE(ctx, node_list, OID_AUTO, namebuf,
+ CTLFLAG_RD, NULL, "TX Ring");
+ ring_list = SYSCTL_CHILDREN(ring_node);
+ SYSCTL_ADD_ULONG(ctx, ring_list, OID_AUTO, "packets",
+ CTLFLAG_RD, &tx_ring->packets, "TX packets");
+ SYSCTL_ADD_ULONG(ctx, ring_list, OID_AUTO, "bytes",
+ CTLFLAG_RD, &tx_ring->bytes, "TX bytes");
+ SYSCTL_ADD_ULONG(ctx, ring_list, OID_AUTO, "error",
+ CTLFLAG_RD, &tx_ring->errors, "TX soft errors");
+
+ }
+ for (i = 0; i < priv->rx_ring_num; i++) {
+ rx_ring = &priv->rx_ring[i];
+ snprintf(namebuf, sizeof(namebuf), "rx_ring%d", i);
+ ring_node = SYSCTL_ADD_NODE(ctx, node_list, OID_AUTO, namebuf,
+ CTLFLAG_RD, NULL, "RX Ring");
+ ring_list = SYSCTL_CHILDREN(ring_node);
+ SYSCTL_ADD_ULONG(ctx, ring_list, OID_AUTO, "packets",
+ CTLFLAG_RD, &rx_ring->packets, "RX packets");
+ SYSCTL_ADD_ULONG(ctx, ring_list, OID_AUTO, "bytes",
+ CTLFLAG_RD, &rx_ring->bytes, "RX bytes");
+ SYSCTL_ADD_ULONG(ctx, ring_list, OID_AUTO, "error",
+ CTLFLAG_RD, &rx_ring->errors, "RX soft errors");
+ SYSCTL_ADD_UINT(ctx, ring_list, OID_AUTO, "lro_queued",
+ CTLFLAG_RD, &rx_ring->lro.lro_queued, 0, "LRO Queued");
+ SYSCTL_ADD_UINT(ctx, ring_list, OID_AUTO, "lro_flushed",
+ CTLFLAG_RD, &rx_ring->lro.lro_flushed, 0, "LRO Flushed");
+ }
+}
+
+int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
+ struct mlx4_en_port_profile *prof)
+{
+ static volatile int mlx4_en_unit;
+ struct net_device *dev;
+ struct mlx4_en_priv *priv;
+ uint8_t dev_addr[ETHER_ADDR_LEN];
+ int err;
+ int i;
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ dev = priv->dev = if_alloc(IFT_ETHER);
+ if (dev == NULL) {
+ mlx4_err(mdev, "Net device allocation failed\n");
+ kfree(priv);
+ return -ENOMEM;
+ }
+ dev->if_softc = priv;
+ if_initname(dev, "mlxen", atomic_fetchadd_int(&mlx4_en_unit, 1));
+ dev->if_mtu = ETHERMTU;
+ dev->if_baudrate = 1000000000;
+ dev->if_init = mlx4_en_init;
+ dev->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
+ dev->if_ioctl = mlx4_en_ioctl;
+ dev->if_transmit = mlx4_en_transmit;
+ dev->if_qflush = mlx4_en_qflush;
+ dev->if_snd.ifq_maxlen = prof->tx_ring_size;
+
+ /*
+ * Initialize driver private data
+ */
+ priv->dev = dev;
+ priv->mdev = mdev;
+ priv->prof = prof;
+ priv->port = port;
+ priv->port_up = false;
+ priv->rx_csum = 1;
+ priv->flags = prof->flags;
+ priv->tx_ring_num = prof->tx_ring_num;
+ priv->rx_ring_num = prof->rx_ring_num;
+ priv->udp_rings = mdev->profile.udp_rss ? prof->rx_ring_num / 2 : 1;
+ priv->mac_index = -1;
+ priv->msg_enable = MLX4_EN_MSG_LEVEL;
+ mtx_init(&priv->stats_lock.m, "mlx4 stats", NULL, MTX_DEF);
+ mtx_init(&priv->vlan_lock.m, "mlx4 vlan", NULL, MTX_DEF);
+ INIT_WORK(&priv->mcast_task, mlx4_en_do_set_multicast);
+ INIT_WORK(&priv->watchdog_task, mlx4_en_restart);
+ INIT_WORK(&priv->linkstate_task, mlx4_en_linkstate);
+ INIT_DELAYED_WORK(&priv->stats_task, mlx4_en_do_get_stats);
+ callout_init(&priv->watchdog_timer, 1);
+
+ /* Query for default mac and max mtu */
+ priv->max_mtu = mdev->dev->caps.eth_mtu_cap[priv->port];
+ priv->mac = mdev->dev->caps.def_mac[priv->port];
+
+ if (ILLEGAL_MAC(priv->mac)) {
+ en_err(priv, "Port: %d, invalid mac burned: 0x%llx, quiting\n",
+ priv->port, priv->mac);
+ err = -EINVAL;
+ goto out;
+ }
+
+ mlx4_en_sysctl_conf(priv);
+
+ err = mlx4_en_alloc_resources(priv);
+ if (err)
+ goto out;
+
+ /* Allocate page for receive rings */
+ err = mlx4_alloc_hwq_res(mdev->dev, &priv->res,
+ MLX4_EN_PAGE_SIZE, MLX4_EN_PAGE_SIZE);
+ if (err) {
+ en_err(priv, "Failed to allocate page for rx qps\n");
+ goto out;
+ }
+ priv->allocated = 1;
+
+ /*
+ * Set driver features
+ */
+ dev->if_capabilities |= IFCAP_RXCSUM | IFCAP_TXCSUM;
+ dev->if_capabilities |= IFCAP_VLAN_MTU | IFCAP_VLAN_HWTAGGING;
+ dev->if_capabilities |= IFCAP_VLAN_HWCSUM | IFCAP_VLAN_HWFILTER;
+ dev->if_capabilities |= IFCAP_LINKSTATE | IFCAP_JUMBO_MTU;
+#if 0 /* Not yet */
+ dev->if_capabilities |= IFCAP_WOL;
+#endif
+ if (mdev->LSO_support)
+ dev->if_capabilities |= IFCAP_TSO | IFCAP_VLAN_HWTSO;
+
+ /* Don't enable LOR unless the user requests. */
+ dev->if_capenable = dev->if_capabilities;
+
+ if (mdev->profile.num_lro)
+ dev->if_capabilities |= IFCAP_LRO;
+
+ /* Register for VLAN events */
+ priv->vlan_attach = EVENTHANDLER_REGISTER(vlan_config,
+ mlx4_en_vlan_rx_add_vid, priv, EVENTHANDLER_PRI_FIRST);
+ priv->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig,
+ mlx4_en_vlan_rx_kill_vid, priv, EVENTHANDLER_PRI_FIRST);
+
+ mdev->pndev[priv->port] = dev;
+
+ priv->last_link_state = MLX4_DEV_EVENT_PORT_DOWN;
+ if_link_state_change(dev, LINK_STATE_DOWN);
+
+ /* Set default MAC */
+ for (i = 0; i < ETHER_ADDR_LEN; i++)
+ dev_addr[ETHER_ADDR_LEN - 1 - i] = (u8) (priv->mac >> (8 * i));
+
+ ether_ifattach(dev, dev_addr);
+ ifmedia_init(&priv->media, IFM_IMASK | IFM_ETH_FMASK,
+ mlx4_en_media_change, mlx4_en_media_status);
+ ifmedia_add(&priv->media, IFM_ETHER | IFM_FDX | IFM_1000_T, 0, NULL);
+ ifmedia_add(&priv->media, IFM_ETHER | IFM_FDX | IFM_10G_SR, 0, NULL);
+ ifmedia_add(&priv->media, IFM_ETHER | IFM_FDX | IFM_10G_CX4, 0, NULL);
+ ifmedia_add(&priv->media, IFM_ETHER | IFM_AUTO, 0, NULL);
+ ifmedia_set(&priv->media, IFM_ETHER | IFM_AUTO);
+
+ en_warn(priv, "Using %d TX rings\n", prof->tx_ring_num);
+ en_warn(priv, "Using %d RX rings\n", prof->rx_ring_num);
+
+ priv->registered = 1;
+ queue_delayed_work(mdev->workqueue, &priv->stats_task, STATS_DELAY);
+
+ return 0;
+
+out:
+ mlx4_en_destroy_netdev(dev);
+ return err;
+}
+
diff --git a/sys/ofed/drivers/net/mlx4/en_params.c b/sys/ofed/drivers/net/mlx4/en_params.c
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/en_params.c
diff --git a/sys/ofed/drivers/net/mlx4/en_port.c b/sys/ofed/drivers/net/mlx4/en_port.c
new file mode 100644
index 0000000..36a53d3
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/en_port.c
@@ -0,0 +1,314 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+
+#include "mlx4_en.h"
+
+#include <linux/if_vlan.h>
+
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/cmd.h>
+
+
+int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port,
+ u64 mac, u64 clear, u8 mode)
+{
+ return mlx4_cmd(dev, (mac | (clear << 63)), port, mode,
+ MLX4_CMD_SET_MCAST_FLTR, MLX4_CMD_TIME_CLASS_B);
+}
+
+int mlx4_SET_VLAN_FLTR(struct mlx4_dev *dev, u8 port, u32 *vlans)
+{
+ struct mlx4_cmd_mailbox *mailbox;
+ struct mlx4_set_vlan_fltr_mbox *filter;
+ int i;
+ int err = 0;
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+
+ filter = mailbox->buf;
+ memset(filter, 0, sizeof *filter);
+ if (vlans)
+ for (i = 0; i < VLAN_FLTR_SIZE; i ++)
+ filter->entry[i] = cpu_to_be32(vlans[i]);
+ err = mlx4_cmd(dev, mailbox->dma, port, 0, MLX4_CMD_SET_VLAN_FLTR,
+ MLX4_CMD_TIME_CLASS_B);
+ mlx4_free_cmd_mailbox(dev, mailbox);
+ return err;
+}
+
+
+int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu,
+ u8 pptx, u8 pfctx, u8 pprx, u8 pfcrx)
+{
+ struct mlx4_cmd_mailbox *mailbox;
+ struct mlx4_set_port_general_context *context;
+ int err;
+ u32 in_mod;
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ context = mailbox->buf;
+ memset(context, 0, sizeof *context);
+
+ context->flags = SET_PORT_GEN_ALL_VALID;
+ context->mtu = cpu_to_be16(mtu);
+ context->pptx = (pptx * (!pfctx)) << 7;
+ context->pfctx = pfctx;
+ context->pprx = (pprx * (!pfcrx)) << 7;
+ context->pfcrx = pfcrx;
+
+ in_mod = MLX4_SET_PORT_GENERAL << 8 | port;
+ err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT,
+ MLX4_CMD_TIME_CLASS_B);
+
+ mlx4_free_cmd_mailbox(dev, mailbox);
+ return err;
+}
+
+int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn,
+ u8 promisc)
+{
+ struct mlx4_cmd_mailbox *mailbox;
+ struct mlx4_set_port_rqp_calc_context *context;
+ int err;
+ u32 in_mod;
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ context = mailbox->buf;
+ memset(context, 0, sizeof *context);
+
+ context->base_qpn = cpu_to_be32(base_qpn);
+ context->promisc = cpu_to_be32(promisc << SET_PORT_PROMISC_EN_SHIFT | base_qpn);
+ context->mcast = cpu_to_be32((dev->caps.mc_promisc_mode <<
+ SET_PORT_PROMISC_MODE_SHIFT) | base_qpn);
+ context->intra_no_vlan = 0;
+ context->no_vlan = MLX4_NO_VLAN_IDX;
+ context->intra_vlan_miss = 0;
+ context->vlan_miss = MLX4_VLAN_MISS_IDX;
+
+ in_mod = MLX4_SET_PORT_RQP_CALC << 8 | port;
+ err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT,
+ MLX4_CMD_TIME_CLASS_B);
+
+ mlx4_free_cmd_mailbox(dev, mailbox);
+ return err;
+}
+
+int mlx4_en_QUERY_PORT(struct mlx4_en_dev *mdev, u8 port)
+{
+ struct mlx4_en_query_port_context *qport_context;
+ struct mlx4_en_priv *priv = netdev_priv(mdev->pndev[port]);
+ struct mlx4_en_port_state *state = &priv->port_state;
+ struct mlx4_cmd_mailbox *mailbox;
+ int err;
+
+ mailbox = mlx4_alloc_cmd_mailbox(mdev->dev);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ memset(mailbox->buf, 0, sizeof(*qport_context));
+ err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, port, 0,
+ MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B);
+ if (err)
+ goto out;
+ qport_context = mailbox->buf;
+
+ /* This command is always accessed from Ethtool context
+ * already synchronized, no need in locking */
+ state->link_state = !!(qport_context->link_up & MLX4_EN_LINK_UP_MASK);
+ if ((qport_context->link_speed & MLX4_EN_SPEED_MASK) ==
+ MLX4_EN_1G_SPEED)
+ state->link_speed = 1000;
+ else
+ state->link_speed = 10000;
+ state->transciver = qport_context->transceiver;
+ if (be32_to_cpu(qport_context->transceiver_code_hi) & 0x400)
+ state->transciver = 0x80;
+
+out:
+ mlx4_free_cmd_mailbox(mdev->dev, mailbox);
+ return err;
+}
+
+static int read_iboe_counters(struct mlx4_dev *dev, int index, u64 counters[])
+{
+ struct mlx4_cmd_mailbox *mailbox;
+ int err;
+ int mode;
+ struct mlx4_counters_ext *ext;
+ struct mlx4_counters *reg;
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev);
+ if (IS_ERR(mailbox))
+ return -ENOMEM;
+
+ err = mlx4_cmd_box(dev, 0, mailbox->dma, index, 0,
+ MLX4_CMD_QUERY_IF_STAT, MLX4_CMD_TIME_CLASS_C);
+ if (err)
+ goto out;
+
+ mode = be32_to_cpu(((struct mlx4_counters *)mailbox->buf)->counter_mode) & 0xf;
+ switch (mode) {
+ case 0:
+ reg = mailbox->buf;
+ counters[0] = be64_to_cpu(reg->rx_frames);
+ counters[1] = be64_to_cpu(reg->tx_frames);
+ counters[2] = be64_to_cpu(reg->rx_bytes);
+ counters[3] = be64_to_cpu(reg->tx_bytes);
+ break;
+ case 1:
+ ext = mailbox->buf;
+ counters[0] = be64_to_cpu(ext->rx_uni_frames);
+ counters[1] = be64_to_cpu(ext->tx_uni_frames);
+ counters[2] = be64_to_cpu(ext->rx_uni_bytes);
+ counters[3] = be64_to_cpu(ext->tx_uni_bytes);
+ break;
+ default:
+ err = -EINVAL;
+ }
+
+out:
+ mlx4_free_cmd_mailbox(dev, mailbox);
+ return err;
+}
+
+int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset)
+{
+ struct mlx4_en_stat_out_mbox *mlx4_en_stats;
+ struct net_device *dev;
+ struct mlx4_en_priv *priv;
+ struct mlx4_cmd_mailbox *mailbox;
+ u64 in_mod = reset << 8 | port;
+ unsigned long oerror;
+ unsigned long ierror;
+ int err;
+ int i;
+ int counter;
+ u64 counters[4];
+
+ dev = mdev->pndev[port];
+ priv = netdev_priv(dev);
+ memset(counters, 0, sizeof counters);
+ counter = mlx4_get_iboe_counter(priv->mdev->dev, port);
+ if (counter >= 0)
+ err = read_iboe_counters(priv->mdev->dev, counter, counters);
+
+ mailbox = mlx4_alloc_cmd_mailbox(mdev->dev);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ memset(mailbox->buf, 0, sizeof(*mlx4_en_stats));
+ err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, in_mod, 0,
+ MLX4_CMD_DUMP_ETH_STATS, MLX4_CMD_TIME_CLASS_B);
+ if (err)
+ goto out;
+
+ mlx4_en_stats = mailbox->buf;
+
+ spin_lock(&priv->stats_lock);
+
+ oerror = ierror = 0;
+ dev->if_ipackets = counters[0];
+ dev->if_ibytes = counters[2];
+ for (i = 0; i < priv->rx_ring_num; i++) {
+ dev->if_ipackets += priv->rx_ring[i].packets;
+ dev->if_ibytes += priv->rx_ring[i].bytes;
+ ierror += priv->rx_ring[i].errors;
+ }
+ dev->if_opackets = counters[1];
+ dev->if_obytes = counters[3];
+ for (i = 0; i <= priv->tx_ring_num; i++) {
+ dev->if_opackets += priv->tx_ring[i].packets;
+ dev->if_obytes += priv->tx_ring[i].bytes;
+ oerror += priv->tx_ring[i].errors;
+ }
+
+ dev->if_ierrors = be32_to_cpu(mlx4_en_stats->RDROP) + ierror;
+ dev->if_oerrors = be32_to_cpu(mlx4_en_stats->TDROP) + oerror;
+ dev->if_imcasts = be64_to_cpu(mlx4_en_stats->MCAST_prio_0) +
+ be64_to_cpu(mlx4_en_stats->MCAST_prio_1) +
+ be64_to_cpu(mlx4_en_stats->MCAST_prio_2) +
+ be64_to_cpu(mlx4_en_stats->MCAST_prio_3) +
+ be64_to_cpu(mlx4_en_stats->MCAST_prio_4) +
+ be64_to_cpu(mlx4_en_stats->MCAST_prio_5) +
+ be64_to_cpu(mlx4_en_stats->MCAST_prio_6) +
+ be64_to_cpu(mlx4_en_stats->MCAST_prio_7) +
+ be64_to_cpu(mlx4_en_stats->MCAST_novlan);
+ dev->if_omcasts = be64_to_cpu(mlx4_en_stats->TMCAST_prio_0) +
+ be64_to_cpu(mlx4_en_stats->TMCAST_prio_1) +
+ be64_to_cpu(mlx4_en_stats->TMCAST_prio_2) +
+ be64_to_cpu(mlx4_en_stats->TMCAST_prio_3) +
+ be64_to_cpu(mlx4_en_stats->TMCAST_prio_4) +
+ be64_to_cpu(mlx4_en_stats->TMCAST_prio_5) +
+ be64_to_cpu(mlx4_en_stats->TMCAST_prio_6) +
+ be64_to_cpu(mlx4_en_stats->TMCAST_prio_7) +
+ be64_to_cpu(mlx4_en_stats->TMCAST_novlan);
+ dev->if_collisions = 0;
+
+ priv->pkstats.broadcast =
+ be64_to_cpu(mlx4_en_stats->RBCAST_prio_0) +
+ be64_to_cpu(mlx4_en_stats->RBCAST_prio_1) +
+ be64_to_cpu(mlx4_en_stats->RBCAST_prio_2) +
+ be64_to_cpu(mlx4_en_stats->RBCAST_prio_3) +
+ be64_to_cpu(mlx4_en_stats->RBCAST_prio_4) +
+ be64_to_cpu(mlx4_en_stats->RBCAST_prio_5) +
+ be64_to_cpu(mlx4_en_stats->RBCAST_prio_6) +
+ be64_to_cpu(mlx4_en_stats->RBCAST_prio_7) +
+ be64_to_cpu(mlx4_en_stats->RBCAST_novlan);
+ priv->pkstats.rx_prio[0] = be64_to_cpu(mlx4_en_stats->RTOT_prio_0);
+ priv->pkstats.rx_prio[1] = be64_to_cpu(mlx4_en_stats->RTOT_prio_1);
+ priv->pkstats.rx_prio[2] = be64_to_cpu(mlx4_en_stats->RTOT_prio_2);
+ priv->pkstats.rx_prio[3] = be64_to_cpu(mlx4_en_stats->RTOT_prio_3);
+ priv->pkstats.rx_prio[4] = be64_to_cpu(mlx4_en_stats->RTOT_prio_4);
+ priv->pkstats.rx_prio[5] = be64_to_cpu(mlx4_en_stats->RTOT_prio_5);
+ priv->pkstats.rx_prio[6] = be64_to_cpu(mlx4_en_stats->RTOT_prio_6);
+ priv->pkstats.rx_prio[7] = be64_to_cpu(mlx4_en_stats->RTOT_prio_7);
+ priv->pkstats.tx_prio[0] = be64_to_cpu(mlx4_en_stats->TTOT_prio_0);
+ priv->pkstats.tx_prio[1] = be64_to_cpu(mlx4_en_stats->TTOT_prio_1);
+ priv->pkstats.tx_prio[2] = be64_to_cpu(mlx4_en_stats->TTOT_prio_2);
+ priv->pkstats.tx_prio[3] = be64_to_cpu(mlx4_en_stats->TTOT_prio_3);
+ priv->pkstats.tx_prio[4] = be64_to_cpu(mlx4_en_stats->TTOT_prio_4);
+ priv->pkstats.tx_prio[5] = be64_to_cpu(mlx4_en_stats->TTOT_prio_5);
+ priv->pkstats.tx_prio[6] = be64_to_cpu(mlx4_en_stats->TTOT_prio_6);
+ priv->pkstats.tx_prio[7] = be64_to_cpu(mlx4_en_stats->TTOT_prio_7);
+ spin_unlock(&priv->stats_lock);
+
+out:
+ mlx4_free_cmd_mailbox(mdev->dev, mailbox);
+ return err;
+}
+
diff --git a/sys/ofed/drivers/net/mlx4/en_port.h b/sys/ofed/drivers/net/mlx4/en_port.h
new file mode 100644
index 0000000..96551d3
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/en_port.h
@@ -0,0 +1,591 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef _MLX4_EN_PORT_H_
+#define _MLX4_EN_PORT_H_
+
+
+#define SET_PORT_GEN_ALL_VALID 0x7
+#define SET_PORT_PROMISC_EN_SHIFT 31
+#define SET_PORT_PROMISC_MODE_SHIFT 30
+
+enum {
+ MLX4_CMD_SET_VLAN_FLTR = 0x47,
+ MLX4_CMD_SET_MCAST_FLTR = 0x48,
+ MLX4_CMD_DUMP_ETH_STATS = 0x49,
+};
+
+struct mlx4_set_port_general_context {
+ u8 reserved[3];
+ u8 flags;
+ u16 reserved2;
+ __be16 mtu;
+ u8 pptx;
+ u8 pfctx;
+ u16 reserved3;
+ u8 pprx;
+ u8 pfcrx;
+ u16 reserved4;
+};
+
+struct mlx4_set_port_rqp_calc_context {
+ __be32 base_qpn;
+ __be32 flags;
+ u8 reserved[3];
+ u8 mac_miss;
+ u8 intra_no_vlan;
+ u8 no_vlan;
+ u8 intra_vlan_miss;
+ u8 vlan_miss;
+ u8 reserved2[3];
+ u8 no_vlan_prio;
+ __be32 promisc;
+ __be32 mcast;
+};
+
+#define VLAN_FLTR_SIZE 128
+struct mlx4_set_vlan_fltr_mbox {
+ __be32 entry[VLAN_FLTR_SIZE];
+};
+
+
+enum {
+ MLX4_MCAST_CONFIG = 0,
+ MLX4_MCAST_DISABLE = 1,
+ MLX4_MCAST_ENABLE = 2,
+};
+
+struct mlx4_en_query_port_context {
+ u8 link_up;
+#define MLX4_EN_LINK_UP_MASK 0x80
+ u8 reserved;
+ __be16 mtu;
+ u8 reserved2;
+ u8 link_speed;
+#define MLX4_EN_SPEED_MASK 0x3
+#define MLX4_EN_1G_SPEED 0x2
+ u16 reserved3[5];
+ __be64 mac;
+ u8 transceiver;
+ u8 reserved4[3];
+ __be32 wavelenth;
+ u32 reserved5;
+ __be32 transceiver_code_hi;
+ __be32 transceiver_code_low;
+};
+
+
+struct mlx4_en_stat_out_mbox {
+ /* Received frames with a length of 64 octets */
+ __be64 R64_prio_0;
+ __be64 R64_prio_1;
+ __be64 R64_prio_2;
+ __be64 R64_prio_3;
+ __be64 R64_prio_4;
+ __be64 R64_prio_5;
+ __be64 R64_prio_6;
+ __be64 R64_prio_7;
+ __be64 R64_novlan;
+ /* Received frames with a length of 127 octets */
+ __be64 R127_prio_0;
+ __be64 R127_prio_1;
+ __be64 R127_prio_2;
+ __be64 R127_prio_3;
+ __be64 R127_prio_4;
+ __be64 R127_prio_5;
+ __be64 R127_prio_6;
+ __be64 R127_prio_7;
+ __be64 R127_novlan;
+ /* Received frames with a length of 255 octets */
+ __be64 R255_prio_0;
+ __be64 R255_prio_1;
+ __be64 R255_prio_2;
+ __be64 R255_prio_3;
+ __be64 R255_prio_4;
+ __be64 R255_prio_5;
+ __be64 R255_prio_6;
+ __be64 R255_prio_7;
+ __be64 R255_novlan;
+ /* Received frames with a length of 511 octets */
+ __be64 R511_prio_0;
+ __be64 R511_prio_1;
+ __be64 R511_prio_2;
+ __be64 R511_prio_3;
+ __be64 R511_prio_4;
+ __be64 R511_prio_5;
+ __be64 R511_prio_6;
+ __be64 R511_prio_7;
+ __be64 R511_novlan;
+ /* Received frames with a length of 1023 octets */
+ __be64 R1023_prio_0;
+ __be64 R1023_prio_1;
+ __be64 R1023_prio_2;
+ __be64 R1023_prio_3;
+ __be64 R1023_prio_4;
+ __be64 R1023_prio_5;
+ __be64 R1023_prio_6;
+ __be64 R1023_prio_7;
+ __be64 R1023_novlan;
+ /* Received frames with a length of 1518 octets */
+ __be64 R1518_prio_0;
+ __be64 R1518_prio_1;
+ __be64 R1518_prio_2;
+ __be64 R1518_prio_3;
+ __be64 R1518_prio_4;
+ __be64 R1518_prio_5;
+ __be64 R1518_prio_6;
+ __be64 R1518_prio_7;
+ __be64 R1518_novlan;
+ /* Received frames with a length of 1522 octets */
+ __be64 R1522_prio_0;
+ __be64 R1522_prio_1;
+ __be64 R1522_prio_2;
+ __be64 R1522_prio_3;
+ __be64 R1522_prio_4;
+ __be64 R1522_prio_5;
+ __be64 R1522_prio_6;
+ __be64 R1522_prio_7;
+ __be64 R1522_novlan;
+ /* Received frames with a length of 1548 octets */
+ __be64 R1548_prio_0;
+ __be64 R1548_prio_1;
+ __be64 R1548_prio_2;
+ __be64 R1548_prio_3;
+ __be64 R1548_prio_4;
+ __be64 R1548_prio_5;
+ __be64 R1548_prio_6;
+ __be64 R1548_prio_7;
+ __be64 R1548_novlan;
+ /* Received frames with a length of 1548 < octets < MTU */
+ __be64 R2MTU_prio_0;
+ __be64 R2MTU_prio_1;
+ __be64 R2MTU_prio_2;
+ __be64 R2MTU_prio_3;
+ __be64 R2MTU_prio_4;
+ __be64 R2MTU_prio_5;
+ __be64 R2MTU_prio_6;
+ __be64 R2MTU_prio_7;
+ __be64 R2MTU_novlan;
+ /* Received frames with a length of MTU< octets and good CRC */
+ __be64 RGIANT_prio_0;
+ __be64 RGIANT_prio_1;
+ __be64 RGIANT_prio_2;
+ __be64 RGIANT_prio_3;
+ __be64 RGIANT_prio_4;
+ __be64 RGIANT_prio_5;
+ __be64 RGIANT_prio_6;
+ __be64 RGIANT_prio_7;
+ __be64 RGIANT_novlan;
+ /* Received broadcast frames with good CRC */
+ __be64 RBCAST_prio_0;
+ __be64 RBCAST_prio_1;
+ __be64 RBCAST_prio_2;
+ __be64 RBCAST_prio_3;
+ __be64 RBCAST_prio_4;
+ __be64 RBCAST_prio_5;
+ __be64 RBCAST_prio_6;
+ __be64 RBCAST_prio_7;
+ __be64 RBCAST_novlan;
+ /* Received multicast frames with good CRC */
+ __be64 MCAST_prio_0;
+ __be64 MCAST_prio_1;
+ __be64 MCAST_prio_2;
+ __be64 MCAST_prio_3;
+ __be64 MCAST_prio_4;
+ __be64 MCAST_prio_5;
+ __be64 MCAST_prio_6;
+ __be64 MCAST_prio_7;
+ __be64 MCAST_novlan;
+ /* Received unicast not short or GIANT frames with good CRC */
+ __be64 RTOTG_prio_0;
+ __be64 RTOTG_prio_1;
+ __be64 RTOTG_prio_2;
+ __be64 RTOTG_prio_3;
+ __be64 RTOTG_prio_4;
+ __be64 RTOTG_prio_5;
+ __be64 RTOTG_prio_6;
+ __be64 RTOTG_prio_7;
+ __be64 RTOTG_novlan;
+
+ /* Count of total octets of received frames, includes framing characters */
+ __be64 RTTLOCT_prio_0;
+ /* Count of total octets of received frames, not including framing
+ characters */
+ __be64 RTTLOCT_NOFRM_prio_0;
+ /* Count of Total number of octets received
+ (only for frames without errors) */
+ __be64 ROCT_prio_0;
+
+ __be64 RTTLOCT_prio_1;
+ __be64 RTTLOCT_NOFRM_prio_1;
+ __be64 ROCT_prio_1;
+
+ __be64 RTTLOCT_prio_2;
+ __be64 RTTLOCT_NOFRM_prio_2;
+ __be64 ROCT_prio_2;
+
+ __be64 RTTLOCT_prio_3;
+ __be64 RTTLOCT_NOFRM_prio_3;
+ __be64 ROCT_prio_3;
+
+ __be64 RTTLOCT_prio_4;
+ __be64 RTTLOCT_NOFRM_prio_4;
+ __be64 ROCT_prio_4;
+
+ __be64 RTTLOCT_prio_5;
+ __be64 RTTLOCT_NOFRM_prio_5;
+ __be64 ROCT_prio_5;
+
+ __be64 RTTLOCT_prio_6;
+ __be64 RTTLOCT_NOFRM_prio_6;
+ __be64 ROCT_prio_6;
+
+ __be64 RTTLOCT_prio_7;
+ __be64 RTTLOCT_NOFRM_prio_7;
+ __be64 ROCT_prio_7;
+
+ __be64 RTTLOCT_novlan;
+ __be64 RTTLOCT_NOFRM_novlan;
+ __be64 ROCT_novlan;
+
+ /* Count of Total received frames including bad frames */
+ __be64 RTOT_prio_0;
+ /* Count of Total number of received frames with 802.1Q encapsulation */
+ __be64 R1Q_prio_0;
+ __be64 reserved1;
+
+ __be64 RTOT_prio_1;
+ __be64 R1Q_prio_1;
+ __be64 reserved2;
+
+ __be64 RTOT_prio_2;
+ __be64 R1Q_prio_2;
+ __be64 reserved3;
+
+ __be64 RTOT_prio_3;
+ __be64 R1Q_prio_3;
+ __be64 reserved4;
+
+ __be64 RTOT_prio_4;
+ __be64 R1Q_prio_4;
+ __be64 reserved5;
+
+ __be64 RTOT_prio_5;
+ __be64 R1Q_prio_5;
+ __be64 reserved6;
+
+ __be64 RTOT_prio_6;
+ __be64 R1Q_prio_6;
+ __be64 reserved7;
+
+ __be64 RTOT_prio_7;
+ __be64 R1Q_prio_7;
+ __be64 reserved8;
+
+ __be64 RTOT_novlan;
+ __be64 R1Q_novlan;
+ __be64 reserved9;
+
+ /* Total number of Successfully Received Control Frames */
+ __be64 RCNTL;
+ __be64 reserved10;
+ __be64 reserved11;
+ __be64 reserved12;
+ /* Count of received frames with a length/type field value between 46
+ (42 for VLANtagged frames) and 1500 (also 1500 for VLAN-tagged frames),
+ inclusive */
+ __be64 RInRangeLengthErr;
+ /* Count of received frames with length/type field between 1501 and 1535
+ decimal, inclusive */
+ __be64 ROutRangeLengthErr;
+ /* Count of received frames that are longer than max allowed size for
+ 802.3 frames (1518/1522) */
+ __be64 RFrmTooLong;
+ /* Count frames received with PCS error */
+ __be64 PCS;
+
+ /* Transmit frames with a length of 64 octets */
+ __be64 T64_prio_0;
+ __be64 T64_prio_1;
+ __be64 T64_prio_2;
+ __be64 T64_prio_3;
+ __be64 T64_prio_4;
+ __be64 T64_prio_5;
+ __be64 T64_prio_6;
+ __be64 T64_prio_7;
+ __be64 T64_novlan;
+ __be64 T64_loopbk;
+ /* Transmit frames with a length of 65 to 127 octets. */
+ __be64 T127_prio_0;
+ __be64 T127_prio_1;
+ __be64 T127_prio_2;
+ __be64 T127_prio_3;
+ __be64 T127_prio_4;
+ __be64 T127_prio_5;
+ __be64 T127_prio_6;
+ __be64 T127_prio_7;
+ __be64 T127_novlan;
+ __be64 T127_loopbk;
+ /* Transmit frames with a length of 128 to 255 octets */
+ __be64 T255_prio_0;
+ __be64 T255_prio_1;
+ __be64 T255_prio_2;
+ __be64 T255_prio_3;
+ __be64 T255_prio_4;
+ __be64 T255_prio_5;
+ __be64 T255_prio_6;
+ __be64 T255_prio_7;
+ __be64 T255_novlan;
+ __be64 T255_loopbk;
+ /* Transmit frames with a length of 256 to 511 octets */
+ __be64 T511_prio_0;
+ __be64 T511_prio_1;
+ __be64 T511_prio_2;
+ __be64 T511_prio_3;
+ __be64 T511_prio_4;
+ __be64 T511_prio_5;
+ __be64 T511_prio_6;
+ __be64 T511_prio_7;
+ __be64 T511_novlan;
+ __be64 T511_loopbk;
+ /* Transmit frames with a length of 512 to 1023 octets */
+ __be64 T1023_prio_0;
+ __be64 T1023_prio_1;
+ __be64 T1023_prio_2;
+ __be64 T1023_prio_3;
+ __be64 T1023_prio_4;
+ __be64 T1023_prio_5;
+ __be64 T1023_prio_6;
+ __be64 T1023_prio_7;
+ __be64 T1023_novlan;
+ __be64 T1023_loopbk;
+ /* Transmit frames with a length of 1024 to 1518 octets */
+ __be64 T1518_prio_0;
+ __be64 T1518_prio_1;
+ __be64 T1518_prio_2;
+ __be64 T1518_prio_3;
+ __be64 T1518_prio_4;
+ __be64 T1518_prio_5;
+ __be64 T1518_prio_6;
+ __be64 T1518_prio_7;
+ __be64 T1518_novlan;
+ __be64 T1518_loopbk;
+ /* Counts transmit frames with a length of 1519 to 1522 bytes */
+ __be64 T1522_prio_0;
+ __be64 T1522_prio_1;
+ __be64 T1522_prio_2;
+ __be64 T1522_prio_3;
+ __be64 T1522_prio_4;
+ __be64 T1522_prio_5;
+ __be64 T1522_prio_6;
+ __be64 T1522_prio_7;
+ __be64 T1522_novlan;
+ __be64 T1522_loopbk;
+ /* Transmit frames with a length of 1523 to 1548 octets */
+ __be64 T1548_prio_0;
+ __be64 T1548_prio_1;
+ __be64 T1548_prio_2;
+ __be64 T1548_prio_3;
+ __be64 T1548_prio_4;
+ __be64 T1548_prio_5;
+ __be64 T1548_prio_6;
+ __be64 T1548_prio_7;
+ __be64 T1548_novlan;
+ __be64 T1548_loopbk;
+ /* Counts transmit frames with a length of 1549 to MTU bytes */
+ __be64 T2MTU_prio_0;
+ __be64 T2MTU_prio_1;
+ __be64 T2MTU_prio_2;
+ __be64 T2MTU_prio_3;
+ __be64 T2MTU_prio_4;
+ __be64 T2MTU_prio_5;
+ __be64 T2MTU_prio_6;
+ __be64 T2MTU_prio_7;
+ __be64 T2MTU_novlan;
+ __be64 T2MTU_loopbk;
+ /* Transmit frames with a length greater than MTU octets and a good CRC. */
+ __be64 TGIANT_prio_0;
+ __be64 TGIANT_prio_1;
+ __be64 TGIANT_prio_2;
+ __be64 TGIANT_prio_3;
+ __be64 TGIANT_prio_4;
+ __be64 TGIANT_prio_5;
+ __be64 TGIANT_prio_6;
+ __be64 TGIANT_prio_7;
+ __be64 TGIANT_novlan;
+ __be64 TGIANT_loopbk;
+ /* Transmit broadcast frames with a good CRC */
+ __be64 TBCAST_prio_0;
+ __be64 TBCAST_prio_1;
+ __be64 TBCAST_prio_2;
+ __be64 TBCAST_prio_3;
+ __be64 TBCAST_prio_4;
+ __be64 TBCAST_prio_5;
+ __be64 TBCAST_prio_6;
+ __be64 TBCAST_prio_7;
+ __be64 TBCAST_novlan;
+ __be64 TBCAST_loopbk;
+ /* Transmit multicast frames with a good CRC */
+ __be64 TMCAST_prio_0;
+ __be64 TMCAST_prio_1;
+ __be64 TMCAST_prio_2;
+ __be64 TMCAST_prio_3;
+ __be64 TMCAST_prio_4;
+ __be64 TMCAST_prio_5;
+ __be64 TMCAST_prio_6;
+ __be64 TMCAST_prio_7;
+ __be64 TMCAST_novlan;
+ __be64 TMCAST_loopbk;
+ /* Transmit good frames that are neither broadcast nor multicast */
+ __be64 TTOTG_prio_0;
+ __be64 TTOTG_prio_1;
+ __be64 TTOTG_prio_2;
+ __be64 TTOTG_prio_3;
+ __be64 TTOTG_prio_4;
+ __be64 TTOTG_prio_5;
+ __be64 TTOTG_prio_6;
+ __be64 TTOTG_prio_7;
+ __be64 TTOTG_novlan;
+ __be64 TTOTG_loopbk;
+
+ /* total octets of transmitted frames, including framing characters */
+ __be64 TTTLOCT_prio_0;
+ /* total octets of transmitted frames, not including framing characters */
+ __be64 TTTLOCT_NOFRM_prio_0;
+ /* ifOutOctets */
+ __be64 TOCT_prio_0;
+
+ __be64 TTTLOCT_prio_1;
+ __be64 TTTLOCT_NOFRM_prio_1;
+ __be64 TOCT_prio_1;
+
+ __be64 TTTLOCT_prio_2;
+ __be64 TTTLOCT_NOFRM_prio_2;
+ __be64 TOCT_prio_2;
+
+ __be64 TTTLOCT_prio_3;
+ __be64 TTTLOCT_NOFRM_prio_3;
+ __be64 TOCT_prio_3;
+
+ __be64 TTTLOCT_prio_4;
+ __be64 TTTLOCT_NOFRM_prio_4;
+ __be64 TOCT_prio_4;
+
+ __be64 TTTLOCT_prio_5;
+ __be64 TTTLOCT_NOFRM_prio_5;
+ __be64 TOCT_prio_5;
+
+ __be64 TTTLOCT_prio_6;
+ __be64 TTTLOCT_NOFRM_prio_6;
+ __be64 TOCT_prio_6;
+
+ __be64 TTTLOCT_prio_7;
+ __be64 TTTLOCT_NOFRM_prio_7;
+ __be64 TOCT_prio_7;
+
+ __be64 TTTLOCT_novlan;
+ __be64 TTTLOCT_NOFRM_novlan;
+ __be64 TOCT_novlan;
+
+ __be64 TTTLOCT_loopbk;
+ __be64 TTTLOCT_NOFRM_loopbk;
+ __be64 TOCT_loopbk;
+
+ /* Total frames transmitted with a good CRC that are not aborted */
+ __be64 TTOT_prio_0;
+ /* Total number of frames transmitted with 802.1Q encapsulation */
+ __be64 T1Q_prio_0;
+ __be64 reserved13;
+
+ __be64 TTOT_prio_1;
+ __be64 T1Q_prio_1;
+ __be64 reserved14;
+
+ __be64 TTOT_prio_2;
+ __be64 T1Q_prio_2;
+ __be64 reserved15;
+
+ __be64 TTOT_prio_3;
+ __be64 T1Q_prio_3;
+ __be64 reserved16;
+
+ __be64 TTOT_prio_4;
+ __be64 T1Q_prio_4;
+ __be64 reserved17;
+
+ __be64 TTOT_prio_5;
+ __be64 T1Q_prio_5;
+ __be64 reserved18;
+
+ __be64 TTOT_prio_6;
+ __be64 T1Q_prio_6;
+ __be64 reserved19;
+
+ __be64 TTOT_prio_7;
+ __be64 T1Q_prio_7;
+ __be64 reserved20;
+
+ __be64 TTOT_novlan;
+ __be64 T1Q_novlan;
+ __be64 reserved21;
+
+ __be64 TTOT_loopbk;
+ __be64 T1Q_loopbk;
+ __be64 reserved22;
+
+ /* Received frames with a length greater than MTU octets and a bad CRC */
+ __be32 RJBBR;
+ /* Received frames with a bad CRC that are not runts, jabbers,
+ or alignment errors */
+ __be32 RCRC;
+ /* Received frames with SFD with a length of less than 64 octets and a
+ bad CRC */
+ __be32 RRUNT;
+ /* Received frames with a length less than 64 octets and a good CRC */
+ __be32 RSHORT;
+ /* Total Number of Received Packets Dropped */
+ __be32 RDROP;
+ /* Drop due to overflow */
+ __be32 RdropOvflw;
+ /* Drop due to overflow */
+ __be32 RdropLength;
+ /* Total of good frames. Does not include frames received with
+ frame-too-long, FCS, or length errors */
+ __be32 RTOTFRMS;
+ /* Total dropped Xmited packets */
+ __be32 TDROP;
+};
+
+enum mlx4_query_reply mlx4_en_query(void *endev_ptr, void *int_dev);
+
+#endif
diff --git a/sys/ofed/drivers/net/mlx4/en_resources.c b/sys/ofed/drivers/net/mlx4/en_resources.c
new file mode 100644
index 0000000..a147153
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/en_resources.c
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/vmalloc.h>
+#include <linux/mlx4/qp.h>
+
+#include "mlx4_en.h"
+
+void mlx4_en_fill_qp_context(struct mlx4_en_priv *priv, int size, int stride,
+ int is_tx, int rss, int qpn, int cqn,
+ struct mlx4_qp_context *context)
+{
+ struct mlx4_en_dev *mdev = priv->mdev;
+
+ memset(context, 0, sizeof *context);
+ context->flags = cpu_to_be32(7 << 16 | rss << 13);
+ context->pd = cpu_to_be32(mdev->priv_pdn);
+ context->mtu_msgmax = 0xff;
+ if (!is_tx && !rss) {
+ context->rq_size_stride = ilog2(size) << 3 | (ilog2(stride) - 4);
+ }
+ if (is_tx)
+ context->sq_size_stride = ilog2(size) << 3 | (ilog2(stride) - 4);
+ else
+ context->sq_size_stride = ilog2(TXBB_SIZE) - 4;
+ context->usr_page = cpu_to_be32(mdev->priv_uar.index);
+ context->local_qpn = cpu_to_be32(qpn);
+ context->pri_path.ackto = 1 & 0x07;
+ context->pri_path.sched_queue = 0x83 | (priv->port - 1) << 6;
+ context->pri_path.counter_index = 0xff;
+ context->cqn_send = cpu_to_be32(cqn);
+ context->cqn_recv = cpu_to_be32(cqn);
+ context->db_rec_addr = cpu_to_be64(priv->res.db.dma << 2);
+}
+
+
+int mlx4_en_map_buffer(struct mlx4_buf *buf)
+{
+ struct page **pages;
+ int i;
+
+ if (buf->direct.buf != NULL || buf->nbufs == 1)
+ return 0;
+
+ pages = kmalloc(sizeof *pages * buf->nbufs, GFP_KERNEL);
+ if (!pages)
+ return -ENOMEM;
+
+ for (i = 0; i < buf->nbufs; ++i)
+ pages[i] = virt_to_page(buf->page_list[i].buf);
+
+ buf->direct.buf = vmap(pages, buf->nbufs, VM_MAP, PAGE_KERNEL);
+ kfree(pages);
+ if (!buf->direct.buf)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void mlx4_en_unmap_buffer(struct mlx4_buf *buf)
+{
+ if (buf->direct.buf != NULL || buf->nbufs == 1)
+ return;
+
+ vunmap(buf->direct.buf);
+ buf->direct.buf = NULL;
+}
+
+void mlx4_en_sqp_event(struct mlx4_qp *qp, enum mlx4_event event)
+{
+ return;
+}
+
diff --git a/sys/ofed/drivers/net/mlx4/en_rx.c b/sys/ofed/drivers/net/mlx4/en_rx.c
new file mode 100644
index 0000000..fc4345f
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/en_rx.c
@@ -0,0 +1,1006 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include "mlx4_en.h"
+
+#include <linux/mlx4/cq.h>
+#include <linux/mlx4/qp.h>
+
+#include <net/ethernet.h>
+#include <net/if_vlan_var.h>
+#include <sys/mbuf.h>
+
+enum {
+ MIN_RX_ARM = 1024,
+};
+
+static int mlx4_en_alloc_buf(struct mlx4_en_priv *priv,
+ struct mlx4_en_rx_desc *rx_desc,
+ struct mbuf **mb_list,
+ int i)
+{
+ struct mlx4_en_dev *mdev = priv->mdev;
+ struct mlx4_en_frag_info *frag_info = &priv->frag_info[i];
+ struct mbuf *mb;
+ dma_addr_t dma;
+
+ if (i == 0)
+ mb = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, frag_info->frag_size);
+ else
+ mb = m_getjcl(M_NOWAIT, MT_DATA, 0, frag_info->frag_size);
+ if (mb == NULL) {
+ priv->port_stats.rx_alloc_failed++;
+ return -ENOMEM;
+ }
+ dma = pci_map_single(mdev->pdev, mb->m_data, frag_info->frag_size,
+ PCI_DMA_FROMDEVICE);
+ rx_desc->data[i].addr = cpu_to_be64(dma);
+ mb_list[i] = mb;
+ return 0;
+}
+
+static void
+mlx4_en_init_rx_desc_mb(struct mlx4_en_priv *priv,
+ struct mlx4_en_rx_ring *ring, int index)
+{
+ struct mlx4_en_rx_desc *rx_desc = ring->buf + ring->stride * index;
+
+ rx_desc->data->byte_count = cpu_to_be32(priv->rx_mb_size);
+ rx_desc->data->lkey = cpu_to_be32(priv->mdev->mr.key);
+}
+
+static void mlx4_en_init_rx_desc(struct mlx4_en_priv *priv,
+ struct mlx4_en_rx_ring *ring, int index)
+{
+ struct mlx4_en_rx_desc *rx_desc = ring->buf + ring->stride * index;
+ int possible_frags;
+ int i;
+
+ /* Set size and memtype fields */
+ for (i = 0; i < priv->num_frags; i++) {
+ rx_desc->data[i].byte_count =
+ cpu_to_be32(priv->frag_info[i].frag_size);
+ rx_desc->data[i].lkey = cpu_to_be32(priv->mdev->mr.key);
+ }
+
+ /* If the number of used fragments does not fill up the ring stride,
+ * remaining (unused) fragments must be padded with null address/size
+ * and a special memory key */
+ possible_frags = (ring->stride - sizeof(struct mlx4_en_rx_desc)) / DS_SIZE;
+ for (i = priv->num_frags; i < possible_frags; i++) {
+ rx_desc->data[i].byte_count = 0;
+ rx_desc->data[i].lkey = cpu_to_be32(MLX4_EN_MEMTYPE_PAD);
+ rx_desc->data[i].addr = 0;
+ }
+}
+
+static int
+mlx4_en_alloc_rx_mb(struct mlx4_en_priv *priv,
+ struct mlx4_en_rx_desc *rx_desc,
+ struct mbuf **pmb, int unmap)
+{
+ struct mlx4_en_dev *mdev = priv->mdev;
+ dma_addr_t dma;
+ int size = priv->rx_mb_size;
+ struct mbuf *new_mb;
+
+ new_mb = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
+ if (unlikely(new_mb == NULL)) {
+ priv->port_stats.rx_alloc_failed++;
+ return -ENOMEM;
+ }
+
+ if (unmap)
+ pci_unmap_single(mdev->pdev, be64_to_cpu(rx_desc->data->addr),
+ be32_to_cpu(rx_desc->data->byte_count),
+ PCI_DMA_FROMDEVICE);
+ dma = pci_map_single(priv->mdev->pdev, new_mb->m_data, size, DMA_FROM_DEVICE);
+ *pmb = new_mb;
+ rx_desc->data->addr = cpu_to_be64(dma);
+ return 0;
+}
+
+static int
+mlx4_en_prepare_rx_desc_mb(struct mlx4_en_priv *priv,
+ struct mlx4_en_rx_ring *ring, int index)
+{
+ struct mlx4_en_rx_desc *rx_desc = ring->buf + (index * ring->stride);
+ struct mbuf **pmb = (struct mbuf **) ring->rx_info + index;
+
+ return mlx4_en_alloc_rx_mb(priv, rx_desc, pmb, 0);
+}
+
+static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv *priv,
+ struct mlx4_en_rx_ring *ring, int index)
+{
+ struct mlx4_en_rx_desc *rx_desc = ring->buf + (index * ring->stride);
+ struct mbuf **mb_list = ring->rx_info + (index << priv->log_rx_info);
+ int i;
+
+ for (i = 0; i < priv->num_frags; i++)
+ if (mlx4_en_alloc_buf(priv, rx_desc, mb_list, i))
+ goto err;
+
+ return 0;
+
+err:
+ while (i--)
+ m_free(mb_list[i]);
+ return -ENOMEM;
+}
+
+static inline void mlx4_en_update_rx_prod_db(struct mlx4_en_rx_ring *ring)
+{
+ *ring->wqres.db.db = cpu_to_be32(ring->prod & 0xffff);
+}
+
+static void mlx4_en_free_rx_desc(struct mlx4_en_priv *priv,
+ struct mlx4_en_rx_ring *ring,
+ int index)
+{
+ struct mlx4_en_frag_info *frag_info;
+ struct mlx4_en_dev *mdev = priv->mdev;
+ struct mbuf **mb_list;
+ struct mbuf *mb;
+ struct mlx4_en_rx_desc *rx_desc = ring->buf + (index << ring->log_stride);
+ dma_addr_t dma;
+ int nr;
+
+ if (ring->use_frags) {
+ mb_list = ring->rx_info + (index << priv->log_rx_info);
+ for (nr = 0; nr < priv->num_frags; nr++) {
+ en_dbg(DRV, priv, "Freeing fragment:%d\n", nr);
+ frag_info = &priv->frag_info[nr];
+ dma = be64_to_cpu(rx_desc->data[nr].addr);
+
+ en_dbg(DRV, priv, "Unmaping buffer at dma:0x%llx\n", (u64) dma);
+ pci_unmap_single(mdev->pdev, dma, frag_info->frag_size,
+ PCI_DMA_FROMDEVICE);
+ m_free(mb_list[nr]);
+ }
+ } else {
+ mb = *((struct mbuf **) ring->rx_info + index);
+ dma = be64_to_cpu(rx_desc->data->addr);
+ pci_unmap_single(mdev->pdev, dma,
+ priv->rx_mb_size,
+ PCI_DMA_FROMDEVICE);
+ m_free(mb);
+ }
+}
+
+static int mlx4_en_fill_rx_buffers(struct mlx4_en_priv *priv)
+{
+ struct mlx4_en_rx_ring *ring;
+ int ring_ind;
+ int buf_ind;
+ int new_size;
+ int err;
+
+ for (buf_ind = 0; buf_ind < priv->prof->rx_ring_size; buf_ind++) {
+ for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) {
+ ring = &priv->rx_ring[ring_ind];
+
+ if (ring->use_frags)
+ err = mlx4_en_prepare_rx_desc(priv, ring,
+ ring->actual_size);
+ else
+ err = mlx4_en_prepare_rx_desc_mb(priv, ring,
+ ring->actual_size);
+ if (err) {
+ if (ring->actual_size == 0) {
+ en_err(priv, "Failed to allocate "
+ "enough rx buffers\n");
+ return -ENOMEM;
+ } else {
+ new_size = rounddown_pow_of_two(ring->actual_size);
+ en_warn(priv, "Only %d buffers allocated "
+ "reducing ring size to %d\n",
+ ring->actual_size, new_size);
+ goto reduce_rings;
+ }
+ }
+ ring->actual_size++;
+ ring->prod++;
+ }
+ }
+ return 0;
+
+reduce_rings:
+ for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) {
+ ring = &priv->rx_ring[ring_ind];
+ while (ring->actual_size > new_size) {
+ ring->actual_size--;
+ ring->prod--;
+ mlx4_en_free_rx_desc(priv, ring, ring->actual_size);
+ }
+ }
+
+ return 0;
+}
+
+static void mlx4_en_free_rx_buf(struct mlx4_en_priv *priv,
+ struct mlx4_en_rx_ring *ring)
+{
+ int index;
+
+ en_dbg(DRV, priv, "Freeing Rx buf - cons:%d prod:%d\n",
+ ring->cons, ring->prod);
+
+ /* Unmap and free Rx buffers */
+ BUG_ON((u32) (ring->prod - ring->cons) > ring->actual_size);
+ while (ring->cons != ring->prod) {
+ index = ring->cons & ring->size_mask;
+ en_dbg(DRV, priv, "Processing descriptor:%d\n", index);
+ mlx4_en_free_rx_desc(priv, ring, index);
+ ++ring->cons;
+ }
+}
+
+
+int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
+ struct mlx4_en_rx_ring *ring, u32 size)
+{
+ struct mlx4_en_dev *mdev = priv->mdev;
+ int err;
+ int tmp;
+
+
+ ring->prod = 0;
+ ring->cons = 0;
+ ring->size = size;
+ ring->size_mask = size - 1;
+ ring->stride = roundup_pow_of_two(sizeof(struct mlx4_en_rx_desc) +
+ DS_SIZE * (ring->use_frags ?
+ MLX4_EN_MAX_RX_FRAGS : 1));
+ ring->log_stride = ffs(ring->stride) - 1;
+ ring->buf_size = ring->size * ring->stride + TXBB_SIZE;
+
+ if (ring->use_frags)
+ tmp = size * roundup_pow_of_two(MLX4_EN_MAX_RX_FRAGS *
+ sizeof(struct mbuf *));
+ else
+ tmp = size * sizeof(struct mbuf *);
+
+ ring->rx_info = kmalloc(tmp, GFP_KERNEL);
+ if (!ring->rx_info) {
+ en_err(priv, "Failed allocating rx_info ring\n");
+ return -ENOMEM;
+ }
+ en_dbg(DRV, priv, "Allocated rx_info ring at addr:%p size:%d stride:%d (%d)\n",
+ ring->rx_info, tmp, ring->stride, ring->log_stride);
+
+ err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres,
+ ring->buf_size, 2 * PAGE_SIZE);
+ if (err)
+ goto err_ring;
+
+ err = mlx4_en_map_buffer(&ring->wqres.buf);
+ if (err) {
+ en_err(priv, "Failed to map RX buffer\n");
+ goto err_hwq;
+ }
+ ring->buf = ring->wqres.buf.direct.buf;
+
+ return 0;
+
+ mlx4_en_unmap_buffer(&ring->wqres.buf);
+err_hwq:
+ mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
+err_ring:
+ kfree(ring->rx_info);
+ ring->rx_info = NULL;
+ return err;
+}
+
+int mlx4_en_activate_rx_rings(struct mlx4_en_priv *priv)
+{
+ struct mlx4_en_rx_ring *ring;
+ int i;
+ int ring_ind;
+ int err;
+ int stride = roundup_pow_of_two(sizeof(struct mlx4_en_rx_desc) +
+ DS_SIZE * priv->num_frags);
+
+ for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) {
+ ring = &priv->rx_ring[ring_ind];
+
+ ring->prod = 0;
+ ring->cons = 0;
+ ring->actual_size = 0;
+ ring->cqn = priv->rx_cq[ring_ind].mcq.cqn;
+
+ if (ring->use_frags)
+ ring->stride = stride;
+ if (ring->stride <= TXBB_SIZE)
+ ring->buf += TXBB_SIZE;
+
+ ring->log_stride = ffs(ring->stride) - 1;
+ ring->buf_size = ring->size * ring->stride;
+
+ memset(ring->buf, 0, ring->buf_size);
+ mlx4_en_update_rx_prod_db(ring);
+
+ if (ring->use_frags) {
+ /* Initailize all descriptors */
+ for (i = 0; i < ring->size; i++)
+ mlx4_en_init_rx_desc(priv, ring, i);
+ } else {
+ for (i = 0; i < ring->size; i++)
+ mlx4_en_init_rx_desc_mb(priv, ring, i);
+ }
+ /* Configure lro mngr */
+ if (priv->dev->if_capenable & IFCAP_LRO) {
+ if (tcp_lro_init(&ring->lro))
+ priv->dev->if_capenable &= ~IFCAP_LRO;
+ else
+ ring->lro.ifp = priv->dev;
+ }
+ }
+ err = mlx4_en_fill_rx_buffers(priv);
+ if (err)
+ goto err_buffers;
+
+ for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) {
+ ring = &priv->rx_ring[ring_ind];
+
+ ring->size_mask = ring->actual_size - 1;
+ mlx4_en_update_rx_prod_db(ring);
+ }
+
+
+ return 0;
+
+err_buffers:
+ for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++)
+ mlx4_en_free_rx_buf(priv, &priv->rx_ring[ring_ind]);
+
+ ring_ind = priv->rx_ring_num - 1;
+ return err;
+}
+
+void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv,
+ struct mlx4_en_rx_ring *ring)
+{
+ struct mlx4_en_dev *mdev = priv->mdev;
+
+ mlx4_en_unmap_buffer(&ring->wqres.buf);
+ mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size + TXBB_SIZE);
+ kfree(ring->rx_info);
+ ring->rx_info = NULL;
+}
+
+void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv,
+ struct mlx4_en_rx_ring *ring)
+{
+ tcp_lro_free(&ring->lro);
+ mlx4_en_free_rx_buf(priv, ring);
+ if (ring->stride <= TXBB_SIZE)
+ ring->buf -= TXBB_SIZE;
+}
+
+
+/* Unmap a completed descriptor and free unused pages */
+static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv,
+ struct mlx4_en_rx_desc *rx_desc,
+ struct mbuf **mb_list,
+ int length)
+{
+ struct mlx4_en_dev *mdev = priv->mdev;
+ struct mlx4_en_frag_info *frag_info;
+ dma_addr_t dma;
+ struct mbuf *mb;
+ int nr;
+
+ mb = mb_list[0];
+ mb->m_pkthdr.len = length;
+ /* Collect used fragments while replacing them in the HW descirptors */
+ for (nr = 0; nr < priv->num_frags; nr++) {
+ frag_info = &priv->frag_info[nr];
+ if (length <= frag_info->frag_prefix_size)
+ break;
+ if (nr)
+ mb->m_next = mb_list[nr];
+ mb = mb_list[nr];
+ mb->m_len = frag_info[nr].frag_size;
+ dma = be64_to_cpu(rx_desc->data[nr].addr);
+
+ /* Allocate a replacement page */
+ if (mlx4_en_alloc_buf(priv, rx_desc, mb_list, nr))
+ goto fail;
+
+ /* Unmap buffer */
+ pci_unmap_single(mdev->pdev, dma, frag_info[nr].frag_size,
+ PCI_DMA_FROMDEVICE);
+ }
+ /* Adjust size of last fragment to match actual length */
+ mb->m_len = length - priv->frag_info[nr - 1].frag_prefix_size;
+ mb->m_next = NULL;
+ return 0;
+
+fail:
+ /* Drop all accumulated fragments (which have already been replaced in
+ * the descriptor) of this packet; remaining fragments are reused... */
+ while (nr > 0) {
+ nr--;
+ m_free(mb_list[nr]);
+ }
+ return -ENOMEM;
+}
+
+
+static struct mbuf *mlx4_en_rx_mb(struct mlx4_en_priv *priv,
+ struct mlx4_en_rx_desc *rx_desc,
+ struct mbuf **mb_list,
+ unsigned int length)
+{
+ struct mbuf *mb;
+
+ mb = mb_list[0];
+ /* Move relevant fragments to mb */
+ if (unlikely(mlx4_en_complete_rx_desc(priv, rx_desc, mb_list, length)))
+ return NULL;
+
+ return mb;
+}
+
+static inline int invalid_cqe(struct mlx4_en_priv *priv,
+ struct mlx4_cqe *cqe)
+{
+ /* Drop packet on bad receive or bad checksum */
+ if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
+ MLX4_CQE_OPCODE_ERROR)) {
+ en_err(priv, "CQE completed in error - vendor "
+ "syndrom:%d syndrom:%d\n",
+ ((struct mlx4_err_cqe *) cqe)->vendor_err_syndrome,
+ ((struct mlx4_err_cqe *) cqe)->syndrome);
+ return 1;
+ }
+ if (unlikely(cqe->badfcs_enc & MLX4_CQE_BAD_FCS)) {
+ en_dbg(RX_ERR, priv, "Accepted frame with bad FCS\n");
+ return 1;;
+ }
+
+ return 0;
+}
+
+static struct mbuf *
+mlx4_en_get_rx_mb(struct mlx4_en_priv *priv,
+ struct mlx4_en_rx_desc *rx_desc,
+ struct mbuf **pmb,
+ unsigned int length)
+{
+ struct mlx4_en_dev *mdev = priv->mdev;
+ struct mbuf *mb;
+ dma_addr_t dma;
+
+ if (length <= SMALL_PACKET_SIZE) {
+ mb = m_gethdr(M_WAITOK, MT_DATA);
+ if (unlikely(mb == NULL))
+ return NULL;
+ /* We are copying all relevant data to the mb - temporarily
+ * synch buffers for the copy */
+ dma = be64_to_cpu(rx_desc->data->addr);
+ dma_sync_single_range_for_cpu(&mdev->pdev->dev, dma, 0,
+ length, DMA_FROM_DEVICE);
+ memcpy(mb->m_data, (*pmb)->m_data, length);
+ dma_sync_single_range_for_device(&mdev->pdev->dev, dma, 0,
+ length, DMA_FROM_DEVICE);
+
+ } else {
+ mb = *pmb;
+ if (unlikely(mlx4_en_alloc_rx_mb(priv, rx_desc, pmb, 1)))
+ return NULL;
+ }
+
+ mb->m_len = length;
+ mb->m_pkthdr.len = length;
+ return mb;
+}
+
+static void validate_loopback(struct mlx4_en_priv *priv, struct mbuf *mb)
+{
+ int i;
+ int offset = ETHER_HDR_LEN;
+
+ for (i = 0; i < MLX4_LOOPBACK_TEST_PAYLOAD; i++, offset++) {
+ if (*(mb->m_data + offset) != (unsigned char) (i & 0xff))
+ goto out_loopback;
+ }
+ /* Loopback found */
+ priv->loopback_ok = 1;
+
+out_loopback:
+ m_freem(mb);
+}
+
+int mlx4_en_process_rx_cq_mb(struct net_device *dev,
+ struct mlx4_en_cq *cq, int budget)
+{
+ struct mlx4_en_priv *priv = netdev_priv(dev);
+ struct mlx4_cqe *cqe;
+ struct mlx4_en_rx_ring *ring = &priv->rx_ring[cq->ring];
+ struct mlx4_en_rx_desc *rx_desc;
+ struct mbuf **pmb;
+ struct mbuf *mb;
+ int index;
+ unsigned int length;
+ int polled = 0;
+
+ if (!priv->port_up)
+ return 0;
+
+ /* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx
+ * descriptor offset can be deduced from the CQE index instead of
+ * reading 'cqe->index' */
+ index = cq->mcq.cons_index & ring->size_mask;
+ cqe = &cq->buf[index];
+
+ /* Process all completed CQEs */
+ while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK,
+ cq->mcq.cons_index & cq->size)) {
+
+ pmb = (struct mbuf **) ring->rx_info + index;
+ rx_desc = ring->buf + (index << ring->log_stride);
+
+ /*
+ * make sure we read the CQE after we read the ownership bit
+ */
+ rmb();
+
+ if (invalid_cqe(priv, cqe))
+ goto next;
+
+ /*
+ * Packet is OK - process it.
+ */
+ length = be32_to_cpu(cqe->byte_cnt);
+
+ mb = mlx4_en_get_rx_mb(priv, rx_desc, pmb, length);
+ if (unlikely(!mb)){
+ ring->errors++;
+ goto next;
+ }
+
+ ring->bytes += length;
+ ring->packets++;
+
+ if (unlikely(priv->validate_loopback)) {
+ validate_loopback(priv, mb);
+ goto next;
+ }
+ mb->m_pkthdr.flowid = cq->ring;
+ mb->m_flags |= M_FLOWID;
+ mb->m_pkthdr.rcvif = dev;
+ if (be32_to_cpu(cqe->vlan_my_qpn) &
+ MLX4_CQE_VLAN_PRESENT_MASK) {
+ mb->m_pkthdr.ether_vtag = be16_to_cpu(cqe->sl_vid);
+ mb->m_flags |= M_VLANTAG;
+ }
+
+ if (likely(priv->rx_csum && cqe->checksum == 0xffff)) {
+ priv->port_stats.rx_chksum_good++;
+ mb->m_pkthdr.csum_flags =
+ CSUM_IP_CHECKED | CSUM_IP_VALID |
+ CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
+ mb->m_pkthdr.csum_data = htons(0xffff);
+ } else {
+ priv->port_stats.rx_chksum_none++;
+ mb->m_pkthdr.csum_flags = 0;
+ if (priv->mdev->profile.ip_reasm &&
+ cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPV4) &&
+ !mlx4_en_rx_frags(priv, ring, mb, cqe))
+ goto next;
+ }
+ /* Push it up the stack */
+ dev->if_input(dev, mb);
+
+next:
+ ++cq->mcq.cons_index;
+ index = (cq->mcq.cons_index) & ring->size_mask;
+ cqe = &cq->buf[index];
+ if (++polled == budget)
+ break;
+ }
+
+ /* If CQ is empty, flush all pending IP reassembly sessions */
+ mlx4_en_flush_frags(priv, ring);
+
+ AVG_PERF_COUNTER(priv->pstats.rx_coal_avg, polled);
+ mlx4_cq_set_ci(&cq->mcq);
+ wmb(); /* ensure HW sees CQ consumer before we post new buffers */
+ ring->cons = cq->mcq.cons_index;
+ ring->prod += polled; /* Polled descriptors were realocated in place */
+ mlx4_en_update_rx_prod_db(ring);
+ return polled;
+}
+
+int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int budget)
+{
+ struct mlx4_en_priv *priv = netdev_priv(dev);
+ struct mlx4_cqe *cqe;
+ struct mlx4_en_rx_ring *ring = &priv->rx_ring[cq->ring];
+ struct mbuf **mb_list;
+ struct mlx4_en_rx_desc *rx_desc;
+ struct mbuf *mb;
+ struct lro_entry *queued;
+ int index;
+ unsigned int length;
+ int polled = 0;
+
+ if (!priv->port_up)
+ return 0;
+
+ /* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx
+ * descriptor offset can be deduced from the CQE index instead of
+ * reading 'cqe->index' */
+ index = cq->mcq.cons_index & ring->size_mask;
+ cqe = &cq->buf[index];
+
+ /* Process all completed CQEs */
+ while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK,
+ cq->mcq.cons_index & cq->size)) {
+
+ mb_list = ring->rx_info + (index << priv->log_rx_info);
+ rx_desc = ring->buf + (index << ring->log_stride);
+
+ /*
+ * make sure we read the CQE after we read the ownership bit
+ */
+ rmb();
+
+ if (invalid_cqe(priv, cqe))
+ goto next;
+
+ /*
+ * Packet is OK - process it.
+ */
+ length = be32_to_cpu(cqe->byte_cnt);
+ mb = mlx4_en_rx_mb(priv, rx_desc, mb_list, length);
+ if (!mb) {
+ ring->errors++;
+ goto next;
+ }
+
+ ring->bytes += length;
+ ring->packets++;
+
+ if (unlikely(priv->validate_loopback)) {
+ validate_loopback(priv, mb);
+ goto next;
+ }
+
+ mb->m_pkthdr.flowid = cq->ring;
+ mb->m_flags |= M_FLOWID;
+ mb->m_pkthdr.rcvif = dev;
+ if (be32_to_cpu(cqe->vlan_my_qpn) &
+ MLX4_CQE_VLAN_PRESENT_MASK) {
+ mb->m_pkthdr.ether_vtag = be16_to_cpu(cqe->sl_vid);
+ mb->m_flags |= M_VLANTAG;
+ }
+ if (likely(priv->rx_csum) &&
+ (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPOK)) &&
+ (cqe->checksum == cpu_to_be16(0xffff))) {
+ priv->port_stats.rx_chksum_good++;
+ mb->m_pkthdr.csum_flags =
+ CSUM_IP_CHECKED | CSUM_IP_VALID |
+ CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
+ mb->m_pkthdr.csum_data = htons(0xffff);
+ /* This packet is eligible for LRO if it is:
+ * - DIX Ethernet (type interpretation)
+ * - TCP/IP (v4)
+ * - without IP options
+ * - not an IP fragment
+ */
+ if (mlx4_en_can_lro(cqe->status) &&
+ (dev->if_capenable & IFCAP_LRO)) {
+ if (ring->lro.lro_cnt != 0 &&
+ tcp_lro_rx(&ring->lro, mb, 0) == 0)
+ goto next;
+ }
+
+ /* LRO not possible, complete processing here */
+ INC_PERF_COUNTER(priv->pstats.lro_misses);
+ } else {
+ mb->m_pkthdr.csum_flags = 0;
+ priv->port_stats.rx_chksum_none++;
+ }
+
+ /* Push it up the stack */
+ dev->if_input(dev, mb);
+
+next:
+ ++cq->mcq.cons_index;
+ index = (cq->mcq.cons_index) & ring->size_mask;
+ cqe = &cq->buf[index];
+ if (++polled == budget)
+ break;
+ }
+ while ((queued = SLIST_FIRST(&ring->lro.lro_active)) != NULL) {
+ SLIST_REMOVE_HEAD(&ring->lro.lro_active, next);
+ tcp_lro_flush(&ring->lro, queued);
+ }
+ AVG_PERF_COUNTER(priv->pstats.rx_coal_avg, polled);
+ mlx4_cq_set_ci(&cq->mcq);
+ wmb(); /* ensure HW sees CQ consumer before we post new buffers */
+ ring->cons = cq->mcq.cons_index;
+ ring->prod += polled; /* Polled descriptors were realocated in place */
+ mlx4_en_update_rx_prod_db(ring);
+ return polled;
+}
+
+
+/* Rx CQ polling - called by NAPI */
+static int mlx4_en_poll_rx_cq(struct mlx4_en_cq *cq, int budget)
+{
+ struct net_device *dev = cq->dev;
+ struct mlx4_en_priv *priv = netdev_priv(dev);
+ int done;
+
+ if (priv->rx_ring[cq->ring].use_frags)
+ done = mlx4_en_process_rx_cq(dev, cq, budget);
+ else
+ done = mlx4_en_process_rx_cq_mb(dev, cq, budget);
+
+ cq->tot_rx += done;
+
+ return done;
+}
+
+void mlx4_en_rx_que(void *context, int pending)
+{
+ struct mlx4_en_cq *cq;
+
+ cq = context;
+ while (mlx4_en_poll_rx_cq(cq, MLX4_EN_MAX_RX_POLL)
+ == MLX4_EN_MAX_RX_POLL);
+ mlx4_en_arm_cq(cq->dev->if_softc, cq);
+}
+
+void mlx4_en_rx_irq(struct mlx4_cq *mcq)
+{
+ struct mlx4_en_cq *cq = container_of(mcq, struct mlx4_en_cq, mcq);
+ struct mlx4_en_priv *priv = netdev_priv(cq->dev);
+ int done;
+
+ done = mlx4_en_poll_rx_cq(cq, MLX4_EN_MAX_RX_POLL);
+ if (done == MLX4_EN_MAX_RX_POLL)
+ taskqueue_enqueue(cq->tq, &cq->cq_task);
+ else
+ mlx4_en_arm_cq(priv, cq);
+}
+
+
+#if MLX4_EN_MAX_RX_FRAGS == 3
+static int frag_sizes[] = {
+ FRAG_SZ0,
+ FRAG_SZ1,
+ FRAG_SZ2,
+};
+#elif MLX4_EN_MAX_RX_FRAGS == 2
+static int frag_sizes[] = {
+ FRAG_SZ0,
+ FRAG_SZ1,
+};
+#else
+#error "Unknown MAX_RX_FRAGS"
+#endif
+
+void mlx4_en_calc_rx_buf(struct net_device *dev)
+{
+ struct mlx4_en_priv *priv = netdev_priv(dev);
+ int eff_mtu = dev->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + ETH_LLC_SNAP_SIZE;
+ int buf_size = 0;
+ int i, frag;
+
+ for (i = 0, frag = 0; buf_size < eff_mtu; frag++, i++) {
+ /*
+ * Allocate small to large but only as much as is needed for
+ * the tail.
+ */
+ while (i > 0 && eff_mtu - buf_size <= frag_sizes[i - 1])
+ i--;
+ priv->frag_info[frag].frag_size = frag_sizes[i];
+ priv->frag_info[frag].frag_prefix_size = buf_size;
+ buf_size += priv->frag_info[frag].frag_size;
+ }
+
+ priv->num_frags = frag;
+ /*
+ * For use_frags == 0 calculate the size extbuf we require.
+ */
+ if (eff_mtu <= MCLBYTES)
+ priv->rx_mb_size = MCLBYTES;
+ else if (eff_mtu <= MJUMPAGESIZE)
+ priv->rx_mb_size = MJUMPAGESIZE;
+ else if (eff_mtu <= MJUM9BYTES)
+ priv->rx_mb_size = MJUM9BYTES;
+ else
+ priv->rx_mb_size = MJUM16BYTES;
+ priv->log_rx_info =
+ ROUNDUP_LOG2(priv->num_frags * sizeof(struct mbuf *));
+
+ en_dbg(DRV, priv, "Rx buffer scatter-list (effective-mtu:%d "
+ "num_frags:%d):\n", eff_mtu, priv->num_frags);
+ for (i = 0; i < priv->num_frags; i++) {
+ en_dbg(DRV, priv, " frag:%d - size:%d prefix:%d\n", i,
+ priv->frag_info[i].frag_size,
+ priv->frag_info[i].frag_prefix_size)
+ }
+}
+
+/* RSS related functions */
+
+static int mlx4_en_config_rss_qp(struct mlx4_en_priv *priv, int qpn,
+ struct mlx4_en_rx_ring *ring,
+ enum mlx4_qp_state *state,
+ struct mlx4_qp *qp)
+{
+ struct mlx4_en_dev *mdev = priv->mdev;
+ struct mlx4_qp_context *context;
+ int err = 0;
+
+ context = kmalloc(sizeof *context , GFP_KERNEL);
+ if (!context) {
+ en_err(priv, "Failed to allocate qp context\n");
+ return -ENOMEM;
+ }
+
+ err = mlx4_qp_alloc(mdev->dev, qpn, qp);
+ if (err) {
+ en_err(priv, "Failed to allocate qp #%x\n", qpn);
+ goto out;
+ }
+ qp->event = mlx4_en_sqp_event;
+
+ memset(context, 0, sizeof *context);
+ mlx4_en_fill_qp_context(priv, ring->actual_size, ring->stride, 0, 0,
+ qpn, ring->cqn, context);
+ context->db_rec_addr = cpu_to_be64(ring->wqres.db.dma);
+
+ err = mlx4_qp_to_ready(mdev->dev, &ring->wqres.mtt, context, qp, state);
+ if (err) {
+ mlx4_qp_remove(mdev->dev, qp);
+ mlx4_qp_free(mdev->dev, qp);
+ }
+ mlx4_en_update_rx_prod_db(ring);
+out:
+ kfree(context);
+ return err;
+}
+
+/* Allocate rx qp's and configure them according to rss map */
+int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv)
+{
+ struct mlx4_en_dev *mdev = priv->mdev;
+ struct mlx4_en_rss_map *rss_map = &priv->rss_map;
+ struct mlx4_qp_context context;
+ struct mlx4_en_rss_context *rss_context;
+ void *ptr;
+ u8 rss_mask = (priv->udp_rings > 1) ? 0x3f : 0x14;
+ int i, qpn;
+ int err = 0;
+ int good_qps = 0;
+
+ en_dbg(DRV, priv, "Configuring rss steering\n");
+ err = mlx4_qp_reserve_range(mdev->dev, priv->rx_ring_num,
+ roundup_pow_of_two(priv->rx_ring_num),
+ &rss_map->base_qpn);
+ if (err) {
+ en_err(priv, "Failed reserving %d qps\n", priv->rx_ring_num);
+ return err;
+ }
+
+ for (i = 0; i < priv->rx_ring_num; i++) {
+ qpn = rss_map->base_qpn + i;
+ err = mlx4_en_config_rss_qp(priv, qpn,
+ &priv->rx_ring[i],
+ &rss_map->state[i],
+ &rss_map->qps[i]);
+ if (err)
+ goto rss_err;
+
+ ++good_qps;
+ }
+
+ /* Configure RSS indirection qp */
+ err = mlx4_qp_reserve_range(mdev->dev, 1, 1, &priv->base_qpn);
+ if (err) {
+ en_err(priv, "Failed to reserve range for RSS "
+ "indirection qp\n");
+ goto rss_err;
+ }
+ err = mlx4_qp_alloc(mdev->dev, priv->base_qpn, &rss_map->indir_qp);
+ if (err) {
+ en_err(priv, "Failed to allocate RSS indirection QP\n");
+ goto reserve_err;
+ }
+ rss_map->indir_qp.event = mlx4_en_sqp_event;
+ mlx4_en_fill_qp_context(priv, 0, 0, 0, 1, priv->base_qpn,
+ priv->rx_ring[0].cqn, &context);
+
+ ptr = ((void *) &context) + 0x3c;
+ rss_context = (struct mlx4_en_rss_context *) ptr;
+ rss_context->base_qpn = cpu_to_be32(ilog2(priv->rx_ring_num - priv->udp_rings) << 24 |
+ (rss_map->base_qpn));
+ rss_context->default_qpn = cpu_to_be32(rss_map->base_qpn +
+ priv->rx_ring_num -
+ priv->udp_rings);
+ rss_context->flags = rss_mask;
+ if (priv->udp_rings > 1)
+ rss_context->base_qpn_udp = rss_context->default_qpn;
+
+ err = mlx4_qp_to_ready(mdev->dev, &priv->res.mtt, &context,
+ &rss_map->indir_qp, &rss_map->indir_state);
+ if (err)
+ goto indir_err;
+
+ return 0;
+
+indir_err:
+ mlx4_qp_modify(mdev->dev, NULL, rss_map->indir_state,
+ MLX4_QP_STATE_RST, NULL, 0, 0, &rss_map->indir_qp);
+ mlx4_qp_remove(mdev->dev, &rss_map->indir_qp);
+ mlx4_qp_free(mdev->dev, &rss_map->indir_qp);
+reserve_err:
+ mlx4_qp_release_range(mdev->dev, priv->base_qpn, 1);
+rss_err:
+ for (i = 0; i < good_qps; i++) {
+ mlx4_qp_modify(mdev->dev, NULL, rss_map->state[i],
+ MLX4_QP_STATE_RST, NULL, 0, 0, &rss_map->qps[i]);
+ mlx4_qp_remove(mdev->dev, &rss_map->qps[i]);
+ mlx4_qp_free(mdev->dev, &rss_map->qps[i]);
+ }
+ mlx4_qp_release_range(mdev->dev, rss_map->base_qpn, priv->rx_ring_num);
+ return err;
+}
+
+void mlx4_en_release_rss_steer(struct mlx4_en_priv *priv)
+{
+ struct mlx4_en_dev *mdev = priv->mdev;
+ struct mlx4_en_rss_map *rss_map = &priv->rss_map;
+ int i;
+
+ mlx4_qp_modify(mdev->dev, NULL, rss_map->indir_state,
+ MLX4_QP_STATE_RST, NULL, 0, 0, &rss_map->indir_qp);
+ mlx4_qp_remove(mdev->dev, &rss_map->indir_qp);
+ mlx4_qp_free(mdev->dev, &rss_map->indir_qp);
+ mlx4_qp_release_range(mdev->dev, priv->base_qpn, 1);
+
+ for (i = 0; i < priv->rx_ring_num; i++) {
+ mlx4_qp_modify(mdev->dev, NULL, rss_map->state[i],
+ MLX4_QP_STATE_RST, NULL, 0, 0, &rss_map->qps[i]);
+ mlx4_qp_remove(mdev->dev, &rss_map->qps[i]);
+ mlx4_qp_free(mdev->dev, &rss_map->qps[i]);
+ }
+ mlx4_qp_release_range(mdev->dev, rss_map->base_qpn, priv->rx_ring_num);
+}
diff --git a/sys/ofed/drivers/net/mlx4/en_selftest.c b/sys/ofed/drivers/net/mlx4/en_selftest.c
new file mode 100644
index 0000000..0e62027
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/en_selftest.c
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include "mlx4_en.h"
+
+#include <linux/kernel.h>
+#include <linux/ethtool.h>
+#include <linux/netdevice.h>
+#include <linux/delay.h>
+#include <linux/mlx4/driver.h>
+
+
+static int mlx4_en_test_registers(struct mlx4_en_priv *priv)
+{
+ return mlx4_cmd(priv->mdev->dev, 0, 0, 0, MLX4_CMD_HW_HEALTH_CHECK,
+ MLX4_CMD_TIME_CLASS_A);
+}
+
+static int mlx4_en_test_loopback_xmit(struct mlx4_en_priv *priv)
+{
+ struct mbuf *mb;
+ struct ethhdr *ethh;
+ unsigned char *packet;
+ unsigned int packet_size = MLX4_LOOPBACK_TEST_PAYLOAD;
+ unsigned int i;
+ int err;
+
+
+ /* build the pkt before xmit */
+ mb = netdev_alloc_mb(priv->dev, MLX4_LOOPBACK_TEST_PAYLOAD + ETH_HLEN + NET_IP_ALIGN);
+ if (!mb) {
+ en_err(priv, "-LOOPBACK_TEST_XMIT- failed to create mb for xmit\n");
+ return -ENOMEM;
+ }
+ mb_reserve(mb, NET_IP_ALIGN);
+
+ ethh = (struct ethhdr *)mb_put(mb, sizeof(struct ethhdr));
+ packet = (unsigned char *)mb_put(mb, packet_size);
+ memcpy(ethh->h_dest, priv->dev->dev_addr, ETH_ALEN);
+ memset(ethh->h_source, 0, ETH_ALEN);
+ ethh->h_proto = htons(ETH_P_ARP);
+ mb_set_mac_header(mb, 0);
+ for (i = 0; i < packet_size; ++i) /* fill our packet */
+ packet[i] = (unsigned char)(i & 0xff);
+
+ /* xmit the pkt */
+ err = mlx4_en_xmit(mb, priv->dev);
+ return err;
+}
+
+static int mlx4_en_test_loopback(struct mlx4_en_priv *priv)
+{
+ u32 loopback_ok = 0;
+ int i;
+
+
+ priv->loopback_ok = 0;
+ priv->validate_loopback = 1;
+
+ /* xmit */
+ if (mlx4_en_test_loopback_xmit(priv)) {
+ en_err(priv, "Transmitting loopback packet failed\n");
+ goto mlx4_en_test_loopback_exit;
+ }
+
+ /* polling for result */
+ for (i = 0; i < MLX4_EN_LOOPBACK_RETRIES; ++i) {
+ msleep(MLX4_EN_LOOPBACK_TIMEOUT);
+ if (priv->loopback_ok) {
+ loopback_ok = 1;
+ break;
+ }
+ }
+ if (!loopback_ok)
+ en_err(priv, "Loopback packet didn't arrive\n");
+
+mlx4_en_test_loopback_exit:
+
+ priv->validate_loopback = 0;
+ return (!loopback_ok);
+}
+
+
+static int mlx4_en_test_link(struct mlx4_en_priv *priv)
+{
+ if (mlx4_en_QUERY_PORT(priv->mdev, priv->port))
+ return -ENOMEM;
+ if (priv->port_state.link_state == 1)
+ return 0;
+ else
+ return 1;
+}
+
+static int mlx4_en_test_speed(struct mlx4_en_priv *priv)
+{
+
+ if (mlx4_en_QUERY_PORT(priv->mdev, priv->port))
+ return -ENOMEM;
+
+ /* The device currently only supports 10G speed */
+ if (priv->port_state.link_speed != SPEED_10000)
+ return priv->port_state.link_speed;
+ return 0;
+}
+
+
+void mlx4_en_ex_selftest(struct net_device *dev, u32 *flags, u64 *buf)
+{
+ struct mlx4_en_priv *priv = netdev_priv(dev);
+ struct mlx4_en_dev *mdev = priv->mdev;
+ struct mlx4_en_tx_ring *tx_ring;
+ int i, carrier_ok;
+
+ memset(buf, 0, sizeof(u64) * MLX4_EN_NUM_SELF_TEST);
+
+ if (*flags & ETH_TEST_FL_OFFLINE) {
+ /* disable the interface */
+ carrier_ok = netif_carrier_ok(dev);
+
+ netif_carrier_off(dev);
+retry_tx:
+ /* Wait untill all tx queues are empty.
+ * there should not be any additional incoming traffic
+ * since we turned the carrier off */
+ msleep(200);
+ for (i = 0; i < priv->tx_ring_num && carrier_ok; i++) {
+ tx_ring = &priv->tx_ring[i];
+ if (tx_ring->prod != (tx_ring->cons + tx_ring->last_nr_txbb))
+ goto retry_tx;
+ }
+
+ if (priv->mdev->dev->caps.loopback_support){
+ buf[3] = mlx4_en_test_registers(priv);
+ buf[4] = mlx4_en_test_loopback(priv);
+ }
+
+ if (carrier_ok)
+ netif_carrier_on(dev);
+
+ }
+ buf[0] = mlx4_test_interrupts(mdev->dev);
+ buf[1] = mlx4_en_test_link(priv);
+ buf[2] = mlx4_en_test_speed(priv);
+
+ for (i = 0; i < MLX4_EN_NUM_SELF_TEST; i++) {
+ if (buf[i])
+ *flags |= ETH_TEST_FL_FAILED;
+ }
+}
diff --git a/sys/ofed/drivers/net/mlx4/en_tx.c b/sys/ofed/drivers/net/mlx4/en_tx.c
new file mode 100644
index 0000000..cd45f9d
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/en_tx.c
@@ -0,0 +1,1035 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include "mlx4_en.h"
+
+#include <linux/mlx4/cq.h>
+#include <linux/mlx4/qp.h>
+#include <linux/vmalloc.h>
+
+#include <net/ethernet.h>
+#include <net/if_vlan_var.h>
+#include <sys/mbuf.h>
+
+#include <netinet/in_systm.h>
+#include <netinet/in.h>
+#include <netinet/if_ether.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_lro.h>
+#include <netinet/udp.h>
+
+enum {
+ MAX_INLINE = 104, /* 128 - 16 - 4 - 4 */
+ MAX_BF = 256,
+};
+
+static int inline_thold = MAX_INLINE;
+
+module_param_named(inline_thold, inline_thold, int, 0444);
+MODULE_PARM_DESC(inline_thold, "treshold for using inline data");
+
+int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
+ struct mlx4_en_tx_ring *ring, u32 size,
+ u16 stride)
+{
+ struct mlx4_en_dev *mdev = priv->mdev;
+ int tmp;
+ int err;
+
+ ring->size = size;
+ ring->size_mask = size - 1;
+ ring->stride = stride;
+
+ inline_thold = min(inline_thold, MAX_INLINE);
+
+ mtx_init(&ring->tx_lock.m, "mlx4 tx", NULL, MTX_DEF);
+ mtx_init(&ring->comp_lock.m, "mlx4 comp", NULL, MTX_DEF);
+
+ /* Allocate the buf ring */
+ ring->br = buf_ring_alloc(MLX4_EN_DEF_TX_QUEUE_SIZE, M_DEVBUF,
+ M_WAITOK, &ring->tx_lock.m);
+ if (ring->br == NULL) {
+ en_err(priv, "Failed allocating tx_info ring\n");
+ return -ENOMEM;
+ }
+
+ tmp = size * sizeof(struct mlx4_en_tx_info);
+ ring->tx_info = kmalloc(tmp, GFP_KERNEL);
+ if (!ring->tx_info) {
+ en_err(priv, "Failed allocating tx_info ring\n");
+ err = -ENOMEM;
+ goto err_tx;
+ }
+ en_dbg(DRV, priv, "Allocated tx_info ring at addr:%p size:%d\n",
+ ring->tx_info, tmp);
+
+ ring->bounce_buf = kmalloc(MAX_DESC_SIZE, GFP_KERNEL);
+ if (!ring->bounce_buf) {
+ en_err(priv, "Failed allocating bounce buffer\n");
+ err = -ENOMEM;
+ goto err_tx;
+ }
+ ring->buf_size = ALIGN(size * ring->stride, MLX4_EN_PAGE_SIZE);
+
+ err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres, ring->buf_size,
+ 2 * PAGE_SIZE);
+ if (err) {
+ en_err(priv, "Failed allocating hwq resources\n");
+ goto err_bounce;
+ }
+
+ err = mlx4_en_map_buffer(&ring->wqres.buf);
+ if (err) {
+ en_err(priv, "Failed to map TX buffer\n");
+ goto err_hwq_res;
+ }
+
+ ring->buf = ring->wqres.buf.direct.buf;
+
+ en_dbg(DRV, priv, "Allocated TX ring (addr:%p) - buf:%p size:%d "
+ "buf_size:%d dma:%llx\n", ring, ring->buf, ring->size,
+ ring->buf_size, (unsigned long long) ring->wqres.buf.direct.map);
+
+ err = mlx4_qp_reserve_range(mdev->dev, 1, 256, &ring->qpn);
+ if (err) {
+ en_err(priv, "Failed reserving qp for tx ring.\n");
+ goto err_map;
+ }
+
+ err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->qp);
+ if (err) {
+ en_err(priv, "Failed allocating qp %d\n", ring->qpn);
+ goto err_reserve;
+ }
+ ring->qp.event = mlx4_en_sqp_event;
+
+ err = mlx4_bf_alloc(mdev->dev, &ring->bf);
+ if (err) {
+ ring->bf.uar = &mdev->priv_uar;
+ ring->bf.uar->map = mdev->uar_map;
+ ring->bf_enabled = false;
+ } else
+ ring->bf_enabled = true;
+
+ return 0;
+
+err_reserve:
+ mlx4_qp_release_range(mdev->dev, ring->qpn, 1);
+err_map:
+ mlx4_en_unmap_buffer(&ring->wqres.buf);
+err_hwq_res:
+ mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
+err_bounce:
+ kfree(ring->bounce_buf);
+ ring->bounce_buf = NULL;
+err_tx:
+ buf_ring_free(ring->br, M_DEVBUF);
+ kfree(ring->tx_info);
+ ring->tx_info = NULL;
+ return err;
+}
+
+void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv,
+ struct mlx4_en_tx_ring *ring)
+{
+ struct mlx4_en_dev *mdev = priv->mdev;
+ en_dbg(DRV, priv, "Destroying tx ring, qpn: %d\n", ring->qpn);
+
+ buf_ring_free(ring->br, M_DEVBUF);
+ if (ring->bf_enabled)
+ mlx4_bf_free(mdev->dev, &ring->bf);
+ mlx4_qp_remove(mdev->dev, &ring->qp);
+ mlx4_qp_free(mdev->dev, &ring->qp);
+ mlx4_qp_release_range(mdev->dev, ring->qpn, 1);
+ mlx4_en_unmap_buffer(&ring->wqres.buf);
+ mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
+ kfree(ring->bounce_buf);
+ ring->bounce_buf = NULL;
+ kfree(ring->tx_info);
+ ring->tx_info = NULL;
+ mtx_destroy(&ring->tx_lock.m);
+ mtx_destroy(&ring->comp_lock.m);
+}
+
+int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv,
+ struct mlx4_en_tx_ring *ring,
+ int cq)
+{
+ struct mlx4_en_dev *mdev = priv->mdev;
+ int err;
+
+ ring->cqn = cq;
+ ring->prod = 0;
+ ring->cons = 0xffffffff;
+ ring->last_nr_txbb = 1;
+ ring->poll_cnt = 0;
+ ring->blocked = 0;
+ memset(ring->tx_info, 0, ring->size * sizeof(struct mlx4_en_tx_info));
+ memset(ring->buf, 0, ring->buf_size);
+
+ ring->qp_state = MLX4_QP_STATE_RST;
+ ring->doorbell_qpn = swab32(ring->qp.qpn << 8);
+
+ mlx4_en_fill_qp_context(priv, ring->size, ring->stride, 1, 0, ring->qpn,
+ ring->cqn, &ring->context);
+ if (ring->bf_enabled)
+ ring->context.usr_page = cpu_to_be32(ring->bf.uar->index);
+
+ err = mlx4_qp_to_ready(mdev->dev, &ring->wqres.mtt, &ring->context,
+ &ring->qp, &ring->qp_state);
+
+ return err;
+}
+
+void mlx4_en_deactivate_tx_ring(struct mlx4_en_priv *priv,
+ struct mlx4_en_tx_ring *ring)
+{
+ struct mlx4_en_dev *mdev = priv->mdev;
+
+ mlx4_qp_modify(mdev->dev, NULL, ring->qp_state,
+ MLX4_QP_STATE_RST, NULL, 0, 0, &ring->qp);
+}
+
+
+static u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
+ struct mlx4_en_tx_ring *ring,
+ int index, u8 owner)
+{
+ struct mlx4_en_dev *mdev = priv->mdev;
+ struct mlx4_en_tx_info *tx_info = &ring->tx_info[index];
+ struct mlx4_en_tx_desc *tx_desc = ring->buf + index * TXBB_SIZE;
+ struct mlx4_wqe_data_seg *data = (void *) tx_desc + tx_info->data_offset;
+ struct mbuf *mb = tx_info->mb;
+ void *end = ring->buf + ring->buf_size;
+ int frags = tx_info->nr_segs;
+ int i;
+ __be32 *ptr = (__be32 *)tx_desc;
+ __be32 stamp = cpu_to_be32(STAMP_VAL | (!!owner << STAMP_SHIFT));
+
+ /* Optimize the common case when there are no wraparounds */
+ if (likely((void *) tx_desc + tx_info->nr_txbb * TXBB_SIZE <= end)) {
+ if (!tx_info->inl) {
+ for (i = 0; i < frags; i++) {
+ pci_unmap_single(mdev->pdev,
+ (dma_addr_t) be64_to_cpu(data[i].addr),
+ data[i].byte_count, PCI_DMA_TODEVICE);
+ }
+ }
+ /* Stamp the freed descriptor */
+ for (i = 0; i < tx_info->nr_txbb * TXBB_SIZE; i += STAMP_STRIDE) {
+ *ptr = stamp;
+ ptr += STAMP_DWORDS;
+ }
+
+ } else {
+ if (!tx_info->inl) {
+ for (i = 0; i < frags; i++) {
+ /* Check for wraparound before unmapping */
+ if ((void *) data >= end)
+ data = (struct mlx4_wqe_data_seg *) ring->buf;
+ pci_unmap_single(mdev->pdev,
+ (dma_addr_t) be64_to_cpu(data->addr),
+ data->byte_count, PCI_DMA_TODEVICE);
+ ++data;
+ }
+ }
+ /* Stamp the freed descriptor */
+ for (i = 0; i < tx_info->nr_txbb * TXBB_SIZE; i += STAMP_STRIDE) {
+ *ptr = stamp;
+ ptr += STAMP_DWORDS;
+ if ((void *) ptr >= end) {
+ ptr = ring->buf;
+ stamp ^= cpu_to_be32(0x80000000);
+ }
+ }
+
+ }
+ m_freem(mb);
+ return tx_info->nr_txbb;
+}
+
+
+int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring)
+{
+ struct mlx4_en_priv *priv = netdev_priv(dev);
+ int cnt = 0;
+
+ /* Skip last polled descriptor */
+ ring->cons += ring->last_nr_txbb;
+ en_dbg(DRV, priv, "Freeing Tx buf - cons:0x%x prod:0x%x\n",
+ ring->cons, ring->prod);
+
+ if ((u32) (ring->prod - ring->cons) > ring->size) {
+ en_warn(priv, "Tx consumer passed producer!\n");
+ return 0;
+ }
+
+ while (ring->cons != ring->prod) {
+ ring->last_nr_txbb = mlx4_en_free_tx_desc(priv, ring,
+ ring->cons & ring->size_mask,
+ !!(ring->cons & ring->size));
+ ring->cons += ring->last_nr_txbb;
+ cnt++;
+ }
+
+ if (cnt)
+ en_dbg(DRV, priv, "Freed %d uncompleted tx descriptors\n", cnt);
+
+ return cnt;
+}
+
+void mlx4_en_set_prio_map(struct mlx4_en_priv *priv, u16 *prio_map, u32 ring_num)
+{
+ int block = 8 / ring_num;
+ int extra = 8 - (block * ring_num);
+ int num = 0;
+ u16 ring = 1;
+ int prio;
+
+ if (ring_num == 1) {
+ for (prio = 0; prio < 8; prio++)
+ prio_map[prio] = 0;
+ return;
+ }
+
+ for (prio = 0; prio < 8; prio++) {
+ if (extra && (num == block + 1)) {
+ ring++;
+ num = 0;
+ extra--;
+ } else if (!extra && (num == block)) {
+ ring++;
+ num = 0;
+ }
+ prio_map[prio] = ring;
+ en_dbg(DRV, priv, " prio:%d --> ring:%d\n", prio, ring);
+ num++;
+ }
+}
+
+static void mlx4_en_process_tx_cq(struct net_device *dev, struct mlx4_en_cq *cq)
+{
+ struct mlx4_en_priv *priv = netdev_priv(dev);
+ struct mlx4_cq *mcq = &cq->mcq;
+ struct mlx4_en_tx_ring *ring = &priv->tx_ring[cq->ring];
+ struct mlx4_cqe *cqe = cq->buf;
+ u16 index;
+ u16 new_index;
+ u32 txbbs_skipped = 0;
+ u32 cq_last_sav;
+
+ /* index always points to the first TXBB of the last polled descriptor */
+ index = ring->cons & ring->size_mask;
+ new_index = be16_to_cpu(cqe->wqe_index) & ring->size_mask;
+ if (index == new_index)
+ return;
+
+ if (!priv->port_up)
+ return;
+
+ /*
+ * We use a two-stage loop:
+ * - the first samples the HW-updated CQE
+ * - the second frees TXBBs until the last sample
+ * This lets us amortize CQE cache misses, while still polling the CQ
+ * until is quiescent.
+ */
+ cq_last_sav = mcq->cons_index;
+ do {
+ do {
+ /* Skip over last polled CQE */
+ index = (index + ring->last_nr_txbb) & ring->size_mask;
+ txbbs_skipped += ring->last_nr_txbb;
+
+ /* Poll next CQE */
+ ring->last_nr_txbb = mlx4_en_free_tx_desc(
+ priv, ring, index,
+ !!((ring->cons + txbbs_skipped) &
+ ring->size));
+ ++mcq->cons_index;
+
+ } while (index != new_index);
+
+ new_index = be16_to_cpu(cqe->wqe_index) & ring->size_mask;
+ } while (index != new_index);
+ AVG_PERF_COUNTER(priv->pstats.tx_coal_avg,
+ (u32) (mcq->cons_index - cq_last_sav));
+
+ /*
+ * To prevent CQ overflow we first update CQ consumer and only then
+ * the ring consumer.
+ */
+ mlx4_cq_set_ci(mcq);
+ wmb();
+ ring->cons += txbbs_skipped;
+
+ /* Wakeup Tx queue if this ring stopped it */
+ if (unlikely(ring->blocked)) {
+ if ((u32) (ring->prod - ring->cons) <=
+ ring->size - HEADROOM - MAX_DESC_TXBBS) {
+ ring->blocked = 0;
+ if (atomic_fetchadd_int(&priv->blocked, -1) == 1)
+ atomic_clear_int(&dev->if_drv_flags,
+ IFF_DRV_OACTIVE);
+ priv->port_stats.wake_queue++;
+ }
+ }
+}
+
+void mlx4_en_tx_irq(struct mlx4_cq *mcq)
+{
+ struct mlx4_en_cq *cq = container_of(mcq, struct mlx4_en_cq, mcq);
+ struct mlx4_en_priv *priv = netdev_priv(cq->dev);
+ struct mlx4_en_tx_ring *ring = &priv->tx_ring[cq->ring];
+
+ if (!spin_trylock(&ring->comp_lock))
+ return;
+ mlx4_en_process_tx_cq(cq->dev, cq);
+ mod_timer(&cq->timer, jiffies + 1);
+ spin_unlock(&ring->comp_lock);
+}
+
+
+void mlx4_en_poll_tx_cq(unsigned long data)
+{
+ struct mlx4_en_cq *cq = (struct mlx4_en_cq *) data;
+ struct mlx4_en_priv *priv = netdev_priv(cq->dev);
+ struct mlx4_en_tx_ring *ring = &priv->tx_ring[cq->ring];
+ u32 inflight;
+
+ INC_PERF_COUNTER(priv->pstats.tx_poll);
+
+ if (!spin_trylock(&ring->comp_lock)) {
+ mod_timer(&cq->timer, jiffies + MLX4_EN_TX_POLL_TIMEOUT);
+ return;
+ }
+ mlx4_en_process_tx_cq(cq->dev, cq);
+ inflight = (u32) (ring->prod - ring->cons - ring->last_nr_txbb);
+
+ /* If there are still packets in flight and the timer has not already
+ * been scheduled by the Tx routine then schedule it here to guarantee
+ * completion processing of these packets */
+ if (inflight && priv->port_up)
+ mod_timer(&cq->timer, jiffies + MLX4_EN_TX_POLL_TIMEOUT);
+
+ spin_unlock(&ring->comp_lock);
+}
+
+static struct mlx4_en_tx_desc *mlx4_en_bounce_to_desc(struct mlx4_en_priv *priv,
+ struct mlx4_en_tx_ring *ring,
+ u32 index,
+ unsigned int desc_size)
+{
+ u32 copy = (ring->size - index) * TXBB_SIZE;
+ int i;
+
+ for (i = desc_size - copy - 4; i >= 0; i -= 4) {
+ if ((i & (TXBB_SIZE - 1)) == 0)
+ wmb();
+
+ *((u32 *) (ring->buf + i)) =
+ *((u32 *) (ring->bounce_buf + copy + i));
+ }
+
+ for (i = copy - 4; i >= 4 ; i -= 4) {
+ if ((i & (TXBB_SIZE - 1)) == 0)
+ wmb();
+
+ *((u32 *) (ring->buf + index * TXBB_SIZE + i)) =
+ *((u32 *) (ring->bounce_buf + i));
+ }
+
+ /* Return real descriptor location */
+ return ring->buf + index * TXBB_SIZE;
+}
+
+static inline void mlx4_en_xmit_poll(struct mlx4_en_priv *priv, int tx_ind)
+{
+ struct mlx4_en_cq *cq = &priv->tx_cq[tx_ind];
+ struct mlx4_en_tx_ring *ring = &priv->tx_ring[tx_ind];
+
+ /* If we don't have a pending timer, set one up to catch our recent
+ post in case the interface becomes idle */
+ if (!timer_pending(&cq->timer))
+ mod_timer(&cq->timer, jiffies + MLX4_EN_TX_POLL_TIMEOUT);
+
+ /* Poll the CQ every mlx4_en_TX_MODER_POLL packets */
+ if ((++ring->poll_cnt & (MLX4_EN_TX_POLL_MODER - 1)) == 0)
+ if (spin_trylock(&ring->comp_lock)) {
+ mlx4_en_process_tx_cq(priv->dev, cq);
+ spin_unlock(&ring->comp_lock);
+ }
+}
+
+static int is_inline(struct mbuf *mb)
+{
+
+ if (inline_thold && mb->m_pkthdr.len <= inline_thold &&
+ (mb->m_pkthdr.csum_flags & CSUM_TSO) == 0)
+ return 1;
+
+ return 0;
+}
+
+static int inline_size(struct mbuf *mb)
+{
+ int len;
+
+ len = mb->m_pkthdr.len;
+ if (len + CTRL_SIZE + sizeof(struct mlx4_wqe_inline_seg)
+ <= MLX4_INLINE_ALIGN)
+ return ALIGN(len + CTRL_SIZE +
+ sizeof(struct mlx4_wqe_inline_seg), 16);
+ else
+ return ALIGN(len + CTRL_SIZE + 2 *
+ sizeof(struct mlx4_wqe_inline_seg), 16);
+}
+
+static int get_head_size(struct mbuf *mb)
+{
+ struct tcphdr *th;
+ struct ip *ip;
+ int ip_hlen, tcp_hlen;
+ int len;
+
+ len = ETHER_HDR_LEN;
+ if (mb->m_len < len + sizeof(struct ip))
+ return (0);
+ ip = (struct ip *)(mtod(mb, char *) + len);
+ if (ip->ip_p != IPPROTO_TCP)
+ return (0);
+ ip_hlen = ip->ip_hl << 2;
+ len += ip_hlen;
+ if (mb->m_len < len + sizeof(struct tcphdr))
+ return (0);
+ th = (struct tcphdr *)(mtod(mb, char *) + len);
+ tcp_hlen = th->th_off << 2;
+ len += tcp_hlen;
+ if (mb->m_len < len)
+ return (0);
+ return (len);
+}
+
+static int get_real_size(struct mbuf *mb, struct net_device *dev, int *segsp,
+ int *lso_header_size)
+{
+ struct mbuf *m;
+ int nr_segs;
+
+ nr_segs = 0;
+ for (m = mb; m != NULL; m = m->m_next)
+ if (m->m_len)
+ nr_segs++;
+
+ if (mb->m_pkthdr.csum_flags & CSUM_TSO) {
+ *lso_header_size = get_head_size(mb);
+ if (*lso_header_size) {
+ if (mb->m_len == *lso_header_size)
+ nr_segs--;
+ *segsp = nr_segs;
+ return CTRL_SIZE + nr_segs * DS_SIZE +
+ ALIGN(*lso_header_size + 4, DS_SIZE);
+ }
+ } else
+ *lso_header_size = 0;
+ *segsp = nr_segs;
+ if (is_inline(mb))
+ return inline_size(mb);
+ return (CTRL_SIZE + nr_segs * DS_SIZE);
+}
+
+static struct mbuf *mb_copy(struct mbuf *mb, int *offp, char *data, int len)
+{
+ int bytes;
+ int off;
+
+ off = *offp;
+ while (len) {
+ bytes = min(mb->m_len - off, len);
+ if (bytes)
+ memcpy(data, mb->m_data + off, bytes);
+ len -= bytes;
+ data += bytes;
+ off += bytes;
+ if (off == mb->m_len) {
+ off = 0;
+ mb = mb->m_next;
+ }
+ }
+ *offp = off;
+ return (mb);
+}
+
+static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc, struct mbuf *mb,
+ int real_size, u16 *vlan_tag, int tx_ind)
+{
+ struct mlx4_wqe_inline_seg *inl = &tx_desc->inl;
+ int spc = MLX4_INLINE_ALIGN - CTRL_SIZE - sizeof *inl;
+ int len;
+ int off;
+
+ off = 0;
+ len = mb->m_pkthdr.len;
+ if (len <= spc) {
+ inl->byte_count = cpu_to_be32(1 << 31 | len);
+ mb_copy(mb, &off, (void *)(inl + 1), len);
+ } else {
+ inl->byte_count = cpu_to_be32(1 << 31 | spc);
+ mb = mb_copy(mb, &off, (void *)(inl + 1), spc);
+ inl = (void *) (inl + 1) + spc;
+ mb_copy(mb, &off, (void *)(inl + 1), len - spc);
+ wmb();
+ inl->byte_count = cpu_to_be32(1 << 31 | (len - spc));
+ }
+ tx_desc->ctrl.vlan_tag = cpu_to_be16(*vlan_tag);
+ tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN * !!(*vlan_tag);
+ tx_desc->ctrl.fence_size = (real_size / 16) & 0x3f;
+}
+
+u16 mlx4_en_select_queue(struct net_device *dev, struct mbuf *mb)
+{
+ struct mlx4_en_priv *priv = netdev_priv(dev);
+ struct mlx4_en_tx_hash_entry *entry;
+ struct ether_header *eth;
+ struct tcphdr *th;
+ struct ip *iph;
+ u32 hash_index;
+ int tx_ind = 0;
+ u16 vlan_tag = 0;
+ int len;
+
+ /* Obtain VLAN information if present */
+ if (mb->m_flags & M_VLANTAG) {
+ vlan_tag = mb->m_pkthdr.ether_vtag;
+ /* Set the Tx ring to use according to vlan priority */
+ tx_ind = priv->tx_prio_map[vlan_tag >> 13];
+ if (tx_ind)
+ return tx_ind;
+ }
+ if (mb->m_len <
+ ETHER_HDR_LEN + sizeof(struct ip) + sizeof(struct tcphdr))
+ return MLX4_EN_NUM_HASH_RINGS;
+ eth = mtod(mb, struct ether_header *);
+ /* Hashing is only done for TCP/IP or UDP/IP packets */
+ if (be16_to_cpu(eth->ether_type) != ETHERTYPE_IP)
+ return MLX4_EN_NUM_HASH_RINGS;
+ len = ETHER_HDR_LEN;
+ iph = (struct ip *)(mtod(mb, char *) + len);
+ len += iph->ip_hl << 2;
+ th = (struct tcphdr *)(mtod(mb, char *) + len);
+ hash_index = be32_to_cpu(iph->ip_dst.s_addr) & MLX4_EN_TX_HASH_MASK;
+ switch(iph->ip_p) {
+ case IPPROTO_UDP:
+ break;
+ case IPPROTO_TCP:
+ if (mb->m_len < len + sizeof(struct tcphdr))
+ return MLX4_EN_NUM_HASH_RINGS;
+ hash_index =
+ (hash_index ^ be16_to_cpu(th->th_dport ^ th->th_sport)) &
+ MLX4_EN_TX_HASH_MASK;
+ break;
+ default:
+ return MLX4_EN_NUM_HASH_RINGS;
+ }
+
+ entry = &priv->tx_hash[hash_index];
+ if(unlikely(!entry->cnt)) {
+ tx_ind = hash_index & (MLX4_EN_NUM_HASH_RINGS / 2 - 1);
+ if (2 * entry->small_pkts > entry->big_pkts)
+ tx_ind += MLX4_EN_NUM_HASH_RINGS / 2;
+ entry->small_pkts = entry->big_pkts = 0;
+ entry->ring = tx_ind;
+ }
+
+ entry->cnt++;
+ if (mb->m_pkthdr.len > MLX4_EN_SMALL_PKT_SIZE)
+ entry->big_pkts++;
+ else
+ entry->small_pkts++;
+ return entry->ring;
+}
+
+static void mlx4_bf_copy(unsigned long *dst, unsigned long *src, unsigned bytecnt)
+{
+ __iowrite64_copy(dst, src, bytecnt / 8);
+}
+
+static int mlx4_en_xmit(struct net_device *dev, int tx_ind, struct mbuf **mbp)
+{
+ struct mlx4_en_priv *priv = netdev_priv(dev);
+ struct mlx4_en_dev *mdev = priv->mdev;
+ struct mlx4_en_tx_ring *ring;
+ struct mlx4_en_cq *cq;
+ struct mlx4_en_tx_desc *tx_desc;
+ struct mlx4_wqe_data_seg *data;
+ struct mlx4_en_tx_info *tx_info;
+ struct mbuf *m;
+ int nr_txbb;
+ int nr_segs;
+ int desc_size;
+ int real_size;
+ dma_addr_t dma;
+ u32 index, bf_index;
+ __be32 op_own;
+ u16 vlan_tag = 0;
+ int i;
+ int lso_header_size;
+ bool bounce = false;
+ struct mbuf *mb;
+ int defrag = 1;
+
+ ring = &priv->tx_ring[tx_ind];
+ mb = *mbp;
+ if (!priv->port_up)
+ goto tx_drop;
+
+retry:
+ real_size = get_real_size(mb, dev, &nr_segs, &lso_header_size);
+ if (unlikely(!real_size))
+ goto tx_drop;
+
+ /* Allign descriptor to TXBB size */
+ desc_size = ALIGN(real_size, TXBB_SIZE);
+ nr_txbb = desc_size / TXBB_SIZE;
+ if (unlikely(nr_txbb > MAX_DESC_TXBBS)) {
+ if (defrag) {
+ mb = m_defrag(*mbp, M_DONTWAIT);
+ if (mb == NULL) {
+ mb = *mbp;
+ goto tx_drop;
+ }
+ *mbp = mb;
+ defrag = 0;
+ goto retry;
+ }
+ goto tx_drop;
+ }
+
+ /* Check available TXBBs And 2K spare for prefetch */
+ if (unlikely(((int)(ring->prod - ring->cons)) >
+ ring->size - HEADROOM - MAX_DESC_TXBBS)) {
+ /* every full Tx ring stops queue */
+ if (ring->blocked == 0)
+ atomic_add_int(&priv->blocked, 1);
+ atomic_set_int(&dev->if_drv_flags, IFF_DRV_OACTIVE);
+ ring->blocked = 1;
+ priv->port_stats.queue_stopped++;
+
+ /* Use interrupts to find out when queue opened */
+ cq = &priv->tx_cq[tx_ind];
+ mlx4_en_arm_cq(priv, cq);
+ return EBUSY;
+ }
+
+ /* Track current inflight packets for performance analysis */
+ AVG_PERF_COUNTER(priv->pstats.inflight_avg,
+ (u32) (ring->prod - ring->cons - 1));
+
+ /* Packet is good - grab an index and transmit it */
+ index = ring->prod & ring->size_mask;
+ bf_index = ring->prod;
+
+ /* See if we have enough space for whole descriptor TXBB for setting
+ * SW ownership on next descriptor; if not, use a bounce buffer. */
+ if (likely(index + nr_txbb <= ring->size))
+ tx_desc = ring->buf + index * TXBB_SIZE;
+ else {
+ tx_desc = (struct mlx4_en_tx_desc *) ring->bounce_buf;
+ bounce = true;
+ }
+
+ /* Prepare ctrl segement apart opcode+ownership, which depends on
+ * whether LSO is used */
+ if (mb->m_flags & M_VLANTAG)
+ vlan_tag = mb->m_pkthdr.ether_vtag;
+ tx_desc->ctrl.vlan_tag = cpu_to_be16(vlan_tag);
+ tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN * !!vlan_tag;
+ tx_desc->ctrl.fence_size = (real_size / 16) & 0x3f;
+ tx_desc->ctrl.srcrb_flags = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE |
+ MLX4_WQE_CTRL_SOLICITED);
+ if (mb->m_pkthdr.csum_flags & (CSUM_IP|CSUM_TCP|CSUM_UDP)) {
+ tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM |
+ MLX4_WQE_CTRL_TCP_UDP_CSUM);
+ priv->port_stats.tx_chksum_offload++;
+ }
+
+ if (unlikely(priv->validate_loopback)) {
+ /* Copy dst mac address to wqe */
+ struct ether_header *ethh;
+ u64 mac;
+ u32 mac_l, mac_h;
+
+ ethh = mtod(mb, struct ether_header *);
+ mac = mlx4_en_mac_to_u64(ethh->ether_dhost);
+ if (mac) {
+ mac_h = (u32) ((mac & 0xffff00000000ULL) >> 16);
+ mac_l = (u32) (mac & 0xffffffff);
+ tx_desc->ctrl.srcrb_flags |= cpu_to_be32(mac_h);
+ tx_desc->ctrl.imm = cpu_to_be32(mac_l);
+ }
+ }
+
+ /* Handle LSO (TSO) packets */
+ if (lso_header_size) {
+ int segsz;
+
+ /* Mark opcode as LSO */
+ op_own = cpu_to_be32(MLX4_OPCODE_LSO | (1 << 6)) |
+ ((ring->prod & ring->size) ?
+ cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);
+
+ /* Fill in the LSO prefix */
+ tx_desc->lso.mss_hdr_size = cpu_to_be32(
+ mb->m_pkthdr.tso_segsz << 16 | lso_header_size);
+
+ /* Copy headers;
+ * note that we already verified that it is linear */
+ memcpy(tx_desc->lso.header, mb->m_data, lso_header_size);
+ data = ((void *) &tx_desc->lso +
+ ALIGN(lso_header_size + 4, DS_SIZE));
+
+ priv->port_stats.tso_packets++;
+ segsz = mb->m_pkthdr.tso_segsz;
+ i = ((mb->m_pkthdr.len - lso_header_size) / segsz) +
+ !!((mb->m_pkthdr.len - lso_header_size) % segsz);
+ ring->bytes += mb->m_pkthdr.len + (i - 1) * lso_header_size;
+ ring->packets += i;
+ mb->m_data += lso_header_size;
+ mb->m_len -= lso_header_size;
+ } else {
+ /* Normal (Non LSO) packet */
+ op_own = cpu_to_be32(MLX4_OPCODE_SEND) |
+ ((ring->prod & ring->size) ?
+ cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);
+ data = &tx_desc->data;
+ ring->bytes += max(mb->m_pkthdr.len,
+ (unsigned int)ETHER_MIN_LEN - ETHER_CRC_LEN);
+ ring->packets++;
+
+ }
+ AVG_PERF_COUNTER(priv->pstats.tx_pktsz_avg, mb->m_pkthdr.len);
+
+ /* Save mb in tx_info ring */
+ tx_info = &ring->tx_info[index];
+ tx_info->mb = mb;
+ tx_info->nr_txbb = nr_txbb;
+ tx_info->nr_segs = nr_segs;
+ /* valid only for non inline segments */
+ tx_info->data_offset = (void *) data - (void *) tx_desc;
+
+ if (!is_inline(mb)) {
+ for (i = 0, m = mb; i < nr_segs; i++, m = m->m_next) {
+ if (m->m_len == 0) {
+ i--;
+ continue;
+ }
+ dma = pci_map_single(mdev->dev->pdev, m->m_data,
+ m->m_len, PCI_DMA_TODEVICE);
+ data->addr = cpu_to_be64(dma);
+ data->lkey = cpu_to_be32(mdev->mr.key);
+ wmb();
+ data->byte_count = cpu_to_be32(m->m_len);
+ data++;
+ }
+ if (lso_header_size) {
+ mb->m_data -= lso_header_size;
+ mb->m_len += lso_header_size;
+ }
+ tx_info->inl = 0;
+ } else {
+ build_inline_wqe(tx_desc, mb, real_size, &vlan_tag, tx_ind);
+ tx_info->inl = 1;
+ }
+
+ ring->prod += nr_txbb;
+
+ /* If we used a bounce buffer then copy descriptor back into place */
+ if (bounce)
+ tx_desc = mlx4_en_bounce_to_desc(priv, ring, index, desc_size);
+
+ if (ring->bf_enabled && desc_size <= MAX_BF && !bounce && !vlan_tag) {
+ *(u32 *) (&tx_desc->ctrl.vlan_tag) |= ring->doorbell_qpn;
+ op_own |= htonl((bf_index & 0xffff) << 8);
+ /* Ensure new descirptor hits memory
+ * before setting ownership of this descriptor to HW */
+ wmb();
+ tx_desc->ctrl.owner_opcode = op_own;
+
+ wmb();
+
+ mlx4_bf_copy(ring->bf.reg + ring->bf.offset, (unsigned long *) &tx_desc->ctrl,
+ desc_size);
+
+ wmb();
+
+ ring->bf.offset ^= ring->bf.buf_size;
+ } else {
+ /* Ensure new descirptor hits memory
+ * before setting ownership of this descriptor to HW */
+ wmb();
+ tx_desc->ctrl.owner_opcode = op_own;
+ wmb();
+ writel(ring->doorbell_qpn, ring->bf.uar->map + MLX4_SEND_DOORBELL);
+ }
+
+ return 0;
+
+tx_drop:
+ *mbp = NULL;
+ m_freem(mb);
+ ring->errors++;
+ return EINVAL;
+}
+
+
+static int
+mlx4_en_transmit_locked(struct ifnet *dev, int tx_ind, struct mbuf *m)
+{
+ struct mlx4_en_priv *priv = netdev_priv(dev);
+ struct mlx4_en_tx_ring *ring;
+ struct mbuf *next;
+ int enqueued, err = 0;
+
+ ring = &priv->tx_ring[tx_ind];
+ if ((dev->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
+ IFF_DRV_RUNNING || priv->port_up == 0) {
+ if (m != NULL)
+ err = drbr_enqueue(dev, ring->br, m);
+ return (err);
+ }
+
+ enqueued = 0;
+ if (m == NULL) {
+ next = drbr_dequeue(dev, ring->br);
+ } else if (drbr_needs_enqueue(dev, ring->br)) {
+ if ((err = drbr_enqueue(dev, ring->br, m)) != 0)
+ return (err);
+ next = drbr_dequeue(dev, ring->br);
+ } else
+ next = m;
+
+ /* Process the queue */
+ while (next != NULL) {
+ if ((err = mlx4_en_xmit(dev, tx_ind, &next)) != 0) {
+ if (next != NULL)
+ err = drbr_enqueue(dev, ring->br, next);
+ break;
+ }
+ enqueued++;
+ drbr_stats_update(dev, next->m_pkthdr.len, next->m_flags);
+ /* Send a copy of the frame to the BPF listener */
+ ETHER_BPF_MTAP(dev, next);
+ if ((dev->if_drv_flags & IFF_DRV_RUNNING) == 0)
+ break;
+ next = drbr_dequeue(dev, ring->br);
+ }
+
+ if (enqueued > 0)
+ ring->watchdog_time = ticks;
+
+ return (err);
+}
+
+void
+mlx4_en_tx_que(void *context, int pending)
+{
+ struct mlx4_en_tx_ring *ring;
+ struct mlx4_en_priv *priv;
+ struct net_device *dev;
+ struct mlx4_en_cq *cq;
+ int tx_ind;
+
+ cq = context;
+ dev = cq->dev;
+ priv = dev->if_softc;
+ tx_ind = cq->ring;
+ ring = &priv->tx_ring[tx_ind];
+ if (dev->if_drv_flags & IFF_DRV_RUNNING) {
+ mlx4_en_xmit_poll(priv, tx_ind);
+ spin_lock(&ring->tx_lock);
+ if (!drbr_empty(dev, ring->br))
+ mlx4_en_transmit_locked(dev, tx_ind, NULL);
+ spin_unlock(&ring->tx_lock);
+ }
+}
+
+int
+mlx4_en_transmit(struct ifnet *dev, struct mbuf *m)
+{
+ struct mlx4_en_priv *priv = netdev_priv(dev);
+ struct mlx4_en_tx_ring *ring;
+ struct mlx4_en_cq *cq;
+ int i = 0, err = 0;
+
+ /* Which queue to use */
+ if ((m->m_flags & (M_FLOWID | M_VLANTAG)) == M_FLOWID)
+ i = m->m_pkthdr.flowid % (MLX4_EN_NUM_HASH_RINGS - 1);
+ else
+ i = mlx4_en_select_queue(dev, m);
+
+ ring = &priv->tx_ring[i];
+
+ if (spin_trylock(&ring->tx_lock)) {
+ err = mlx4_en_transmit_locked(dev, i, m);
+ spin_unlock(&ring->tx_lock);
+ /* Poll CQ here */
+ mlx4_en_xmit_poll(priv, i);
+ } else {
+ err = drbr_enqueue(dev, ring->br, m);
+ cq = &priv->tx_cq[i];
+ taskqueue_enqueue(cq->tq, &cq->cq_task);
+ }
+
+ return (err);
+}
+
+/*
+ * Flush ring buffers.
+ */
+void
+mlx4_en_qflush(struct ifnet *dev)
+{
+ struct mlx4_en_priv *priv = netdev_priv(dev);
+ struct mlx4_en_tx_ring *ring = priv->tx_ring;
+ struct mbuf *m;
+
+ for (int i = 0; i < priv->tx_ring_num; i++, ring++) {
+ spin_lock(&ring->tx_lock);
+ while ((m = buf_ring_dequeue_sc(ring->br)) != NULL)
+ m_freem(m);
+ spin_unlock(&ring->tx_lock);
+ }
+ if_qflush(dev);
+}
diff --git a/sys/ofed/drivers/net/mlx4/eq.c b/sys/ofed/drivers/net/mlx4/eq.c
new file mode 100644
index 0000000..42885c7
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/eq.c
@@ -0,0 +1,727 @@
+/*
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4.h"
+#include "fw.h"
+
+enum {
+ MLX4_NUM_ASYNC_EQE = 0x100,
+ MLX4_NUM_SPARE_EQE = 0x80,
+ MLX4_EQ_ENTRY_SIZE = 0x20
+};
+
+/*
+ * Must be packed because start is 64 bits but only aligned to 32 bits.
+ */
+struct mlx4_eq_context {
+ __be32 flags;
+ u16 reserved1[3];
+ __be16 page_offset;
+ u8 log_eq_size;
+ u8 reserved2[4];
+ u8 eq_period;
+ u8 reserved3;
+ u8 eq_max_count;
+ u8 reserved4[3];
+ u8 intr;
+ u8 log_page_size;
+ u8 reserved5[2];
+ u8 mtt_base_addr_h;
+ __be32 mtt_base_addr_l;
+ u32 reserved6[2];
+ __be32 consumer_index;
+ __be32 producer_index;
+ u32 reserved7[4];
+};
+
+#define MLX4_EQ_STATUS_OK ( 0 << 28)
+#define MLX4_EQ_STATUS_WRITE_FAIL (10 << 28)
+#define MLX4_EQ_OWNER_SW ( 0 << 24)
+#define MLX4_EQ_OWNER_HW ( 1 << 24)
+#define MLX4_EQ_FLAG_EC ( 1 << 18)
+#define MLX4_EQ_FLAG_OI ( 1 << 17)
+#define MLX4_EQ_STATE_ARMED ( 9 << 8)
+#define MLX4_EQ_STATE_FIRED (10 << 8)
+#define MLX4_EQ_STATE_ALWAYS_ARMED (11 << 8)
+
+#define MLX4_ASYNC_EVENT_MASK ((1ull << MLX4_EVENT_TYPE_PATH_MIG) | \
+ (1ull << MLX4_EVENT_TYPE_COMM_EST) | \
+ (1ull << MLX4_EVENT_TYPE_SQ_DRAINED) | \
+ (1ull << MLX4_EVENT_TYPE_CQ_ERROR) | \
+ (1ull << MLX4_EVENT_TYPE_WQ_CATAS_ERROR) | \
+ (1ull << MLX4_EVENT_TYPE_EEC_CATAS_ERROR) | \
+ (1ull << MLX4_EVENT_TYPE_PATH_MIG_FAILED) | \
+ (1ull << MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR) | \
+ (1ull << MLX4_EVENT_TYPE_WQ_ACCESS_ERROR) | \
+ (1ull << MLX4_EVENT_TYPE_PORT_CHANGE) | \
+ (1ull << MLX4_EVENT_TYPE_ECC_DETECT) | \
+ (1ull << MLX4_EVENT_TYPE_SRQ_CATAS_ERROR) | \
+ (1ull << MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE) | \
+ (1ull << MLX4_EVENT_TYPE_SRQ_LIMIT) | \
+ (1ull << MLX4_EVENT_TYPE_CMD))
+
+struct mlx4_eqe {
+ u8 reserved1;
+ u8 type;
+ u8 reserved2;
+ u8 subtype;
+ union {
+ u32 raw[6];
+ struct {
+ __be32 cqn;
+ } __attribute__((packed)) comp;
+ struct {
+ u16 reserved1;
+ __be16 token;
+ u32 reserved2;
+ u8 reserved3[3];
+ u8 status;
+ __be64 out_param;
+ } __attribute__((packed)) cmd;
+ struct {
+ __be32 qpn;
+ } __attribute__((packed)) qp;
+ struct {
+ __be32 srqn;
+ } __attribute__((packed)) srq;
+ struct {
+ __be32 cqn;
+ u32 reserved1;
+ u8 reserved2[3];
+ u8 syndrome;
+ } __attribute__((packed)) cq_err;
+ struct {
+ u32 reserved1[2];
+ __be32 port;
+ } __attribute__((packed)) port_change;
+ } event;
+ u8 reserved3[3];
+ u8 owner;
+} __attribute__((packed));
+
+static void eq_set_ci(struct mlx4_eq *eq, int req_not)
+{
+ __raw_writel((__force u32) cpu_to_be32((eq->cons_index & 0xffffff) |
+ req_not << 31),
+ eq->doorbell);
+ /* We still want ordering, just not swabbing, so add a barrier */
+ mb();
+}
+
+static struct mlx4_eqe *get_eqe(struct mlx4_eq *eq, u32 entry)
+{
+ unsigned long off = (entry & (eq->nent - 1)) * MLX4_EQ_ENTRY_SIZE;
+ return eq->page_list[off / PAGE_SIZE].buf + off % PAGE_SIZE;
+}
+
+static struct mlx4_eqe *next_eqe_sw(struct mlx4_eq *eq)
+{
+ struct mlx4_eqe *eqe = get_eqe(eq, eq->cons_index);
+ return !!(eqe->owner & 0x80) ^ !!(eq->cons_index & eq->nent) ? NULL : eqe;
+}
+
+static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq)
+{
+ struct mlx4_eqe *eqe;
+ int cqn;
+ int eqes_found = 0;
+ int set_ci = 0;
+ int port;
+
+ while ((eqe = next_eqe_sw(eq))) {
+ /*
+ * Make sure we read EQ entry contents after we've
+ * checked the ownership bit.
+ */
+ rmb();
+
+ switch (eqe->type) {
+ case MLX4_EVENT_TYPE_COMP:
+ cqn = be32_to_cpu(eqe->event.comp.cqn) & 0xffffff;
+ mlx4_cq_completion(dev, cqn);
+ break;
+
+ case MLX4_EVENT_TYPE_PATH_MIG:
+ case MLX4_EVENT_TYPE_COMM_EST:
+ case MLX4_EVENT_TYPE_SQ_DRAINED:
+ case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE:
+ case MLX4_EVENT_TYPE_WQ_CATAS_ERROR:
+ case MLX4_EVENT_TYPE_PATH_MIG_FAILED:
+ case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+ case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR:
+ mlx4_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+ eqe->type);
+ break;
+
+ case MLX4_EVENT_TYPE_SRQ_LIMIT:
+ case MLX4_EVENT_TYPE_SRQ_CATAS_ERROR:
+ mlx4_srq_event(dev, be32_to_cpu(eqe->event.srq.srqn) & 0xffffff,
+ eqe->type);
+ break;
+
+ case MLX4_EVENT_TYPE_CMD:
+ mlx4_cmd_event(dev,
+ be16_to_cpu(eqe->event.cmd.token),
+ eqe->event.cmd.status,
+ be64_to_cpu(eqe->event.cmd.out_param));
+ break;
+
+ case MLX4_EVENT_TYPE_PORT_CHANGE:
+ port = be32_to_cpu(eqe->event.port_change.port) >> 28;
+ if (eqe->subtype == MLX4_PORT_CHANGE_SUBTYPE_DOWN) {
+ mlx4_dispatch_event(dev, MLX4_DEV_EVENT_PORT_DOWN,
+ port);
+ mlx4_priv(dev)->sense.do_sense_port[port] = 1;
+ } else {
+ mlx4_dispatch_event(dev, MLX4_DEV_EVENT_PORT_UP,
+ port);
+ mlx4_priv(dev)->sense.do_sense_port[port] = 0;
+ }
+ break;
+
+ case MLX4_EVENT_TYPE_CQ_ERROR:
+ mlx4_warn(dev, "CQ %s on CQN %06x\n",
+ eqe->event.cq_err.syndrome == 1 ?
+ "overrun" : "access violation",
+ be32_to_cpu(eqe->event.cq_err.cqn) & 0xffffff);
+ mlx4_cq_event(dev, be32_to_cpu(eqe->event.cq_err.cqn),
+ eqe->type);
+ break;
+
+ case MLX4_EVENT_TYPE_EQ_OVERFLOW:
+ mlx4_warn(dev, "EQ overrun on EQN %d\n", eq->eqn);
+ break;
+
+ case MLX4_EVENT_TYPE_EEC_CATAS_ERROR:
+ case MLX4_EVENT_TYPE_ECC_DETECT:
+ default:
+ mlx4_warn(dev, "Unhandled event %02x(%02x) on EQ %d at index %u\n",
+ eqe->type, eqe->subtype, eq->eqn, eq->cons_index);
+ break;
+ };
+
+ ++eq->cons_index;
+ eqes_found = 1;
+ ++set_ci;
+
+ /*
+ * The HCA will think the queue has overflowed if we
+ * don't tell it we've been processing events. We
+ * create our EQs with MLX4_NUM_SPARE_EQE extra
+ * entries, so we must update our consumer index at
+ * least that often.
+ */
+ if (unlikely(set_ci >= MLX4_NUM_SPARE_EQE)) {
+ eq_set_ci(eq, 0);
+ set_ci = 0;
+ }
+ }
+
+ eq_set_ci(eq, 1);
+
+ return eqes_found;
+}
+
+static irqreturn_t mlx4_interrupt(int irq, void *dev_ptr)
+{
+ struct mlx4_dev *dev = dev_ptr;
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ int work = 0;
+ int i;
+
+ writel(priv->eq_table.clr_mask, priv->eq_table.clr_int);
+
+ for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i)
+ work |= mlx4_eq_int(dev, &priv->eq_table.eq[i]);
+
+ return IRQ_RETVAL(work);
+}
+
+static irqreturn_t mlx4_msi_x_interrupt(int irq, void *eq_ptr)
+{
+ struct mlx4_eq *eq = eq_ptr;
+ struct mlx4_dev *dev = eq->dev;
+
+ mlx4_eq_int(dev, eq);
+
+ /* MSI-X vectors always belong to us */
+ return IRQ_HANDLED;
+}
+
+static int mlx4_MAP_EQ(struct mlx4_dev *dev, u64 event_mask, int unmap,
+ int eq_num)
+{
+ return mlx4_cmd(dev, event_mask, (unmap << 31) | eq_num,
+ 0, MLX4_CMD_MAP_EQ, MLX4_CMD_TIME_CLASS_B);
+}
+
+static int mlx4_SW2HW_EQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+ int eq_num)
+{
+ return mlx4_cmd(dev, mailbox->dma, eq_num, 0, MLX4_CMD_SW2HW_EQ,
+ MLX4_CMD_TIME_CLASS_A);
+}
+
+static int mlx4_HW2SW_EQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+ int eq_num)
+{
+ return mlx4_cmd_box(dev, 0, mailbox->dma, eq_num, 0, MLX4_CMD_HW2SW_EQ,
+ MLX4_CMD_TIME_CLASS_A);
+}
+
+static int mlx4_num_eq_uar(struct mlx4_dev *dev)
+{
+ /*
+ * Each UAR holds 4 EQ doorbells. To figure out how many UARs
+ * we need to map, take the difference of highest index and
+ * the lowest index we'll use and add 1.
+ */
+ return (dev->caps.num_comp_vectors + 1 + dev->caps.reserved_eqs) / 4 -
+ dev->caps.reserved_eqs / 4 + 1;
+}
+
+static void __iomem *mlx4_get_eq_uar(struct mlx4_dev *dev, struct mlx4_eq *eq)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ int index;
+
+ index = eq->eqn / 4 - dev->caps.reserved_eqs / 4;
+
+ if (!priv->eq_table.uar_map[index]) {
+ priv->eq_table.uar_map[index] =
+ ioremap(pci_resource_start(dev->pdev, 2) +
+ ((eq->eqn / 4) << PAGE_SHIFT),
+ PAGE_SIZE);
+ if (!priv->eq_table.uar_map[index]) {
+ mlx4_err(dev, "Couldn't map EQ doorbell for EQN 0x%06x\n",
+ eq->eqn);
+ return NULL;
+ }
+ }
+
+ return priv->eq_table.uar_map[index] + 0x800 + 8 * (eq->eqn % 4);
+}
+
+static int mlx4_create_eq(struct mlx4_dev *dev, int nent,
+ u8 intr, struct mlx4_eq *eq)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ struct mlx4_cmd_mailbox *mailbox;
+ struct mlx4_eq_context *eq_context;
+ int npages;
+ u64 *dma_list = NULL;
+ dma_addr_t t;
+ u64 mtt_addr;
+ int err = -ENOMEM;
+ int i;
+
+ eq->dev = dev;
+ eq->nent = roundup_pow_of_two(max(nent, 2));
+ npages = PAGE_ALIGN(eq->nent * MLX4_EQ_ENTRY_SIZE) / PAGE_SIZE;
+
+ eq->page_list = kmalloc(npages * sizeof *eq->page_list,
+ GFP_KERNEL);
+ if (!eq->page_list)
+ goto err_out;
+
+ for (i = 0; i < npages; ++i)
+ eq->page_list[i].buf = NULL;
+
+ dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL);
+ if (!dma_list)
+ goto err_out_free;
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev);
+ if (IS_ERR(mailbox))
+ goto err_out_free;
+ eq_context = mailbox->buf;
+
+ for (i = 0; i < npages; ++i) {
+ eq->page_list[i].buf = dma_alloc_coherent(&dev->pdev->dev,
+ PAGE_SIZE, &t, GFP_KERNEL);
+ if (!eq->page_list[i].buf)
+ goto err_out_free_pages;
+
+ dma_list[i] = t;
+ eq->page_list[i].map = t;
+
+ memset(eq->page_list[i].buf, 0, PAGE_SIZE);
+ }
+
+ eq->eqn = mlx4_bitmap_alloc(&priv->eq_table.bitmap);
+ if (eq->eqn == -1)
+ goto err_out_free_pages;
+
+ eq->doorbell = mlx4_get_eq_uar(dev, eq);
+ if (!eq->doorbell) {
+ err = -ENOMEM;
+ goto err_out_free_eq;
+ }
+
+ err = mlx4_mtt_init(dev, npages, PAGE_SHIFT, &eq->mtt);
+ if (err)
+ goto err_out_free_eq;
+
+ err = mlx4_write_mtt(dev, &eq->mtt, 0, npages, dma_list);
+ if (err)
+ goto err_out_free_mtt;
+
+ memset(eq_context, 0, sizeof *eq_context);
+ eq_context->flags = cpu_to_be32(MLX4_EQ_STATUS_OK |
+ MLX4_EQ_STATE_ARMED);
+ eq_context->log_eq_size = ilog2(eq->nent);
+ eq_context->intr = intr;
+ eq_context->log_page_size = PAGE_SHIFT - MLX4_ICM_PAGE_SHIFT;
+
+ mtt_addr = mlx4_mtt_addr(dev, &eq->mtt);
+ eq_context->mtt_base_addr_h = mtt_addr >> 32;
+ eq_context->mtt_base_addr_l = cpu_to_be32(mtt_addr & 0xffffffff);
+
+ err = mlx4_SW2HW_EQ(dev, mailbox, eq->eqn);
+ if (err) {
+ mlx4_warn(dev, "SW2HW_EQ failed (%d)\n", err);
+ goto err_out_free_mtt;
+ }
+
+ kfree(dma_list);
+ mlx4_free_cmd_mailbox(dev, mailbox);
+
+ eq->cons_index = 0;
+
+ return err;
+
+err_out_free_mtt:
+ mlx4_mtt_cleanup(dev, &eq->mtt);
+
+err_out_free_eq:
+ mlx4_bitmap_free(&priv->eq_table.bitmap, eq->eqn);
+
+err_out_free_pages:
+ for (i = 0; i < npages; ++i)
+ if (eq->page_list[i].buf)
+ dma_free_coherent(&dev->pdev->dev, PAGE_SIZE,
+ eq->page_list[i].buf,
+ eq->page_list[i].map);
+
+ mlx4_free_cmd_mailbox(dev, mailbox);
+
+err_out_free:
+ kfree(eq->page_list);
+ kfree(dma_list);
+
+err_out:
+ return err;
+}
+
+static void mlx4_free_eq(struct mlx4_dev *dev,
+ struct mlx4_eq *eq)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ struct mlx4_cmd_mailbox *mailbox;
+ int err;
+ int npages = PAGE_ALIGN(MLX4_EQ_ENTRY_SIZE * eq->nent) / PAGE_SIZE;
+ int i;
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev);
+ if (IS_ERR(mailbox))
+ return;
+
+ err = mlx4_HW2SW_EQ(dev, mailbox, eq->eqn);
+ if (err)
+ mlx4_warn(dev, "HW2SW_EQ failed (%d)\n", err);
+
+ if (0) {
+ mlx4_dbg(dev, "Dumping EQ context %02x:\n", eq->eqn);
+ for (i = 0; i < sizeof (struct mlx4_eq_context) / 4; ++i) {
+ if (i % 4 == 0)
+ printk("[%02x] ", i * 4);
+ printk(" %08x", be32_to_cpup(mailbox->buf + i * 4));
+ if ((i + 1) % 4 == 0)
+ printk("\n");
+ }
+ }
+
+ mlx4_mtt_cleanup(dev, &eq->mtt);
+ for (i = 0; i < npages; ++i)
+ pci_free_consistent(dev->pdev, PAGE_SIZE,
+ eq->page_list[i].buf,
+ eq->page_list[i].map);
+
+ kfree(eq->page_list);
+ mlx4_bitmap_free(&priv->eq_table.bitmap, eq->eqn);
+ mlx4_free_cmd_mailbox(dev, mailbox);
+}
+
+static void mlx4_free_irqs(struct mlx4_dev *dev)
+{
+ struct mlx4_eq_table *eq_table = &mlx4_priv(dev)->eq_table;
+ int i;
+
+ if (eq_table->have_irq)
+ free_irq(dev->pdev->irq, dev);
+ for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i)
+ if (eq_table->eq[i].have_irq) {
+ free_irq(eq_table->eq[i].irq, eq_table->eq + i);
+ eq_table->eq[i].have_irq = 0;
+ }
+
+ kfree(eq_table->irq_names);
+}
+
+static int mlx4_map_clr_int(struct mlx4_dev *dev)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+
+ priv->clr_base = ioremap(pci_resource_start(dev->pdev, priv->fw.clr_int_bar) +
+ priv->fw.clr_int_base, MLX4_CLR_INT_SIZE);
+ if (!priv->clr_base) {
+ mlx4_err(dev, "Couldn't map interrupt clear register, aborting.\n");
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void mlx4_unmap_clr_int(struct mlx4_dev *dev)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+
+ iounmap(priv->clr_base);
+}
+
+int mlx4_alloc_eq_table(struct mlx4_dev *dev)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+
+ priv->eq_table.eq = kcalloc(dev->caps.num_eqs - dev->caps.reserved_eqs,
+ sizeof *priv->eq_table.eq, GFP_KERNEL);
+ if (!priv->eq_table.eq)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void mlx4_free_eq_table(struct mlx4_dev *dev)
+{
+ kfree(mlx4_priv(dev)->eq_table.eq);
+}
+
+int mlx4_init_eq_table(struct mlx4_dev *dev)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ int err;
+ int i;
+
+ priv->eq_table.uar_map = kcalloc(sizeof *priv->eq_table.uar_map,
+ mlx4_num_eq_uar(dev), GFP_KERNEL);
+ if (!priv->eq_table.uar_map) {
+ err = -ENOMEM;
+ goto err_out_free;
+ }
+
+ err = mlx4_bitmap_init(&priv->eq_table.bitmap, dev->caps.num_eqs,
+ dev->caps.num_eqs - 1, dev->caps.reserved_eqs, 0);
+ if (err)
+ goto err_out_free;
+
+ for (i = 0; i < mlx4_num_eq_uar(dev); ++i)
+ priv->eq_table.uar_map[i] = NULL;
+
+ err = mlx4_map_clr_int(dev);
+ if (err)
+ goto err_out_bitmap;
+
+ priv->eq_table.clr_mask =
+ swab32(1 << (priv->eq_table.inta_pin & 31));
+ priv->eq_table.clr_int = priv->clr_base +
+ (priv->eq_table.inta_pin < 32 ? 4 : 0);
+
+ priv->eq_table.irq_names = kmalloc(16 * dev->caps.num_comp_vectors, GFP_KERNEL);
+ if (!priv->eq_table.irq_names) {
+ err = -ENOMEM;
+ goto err_out_bitmap;
+ }
+
+ for (i = 0; i < dev->caps.num_comp_vectors; ++i) {
+ err = mlx4_create_eq(dev, dev->caps.num_cqs + MLX4_NUM_SPARE_EQE,
+ (dev->flags & MLX4_FLAG_MSI_X) ? i : 0,
+ &priv->eq_table.eq[i]);
+ if (err) {
+ --i;
+ goto err_out_unmap;
+ }
+ }
+
+ err = mlx4_create_eq(dev, MLX4_NUM_ASYNC_EQE + MLX4_NUM_SPARE_EQE,
+ (dev->flags & MLX4_FLAG_MSI_X) ? dev->caps.num_comp_vectors : 0,
+ &priv->eq_table.eq[dev->caps.num_comp_vectors]);
+ if (err)
+ goto err_out_comp;
+
+ if (dev->flags & MLX4_FLAG_MSI_X) {
+ static const char async_eq_name[] = DRV_NAME "(async)";
+ const char *eq_name;
+
+ for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i) {
+ if (i < dev->caps.num_comp_vectors) {
+ snprintf(priv->eq_table.irq_names + i * 16, 16,
+ "eth-mlx4-%d", i);
+ eq_name = priv->eq_table.irq_names + i * 16;
+ } else
+ eq_name = async_eq_name;
+
+ err = request_irq(priv->eq_table.eq[i].irq,
+ mlx4_msi_x_interrupt, 0, eq_name,
+ priv->eq_table.eq + i);
+ if (err)
+ goto err_out_async;
+
+ priv->eq_table.eq[i].have_irq = 1;
+ }
+ } else {
+ err = request_irq(dev->pdev->irq, mlx4_interrupt,
+ IRQF_SHARED, DRV_NAME, dev);
+ if (err)
+ goto err_out_async;
+
+ priv->eq_table.have_irq = 1;
+ }
+
+ err = mlx4_MAP_EQ(dev, MLX4_ASYNC_EVENT_MASK, 0,
+ priv->eq_table.eq[dev->caps.num_comp_vectors].eqn);
+ if (err)
+ mlx4_warn(dev, "MAP_EQ for async EQ %d failed (%d)\n",
+ priv->eq_table.eq[dev->caps.num_comp_vectors].eqn, err);
+
+ for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i)
+ eq_set_ci(&priv->eq_table.eq[i], 1);
+
+ return 0;
+
+err_out_async:
+ mlx4_free_eq(dev, &priv->eq_table.eq[dev->caps.num_comp_vectors]);
+
+err_out_comp:
+ i = dev->caps.num_comp_vectors - 1;
+
+err_out_unmap:
+ while (i >= 0) {
+ mlx4_free_eq(dev, &priv->eq_table.eq[i]);
+ --i;
+ }
+ mlx4_unmap_clr_int(dev);
+ mlx4_free_irqs(dev);
+
+err_out_bitmap:
+ mlx4_bitmap_cleanup(&priv->eq_table.bitmap);
+
+err_out_free:
+ kfree(priv->eq_table.uar_map);
+
+ return err;
+}
+
+void mlx4_cleanup_eq_table(struct mlx4_dev *dev)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ int i;
+
+ mlx4_MAP_EQ(dev, MLX4_ASYNC_EVENT_MASK, 1,
+ priv->eq_table.eq[dev->caps.num_comp_vectors].eqn);
+
+ mlx4_free_irqs(dev);
+
+ for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i)
+ mlx4_free_eq(dev, &priv->eq_table.eq[i]);
+
+ mlx4_unmap_clr_int(dev);
+
+ for (i = 0; i < mlx4_num_eq_uar(dev); ++i)
+ if (priv->eq_table.uar_map[i])
+ iounmap(priv->eq_table.uar_map[i]);
+
+ mlx4_bitmap_cleanup(&priv->eq_table.bitmap);
+
+ kfree(priv->eq_table.uar_map);
+}
+
+/* A test that verifies that we can accept interrupts on all
+ * the irq vectors of the device.
+ * Interrupts are checked using the NOP command.
+ */
+int mlx4_test_interrupts(struct mlx4_dev *dev)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ int i;
+ int err;
+
+ err = mlx4_NOP(dev);
+ /* When not in MSI_X, there is only one irq to check */
+ if (!(dev->flags & MLX4_FLAG_MSI_X))
+ return err;
+
+ /* A loop over all completion vectors, for each vector we will check
+ * whether it works by mapping command completions to that vector
+ * and performing a NOP command
+ */
+ for(i = 0; !err && (i < dev->caps.num_comp_vectors); ++i) {
+ /* Temporary use polling for command completions */
+ mlx4_cmd_use_polling(dev);
+
+ /* Map the new eq to handle all asyncronous events */
+ err = mlx4_MAP_EQ(dev, MLX4_ASYNC_EVENT_MASK, 0,
+ priv->eq_table.eq[i].eqn);
+ if (err) {
+ mlx4_warn(dev, "Failed mapping eq for interrupt test\n");
+ mlx4_cmd_use_events(dev);
+ break;
+ }
+
+ /* Go back to using events */
+ mlx4_cmd_use_events(dev);
+ err = mlx4_NOP(dev);
+ }
+
+ /* Return to default */
+ mlx4_MAP_EQ(dev, MLX4_ASYNC_EVENT_MASK, 0,
+ priv->eq_table.eq[dev->caps.num_comp_vectors].eqn);
+ return err;
+}
+EXPORT_SYMBOL(mlx4_test_interrupts);
diff --git a/sys/ofed/drivers/net/mlx4/fw.c b/sys/ofed/drivers/net/mlx4/fw.c
new file mode 100644
index 0000000..774d261
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/fw.c
@@ -0,0 +1,1010 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx4/cmd.h>
+
+#include "fw.h"
+#include "icm.h"
+
+enum {
+ MLX4_COMMAND_INTERFACE_MIN_REV = 2,
+ MLX4_COMMAND_INTERFACE_MAX_REV = 3,
+ MLX4_COMMAND_INTERFACE_NEW_PORT_CMDS = 3,
+};
+
+extern void __buggy_use_of_MLX4_GET(void);
+extern void __buggy_use_of_MLX4_PUT(void);
+
+static int enable_qos;
+module_param(enable_qos, bool, 0444);
+MODULE_PARM_DESC(enable_qos, "Enable Quality of Service support in the HCA (default: off)");
+
+static int mlx4_pre_t11_mode = 0;
+module_param_named(enable_pre_t11_mode, mlx4_pre_t11_mode, int, 0644);
+MODULE_PARM_DESC(enable_pre_t11_mode, "For FCoXX, enable pre-t11 mode if non-zero (default: 0)");
+
+#define MLX4_GET(dest, source, offset) \
+ do { \
+ void *__p = (char *) (source) + (offset); \
+ switch (sizeof (dest)) { \
+ case 1: (dest) = *(u8 *) __p; break; \
+ case 2: (dest) = be16_to_cpup(__p); break; \
+ case 4: (dest) = be32_to_cpup(__p); break; \
+ case 8: (dest) = be64_to_cpup(__p); break; \
+ default: __buggy_use_of_MLX4_GET(); \
+ } \
+ } while (0)
+
+#define MLX4_PUT(dest, source, offset) \
+ do { \
+ void *__d = ((char *) (dest) + (offset)); \
+ switch (sizeof(source)) { \
+ case 1: *(u8 *) __d = (source); break; \
+ case 2: *(__be16 *) __d = cpu_to_be16(source); break; \
+ case 4: *(__be32 *) __d = cpu_to_be32(source); break; \
+ case 8: *(__be64 *) __d = cpu_to_be64(source); break; \
+ default: __buggy_use_of_MLX4_PUT(); \
+ } \
+ } while (0)
+
+static void dump_dev_cap_flags(struct mlx4_dev *dev, u64 flags)
+{
+ static const char *fname[] = {
+ [ 0] = "RC transport",
+ [ 1] = "UC transport",
+ [ 2] = "UD transport",
+ [ 3] = "XRC transport",
+ [ 4] = "reliable multicast",
+ [ 5] = "FCoIB support",
+ [ 6] = "SRQ support",
+ [ 7] = "IPoIB checksum offload",
+ [ 8] = "P_Key violation counter",
+ [ 9] = "Q_Key violation counter",
+ [10] = "VMM",
+ [12] = "DPDP",
+ [16] = "MW support",
+ [17] = "APM support",
+ [18] = "Atomic ops support",
+ [19] = "Raw multicast support",
+ [20] = "Address vector port checking support",
+ [21] = "UD multicast support",
+ [24] = "Demand paging support",
+ [25] = "Router support",
+ [30] = "IBoE support",
+ [48] = "Basic counters support",
+ [49] = "Extended counters support",
+ };
+ int i;
+
+ mlx4_dbg(dev, "DEV_CAP flags:\n");
+ for (i = 0; i < ARRAY_SIZE(fname); ++i)
+ if (fname[i] && (flags & (1LL << i)))
+ mlx4_dbg(dev, " %s\n", fname[i]);
+}
+
+int mlx4_MOD_STAT_CFG(struct mlx4_dev *dev, struct mlx4_mod_stat_cfg *cfg)
+{
+ struct mlx4_cmd_mailbox *mailbox;
+ u32 *inbox;
+ int err = 0;
+
+#define MOD_STAT_CFG_IN_SIZE 0x100
+
+#define MOD_STAT_CFG_PG_SZ_M_OFFSET 0x002
+#define MOD_STAT_CFG_PG_SZ_OFFSET 0x003
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ inbox = mailbox->buf;
+
+ memset(inbox, 0, MOD_STAT_CFG_IN_SIZE);
+
+ MLX4_PUT(inbox, cfg->log_pg_sz, MOD_STAT_CFG_PG_SZ_OFFSET);
+ MLX4_PUT(inbox, cfg->log_pg_sz_m, MOD_STAT_CFG_PG_SZ_M_OFFSET);
+
+ err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_MOD_STAT_CFG,
+ MLX4_CMD_TIME_CLASS_A);
+
+ mlx4_free_cmd_mailbox(dev, mailbox);
+ return err;
+}
+
+int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
+{
+ struct mlx4_cmd_mailbox *mailbox;
+ u32 *outbox;
+ u8 field;
+ u32 field32;
+ u16 size;
+ u16 stat_rate;
+ int err;
+ int i;
+ u32 in_modifier;
+ u64 out_param;
+ u32 tmp1, tmp2;
+
+#define QUERY_DEV_CAP_OUT_SIZE 0x100
+#define QUERY_DEV_CAP_MAX_SRQ_SZ_OFFSET 0x10
+#define QUERY_DEV_CAP_MAX_QP_SZ_OFFSET 0x11
+#define QUERY_DEV_CAP_RSVD_QP_OFFSET 0x12
+#define QUERY_DEV_CAP_MAX_QP_OFFSET 0x13
+#define QUERY_DEV_CAP_RSVD_SRQ_OFFSET 0x14
+#define QUERY_DEV_CAP_MAX_SRQ_OFFSET 0x15
+#define QUERY_DEV_CAP_RSVD_EEC_OFFSET 0x16
+#define QUERY_DEV_CAP_MAX_EEC_OFFSET 0x17
+#define QUERY_DEV_CAP_MAX_CQ_SZ_OFFSET 0x19
+#define QUERY_DEV_CAP_RSVD_CQ_OFFSET 0x1a
+#define QUERY_DEV_CAP_MAX_CQ_OFFSET 0x1b
+#define QUERY_DEV_CAP_MAX_MPT_OFFSET 0x1d
+#define QUERY_DEV_CAP_RSVD_EQ_OFFSET 0x1e
+#define QUERY_DEV_CAP_MAX_EQ_OFFSET 0x1f
+#define QUERY_DEV_CAP_RSVD_MTT_OFFSET 0x20
+#define QUERY_DEV_CAP_MAX_MRW_SZ_OFFSET 0x21
+#define QUERY_DEV_CAP_RSVD_MRW_OFFSET 0x22
+#define QUERY_DEV_CAP_MAX_MTT_SEG_OFFSET 0x23
+#define QUERY_DEV_CAP_MAX_AV_OFFSET 0x27
+#define QUERY_DEV_CAP_MAX_REQ_QP_OFFSET 0x29
+#define QUERY_DEV_CAP_MAX_RES_QP_OFFSET 0x2b
+#define QUERY_DEV_CAP_MAX_GSO_OFFSET 0x2d
+#define QUERY_DEV_CAP_MAX_RDMA_OFFSET 0x2f
+#define QUERY_DEV_CAP_STAT_CFG_INL_OFFSET 0x31
+#define QUERY_DEV_CAP_RSZ_SRQ_OFFSET 0x33
+#define QUERY_DEV_CAP_ACK_DELAY_OFFSET 0x35
+#define QUERY_DEV_CAP_MTU_WIDTH_OFFSET 0x36
+#define QUERY_DEV_CAP_VL_PORT_OFFSET 0x37
+#define QUERY_DEV_CAP_MAX_MSG_SZ_OFFSET 0x38
+#define QUERY_DEV_CAP_MAX_GID_OFFSET 0x3b
+#define QUERY_DEV_CAP_RATE_SUPPORT_OFFSET 0x3c
+#define QUERY_DEV_CAP_MAX_PKEY_OFFSET 0x3f
+#define QUERY_DEV_CAP_EXT_FLAGS_OFFSET 0x40
+#define QUERY_DEV_CAP_UDP_RSS_OFFSET 0x42
+#define QUERY_DEV_CAP_ETH_UC_LOOPBACK_OFFSET 0x43
+#define QUERY_DEV_CAP_FLAGS_OFFSET 0x44
+#define QUERY_DEV_CAP_RSVD_UAR_OFFSET 0x48
+#define QUERY_DEV_CAP_UAR_SZ_OFFSET 0x49
+#define QUERY_DEV_CAP_PAGE_SZ_OFFSET 0x4b
+#define QUERY_DEV_CAP_BF_OFFSET 0x4c
+#define QUERY_DEV_CAP_LOG_BF_REG_SZ_OFFSET 0x4d
+#define QUERY_DEV_CAP_LOG_MAX_BF_REGS_PER_PAGE_OFFSET 0x4e
+#define QUERY_DEV_CAP_LOG_MAX_BF_PAGES_OFFSET 0x4f
+#define QUERY_DEV_CAP_MAX_SG_SQ_OFFSET 0x51
+#define QUERY_DEV_CAP_MAX_DESC_SZ_SQ_OFFSET 0x52
+#define QUERY_DEV_CAP_MAX_SG_RQ_OFFSET 0x55
+#define QUERY_DEV_CAP_MAX_DESC_SZ_RQ_OFFSET 0x56
+#define QUERY_DEV_CAP_MAX_QP_MCG_OFFSET 0x61
+#define QUERY_DEV_CAP_RSVD_MCG_OFFSET 0x62
+#define QUERY_DEV_CAP_MAX_MCG_OFFSET 0x63
+#define QUERY_DEV_CAP_RSVD_PD_OFFSET 0x64
+#define QUERY_DEV_CAP_MAX_PD_OFFSET 0x65
+#define QUERY_DEV_CAP_RSVD_XRC_OFFSET 0x66
+#define QUERY_DEV_CAP_MAX_XRC_OFFSET 0x67
+#define QUERY_DEV_CAP_RDMARC_ENTRY_SZ_OFFSET 0x80
+#define QUERY_DEV_CAP_QPC_ENTRY_SZ_OFFSET 0x82
+#define QUERY_DEV_CAP_AUX_ENTRY_SZ_OFFSET 0x84
+#define QUERY_DEV_CAP_ALTC_ENTRY_SZ_OFFSET 0x86
+#define QUERY_DEV_CAP_EQC_ENTRY_SZ_OFFSET 0x88
+#define QUERY_DEV_CAP_CQC_ENTRY_SZ_OFFSET 0x8a
+#define QUERY_DEV_CAP_SRQ_ENTRY_SZ_OFFSET 0x8c
+#define QUERY_DEV_CAP_C_MPT_ENTRY_SZ_OFFSET 0x8e
+#define QUERY_DEV_CAP_MTT_ENTRY_SZ_OFFSET 0x90
+#define QUERY_DEV_CAP_D_MPT_ENTRY_SZ_OFFSET 0x92
+#define QUERY_DEV_CAP_BMME_FLAGS_OFFSET 0x94
+#define QUERY_DEV_CAP_RSVD_LKEY_OFFSET 0x98
+#define QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET 0xa0
+#define QUERY_DEV_CAP_MAX_BASIC_CNT_OFFSET 0x68
+#define QUERY_DEV_CAP_MAX_EXT_CNT_OFFSET 0x6c
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ outbox = mailbox->buf;
+
+ err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_DEV_CAP,
+ MLX4_CMD_TIME_CLASS_A);
+ if (err)
+ goto out;
+
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_QP_OFFSET);
+ dev_cap->reserved_qps = 1 << (field & 0xf);
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_QP_OFFSET);
+ dev_cap->max_qps = 1 << (field & 0x1f);
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_SRQ_OFFSET);
+ dev_cap->reserved_srqs = 1 << (field >> 4);
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_SRQ_OFFSET);
+ dev_cap->max_srqs = 1 << (field & 0x1f);
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_CQ_SZ_OFFSET);
+ dev_cap->max_cq_sz = 1 << field;
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_CQ_OFFSET);
+ dev_cap->reserved_cqs = 1 << (field & 0xf);
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_CQ_OFFSET);
+ dev_cap->max_cqs = 1 << (field & 0x1f);
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MPT_OFFSET);
+ dev_cap->max_mpts = 1 << (field & 0x3f);
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_EQ_OFFSET);
+ dev_cap->reserved_eqs = 1 << (field & 0xf);
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_EQ_OFFSET);
+ dev_cap->max_eqs = 1 << (field & 0xf);
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_MTT_OFFSET);
+ dev_cap->reserved_mtts = 1 << (field >> 4);
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MRW_SZ_OFFSET);
+ dev_cap->max_mrw_sz = 1 << field;
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_MRW_OFFSET);
+ dev_cap->reserved_mrws = 1 << (field & 0xf);
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MTT_SEG_OFFSET);
+ dev_cap->max_mtt_seg = 1 << (field & 0x3f);
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_REQ_QP_OFFSET);
+ dev_cap->max_requester_per_qp = 1 << (field & 0x3f);
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_RES_QP_OFFSET);
+ dev_cap->max_responder_per_qp = 1 << (field & 0x3f);
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_GSO_OFFSET);
+ field &= 0x1f;
+ if (!field)
+ dev_cap->max_gso_sz = 0;
+ else
+ dev_cap->max_gso_sz = 1 << field;
+
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_RDMA_OFFSET);
+ dev_cap->max_rdma_global = 1 << (field & 0x3f);
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_ACK_DELAY_OFFSET);
+ dev_cap->local_ca_ack_delay = field & 0x1f;
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_VL_PORT_OFFSET);
+ dev_cap->num_ports = field & 0xf;
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MSG_SZ_OFFSET);
+ dev_cap->max_msg_sz = 1 << (field & 0x1f);
+ MLX4_GET(stat_rate, outbox, QUERY_DEV_CAP_RATE_SUPPORT_OFFSET);
+ dev_cap->stat_rate_support = stat_rate;
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_UDP_RSS_OFFSET);
+ dev_cap->udp_rss = field & 0x1;
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_ETH_UC_LOOPBACK_OFFSET);
+ dev_cap->loopback_support = field & 0x1;
+ MLX4_GET(tmp1, outbox, QUERY_DEV_CAP_EXT_FLAGS_OFFSET);
+ MLX4_GET(tmp2, outbox, QUERY_DEV_CAP_FLAGS_OFFSET);
+ dev_cap->flags = tmp2 | (u64)tmp1 << 32;
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_UAR_OFFSET);
+ dev_cap->reserved_uars = field >> 4;
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_UAR_SZ_OFFSET);
+ dev_cap->uar_size = 1 << ((field & 0x3f) + 20);
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_PAGE_SZ_OFFSET);
+ dev_cap->min_page_sz = 1 << field;
+
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_BF_OFFSET);
+ if (field & 0x80) {
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_LOG_BF_REG_SZ_OFFSET);
+ dev_cap->bf_reg_size = 1 << (field & 0x1f);
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_LOG_MAX_BF_REGS_PER_PAGE_OFFSET);
+ if ((1 << (field & 0x3f)) > (PAGE_SIZE / dev_cap->bf_reg_size)) {
+ mlx4_dbg(dev, "log blue flame is invalid (%d), forcing 3\n", field & 0x1f);
+ field = 3;
+ }
+ dev_cap->bf_regs_per_page = 1 << (field & 0x3f);
+ mlx4_dbg(dev, "BlueFlame available (reg size %d, regs/page %d)\n",
+ dev_cap->bf_reg_size, dev_cap->bf_regs_per_page);
+ } else {
+ dev_cap->bf_reg_size = 0;
+ mlx4_dbg(dev, "BlueFlame not available\n");
+ }
+
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_SG_SQ_OFFSET);
+ dev_cap->max_sq_sg = field;
+ MLX4_GET(size, outbox, QUERY_DEV_CAP_MAX_DESC_SZ_SQ_OFFSET);
+ dev_cap->max_sq_desc_sz = size;
+
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_QP_MCG_OFFSET);
+ dev_cap->max_qp_per_mcg = 1 << field;
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_MCG_OFFSET);
+ dev_cap->reserved_mgms = field & 0xf;
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MCG_OFFSET);
+ dev_cap->max_mcgs = 1 << field;
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_PD_OFFSET);
+ dev_cap->reserved_pds = field >> 4;
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_PD_OFFSET);
+ dev_cap->max_pds = 1 << (field & 0x3f);
+
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_XRC_OFFSET);
+ dev_cap->reserved_xrcds = field >> 4;
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_XRC_OFFSET);
+ dev_cap->max_xrcds = 1 << (field & 0x1f);
+
+ MLX4_GET(size, outbox, QUERY_DEV_CAP_RDMARC_ENTRY_SZ_OFFSET);
+ dev_cap->rdmarc_entry_sz = size;
+ MLX4_GET(size, outbox, QUERY_DEV_CAP_QPC_ENTRY_SZ_OFFSET);
+ dev_cap->qpc_entry_sz = size;
+ MLX4_GET(size, outbox, QUERY_DEV_CAP_AUX_ENTRY_SZ_OFFSET);
+ dev_cap->aux_entry_sz = size;
+ MLX4_GET(size, outbox, QUERY_DEV_CAP_ALTC_ENTRY_SZ_OFFSET);
+ dev_cap->altc_entry_sz = size;
+ MLX4_GET(size, outbox, QUERY_DEV_CAP_EQC_ENTRY_SZ_OFFSET);
+ dev_cap->eqc_entry_sz = size;
+ MLX4_GET(size, outbox, QUERY_DEV_CAP_CQC_ENTRY_SZ_OFFSET);
+ dev_cap->cqc_entry_sz = size;
+ MLX4_GET(size, outbox, QUERY_DEV_CAP_SRQ_ENTRY_SZ_OFFSET);
+ dev_cap->srq_entry_sz = size;
+ MLX4_GET(size, outbox, QUERY_DEV_CAP_C_MPT_ENTRY_SZ_OFFSET);
+ dev_cap->cmpt_entry_sz = size;
+ MLX4_GET(size, outbox, QUERY_DEV_CAP_MTT_ENTRY_SZ_OFFSET);
+ dev_cap->mtt_entry_sz = size;
+ MLX4_GET(size, outbox, QUERY_DEV_CAP_D_MPT_ENTRY_SZ_OFFSET);
+ dev_cap->dmpt_entry_sz = size;
+
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_SRQ_SZ_OFFSET);
+ dev_cap->max_srq_sz = 1 << field;
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_QP_SZ_OFFSET);
+ dev_cap->max_qp_sz = 1 << field;
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_STAT_CFG_INL_OFFSET);
+ dev_cap->inline_cfg = field & 1;
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_RSZ_SRQ_OFFSET);
+ dev_cap->resize_srq = field & 1;
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_SG_RQ_OFFSET);
+ dev_cap->max_rq_sg = field;
+ MLX4_GET(size, outbox, QUERY_DEV_CAP_MAX_DESC_SZ_RQ_OFFSET);
+ dev_cap->max_rq_desc_sz = size;
+
+ MLX4_GET(dev_cap->bmme_flags, outbox,
+ QUERY_DEV_CAP_BMME_FLAGS_OFFSET);
+ MLX4_GET(dev_cap->reserved_lkey, outbox,
+ QUERY_DEV_CAP_RSVD_LKEY_OFFSET);
+ MLX4_GET(dev_cap->max_icm_sz, outbox,
+ QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET);
+ MLX4_GET(dev_cap->max_basic_counters, outbox,
+ QUERY_DEV_CAP_MAX_BASIC_CNT_OFFSET);
+ MLX4_GET(dev_cap->max_ext_counters, outbox,
+ QUERY_DEV_CAP_MAX_EXT_CNT_OFFSET);
+
+ if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) {
+ for (i = 1; i <= dev_cap->num_ports; ++i) {
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_VL_PORT_OFFSET);
+ dev_cap->max_vl[i] = field >> 4;
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_MTU_WIDTH_OFFSET);
+ dev_cap->ib_mtu[i] = field >> 4;
+ dev_cap->max_port_width[i] = field & 0xf;
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_GID_OFFSET);
+ dev_cap->max_gids[i] = 1 << (field & 0xf);
+ MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_PKEY_OFFSET);
+ dev_cap->max_pkeys[i] = 1 << (field & 0xf);
+ }
+ } else {
+#define QUERY_PORT_SUPPORTED_TYPE_OFFSET 0x00
+#define QUERY_PORT_MTU_OFFSET 0x01
+#define QUERY_PORT_ETH_MTU_OFFSET 0x02
+#define QUERY_PORT_WIDTH_OFFSET 0x06
+#define QUERY_PORT_MAX_GID_PKEY_OFFSET 0x07
+#define QUERY_PORT_MAX_MACVLAN_OFFSET 0x0a
+#define QUERY_PORT_MAX_VL_OFFSET 0x0b
+#define QUERY_PORT_MAC_OFFSET 0x10
+#define QUERY_PORT_TRANS_VENDOR_OFFSET 0x18
+#define QUERY_PORT_WAVELENGTH_OFFSET 0x1c
+#define QUERY_PORT_TRANS_CODE_OFFSET 0x20
+
+#define STAT_CFG_PORT_MODE (1 << 28)
+#define STAT_CFG_PORT_OFFSET 0x8
+#define STAT_CFG_PORT_MASK (1 << 20)
+#define STAT_CFG_MOD_INLINE 0x3
+
+ for (i = 1; i <= dev_cap->num_ports; ++i) {
+ err = mlx4_cmd_box(dev, 0, mailbox->dma, i, 0, MLX4_CMD_QUERY_PORT,
+ MLX4_CMD_TIME_CLASS_B);
+ if (err)
+ goto out;
+
+ MLX4_GET(field, outbox, QUERY_PORT_SUPPORTED_TYPE_OFFSET);
+ dev_cap->supported_port_types[i] = field & 3;
+ MLX4_GET(field, outbox, QUERY_PORT_MTU_OFFSET);
+ dev_cap->ib_mtu[i] = field & 0xf;
+ MLX4_GET(field, outbox, QUERY_PORT_WIDTH_OFFSET);
+ dev_cap->max_port_width[i] = field & 0xf;
+ MLX4_GET(field, outbox, QUERY_PORT_MAX_GID_PKEY_OFFSET);
+ dev_cap->max_gids[i] = 1 << (field >> 4);
+ dev_cap->max_pkeys[i] = 1 << (field & 0xf);
+ MLX4_GET(field, outbox, QUERY_PORT_MAX_VL_OFFSET);
+ dev_cap->max_vl[i] = field & 0xf;
+ MLX4_GET(field, outbox, QUERY_PORT_MAX_MACVLAN_OFFSET);
+ dev_cap->log_max_macs[i] = field & 0xf;
+ dev_cap->log_max_vlans[i] = field >> 4;
+ MLX4_GET(dev_cap->eth_mtu[i], outbox, QUERY_PORT_ETH_MTU_OFFSET);
+ MLX4_GET(dev_cap->def_mac[i], outbox, QUERY_PORT_MAC_OFFSET);
+ MLX4_GET(field32, outbox, QUERY_PORT_TRANS_VENDOR_OFFSET);
+ dev_cap->trans_type[i] = field32 >> 24;
+ dev_cap->vendor_oui[i] = field32 & 0xffffff;
+ MLX4_GET(dev_cap->wavelength[i], outbox, QUERY_PORT_WAVELENGTH_OFFSET);
+ MLX4_GET(dev_cap->trans_code[i], outbox, QUERY_PORT_TRANS_CODE_OFFSET);
+
+ /* Query stat cfg for port enablement */
+ if (dev_cap->inline_cfg) {
+ in_modifier = STAT_CFG_PORT_MODE | i << 8 |
+ STAT_CFG_PORT_OFFSET;
+ err = mlx4_cmd_imm(dev, 0, &out_param,
+ in_modifier,
+ STAT_CFG_MOD_INLINE,
+ MLX4_CMD_MOD_STAT_CFG,
+ MLX4_CMD_TIME_CLASS_B);
+ if (!err)
+ if (!(out_param & STAT_CFG_PORT_MASK))
+ dev_cap->supported_port_types[i] = 0;
+ }
+ }
+ }
+
+ mlx4_dbg(dev, "Base MM extensions: flags %08x, rsvd L_Key %08x\n",
+ dev_cap->bmme_flags, dev_cap->reserved_lkey);
+
+ /*
+ * Each UAR has 4 EQ doorbells; so if a UAR is reserved, then
+ * we can't use any EQs whose doorbell falls on that page,
+ * even if the EQ itself isn't reserved.
+ */
+ dev_cap->reserved_eqs = max(dev_cap->reserved_uars * 4,
+ dev_cap->reserved_eqs);
+
+ mlx4_dbg(dev, "Max ICM size %lld MB\n",
+ (unsigned long long) dev_cap->max_icm_sz >> 20);
+ mlx4_dbg(dev, "Max QPs: %d, reserved QPs: %d, entry size: %d\n",
+ dev_cap->max_qps, dev_cap->reserved_qps, dev_cap->qpc_entry_sz);
+ mlx4_dbg(dev, "Max SRQs: %d, reserved SRQs: %d, entry size: %d\n",
+ dev_cap->max_srqs, dev_cap->reserved_srqs, dev_cap->srq_entry_sz);
+ mlx4_dbg(dev, "Max CQs: %d, reserved CQs: %d, entry size: %d\n",
+ dev_cap->max_cqs, dev_cap->reserved_cqs, dev_cap->cqc_entry_sz);
+ mlx4_dbg(dev, "Max EQs: %d, reserved EQs: %d, entry size: %d\n",
+ dev_cap->max_eqs, dev_cap->reserved_eqs, dev_cap->eqc_entry_sz);
+ mlx4_dbg(dev, "reserved MPTs: %d, reserved MTTs: %d\n",
+ dev_cap->reserved_mrws, dev_cap->reserved_mtts);
+ mlx4_dbg(dev, "Max PDs: %d, reserved PDs: %d, reserved UARs: %d\n",
+ dev_cap->max_pds, dev_cap->reserved_pds, dev_cap->reserved_uars);
+ mlx4_dbg(dev, "Max QP/MCG: %d, reserved MGMs: %d\n",
+ dev_cap->max_pds, dev_cap->reserved_mgms);
+ mlx4_dbg(dev, "Max CQEs: %d, max WQEs: %d, max SRQ WQEs: %d\n",
+ dev_cap->max_cq_sz, dev_cap->max_qp_sz, dev_cap->max_srq_sz);
+ mlx4_dbg(dev, "Local CA ACK delay: %d, max MTU: %d, port width cap: %d\n",
+ dev_cap->local_ca_ack_delay, 128 << dev_cap->ib_mtu[1],
+ dev_cap->max_port_width[1]);
+ mlx4_dbg(dev, "Max SQ desc size: %d, max SQ S/G: %d\n",
+ dev_cap->max_sq_desc_sz, dev_cap->max_sq_sg);
+ mlx4_dbg(dev, "Max RQ desc size: %d, max RQ S/G: %d\n",
+ dev_cap->max_rq_desc_sz, dev_cap->max_rq_sg);
+ mlx4_dbg(dev, "Max GSO size: %d\n", dev_cap->max_gso_sz);
+
+ dump_dev_cap_flags(dev, dev_cap->flags);
+
+out:
+ mlx4_free_cmd_mailbox(dev, mailbox);
+ return err;
+}
+
+int mlx4_map_cmd(struct mlx4_dev *dev, u16 op, struct mlx4_icm *icm, u64 virt)
+{
+ struct mlx4_cmd_mailbox *mailbox;
+ struct mlx4_icm_iter iter;
+ __be64 *pages;
+ int lg;
+ int nent = 0;
+ int i;
+ int err = 0;
+ int ts = 0, tc = 0;
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ memset(mailbox->buf, 0, MLX4_MAILBOX_SIZE);
+ pages = mailbox->buf;
+
+ for (mlx4_icm_first(icm, &iter);
+ !mlx4_icm_last(&iter);
+ mlx4_icm_next(&iter)) {
+ /*
+ * We have to pass pages that are aligned to their
+ * size, so find the least significant 1 in the
+ * address or size and use that as our log2 size.
+ */
+ lg = ffs(mlx4_icm_addr(&iter) | mlx4_icm_size(&iter)) - 1;
+ if (lg < MLX4_ICM_PAGE_SHIFT) {
+ mlx4_warn(dev, "Got FW area not aligned to %d (%llx/%lx).\n",
+ MLX4_ICM_PAGE_SIZE,
+ (unsigned long long) mlx4_icm_addr(&iter),
+ mlx4_icm_size(&iter));
+ err = -EINVAL;
+ goto out;
+ }
+
+ for (i = 0; i < mlx4_icm_size(&iter) >> lg; ++i) {
+ if (virt != -1) {
+ pages[nent * 2] = cpu_to_be64(virt);
+ virt += 1 << lg;
+ }
+
+ pages[nent * 2 + 1] =
+ cpu_to_be64((mlx4_icm_addr(&iter) + (i << lg)) |
+ (lg - MLX4_ICM_PAGE_SHIFT));
+ ts += 1 << (lg - 10);
+ ++tc;
+
+ if (++nent == MLX4_MAILBOX_SIZE / 16) {
+ err = mlx4_cmd(dev, mailbox->dma, nent, 0, op,
+ MLX4_CMD_TIME_CLASS_B);
+ if (err)
+ goto out;
+ nent = 0;
+ }
+ }
+ }
+
+ if (nent)
+ err = mlx4_cmd(dev, mailbox->dma, nent, 0, op, MLX4_CMD_TIME_CLASS_B);
+ if (err)
+ goto out;
+
+ switch (op) {
+ case MLX4_CMD_MAP_FA:
+ mlx4_dbg(dev, "Mapped %d chunks/%d KB for FW.\n", tc, ts);
+ break;
+ case MLX4_CMD_MAP_ICM_AUX:
+ mlx4_dbg(dev, "Mapped %d chunks/%d KB for ICM aux.\n", tc, ts);
+ break;
+ case MLX4_CMD_MAP_ICM:
+ mlx4_dbg(dev, "Mapped %d chunks/%d KB at %llx for ICM.\n",
+ tc, ts, (unsigned long long) virt - (ts << 10));
+ break;
+ }
+
+out:
+ mlx4_free_cmd_mailbox(dev, mailbox);
+ return err;
+}
+
+int mlx4_MAP_FA(struct mlx4_dev *dev, struct mlx4_icm *icm)
+{
+ return mlx4_map_cmd(dev, MLX4_CMD_MAP_FA, icm, -1);
+}
+
+int mlx4_UNMAP_FA(struct mlx4_dev *dev)
+{
+ return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_UNMAP_FA, MLX4_CMD_TIME_CLASS_B);
+}
+
+
+int mlx4_RUN_FW(struct mlx4_dev *dev)
+{
+ return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_RUN_FW, MLX4_CMD_TIME_CLASS_A);
+}
+
+int mlx4_QUERY_FW(struct mlx4_dev *dev)
+{
+ struct mlx4_fw *fw = &mlx4_priv(dev)->fw;
+ struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd;
+ struct mlx4_cmd_mailbox *mailbox;
+ u32 *outbox;
+ int err = 0;
+ u64 fw_ver;
+ u16 cmd_if_rev;
+ u8 lg;
+
+#define QUERY_FW_OUT_SIZE 0x100
+#define QUERY_FW_VER_OFFSET 0x00
+#define MC_PROMISC_VER 0x2000702bcull
+#define QUERY_FW_CMD_IF_REV_OFFSET 0x0a
+#define QUERY_FW_MAX_CMD_OFFSET 0x0f
+#define QUERY_FW_ERR_START_OFFSET 0x30
+#define QUERY_FW_ERR_SIZE_OFFSET 0x38
+#define QUERY_FW_ERR_BAR_OFFSET 0x3c
+
+#define QUERY_FW_SIZE_OFFSET 0x00
+#define QUERY_FW_CLR_INT_BASE_OFFSET 0x20
+#define QUERY_FW_CLR_INT_BAR_OFFSET 0x28
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ outbox = mailbox->buf;
+
+ err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_FW,
+ MLX4_CMD_TIME_CLASS_A);
+ if (err)
+ goto out;
+
+ MLX4_GET(fw_ver, outbox, QUERY_FW_VER_OFFSET);
+ /*
+ * FW subminor version is at more significant bits than minor
+ * version, so swap here.
+ */
+ dev->caps.fw_ver = (fw_ver & 0xffff00000000ull) |
+ ((fw_ver & 0xffff0000ull) >> 16) |
+ ((fw_ver & 0x0000ffffull) << 16);
+ if (dev->caps.fw_ver < MC_PROMISC_VER)
+ dev->caps.mc_promisc_mode = 2;
+ else
+ dev->caps.mc_promisc_mode = 1;
+
+ MLX4_GET(cmd_if_rev, outbox, QUERY_FW_CMD_IF_REV_OFFSET);
+ if (cmd_if_rev < MLX4_COMMAND_INTERFACE_MIN_REV ||
+ cmd_if_rev > MLX4_COMMAND_INTERFACE_MAX_REV) {
+ mlx4_err(dev, "Installed FW has unsupported "
+ "command interface revision %d.\n",
+ cmd_if_rev);
+ mlx4_err(dev, "(Installed FW version is %d.%d.%03d)\n",
+ (int) (dev->caps.fw_ver >> 32),
+ (int) (dev->caps.fw_ver >> 16) & 0xffff,
+ (int) dev->caps.fw_ver & 0xffff);
+ mlx4_err(dev, "This driver version supports only revisions %d to %d.\n",
+ MLX4_COMMAND_INTERFACE_MIN_REV, MLX4_COMMAND_INTERFACE_MAX_REV);
+ err = -ENODEV;
+ goto out;
+ }
+
+ if (cmd_if_rev < MLX4_COMMAND_INTERFACE_NEW_PORT_CMDS)
+ dev->flags |= MLX4_FLAG_OLD_PORT_CMDS;
+
+ MLX4_GET(lg, outbox, QUERY_FW_MAX_CMD_OFFSET);
+ cmd->max_cmds = 1 << lg;
+
+ mlx4_dbg(dev, "FW version %d.%d.%03d (cmd intf rev %d), max commands %d\n",
+ (int) (dev->caps.fw_ver >> 32),
+ (int) (dev->caps.fw_ver >> 16) & 0xffff,
+ (int) dev->caps.fw_ver & 0xffff,
+ cmd_if_rev, cmd->max_cmds);
+
+ MLX4_GET(fw->catas_offset, outbox, QUERY_FW_ERR_START_OFFSET);
+ MLX4_GET(fw->catas_size, outbox, QUERY_FW_ERR_SIZE_OFFSET);
+ MLX4_GET(fw->catas_bar, outbox, QUERY_FW_ERR_BAR_OFFSET);
+ fw->catas_bar = (fw->catas_bar >> 6) * 2;
+
+ mlx4_dbg(dev, "Catastrophic error buffer at 0x%llx, size 0x%x, BAR %d\n",
+ (unsigned long long) fw->catas_offset, fw->catas_size, fw->catas_bar);
+
+ MLX4_GET(fw->fw_pages, outbox, QUERY_FW_SIZE_OFFSET);
+ MLX4_GET(fw->clr_int_base, outbox, QUERY_FW_CLR_INT_BASE_OFFSET);
+ MLX4_GET(fw->clr_int_bar, outbox, QUERY_FW_CLR_INT_BAR_OFFSET);
+ fw->clr_int_bar = (fw->clr_int_bar >> 6) * 2;
+
+ mlx4_dbg(dev, "FW size %d KB\n", fw->fw_pages >> 2);
+
+ /*
+ * Round up number of system pages needed in case
+ * MLX4_ICM_PAGE_SIZE < PAGE_SIZE.
+ */
+ fw->fw_pages =
+ ALIGN(fw->fw_pages, PAGE_SIZE / MLX4_ICM_PAGE_SIZE) >>
+ (PAGE_SHIFT - MLX4_ICM_PAGE_SHIFT);
+
+ mlx4_dbg(dev, "Clear int @ %llx, BAR %d\n",
+ (unsigned long long) fw->clr_int_base, fw->clr_int_bar);
+
+out:
+ mlx4_free_cmd_mailbox(dev, mailbox);
+ return err;
+}
+
+static void get_board_id(void *vsd, char *board_id)
+{
+ int i;
+
+#define VSD_OFFSET_SIG1 0x00
+#define VSD_OFFSET_SIG2 0xde
+#define VSD_OFFSET_MLX_BOARD_ID 0xd0
+#define VSD_OFFSET_TS_BOARD_ID 0x20
+
+#define VSD_SIGNATURE_TOPSPIN 0x5ad
+
+ memset(board_id, 0, MLX4_BOARD_ID_LEN);
+
+ if (be16_to_cpup(vsd + VSD_OFFSET_SIG1) == VSD_SIGNATURE_TOPSPIN &&
+ be16_to_cpup(vsd + VSD_OFFSET_SIG2) == VSD_SIGNATURE_TOPSPIN) {
+ strlcpy(board_id, vsd + VSD_OFFSET_TS_BOARD_ID, MLX4_BOARD_ID_LEN);
+ } else {
+ /*
+ * The board ID is a string but the firmware byte
+ * swaps each 4-byte word before passing it back to
+ * us. Therefore we need to swab it before printing.
+ */
+ for (i = 0; i < 4; ++i)
+ ((u32 *) board_id)[i] =
+ swab32(*(u32 *) (vsd + VSD_OFFSET_MLX_BOARD_ID + i * 4));
+ }
+}
+
+int mlx4_QUERY_ADAPTER(struct mlx4_dev *dev, struct mlx4_adapter *adapter)
+{
+ struct mlx4_cmd_mailbox *mailbox;
+ u32 *outbox;
+ int err;
+
+#define QUERY_ADAPTER_OUT_SIZE 0x100
+#define QUERY_ADAPTER_INTA_PIN_OFFSET 0x10
+#define QUERY_ADAPTER_VSD_OFFSET 0x20
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ outbox = mailbox->buf;
+
+ err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_ADAPTER,
+ MLX4_CMD_TIME_CLASS_A);
+ if (err)
+ goto out;
+
+ MLX4_GET(adapter->inta_pin, outbox, QUERY_ADAPTER_INTA_PIN_OFFSET);
+
+ get_board_id(outbox + QUERY_ADAPTER_VSD_OFFSET / 4,
+ adapter->board_id);
+
+out:
+ mlx4_free_cmd_mailbox(dev, mailbox);
+ return err;
+}
+
+int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param)
+{
+ struct mlx4_cmd_mailbox *mailbox;
+ __be32 *inbox;
+ int err;
+
+#define INIT_HCA_IN_SIZE 0x200
+#define INIT_HCA_VERSION_OFFSET 0x000
+#define INIT_HCA_VERSION 2
+#define INIT_HCA_CACHELINE_SZ_OFFSET 0x0e
+#define INIT_HCA_X86_64_BYTE_CACHELINE_SZ 0x40
+#define INIT_HCA_FLAGS_OFFSET 0x014
+#define INIT_HCA_QPC_OFFSET 0x020
+#define INIT_HCA_QPC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x10)
+#define INIT_HCA_LOG_QP_OFFSET (INIT_HCA_QPC_OFFSET + 0x17)
+#define INIT_HCA_SRQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x28)
+#define INIT_HCA_LOG_SRQ_OFFSET (INIT_HCA_QPC_OFFSET + 0x2f)
+#define INIT_HCA_CQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x30)
+#define INIT_HCA_LOG_CQ_OFFSET (INIT_HCA_QPC_OFFSET + 0x37)
+#define INIT_HCA_ALTC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x40)
+#define INIT_HCA_AUXC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x50)
+#define INIT_HCA_EQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x60)
+#define INIT_HCA_LOG_EQ_OFFSET (INIT_HCA_QPC_OFFSET + 0x67)
+#define INIT_HCA_RDMARC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x70)
+#define INIT_HCA_LOG_RD_OFFSET (INIT_HCA_QPC_OFFSET + 0x77)
+#define INIT_HCA_MCAST_OFFSET 0x0c0
+#define INIT_HCA_MC_BASE_OFFSET (INIT_HCA_MCAST_OFFSET + 0x00)
+#define INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x12)
+#define INIT_HCA_LOG_MC_HASH_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x16)
+#define INIT_HCA_LOG_MC_TABLE_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x1b)
+#define INIT_HCA_TPT_OFFSET 0x0f0
+#define INIT_HCA_DMPT_BASE_OFFSET (INIT_HCA_TPT_OFFSET + 0x00)
+#define INIT_HCA_LOG_MPT_SZ_OFFSET (INIT_HCA_TPT_OFFSET + 0x0b)
+#define INIT_HCA_MTT_BASE_OFFSET (INIT_HCA_TPT_OFFSET + 0x10)
+#define INIT_HCA_CMPT_BASE_OFFSET (INIT_HCA_TPT_OFFSET + 0x18)
+#define INIT_HCA_UAR_OFFSET 0x120
+#define INIT_HCA_LOG_UAR_SZ_OFFSET (INIT_HCA_UAR_OFFSET + 0x0a)
+#define INIT_HCA_UAR_PAGE_SZ_OFFSET (INIT_HCA_UAR_OFFSET + 0x0b)
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ inbox = mailbox->buf;
+
+ memset(inbox, 0, INIT_HCA_IN_SIZE);
+
+ *((u8 *) mailbox->buf + INIT_HCA_VERSION_OFFSET) = INIT_HCA_VERSION;
+#if defined(__x86_64__) || defined(__PPC64__)
+ *((u8 *) mailbox->buf + INIT_HCA_CACHELINE_SZ_OFFSET) = INIT_HCA_X86_64_BYTE_CACHELINE_SZ;
+#endif
+
+#if defined(__LITTLE_ENDIAN)
+ *(inbox + INIT_HCA_FLAGS_OFFSET / 4) &= ~cpu_to_be32(1 << 1);
+#elif defined(__BIG_ENDIAN)
+ *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 1);
+#else
+#error Host endianness not defined
+#endif
+ /* Check port for UD address vector: */
+ *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1);
+
+ /* Enable IPoIB checksumming if we can: */
+ if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IPOIB_CSUM)
+ *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 3);
+
+ /* Enable QoS support if module parameter set */
+ if (enable_qos)
+ *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 2);
+
+ /* counters mode */
+ *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |=
+ cpu_to_be32(dev->caps.counters_mode << 4);
+
+ /* QPC/EEC/CQC/EQC/RDMARC attributes */
+
+ MLX4_PUT(inbox, param->qpc_base, INIT_HCA_QPC_BASE_OFFSET);
+ MLX4_PUT(inbox, param->log_num_qps, INIT_HCA_LOG_QP_OFFSET);
+ MLX4_PUT(inbox, param->srqc_base, INIT_HCA_SRQC_BASE_OFFSET);
+ MLX4_PUT(inbox, param->log_num_srqs, INIT_HCA_LOG_SRQ_OFFSET);
+ MLX4_PUT(inbox, param->cqc_base, INIT_HCA_CQC_BASE_OFFSET);
+ MLX4_PUT(inbox, param->log_num_cqs, INIT_HCA_LOG_CQ_OFFSET);
+ MLX4_PUT(inbox, param->altc_base, INIT_HCA_ALTC_BASE_OFFSET);
+ MLX4_PUT(inbox, param->auxc_base, INIT_HCA_AUXC_BASE_OFFSET);
+ MLX4_PUT(inbox, param->eqc_base, INIT_HCA_EQC_BASE_OFFSET);
+ MLX4_PUT(inbox, param->log_num_eqs, INIT_HCA_LOG_EQ_OFFSET);
+ MLX4_PUT(inbox, param->rdmarc_base, INIT_HCA_RDMARC_BASE_OFFSET);
+ MLX4_PUT(inbox, param->log_rd_per_qp, INIT_HCA_LOG_RD_OFFSET);
+
+ /* multicast attributes */
+
+ MLX4_PUT(inbox, param->mc_base, INIT_HCA_MC_BASE_OFFSET);
+ MLX4_PUT(inbox, param->log_mc_entry_sz, INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET);
+ MLX4_PUT(inbox, param->log_mc_hash_sz, INIT_HCA_LOG_MC_HASH_SZ_OFFSET);
+ MLX4_PUT(inbox, param->log_mc_table_sz, INIT_HCA_LOG_MC_TABLE_SZ_OFFSET);
+
+ /* TPT attributes */
+
+ MLX4_PUT(inbox, param->dmpt_base, INIT_HCA_DMPT_BASE_OFFSET);
+ MLX4_PUT(inbox, param->log_mpt_sz, INIT_HCA_LOG_MPT_SZ_OFFSET);
+ MLX4_PUT(inbox, param->mtt_base, INIT_HCA_MTT_BASE_OFFSET);
+ MLX4_PUT(inbox, param->cmpt_base, INIT_HCA_CMPT_BASE_OFFSET);
+
+ /* UAR attributes */
+
+ MLX4_PUT(inbox, (u8) (PAGE_SHIFT - 12), INIT_HCA_UAR_PAGE_SZ_OFFSET);
+ MLX4_PUT(inbox, param->log_uar_sz, INIT_HCA_LOG_UAR_SZ_OFFSET);
+ if (!mlx4_pre_t11_mode && dev->caps.flags & (u32) MLX4_DEV_CAP_FLAG_FC_T11)
+ *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 10);
+
+
+ err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_INIT_HCA, 10000);
+
+ if (err)
+ mlx4_err(dev, "INIT_HCA returns %d\n", err);
+
+ mlx4_free_cmd_mailbox(dev, mailbox);
+ return err;
+}
+
+int mlx4_INIT_PORT(struct mlx4_dev *dev, int port)
+{
+ struct mlx4_cmd_mailbox *mailbox;
+ u32 *inbox;
+ int err;
+ u32 flags;
+ u16 field;
+
+ if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) {
+#define INIT_PORT_IN_SIZE 256
+#define INIT_PORT_FLAGS_OFFSET 0x00
+#define INIT_PORT_FLAG_SIG (1 << 18)
+#define INIT_PORT_FLAG_NG (1 << 17)
+#define INIT_PORT_FLAG_G0 (1 << 16)
+#define INIT_PORT_VL_SHIFT 4
+#define INIT_PORT_PORT_WIDTH_SHIFT 8
+#define INIT_PORT_MTU_OFFSET 0x04
+#define INIT_PORT_MAX_GID_OFFSET 0x06
+#define INIT_PORT_MAX_PKEY_OFFSET 0x0a
+#define INIT_PORT_GUID0_OFFSET 0x10
+#define INIT_PORT_NODE_GUID_OFFSET 0x18
+#define INIT_PORT_SI_GUID_OFFSET 0x20
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ inbox = mailbox->buf;
+
+ memset(inbox, 0, INIT_PORT_IN_SIZE);
+
+ flags = 0;
+ flags |= (dev->caps.vl_cap[port] & 0xf) << INIT_PORT_VL_SHIFT;
+ flags |= (dev->caps.port_width_cap[port] & 0xf) << INIT_PORT_PORT_WIDTH_SHIFT;
+ MLX4_PUT(inbox, flags, INIT_PORT_FLAGS_OFFSET);
+
+ field = 128 << dev->caps.ib_mtu_cap[port];
+ MLX4_PUT(inbox, field, INIT_PORT_MTU_OFFSET);
+ field = dev->caps.gid_table_len[port];
+ MLX4_PUT(inbox, field, INIT_PORT_MAX_GID_OFFSET);
+ field = dev->caps.pkey_table_len[port];
+ MLX4_PUT(inbox, field, INIT_PORT_MAX_PKEY_OFFSET);
+
+ err = mlx4_cmd(dev, mailbox->dma, port, 0, MLX4_CMD_INIT_PORT,
+ MLX4_CMD_TIME_CLASS_A);
+
+ mlx4_free_cmd_mailbox(dev, mailbox);
+ } else
+ err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_INIT_PORT,
+ MLX4_CMD_TIME_CLASS_A);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_INIT_PORT);
+
+int mlx4_CLOSE_PORT(struct mlx4_dev *dev, int port)
+{
+ return mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT, 1000);
+}
+EXPORT_SYMBOL_GPL(mlx4_CLOSE_PORT);
+
+int mlx4_CLOSE_HCA(struct mlx4_dev *dev, int panic)
+{
+ return mlx4_cmd(dev, 0, 0, panic, MLX4_CMD_CLOSE_HCA, 1000);
+}
+
+int mlx4_SET_ICM_SIZE(struct mlx4_dev *dev, u64 icm_size, u64 *aux_pages)
+{
+ int ret = mlx4_cmd_imm(dev, icm_size, aux_pages, 0, 0,
+ MLX4_CMD_SET_ICM_SIZE,
+ MLX4_CMD_TIME_CLASS_A);
+ if (ret)
+ return ret;
+
+ /*
+ * Round up number of system pages needed in case
+ * MLX4_ICM_PAGE_SIZE < PAGE_SIZE.
+ */
+ *aux_pages = ALIGN(*aux_pages, PAGE_SIZE / MLX4_ICM_PAGE_SIZE) >>
+ (PAGE_SHIFT - MLX4_ICM_PAGE_SHIFT);
+
+ return 0;
+}
+
+int mlx4_NOP(struct mlx4_dev *dev)
+{
+ /* Input modifier of 0x1f means "finish as soon as possible." */
+ return mlx4_cmd(dev, 0, 0x1f, 0, MLX4_CMD_NOP, 100);
+}
+
+int mlx4_query_diag_counters(struct mlx4_dev *dev, int array_length,
+ u8 op_modifier, u32 in_offset[], u32 counter_out[])
+{
+ struct mlx4_cmd_mailbox *mailbox;
+ u32 *outbox;
+ int ret;
+ int i;
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ outbox = mailbox->buf;
+
+ ret = mlx4_cmd_box(dev, 0, mailbox->dma, 0, op_modifier,
+ MLX4_CMD_DIAG_RPRT, MLX4_CMD_TIME_CLASS_A);
+ if (ret)
+ goto out;
+
+ for (i=0; i < array_length; i++) {
+ if (in_offset[i] > MLX4_MAILBOX_SIZE) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ MLX4_GET(counter_out[i], outbox, in_offset[i]);
+ }
+
+out:
+ mlx4_free_cmd_mailbox(dev, mailbox);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(mlx4_query_diag_counters);
+
+void mlx4_get_fc_t11_settings(struct mlx4_dev *dev, int *enable_pre_t11, int *t11_supported)
+{
+ *enable_pre_t11 = !!mlx4_pre_t11_mode;
+ *t11_supported = !!(dev->caps.flags & MLX4_DEV_CAP_FLAG_FC_T11);
+}
+EXPORT_SYMBOL_GPL(mlx4_get_fc_t11_settings);
diff --git a/sys/ofed/drivers/net/mlx4/fw.h b/sys/ofed/drivers/net/mlx4/fw.h
new file mode 100644
index 0000000..fbdd95e
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/fw.h
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006, 2007 Cisco Systems. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_FW_H
+#define MLX4_FW_H
+
+#include "mlx4.h"
+#include "icm.h"
+
+struct mlx4_mod_stat_cfg {
+ u8 log_pg_sz;
+ u8 log_pg_sz_m;
+};
+
+struct mlx4_dev_cap {
+ int max_srq_sz;
+ int max_qp_sz;
+ int reserved_qps;
+ int max_qps;
+ int reserved_srqs;
+ int max_srqs;
+ int max_cq_sz;
+ int reserved_cqs;
+ int max_cqs;
+ int max_mpts;
+ int reserved_eqs;
+ int max_eqs;
+ int reserved_mtts;
+ int max_mrw_sz;
+ int reserved_mrws;
+ int max_mtt_seg;
+ int max_requester_per_qp;
+ int max_responder_per_qp;
+ int max_rdma_global;
+ int local_ca_ack_delay;
+ int num_ports;
+ u32 max_msg_sz;
+ int ib_mtu[MLX4_MAX_PORTS + 1];
+ int max_port_width[MLX4_MAX_PORTS + 1];
+ int max_vl[MLX4_MAX_PORTS + 1];
+ int max_gids[MLX4_MAX_PORTS + 1];
+ int max_pkeys[MLX4_MAX_PORTS + 1];
+ u64 def_mac[MLX4_MAX_PORTS + 1];
+ u16 eth_mtu[MLX4_MAX_PORTS + 1];
+ int trans_type[MLX4_MAX_PORTS + 1];
+ int vendor_oui[MLX4_MAX_PORTS + 1];
+ u16 wavelength[MLX4_MAX_PORTS + 1];
+ u64 trans_code[MLX4_MAX_PORTS + 1];
+ u16 stat_rate_support;
+ int udp_rss;
+ int loopback_support;
+ u64 flags;
+ int reserved_uars;
+ int uar_size;
+ int min_page_sz;
+ int bf_reg_size;
+ int bf_regs_per_page;
+ int max_sq_sg;
+ int max_sq_desc_sz;
+ int max_rq_sg;
+ int max_rq_desc_sz;
+ int max_qp_per_mcg;
+ int reserved_mgms;
+ int max_mcgs;
+ int reserved_pds;
+ int max_pds;
+ int reserved_xrcds;
+ int max_xrcds;
+ int qpc_entry_sz;
+ int rdmarc_entry_sz;
+ int altc_entry_sz;
+ int aux_entry_sz;
+ int srq_entry_sz;
+ int cqc_entry_sz;
+ int eqc_entry_sz;
+ int dmpt_entry_sz;
+ int cmpt_entry_sz;
+ int mtt_entry_sz;
+ int inline_cfg;
+ int resize_srq;
+ u32 bmme_flags;
+ u32 reserved_lkey;
+ u64 max_icm_sz;
+ int max_gso_sz;
+ u8 supported_port_types[MLX4_MAX_PORTS + 1];
+ u8 log_max_macs[MLX4_MAX_PORTS + 1];
+ u8 log_max_vlans[MLX4_MAX_PORTS + 1];
+ u32 max_basic_counters;
+ u32 max_ext_counters;
+};
+
+struct mlx4_adapter {
+ char board_id[MLX4_BOARD_ID_LEN];
+ u8 inta_pin;
+};
+
+struct mlx4_init_hca_param {
+ u64 qpc_base;
+ u64 rdmarc_base;
+ u64 auxc_base;
+ u64 altc_base;
+ u64 srqc_base;
+ u64 cqc_base;
+ u64 eqc_base;
+ u64 mc_base;
+ u64 dmpt_base;
+ u64 cmpt_base;
+ u64 mtt_base;
+ u16 log_mc_entry_sz;
+ u16 log_mc_hash_sz;
+ u8 log_num_qps;
+ u8 log_num_srqs;
+ u8 log_num_cqs;
+ u8 log_num_eqs;
+ u8 log_rd_per_qp;
+ u8 log_mc_table_sz;
+ u8 log_mpt_sz;
+ u8 log_uar_sz;
+};
+
+struct mlx4_init_ib_param {
+ int port_width;
+ int vl_cap;
+ int mtu_cap;
+ u16 gid_cap;
+ u16 pkey_cap;
+ int set_guid0;
+ u64 guid0;
+ int set_node_guid;
+ u64 node_guid;
+ int set_si_guid;
+ u64 si_guid;
+};
+
+struct mlx4_set_ib_param {
+ int set_si_guid;
+ int reset_qkey_viol;
+ u64 si_guid;
+ u32 cap_mask;
+};
+
+int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap);
+int mlx4_MAP_FA(struct mlx4_dev *dev, struct mlx4_icm *icm);
+int mlx4_UNMAP_FA(struct mlx4_dev *dev);
+int mlx4_RUN_FW(struct mlx4_dev *dev);
+int mlx4_QUERY_FW(struct mlx4_dev *dev);
+int mlx4_QUERY_ADAPTER(struct mlx4_dev *dev, struct mlx4_adapter *adapter);
+int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param);
+int mlx4_CLOSE_HCA(struct mlx4_dev *dev, int panic);
+int mlx4_map_cmd(struct mlx4_dev *dev, u16 op, struct mlx4_icm *icm, u64 virt);
+int mlx4_SET_ICM_SIZE(struct mlx4_dev *dev, u64 icm_size, u64 *aux_pages);
+int mlx4_NOP(struct mlx4_dev *dev);
+int mlx4_MOD_STAT_CFG(struct mlx4_dev *dev, struct mlx4_mod_stat_cfg *cfg);
+
+#endif /* MLX4_FW_H */
diff --git a/sys/ofed/drivers/net/mlx4/icm.c b/sys/ofed/drivers/net/mlx4/icm.c
new file mode 100644
index 0000000..3a14d6b
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/icm.c
@@ -0,0 +1,455 @@
+/*
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/scatterlist.h>
+
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4.h"
+#include "icm.h"
+#include "fw.h"
+
+/*
+ * We allocate in as big chunks as we can, up to a maximum of 256 KB
+ * per chunk.
+ */
+enum {
+ MLX4_ICM_ALLOC_SIZE = 1 << 18,
+ MLX4_TABLE_CHUNK_SIZE = 1 << 18
+};
+
+static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct mlx4_icm_chunk *chunk)
+{
+ int i;
+
+ if (chunk->nsg > 0)
+ pci_unmap_sg(dev->pdev, chunk->mem, chunk->npages,
+ PCI_DMA_BIDIRECTIONAL);
+
+ for (i = 0; i < chunk->npages; ++i)
+ __free_pages(sg_page(&chunk->mem[i]),
+ get_order(chunk->mem[i].length));
+}
+
+static void mlx4_free_icm_coherent(struct mlx4_dev *dev, struct mlx4_icm_chunk *chunk)
+{
+ int i;
+
+ for (i = 0; i < chunk->npages; ++i)
+ dma_free_coherent(&dev->pdev->dev, chunk->mem[i].length,
+ lowmem_page_address(sg_page(&chunk->mem[i])),
+ sg_dma_address(&chunk->mem[i]));
+}
+
+void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm, int coherent)
+{
+ struct mlx4_icm_chunk *chunk, *tmp;
+
+ if (!icm)
+ return;
+
+ list_for_each_entry_safe(chunk, tmp, &icm->chunk_list, list) {
+ if (coherent)
+ mlx4_free_icm_coherent(dev, chunk);
+ else
+ mlx4_free_icm_pages(dev, chunk);
+
+ kfree(chunk);
+ }
+
+ kfree(icm);
+}
+
+static int mlx4_alloc_icm_pages(struct scatterlist *mem, int order, gfp_t gfp_mask)
+{
+ struct page *page;
+
+ page = alloc_pages(gfp_mask, order);
+ if (!page)
+ return -ENOMEM;
+
+ sg_set_page(mem, page, PAGE_SIZE << order, 0);
+ return 0;
+}
+
+static int mlx4_alloc_icm_coherent(struct device *dev, struct scatterlist *mem,
+ int order, gfp_t gfp_mask)
+{
+ void *buf = dma_alloc_coherent(dev, PAGE_SIZE << order,
+ &sg_dma_address(mem), gfp_mask);
+ if (!buf)
+ return -ENOMEM;
+
+ sg_set_buf(mem, buf, PAGE_SIZE << order);
+ BUG_ON(mem->offset);
+ sg_dma_len(mem) = PAGE_SIZE << order;
+ return 0;
+}
+
+struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages,
+ gfp_t gfp_mask, int coherent)
+{
+ struct mlx4_icm *icm;
+ struct mlx4_icm_chunk *chunk = NULL;
+ int cur_order;
+ int ret;
+
+ /* We use sg_set_buf for coherent allocs, which assumes low memory */
+ BUG_ON(coherent && (gfp_mask & __GFP_HIGHMEM));
+
+ icm = kmalloc(sizeof *icm, gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN));
+ if (!icm)
+ return NULL;
+
+ icm->refcount = 0;
+ INIT_LIST_HEAD(&icm->chunk_list);
+
+ cur_order = get_order(MLX4_ICM_ALLOC_SIZE);
+
+ while (npages > 0) {
+ if (!chunk) {
+ chunk = kmalloc(sizeof *chunk,
+ gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN));
+ if (!chunk)
+ goto fail;
+
+ sg_init_table(chunk->mem, MLX4_ICM_CHUNK_LEN);
+ chunk->npages = 0;
+ chunk->nsg = 0;
+ list_add_tail(&chunk->list, &icm->chunk_list);
+ }
+
+ while (1 << cur_order > npages)
+ --cur_order;
+
+ if (coherent)
+ ret = mlx4_alloc_icm_coherent(&dev->pdev->dev,
+ &chunk->mem[chunk->npages],
+ cur_order, gfp_mask);
+ else
+ ret = mlx4_alloc_icm_pages(&chunk->mem[chunk->npages],
+ cur_order, gfp_mask);
+
+ if (!ret) {
+ ++chunk->npages;
+
+ if (coherent)
+ ++chunk->nsg;
+ else if (chunk->npages == MLX4_ICM_CHUNK_LEN) {
+ chunk->nsg = pci_map_sg(dev->pdev, chunk->mem,
+ chunk->npages,
+ PCI_DMA_BIDIRECTIONAL);
+
+ if (chunk->nsg <= 0)
+ goto fail;
+ }
+
+ if (chunk->npages == MLX4_ICM_CHUNK_LEN)
+ chunk = NULL;
+
+ npages -= 1 << cur_order;
+ } else {
+ --cur_order;
+ if (cur_order < 0)
+ goto fail;
+ }
+ }
+
+ if (!coherent && chunk) {
+ chunk->nsg = pci_map_sg(dev->pdev, chunk->mem,
+ chunk->npages,
+ PCI_DMA_BIDIRECTIONAL);
+
+ if (chunk->nsg <= 0)
+ goto fail;
+ }
+
+ return icm;
+
+fail:
+ mlx4_free_icm(dev, icm, coherent);
+ return NULL;
+}
+
+static int mlx4_MAP_ICM(struct mlx4_dev *dev, struct mlx4_icm *icm, u64 virt)
+{
+ return mlx4_map_cmd(dev, MLX4_CMD_MAP_ICM, icm, virt);
+}
+
+int mlx4_UNMAP_ICM(struct mlx4_dev *dev, u64 virt, u32 page_count)
+{
+ return mlx4_cmd(dev, virt, page_count, 0, MLX4_CMD_UNMAP_ICM,
+ MLX4_CMD_TIME_CLASS_B);
+}
+
+int mlx4_MAP_ICM_page(struct mlx4_dev *dev, u64 dma_addr, u64 virt)
+{
+ struct mlx4_cmd_mailbox *mailbox;
+ __be64 *inbox;
+ int err;
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ inbox = mailbox->buf;
+
+ inbox[0] = cpu_to_be64(virt);
+ inbox[1] = cpu_to_be64(dma_addr);
+
+ err = mlx4_cmd(dev, mailbox->dma, 1, 0, MLX4_CMD_MAP_ICM,
+ MLX4_CMD_TIME_CLASS_B);
+
+ mlx4_free_cmd_mailbox(dev, mailbox);
+
+ if (!err)
+ mlx4_dbg(dev, "Mapped page at %llx to %llx for ICM.\n",
+ (unsigned long long) dma_addr, (unsigned long long) virt);
+
+ return err;
+}
+
+int mlx4_MAP_ICM_AUX(struct mlx4_dev *dev, struct mlx4_icm *icm)
+{
+ return mlx4_map_cmd(dev, MLX4_CMD_MAP_ICM_AUX, icm, -1);
+}
+
+int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev)
+{
+ return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_UNMAP_ICM_AUX, MLX4_CMD_TIME_CLASS_B);
+}
+
+int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj)
+{
+ int i = (obj & (table->num_obj - 1)) / (MLX4_TABLE_CHUNK_SIZE / table->obj_size);
+ int ret = 0;
+
+ mutex_lock(&table->mutex);
+
+ if (table->icm[i]) {
+ ++table->icm[i]->refcount;
+ goto out;
+ }
+
+ table->icm[i] = mlx4_alloc_icm(dev, MLX4_TABLE_CHUNK_SIZE >> PAGE_SHIFT,
+ (table->lowmem ? GFP_KERNEL : GFP_HIGHUSER) |
+ __GFP_NOWARN, table->coherent);
+ if (!table->icm[i]) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (mlx4_MAP_ICM(dev, table->icm[i], table->virt +
+ (u64) i * MLX4_TABLE_CHUNK_SIZE)) {
+ mlx4_free_icm(dev, table->icm[i], table->coherent);
+ table->icm[i] = NULL;
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ++table->icm[i]->refcount;
+
+out:
+ mutex_unlock(&table->mutex);
+ return ret;
+}
+
+void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj)
+{
+ int i;
+
+ i = (obj & (table->num_obj - 1)) / (MLX4_TABLE_CHUNK_SIZE / table->obj_size);
+
+ mutex_lock(&table->mutex);
+
+ if (--table->icm[i]->refcount == 0) {
+ mlx4_UNMAP_ICM(dev, table->virt + i * MLX4_TABLE_CHUNK_SIZE,
+ MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE);
+ mlx4_free_icm(dev, table->icm[i], table->coherent);
+ table->icm[i] = NULL;
+ }
+
+ mutex_unlock(&table->mutex);
+}
+
+void *mlx4_table_find(struct mlx4_icm_table *table, int obj, dma_addr_t *dma_handle)
+{
+ int idx, offset, dma_offset, i;
+ struct mlx4_icm_chunk *chunk;
+ struct mlx4_icm *icm;
+ struct page *page = NULL;
+
+ if (!table->lowmem)
+ return NULL;
+
+ mutex_lock(&table->mutex);
+
+ idx = (obj & (table->num_obj - 1)) * table->obj_size;
+ icm = table->icm[idx / MLX4_TABLE_CHUNK_SIZE];
+ dma_offset = offset = idx % MLX4_TABLE_CHUNK_SIZE;
+
+ if (!icm)
+ goto out;
+
+ list_for_each_entry(chunk, &icm->chunk_list, list) {
+ for (i = 0; i < chunk->npages; ++i) {
+ if (dma_handle && dma_offset >= 0) {
+ if (sg_dma_len(&chunk->mem[i]) > dma_offset)
+ *dma_handle = sg_dma_address(&chunk->mem[i]) +
+ dma_offset;
+ dma_offset -= sg_dma_len(&chunk->mem[i]);
+ }
+ /*
+ * DMA mapping can merge pages but not split them,
+ * so if we found the page, dma_handle has already
+ * been assigned to.
+ */
+ if (chunk->mem[i].length > offset) {
+ page = sg_page(&chunk->mem[i]);
+ goto out;
+ }
+ offset -= chunk->mem[i].length;
+ }
+ }
+
+out:
+ mutex_unlock(&table->mutex);
+ return page ? lowmem_page_address(page) + offset : NULL;
+}
+
+int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table,
+ int start, int end)
+{
+ int inc = MLX4_TABLE_CHUNK_SIZE / table->obj_size;
+ int i, err;
+
+ for (i = start; i <= end; i += inc) {
+ err = mlx4_table_get(dev, table, i);
+ if (err)
+ goto fail;
+ }
+
+ return 0;
+
+fail:
+ while (i > start) {
+ i -= inc;
+ mlx4_table_put(dev, table, i);
+ }
+
+ return err;
+}
+
+void mlx4_table_put_range(struct mlx4_dev *dev, struct mlx4_icm_table *table,
+ int start, int end)
+{
+ int i;
+
+ for (i = start; i <= end; i += MLX4_TABLE_CHUNK_SIZE / table->obj_size)
+ mlx4_table_put(dev, table, i);
+}
+
+int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table,
+ u64 virt, int obj_size, int nobj, int reserved,
+ int use_lowmem, int use_coherent)
+{
+ int obj_per_chunk;
+ int num_icm;
+ unsigned chunk_size;
+ int i;
+
+ obj_per_chunk = MLX4_TABLE_CHUNK_SIZE / obj_size;
+ num_icm = (nobj + obj_per_chunk - 1) / obj_per_chunk;
+
+ table->icm = kcalloc(num_icm, sizeof *table->icm, GFP_KERNEL);
+ if (!table->icm)
+ return -ENOMEM;
+ table->virt = virt;
+ table->num_icm = num_icm;
+ table->num_obj = nobj;
+ table->obj_size = obj_size;
+ table->lowmem = use_lowmem;
+ table->coherent = use_coherent;
+ mutex_init(&table->mutex);
+
+ for (i = 0; i * MLX4_TABLE_CHUNK_SIZE < reserved * obj_size; ++i) {
+ chunk_size = MLX4_TABLE_CHUNK_SIZE;
+ if ((i + 1) * MLX4_TABLE_CHUNK_SIZE > nobj * obj_size)
+ chunk_size = PAGE_ALIGN(nobj * obj_size - i * MLX4_TABLE_CHUNK_SIZE);
+
+ table->icm[i] = mlx4_alloc_icm(dev, chunk_size >> PAGE_SHIFT,
+ (use_lowmem ? GFP_KERNEL : GFP_HIGHUSER) |
+ __GFP_NOWARN, use_coherent);
+ if (!table->icm[i])
+ goto err;
+ if (mlx4_MAP_ICM(dev, table->icm[i], virt + i * MLX4_TABLE_CHUNK_SIZE)) {
+ mlx4_free_icm(dev, table->icm[i], use_coherent);
+ table->icm[i] = NULL;
+ goto err;
+ }
+
+ /*
+ * Add a reference to this ICM chunk so that it never
+ * gets freed (since it contains reserved firmware objects).
+ */
+ ++table->icm[i]->refcount;
+ }
+
+ return 0;
+
+err:
+ for (i = 0; i < num_icm; ++i)
+ if (table->icm[i]) {
+ mlx4_UNMAP_ICM(dev, virt + i * MLX4_TABLE_CHUNK_SIZE,
+ MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE);
+ mlx4_free_icm(dev, table->icm[i], use_coherent);
+ }
+
+ return -ENOMEM;
+}
+
+void mlx4_cleanup_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table)
+{
+ int i;
+
+ for (i = 0; i < table->num_icm; ++i)
+ if (table->icm[i]) {
+ mlx4_UNMAP_ICM(dev, table->virt + i * MLX4_TABLE_CHUNK_SIZE,
+ MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE);
+ mlx4_free_icm(dev, table->icm[i], table->coherent);
+ }
+
+ kfree(table->icm);
+}
diff --git a/sys/ofed/drivers/net/mlx4/icm.h b/sys/ofed/drivers/net/mlx4/icm.h
new file mode 100644
index 0000000..b87f726
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/icm.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_ICM_H
+#define MLX4_ICM_H
+
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/mutex.h>
+
+#define MLX4_ICM_CHUNK_LEN \
+ ((256 - sizeof (struct list_head) - 2 * sizeof (int)) / \
+ (sizeof (struct scatterlist)))
+
+enum {
+ MLX4_ICM_PAGE_SHIFT = 12,
+ MLX4_ICM_PAGE_SIZE = 1 << MLX4_ICM_PAGE_SHIFT,
+};
+
+struct mlx4_icm_chunk {
+ struct list_head list;
+ int npages;
+ int nsg;
+ struct scatterlist mem[MLX4_ICM_CHUNK_LEN];
+};
+
+struct mlx4_icm {
+ struct list_head chunk_list;
+ int refcount;
+};
+
+struct mlx4_icm_iter {
+ struct mlx4_icm *icm;
+ struct mlx4_icm_chunk *chunk;
+ int page_idx;
+};
+
+struct mlx4_dev;
+
+struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages,
+ gfp_t gfp_mask, int coherent);
+void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm, int coherent);
+
+int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table,
+ u64 virt, int obj_size, int nobj, int reserved,
+ int use_lowmem, int use_coherent);
+void mlx4_cleanup_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table);
+int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj);
+void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj);
+void *mlx4_table_find(struct mlx4_icm_table *table, int obj, dma_addr_t *dma_handle);
+int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table,
+ int start, int end);
+void mlx4_table_put_range(struct mlx4_dev *dev, struct mlx4_icm_table *table,
+ int start, int end);
+
+static inline void mlx4_icm_first(struct mlx4_icm *icm,
+ struct mlx4_icm_iter *iter)
+{
+ iter->icm = icm;
+ iter->chunk = list_empty(&icm->chunk_list) ?
+ NULL : list_entry(icm->chunk_list.next,
+ struct mlx4_icm_chunk, list);
+ iter->page_idx = 0;
+}
+
+static inline int mlx4_icm_last(struct mlx4_icm_iter *iter)
+{
+ return !iter->chunk;
+}
+
+static inline void mlx4_icm_next(struct mlx4_icm_iter *iter)
+{
+ if (++iter->page_idx >= iter->chunk->nsg) {
+ if (iter->chunk->list.next == &iter->icm->chunk_list) {
+ iter->chunk = NULL;
+ return;
+ }
+
+ iter->chunk = list_entry(iter->chunk->list.next,
+ struct mlx4_icm_chunk, list);
+ iter->page_idx = 0;
+ }
+}
+
+static inline dma_addr_t mlx4_icm_addr(struct mlx4_icm_iter *iter)
+{
+ return sg_dma_address(&iter->chunk->mem[iter->page_idx]);
+}
+
+static inline unsigned long mlx4_icm_size(struct mlx4_icm_iter *iter)
+{
+ return sg_dma_len(&iter->chunk->mem[iter->page_idx]);
+}
+
+int mlx4_UNMAP_ICM(struct mlx4_dev *dev, u64 virt, u32 page_count);
+int mlx4_MAP_ICM_page(struct mlx4_dev *dev, u64 dma_addr, u64 virt);
+int mlx4_MAP_ICM_AUX(struct mlx4_dev *dev, struct mlx4_icm *icm);
+int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev);
+
+#endif /* MLX4_ICM_H */
diff --git a/sys/ofed/drivers/net/mlx4/intf.c b/sys/ofed/drivers/net/mlx4/intf.c
new file mode 100644
index 0000000..bdf7e7d
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/intf.c
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "mlx4.h"
+
+struct mlx4_device_context {
+ struct list_head list;
+ struct mlx4_interface *intf;
+ void *context;
+};
+
+static LIST_HEAD(intf_list);
+static LIST_HEAD(dev_list);
+static DEFINE_MUTEX(intf_mutex);
+
+static void mlx4_add_device(struct mlx4_interface *intf, struct mlx4_priv *priv)
+{
+ struct mlx4_device_context *dev_ctx;
+
+ dev_ctx = kmalloc(sizeof *dev_ctx, GFP_KERNEL);
+ if (!dev_ctx)
+ return;
+
+ dev_ctx->intf = intf;
+ dev_ctx->context = intf->add(&priv->dev);
+
+ if (dev_ctx->context) {
+ spin_lock_irq(&priv->ctx_lock);
+ list_add_tail(&dev_ctx->list, &priv->ctx_list);
+ spin_unlock_irq(&priv->ctx_lock);
+ } else
+ kfree(dev_ctx);
+}
+
+static void mlx4_remove_device(struct mlx4_interface *intf, struct mlx4_priv *priv)
+{
+ struct mlx4_device_context *dev_ctx;
+
+ list_for_each_entry(dev_ctx, &priv->ctx_list, list)
+ if (dev_ctx->intf == intf) {
+ spin_lock_irq(&priv->ctx_lock);
+ list_del(&dev_ctx->list);
+ spin_unlock_irq(&priv->ctx_lock);
+
+ intf->remove(&priv->dev, dev_ctx->context);
+ kfree(dev_ctx);
+ return;
+ }
+}
+
+int mlx4_register_interface(struct mlx4_interface *intf)
+{
+ struct mlx4_priv *priv;
+
+ if (!intf->add || !intf->remove)
+ return -EINVAL;
+
+ mutex_lock(&intf_mutex);
+
+ list_add_tail(&intf->list, &intf_list);
+ list_for_each_entry(priv, &dev_list, dev_list)
+ mlx4_add_device(intf, priv);
+
+ mutex_unlock(&intf_mutex);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_register_interface);
+
+void mlx4_unregister_interface(struct mlx4_interface *intf)
+{
+ struct mlx4_priv *priv;
+
+ mutex_lock(&intf_mutex);
+
+ list_for_each_entry(priv, &dev_list, dev_list)
+ mlx4_remove_device(intf, priv);
+
+ list_del(&intf->list);
+
+ mutex_unlock(&intf_mutex);
+}
+EXPORT_SYMBOL_GPL(mlx4_unregister_interface);
+
+struct mlx4_dev *mlx4_query_interface(void *int_dev, int *port)
+{
+ struct mlx4_priv *priv;
+ struct mlx4_device_context *dev_ctx;
+ enum mlx4_query_reply r;
+ unsigned long flags;
+
+ mutex_lock(&intf_mutex);
+
+ list_for_each_entry(priv, &dev_list, dev_list) {
+ spin_lock_irqsave(&priv->ctx_lock, flags);
+ list_for_each_entry(dev_ctx, &priv->ctx_list, list) {
+ if (!dev_ctx->intf->query)
+ continue;
+ r = dev_ctx->intf->query(dev_ctx->context, int_dev);
+ if (r != MLX4_QUERY_NOT_MINE) {
+ *port = r;
+ spin_unlock_irqrestore(&priv->ctx_lock, flags);
+ mutex_unlock(&intf_mutex);
+ return &priv->dev;
+ }
+ }
+ spin_unlock_irqrestore(&priv->ctx_lock, flags);
+ }
+
+ mutex_unlock(&intf_mutex);
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(mlx4_query_interface);
+
+void mlx4_dispatch_event(struct mlx4_dev *dev, enum mlx4_dev_event type, int port)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ struct mlx4_device_context *dev_ctx;
+ unsigned long flags;
+
+ spin_lock_irqsave(&priv->ctx_lock, flags);
+
+ list_for_each_entry(dev_ctx, &priv->ctx_list, list)
+ if (dev_ctx->intf->event)
+ dev_ctx->intf->event(dev, dev_ctx->context, type, port);
+
+ spin_unlock_irqrestore(&priv->ctx_lock, flags);
+}
+
+int mlx4_register_device(struct mlx4_dev *dev)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ struct mlx4_interface *intf;
+
+ mutex_lock(&intf_mutex);
+
+ list_add_tail(&priv->dev_list, &dev_list);
+ list_for_each_entry(intf, &intf_list, list)
+ mlx4_add_device(intf, priv);
+
+ mutex_unlock(&intf_mutex);
+ mlx4_start_catas_poll(dev);
+
+ return 0;
+}
+
+void mlx4_unregister_device(struct mlx4_dev *dev)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ struct mlx4_interface *intf;
+
+ mlx4_stop_catas_poll(dev);
+ mutex_lock(&intf_mutex);
+
+ list_for_each_entry(intf, &intf_list, list)
+ mlx4_remove_device(intf, priv);
+
+ list_del(&priv->dev_list);
+
+ mutex_unlock(&intf_mutex);
+}
+
+void *mlx4_find_get_prot_dev(struct mlx4_dev *dev, enum mlx4_prot proto, int port)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ struct mlx4_device_context *dev_ctx;
+ unsigned long flags;
+ void *result = NULL;
+
+ spin_lock_irqsave(&priv->ctx_lock, flags);
+
+ list_for_each_entry(dev_ctx, &priv->ctx_list, list)
+ if (dev_ctx->intf->protocol == proto && dev_ctx->intf->get_prot_dev) {
+ result = dev_ctx->intf->get_prot_dev(dev, dev_ctx->context, port);
+ break;
+ }
+
+ spin_unlock_irqrestore(&priv->ctx_lock, flags);
+
+ return result;
+}
+
diff --git a/sys/ofed/drivers/net/mlx4/main.c b/sys/ofed/drivers/net/mlx4/main.c
new file mode 100644
index 0000000..44aec46
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/main.c
@@ -0,0 +1,1704 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/io-mapping.h>
+
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/doorbell.h>
+
+#include "mlx4.h"
+#include "fw.h"
+#include "icm.h"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("Mellanox ConnectX HCA low-level driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+
+struct workqueue_struct *mlx4_wq;
+
+#ifdef CONFIG_MLX4_DEBUG
+
+int mlx4_debug_level = 0;
+module_param_named(debug_level, mlx4_debug_level, int, 0644);
+MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
+
+#endif /* CONFIG_MLX4_DEBUG */
+
+int mlx4_blck_lb=1;
+module_param_named(block_loopback, mlx4_blck_lb, int, 0644);
+MODULE_PARM_DESC(block_loopback, "Block multicast loopback packets if > 0");
+
+#ifdef CONFIG_PCI_MSI
+
+static int msi_x = 1;
+module_param(msi_x, int, 0444);
+MODULE_PARM_DESC(msi_x, "attempt to use MSI-X if nonzero");
+
+#else /* CONFIG_PCI_MSI */
+
+#define msi_x (0)
+
+#endif /* CONFIG_PCI_MSI */
+
+static char mlx4_version[] __devinitdata =
+ DRV_NAME ": Mellanox ConnectX core driver v"
+ DRV_VERSION " (" DRV_RELDATE ")\n";
+
+struct mutex drv_mutex;
+
+static struct mlx4_profile default_profile = {
+ .num_qp = 1 << 18,
+ .num_srq = 1 << 16,
+ .rdmarc_per_qp = 1 << 4,
+ .num_cq = 1 << 16,
+ .num_mcg = 1 << 13,
+ .num_mpt = 1 << 19,
+ .num_mtt = 1 << 20,
+};
+
+static int log_num_mac = 2;
+module_param_named(log_num_mac, log_num_mac, int, 0444);
+MODULE_PARM_DESC(log_num_mac, "Log2 max number of MACs per ETH port (1-7)");
+
+static int use_prio;
+module_param_named(use_prio, use_prio, bool, 0444);
+MODULE_PARM_DESC(use_prio, "Enable steering by VLAN priority on ETH ports "
+ "(0/1, default 0)");
+
+static struct mlx4_profile mod_param_profile = { 0 };
+
+module_param_named(log_num_qp, mod_param_profile.num_qp, int, 0444);
+MODULE_PARM_DESC(log_num_qp, "log maximum number of QPs per HCA");
+
+module_param_named(log_num_srq, mod_param_profile.num_srq, int, 0444);
+MODULE_PARM_DESC(log_num_srq, "log maximum number of SRQs per HCA");
+
+module_param_named(log_rdmarc_per_qp, mod_param_profile.rdmarc_per_qp, int, 0444);
+MODULE_PARM_DESC(log_rdmarc_per_qp, "log number of RDMARC buffers per QP");
+
+module_param_named(log_num_cq, mod_param_profile.num_cq, int, 0444);
+MODULE_PARM_DESC(log_num_cq, "log maximum number of CQs per HCA");
+
+module_param_named(log_num_mcg, mod_param_profile.num_mcg, int, 0444);
+MODULE_PARM_DESC(log_num_mcg, "log maximum number of multicast groups per HCA");
+
+module_param_named(log_num_mpt, mod_param_profile.num_mpt, int, 0444);
+MODULE_PARM_DESC(log_num_mpt,
+ "log maximum number of memory protection table entries per HCA");
+
+module_param_named(log_num_mtt, mod_param_profile.num_mtt, int, 0444);
+MODULE_PARM_DESC(log_num_mtt,
+ "log maximum number of memory translation table segments per HCA");
+
+static int log_mtts_per_seg = 0;
+module_param_named(log_mtts_per_seg, log_mtts_per_seg, int, 0444);
+MODULE_PARM_DESC(log_mtts_per_seg, "Log2 number of MTT entries per segment (1-7)");
+
+static void process_mod_param_profile(void)
+{
+ default_profile.num_qp = (mod_param_profile.num_qp ?
+ 1 << mod_param_profile.num_qp :
+ default_profile.num_qp);
+ default_profile.num_srq = (mod_param_profile.num_srq ?
+ 1 << mod_param_profile.num_srq :
+ default_profile.num_srq);
+ default_profile.rdmarc_per_qp = (mod_param_profile.rdmarc_per_qp ?
+ 1 << mod_param_profile.rdmarc_per_qp :
+ default_profile.rdmarc_per_qp);
+ default_profile.num_cq = (mod_param_profile.num_cq ?
+ 1 << mod_param_profile.num_cq :
+ default_profile.num_cq);
+ default_profile.num_mcg = (mod_param_profile.num_mcg ?
+ 1 << mod_param_profile.num_mcg :
+ default_profile.num_mcg);
+ default_profile.num_mpt = (mod_param_profile.num_mpt ?
+ 1 << mod_param_profile.num_mpt :
+ default_profile.num_mpt);
+ default_profile.num_mtt = (mod_param_profile.num_mtt ?
+ 1 << mod_param_profile.num_mtt :
+ default_profile.num_mtt);
+}
+
+struct mlx4_port_config
+{
+ struct list_head list;
+ enum mlx4_port_type port_type[MLX4_MAX_PORTS + 1];
+ struct pci_dev *pdev;
+};
+static LIST_HEAD(config_list);
+
+static void mlx4_config_cleanup(void)
+{
+ struct mlx4_port_config *config, *tmp;
+
+ list_for_each_entry_safe(config, tmp, &config_list, list) {
+ list_del(&config->list);
+ kfree(config);
+ }
+}
+
+void *mlx4_get_prot_dev(struct mlx4_dev *dev, enum mlx4_prot proto, int port)
+{
+ return mlx4_find_get_prot_dev(dev, proto, port);
+}
+EXPORT_SYMBOL(mlx4_get_prot_dev);
+
+void mlx4_set_iboe_counter(struct mlx4_dev *dev, int index, u8 port)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+
+ priv->iboe_counter_index[port - 1] = index;
+}
+EXPORT_SYMBOL(mlx4_set_iboe_counter);
+
+int mlx4_get_iboe_counter(struct mlx4_dev *dev, u8 port)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+
+ return priv->iboe_counter_index[port - 1];
+}
+EXPORT_SYMBOL(mlx4_get_iboe_counter);
+
+int mlx4_check_port_params(struct mlx4_dev *dev,
+ enum mlx4_port_type *port_type)
+{
+ int i;
+
+ for (i = 0; i < dev->caps.num_ports - 1; i++) {
+ if (port_type[i] != port_type[i + 1]) {
+ if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP)) {
+ mlx4_err(dev, "Only same port types supported "
+ "on this HCA, aborting.\n");
+ return -EINVAL;
+ }
+ if (port_type[i] == MLX4_PORT_TYPE_ETH &&
+ port_type[i + 1] == MLX4_PORT_TYPE_IB)
+ return -EINVAL;
+ }
+ }
+
+ for (i = 0; i < dev->caps.num_ports; i++) {
+ if (!(port_type[i] & dev->caps.supported_type[i+1])) {
+ mlx4_err(dev, "Requested port type for port %d is not "
+ "supported on this HCA\n", i + 1);
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+static void mlx4_set_port_mask(struct mlx4_dev *dev)
+{
+ int i;
+
+ for (i = 1; i <= dev->caps.num_ports; ++i)
+ dev->caps.port_mask[i] = dev->caps.port_type[i];
+}
+
+static u8 get_counters_mode(u64 flags)
+{
+ switch (flags >> 48 & 3) {
+ case 2:
+ case 3:
+ return MLX4_CUNTERS_EXT;
+ case 1:
+ return MLX4_CUNTERS_BASIC;
+ default:
+ return MLX4_CUNTERS_DISABLED;
+ }
+}
+
+static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
+{
+ int err;
+ int i;
+
+ err = mlx4_QUERY_DEV_CAP(dev, dev_cap);
+ if (err) {
+ mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n");
+ return err;
+ }
+
+ if (dev_cap->min_page_sz > PAGE_SIZE) {
+ mlx4_err(dev, "HCA minimum page size of %d bigger than "
+ "kernel PAGE_SIZE of %d, aborting.\n",
+ dev_cap->min_page_sz, PAGE_SIZE);
+ return -ENODEV;
+ }
+ if (dev_cap->num_ports > MLX4_MAX_PORTS) {
+ mlx4_err(dev, "HCA has %d ports, but we only support %d, "
+ "aborting.\n",
+ dev_cap->num_ports, MLX4_MAX_PORTS);
+ return -ENODEV;
+ }
+
+ if (dev_cap->uar_size > pci_resource_len(dev->pdev, 2)) {
+ mlx4_err(dev, "HCA reported UAR size of 0x%x bigger than "
+ "PCI resource 2 size of 0x%llx, aborting.\n",
+ dev_cap->uar_size,
+ (unsigned long long) pci_resource_len(dev->pdev, 2));
+ return -ENODEV;
+ }
+
+ dev->caps.num_ports = dev_cap->num_ports;
+ for (i = 1; i <= dev->caps.num_ports; ++i) {
+ dev->caps.vl_cap[i] = dev_cap->max_vl[i];
+ dev->caps.ib_mtu_cap[i] = dev_cap->ib_mtu[i];
+ dev->caps.gid_table_len[i] = dev_cap->max_gids[i];
+ dev->caps.pkey_table_len[i] = dev_cap->max_pkeys[i];
+ dev->caps.port_width_cap[i] = dev_cap->max_port_width[i];
+ dev->caps.eth_mtu_cap[i] = dev_cap->eth_mtu[i];
+ dev->caps.def_mac[i] = dev_cap->def_mac[i];
+ dev->caps.supported_type[i] = dev_cap->supported_port_types[i];
+ dev->caps.trans_type[i] = dev_cap->trans_type[i];
+ dev->caps.vendor_oui[i] = dev_cap->vendor_oui[i];
+ dev->caps.wavelength[i] = dev_cap->wavelength[i];
+ dev->caps.trans_code[i] = dev_cap->trans_code[i];
+ }
+
+ dev->caps.num_uars = dev_cap->uar_size / PAGE_SIZE;
+ dev->caps.local_ca_ack_delay = dev_cap->local_ca_ack_delay;
+ dev->caps.bf_reg_size = dev_cap->bf_reg_size;
+ dev->caps.bf_regs_per_page = dev_cap->bf_regs_per_page;
+ dev->caps.max_sq_sg = dev_cap->max_sq_sg;
+ dev->caps.max_rq_sg = dev_cap->max_rq_sg;
+ dev->caps.max_wqes = dev_cap->max_qp_sz;
+ dev->caps.max_qp_init_rdma = dev_cap->max_requester_per_qp;
+ dev->caps.max_srq_wqes = dev_cap->max_srq_sz;
+ dev->caps.max_srq_sge = dev_cap->max_rq_sg - 1;
+ dev->caps.reserved_srqs = dev_cap->reserved_srqs;
+ dev->caps.max_sq_desc_sz = dev_cap->max_sq_desc_sz;
+ dev->caps.max_rq_desc_sz = dev_cap->max_rq_desc_sz;
+ dev->caps.num_qp_per_mgm = MLX4_QP_PER_MGM;
+ /*
+ * Subtract 1 from the limit because we need to allocate a
+ * spare CQE so the HCA HW can tell the difference between an
+ * empty CQ and a full CQ.
+ */
+ dev->caps.max_cqes = dev_cap->max_cq_sz - 1;
+ dev->caps.reserved_cqs = dev_cap->reserved_cqs;
+ dev->caps.reserved_eqs = dev_cap->reserved_eqs;
+ dev->caps.mtts_per_seg = 1 << log_mtts_per_seg;
+ dev->caps.reserved_mtts = DIV_ROUND_UP(dev_cap->reserved_mtts,
+ dev->caps.mtts_per_seg);
+ dev->caps.reserved_mrws = dev_cap->reserved_mrws;
+ dev->caps.reserved_uars = dev_cap->reserved_uars;
+ dev->caps.reserved_pds = dev_cap->reserved_pds;
+ dev->caps.mtt_entry_sz = dev->caps.mtts_per_seg * dev_cap->mtt_entry_sz;
+ dev->caps.max_msg_sz = dev_cap->max_msg_sz;
+ dev->caps.page_size_cap = ~(u32) (dev_cap->min_page_sz - 1);
+ dev->caps.flags = dev_cap->flags;
+ dev->caps.bmme_flags = dev_cap->bmme_flags;
+ dev->caps.reserved_lkey = dev_cap->reserved_lkey;
+ dev->caps.stat_rate_support = dev_cap->stat_rate_support;
+ dev->caps.udp_rss = dev_cap->udp_rss;
+ dev->caps.loopback_support = dev_cap->loopback_support;
+ dev->caps.max_gso_sz = dev_cap->max_gso_sz;
+ dev->caps.reserved_xrcds = (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) ?
+ dev_cap->reserved_xrcds : 0;
+ dev->caps.max_xrcds = (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) ?
+ dev_cap->max_xrcds : 0;
+
+ dev->caps.log_num_macs = log_num_mac;
+ dev->caps.log_num_prios = use_prio ? 3 : 0;
+
+ for (i = 1; i <= dev->caps.num_ports; ++i) {
+ dev->caps.port_type[i] = MLX4_PORT_TYPE_NONE;
+ if (dev->caps.supported_type[i]) {
+ if (dev->caps.supported_type[i] != MLX4_PORT_TYPE_ETH)
+ dev->caps.port_type[i] = MLX4_PORT_TYPE_IB;
+ else
+ dev->caps.port_type[i] = MLX4_PORT_TYPE_ETH;
+ }
+ dev->caps.possible_type[i] = dev->caps.port_type[i];
+ mlx4_priv(dev)->sense.sense_allowed[i] =
+ dev->caps.supported_type[i] == MLX4_PORT_TYPE_AUTO;
+
+ if (dev->caps.log_num_macs > dev_cap->log_max_macs[i]) {
+ dev->caps.log_num_macs = dev_cap->log_max_macs[i];
+ mlx4_warn(dev, "Requested number of MACs is too much "
+ "for port %d, reducing to %d.\n",
+ i, 1 << dev->caps.log_num_macs);
+ }
+ dev->caps.log_num_vlans = dev_cap->log_max_vlans[i];
+ }
+
+ dev->caps.counters_mode = get_counters_mode(dev_cap->flags);
+ dev->caps.max_basic_counters = 1 << ilog2(dev_cap->max_basic_counters);
+ dev->caps.max_ext_counters = 1 << ilog2(dev_cap->max_ext_counters);
+
+ dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] = dev_cap->reserved_qps;
+ dev->caps.reserved_qps_cnt[MLX4_QP_REGION_ETH_ADDR] =
+ dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_ADDR] =
+ (1 << dev->caps.log_num_macs) *
+ (1 << dev->caps.log_num_vlans) *
+ (1 << dev->caps.log_num_prios) *
+ dev->caps.num_ports;
+
+ dev->caps.reserved_qps = dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] +
+ dev->caps.reserved_qps_cnt[MLX4_QP_REGION_ETH_ADDR] +
+ dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_ADDR];
+
+ return 0;
+}
+
+static int mlx4_save_config(struct mlx4_dev *dev)
+{
+ struct mlx4_port_config *config;
+ int i;
+
+ list_for_each_entry(config, &config_list, list) {
+ if (config->pdev == dev->pdev) {
+ for (i = 1; i <= dev->caps.num_ports; i++)
+ config->port_type[i] = dev->caps.possible_type[i];
+ return 0;
+ }
+ }
+
+ config = kmalloc(sizeof(struct mlx4_port_config), GFP_KERNEL);
+ if (!config)
+ return -ENOMEM;
+
+ config->pdev = dev->pdev;
+ for (i = 1; i <= dev->caps.num_ports; i++)
+ config->port_type[i] = dev->caps.possible_type[i];
+
+ list_add_tail(&config->list, &config_list);
+
+ return 0;
+}
+
+/*
+ * Change the port configuration of the device.
+ * Every user of this function must hold the port mutex.
+ */
+int mlx4_change_port_types(struct mlx4_dev *dev,
+ enum mlx4_port_type *port_types)
+{
+ int err = 0;
+ int change = 0;
+ int port;
+
+ for (port = 0; port < dev->caps.num_ports; port++) {
+ /* Change the port type only if the new type is different
+ * from the current, and not set to Auto */
+ if (port_types[port] != dev->caps.port_type[port + 1]) {
+ change = 1;
+ dev->caps.port_type[port + 1] = port_types[port];
+ }
+ }
+ if (change) {
+ mlx4_unregister_device(dev);
+ for (port = 1; port <= dev->caps.num_ports; port++) {
+ mlx4_CLOSE_PORT(dev, port);
+ err = mlx4_SET_PORT(dev, port);
+ if (err) {
+ mlx4_err(dev, "Failed to set port %d, "
+ "aborting\n", port);
+ goto out;
+ }
+ }
+ mlx4_set_port_mask(dev);
+ mlx4_save_config(dev);
+ err = mlx4_register_device(dev);
+ }
+
+out:
+ return err;
+}
+
+static ssize_t show_port_type(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct mlx4_port_info *info = container_of(attr, struct mlx4_port_info,
+ port_attr);
+ struct mlx4_dev *mdev = info->dev;
+ char type[8];
+
+ sprintf(type, "%s",
+ (mdev->caps.port_type[info->port] == MLX4_PORT_TYPE_IB) ?
+ "ib" : "eth");
+ if (mdev->caps.possible_type[info->port] == MLX4_PORT_TYPE_AUTO)
+ sprintf(buf, "auto (%s)\n", type);
+ else
+ sprintf(buf, "%s\n", type);
+
+ return strlen(buf);
+}
+
+static ssize_t set_port_type(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct mlx4_port_info *info = container_of(attr, struct mlx4_port_info,
+ port_attr);
+ struct mlx4_dev *mdev = info->dev;
+ struct mlx4_priv *priv = mlx4_priv(mdev);
+ enum mlx4_port_type types[MLX4_MAX_PORTS];
+ enum mlx4_port_type new_types[MLX4_MAX_PORTS];
+ int i;
+ int err = 0;
+
+ if (!strcmp(buf, "ib\n"))
+ info->tmp_type = MLX4_PORT_TYPE_IB;
+ else if (!strcmp(buf, "eth\n"))
+ info->tmp_type = MLX4_PORT_TYPE_ETH;
+ else if (!strcmp(buf, "auto\n"))
+ info->tmp_type = MLX4_PORT_TYPE_AUTO;
+ else {
+ mlx4_err(mdev, "%s is not supported port type\n", buf);
+ return -EINVAL;
+ }
+
+ mlx4_stop_sense(mdev);
+ mutex_lock(&priv->port_mutex);
+ /* Possible type is always the one that was delivered */
+ mdev->caps.possible_type[info->port] = info->tmp_type;
+
+ for (i = 0; i < mdev->caps.num_ports; i++) {
+ types[i] = priv->port[i+1].tmp_type ? priv->port[i+1].tmp_type :
+ mdev->caps.possible_type[i+1];
+ if (types[i] == MLX4_PORT_TYPE_AUTO)
+ types[i] = mdev->caps.port_type[i+1];
+ }
+
+ if (priv->trig) {
+ if (++priv->changed_ports < mdev->caps.num_ports)
+ goto out;
+ else
+ priv->trig = priv->changed_ports = 0;
+ }
+
+ if (!(mdev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP)) {
+ for (i = 1; i <= mdev->caps.num_ports; i++) {
+ if (mdev->caps.possible_type[i] == MLX4_PORT_TYPE_AUTO) {
+ mdev->caps.possible_type[i] = mdev->caps.port_type[i];
+ err = -EINVAL;
+ }
+ }
+ }
+ if (err) {
+ mlx4_err(mdev, "Auto sensing is not supported on this HCA. "
+ "Set only 'eth' or 'ib' for both ports "
+ "(should be the same)\n");
+ goto out;
+ }
+
+ mlx4_do_sense_ports(mdev, new_types, types);
+
+ err = mlx4_check_port_params(mdev, new_types);
+ if (err)
+ goto out;
+
+ /* We are about to apply the changes after the configuration
+ * was verified, no need to remember the temporary types
+ * any more */
+ for (i = 0; i < mdev->caps.num_ports; i++)
+ priv->port[i + 1].tmp_type = 0;
+
+ err = mlx4_change_port_types(mdev, new_types);
+
+out:
+ mlx4_start_sense(mdev);
+ mutex_unlock(&priv->port_mutex);
+ return err ? err : count;
+}
+
+static ssize_t trigger_port(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct pci_dev *pdev = to_pci_dev(dev);
+ struct mlx4_dev *mdev = pci_get_drvdata(pdev);
+ struct mlx4_priv *priv = container_of(mdev, struct mlx4_priv, dev);
+
+ if (!priv)
+ return -ENODEV;
+
+ mutex_lock(&priv->port_mutex);
+ priv->trig = 1;
+ mutex_unlock(&priv->port_mutex);
+ return count;
+}
+DEVICE_ATTR(port_trigger, S_IWUGO, NULL, trigger_port);
+
+static int mlx4_load_fw(struct mlx4_dev *dev)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ int err;
+
+ priv->fw.fw_icm = mlx4_alloc_icm(dev, priv->fw.fw_pages,
+ GFP_HIGHUSER | __GFP_NOWARN, 0);
+ if (!priv->fw.fw_icm) {
+ mlx4_err(dev, "Couldn't allocate FW area, aborting.\n");
+ return -ENOMEM;
+ }
+
+ err = mlx4_MAP_FA(dev, priv->fw.fw_icm);
+ if (err) {
+ mlx4_err(dev, "MAP_FA command failed, aborting.\n");
+ goto err_free;
+ }
+
+ err = mlx4_RUN_FW(dev);
+ if (err) {
+ mlx4_err(dev, "RUN_FW command failed, aborting.\n");
+ goto err_unmap_fa;
+ }
+
+ return 0;
+
+err_unmap_fa:
+ mlx4_UNMAP_FA(dev);
+
+err_free:
+ mlx4_free_icm(dev, priv->fw.fw_icm, 0);
+ return err;
+}
+
+static int mlx4_init_cmpt_table(struct mlx4_dev *dev, u64 cmpt_base,
+ int cmpt_entry_sz)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ int err;
+
+ err = mlx4_init_icm_table(dev, &priv->qp_table.cmpt_table,
+ cmpt_base +
+ ((u64) (MLX4_CMPT_TYPE_QP *
+ cmpt_entry_sz) << MLX4_CMPT_SHIFT),
+ cmpt_entry_sz, dev->caps.num_qps,
+ dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW],
+ 0, 0);
+ if (err)
+ goto err;
+
+ err = mlx4_init_icm_table(dev, &priv->srq_table.cmpt_table,
+ cmpt_base +
+ ((u64) (MLX4_CMPT_TYPE_SRQ *
+ cmpt_entry_sz) << MLX4_CMPT_SHIFT),
+ cmpt_entry_sz, dev->caps.num_srqs,
+ dev->caps.reserved_srqs, 0, 0);
+ if (err)
+ goto err_qp;
+
+ err = mlx4_init_icm_table(dev, &priv->cq_table.cmpt_table,
+ cmpt_base +
+ ((u64) (MLX4_CMPT_TYPE_CQ *
+ cmpt_entry_sz) << MLX4_CMPT_SHIFT),
+ cmpt_entry_sz, dev->caps.num_cqs,
+ dev->caps.reserved_cqs, 0, 0);
+ if (err)
+ goto err_srq;
+
+ err = mlx4_init_icm_table(dev, &priv->eq_table.cmpt_table,
+ cmpt_base +
+ ((u64) (MLX4_CMPT_TYPE_EQ *
+ cmpt_entry_sz) << MLX4_CMPT_SHIFT),
+ cmpt_entry_sz,
+ dev->caps.num_eqs, dev->caps.num_eqs, 0, 0);
+ if (err)
+ goto err_cq;
+
+ return 0;
+
+err_cq:
+ mlx4_cleanup_icm_table(dev, &priv->cq_table.cmpt_table);
+
+err_srq:
+ mlx4_cleanup_icm_table(dev, &priv->srq_table.cmpt_table);
+
+err_qp:
+ mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table);
+
+err:
+ return err;
+}
+
+static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap,
+ struct mlx4_init_hca_param *init_hca, u64 icm_size)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ u64 aux_pages;
+ int err;
+
+ err = mlx4_SET_ICM_SIZE(dev, icm_size, &aux_pages);
+ if (err) {
+ mlx4_err(dev, "SET_ICM_SIZE command failed, aborting.\n");
+ return err;
+ }
+
+ mlx4_dbg(dev, "%lld KB of HCA context requires %lld KB aux memory.\n",
+ (unsigned long long) icm_size >> 10,
+ (unsigned long long) aux_pages << 2);
+
+ priv->fw.aux_icm = mlx4_alloc_icm(dev, aux_pages,
+ GFP_HIGHUSER | __GFP_NOWARN, 0);
+ if (!priv->fw.aux_icm) {
+ mlx4_err(dev, "Couldn't allocate aux memory, aborting.\n");
+ return -ENOMEM;
+ }
+
+ err = mlx4_MAP_ICM_AUX(dev, priv->fw.aux_icm);
+ if (err) {
+ mlx4_err(dev, "MAP_ICM_AUX command failed, aborting.\n");
+ goto err_free_aux;
+ }
+
+ err = mlx4_init_cmpt_table(dev, init_hca->cmpt_base, dev_cap->cmpt_entry_sz);
+ if (err) {
+ mlx4_err(dev, "Failed to map cMPT context memory, aborting.\n");
+ goto err_unmap_aux;
+ }
+
+ err = mlx4_init_icm_table(dev, &priv->eq_table.table,
+ init_hca->eqc_base, dev_cap->eqc_entry_sz,
+ dev->caps.num_eqs, dev->caps.num_eqs,
+ 0, 0);
+ if (err) {
+ mlx4_err(dev, "Failed to map EQ context memory, aborting.\n");
+ goto err_unmap_cmpt;
+ }
+
+ /*
+ * Reserved MTT entries must be aligned up to a cacheline
+ * boundary, since the FW will write to them, while the driver
+ * writes to all other MTT entries. (The variable
+ * dev->caps.mtt_entry_sz below is really the MTT segment
+ * size, not the raw entry size)
+ */
+ dev->caps.reserved_mtts =
+ ALIGN(dev->caps.reserved_mtts * dev->caps.mtt_entry_sz,
+ dma_get_cache_alignment()) / dev->caps.mtt_entry_sz;
+
+ err = mlx4_init_icm_table(dev, &priv->mr_table.mtt_table,
+ init_hca->mtt_base,
+ dev->caps.mtt_entry_sz,
+ dev->caps.num_mtt_segs,
+ dev->caps.reserved_mtts, 1, 0);
+ if (err) {
+ mlx4_err(dev, "Failed to map MTT context memory, aborting.\n");
+ goto err_unmap_eq;
+ }
+
+ err = mlx4_init_icm_table(dev, &priv->mr_table.dmpt_table,
+ init_hca->dmpt_base,
+ dev_cap->dmpt_entry_sz,
+ dev->caps.num_mpts,
+ dev->caps.reserved_mrws, 1, 1);
+ if (err) {
+ mlx4_err(dev, "Failed to map dMPT context memory, aborting.\n");
+ goto err_unmap_mtt;
+ }
+
+ err = mlx4_init_icm_table(dev, &priv->qp_table.qp_table,
+ init_hca->qpc_base,
+ dev_cap->qpc_entry_sz,
+ dev->caps.num_qps,
+ dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW],
+ 0, 0);
+ if (err) {
+ mlx4_err(dev, "Failed to map QP context memory, aborting.\n");
+ goto err_unmap_dmpt;
+ }
+
+ err = mlx4_init_icm_table(dev, &priv->qp_table.auxc_table,
+ init_hca->auxc_base,
+ dev_cap->aux_entry_sz,
+ dev->caps.num_qps,
+ dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW],
+ 0, 0);
+ if (err) {
+ mlx4_err(dev, "Failed to map AUXC context memory, aborting.\n");
+ goto err_unmap_qp;
+ }
+
+ err = mlx4_init_icm_table(dev, &priv->qp_table.altc_table,
+ init_hca->altc_base,
+ dev_cap->altc_entry_sz,
+ dev->caps.num_qps,
+ dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW],
+ 0, 0);
+ if (err) {
+ mlx4_err(dev, "Failed to map ALTC context memory, aborting.\n");
+ goto err_unmap_auxc;
+ }
+
+ err = mlx4_init_icm_table(dev, &priv->qp_table.rdmarc_table,
+ init_hca->rdmarc_base,
+ dev_cap->rdmarc_entry_sz << priv->qp_table.rdmarc_shift,
+ dev->caps.num_qps,
+ dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW],
+ 0, 0);
+ if (err) {
+ mlx4_err(dev, "Failed to map RDMARC context memory, aborting\n");
+ goto err_unmap_altc;
+ }
+
+ err = mlx4_init_icm_table(dev, &priv->cq_table.table,
+ init_hca->cqc_base,
+ dev_cap->cqc_entry_sz,
+ dev->caps.num_cqs,
+ dev->caps.reserved_cqs, 0, 0);
+ if (err) {
+ mlx4_err(dev, "Failed to map CQ context memory, aborting.\n");
+ goto err_unmap_rdmarc;
+ }
+
+ err = mlx4_init_icm_table(dev, &priv->srq_table.table,
+ init_hca->srqc_base,
+ dev_cap->srq_entry_sz,
+ dev->caps.num_srqs,
+ dev->caps.reserved_srqs, 0, 0);
+ if (err) {
+ mlx4_err(dev, "Failed to map SRQ context memory, aborting.\n");
+ goto err_unmap_cq;
+ }
+
+ /*
+ * It's not strictly required, but for simplicity just map the
+ * whole multicast group table now. The table isn't very big
+ * and it's a lot easier than trying to track ref counts.
+ */
+ err = mlx4_init_icm_table(dev, &priv->mcg_table.table,
+ init_hca->mc_base, MLX4_MGM_ENTRY_SIZE,
+ dev->caps.num_mgms + dev->caps.num_amgms,
+ dev->caps.num_mgms + dev->caps.num_amgms,
+ 0, 0);
+ if (err) {
+ mlx4_err(dev, "Failed to map MCG context memory, aborting.\n");
+ goto err_unmap_srq;
+ }
+
+ return 0;
+
+err_unmap_srq:
+ mlx4_cleanup_icm_table(dev, &priv->srq_table.table);
+
+err_unmap_cq:
+ mlx4_cleanup_icm_table(dev, &priv->cq_table.table);
+
+err_unmap_rdmarc:
+ mlx4_cleanup_icm_table(dev, &priv->qp_table.rdmarc_table);
+
+err_unmap_altc:
+ mlx4_cleanup_icm_table(dev, &priv->qp_table.altc_table);
+
+err_unmap_auxc:
+ mlx4_cleanup_icm_table(dev, &priv->qp_table.auxc_table);
+
+err_unmap_qp:
+ mlx4_cleanup_icm_table(dev, &priv->qp_table.qp_table);
+
+err_unmap_dmpt:
+ mlx4_cleanup_icm_table(dev, &priv->mr_table.dmpt_table);
+
+err_unmap_mtt:
+ mlx4_cleanup_icm_table(dev, &priv->mr_table.mtt_table);
+
+err_unmap_eq:
+ mlx4_cleanup_icm_table(dev, &priv->eq_table.table);
+
+err_unmap_cmpt:
+ mlx4_cleanup_icm_table(dev, &priv->eq_table.cmpt_table);
+ mlx4_cleanup_icm_table(dev, &priv->cq_table.cmpt_table);
+ mlx4_cleanup_icm_table(dev, &priv->srq_table.cmpt_table);
+ mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table);
+
+err_unmap_aux:
+ mlx4_UNMAP_ICM_AUX(dev);
+
+err_free_aux:
+ mlx4_free_icm(dev, priv->fw.aux_icm, 0);
+
+ return err;
+}
+
+static void mlx4_free_icms(struct mlx4_dev *dev)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+
+ mlx4_cleanup_icm_table(dev, &priv->mcg_table.table);
+ mlx4_cleanup_icm_table(dev, &priv->srq_table.table);
+ mlx4_cleanup_icm_table(dev, &priv->cq_table.table);
+ mlx4_cleanup_icm_table(dev, &priv->qp_table.rdmarc_table);
+ mlx4_cleanup_icm_table(dev, &priv->qp_table.altc_table);
+ mlx4_cleanup_icm_table(dev, &priv->qp_table.auxc_table);
+ mlx4_cleanup_icm_table(dev, &priv->qp_table.qp_table);
+ mlx4_cleanup_icm_table(dev, &priv->mr_table.dmpt_table);
+ mlx4_cleanup_icm_table(dev, &priv->mr_table.mtt_table);
+ mlx4_cleanup_icm_table(dev, &priv->eq_table.table);
+ mlx4_cleanup_icm_table(dev, &priv->eq_table.cmpt_table);
+ mlx4_cleanup_icm_table(dev, &priv->cq_table.cmpt_table);
+ mlx4_cleanup_icm_table(dev, &priv->srq_table.cmpt_table);
+ mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table);
+
+ mlx4_UNMAP_ICM_AUX(dev);
+ mlx4_free_icm(dev, priv->fw.aux_icm, 0);
+}
+
+static int map_bf_area(struct mlx4_dev *dev)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ resource_size_t bf_start;
+ resource_size_t bf_len;
+ int err = 0;
+
+ bf_start = pci_resource_start(dev->pdev, 2) + (dev->caps.num_uars << PAGE_SHIFT);
+ bf_len = pci_resource_len(dev->pdev, 2) - (dev->caps.num_uars << PAGE_SHIFT);
+ priv->bf_mapping = io_mapping_create_wc(bf_start, bf_len);
+ if (!priv->bf_mapping)
+ err = -ENOMEM;
+
+ return err;
+}
+
+static void unmap_bf_area(struct mlx4_dev *dev)
+{
+ if (mlx4_priv(dev)->bf_mapping)
+ io_mapping_free(mlx4_priv(dev)->bf_mapping);
+}
+
+static void mlx4_close_hca(struct mlx4_dev *dev)
+{
+ unmap_bf_area(dev);
+ mlx4_CLOSE_HCA(dev, 0);
+ mlx4_free_icms(dev);
+ mlx4_UNMAP_FA(dev);
+ mlx4_free_icm(dev, mlx4_priv(dev)->fw.fw_icm, 0);
+}
+
+static int mlx4_init_hca(struct mlx4_dev *dev)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ struct mlx4_adapter adapter;
+ struct mlx4_dev_cap dev_cap;
+ struct mlx4_mod_stat_cfg mlx4_cfg;
+ struct mlx4_profile profile;
+ struct mlx4_init_hca_param init_hca;
+ struct mlx4_port_config *config;
+ u64 icm_size;
+ int err;
+ int i;
+
+ err = mlx4_QUERY_FW(dev);
+ if (err) {
+ if (err == -EACCES)
+ mlx4_info(dev, "non-primary physical function, skipping.\n");
+ else
+ mlx4_err(dev, "QUERY_FW command failed, aborting.\n");
+ return err;
+ }
+
+ err = mlx4_load_fw(dev);
+ if (err) {
+ mlx4_err(dev, "Failed to start FW, aborting.\n");
+ return err;
+ }
+
+ mlx4_cfg.log_pg_sz_m = 1;
+ mlx4_cfg.log_pg_sz = 0;
+ err = mlx4_MOD_STAT_CFG(dev, &mlx4_cfg);
+ if (err)
+ mlx4_warn(dev, "Failed to override log_pg_sz parameter\n");
+
+ err = mlx4_dev_cap(dev, &dev_cap);
+ if (err) {
+ mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n");
+ goto err_stop_fw;
+ }
+
+ process_mod_param_profile();
+ profile = default_profile;
+
+ list_for_each_entry(config, &config_list, list) {
+ if (config->pdev == dev->pdev) {
+ for (i = 1; i <= dev->caps.num_ports; i++) {
+ dev->caps.possible_type[i] = config->port_type[i];
+ if (config->port_type[i] != MLX4_PORT_TYPE_AUTO)
+ dev->caps.port_type[i] = config->port_type[i];
+ }
+ }
+ }
+
+ mlx4_set_port_mask(dev);
+ icm_size = mlx4_make_profile(dev, &profile, &dev_cap, &init_hca);
+ if ((long long) icm_size < 0) {
+ err = icm_size;
+ goto err_stop_fw;
+ }
+
+ if (map_bf_area(dev))
+ mlx4_dbg(dev, "Kernel support for blue flame is not available for kernels < 2.6.28\n");
+
+ init_hca.log_uar_sz = ilog2(dev->caps.num_uars);
+
+ err = mlx4_init_icm(dev, &dev_cap, &init_hca, icm_size);
+ if (err)
+ goto err_stop_fw;
+
+ err = mlx4_INIT_HCA(dev, &init_hca);
+ if (err) {
+ mlx4_err(dev, "INIT_HCA command failed, aborting.\n");
+ goto err_free_icm;
+ }
+
+ err = mlx4_QUERY_ADAPTER(dev, &adapter);
+ if (err) {
+ mlx4_err(dev, "QUERY_ADAPTER command failed, aborting.\n");
+ goto err_close;
+ }
+
+ priv->eq_table.inta_pin = adapter.inta_pin;
+ memcpy(dev->board_id, adapter.board_id, sizeof dev->board_id);
+
+ return 0;
+
+err_close:
+ mlx4_CLOSE_HCA(dev, 0);
+
+err_free_icm:
+ mlx4_free_icms(dev);
+
+err_stop_fw:
+ unmap_bf_area(dev);
+ mlx4_UNMAP_FA(dev);
+ mlx4_free_icm(dev, priv->fw.fw_icm, 0);
+
+ return err;
+}
+
+static int mlx4_init_counters_table(struct mlx4_dev *dev)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ int err;
+ int nent;
+
+ switch (dev->caps.counters_mode) {
+ case MLX4_CUNTERS_BASIC:
+ nent = dev->caps.max_basic_counters;
+ break;
+ case MLX4_CUNTERS_EXT:
+ nent = dev->caps.max_ext_counters;
+ break;
+ default:
+ return -ENOENT;
+ }
+ err = mlx4_bitmap_init(&priv->counters_bitmap, nent, nent - 1, 0, 0);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+static void mlx4_cleanup_counters_table(struct mlx4_dev *dev)
+{
+ switch (dev->caps.counters_mode) {
+ case MLX4_CUNTERS_BASIC:
+ case MLX4_CUNTERS_EXT:
+ mlx4_bitmap_cleanup(&mlx4_priv(dev)->counters_bitmap);
+ break;
+ default:
+ break;
+ }
+}
+
+int mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+
+ switch (dev->caps.counters_mode) {
+ case MLX4_CUNTERS_BASIC:
+ case MLX4_CUNTERS_EXT:
+ *idx = mlx4_bitmap_alloc(&priv->counters_bitmap);
+ if (*idx == -1)
+ return -ENOMEM;
+ return 0;
+ default:
+ return -ENOMEM;
+ }
+}
+EXPORT_SYMBOL_GPL(mlx4_counter_alloc);
+
+void mlx4_counter_free(struct mlx4_dev *dev, u32 idx)
+{
+ switch (dev->caps.counters_mode) {
+ case MLX4_CUNTERS_BASIC:
+ case MLX4_CUNTERS_EXT:
+ mlx4_bitmap_free(&mlx4_priv(dev)->counters_bitmap, idx);
+ return;
+ default:
+ return;
+ }
+}
+EXPORT_SYMBOL_GPL(mlx4_counter_free);
+
+static int mlx4_setup_hca(struct mlx4_dev *dev)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ int err;
+ int port;
+ __be32 ib_port_default_caps;
+
+ err = mlx4_init_uar_table(dev);
+ if (err) {
+ mlx4_err(dev, "Failed to initialize "
+ "user access region table, aborting.\n");
+ return err;
+ }
+
+ err = mlx4_uar_alloc(dev, &priv->driver_uar);
+ if (err) {
+ mlx4_err(dev, "Failed to allocate driver access region, "
+ "aborting.\n");
+ goto err_uar_table_free;
+ }
+
+ priv->kar = ioremap(priv->driver_uar.pfn << PAGE_SHIFT, PAGE_SIZE);
+ if (!priv->kar) {
+ mlx4_err(dev, "Couldn't map kernel access region, "
+ "aborting.\n");
+ err = -ENOMEM;
+ goto err_uar_free;
+ }
+
+ err = mlx4_init_pd_table(dev);
+ if (err) {
+ mlx4_err(dev, "Failed to initialize "
+ "protection domain table, aborting.\n");
+ goto err_kar_unmap;
+ }
+
+ err = mlx4_init_xrcd_table(dev);
+ if (err) {
+ mlx4_err(dev, "Failed to initialize extended "
+ "reliably connected domain table, aborting.\n");
+ goto err_pd_table_free;
+ }
+
+ err = mlx4_init_mr_table(dev);
+ if (err) {
+ mlx4_err(dev, "Failed to initialize "
+ "memory region table, aborting.\n");
+ goto err_xrcd_table_free;
+ }
+
+ err = mlx4_init_eq_table(dev);
+ if (err) {
+ mlx4_err(dev, "Failed to initialize "
+ "event queue table, aborting.\n");
+ goto err_mr_table_free;
+ }
+
+ err = mlx4_cmd_use_events(dev);
+ if (err) {
+ mlx4_err(dev, "Failed to switch to event-driven "
+ "firmware commands, aborting.\n");
+ goto err_eq_table_free;
+ }
+
+ err = mlx4_NOP(dev);
+ if (err) {
+ if (dev->flags & MLX4_FLAG_MSI_X) {
+ mlx4_warn(dev, "NOP command failed to generate MSI-X "
+ "interrupt IRQ %d).\n",
+ priv->eq_table.eq[dev->caps.num_comp_vectors].irq);
+ mlx4_warn(dev, "Trying again without MSI-X.\n");
+ } else {
+ mlx4_err(dev, "NOP command failed to generate interrupt "
+ "(IRQ %d), aborting.\n",
+ priv->eq_table.eq[dev->caps.num_comp_vectors].irq);
+ mlx4_err(dev, "BIOS or ACPI interrupt routing problem?\n");
+ }
+
+ goto err_cmd_poll;
+ }
+
+ mlx4_dbg(dev, "NOP command IRQ test passed\n");
+
+ err = mlx4_init_cq_table(dev);
+ if (err) {
+ mlx4_err(dev, "Failed to initialize "
+ "completion queue table, aborting.\n");
+ goto err_cmd_poll;
+ }
+
+ err = mlx4_init_srq_table(dev);
+ if (err) {
+ mlx4_err(dev, "Failed to initialize "
+ "shared receive queue table, aborting.\n");
+ goto err_cq_table_free;
+ }
+
+ err = mlx4_init_qp_table(dev);
+ if (err) {
+ mlx4_err(dev, "Failed to initialize "
+ "queue pair table, aborting.\n");
+ goto err_srq_table_free;
+ }
+
+ err = mlx4_init_mcg_table(dev);
+ if (err) {
+ mlx4_err(dev, "Failed to initialize "
+ "multicast group table, aborting.\n");
+ goto err_qp_table_free;
+ }
+
+ err = mlx4_init_counters_table(dev);
+ if (err && err != -ENOENT) {
+ mlx4_err(dev, "Failed to initialize counters table, aborting.\n");
+ goto err_mcg_table_free;
+ }
+
+ for (port = 1; port <= dev->caps.num_ports; port++) {
+ ib_port_default_caps = 0;
+ err = mlx4_get_port_ib_caps(dev, port, &ib_port_default_caps);
+ if (err)
+ mlx4_warn(dev, "failed to get port %d default "
+ "ib capabilities (%d). Continuing with "
+ "caps = 0\n", port, err);
+ dev->caps.ib_port_def_cap[port] = ib_port_default_caps;
+ err = mlx4_SET_PORT(dev, port);
+ if (err) {
+ mlx4_err(dev, "Failed to set port %d, aborting\n",
+ port);
+ goto err_counters_table_free;
+ }
+ }
+
+ return 0;
+
+err_counters_table_free:
+ mlx4_cleanup_counters_table(dev);
+
+err_mcg_table_free:
+ mlx4_cleanup_mcg_table(dev);
+
+err_qp_table_free:
+ mlx4_cleanup_qp_table(dev);
+
+err_srq_table_free:
+ mlx4_cleanup_srq_table(dev);
+
+err_cq_table_free:
+ mlx4_cleanup_cq_table(dev);
+
+err_cmd_poll:
+ mlx4_cmd_use_polling(dev);
+
+err_eq_table_free:
+ mlx4_cleanup_eq_table(dev);
+
+err_mr_table_free:
+ mlx4_cleanup_mr_table(dev);
+
+err_xrcd_table_free:
+ mlx4_cleanup_xrcd_table(dev);
+
+err_pd_table_free:
+ mlx4_cleanup_pd_table(dev);
+
+err_kar_unmap:
+ iounmap(priv->kar);
+
+err_uar_free:
+ mlx4_uar_free(dev, &priv->driver_uar);
+
+err_uar_table_free:
+ mlx4_cleanup_uar_table(dev);
+ return err;
+}
+
+static void mlx4_enable_msi_x(struct mlx4_dev *dev)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ struct msix_entry *entries;
+ int nreq;
+ int err;
+ int i;
+
+ if (msi_x) {
+ nreq = min_t(int, dev->caps.num_eqs - dev->caps.reserved_eqs,
+ num_possible_cpus() + 1);
+ entries = kcalloc(nreq, sizeof *entries, GFP_KERNEL);
+ if (!entries)
+ goto no_msi;
+
+ for (i = 0; i < nreq; ++i)
+ entries[i].entry = i;
+
+ retry:
+ err = pci_enable_msix(dev->pdev, entries, nreq);
+ if (err) {
+ /* Try again if at least 2 vectors are available */
+ if (err > 1) {
+ mlx4_info(dev, "Requested %d vectors, "
+ "but only %d MSI-X vectors available, "
+ "trying again\n", nreq, err);
+ nreq = err;
+ goto retry;
+ }
+ kfree(entries);
+ goto no_msi;
+ }
+
+ dev->caps.num_comp_vectors = nreq - 1;
+ for (i = 0; i < nreq; ++i)
+ priv->eq_table.eq[i].irq = entries[i].vector;
+
+ dev->flags |= MLX4_FLAG_MSI_X;
+
+ kfree(entries);
+ return;
+ }
+
+no_msi:
+ dev->caps.num_comp_vectors = 1;
+
+ for (i = 0; i < 2; ++i)
+ priv->eq_table.eq[i].irq = dev->pdev->irq;
+}
+
+static int mlx4_init_port_info(struct mlx4_dev *dev, int port)
+{
+ struct mlx4_port_info *info = &mlx4_priv(dev)->port[port];
+ int err = 0;
+
+ info->dev = dev;
+ info->port = port;
+ mlx4_init_mac_table(dev, &info->mac_table);
+ mlx4_init_vlan_table(dev, &info->vlan_table);
+
+ sprintf(info->dev_name, "mlx4_port%d", port);
+ info->port_attr.attr.name = info->dev_name;
+ info->port_attr.attr.mode = S_IRUGO | S_IWUSR;
+ info->port_attr.show = show_port_type;
+ info->port_attr.store = set_port_type;
+
+ err = device_create_file(&dev->pdev->dev, &info->port_attr);
+ if (err) {
+ mlx4_err(dev, "Failed to create file for port %d\n", port);
+ info->port = -1;
+ }
+
+ return err;
+}
+
+static void mlx4_cleanup_port_info(struct mlx4_port_info *info)
+{
+ if (info->port < 0)
+ return;
+
+ device_remove_file(&info->dev->pdev->dev, &info->port_attr);
+}
+
+static int mlx4_init_trigger(struct mlx4_priv *priv)
+{
+ memcpy(&priv->trigger_attr, &dev_attr_port_trigger,
+ sizeof(struct device_attribute));
+ return device_create_file(&priv->dev.pdev->dev, &priv->trigger_attr);
+}
+
+static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+ struct mlx4_priv *priv;
+ struct mlx4_dev *dev;
+ int err;
+ int port;
+ int i;
+
+ printk(KERN_INFO PFX "Initializing %s\n",
+ pci_name(pdev));
+
+ err = pci_enable_device(pdev);
+ if (err) {
+ dev_err(&pdev->dev, "Cannot enable PCI device, "
+ "aborting.\n");
+ return err;
+ }
+
+ /*
+ * Check for BARs. We expect 0: 1MB
+ */
+ if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM) ||
+ pci_resource_len(pdev, 0) != 1 << 20) {
+ dev_err(&pdev->dev, "Missing DCS, aborting.\n");
+ err = -ENODEV;
+ goto err_disable_pdev;
+ }
+ if (!(pci_resource_flags(pdev, 2) & IORESOURCE_MEM)) {
+ dev_err(&pdev->dev, "Missing UAR, aborting.\n");
+ err = -ENODEV;
+ goto err_disable_pdev;
+ }
+
+ err = pci_request_region(pdev, 0, DRV_NAME);
+ if (err) {
+ dev_err(&pdev->dev, "Cannot request control region, aborting.\n");
+ goto err_disable_pdev;
+ }
+
+ err = pci_request_region(pdev, 2, DRV_NAME);
+ if (err) {
+ dev_err(&pdev->dev, "Cannot request UAR region, aborting.\n");
+ goto err_release_bar0;
+ }
+
+ pci_set_master(pdev);
+
+ err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+ if (err) {
+ dev_warn(&pdev->dev, "Warning: couldn't set 64-bit PCI DMA mask.\n");
+ err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+ if (err) {
+ dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting.\n");
+ goto err_release_bar2;
+ }
+ }
+ err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+ if (err) {
+ dev_warn(&pdev->dev, "Warning: couldn't set 64-bit "
+ "consistent PCI DMA mask.\n");
+ err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
+ if (err) {
+ dev_err(&pdev->dev, "Can't set consistent PCI DMA mask, "
+ "aborting.\n");
+ goto err_release_bar2;
+ }
+ }
+
+ priv = kzalloc(sizeof *priv, GFP_KERNEL);
+ if (!priv) {
+ dev_err(&pdev->dev, "Device struct alloc failed, "
+ "aborting.\n");
+ err = -ENOMEM;
+ goto err_release_bar2;
+ }
+
+ dev = &priv->dev;
+ dev->pdev = pdev;
+ INIT_LIST_HEAD(&priv->ctx_list);
+ spin_lock_init(&priv->ctx_lock);
+
+ mutex_init(&priv->port_mutex);
+
+ INIT_LIST_HEAD(&priv->pgdir_list);
+ mutex_init(&priv->pgdir_mutex);
+ for (i = 0; i < MLX4_MAX_PORTS; ++i)
+ priv->iboe_counter_index[i] = -1;
+
+ INIT_LIST_HEAD(&priv->bf_list);
+ mutex_init(&priv->bf_mutex);
+
+ /*
+ * Now reset the HCA before we touch the PCI capabilities or
+ * attempt a firmware command, since a boot ROM may have left
+ * the HCA in an undefined state.
+ */
+ err = mlx4_reset(dev);
+ if (err) {
+ mlx4_err(dev, "Failed to reset HCA, aborting.\n");
+ goto err_free_dev;
+ }
+
+ if (mlx4_cmd_init(dev)) {
+ mlx4_err(dev, "Failed to init command interface, aborting.\n");
+ goto err_free_dev;
+ }
+
+ err = mlx4_init_hca(dev);
+ if (err)
+ goto err_cmd;
+
+ err = mlx4_alloc_eq_table(dev);
+ if (err)
+ goto err_close;
+
+ mlx4_enable_msi_x(dev);
+
+ err = mlx4_setup_hca(dev);
+ if (err == -EBUSY && (dev->flags & MLX4_FLAG_MSI_X)) {
+ dev->flags &= ~MLX4_FLAG_MSI_X;
+ pci_disable_msix(pdev);
+ err = mlx4_setup_hca(dev);
+ }
+
+ if (err)
+ goto err_free_eq;
+
+ for (port = 1; port <= dev->caps.num_ports; port++) {
+ err = mlx4_init_port_info(dev, port);
+ if (err)
+ goto err_port;
+ }
+
+ err = mlx4_register_device(dev);
+ if (err)
+ goto err_port;
+
+ err = mlx4_init_trigger(priv);
+ if (err)
+ goto err_register;
+
+ err = mlx4_sense_init(dev);
+ if (err)
+ goto err_trigger;
+
+ mlx4_start_sense(dev);
+
+ pci_set_drvdata(pdev, dev);
+
+ return 0;
+
+err_trigger:
+ device_remove_file(&dev->pdev->dev, &priv->trigger_attr);
+err_register:
+ mlx4_unregister_device(dev);
+err_port:
+ for (--port; port >= 1; --port)
+ mlx4_cleanup_port_info(&priv->port[port]);
+
+ mlx4_cleanup_counters_table(dev);
+ mlx4_cleanup_mcg_table(dev);
+ mlx4_cleanup_qp_table(dev);
+ mlx4_cleanup_srq_table(dev);
+ mlx4_cleanup_cq_table(dev);
+ mlx4_cmd_use_polling(dev);
+ mlx4_cleanup_eq_table(dev);
+ mlx4_cleanup_mr_table(dev);
+ mlx4_cleanup_xrcd_table(dev);
+ mlx4_cleanup_pd_table(dev);
+ mlx4_cleanup_uar_table(dev);
+
+err_free_eq:
+ mlx4_free_eq_table(dev);
+
+err_close:
+ if (dev->flags & MLX4_FLAG_MSI_X)
+ pci_disable_msix(pdev);
+
+ mlx4_close_hca(dev);
+
+err_cmd:
+ mlx4_cmd_cleanup(dev);
+
+err_free_dev:
+ kfree(priv);
+
+err_release_bar2:
+ pci_release_region(pdev, 2);
+
+err_release_bar0:
+ pci_release_region(pdev, 0);
+
+err_disable_pdev:
+ pci_disable_device(pdev);
+ pci_set_drvdata(pdev, NULL);
+ return err;
+}
+
+static int __devinit mlx4_init_one(struct pci_dev *pdev,
+ const struct pci_device_id *id)
+{
+ static int mlx4_version_printed;
+
+ if (!mlx4_version_printed) {
+ printk(KERN_INFO "%s", mlx4_version);
+ ++mlx4_version_printed;
+ }
+
+ return __mlx4_init_one(pdev, id);
+}
+
+static void mlx4_remove_one(struct pci_dev *pdev)
+{
+ struct mlx4_dev *dev = pci_get_drvdata(pdev);
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ int p;
+
+ if (dev) {
+ mlx4_sense_cleanup(dev);
+ mlx4_unregister_device(dev);
+ device_remove_file(&dev->pdev->dev, &priv->trigger_attr);
+
+ for (p = 1; p <= dev->caps.num_ports; p++) {
+ mlx4_cleanup_port_info(&priv->port[p]);
+ mlx4_CLOSE_PORT(dev, p);
+ }
+
+ mlx4_cleanup_counters_table(dev);
+ mlx4_cleanup_mcg_table(dev);
+ mlx4_cleanup_qp_table(dev);
+ mlx4_cleanup_srq_table(dev);
+ mlx4_cleanup_cq_table(dev);
+ mlx4_cmd_use_polling(dev);
+ mlx4_cleanup_eq_table(dev);
+ mlx4_cleanup_mr_table(dev);
+ mlx4_cleanup_xrcd_table(dev);
+ mlx4_cleanup_pd_table(dev);
+
+ iounmap(priv->kar);
+ mlx4_uar_free(dev, &priv->driver_uar);
+ mlx4_cleanup_uar_table(dev);
+ mlx4_free_eq_table(dev);
+ mlx4_close_hca(dev);
+ mlx4_cmd_cleanup(dev);
+
+ if (dev->flags & MLX4_FLAG_MSI_X)
+ pci_disable_msix(pdev);
+
+ kfree(priv);
+ pci_release_region(pdev, 2);
+ pci_release_region(pdev, 0);
+ pci_disable_device(pdev);
+ pci_set_drvdata(pdev, NULL);
+ }
+}
+
+int mlx4_restart_one(struct pci_dev *pdev)
+{
+ mlx4_remove_one(pdev);
+ return __mlx4_init_one(pdev, NULL);
+}
+
+static struct pci_device_id mlx4_pci_table[] = {
+ { PCI_VDEVICE(MELLANOX, 0x6340) }, /* MT25408 "Hermon" SDR */
+ { PCI_VDEVICE(MELLANOX, 0x634a) }, /* MT25408 "Hermon" DDR */
+ { PCI_VDEVICE(MELLANOX, 0x6354) }, /* MT25408 "Hermon" QDR */
+ { PCI_VDEVICE(MELLANOX, 0x6732) }, /* MT25408 "Hermon" DDR PCIe gen2 */
+ { PCI_VDEVICE(MELLANOX, 0x673c) }, /* MT25408 "Hermon" QDR PCIe gen2 */
+ { PCI_VDEVICE(MELLANOX, 0x6368) }, /* MT25408 "Hermon" EN 10GigE */
+ { PCI_VDEVICE(MELLANOX, 0x6750) }, /* MT25408 "Hermon" EN 10GigE PCIe gen2 */
+ { PCI_VDEVICE(MELLANOX, 0x6372) }, /* MT25458 ConnectX EN 10GBASE-T 10GigE */
+ { PCI_VDEVICE(MELLANOX, 0x675a) }, /* MT25458 ConnectX EN 10GBASE-T+Gen2 10GigE */
+ { PCI_VDEVICE(MELLANOX, 0x6764) }, /* MT26468 ConnectX EN 10GigE PCIe gen2 */
+ { PCI_VDEVICE(MELLANOX, 0x6746) }, /* MT26438 ConnectX VPI PCIe 2.0 5GT/s - IB QDR / 10GigE Virt+ */
+ { PCI_VDEVICE(MELLANOX, 0x676e) }, /* MT26478 ConnectX EN 40GigE PCIe 2.0 5GT/s */
+ { PCI_VDEVICE(MELLANOX, 0x6778) }, /* MT26488 ConnectX VPI PCIe 2.0 5GT/s - IB DDR / 10GigE Virt+ */
+ { PCI_VDEVICE(MELLANOX, 0x1000) },
+ { PCI_VDEVICE(MELLANOX, 0x1001) },
+ { PCI_VDEVICE(MELLANOX, 0x1002) },
+ { PCI_VDEVICE(MELLANOX, 0x1003) },
+ { PCI_VDEVICE(MELLANOX, 0x1004) },
+ { PCI_VDEVICE(MELLANOX, 0x1005) },
+ { PCI_VDEVICE(MELLANOX, 0x1006) },
+ { PCI_VDEVICE(MELLANOX, 0x1007) },
+ { PCI_VDEVICE(MELLANOX, 0x1008) },
+ { PCI_VDEVICE(MELLANOX, 0x1009) },
+ { PCI_VDEVICE(MELLANOX, 0x100a) },
+ { PCI_VDEVICE(MELLANOX, 0x100b) },
+ { PCI_VDEVICE(MELLANOX, 0x100c) },
+ { PCI_VDEVICE(MELLANOX, 0x100d) },
+ { PCI_VDEVICE(MELLANOX, 0x100e) },
+ { PCI_VDEVICE(MELLANOX, 0x100f) },
+ { 0, }
+};
+
+MODULE_DEVICE_TABLE(pci, mlx4_pci_table);
+
+static struct pci_driver mlx4_driver = {
+ .name = DRV_NAME,
+ .id_table = mlx4_pci_table,
+ .probe = mlx4_init_one,
+ .remove = __devexit_p(mlx4_remove_one)
+};
+
+static int __init mlx4_verify_params(void)
+{
+ if ((log_num_mac < 0) || (log_num_mac > 7)) {
+ printk(KERN_WARNING "mlx4_core: bad num_mac: %d\n", log_num_mac);
+ return -1;
+ }
+
+ if (log_mtts_per_seg == 0)
+ log_mtts_per_seg = ilog2(MLX4_MTT_ENTRY_PER_SEG);
+ if ((log_mtts_per_seg < 1) || (log_mtts_per_seg > 7)) {
+ printk(KERN_WARNING "mlx4_core: bad log_mtts_per_seg: %d\n", log_mtts_per_seg);
+ return -1;
+ }
+
+ return 0;
+}
+
+static int __init mlx4_init(void)
+{
+ int ret;
+
+ mutex_init(&drv_mutex);
+
+ if (mlx4_verify_params())
+ return -EINVAL;
+
+ mlx4_catas_init();
+
+ mlx4_wq = create_singlethread_workqueue("mlx4");
+ if (!mlx4_wq)
+ return -ENOMEM;
+
+ ret = pci_register_driver(&mlx4_driver);
+ return ret < 0 ? ret : 0;
+}
+
+static void __exit mlx4_cleanup(void)
+{
+ mutex_lock(&drv_mutex);
+ mlx4_config_cleanup();
+ pci_unregister_driver(&mlx4_driver);
+ mutex_unlock(&drv_mutex);
+ destroy_workqueue(mlx4_wq);
+}
+
+module_init_order(mlx4_init, SI_ORDER_MIDDLE);
+module_exit(mlx4_cleanup);
+
+#undef MODULE_VERSION
+#include <sys/module.h>
+static int
+mlx4_evhand(module_t mod, int event, void *arg)
+{
+ return (0);
+}
+
+static moduledata_t mlx4_mod = {
+ .name = "mlx4",
+ .evhand = mlx4_evhand,
+};
+MODULE_VERSION(mlx4, 1);
+DECLARE_MODULE(mlx4, mlx4_mod, SI_SUB_SMP, SI_ORDER_ANY);
diff --git a/sys/ofed/drivers/net/mlx4/mcg.c b/sys/ofed/drivers/net/mlx4/mcg.c
new file mode 100644
index 0000000..70493e3
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/mcg.c
@@ -0,0 +1,366 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+
+#include <linux/mlx4/cmd.h>
+#include <linux/mlx4/driver.h>
+
+#include "mlx4.h"
+
+#define MGM_QPN_MASK 0x00FFFFFF
+#define MGM_BLCK_LB_BIT 30
+
+struct mlx4_mgm {
+ __be32 next_gid_index;
+ __be32 members_count;
+ u32 reserved[2];
+ u8 gid[16];
+ __be32 qp[MLX4_QP_PER_MGM];
+};
+
+static const u8 zero_gid[16]; /* automatically initialized to 0 */
+
+static int mlx4_READ_MCG(struct mlx4_dev *dev, int index,
+ struct mlx4_cmd_mailbox *mailbox)
+{
+ return mlx4_cmd_box(dev, 0, mailbox->dma, index, 0, MLX4_CMD_READ_MCG,
+ MLX4_CMD_TIME_CLASS_A);
+}
+
+static int mlx4_WRITE_MCG(struct mlx4_dev *dev, int index,
+ struct mlx4_cmd_mailbox *mailbox)
+{
+ return mlx4_cmd(dev, mailbox->dma, index, 0, MLX4_CMD_WRITE_MCG,
+ MLX4_CMD_TIME_CLASS_A);
+}
+
+static int mlx4_MGID_HASH(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+ u16 *hash)
+{
+ u64 imm;
+ int err;
+
+ err = mlx4_cmd_imm(dev, mailbox->dma, &imm, 0, 0, MLX4_CMD_MGID_HASH,
+ MLX4_CMD_TIME_CLASS_A);
+
+ if (!err)
+ *hash = imm;
+
+ return err;
+}
+
+/*
+ * Caller must hold MCG table semaphore. gid and mgm parameters must
+ * be properly aligned for command interface.
+ *
+ * Returns 0 unless a firmware command error occurs.
+ *
+ * If GID is found in MGM or MGM is empty, *index = *hash, *prev = -1
+ * and *mgm holds MGM entry.
+ *
+ * if GID is found in AMGM, *index = index in AMGM, *prev = index of
+ * previous entry in hash chain and *mgm holds AMGM entry.
+ *
+ * If no AMGM exists for given gid, *index = -1, *prev = index of last
+ * entry in hash chain and *mgm holds end of hash chain.
+ */
+static int find_mgm(struct mlx4_dev *dev,
+ u8 *gid, enum mlx4_mcast_prot prot,
+ struct mlx4_cmd_mailbox *mgm_mailbox,
+ u16 *hash, int *prev, int *index)
+{
+ struct mlx4_cmd_mailbox *mailbox;
+ struct mlx4_mgm *mgm = mgm_mailbox->buf;
+ u8 *mgid;
+ int err;
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev);
+ if (IS_ERR(mailbox))
+ return -ENOMEM;
+ mgid = mailbox->buf;
+
+ memcpy(mgid, gid, 16);
+
+ err = mlx4_MGID_HASH(dev, mailbox, hash);
+ mlx4_free_cmd_mailbox(dev, mailbox);
+ if (err)
+ return err;
+
+ if (0)
+ mlx4_dbg(dev, "Hash for %pI6 is %04x\n", gid, *hash);
+
+ *index = *hash;
+ *prev = -1;
+
+ do {
+ err = mlx4_READ_MCG(dev, *index, mgm_mailbox);
+ if (err)
+ return err;
+
+ if (!memcmp(mgm->gid, zero_gid, 16)) {
+ if (*index != *hash) {
+ mlx4_err(dev, "Found zero MGID in AMGM.\n");
+ err = -EINVAL;
+ }
+ return err;
+ }
+
+ if (!memcmp(mgm->gid, gid, 16) &&
+ (prot == be32_to_cpu(mgm->members_count) >> 30))
+ return err;
+
+ *prev = *index;
+ *index = be32_to_cpu(mgm->next_gid_index) >> 6;
+ } while (*index);
+
+ *index = -1;
+ return err;
+}
+
+int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
+ int block_mcast_loopback, enum mlx4_mcast_prot prot)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ struct mlx4_cmd_mailbox *mailbox;
+ struct mlx4_mgm *mgm;
+ u32 members_count;
+ u16 hash;
+ int index, prev;
+ int link = 0;
+ int i;
+ int err;
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ mgm = mailbox->buf;
+
+ mutex_lock(&priv->mcg_table.mutex);
+
+ err = find_mgm(dev, gid, prot, mailbox, &hash, &prev, &index);
+ if (err)
+ goto out;
+
+ if (index != -1) {
+ if (!memcmp(mgm->gid, zero_gid, 16))
+ memcpy(mgm->gid, gid, 16);
+ } else {
+ link = 1;
+
+ index = mlx4_bitmap_alloc(&priv->mcg_table.bitmap);
+ if (index == -1) {
+ mlx4_err(dev, "No AMGM entries left\n");
+ err = -ENOMEM;
+ goto out;
+ }
+ index += dev->caps.num_mgms;
+
+ memset(mgm, 0, sizeof *mgm);
+ memcpy(mgm->gid, gid, 16);
+ }
+
+ members_count = be32_to_cpu(mgm->members_count) & 0xffffff;
+ if (members_count == MLX4_QP_PER_MGM) {
+ mlx4_err(dev, "MGM at index %x is full.\n", index);
+ err = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < members_count; ++i)
+ if ((be32_to_cpu(mgm->qp[i]) & MGM_QPN_MASK) == qp->qpn) {
+ mlx4_dbg(dev, "QP %06x already a member of MGM\n", qp->qpn);
+ err = 0;
+ goto out;
+ }
+
+ mgm->qp[members_count++] = cpu_to_be32((qp->qpn & MGM_QPN_MASK) |
+ (!!mlx4_blck_lb << MGM_BLCK_LB_BIT));
+
+ mgm->members_count = cpu_to_be32(members_count | ((u32) prot << 30));
+
+ err = mlx4_WRITE_MCG(dev, index, mailbox);
+ if (err)
+ goto out;
+
+ if (!link)
+ goto out;
+
+ err = mlx4_READ_MCG(dev, prev, mailbox);
+ if (err)
+ goto out;
+
+ mgm->next_gid_index = cpu_to_be32(index << 6);
+
+ err = mlx4_WRITE_MCG(dev, prev, mailbox);
+ if (err)
+ goto out;
+
+out:
+ if (err && link && index != -1) {
+ if (index < dev->caps.num_mgms)
+ mlx4_warn(dev, "Got AMGM index %d < %d",
+ index, dev->caps.num_mgms);
+ else
+ mlx4_bitmap_free(&priv->mcg_table.bitmap,
+ index - dev->caps.num_mgms);
+ }
+ mutex_unlock(&priv->mcg_table.mutex);
+
+ mlx4_free_cmd_mailbox(dev, mailbox);
+ return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_multicast_attach);
+
+int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
+ enum mlx4_mcast_prot prot)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ struct mlx4_cmd_mailbox *mailbox;
+ struct mlx4_mgm *mgm;
+ u32 members_count;
+ u16 hash;
+ int prev, index;
+ int i, loc;
+ int err;
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ mgm = mailbox->buf;
+
+ mutex_lock(&priv->mcg_table.mutex);
+
+ err = find_mgm(dev, gid, prot, mailbox, &hash, &prev, &index);
+ if (err)
+ goto out;
+
+ if (index == -1) {
+ mlx4_err(dev, "MGID %pI6 not found\n", gid);
+ err = -EINVAL;
+ goto out;
+ }
+
+ members_count = be32_to_cpu(mgm->members_count) & 0xffffff;
+ for (loc = -1, i = 0; i < members_count; ++i)
+ if ((be32_to_cpu(mgm->qp[i]) & MGM_QPN_MASK) == qp->qpn)
+ loc = i;
+
+ if (loc == -1) {
+ mlx4_err(dev, "QP %06x not found in MGM\n", qp->qpn);
+ err = -EINVAL;
+ goto out;
+ }
+
+
+ mgm->members_count = cpu_to_be32(--members_count | ((u32) prot << 30));
+ mgm->qp[loc] = mgm->qp[i - 1];
+ mgm->qp[i - 1] = 0;
+
+ if (i != 1) {
+ err = mlx4_WRITE_MCG(dev, index, mailbox);
+ goto out;
+ }
+
+ if (prev == -1) {
+ /* Remove entry from MGM */
+ int amgm_index = be32_to_cpu(mgm->next_gid_index) >> 6;
+ if (amgm_index) {
+ err = mlx4_READ_MCG(dev, amgm_index, mailbox);
+ if (err)
+ goto out;
+ } else
+ memset(mgm->gid, 0, 16);
+
+ err = mlx4_WRITE_MCG(dev, index, mailbox);
+ if (err)
+ goto out;
+
+ if (amgm_index) {
+ if (amgm_index < dev->caps.num_mgms)
+ mlx4_warn(dev, "MGM entry %d had AMGM index %d < %d",
+ index, amgm_index, dev->caps.num_mgms);
+ else
+ mlx4_bitmap_free(&priv->mcg_table.bitmap,
+ amgm_index - dev->caps.num_mgms);
+ }
+ } else {
+ /* Remove entry from AMGM */
+ int cur_next_index = be32_to_cpu(mgm->next_gid_index) >> 6;
+ err = mlx4_READ_MCG(dev, prev, mailbox);
+ if (err)
+ goto out;
+
+ mgm->next_gid_index = cpu_to_be32(cur_next_index << 6);
+
+ err = mlx4_WRITE_MCG(dev, prev, mailbox);
+ if (err)
+ goto out;
+
+ if (index < dev->caps.num_mgms)
+ mlx4_warn(dev, "entry %d had next AMGM index %d < %d",
+ prev, index, dev->caps.num_mgms);
+ else
+ mlx4_bitmap_free(&priv->mcg_table.bitmap,
+ index - dev->caps.num_mgms);
+ }
+
+out:
+ mutex_unlock(&priv->mcg_table.mutex);
+
+ mlx4_free_cmd_mailbox(dev, mailbox);
+ return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_multicast_detach);
+
+int mlx4_init_mcg_table(struct mlx4_dev *dev)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ int err;
+
+ err = mlx4_bitmap_init(&priv->mcg_table.bitmap, dev->caps.num_amgms,
+ dev->caps.num_amgms - 1, 0, 0);
+ if (err)
+ return err;
+
+ mutex_init(&priv->mcg_table.mutex);
+
+ return 0;
+}
+
+void mlx4_cleanup_mcg_table(struct mlx4_dev *dev)
+{
+ mlx4_bitmap_cleanup(&mlx4_priv(dev)->mcg_table.bitmap);
+}
diff --git a/sys/ofed/drivers/net/mlx4/mlx4.h b/sys/ofed/drivers/net/mlx4/mlx4.h
new file mode 100644
index 0000000..d5d3da9
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/mlx4.h
@@ -0,0 +1,427 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_H
+#define MLX4_H
+
+#include <linux/mutex.h>
+#include <linux/radix-tree.h>
+#include <linux/timer.h>
+#include <linux/workqueue.h>
+
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/driver.h>
+#include <linux/mlx4/doorbell.h>
+
+#define DRV_NAME "mlx4_core"
+#define PFX DRV_NAME ": "
+#define DRV_VERSION "1.0-ofed1.5.2"
+#define DRV_RELDATE "August 4, 2010"
+
+enum {
+ MLX4_HCR_BASE = 0x80680,
+ MLX4_HCR_SIZE = 0x0001c,
+ MLX4_CLR_INT_SIZE = 0x00008
+};
+
+enum {
+ MLX4_MGM_ENTRY_SIZE = 0x100,
+ MLX4_QP_PER_MGM = 4 * (MLX4_MGM_ENTRY_SIZE / 16 - 2),
+ MLX4_MTT_ENTRY_PER_SEG = 8
+};
+
+enum {
+ MLX4_NUM_PDS = 1 << 15
+};
+
+enum {
+ MLX4_CMPT_TYPE_QP = 0,
+ MLX4_CMPT_TYPE_SRQ = 1,
+ MLX4_CMPT_TYPE_CQ = 2,
+ MLX4_CMPT_TYPE_EQ = 3,
+ MLX4_CMPT_NUM_TYPE
+};
+
+enum {
+ MLX4_CMPT_SHIFT = 24,
+ MLX4_NUM_CMPTS = MLX4_CMPT_NUM_TYPE << MLX4_CMPT_SHIFT
+};
+
+#ifdef CONFIG_MLX4_DEBUG
+extern int mlx4_debug_level;
+#else /* CONFIG_MLX4_DEBUG */
+#define mlx4_debug_level (0)
+#endif /* CONFIG_MLX4_DEBUG */
+
+#define mlx4_dbg(mdev, format, arg...) \
+ do { \
+ if (mlx4_debug_level) \
+ dev_printk(KERN_DEBUG, &mdev->pdev->dev, format, ## arg); \
+ } while (0)
+
+#define mlx4_err(mdev, format, arg...) \
+ dev_err(&mdev->pdev->dev, format, ## arg)
+#define mlx4_info(mdev, format, arg...) \
+ dev_info(&mdev->pdev->dev, format, ## arg)
+#define mlx4_warn(mdev, format, arg...) \
+ dev_warn(&mdev->pdev->dev, format, ## arg)
+
+extern int mlx4_blck_lb;
+
+struct mlx4_bitmap {
+ u32 last;
+ u32 top;
+ u32 max;
+ u32 reserved_top;
+ u32 mask;
+ u32 avail;
+ spinlock_t lock;
+ unsigned long *table;
+};
+
+struct mlx4_buddy {
+ unsigned long **bits;
+ unsigned int *num_free;
+ int max_order;
+ spinlock_t lock;
+};
+
+struct mlx4_icm;
+
+struct mlx4_icm_table {
+ u64 virt;
+ int num_icm;
+ int num_obj;
+ int obj_size;
+ int lowmem;
+ int coherent;
+ struct mutex mutex;
+ struct mlx4_icm **icm;
+};
+
+struct mlx4_eq {
+ struct mlx4_dev *dev;
+ void __iomem *doorbell;
+ int eqn;
+ u32 cons_index;
+ u16 irq;
+ u16 have_irq;
+ int nent;
+ int load;
+ struct mlx4_buf_list *page_list;
+ struct mlx4_mtt mtt;
+};
+
+struct mlx4_profile {
+ int num_qp;
+ int rdmarc_per_qp;
+ int num_srq;
+ int num_cq;
+ int num_mcg;
+ int num_mpt;
+ int num_mtt;
+};
+
+struct mlx4_fw {
+ u64 clr_int_base;
+ u64 catas_offset;
+ struct mlx4_icm *fw_icm;
+ struct mlx4_icm *aux_icm;
+ u32 catas_size;
+ u16 fw_pages;
+ u8 clr_int_bar;
+ u8 catas_bar;
+};
+
+struct mlx4_cmd {
+ struct pci_pool *pool;
+ void __iomem *hcr;
+ struct mutex hcr_mutex;
+ struct semaphore poll_sem;
+ struct semaphore event_sem;
+ int max_cmds;
+ spinlock_t context_lock;
+ int free_head;
+ struct mlx4_cmd_context *context;
+ u16 token_mask;
+ u8 use_events;
+ u8 toggle;
+};
+
+struct mlx4_uar_table {
+ struct mlx4_bitmap bitmap;
+};
+
+struct mlx4_mr_table {
+ struct mlx4_bitmap mpt_bitmap;
+ struct mlx4_buddy mtt_buddy;
+ u64 mtt_base;
+ u64 mpt_base;
+ struct mlx4_icm_table mtt_table;
+ struct mlx4_icm_table dmpt_table;
+};
+
+struct mlx4_cq_table {
+ struct mlx4_bitmap bitmap;
+ spinlock_t lock;
+ struct radix_tree_root tree;
+ struct mlx4_icm_table table;
+ struct mlx4_icm_table cmpt_table;
+};
+
+struct mlx4_eq_table {
+ struct mlx4_bitmap bitmap;
+ char *irq_names;
+ void __iomem *clr_int;
+ void __iomem **uar_map;
+ u32 clr_mask;
+ struct mlx4_eq *eq;
+ struct mlx4_icm_table table;
+ struct mlx4_icm_table cmpt_table;
+ int have_irq;
+ u8 inta_pin;
+};
+
+struct mlx4_srq_table {
+ struct mlx4_bitmap bitmap;
+ spinlock_t lock;
+ struct mlx4_icm_table table;
+ struct mlx4_icm_table cmpt_table;
+};
+
+struct mlx4_qp_table {
+ struct mlx4_bitmap bitmap;
+ u32 rdmarc_base;
+ int rdmarc_shift;
+ spinlock_t lock;
+ struct mlx4_icm_table qp_table;
+ struct mlx4_icm_table auxc_table;
+ struct mlx4_icm_table altc_table;
+ struct mlx4_icm_table rdmarc_table;
+ struct mlx4_icm_table cmpt_table;
+};
+
+struct mlx4_mcg_table {
+ struct mutex mutex;
+ struct mlx4_bitmap bitmap;
+ struct mlx4_icm_table table;
+};
+
+struct mlx4_catas_err {
+ u32 __iomem *map;
+ struct timer_list timer;
+ struct list_head list;
+};
+
+#define MLX4_MAX_MAC_NUM 128
+#define MLX4_MAC_TABLE_SIZE (MLX4_MAX_MAC_NUM << 3)
+
+struct mlx4_mac_table {
+ __be64 entries[MLX4_MAX_MAC_NUM];
+ int refs[MLX4_MAX_MAC_NUM];
+ struct mutex mutex;
+ int total;
+ int max;
+};
+
+#define MLX4_MAX_VLAN_NUM 128
+#define MLX4_VLAN_TABLE_SIZE (MLX4_MAX_VLAN_NUM << 2)
+
+struct mlx4_vlan_table {
+ __be32 entries[MLX4_MAX_VLAN_NUM];
+ int refs[MLX4_MAX_VLAN_NUM];
+ struct mutex mutex;
+ int total;
+ int max;
+};
+
+struct mlx4_port_info {
+ struct mlx4_dev *dev;
+ int port;
+ char dev_name[16];
+ struct device_attribute port_attr;
+ enum mlx4_port_type tmp_type;
+ struct mlx4_mac_table mac_table;
+ struct mlx4_vlan_table vlan_table;
+};
+
+struct mlx4_sense {
+ struct mlx4_dev *dev;
+ u8 do_sense_port[MLX4_MAX_PORTS + 1];
+ u8 sense_allowed[MLX4_MAX_PORTS + 1];
+ struct delayed_work sense_poll;
+ struct workqueue_struct *sense_wq;
+ u32 resched;
+};
+
+extern struct mutex drv_mutex;
+
+struct mlx4_priv {
+ struct mlx4_dev dev;
+
+ struct list_head dev_list;
+ struct list_head ctx_list;
+ spinlock_t ctx_lock;
+
+ struct list_head pgdir_list;
+ struct mutex pgdir_mutex;
+
+ struct mlx4_fw fw;
+ struct mlx4_cmd cmd;
+
+ struct mlx4_bitmap pd_bitmap;
+ struct mlx4_bitmap xrcd_bitmap;
+ struct mlx4_uar_table uar_table;
+ struct mlx4_mr_table mr_table;
+ struct mlx4_cq_table cq_table;
+ struct mlx4_eq_table eq_table;
+ struct mlx4_srq_table srq_table;
+ struct mlx4_qp_table qp_table;
+ struct mlx4_mcg_table mcg_table;
+ struct mlx4_bitmap counters_bitmap;
+ struct list_head bf_list;
+ struct mutex bf_mutex;
+
+ struct mlx4_catas_err catas_err;
+
+ void __iomem *clr_base;
+
+ struct mlx4_uar driver_uar;
+ void __iomem *kar;
+ struct mlx4_port_info port[MLX4_MAX_PORTS + 1];
+ struct device_attribute trigger_attr;
+ int trig;
+ int changed_ports;
+ struct mlx4_sense sense;
+ struct mutex port_mutex;
+ int iboe_counter_index[MLX4_MAX_PORTS];
+ struct io_mapping *bf_mapping;
+};
+
+static inline struct mlx4_priv *mlx4_priv(struct mlx4_dev *dev)
+{
+ return container_of(dev, struct mlx4_priv, dev);
+}
+
+#define MLX4_SENSE_RANGE (HZ * 3)
+
+extern struct workqueue_struct *mlx4_wq;
+
+u32 mlx4_bitmap_alloc(struct mlx4_bitmap *bitmap);
+void mlx4_bitmap_free(struct mlx4_bitmap *bitmap, u32 obj);
+u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt, int align);
+void mlx4_bitmap_free_range(struct mlx4_bitmap *bitmap, u32 obj, int cnt);
+u32 mlx4_bitmap_avail(struct mlx4_bitmap *bitmap);
+int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, u32 num, u32 mask,
+ u32 reserved_bot, u32 resetrved_top);
+void mlx4_bitmap_cleanup(struct mlx4_bitmap *bitmap);
+
+int mlx4_reset(struct mlx4_dev *dev);
+
+int mlx4_alloc_eq_table(struct mlx4_dev *dev);
+void mlx4_free_eq_table(struct mlx4_dev *dev);
+
+int mlx4_init_pd_table(struct mlx4_dev *dev);
+int mlx4_init_xrcd_table(struct mlx4_dev *dev);
+int mlx4_init_uar_table(struct mlx4_dev *dev);
+int mlx4_init_mr_table(struct mlx4_dev *dev);
+int mlx4_init_eq_table(struct mlx4_dev *dev);
+int mlx4_init_cq_table(struct mlx4_dev *dev);
+int mlx4_init_qp_table(struct mlx4_dev *dev);
+int mlx4_init_srq_table(struct mlx4_dev *dev);
+int mlx4_init_mcg_table(struct mlx4_dev *dev);
+
+void mlx4_cleanup_pd_table(struct mlx4_dev *dev);
+void mlx4_cleanup_uar_table(struct mlx4_dev *dev);
+void mlx4_cleanup_mr_table(struct mlx4_dev *dev);
+void mlx4_cleanup_eq_table(struct mlx4_dev *dev);
+void mlx4_cleanup_cq_table(struct mlx4_dev *dev);
+void mlx4_cleanup_qp_table(struct mlx4_dev *dev);
+void mlx4_cleanup_srq_table(struct mlx4_dev *dev);
+void mlx4_cleanup_mcg_table(struct mlx4_dev *dev);
+void mlx4_cleanup_xrcd_table(struct mlx4_dev *dev);
+
+void mlx4_start_catas_poll(struct mlx4_dev *dev);
+void mlx4_stop_catas_poll(struct mlx4_dev *dev);
+void mlx4_catas_init(void);
+int mlx4_restart_one(struct pci_dev *pdev);
+int mlx4_register_device(struct mlx4_dev *dev);
+void mlx4_unregister_device(struct mlx4_dev *dev);
+void mlx4_dispatch_event(struct mlx4_dev *dev, enum mlx4_dev_event type, int port);
+void *mlx4_find_get_prot_dev(struct mlx4_dev *dev, enum mlx4_prot proto, int port);
+
+struct mlx4_dev_cap;
+struct mlx4_init_hca_param;
+
+u64 mlx4_make_profile(struct mlx4_dev *dev,
+ struct mlx4_profile *request,
+ struct mlx4_dev_cap *dev_cap,
+ struct mlx4_init_hca_param *init_hca);
+
+int mlx4_cmd_init(struct mlx4_dev *dev);
+void mlx4_cmd_cleanup(struct mlx4_dev *dev);
+void mlx4_cmd_event(struct mlx4_dev *dev, u16 token, u8 status, u64 out_param);
+int mlx4_cmd_use_events(struct mlx4_dev *dev);
+void mlx4_cmd_use_polling(struct mlx4_dev *dev);
+
+void mlx4_cq_completion(struct mlx4_dev *dev, u32 cqn);
+void mlx4_cq_event(struct mlx4_dev *dev, u32 cqn, int event_type);
+
+void mlx4_qp_event(struct mlx4_dev *dev, u32 qpn, int event_type);
+
+void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type);
+
+void mlx4_handle_catas_err(struct mlx4_dev *dev);
+
+void mlx4_do_sense_ports(struct mlx4_dev *dev,
+ enum mlx4_port_type *stype,
+ enum mlx4_port_type *defaults);
+void mlx4_start_sense(struct mlx4_dev *dev);
+void mlx4_stop_sense(struct mlx4_dev *dev);
+int mlx4_sense_init(struct mlx4_dev *dev);
+void mlx4_sense_cleanup(struct mlx4_dev *dev);
+int mlx4_check_port_params(struct mlx4_dev *dev,
+ enum mlx4_port_type *port_type);
+int mlx4_change_port_types(struct mlx4_dev *dev,
+ enum mlx4_port_type *port_types);
+
+void mlx4_init_mac_table(struct mlx4_dev *dev, struct mlx4_mac_table *table);
+void mlx4_init_vlan_table(struct mlx4_dev *dev, struct mlx4_vlan_table *table);
+
+int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port);
+int mlx4_get_port_ib_caps(struct mlx4_dev *dev, u8 port, __be32 *caps);
+
+#endif /* MLX4_H */
diff --git a/sys/ofed/drivers/net/mlx4/mlx4_en.h b/sys/ofed/drivers/net/mlx4/mlx4_en.h
new file mode 100644
index 0000000..6ab258e
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/mlx4_en.h
@@ -0,0 +1,651 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef _MLX4_EN_H_
+#define _MLX4_EN_H_
+
+#include <sys/cdefs.h>
+
+#include <linux/types.h>
+#include <linux/compiler.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/netdevice.h>
+
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/qp.h>
+#include <linux/mlx4/cq.h>
+#include <linux/mlx4/srq.h>
+#include <linux/mlx4/doorbell.h>
+#include <linux/mlx4/cmd.h>
+
+#include <net/if_media.h>
+#include <netinet/tcp_lro.h>
+
+#include "en_port.h"
+
+#define DRV_NAME "mlx4_en"
+#define DRV_VERSION "1.5.2"
+#define DRV_RELDATE "July 2010"
+
+/* XXX */
+#define NETIF_MSG_LINK 0x1
+#define NETIF_MSG_IFDOWN 0x2
+#define NETIF_MSG_HW 0x4
+#define NETIF_MSG_DRV 0x8
+#define NETIF_MSG_INTR 0x10
+#define NETIF_MSG_RX_ERR 0x20
+
+#define MLX4_EN_MSG_LEVEL (NETIF_MSG_LINK | NETIF_MSG_IFDOWN)
+
+#define en_print(level, priv, format, arg...) \
+ { \
+ if ((priv)->registered) \
+ printk(level "%s: %s: " format, DRV_NAME, \
+ (priv->dev)->if_xname, ## arg); \
+ else \
+ printk(level "%s: %s: Port %d: " format, \
+ DRV_NAME, dev_name(&priv->mdev->pdev->dev), \
+ (priv)->port, ## arg); \
+ }
+
+#define en_dbg(mlevel, priv, format, arg...) \
+ if (NETIF_MSG_##mlevel & priv->msg_enable) \
+ en_print(KERN_DEBUG, priv, format, ## arg)
+#define en_warn(priv, format, arg...) \
+ en_print(KERN_WARNING, priv, format, ## arg)
+#define en_err(priv, format, arg...) \
+ en_print(KERN_ERR, priv, format, ## arg)
+#define en_info(priv, format, arg...) \
+ en_print(KERN_INFO, priv, format, ## arg)
+
+#define mlx4_err(mdev, format, arg...) \
+ printk(KERN_ERR "%s %s: " format , DRV_NAME ,\
+ dev_name(&mdev->pdev->dev) , ## arg)
+#define mlx4_info(mdev, format, arg...) \
+ printk(KERN_INFO "%s %s: " format , DRV_NAME ,\
+ dev_name(&mdev->pdev->dev) , ## arg)
+#define mlx4_warn(mdev, format, arg...) \
+ printk(KERN_WARNING "%s %s: " format , DRV_NAME ,\
+ dev_name(&mdev->pdev->dev) , ## arg)
+
+/*
+ * Device constants
+ */
+
+
+#define MLX4_EN_PAGE_SHIFT 12
+#define MLX4_EN_PAGE_SIZE (1 << MLX4_EN_PAGE_SHIFT)
+#define MAX_TX_RINGS (MLX4_EN_NUM_HASH_RINGS + 1 + MLX4_EN_NUM_PPP_RINGS)
+#define MAX_RX_RINGS 16
+#define TXBB_SIZE 64
+#define HEADROOM (2048 / TXBB_SIZE + 1)
+#define STAMP_STRIDE 64
+#define STAMP_DWORDS (STAMP_STRIDE / 4)
+#define STAMP_SHIFT 31
+#define STAMP_VAL 0x7fffffff
+#define STATS_DELAY (HZ / 4)
+
+/* Typical TSO descriptor with 16 gather entries is 352 bytes... */
+#define MAX_DESC_SIZE 512
+#define MAX_DESC_TXBBS (MAX_DESC_SIZE / TXBB_SIZE)
+
+/*
+ * OS related constants and tunables
+ */
+
+#define MLX4_EN_WATCHDOG_TIMEOUT (15 * HZ)
+
+#define MLX4_EN_MAX_LRO_DESCRIPTORS 32
+#define MLX4_EN_NUM_IPFRAG_SESSIONS 16
+
+/* Receive fragment sizes; we use at most 3 fragments (for 9600 byte MTU
+ * and 4K allocations) */
+#if MJUMPAGESIZE == 4096
+enum {
+ FRAG_SZ0 = MCLBYTES,
+ FRAG_SZ1 = MJUMPAGESIZE,
+ FRAG_SZ2 = MJUMPAGESIZE,
+};
+#define MLX4_EN_MAX_RX_FRAGS 3
+#elif MJUMPAGESIZE == 8192
+enum {
+ FRAG_SZ0 = MCLBYTES,
+ FRAG_SZ1 = MJUMPAGESIZE,
+};
+#define MLX4_EN_MAX_RX_FRAGS 2
+#elif MJUMPAGESIZE == 8192
+#else
+#error "Unknown PAGE_SIZE"
+#endif
+
+/* Maximum ring sizes */
+#define MLX4_EN_MAX_TX_SIZE 8192
+#define MLX4_EN_MAX_RX_SIZE 8192
+
+#define MLX4_EN_MIN_RX_SIZE (128)
+#define MLX4_EN_MIN_TX_SIZE (4096 / TXBB_SIZE)
+
+#define MLX4_EN_SMALL_PKT_SIZE 64
+#define MLX4_EN_TX_HASH_SIZE 256
+#define MLX4_EN_TX_HASH_MASK (MLX4_EN_TX_HASH_SIZE - 1)
+#define MLX4_EN_NUM_HASH_RINGS 4
+#define MLX4_EN_NUM_PPP_RINGS 8
+#define MLX4_EN_DEF_TX_RING_SIZE 512
+#define MLX4_EN_DEF_TX_QUEUE_SIZE 4096
+#define MLX4_EN_DEF_RX_RING_SIZE 1024
+#define MLX4_EN_MAX_RX_POLL 16
+
+/* Target number of bytes to coalesce with interrupt moderation */
+#define MLX4_EN_RX_COAL_TARGET 0x20000
+#define MLX4_EN_RX_COAL_TIME 0x10
+
+#define MLX4_EN_TX_COAL_PKTS 5
+#define MLX4_EN_TX_COAL_TIME 0x80
+
+#define MLX4_EN_RX_RATE_LOW 400000
+#define MLX4_EN_RX_COAL_TIME_LOW 0
+#define MLX4_EN_RX_RATE_HIGH 450000
+#define MLX4_EN_RX_COAL_TIME_HIGH 128
+#define MLX4_EN_RX_SIZE_THRESH 1024
+#define MLX4_EN_RX_RATE_THRESH (1000000 / MLX4_EN_RX_COAL_TIME_HIGH)
+#define MLX4_EN_SAMPLE_INTERVAL 0
+#define MLX4_EN_AVG_PKT_SMALL 256
+
+#define MLX4_EN_AUTO_CONF 0xffff
+
+#define MLX4_EN_DEF_RX_PAUSE 1
+#define MLX4_EN_DEF_TX_PAUSE 1
+
+/* Interval between sucessive polls in the Tx routine when polling is used
+ instead of interrupts (in per-core Tx rings) - should be power of 2 */
+#define MLX4_EN_TX_POLL_MODER 16
+#define MLX4_EN_TX_POLL_TIMEOUT (HZ / 4)
+
+#define ETH_LLC_SNAP_SIZE 8
+
+#define SMALL_PACKET_SIZE (MHLEN)
+#define HEADER_COPY_SIZE (128)
+#define MLX4_LOOPBACK_TEST_PAYLOAD (HEADER_COPY_SIZE - ETHER_HDR_LEN)
+
+#define MLX4_EN_MIN_MTU 46
+#define ETH_BCAST 0xffffffffffffULL
+
+#define MLX4_EN_LOOPBACK_RETRIES 5
+#define MLX4_EN_LOOPBACK_TIMEOUT 100
+
+#ifdef MLX4_EN_PERF_STAT
+/* Number of samples to 'average' */
+#define AVG_SIZE 128
+#define AVG_FACTOR 1024
+#define NUM_PERF_STATS NUM_PERF_COUNTERS
+
+#define INC_PERF_COUNTER(cnt) (++(cnt))
+#define ADD_PERF_COUNTER(cnt, add) ((cnt) += (add))
+#define AVG_PERF_COUNTER(cnt, sample) \
+ ((cnt) = ((cnt) * (AVG_SIZE - 1) + (sample) * AVG_FACTOR) / AVG_SIZE)
+#define GET_PERF_COUNTER(cnt) (cnt)
+#define GET_AVG_PERF_COUNTER(cnt) ((cnt) / AVG_FACTOR)
+
+#else
+
+#define NUM_PERF_STATS 0
+#define INC_PERF_COUNTER(cnt) do {} while (0)
+#define ADD_PERF_COUNTER(cnt, add) do {} while (0)
+#define AVG_PERF_COUNTER(cnt, sample) do {} while (0)
+#define GET_PERF_COUNTER(cnt) (0)
+#define GET_AVG_PERF_COUNTER(cnt) (0)
+#endif /* MLX4_EN_PERF_STAT */
+
+/*
+ * Configurables
+ */
+
+enum cq_type {
+ RX = 0,
+ TX = 1,
+};
+
+
+/*
+ * Useful macros
+ */
+#define ROUNDUP_LOG2(x) ilog2(roundup_pow_of_two(x))
+#define XNOR(x, y) (!(x) == !(y))
+#define ILLEGAL_MAC(addr) (addr == 0xffffffffffffULL || addr == 0x0)
+
+
+struct mlx4_en_tx_info {
+ struct mbuf *mb;
+ u32 nr_txbb;
+ u8 nr_segs;
+ u8 data_offset;
+ u8 inl;
+};
+
+
+#define MLX4_EN_BIT_DESC_OWN 0x80000000
+#define CTRL_SIZE sizeof(struct mlx4_wqe_ctrl_seg)
+#define MLX4_EN_MEMTYPE_PAD 0x100
+#define DS_SIZE sizeof(struct mlx4_wqe_data_seg)
+
+
+struct mlx4_en_tx_desc {
+ struct mlx4_wqe_ctrl_seg ctrl;
+ union {
+ struct mlx4_wqe_data_seg data; /* at least one data segment */
+ struct mlx4_wqe_lso_seg lso;
+ struct mlx4_wqe_inline_seg inl;
+ };
+};
+
+#define MLX4_EN_USE_SRQ 0x01000000
+
+struct mlx4_en_tx_ring {
+ spinlock_t tx_lock;
+ struct mlx4_hwq_resources wqres;
+ u32 size ; /* number of TXBBs */
+ u32 size_mask;
+ u16 stride;
+ u16 cqn; /* index of port CQ associated with this ring */
+ u32 prod;
+ u32 cons;
+ u32 buf_size;
+ u32 doorbell_qpn;
+ void *buf;
+ u16 poll_cnt;
+ int blocked;
+ struct buf_ring *br;
+ struct mlx4_en_tx_info *tx_info;
+ u8 *bounce_buf;
+ u32 last_nr_txbb;
+ struct mlx4_qp qp;
+ struct mlx4_qp_context context;
+ int qpn;
+ enum mlx4_qp_state qp_state;
+ struct mlx4_srq dummy;
+ unsigned long bytes;
+ unsigned long packets;
+ unsigned long errors;
+ spinlock_t comp_lock;
+ struct mlx4_bf bf;
+ bool bf_enabled;
+ u64 watchdog_time;
+};
+
+struct mlx4_en_ipfrag {
+ struct mbuf *fragments;
+ struct mbuf *last;
+ __be32 saddr;
+ __be32 daddr;
+ __be16 id;
+ u8 protocol;
+ int total_len;
+ u16 offset;
+};
+
+struct mlx4_en_rx_desc {
+ /* actual number of entries depends on rx ring stride */
+ struct mlx4_wqe_data_seg data[0];
+};
+
+struct mlx4_en_rx_ring {
+ struct mlx4_hwq_resources wqres;
+ u32 size ; /* number of Rx descs*/
+ u32 actual_size;
+ u32 size_mask;
+ u16 stride;
+ u16 log_stride;
+ u16 cqn; /* index of port CQ associated with this ring */
+ u32 prod;
+ u32 cons;
+ u32 buf_size;
+ void *buf;
+ void *rx_info;
+ unsigned long bytes;
+ unsigned long packets;
+ unsigned long errors;
+ unsigned int use_frags;
+ struct lro_ctrl lro;
+ struct mlx4_en_ipfrag ipfrag[MLX4_EN_NUM_IPFRAG_SESSIONS];
+};
+
+
+static inline int mlx4_en_can_lro(__be16 status)
+{
+ return (status & cpu_to_be16(MLX4_CQE_STATUS_IPV4 |
+ MLX4_CQE_STATUS_IPV4F |
+ MLX4_CQE_STATUS_IPV6 |
+ MLX4_CQE_STATUS_IPV4OPT |
+ MLX4_CQE_STATUS_TCP |
+ MLX4_CQE_STATUS_UDP |
+ MLX4_CQE_STATUS_IPOK)) ==
+ cpu_to_be16(MLX4_CQE_STATUS_IPV4 |
+ MLX4_CQE_STATUS_IPOK |
+ MLX4_CQE_STATUS_TCP);
+}
+
+struct mlx4_en_cq {
+ struct mlx4_cq mcq;
+ struct mlx4_hwq_resources wqres;
+ int ring;
+ spinlock_t lock;
+ struct net_device *dev;
+ /* Per-core Tx cq processing support */
+ struct timer_list timer;
+ int size;
+ int buf_size;
+ unsigned vector;
+ enum cq_type is_tx;
+ u16 moder_time;
+ u16 moder_cnt;
+ struct mlx4_cqe *buf;
+ struct task cq_task;
+ struct taskqueue *tq;
+#define MLX4_EN_OPCODE_ERROR 0x1e
+ u32 tot_rx;
+};
+
+struct mlx4_en_port_profile {
+ u32 flags;
+ u32 tx_ring_num;
+ u32 rx_ring_num;
+ u32 tx_ring_size;
+ u32 rx_ring_size;
+ u8 rx_pause;
+ u8 rx_ppp;
+ u8 tx_pause;
+ u8 tx_ppp;
+};
+
+struct mlx4_en_profile {
+ int rss_xor;
+ int num_lro;
+ int ip_reasm;
+ int tcp_rss;
+ int udp_rss;
+ u8 rss_mask;
+ u32 active_ports;
+ u32 small_pkt_int;
+ u8 no_reset;
+ struct mlx4_en_port_profile prof[MLX4_MAX_PORTS + 1];
+};
+
+struct mlx4_en_dev {
+ struct mlx4_dev *dev;
+ struct pci_dev *pdev;
+ struct mutex state_lock;
+ struct net_device *pndev[MLX4_MAX_PORTS + 1];
+ u32 port_cnt;
+ bool device_up;
+ struct mlx4_en_profile profile;
+ u32 LSO_support;
+ struct workqueue_struct *workqueue;
+ struct device *dma_device;
+ void __iomem *uar_map;
+ struct mlx4_uar priv_uar;
+ struct mlx4_mr mr;
+ u32 priv_pdn;
+ spinlock_t uar_lock;
+ u8 mac_removed[MLX4_MAX_PORTS + 1];
+};
+
+
+struct mlx4_en_rss_map {
+ int base_qpn;
+ struct mlx4_qp qps[MAX_RX_RINGS];
+ enum mlx4_qp_state state[MAX_RX_RINGS];
+ struct mlx4_qp indir_qp;
+ enum mlx4_qp_state indir_state;
+};
+
+struct mlx4_en_rss_context {
+ __be32 base_qpn;
+ __be32 default_qpn;
+ u16 reserved;
+ u8 hash_fn;
+ u8 flags;
+ __be32 rss_key[10];
+ __be32 base_qpn_udp;
+};
+
+struct mlx4_en_port_state {
+ int link_state;
+ int link_speed;
+ int transciver;
+};
+
+struct mlx4_en_pkt_stats {
+ unsigned long broadcast;
+ unsigned long rx_prio[8];
+ unsigned long tx_prio[8];
+#define NUM_PKT_STATS 17
+};
+
+struct mlx4_en_port_stats {
+ unsigned long tso_packets;
+ unsigned long queue_stopped;
+ unsigned long wake_queue;
+ unsigned long tx_timeout;
+ unsigned long rx_alloc_failed;
+ unsigned long rx_chksum_good;
+ unsigned long rx_chksum_none;
+ unsigned long tx_chksum_offload;
+};
+
+struct mlx4_en_perf_stats {
+ u32 tx_poll;
+ u64 tx_pktsz_avg;
+ u32 inflight_avg;
+ u32 tx_coal_avg;
+ u32 rx_coal_avg;
+};
+
+struct mlx4_en_frag_info {
+ u16 frag_size;
+ u16 frag_prefix_size;
+};
+
+struct mlx4_en_tx_hash_entry {
+ u8 cnt;
+ unsigned int small_pkts;
+ unsigned int big_pkts;
+ unsigned int ring;
+};
+
+struct mlx4_en_priv {
+ struct mlx4_en_dev *mdev;
+ struct mlx4_en_port_profile *prof;
+ struct net_device *dev;
+ bool vlgrp_modified;
+ u32 vlan_register[VLAN_FLTR_SIZE];
+ u32 vlan_unregister[VLAN_FLTR_SIZE];
+ u32 vlans[VLAN_FLTR_SIZE];
+ spinlock_t vlan_lock;
+ struct mlx4_en_port_state port_state;
+ spinlock_t stats_lock;
+
+ unsigned long last_moder_packets;
+ unsigned long last_moder_tx_packets;
+ unsigned long last_moder_bytes;
+ unsigned long last_moder_jiffies;
+ int last_moder_time;
+ u16 rx_usecs;
+ u16 rx_frames;
+ u16 tx_usecs;
+ u16 tx_frames;
+ u32 pkt_rate_low;
+ u16 rx_usecs_low;
+ u32 pkt_rate_high;
+ u16 rx_usecs_high;
+ u16 sample_interval;
+ u16 adaptive_rx_coal;
+ u32 msg_enable;
+ u32 loopback_ok;
+ u32 validate_loopback;
+
+ struct mlx4_hwq_resources res;
+ int link_state;
+ int last_link_state;
+ bool port_up;
+ int port;
+ int registered;
+ int allocated;
+ int rx_csum;
+ u64 mac;
+ int mac_index;
+ unsigned max_mtu;
+ int base_qpn;
+
+ struct mlx4_en_rss_map rss_map;
+ u16 tx_prio_map[8];
+ u32 flags;
+#define MLX4_EN_FLAG_PROMISC 0x1
+ u32 tx_ring_num;
+ u32 rx_ring_num;
+ u32 udp_rings;
+ u32 rx_mb_size;
+ struct mlx4_en_frag_info frag_info[MLX4_EN_MAX_RX_FRAGS];
+ u16 num_frags;
+ u16 log_rx_info;
+
+ struct mlx4_en_tx_ring tx_ring[MAX_TX_RINGS];
+ struct mlx4_en_rx_ring rx_ring[MAX_RX_RINGS];
+ struct mlx4_en_cq tx_cq[MAX_TX_RINGS];
+ struct mlx4_en_cq rx_cq[MAX_RX_RINGS];
+ struct mlx4_en_tx_hash_entry tx_hash[MLX4_EN_TX_HASH_SIZE];
+ struct work_struct mcast_task;
+ struct work_struct watchdog_task;
+ struct work_struct linkstate_task;
+ struct delayed_work stats_task;
+ struct mlx4_en_perf_stats pstats;
+ struct mlx4_en_pkt_stats pkstats;
+ struct mlx4_en_port_stats port_stats;
+ struct mlx4_en_stat_out_mbox hw_stats;
+ struct ifmedia media;
+ eventhandler_tag vlan_attach;
+ eventhandler_tag vlan_detach;
+ struct callout watchdog_timer;
+ volatile int blocked;
+ struct sysctl_oid *sysctl;
+ struct sysctl_ctx_list conf_ctx;
+ struct sysctl_ctx_list stat_ctx;
+};
+
+
+int mlx4_en_transmit(struct net_device *dev, struct mbuf *mb);
+void mlx4_en_qflush(struct net_device *dev);
+
+int mlx4_en_rx_frags(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring,
+ struct mbuf *mb, struct mlx4_cqe *cqe);
+void mlx4_en_flush_frags(struct mlx4_en_priv *priv,
+ struct mlx4_en_rx_ring *ring);
+void mlx4_en_destroy_netdev(struct net_device *dev);
+int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
+ struct mlx4_en_port_profile *prof);
+
+int mlx4_en_start_port(struct net_device *dev);
+void mlx4_en_stop_port(struct net_device *dev);
+
+void mlx4_en_free_resources(struct mlx4_en_priv *priv);
+int mlx4_en_alloc_resources(struct mlx4_en_priv *priv);
+
+int mlx4_en_create_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq,
+ int entries, int ring, enum cq_type mode);
+void mlx4_en_destroy_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq);
+int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq);
+void mlx4_en_deactivate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq);
+int mlx4_en_set_cq_moder(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq);
+int mlx4_en_arm_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq);
+
+void mlx4_en_poll_tx_cq(unsigned long data);
+void mlx4_en_tx_irq(struct mlx4_cq *mcq);
+u16 mlx4_en_select_queue(struct net_device *dev, struct mbuf *mb);
+
+int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring *ring,
+ u32 size, u16 stride);
+void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring *ring);
+int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv,
+ struct mlx4_en_tx_ring *ring,
+ int cq);
+void mlx4_en_deactivate_tx_ring(struct mlx4_en_priv *priv,
+ struct mlx4_en_tx_ring *ring);
+
+int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
+ struct mlx4_en_rx_ring *ring, u32 size);
+void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv,
+ struct mlx4_en_rx_ring *ring);
+int mlx4_en_activate_rx_rings(struct mlx4_en_priv *priv);
+void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv,
+ struct mlx4_en_rx_ring *ring);
+int mlx4_en_process_rx_cq(struct net_device *dev,
+ struct mlx4_en_cq *cq,
+ int budget);
+int mlx4_en_process_rx_cq_mb(struct net_device *dev,
+ struct mlx4_en_cq *cq,
+ int budget);
+void mlx4_en_tx_que(void *context, int pending);
+void mlx4_en_rx_que(void *context, int pending);
+void mlx4_en_fill_qp_context(struct mlx4_en_priv *priv, int size, int stride,
+ int is_tx, int rss, int qpn, int cqn,
+ struct mlx4_qp_context *context);
+void mlx4_en_sqp_event(struct mlx4_qp *qp, enum mlx4_event event);
+int mlx4_en_map_buffer(struct mlx4_buf *buf);
+void mlx4_en_unmap_buffer(struct mlx4_buf *buf);
+
+void mlx4_en_calc_rx_buf(struct net_device *dev);
+void mlx4_en_set_prio_map(struct mlx4_en_priv *priv, u16 *prio_map, u32 ring_num);
+int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv);
+void mlx4_en_release_rss_steer(struct mlx4_en_priv *priv);
+int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring);
+void mlx4_en_rx_irq(struct mlx4_cq *mcq);
+
+int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port, u64 mac, u64 clear, u8 mode);
+int mlx4_SET_VLAN_FLTR(struct mlx4_dev *dev, u8 port, u32 *vlans);
+int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu,
+ u8 pptx, u8 pfctx, u8 pprx, u8 pfcrx);
+int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn,
+ u8 promisc);
+
+int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset);
+int mlx4_en_QUERY_PORT(struct mlx4_en_dev *mdev, u8 port);
+
+#define MLX4_EN_NUM_SELF_TEST 5
+void mlx4_en_ex_selftest(struct net_device *dev, u32 *flags, u64 *buf);
+u64 mlx4_en_mac_to_u64(u8 *addr);
+
+/*
+ * Globals
+ */
+extern const struct ethtool_ops mlx4_en_ethtool_ops;
+#endif
diff --git a/sys/ofed/drivers/net/mlx4/mr.c b/sys/ofed/drivers/net/mlx4/mr.c
new file mode 100644
index 0000000..9ed610a
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/mr.c
@@ -0,0 +1,773 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+#include <linux/errno.h>
+
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4.h"
+#include "icm.h"
+
+/*
+ * Must be packed because mtt_seg is 64 bits but only aligned to 32 bits.
+ */
+struct mlx4_mpt_entry {
+ __be32 flags;
+ __be32 qpn;
+ __be32 key;
+ __be32 pd_flags;
+ __be64 start;
+ __be64 length;
+ __be32 lkey;
+ __be32 win_cnt;
+ u8 reserved1;
+ u8 flags2;
+ u8 reserved2;
+ u8 mtt_rep;
+ __be64 mtt_seg;
+ __be32 mtt_sz;
+ __be32 entity_size;
+ __be32 first_byte_offset;
+} __attribute__((packed));
+
+#define MLX4_MPT_FLAG_SW_OWNS (0xfUL << 28)
+#define MLX4_MPT_FLAG_FREE (0x3UL << 28)
+#define MLX4_MPT_FLAG_MIO (1 << 17)
+#define MLX4_MPT_FLAG_BIND_ENABLE (1 << 15)
+#define MLX4_MPT_FLAG_PHYSICAL (1 << 9)
+#define MLX4_MPT_FLAG_REGION (1 << 8)
+
+#define MLX4_MPT_PD_FLAG_FAST_REG (1 << 27)
+#define MLX4_MPT_PD_FLAG_RAE (1 << 28)
+#define MLX4_MPT_PD_FLAG_EN_INV (3 << 24)
+
+#define MLX4_MPT_FLAG2_FBO_EN (1 << 7)
+
+#define MLX4_MPT_STATUS_SW 0xF0
+#define MLX4_MPT_STATUS_HW 0x00
+
+static u32 mlx4_buddy_alloc(struct mlx4_buddy *buddy, int order)
+{
+ int o;
+ int m;
+ u32 seg;
+
+ spin_lock(&buddy->lock);
+
+ for (o = order; o <= buddy->max_order; ++o)
+ if (buddy->num_free[o]) {
+ m = 1 << (buddy->max_order - o);
+ seg = find_first_bit(buddy->bits[o], m);
+ if (seg < m)
+ goto found;
+ }
+
+ spin_unlock(&buddy->lock);
+ return -1;
+
+ found:
+ clear_bit(seg, buddy->bits[o]);
+ --buddy->num_free[o];
+
+ while (o > order) {
+ --o;
+ seg <<= 1;
+ set_bit(seg ^ 1, buddy->bits[o]);
+ ++buddy->num_free[o];
+ }
+
+ spin_unlock(&buddy->lock);
+
+ seg <<= order;
+
+ return seg;
+}
+
+static void mlx4_buddy_free(struct mlx4_buddy *buddy, u32 seg, int order)
+{
+ seg >>= order;
+
+ spin_lock(&buddy->lock);
+
+ while (test_bit(seg ^ 1, buddy->bits[order])) {
+ clear_bit(seg ^ 1, buddy->bits[order]);
+ --buddy->num_free[order];
+ seg >>= 1;
+ ++order;
+ }
+
+ set_bit(seg, buddy->bits[order]);
+ ++buddy->num_free[order];
+
+ spin_unlock(&buddy->lock);
+}
+
+static int mlx4_buddy_init(struct mlx4_buddy *buddy, int max_order)
+{
+ int i, s;
+
+ buddy->max_order = max_order;
+ spin_lock_init(&buddy->lock);
+
+ buddy->bits = kzalloc((buddy->max_order + 1) * sizeof (long *),
+ GFP_KERNEL);
+ buddy->num_free = kzalloc((buddy->max_order + 1) * sizeof (int *),
+ GFP_KERNEL);
+ if (!buddy->bits || !buddy->num_free)
+ goto err_out;
+
+ for (i = 0; i <= buddy->max_order; ++i) {
+ s = BITS_TO_LONGS(1 << (buddy->max_order - i));
+ buddy->bits[i] = kmalloc(s * sizeof (long), GFP_KERNEL);
+ if (!buddy->bits[i])
+ goto err_out_free;
+ bitmap_zero(buddy->bits[i], 1 << (buddy->max_order - i));
+ }
+
+ set_bit(0, buddy->bits[buddy->max_order]);
+ buddy->num_free[buddy->max_order] = 1;
+
+ return 0;
+
+err_out_free:
+ for (i = 0; i <= buddy->max_order; ++i)
+ kfree(buddy->bits[i]);
+
+err_out:
+ kfree(buddy->bits);
+ kfree(buddy->num_free);
+
+ return -ENOMEM;
+}
+
+static void mlx4_buddy_cleanup(struct mlx4_buddy *buddy)
+{
+ int i;
+
+ for (i = 0; i <= buddy->max_order; ++i)
+ kfree(buddy->bits[i]);
+
+ kfree(buddy->bits);
+ kfree(buddy->num_free);
+}
+
+static u32 mlx4_alloc_mtt_range(struct mlx4_dev *dev, int order)
+{
+ struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table;
+ u32 seg;
+
+ seg = mlx4_buddy_alloc(&mr_table->mtt_buddy, order);
+ if (seg == -1)
+ return -1;
+
+ if (mlx4_table_get_range(dev, &mr_table->mtt_table, seg,
+ seg + (1 << order) - 1)) {
+ mlx4_buddy_free(&mr_table->mtt_buddy, seg, order);
+ return -1;
+ }
+
+ return seg;
+}
+
+int mlx4_mtt_init(struct mlx4_dev *dev, int npages, int page_shift,
+ struct mlx4_mtt *mtt)
+{
+ int i;
+
+ if (!npages) {
+ mtt->order = -1;
+ mtt->page_shift = MLX4_ICM_PAGE_SHIFT;
+ return 0;
+ } else
+ mtt->page_shift = page_shift;
+
+ for (mtt->order = 0, i = dev->caps.mtts_per_seg; i < npages; i <<= 1)
+ ++mtt->order;
+
+ mtt->first_seg = mlx4_alloc_mtt_range(dev, mtt->order);
+ if (mtt->first_seg == -1)
+ return -ENOMEM;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_mtt_init);
+
+void mlx4_mtt_cleanup(struct mlx4_dev *dev, struct mlx4_mtt *mtt)
+{
+ struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table;
+
+ if (mtt->order < 0)
+ return;
+
+ mlx4_buddy_free(&mr_table->mtt_buddy, mtt->first_seg, mtt->order);
+ mlx4_table_put_range(dev, &mr_table->mtt_table, mtt->first_seg,
+ mtt->first_seg + (1 << mtt->order) - 1);
+}
+EXPORT_SYMBOL_GPL(mlx4_mtt_cleanup);
+
+u64 mlx4_mtt_addr(struct mlx4_dev *dev, struct mlx4_mtt *mtt)
+{
+ return (u64) mtt->first_seg * dev->caps.mtt_entry_sz;
+}
+EXPORT_SYMBOL_GPL(mlx4_mtt_addr);
+
+static u32 hw_index_to_key(u32 ind)
+{
+ return (ind >> 24) | (ind << 8);
+}
+
+static u32 key_to_hw_index(u32 key)
+{
+ return (key << 24) | (key >> 8);
+}
+
+static int mlx4_SW2HW_MPT(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+ int mpt_index)
+{
+ return mlx4_cmd(dev, mailbox->dma, mpt_index, 0, MLX4_CMD_SW2HW_MPT,
+ MLX4_CMD_TIME_CLASS_B);
+}
+
+static int mlx4_HW2SW_MPT(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+ int mpt_index)
+{
+ return mlx4_cmd_box(dev, 0, mailbox ? mailbox->dma : 0, mpt_index,
+ !mailbox, MLX4_CMD_HW2SW_MPT, MLX4_CMD_TIME_CLASS_B);
+}
+
+int mlx4_mr_reserve_range(struct mlx4_dev *dev, int cnt, int align, u32 *base_mridx)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ u32 mridx;
+
+ mridx = mlx4_bitmap_alloc_range(&priv->mr_table.mpt_bitmap, cnt, align);
+ if (mridx == -1)
+ return -ENOMEM;
+
+ *base_mridx = mridx;
+ return 0;
+
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_reserve_range);
+
+void mlx4_mr_release_range(struct mlx4_dev *dev, u32 base_mridx, int cnt)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ mlx4_bitmap_free_range(&priv->mr_table.mpt_bitmap, base_mridx, cnt);
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_release_range);
+
+int mlx4_mr_alloc_reserved(struct mlx4_dev *dev, u32 mridx, u32 pd,
+ u64 iova, u64 size, u32 access, int npages,
+ int page_shift, struct mlx4_mr *mr)
+{
+ mr->iova = iova;
+ mr->size = size;
+ mr->pd = pd;
+ mr->access = access;
+ mr->enabled = 0;
+ mr->key = hw_index_to_key(mridx);
+
+ return mlx4_mtt_init(dev, npages, page_shift, &mr->mtt);
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_alloc_reserved);
+
+int mlx4_mr_alloc(struct mlx4_dev *dev, u32 pd, u64 iova, u64 size, u32 access,
+ int npages, int page_shift, struct mlx4_mr *mr)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ u32 index;
+ int err;
+
+ index = mlx4_bitmap_alloc(&priv->mr_table.mpt_bitmap);
+ if (index == -1)
+ return -ENOMEM;
+
+ err = mlx4_mr_alloc_reserved(dev, index, pd, iova, size,
+ access, npages, page_shift, mr);
+ if (err)
+ mlx4_bitmap_free(&priv->mr_table.mpt_bitmap, index);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_alloc);
+
+void mlx4_mr_free_reserved(struct mlx4_dev *dev, struct mlx4_mr *mr)
+{
+ int err;
+
+ if (mr->enabled) {
+ err = mlx4_HW2SW_MPT(dev, NULL,
+ key_to_hw_index(mr->key) &
+ (dev->caps.num_mpts - 1));
+ if (err)
+ mlx4_warn(dev, "HW2SW_MPT failed (%d)\n", err);
+ }
+
+ mlx4_mtt_cleanup(dev, &mr->mtt);
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_free_reserved);
+
+void mlx4_mr_free(struct mlx4_dev *dev, struct mlx4_mr *mr)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ mlx4_mr_free_reserved(dev, mr);
+ mlx4_bitmap_free(&priv->mr_table.mpt_bitmap, key_to_hw_index(mr->key));
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_free);
+
+int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr)
+{
+ struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table;
+ struct mlx4_cmd_mailbox *mailbox;
+ struct mlx4_mpt_entry *mpt_entry;
+ int err;
+
+ err = mlx4_table_get(dev, &mr_table->dmpt_table, key_to_hw_index(mr->key));
+ if (err)
+ return err;
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev);
+ if (IS_ERR(mailbox)) {
+ err = PTR_ERR(mailbox);
+ goto err_table;
+ }
+ mpt_entry = mailbox->buf;
+
+ memset(mpt_entry, 0, sizeof *mpt_entry);
+
+ mpt_entry->flags = cpu_to_be32(MLX4_MPT_FLAG_MIO |
+ MLX4_MPT_FLAG_REGION |
+ mr->access);
+
+ mpt_entry->key = cpu_to_be32(key_to_hw_index(mr->key));
+ mpt_entry->pd_flags = cpu_to_be32(mr->pd | MLX4_MPT_PD_FLAG_EN_INV);
+ mpt_entry->start = cpu_to_be64(mr->iova);
+ mpt_entry->length = cpu_to_be64(mr->size);
+ mpt_entry->entity_size = cpu_to_be32(mr->mtt.page_shift);
+
+ if (mr->mtt.order < 0) {
+ mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_PHYSICAL);
+ mpt_entry->mtt_seg = 0;
+ } else {
+ mpt_entry->mtt_seg = cpu_to_be64(mlx4_mtt_addr(dev, &mr->mtt));
+ }
+
+ if (mr->mtt.order >= 0 && mr->mtt.page_shift == 0) {
+ /* fast register MR in free state */
+ mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_FREE);
+ mpt_entry->pd_flags |= cpu_to_be32(MLX4_MPT_PD_FLAG_FAST_REG |
+ MLX4_MPT_PD_FLAG_RAE);
+ mpt_entry->mtt_sz = cpu_to_be32((1 << mr->mtt.order) *
+ dev->caps.mtts_per_seg);
+ } else {
+ mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_SW_OWNS);
+ }
+
+ err = mlx4_SW2HW_MPT(dev, mailbox,
+ key_to_hw_index(mr->key) & (dev->caps.num_mpts - 1));
+ if (err) {
+ mlx4_warn(dev, "SW2HW_MPT failed (%d)\n", err);
+ goto err_cmd;
+ }
+
+ mr->enabled = 1;
+
+ mlx4_free_cmd_mailbox(dev, mailbox);
+
+ return 0;
+
+err_cmd:
+ mlx4_free_cmd_mailbox(dev, mailbox);
+
+err_table:
+ mlx4_table_put(dev, &mr_table->dmpt_table, key_to_hw_index(mr->key));
+ return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_enable);
+
+static int mlx4_write_mtt_chunk(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+ int start_index, int npages, u64 *page_list)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ __be64 *mtts;
+ dma_addr_t dma_handle;
+ int i;
+ int s = start_index * sizeof (u64);
+
+ /* All MTTs must fit in the same page */
+ if (start_index / (PAGE_SIZE / sizeof (u64)) !=
+ (start_index + npages - 1) / (PAGE_SIZE / sizeof (u64)))
+ return -EINVAL;
+
+ if (start_index & (dev->caps.mtts_per_seg - 1))
+ return -EINVAL;
+
+ mtts = mlx4_table_find(&priv->mr_table.mtt_table, mtt->first_seg +
+ s / dev->caps.mtt_entry_sz, &dma_handle);
+ if (!mtts)
+ return -ENOMEM;
+
+ for (i = 0; i < npages; ++i)
+ mtts[i] = cpu_to_be64(page_list[i] | MLX4_MTT_FLAG_PRESENT);
+
+ dma_sync_single(&dev->pdev->dev, dma_handle, npages * sizeof (u64), DMA_TO_DEVICE);
+
+ return 0;
+}
+
+int mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+ int start_index, int npages, u64 *page_list)
+{
+ int chunk;
+ int err;
+
+ if (mtt->order < 0)
+ return -EINVAL;
+
+ while (npages > 0) {
+ chunk = min_t(int, PAGE_SIZE / sizeof(u64), npages);
+ err = mlx4_write_mtt_chunk(dev, mtt, start_index, chunk, page_list);
+ if (err)
+ return err;
+
+ npages -= chunk;
+ start_index += chunk;
+ page_list += chunk;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_write_mtt);
+
+int mlx4_buf_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+ struct mlx4_buf *buf)
+{
+ u64 *page_list;
+ int err;
+ int i;
+
+ page_list = kmalloc(buf->npages * sizeof *page_list, GFP_KERNEL);
+ if (!page_list)
+ return -ENOMEM;
+
+ for (i = 0; i < buf->npages; ++i)
+ if (buf->direct.map)
+ page_list[i] = buf->direct.map + (i << buf->page_shift);
+ else
+ page_list[i] = buf->page_list[i].map;
+
+ err = mlx4_write_mtt(dev, mtt, 0, buf->npages, page_list);
+
+ kfree(page_list);
+ return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_buf_write_mtt);
+
+int mlx4_init_mr_table(struct mlx4_dev *dev)
+{
+ struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table;
+ int err;
+
+ if (!is_power_of_2(dev->caps.num_mpts))
+ return -EINVAL;
+
+ err = mlx4_bitmap_init(&mr_table->mpt_bitmap, dev->caps.num_mpts,
+ ~0, dev->caps.reserved_mrws, 0);
+ if (err)
+ return err;
+
+ err = mlx4_buddy_init(&mr_table->mtt_buddy,
+ ilog2(dev->caps.num_mtt_segs));
+ if (err)
+ goto err_buddy;
+
+ if (dev->caps.reserved_mtts) {
+ if (mlx4_alloc_mtt_range(dev, fls(dev->caps.reserved_mtts - 1)) == -1) {
+ mlx4_warn(dev, "MTT table of order %d is too small.\n",
+ mr_table->mtt_buddy.max_order);
+ err = -ENOMEM;
+ goto err_reserve_mtts;
+ }
+ }
+
+ return 0;
+
+err_reserve_mtts:
+ mlx4_buddy_cleanup(&mr_table->mtt_buddy);
+
+err_buddy:
+ mlx4_bitmap_cleanup(&mr_table->mpt_bitmap);
+
+ return err;
+}
+
+void mlx4_cleanup_mr_table(struct mlx4_dev *dev)
+{
+ struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table;
+
+ mlx4_buddy_cleanup(&mr_table->mtt_buddy);
+ mlx4_bitmap_cleanup(&mr_table->mpt_bitmap);
+}
+
+static inline int mlx4_check_fmr(struct mlx4_fmr *fmr, u64 *page_list,
+ int npages, u64 iova)
+{
+ int i, page_mask;
+
+ if (npages > fmr->max_pages)
+ return -EINVAL;
+
+ page_mask = (1 << fmr->page_shift) - 1;
+
+ /* We are getting page lists, so va must be page aligned. */
+ if (iova & page_mask)
+ return -EINVAL;
+
+ /* Trust the user not to pass misaligned data in page_list */
+ if (0)
+ for (i = 0; i < npages; ++i) {
+ if (page_list[i] & ~page_mask)
+ return -EINVAL;
+ }
+
+ if (fmr->maps >= fmr->max_maps)
+ return -EINVAL;
+
+ return 0;
+}
+
+int mlx4_map_phys_fmr_fbo(struct mlx4_dev *dev, struct mlx4_fmr *fmr,
+ u64 *page_list, int npages, u64 iova, u32 fbo,
+ u32 len, u32 *lkey, u32 *rkey, int same_key)
+{
+ u32 key;
+ int i, err;
+
+ err = mlx4_check_fmr(fmr, page_list, npages, iova);
+ if (err)
+ return err;
+
+ ++fmr->maps;
+
+ key = key_to_hw_index(fmr->mr.key);
+ if (!same_key)
+ key += dev->caps.num_mpts;
+ *lkey = *rkey = fmr->mr.key = hw_index_to_key(key);
+
+ *(u8 *) fmr->mpt = MLX4_MPT_STATUS_SW;
+
+ /* Make sure MPT status is visible before writing MTT entries */
+ wmb();
+
+ for (i = 0; i < npages; ++i)
+ fmr->mtts[i] = cpu_to_be64(page_list[i] | MLX4_MTT_FLAG_PRESENT);
+
+ dma_sync_single(&dev->pdev->dev, fmr->dma_handle,
+ npages * sizeof(u64), DMA_TO_DEVICE);
+
+ fmr->mpt->key = cpu_to_be32(key);
+ fmr->mpt->lkey = cpu_to_be32(key);
+ fmr->mpt->length = cpu_to_be64(len);
+ fmr->mpt->start = cpu_to_be64(iova);
+ fmr->mpt->first_byte_offset = cpu_to_be32(fbo & 0x001fffff);
+ fmr->mpt->flags2 = (fbo ? MLX4_MPT_FLAG2_FBO_EN : 0);
+
+ /* Make MTT entries are visible before setting MPT status */
+ wmb();
+
+ *(u8 *) fmr->mpt = MLX4_MPT_STATUS_HW;
+
+ /* Make sure MPT status is visible before consumer can use FMR */
+ wmb();
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_map_phys_fmr_fbo);
+
+int mlx4_map_phys_fmr(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u64 *page_list,
+ int npages, u64 iova, u32 *lkey, u32 *rkey)
+{
+ u32 len = npages * (1ull << fmr->page_shift);
+
+ return mlx4_map_phys_fmr_fbo(dev, fmr, page_list, npages, iova, 0,
+ len, lkey, rkey, 0);
+}
+EXPORT_SYMBOL_GPL(mlx4_map_phys_fmr);
+
+int mlx4_fmr_alloc(struct mlx4_dev *dev, u32 pd, u32 access, int max_pages,
+ int max_maps, u8 page_shift, struct mlx4_fmr *fmr)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ u64 mtt_seg;
+ int err = -ENOMEM;
+
+ if (page_shift < (ffs(dev->caps.page_size_cap) - 1) || page_shift >= 32)
+ return -EINVAL;
+
+ /* All MTTs must fit in the same page */
+ if (max_pages * sizeof *fmr->mtts > PAGE_SIZE)
+ return -EINVAL;
+
+ fmr->page_shift = page_shift;
+ fmr->max_pages = max_pages;
+ fmr->max_maps = max_maps;
+ fmr->maps = 0;
+
+ err = mlx4_mr_alloc(dev, pd, 0, 0, access, max_pages,
+ page_shift, &fmr->mr);
+ if (err)
+ return err;
+
+ mtt_seg = fmr->mr.mtt.first_seg * dev->caps.mtt_entry_sz;
+
+ fmr->mtts = mlx4_table_find(&priv->mr_table.mtt_table,
+ fmr->mr.mtt.first_seg,
+ &fmr->dma_handle);
+ if (!fmr->mtts) {
+ err = -ENOMEM;
+ goto err_free;
+ }
+
+ return 0;
+
+err_free:
+ mlx4_mr_free(dev, &fmr->mr);
+ return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_fmr_alloc);
+
+int mlx4_fmr_alloc_reserved(struct mlx4_dev *dev, u32 mridx,
+ u32 pd, u32 access, int max_pages,
+ int max_maps, u8 page_shift, struct mlx4_fmr *fmr)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ u64 mtt_seg;
+ int err = -ENOMEM;
+
+ if (page_shift < (ffs(dev->caps.page_size_cap) - 1) || page_shift >= 32)
+ return -EINVAL;
+
+ /* All MTTs must fit in the same page */
+ if (max_pages * sizeof *fmr->mtts > PAGE_SIZE)
+ return -EINVAL;
+
+ fmr->page_shift = page_shift;
+ fmr->max_pages = max_pages;
+ fmr->max_maps = max_maps;
+ fmr->maps = 0;
+
+ err = mlx4_mr_alloc_reserved(dev, mridx, pd, 0, 0, access, max_pages,
+ page_shift, &fmr->mr);
+ if (err)
+ return err;
+
+ mtt_seg = fmr->mr.mtt.first_seg * dev->caps.mtt_entry_sz;
+
+ fmr->mtts = mlx4_table_find(&priv->mr_table.mtt_table,
+ fmr->mr.mtt.first_seg,
+ &fmr->dma_handle);
+ if (!fmr->mtts) {
+ err = -ENOMEM;
+ goto err_free;
+ }
+
+ return 0;
+
+err_free:
+ mlx4_mr_free_reserved(dev, &fmr->mr);
+ return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_fmr_alloc_reserved);
+
+int mlx4_fmr_enable(struct mlx4_dev *dev, struct mlx4_fmr *fmr)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ int err;
+
+ err = mlx4_mr_enable(dev, &fmr->mr);
+ if (err)
+ return err;
+
+ fmr->mpt = mlx4_table_find(&priv->mr_table.dmpt_table,
+ key_to_hw_index(fmr->mr.key), NULL);
+ if (!fmr->mpt)
+ return -ENOMEM;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_fmr_enable);
+
+void mlx4_fmr_unmap(struct mlx4_dev *dev, struct mlx4_fmr *fmr,
+ u32 *lkey, u32 *rkey)
+{
+ if (!fmr->maps)
+ return;
+
+ fmr->maps = 0;
+
+ *(u8 *) fmr->mpt = MLX4_MPT_STATUS_SW;
+}
+EXPORT_SYMBOL_GPL(mlx4_fmr_unmap);
+
+int mlx4_fmr_free(struct mlx4_dev *dev, struct mlx4_fmr *fmr)
+{
+ if (fmr->maps)
+ return -EBUSY;
+
+ fmr->mr.enabled = 0;
+ mlx4_mr_free(dev, &fmr->mr);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_fmr_free);
+
+int mlx4_fmr_free_reserved(struct mlx4_dev *dev, struct mlx4_fmr *fmr)
+{
+ if (fmr->maps)
+ return -EBUSY;
+
+ fmr->mr.enabled = 0;
+ mlx4_mr_free_reserved(dev, &fmr->mr);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_fmr_free_reserved);
+
+int mlx4_SYNC_TPT(struct mlx4_dev *dev)
+{
+ return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_SYNC_TPT, 1000);
+}
+EXPORT_SYMBOL_GPL(mlx4_SYNC_TPT);
diff --git a/sys/ofed/drivers/net/mlx4/pd.c b/sys/ofed/drivers/net/mlx4/pd.c
new file mode 100644
index 0000000..cce9226
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/pd.c
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/io-mapping.h>
+
+#include <asm/page.h>
+
+#include "mlx4.h"
+#include "icm.h"
+
+enum {
+ MLX4_NUM_RESERVED_UARS = 8
+};
+
+int mlx4_pd_alloc(struct mlx4_dev *dev, u32 *pdn)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+
+ *pdn = mlx4_bitmap_alloc(&priv->pd_bitmap);
+ if (*pdn == -1)
+ return -ENOMEM;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_pd_alloc);
+
+void mlx4_pd_free(struct mlx4_dev *dev, u32 pdn)
+{
+ mlx4_bitmap_free(&mlx4_priv(dev)->pd_bitmap, pdn);
+}
+EXPORT_SYMBOL_GPL(mlx4_pd_free);
+
+int mlx4_init_pd_table(struct mlx4_dev *dev)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+
+ return mlx4_bitmap_init(&priv->pd_bitmap, dev->caps.num_pds,
+ (1 << 24) - 1, dev->caps.reserved_pds, 0);
+}
+
+void mlx4_cleanup_pd_table(struct mlx4_dev *dev)
+{
+ mlx4_bitmap_cleanup(&mlx4_priv(dev)->pd_bitmap);
+}
+
+
+int mlx4_uar_alloc(struct mlx4_dev *dev, struct mlx4_uar *uar)
+{
+ uar->index = mlx4_bitmap_alloc(&mlx4_priv(dev)->uar_table.bitmap);
+ if (uar->index == -1)
+ return -ENOMEM;
+
+ uar->pfn = (pci_resource_start(dev->pdev, 2) >> PAGE_SHIFT) + uar->index;
+ uar->map = NULL;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_uar_alloc);
+
+void mlx4_uar_free(struct mlx4_dev *dev, struct mlx4_uar *uar)
+{
+ mlx4_bitmap_free(&mlx4_priv(dev)->uar_table.bitmap, uar->index);
+}
+EXPORT_SYMBOL_GPL(mlx4_uar_free);
+
+int mlx4_bf_alloc(struct mlx4_dev *dev, struct mlx4_bf *bf)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ struct mlx4_uar *uar;
+ int err = 0;
+ int idx;
+
+ if (!priv->bf_mapping)
+ return -ENOMEM;
+
+ mutex_lock(&priv->bf_mutex);
+ if (!list_empty(&priv->bf_list))
+ uar = list_entry(priv->bf_list.next, struct mlx4_uar, bf_list);
+ else {
+ if (mlx4_bitmap_avail(&priv->uar_table.bitmap) < MLX4_NUM_RESERVED_UARS) {
+ err = -ENOMEM;
+ goto out;
+ }
+ uar = kmalloc(sizeof *uar, GFP_KERNEL);
+ if (!uar) {
+ err = -ENOMEM;
+ goto out;
+ }
+ err = mlx4_uar_alloc(dev, uar);
+ if (err)
+ goto free_kmalloc;
+
+ uar->map = ioremap(uar->pfn << PAGE_SHIFT, PAGE_SIZE);
+ if (!uar->map) {
+ err = -ENOMEM;
+ goto free_uar;
+ }
+
+ uar->bf_map = io_mapping_map_wc(priv->bf_mapping, uar->index << PAGE_SHIFT);
+ if (!uar->bf_map) {
+ err = -ENOMEM;
+ goto unamp_uar;
+ }
+ uar->free_bf_bmap = 0;
+ list_add(&uar->bf_list, &priv->bf_list);
+ }
+
+ bf->uar = uar;
+ idx = ffz(uar->free_bf_bmap);
+ uar->free_bf_bmap |= 1 << idx;
+ bf->uar = uar;
+ bf->offset = 0;
+ bf->buf_size = dev->caps.bf_reg_size / 2;
+ bf->reg = uar->bf_map + idx * dev->caps.bf_reg_size;
+ if (uar->free_bf_bmap == (1 << dev->caps.bf_regs_per_page) - 1)
+ list_del_init(&uar->bf_list);
+
+ goto out;
+
+unamp_uar:
+ bf->uar = NULL;
+ iounmap(uar->map);
+
+free_uar:
+ mlx4_uar_free(dev, uar);
+
+free_kmalloc:
+ kfree(uar);
+
+out:
+ mutex_unlock(&priv->bf_mutex);
+ return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_bf_alloc);
+
+void mlx4_bf_free(struct mlx4_dev *dev, struct mlx4_bf *bf)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ int idx;
+
+ if (!bf->uar || !bf->uar->bf_map)
+ return;
+
+ mutex_lock(&priv->bf_mutex);
+ idx = (bf->reg - bf->uar->bf_map) / dev->caps.bf_reg_size;
+ bf->uar->free_bf_bmap &= ~(1 << idx);
+ if (!bf->uar->free_bf_bmap) {
+ if (!list_empty(&bf->uar->bf_list))
+ list_del(&bf->uar->bf_list);
+
+ io_mapping_unmap(bf->uar->bf_map);
+ iounmap(bf->uar->map);
+ mlx4_uar_free(dev, bf->uar);
+ kfree(bf->uar);
+ } else if (list_empty(&bf->uar->bf_list))
+ list_add(&bf->uar->bf_list, &priv->bf_list);
+
+ mutex_unlock(&priv->bf_mutex);
+}
+EXPORT_SYMBOL_GPL(mlx4_bf_free);
+
+int mlx4_init_uar_table(struct mlx4_dev *dev)
+{
+ if (dev->caps.num_uars <= 128) {
+ mlx4_err(dev, "Only %d UAR pages (need more than 128)\n",
+ dev->caps.num_uars);
+ mlx4_err(dev, "Increase firmware log2_uar_bar_megabytes?\n");
+ return -ENODEV;
+ }
+
+ return mlx4_bitmap_init(&mlx4_priv(dev)->uar_table.bitmap,
+ dev->caps.num_uars, dev->caps.num_uars - 1,
+ max(128, dev->caps.reserved_uars), 0);
+}
+
+void mlx4_cleanup_uar_table(struct mlx4_dev *dev)
+{
+ mlx4_bitmap_cleanup(&mlx4_priv(dev)->uar_table.bitmap);
+}
diff --git a/sys/ofed/drivers/net/mlx4/port.c b/sys/ofed/drivers/net/mlx4/port.c
new file mode 100644
index 0000000..c8df375
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/port.c
@@ -0,0 +1,354 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/if_ether.h>
+
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4.h"
+
+int mlx4_ib_set_4k_mtu = 0;
+module_param_named(set_4k_mtu, mlx4_ib_set_4k_mtu, int, 0444);
+MODULE_PARM_DESC(set_4k_mtu, "attempt to set 4K MTU to all ConnectX ports");
+
+#define MLX4_MAC_VALID (1ull << 63)
+#define MLX4_MAC_MASK 0xffffffffffffULL
+
+#define MLX4_VLAN_VALID (1u << 31)
+#define MLX4_VLAN_MASK 0xfff
+
+void mlx4_init_mac_table(struct mlx4_dev *dev, struct mlx4_mac_table *table)
+{
+ int i;
+
+ mutex_init(&table->mutex);
+ for (i = 0; i < MLX4_MAX_MAC_NUM; i++) {
+ table->entries[i] = 0;
+ table->refs[i] = 0;
+ }
+ table->max = 1 << dev->caps.log_num_macs;
+ table->total = 0;
+}
+
+void mlx4_init_vlan_table(struct mlx4_dev *dev, struct mlx4_vlan_table *table)
+{
+ int i;
+
+ mutex_init(&table->mutex);
+ for (i = 0; i < MLX4_MAX_VLAN_NUM; i++) {
+ table->entries[i] = 0;
+ table->refs[i] = 0;
+ }
+ table->max = 1 << dev->caps.log_num_vlans;
+ table->total = 0;
+}
+
+static int mlx4_set_port_mac_table(struct mlx4_dev *dev, u8 port,
+ __be64 *entries)
+{
+ struct mlx4_cmd_mailbox *mailbox;
+ u32 in_mod;
+ int err;
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+
+ memcpy(mailbox->buf, entries, MLX4_MAC_TABLE_SIZE);
+
+ in_mod = MLX4_SET_PORT_MAC_TABLE << 8 | port;
+ err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT,
+ MLX4_CMD_TIME_CLASS_B);
+
+ mlx4_free_cmd_mailbox(dev, mailbox);
+ return err;
+}
+
+int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *index)
+{
+ struct mlx4_mac_table *table = &mlx4_priv(dev)->port[port].mac_table;
+ int i, err = 0;
+ int free = -1;
+
+ mlx4_dbg(dev, "Registering MAC: 0x%llx\n", (unsigned long long) mac);
+ mutex_lock(&table->mutex);
+ for (i = 0; i < MLX4_MAX_MAC_NUM - 1; i++) {
+ if (free < 0 && !table->refs[i]) {
+ free = i;
+ continue;
+ }
+
+ if (mac == (MLX4_MAC_MASK & be64_to_cpu(table->entries[i]))) {
+ /* MAC already registered, increase refernce count */
+ *index = i;
+ ++table->refs[i];
+ goto out;
+ }
+ }
+
+ if (free < 0) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ mlx4_dbg(dev, "Free MAC index is %d\n", free);
+
+ if (table->total == table->max) {
+ /* No free mac entries */
+ err = -ENOSPC;
+ goto out;
+ }
+
+ /* Register new MAC */
+ table->refs[free] = 1;
+ table->entries[free] = cpu_to_be64(mac | MLX4_MAC_VALID);
+
+ err = mlx4_set_port_mac_table(dev, port, table->entries);
+ if (unlikely(err)) {
+ mlx4_err(dev, "Failed adding MAC: 0x%llx\n", (unsigned long long) mac);
+ table->refs[free] = 0;
+ table->entries[free] = 0;
+ goto out;
+ }
+
+ *index = free;
+ ++table->total;
+out:
+ mutex_unlock(&table->mutex);
+ return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_register_mac);
+
+void mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, int index)
+{
+ struct mlx4_mac_table *table = &mlx4_priv(dev)->port[port].mac_table;
+
+ mutex_lock(&table->mutex);
+ if (!table->refs[index]) {
+ mlx4_warn(dev, "No MAC entry for index %d\n", index);
+ goto out;
+ }
+ if (--table->refs[index]) {
+ mlx4_warn(dev, "Have more references for index %d,"
+ "no need to modify MAC table\n", index);
+ goto out;
+ }
+ table->entries[index] = 0;
+ mlx4_set_port_mac_table(dev, port, table->entries);
+ --table->total;
+out:
+ mutex_unlock(&table->mutex);
+}
+EXPORT_SYMBOL_GPL(mlx4_unregister_mac);
+
+static int mlx4_set_port_vlan_table(struct mlx4_dev *dev, u8 port,
+ __be32 *entries)
+{
+ struct mlx4_cmd_mailbox *mailbox;
+ u32 in_mod;
+ int err;
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+
+ memcpy(mailbox->buf, entries, MLX4_VLAN_TABLE_SIZE);
+ in_mod = MLX4_SET_PORT_VLAN_TABLE << 8 | port;
+ err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT,
+ MLX4_CMD_TIME_CLASS_B);
+
+ mlx4_free_cmd_mailbox(dev, mailbox);
+
+ return err;
+}
+
+int mlx4_find_cached_vlan(struct mlx4_dev *dev, u8 port, u16 vid, int *idx)
+{
+ struct mlx4_vlan_table *table = &mlx4_priv(dev)->port[port].vlan_table;
+ int i;
+
+ for (i = 0; i < MLX4_MAX_VLAN_NUM; ++i) {
+ if (table->refs[i] &&
+ (vid == (MLX4_VLAN_MASK &
+ be32_to_cpu(table->entries[i])))) {
+ /* Vlan already registered, increase refernce count */
+ *idx = i;
+ return 0;
+ }
+ }
+
+ return -ENOENT;
+}
+EXPORT_SYMBOL_GPL(mlx4_find_cached_vlan);
+
+int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index)
+{
+ struct mlx4_vlan_table *table = &mlx4_priv(dev)->port[port].vlan_table;
+ int i, err = 0;
+ int free = -1;
+
+ mutex_lock(&table->mutex);
+ for (i = MLX4_VLAN_REGULAR; i < MLX4_MAX_VLAN_NUM; i++) {
+ if (free < 0 && (table->refs[i] == 0)) {
+ free = i;
+ continue;
+ }
+
+ if (table->refs[i] &&
+ (vlan == (MLX4_VLAN_MASK &
+ be32_to_cpu(table->entries[i])))) {
+ /* Vlan already registered, increase refernce count */
+ *index = i;
+ ++table->refs[i];
+ goto out;
+ }
+ }
+
+ if (free < 0) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ if (table->total == table->max) {
+ /* No free vlan entries */
+ err = -ENOSPC;
+ goto out;
+ }
+
+ /* Register new MAC */
+ table->refs[free] = 1;
+ table->entries[free] = cpu_to_be32(vlan | MLX4_VLAN_VALID);
+
+ err = mlx4_set_port_vlan_table(dev, port, table->entries);
+ if (unlikely(err)) {
+ mlx4_warn(dev, "Failed adding vlan: %u\n", vlan);
+ table->refs[free] = 0;
+ table->entries[free] = 0;
+ goto out;
+ }
+
+ *index = free;
+ ++table->total;
+out:
+ mutex_unlock(&table->mutex);
+ return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_register_vlan);
+
+void mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, int index)
+{
+ struct mlx4_vlan_table *table = &mlx4_priv(dev)->port[port].vlan_table;
+
+ if (index < MLX4_VLAN_REGULAR) {
+ mlx4_warn(dev, "Trying to free special vlan index %d\n", index);
+ return;
+ }
+
+ mutex_lock(&table->mutex);
+ if (!table->refs[index]) {
+ mlx4_warn(dev, "No vlan entry for index %d\n", index);
+ goto out;
+ }
+ if (--table->refs[index]) {
+ mlx4_dbg(dev, "Have more references for index %d,"
+ "no need to modify vlan table\n", index);
+ goto out;
+ }
+ table->entries[index] = 0;
+ mlx4_set_port_vlan_table(dev, port, table->entries);
+ --table->total;
+out:
+ mutex_unlock(&table->mutex);
+}
+EXPORT_SYMBOL_GPL(mlx4_unregister_vlan);
+
+int mlx4_get_port_ib_caps(struct mlx4_dev *dev, u8 port, __be32 *caps)
+{
+ struct mlx4_cmd_mailbox *inmailbox, *outmailbox;
+ u8 *inbuf, *outbuf;
+ int err;
+
+ inmailbox = mlx4_alloc_cmd_mailbox(dev);
+ if (IS_ERR(inmailbox))
+ return PTR_ERR(inmailbox);
+
+ outmailbox = mlx4_alloc_cmd_mailbox(dev);
+ if (IS_ERR(outmailbox)) {
+ mlx4_free_cmd_mailbox(dev, inmailbox);
+ return PTR_ERR(outmailbox);
+ }
+
+ inbuf = inmailbox->buf;
+ outbuf = outmailbox->buf;
+ memset(inbuf, 0, 256);
+ memset(outbuf, 0, 256);
+ inbuf[0] = 1;
+ inbuf[1] = 1;
+ inbuf[2] = 1;
+ inbuf[3] = 1;
+ *(__be16 *) (&inbuf[16]) = cpu_to_be16(0x0015);
+ *(__be32 *) (&inbuf[20]) = cpu_to_be32(port);
+
+ err = mlx4_cmd_box(dev, inmailbox->dma, outmailbox->dma, port, 3,
+ MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C);
+ if (!err)
+ *caps = *(__be32 *) (outbuf + 84);
+ mlx4_free_cmd_mailbox(dev, inmailbox);
+ mlx4_free_cmd_mailbox(dev, outmailbox);
+ return err;
+}
+
+int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port)
+{
+ struct mlx4_cmd_mailbox *mailbox;
+ int err;
+
+ if (dev->caps.port_type[port] != MLX4_PORT_TYPE_IB)
+ return 0;
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+
+ memset(mailbox->buf, 0, 256);
+
+ if (mlx4_ib_set_4k_mtu)
+ ((__be32 *) mailbox->buf)[0] |= cpu_to_be32((1 << 22) | (1 << 21) | (5 << 12) | (2 << 4));
+
+ ((__be32 *) mailbox->buf)[1] = dev->caps.ib_port_def_cap[port];
+ err = mlx4_cmd(dev, mailbox->dma, port, 0, MLX4_CMD_SET_PORT,
+ MLX4_CMD_TIME_CLASS_B);
+
+ mlx4_free_cmd_mailbox(dev, mailbox);
+ return err;
+}
diff --git a/sys/ofed/drivers/net/mlx4/profile.c b/sys/ofed/drivers/net/mlx4/profile.c
new file mode 100644
index 0000000..bd22df9
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/profile.c
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+
+#include "mlx4.h"
+#include "fw.h"
+
+enum {
+ MLX4_RES_QP,
+ MLX4_RES_RDMARC,
+ MLX4_RES_ALTC,
+ MLX4_RES_AUXC,
+ MLX4_RES_SRQ,
+ MLX4_RES_CQ,
+ MLX4_RES_EQ,
+ MLX4_RES_DMPT,
+ MLX4_RES_CMPT,
+ MLX4_RES_MTT,
+ MLX4_RES_MCG,
+ MLX4_RES_NUM
+};
+
+static const char *res_name[] = {
+ [MLX4_RES_QP] = "QP",
+ [MLX4_RES_RDMARC] = "RDMARC",
+ [MLX4_RES_ALTC] = "ALTC",
+ [MLX4_RES_AUXC] = "AUXC",
+ [MLX4_RES_SRQ] = "SRQ",
+ [MLX4_RES_CQ] = "CQ",
+ [MLX4_RES_EQ] = "EQ",
+ [MLX4_RES_DMPT] = "DMPT",
+ [MLX4_RES_CMPT] = "CMPT",
+ [MLX4_RES_MTT] = "MTT",
+ [MLX4_RES_MCG] = "MCG",
+};
+
+u64 mlx4_make_profile(struct mlx4_dev *dev,
+ struct mlx4_profile *request,
+ struct mlx4_dev_cap *dev_cap,
+ struct mlx4_init_hca_param *init_hca)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ struct mlx4_resource {
+ u64 size;
+ u64 start;
+ int type;
+ int num;
+ int log_num;
+ };
+
+ u64 total_size = 0;
+ struct mlx4_resource *profile;
+ struct mlx4_resource tmp;
+ int i, j;
+
+ profile = kzalloc(MLX4_RES_NUM * sizeof *profile, GFP_KERNEL);
+ if (!profile)
+ return -ENOMEM;
+
+ profile[MLX4_RES_QP].size = dev_cap->qpc_entry_sz;
+ profile[MLX4_RES_RDMARC].size = dev_cap->rdmarc_entry_sz;
+ profile[MLX4_RES_ALTC].size = dev_cap->altc_entry_sz;
+ profile[MLX4_RES_AUXC].size = dev_cap->aux_entry_sz;
+ profile[MLX4_RES_SRQ].size = dev_cap->srq_entry_sz;
+ profile[MLX4_RES_CQ].size = dev_cap->cqc_entry_sz;
+ profile[MLX4_RES_EQ].size = dev_cap->eqc_entry_sz;
+ profile[MLX4_RES_DMPT].size = dev_cap->dmpt_entry_sz;
+ profile[MLX4_RES_CMPT].size = dev_cap->cmpt_entry_sz;
+ profile[MLX4_RES_MTT].size = dev->caps.mtts_per_seg * dev_cap->mtt_entry_sz;
+ profile[MLX4_RES_MCG].size = MLX4_MGM_ENTRY_SIZE;
+
+ profile[MLX4_RES_QP].num = request->num_qp;
+ profile[MLX4_RES_RDMARC].num = request->num_qp * request->rdmarc_per_qp;
+ profile[MLX4_RES_ALTC].num = request->num_qp;
+ profile[MLX4_RES_AUXC].num = request->num_qp;
+ profile[MLX4_RES_SRQ].num = request->num_srq;
+ profile[MLX4_RES_CQ].num = request->num_cq;
+ profile[MLX4_RES_EQ].num = min_t(unsigned, dev_cap->max_eqs,
+ dev_cap->reserved_eqs +
+ num_possible_cpus() + 1);
+ profile[MLX4_RES_DMPT].num = request->num_mpt;
+ profile[MLX4_RES_CMPT].num = MLX4_NUM_CMPTS;
+ profile[MLX4_RES_MTT].num = request->num_mtt;
+ profile[MLX4_RES_MCG].num = request->num_mcg;
+
+ for (i = 0; i < MLX4_RES_NUM; ++i) {
+ profile[i].type = i;
+ profile[i].num = roundup_pow_of_two(profile[i].num);
+ profile[i].log_num = ilog2(profile[i].num);
+ profile[i].size *= profile[i].num;
+ profile[i].size = max(profile[i].size, (u64) PAGE_SIZE);
+ }
+
+ /*
+ * Sort the resources in decreasing order of size. Since they
+ * all have sizes that are powers of 2, we'll be able to keep
+ * resources aligned to their size and pack them without gaps
+ * using the sorted order.
+ */
+ for (i = MLX4_RES_NUM; i > 0; --i)
+ for (j = 1; j < i; ++j) {
+ if (profile[j].size > profile[j - 1].size) {
+ tmp = profile[j];
+ profile[j] = profile[j - 1];
+ profile[j - 1] = tmp;
+ }
+ }
+
+ for (i = 0; i < MLX4_RES_NUM; ++i) {
+ if (profile[i].size) {
+ profile[i].start = total_size;
+ total_size += profile[i].size;
+ }
+
+ if (total_size > dev_cap->max_icm_sz) {
+ mlx4_err(dev, "Profile requires 0x%llx bytes; "
+ "won't fit in 0x%llx bytes of context memory.\n",
+ (unsigned long long) total_size,
+ (unsigned long long) dev_cap->max_icm_sz);
+ kfree(profile);
+ return -ENOMEM;
+ }
+
+ if (profile[i].size)
+ mlx4_dbg(dev, " profile[%2d] (%6s): 2^%02d entries @ 0x%10llx, "
+ "size 0x%10llx\n",
+ i, res_name[profile[i].type], profile[i].log_num,
+ (unsigned long long) profile[i].start,
+ (unsigned long long) profile[i].size);
+ }
+
+ mlx4_dbg(dev, "HCA context memory: reserving %d KB\n",
+ (int) (total_size >> 10));
+
+ for (i = 0; i < MLX4_RES_NUM; ++i) {
+ switch (profile[i].type) {
+ case MLX4_RES_QP:
+ dev->caps.num_qps = profile[i].num;
+ init_hca->qpc_base = profile[i].start;
+ init_hca->log_num_qps = profile[i].log_num;
+ break;
+ case MLX4_RES_RDMARC:
+ for (priv->qp_table.rdmarc_shift = 0;
+ request->num_qp << priv->qp_table.rdmarc_shift < profile[i].num;
+ ++priv->qp_table.rdmarc_shift)
+ ; /* nothing */
+ dev->caps.max_qp_dest_rdma = 1 << priv->qp_table.rdmarc_shift;
+ priv->qp_table.rdmarc_base = (u32) profile[i].start;
+ init_hca->rdmarc_base = profile[i].start;
+ init_hca->log_rd_per_qp = priv->qp_table.rdmarc_shift;
+ break;
+ case MLX4_RES_ALTC:
+ init_hca->altc_base = profile[i].start;
+ break;
+ case MLX4_RES_AUXC:
+ init_hca->auxc_base = profile[i].start;
+ break;
+ case MLX4_RES_SRQ:
+ dev->caps.num_srqs = profile[i].num;
+ init_hca->srqc_base = profile[i].start;
+ init_hca->log_num_srqs = profile[i].log_num;
+ break;
+ case MLX4_RES_CQ:
+ dev->caps.num_cqs = profile[i].num;
+ init_hca->cqc_base = profile[i].start;
+ init_hca->log_num_cqs = profile[i].log_num;
+ break;
+ case MLX4_RES_EQ:
+ dev->caps.num_eqs = profile[i].num;
+ init_hca->eqc_base = profile[i].start;
+ init_hca->log_num_eqs = profile[i].log_num;
+ break;
+ case MLX4_RES_DMPT:
+ dev->caps.num_mpts = profile[i].num;
+ priv->mr_table.mpt_base = profile[i].start;
+ init_hca->dmpt_base = profile[i].start;
+ init_hca->log_mpt_sz = profile[i].log_num;
+ break;
+ case MLX4_RES_CMPT:
+ init_hca->cmpt_base = profile[i].start;
+ break;
+ case MLX4_RES_MTT:
+ dev->caps.num_mtt_segs = profile[i].num;
+ priv->mr_table.mtt_base = profile[i].start;
+ init_hca->mtt_base = profile[i].start;
+ break;
+ case MLX4_RES_MCG:
+ dev->caps.num_mgms = profile[i].num >> 1;
+ dev->caps.num_amgms = profile[i].num >> 1;
+ init_hca->mc_base = profile[i].start;
+ init_hca->log_mc_entry_sz = ilog2(MLX4_MGM_ENTRY_SIZE);
+ init_hca->log_mc_table_sz = profile[i].log_num;
+ init_hca->log_mc_hash_sz = profile[i].log_num - 1;
+ break;
+ default:
+ break;
+ }
+ }
+
+ /*
+ * PDs don't take any HCA memory, but we assign them as part
+ * of the HCA profile anyway.
+ */
+ dev->caps.num_pds = MLX4_NUM_PDS;
+
+ kfree(profile);
+ return total_size;
+}
diff --git a/sys/ofed/drivers/net/mlx4/qp.c b/sys/ofed/drivers/net/mlx4/qp.c
new file mode 100644
index 0000000..bf1c117
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/qp.c
@@ -0,0 +1,410 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+
+#include <linux/mlx4/cmd.h>
+#include <linux/mlx4/qp.h>
+
+#include "mlx4.h"
+#include "icm.h"
+
+void mlx4_qp_event(struct mlx4_dev *dev, u32 qpn, int event_type)
+{
+ struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
+ struct mlx4_qp *qp;
+
+ spin_lock(&qp_table->lock);
+
+ qp = __mlx4_qp_lookup(dev, qpn);
+ if (qp)
+ atomic_inc(&qp->refcount);
+
+ spin_unlock(&qp_table->lock);
+
+ if (!qp) {
+ mlx4_warn(dev, "Async event for bogus QP %08x\n", qpn);
+ return;
+ }
+
+ qp->event(qp, event_type);
+
+ if (atomic_dec_and_test(&qp->refcount))
+ complete(&qp->free);
+}
+
+int mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+ enum mlx4_qp_state cur_state, enum mlx4_qp_state new_state,
+ struct mlx4_qp_context *context, enum mlx4_qp_optpar optpar,
+ int sqd_event, struct mlx4_qp *qp)
+{
+ static const u16 op[MLX4_QP_NUM_STATE][MLX4_QP_NUM_STATE] = {
+ [MLX4_QP_STATE_RST] = {
+ [MLX4_QP_STATE_RST] = MLX4_CMD_2RST_QP,
+ [MLX4_QP_STATE_ERR] = MLX4_CMD_2ERR_QP,
+ [MLX4_QP_STATE_INIT] = MLX4_CMD_RST2INIT_QP,
+ },
+ [MLX4_QP_STATE_INIT] = {
+ [MLX4_QP_STATE_RST] = MLX4_CMD_2RST_QP,
+ [MLX4_QP_STATE_ERR] = MLX4_CMD_2ERR_QP,
+ [MLX4_QP_STATE_INIT] = MLX4_CMD_INIT2INIT_QP,
+ [MLX4_QP_STATE_RTR] = MLX4_CMD_INIT2RTR_QP,
+ },
+ [MLX4_QP_STATE_RTR] = {
+ [MLX4_QP_STATE_RST] = MLX4_CMD_2RST_QP,
+ [MLX4_QP_STATE_ERR] = MLX4_CMD_2ERR_QP,
+ [MLX4_QP_STATE_RTS] = MLX4_CMD_RTR2RTS_QP,
+ },
+ [MLX4_QP_STATE_RTS] = {
+ [MLX4_QP_STATE_RST] = MLX4_CMD_2RST_QP,
+ [MLX4_QP_STATE_ERR] = MLX4_CMD_2ERR_QP,
+ [MLX4_QP_STATE_RTS] = MLX4_CMD_RTS2RTS_QP,
+ [MLX4_QP_STATE_SQD] = MLX4_CMD_RTS2SQD_QP,
+ },
+ [MLX4_QP_STATE_SQD] = {
+ [MLX4_QP_STATE_RST] = MLX4_CMD_2RST_QP,
+ [MLX4_QP_STATE_ERR] = MLX4_CMD_2ERR_QP,
+ [MLX4_QP_STATE_RTS] = MLX4_CMD_SQD2RTS_QP,
+ [MLX4_QP_STATE_SQD] = MLX4_CMD_SQD2SQD_QP,
+ },
+ [MLX4_QP_STATE_SQER] = {
+ [MLX4_QP_STATE_RST] = MLX4_CMD_2RST_QP,
+ [MLX4_QP_STATE_ERR] = MLX4_CMD_2ERR_QP,
+ [MLX4_QP_STATE_RTS] = MLX4_CMD_SQERR2RTS_QP,
+ },
+ [MLX4_QP_STATE_ERR] = {
+ [MLX4_QP_STATE_RST] = MLX4_CMD_2RST_QP,
+ [MLX4_QP_STATE_ERR] = MLX4_CMD_2ERR_QP,
+ }
+ };
+
+ struct mlx4_cmd_mailbox *mailbox;
+ int ret = 0;
+
+ if (cur_state >= MLX4_QP_NUM_STATE || new_state >= MLX4_QP_NUM_STATE ||
+ !op[cur_state][new_state])
+ return -EINVAL;
+
+ if (op[cur_state][new_state] == MLX4_CMD_2RST_QP)
+ return mlx4_cmd(dev, 0, qp->qpn, 2,
+ MLX4_CMD_2RST_QP, MLX4_CMD_TIME_CLASS_A);
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+
+ if (cur_state == MLX4_QP_STATE_RST && new_state == MLX4_QP_STATE_INIT) {
+ u64 mtt_addr = mlx4_mtt_addr(dev, mtt);
+ context->mtt_base_addr_h = mtt_addr >> 32;
+ context->mtt_base_addr_l = cpu_to_be32(mtt_addr & 0xffffffff);
+ context->log_page_size = mtt->page_shift - MLX4_ICM_PAGE_SHIFT;
+ }
+
+ *(__be32 *) mailbox->buf = cpu_to_be32(optpar);
+ memcpy(mailbox->buf + 8, context, sizeof *context);
+
+ ((struct mlx4_qp_context *) (mailbox->buf + 8))->local_qpn =
+ cpu_to_be32(qp->qpn);
+
+ ret = mlx4_cmd(dev, mailbox->dma, qp->qpn | (!!sqd_event << 31),
+ new_state == MLX4_QP_STATE_RST ? 2 : 0,
+ op[cur_state][new_state], MLX4_CMD_TIME_CLASS_C);
+
+ mlx4_free_cmd_mailbox(dev, mailbox);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_modify);
+
+int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, int *base)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ struct mlx4_qp_table *qp_table = &priv->qp_table;
+ int qpn;
+
+ qpn = mlx4_bitmap_alloc_range(&qp_table->bitmap, cnt, align);
+ if (qpn == -1)
+ return -ENOMEM;
+
+ *base = qpn;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_reserve_range);
+
+void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ struct mlx4_qp_table *qp_table = &priv->qp_table;
+ if (base_qpn < dev->caps.sqp_start + 8)
+ return;
+
+ mlx4_bitmap_free_range(&qp_table->bitmap, base_qpn, cnt);
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_release_range);
+
+int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ struct mlx4_qp_table *qp_table = &priv->qp_table;
+ int err;
+
+ if (!qpn)
+ return -EINVAL;
+
+ qp->qpn = qpn;
+
+ err = mlx4_table_get(dev, &qp_table->qp_table, qp->qpn);
+ if (err)
+ goto err_out;
+
+ err = mlx4_table_get(dev, &qp_table->auxc_table, qp->qpn);
+ if (err)
+ goto err_put_qp;
+
+ err = mlx4_table_get(dev, &qp_table->altc_table, qp->qpn);
+ if (err)
+ goto err_put_auxc;
+
+ err = mlx4_table_get(dev, &qp_table->rdmarc_table, qp->qpn);
+ if (err)
+ goto err_put_altc;
+
+ err = mlx4_table_get(dev, &qp_table->cmpt_table, qp->qpn);
+ if (err)
+ goto err_put_rdmarc;
+
+ spin_lock_irq(&qp_table->lock);
+ err = radix_tree_insert(&dev->qp_table_tree, qp->qpn & (dev->caps.num_qps - 1), qp);
+ spin_unlock_irq(&qp_table->lock);
+ if (err)
+ goto err_put_cmpt;
+
+ atomic_set(&qp->refcount, 1);
+ init_completion(&qp->free);
+
+ return 0;
+
+err_put_cmpt:
+ mlx4_table_put(dev, &qp_table->cmpt_table, qp->qpn);
+
+err_put_rdmarc:
+ mlx4_table_put(dev, &qp_table->rdmarc_table, qp->qpn);
+
+err_put_altc:
+ mlx4_table_put(dev, &qp_table->altc_table, qp->qpn);
+
+err_put_auxc:
+ mlx4_table_put(dev, &qp_table->auxc_table, qp->qpn);
+
+err_put_qp:
+ mlx4_table_put(dev, &qp_table->qp_table, qp->qpn);
+
+err_out:
+ return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_alloc);
+
+struct mlx4_qp *mlx4_qp_lookup_lock(struct mlx4_dev *dev, u32 qpn)
+{
+ struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
+ unsigned long flags;
+ struct mlx4_qp *qp;
+
+ spin_lock_irqsave(&qp_table->lock, flags);
+ qp = radix_tree_lookup(&dev->qp_table_tree, qpn & (dev->caps.num_qps - 1));
+ spin_unlock_irqrestore(&qp_table->lock, flags);
+ return qp;
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_lookup_lock);
+
+void mlx4_qp_remove(struct mlx4_dev *dev, struct mlx4_qp *qp)
+{
+ struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp_table->lock, flags);
+ radix_tree_delete(&dev->qp_table_tree, qp->qpn & (dev->caps.num_qps - 1));
+ spin_unlock_irqrestore(&qp_table->lock, flags);
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_remove);
+
+void mlx4_qp_free(struct mlx4_dev *dev, struct mlx4_qp *qp)
+{
+ struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
+
+ if (atomic_dec_and_test(&qp->refcount))
+ complete(&qp->free);
+ wait_for_completion(&qp->free);
+
+ mlx4_table_put(dev, &qp_table->cmpt_table, qp->qpn);
+ mlx4_table_put(dev, &qp_table->rdmarc_table, qp->qpn);
+ mlx4_table_put(dev, &qp_table->altc_table, qp->qpn);
+ mlx4_table_put(dev, &qp_table->auxc_table, qp->qpn);
+ mlx4_table_put(dev, &qp_table->qp_table, qp->qpn);
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_free);
+
+static int mlx4_CONF_SPECIAL_QP(struct mlx4_dev *dev, u32 base_qpn)
+{
+ return mlx4_cmd(dev, 0, base_qpn,
+ (dev->caps.flags & MLX4_DEV_CAP_FLAG_RAW_ETY) ? 4 : 0,
+ MLX4_CMD_CONF_SPECIAL_QP, MLX4_CMD_TIME_CLASS_B);
+}
+
+int mlx4_init_qp_table(struct mlx4_dev *dev)
+{
+ struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
+ int err;
+ int reserved_from_top = 0;
+
+ spin_lock_init(&qp_table->lock);
+ INIT_RADIX_TREE(&dev->qp_table_tree, GFP_ATOMIC);
+
+ /*
+ * We reserve 2 extra QPs per port for the special QPs. The
+ * block of special QPs must be aligned to a multiple of 8, so
+ * round up.
+ * We also reserve the MSB of the 24-bit QP number to indicate
+ * an XRC qp.
+ */
+ dev->caps.sqp_start =
+ ALIGN(dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW], 8);
+
+ {
+ int sort[MLX4_NUM_QP_REGION];
+ int i, j, tmp;
+ int last_base = dev->caps.num_qps;
+
+ for (i = 1; i < MLX4_NUM_QP_REGION; ++i)
+ sort[i] = i;
+
+ for (i = MLX4_NUM_QP_REGION; i > 0; --i) {
+ for (j = 2; j < i; ++j) {
+ if (dev->caps.reserved_qps_cnt[sort[j]] >
+ dev->caps.reserved_qps_cnt[sort[j - 1]]) {
+ tmp = sort[j];
+ sort[j] = sort[j - 1];
+ sort[j - 1] = tmp;
+ }
+ }
+ }
+
+ for (i = 1; i < MLX4_NUM_QP_REGION; ++i) {
+ last_base -= dev->caps.reserved_qps_cnt[sort[i]];
+ dev->caps.reserved_qps_base[sort[i]] = last_base;
+ reserved_from_top +=
+ dev->caps.reserved_qps_cnt[sort[i]];
+ }
+
+ }
+
+ err = mlx4_bitmap_init(&qp_table->bitmap, dev->caps.num_qps,
+ (1 << 23) - 1, dev->caps.sqp_start + 8,
+ reserved_from_top);
+ if (err)
+ return err;
+
+ return mlx4_CONF_SPECIAL_QP(dev, dev->caps.sqp_start);
+}
+
+void mlx4_cleanup_qp_table(struct mlx4_dev *dev)
+{
+ mlx4_CONF_SPECIAL_QP(dev, 0);
+ mlx4_bitmap_cleanup(&mlx4_priv(dev)->qp_table.bitmap);
+}
+
+int mlx4_qp_get_region(struct mlx4_dev *dev, enum mlx4_qp_region region,
+ int *base_qpn, int *cnt)
+{
+ if ((region < 0) || (region >= MLX4_NUM_QP_REGION))
+ return -EINVAL;
+
+ *base_qpn = dev->caps.reserved_qps_base[region];
+ *cnt = dev->caps.reserved_qps_cnt[region];
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_get_region);
+
+int mlx4_qp_query(struct mlx4_dev *dev, struct mlx4_qp *qp,
+ struct mlx4_qp_context *context)
+{
+ struct mlx4_cmd_mailbox *mailbox;
+ int err;
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+
+ err = mlx4_cmd_box(dev, 0, mailbox->dma, qp->qpn, 0,
+ MLX4_CMD_QUERY_QP, MLX4_CMD_TIME_CLASS_A);
+ if (!err)
+ memcpy(context, mailbox->buf + 8, sizeof *context);
+
+ mlx4_free_cmd_mailbox(dev, mailbox);
+ return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_query);
+
+int mlx4_qp_to_ready(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+ struct mlx4_qp_context *context,
+ struct mlx4_qp *qp, enum mlx4_qp_state *qp_state)
+{
+ int err;
+ int i;
+ enum mlx4_qp_state states[] = {
+ MLX4_QP_STATE_RST,
+ MLX4_QP_STATE_INIT,
+ MLX4_QP_STATE_RTR,
+ MLX4_QP_STATE_RTS
+ };
+
+ for (i = 0; i < ARRAY_SIZE(states) - 1; i++) {
+ context->flags &= cpu_to_be32(~(0xf << 28));
+ context->flags |= cpu_to_be32(states[i + 1] << 28);
+ err = mlx4_qp_modify(dev, mtt, states[i], states[i + 1],
+ context, 0, 0, qp);
+ if (err) {
+ mlx4_err(dev, "Failed to bring QP to state: "
+ "%d with error: %d\n",
+ states[i + 1], err);
+ return err;
+ }
+
+ *qp_state = states[i + 1];
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_to_ready);
diff --git a/sys/ofed/drivers/net/mlx4/reset.c b/sys/ofed/drivers/net/mlx4/reset.c
new file mode 100644
index 0000000..3951b88
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/reset.c
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/jiffies.h>
+
+#include "mlx4.h"
+
+int mlx4_reset(struct mlx4_dev *dev)
+{
+ void __iomem *reset;
+ u32 *hca_header = NULL;
+ int pcie_cap;
+ u16 devctl;
+ u16 linkctl;
+ u16 vendor;
+ unsigned long end;
+ u32 sem;
+ int i;
+ int err = 0;
+
+#define MLX4_RESET_BASE 0xf0000
+#define MLX4_RESET_SIZE 0x400
+#define MLX4_SEM_OFFSET 0x3fc
+#define MLX4_RESET_OFFSET 0x10
+#define MLX4_RESET_VALUE swab32(1)
+
+#define MLX4_SEM_TIMEOUT_JIFFIES (10 * HZ)
+#define MLX4_RESET_TIMEOUT_JIFFIES (2 * HZ)
+
+ /*
+ * Reset the chip. This is somewhat ugly because we have to
+ * save off the PCI header before reset and then restore it
+ * after the chip reboots. We skip config space offsets 22
+ * and 23 since those have a special meaning.
+ */
+
+ /* Do we need to save off the full 4K PCI Express header?? */
+ hca_header = kmalloc(256, GFP_KERNEL);
+ if (!hca_header) {
+ err = -ENOMEM;
+ mlx4_err(dev, "Couldn't allocate memory to save HCA "
+ "PCI header, aborting.\n");
+ goto out;
+ }
+
+ pcie_cap = pci_find_capability(dev->pdev, PCI_CAP_ID_EXP);
+
+ for (i = 0; i < 64; ++i) {
+ if (i == 22 || i == 23)
+ continue;
+ if (pci_read_config_dword(dev->pdev, i * 4, hca_header + i)) {
+ err = -ENODEV;
+ mlx4_err(dev, "Couldn't save HCA "
+ "PCI header, aborting.\n");
+ goto out;
+ }
+ }
+
+ reset = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_RESET_BASE,
+ MLX4_RESET_SIZE);
+ if (!reset) {
+ err = -ENOMEM;
+ mlx4_err(dev, "Couldn't map HCA reset register, aborting.\n");
+ goto out;
+ }
+
+ /* grab HW semaphore to lock out flash updates */
+ end = jiffies + MLX4_SEM_TIMEOUT_JIFFIES;
+ do {
+ sem = readl(reset + MLX4_SEM_OFFSET);
+ if (!sem)
+ break;
+
+ msleep(1);
+ } while (time_before(jiffies, end));
+
+ if (sem) {
+ mlx4_err(dev, "Failed to obtain HW semaphore, aborting\n");
+ err = -EAGAIN;
+ iounmap(reset);
+ goto out;
+ }
+
+ /* actually hit reset */
+ writel(MLX4_RESET_VALUE, reset + MLX4_RESET_OFFSET);
+ iounmap(reset);
+
+ /* Docs say to wait one second before accessing device */
+ msleep(1000);
+
+ end = jiffies + MLX4_RESET_TIMEOUT_JIFFIES;
+ do {
+ if (!pci_read_config_word(dev->pdev, PCI_VENDOR_ID, &vendor) &&
+ vendor != 0xffff)
+ break;
+
+ msleep(1);
+ } while (time_before(jiffies, end));
+
+ if (vendor == 0xffff) {
+ err = -ENODEV;
+ mlx4_err(dev, "PCI device did not come back after reset, "
+ "aborting.\n");
+ goto out;
+ }
+
+ /* Now restore the PCI headers */
+ if (pcie_cap) {
+ devctl = hca_header[(pcie_cap + PCI_EXP_DEVCTL) / 4];
+ if (pci_write_config_word(dev->pdev, pcie_cap + PCI_EXP_DEVCTL,
+ devctl)) {
+ err = -ENODEV;
+ mlx4_err(dev, "Couldn't restore HCA PCI Express "
+ "Device Control register, aborting.\n");
+ goto out;
+ }
+ linkctl = hca_header[(pcie_cap + PCI_EXP_LNKCTL) / 4];
+ if (pci_write_config_word(dev->pdev, pcie_cap + PCI_EXP_LNKCTL,
+ linkctl)) {
+ err = -ENODEV;
+ mlx4_err(dev, "Couldn't restore HCA PCI Express "
+ "Link control register, aborting.\n");
+ goto out;
+ }
+ }
+
+ for (i = 0; i < 16; ++i) {
+ if (i * 4 == PCI_COMMAND)
+ continue;
+
+ if (pci_write_config_dword(dev->pdev, i * 4, hca_header[i])) {
+ err = -ENODEV;
+ mlx4_err(dev, "Couldn't restore HCA reg %x, "
+ "aborting.\n", i);
+ goto out;
+ }
+ }
+
+ if (pci_write_config_dword(dev->pdev, PCI_COMMAND,
+ hca_header[PCI_COMMAND / 4])) {
+ err = -ENODEV;
+ mlx4_err(dev, "Couldn't restore HCA COMMAND, "
+ "aborting.\n");
+ goto out;
+ }
+
+out:
+ kfree(hca_header);
+
+ return err;
+}
diff --git a/sys/ofed/drivers/net/mlx4/sense.c b/sys/ofed/drivers/net/mlx4/sense.c
new file mode 100644
index 0000000..0fcf025
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/sense.c
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/errno.h>
+#include <linux/if_ether.h>
+
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4.h"
+
+static int mlx4_SENSE_PORT(struct mlx4_dev *dev, int port,
+ enum mlx4_port_type *type)
+{
+ u64 out_param;
+ int err = 0;
+
+ err = mlx4_cmd_imm(dev, 0, &out_param, port, 0,
+ MLX4_CMD_SENSE_PORT, MLX4_CMD_TIME_CLASS_B);
+ if (err) {
+ mlx4_err(dev, "Sense command failed for port: %d\n", port);
+ return err;
+ }
+
+ if (out_param > 2) {
+ mlx4_err(dev, "Sense returned illegal value: 0x%llx\n", out_param);
+ return EINVAL;
+ }
+
+ *type = out_param;
+ return 0;
+}
+
+void mlx4_do_sense_ports(struct mlx4_dev *dev,
+ enum mlx4_port_type *stype,
+ enum mlx4_port_type *defaults)
+{
+ struct mlx4_sense *sense = &mlx4_priv(dev)->sense;
+ int err;
+ int i;
+
+ for (i = 1; i <= dev->caps.num_ports; i++) {
+ stype[i - 1] = 0;
+ if (sense->do_sense_port[i] && sense->sense_allowed[i] &&
+ dev->caps.possible_type[i] == MLX4_PORT_TYPE_AUTO) {
+ err = mlx4_SENSE_PORT(dev, i, &stype[i - 1]);
+ if (err)
+ stype[i - 1] = defaults[i - 1];
+ } else
+ stype[i - 1] = defaults[i - 1];
+ }
+
+ /*
+ * Adjust port configuration:
+ * If port 1 sensed nothing and port 2 is IB, set both as IB
+ * If port 2 sensed nothing and port 1 is Eth, set both as Eth
+ */
+ if (stype[0] == MLX4_PORT_TYPE_ETH) {
+ for (i = 1; i < dev->caps.num_ports; i++)
+ stype[i] = stype[i] ? stype[i] : MLX4_PORT_TYPE_ETH;
+ }
+ if (stype[dev->caps.num_ports - 1] == MLX4_PORT_TYPE_IB) {
+ for (i = 0; i < dev->caps.num_ports - 1; i++)
+ stype[i] = stype[i] ? stype[i] : MLX4_PORT_TYPE_IB;
+ }
+
+ /*
+ * If sensed nothing, remain in current configuration.
+ */
+ for (i = 0; i < dev->caps.num_ports; i++)
+ stype[i] = stype[i] ? stype[i] : defaults[i];
+
+}
+
+static void mlx4_sense_port(struct work_struct *work)
+{
+ struct delayed_work *delay = to_delayed_work(work);
+ struct mlx4_sense *sense = container_of(delay, struct mlx4_sense,
+ sense_poll);
+ struct mlx4_dev *dev = sense->dev;
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ enum mlx4_port_type stype[MLX4_MAX_PORTS];
+
+ mutex_lock(&priv->port_mutex);
+ mlx4_do_sense_ports(dev, stype, &dev->caps.port_type[1]);
+
+ if (mlx4_check_port_params(dev, stype))
+ goto sense_again;
+
+ if (mlx4_change_port_types(dev, stype))
+ mlx4_err(dev, "Failed to change port_types\n");
+
+sense_again:
+ mutex_unlock(&priv->port_mutex);
+ if (sense->resched)
+ queue_delayed_work(sense->sense_wq , &sense->sense_poll,
+ round_jiffies(MLX4_SENSE_RANGE));
+}
+
+void mlx4_start_sense(struct mlx4_dev *dev)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ struct mlx4_sense *sense = &priv->sense;
+
+ if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP))
+ return;
+
+ sense->resched = 1;
+ queue_delayed_work(sense->sense_wq , &sense->sense_poll,
+ round_jiffies(MLX4_SENSE_RANGE));
+}
+
+
+void mlx4_stop_sense(struct mlx4_dev *dev)
+{
+ mlx4_priv(dev)->sense.resched = 0;
+}
+
+int mlx4_sense_init(struct mlx4_dev *dev)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ struct mlx4_sense *sense = &priv->sense;
+ int port;
+
+ sense->dev = dev;
+ sense->sense_wq = create_singlethread_workqueue("mlx4_sense");
+ if (!sense->sense_wq)
+ return -ENOMEM;
+
+ for (port = 1; port <= dev->caps.num_ports; port++)
+ sense->do_sense_port[port] = 1;
+
+ INIT_DELAYED_WORK_DEFERRABLE(&sense->sense_poll, mlx4_sense_port);
+ return 0;
+}
+
+void mlx4_sense_cleanup(struct mlx4_dev *dev)
+{
+ mlx4_stop_sense(dev);
+ cancel_delayed_work(&mlx4_priv(dev)->sense.sense_poll);
+ destroy_workqueue(mlx4_priv(dev)->sense.sense_wq);
+}
+
diff --git a/sys/ofed/drivers/net/mlx4/srq.c b/sys/ofed/drivers/net/mlx4/srq.c
new file mode 100644
index 0000000..f856b8d
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/srq.c
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+
+#include <linux/mlx4/cmd.h>
+#include <linux/mlx4/srq.h>
+
+#include "mlx4.h"
+#include "icm.h"
+
+struct mlx4_srq_context {
+ __be32 state_logsize_srqn;
+ u8 logstride;
+ u8 reserved1;
+ __be16 xrc_domain;
+ __be32 pg_offset_cqn;
+ u32 reserved2;
+ u8 log_page_size;
+ u8 reserved3[2];
+ u8 mtt_base_addr_h;
+ __be32 mtt_base_addr_l;
+ __be32 pd;
+ __be16 limit_watermark;
+ __be16 wqe_cnt;
+ u16 reserved4;
+ __be16 wqe_counter;
+ u32 reserved5;
+ __be64 db_rec_addr;
+};
+
+void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type)
+{
+ struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table;
+ struct mlx4_srq *srq;
+
+ spin_lock(&srq_table->lock);
+
+ srq = radix_tree_lookup(&dev->srq_table_tree,
+ srqn & (dev->caps.num_srqs - 1));
+ if (srq)
+ atomic_inc(&srq->refcount);
+
+ spin_unlock(&srq_table->lock);
+
+ if (!srq) {
+ mlx4_warn(dev, "Async event for bogus SRQ %08x\n", srqn);
+ return;
+ }
+
+ srq->event(srq, event_type);
+
+ if (atomic_dec_and_test(&srq->refcount))
+ complete(&srq->free);
+}
+
+static int mlx4_SW2HW_SRQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+ int srq_num)
+{
+ return mlx4_cmd(dev, mailbox->dma, srq_num, 0, MLX4_CMD_SW2HW_SRQ,
+ MLX4_CMD_TIME_CLASS_A);
+}
+
+static int mlx4_HW2SW_SRQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+ int srq_num)
+{
+ return mlx4_cmd_box(dev, 0, mailbox ? mailbox->dma : 0, srq_num,
+ mailbox ? 0 : 1, MLX4_CMD_HW2SW_SRQ,
+ MLX4_CMD_TIME_CLASS_A);
+}
+
+static int mlx4_ARM_SRQ(struct mlx4_dev *dev, int srq_num, int limit_watermark)
+{
+ return mlx4_cmd(dev, limit_watermark, srq_num, 0, MLX4_CMD_ARM_SRQ,
+ MLX4_CMD_TIME_CLASS_B);
+}
+
+static int mlx4_QUERY_SRQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+ int srq_num)
+{
+ return mlx4_cmd_box(dev, 0, mailbox->dma, srq_num, 0, MLX4_CMD_QUERY_SRQ,
+ MLX4_CMD_TIME_CLASS_A);
+}
+
+int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, u32 cqn, u16 xrcd,
+ struct mlx4_mtt *mtt, u64 db_rec, struct mlx4_srq *srq)
+{
+ struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table;
+ struct mlx4_cmd_mailbox *mailbox;
+ struct mlx4_srq_context *srq_context;
+ u64 mtt_addr;
+ int err;
+
+ srq->srqn = mlx4_bitmap_alloc(&srq_table->bitmap);
+ if (srq->srqn == -1)
+ return -ENOMEM;
+
+ err = mlx4_table_get(dev, &srq_table->table, srq->srqn);
+ if (err)
+ goto err_out;
+
+ err = mlx4_table_get(dev, &srq_table->cmpt_table, srq->srqn);
+ if (err)
+ goto err_put;
+
+ spin_lock_irq(&srq_table->lock);
+ err = radix_tree_insert(&dev->srq_table_tree, srq->srqn, srq);
+ spin_unlock_irq(&srq_table->lock);
+ if (err)
+ goto err_cmpt_put;
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev);
+ if (IS_ERR(mailbox)) {
+ err = PTR_ERR(mailbox);
+ goto err_radix;
+ }
+
+ srq_context = mailbox->buf;
+ memset(srq_context, 0, sizeof *srq_context);
+
+ srq_context->state_logsize_srqn = cpu_to_be32((ilog2(srq->max) << 24) |
+ srq->srqn);
+ srq_context->logstride = srq->wqe_shift - 4;
+ srq_context->xrc_domain = cpu_to_be16(xrcd);
+ srq_context->pg_offset_cqn = cpu_to_be32(cqn & 0xffffff);
+ srq_context->log_page_size = mtt->page_shift - MLX4_ICM_PAGE_SHIFT;
+
+ mtt_addr = mlx4_mtt_addr(dev, mtt);
+ srq_context->mtt_base_addr_h = mtt_addr >> 32;
+ srq_context->mtt_base_addr_l = cpu_to_be32(mtt_addr & 0xffffffff);
+ srq_context->pd = cpu_to_be32(pdn);
+ srq_context->db_rec_addr = cpu_to_be64(db_rec);
+
+ err = mlx4_SW2HW_SRQ(dev, mailbox, srq->srqn);
+ mlx4_free_cmd_mailbox(dev, mailbox);
+ if (err)
+ goto err_radix;
+
+ atomic_set(&srq->refcount, 1);
+ init_completion(&srq->free);
+
+ return 0;
+
+err_radix:
+ spin_lock_irq(&srq_table->lock);
+ radix_tree_delete(&dev->srq_table_tree, srq->srqn);
+ spin_unlock_irq(&srq_table->lock);
+
+err_cmpt_put:
+ mlx4_table_put(dev, &srq_table->cmpt_table, srq->srqn);
+
+err_put:
+ mlx4_table_put(dev, &srq_table->table, srq->srqn);
+
+err_out:
+ mlx4_bitmap_free(&srq_table->bitmap, srq->srqn);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_srq_alloc);
+
+void mlx4_srq_invalidate(struct mlx4_dev *dev, struct mlx4_srq *srq)
+{
+ int err;
+
+ err = mlx4_HW2SW_SRQ(dev, NULL, srq->srqn);
+ if (err)
+ mlx4_warn(dev, "HW2SW_SRQ failed (%d) for SRQN %06x\n", err, srq->srqn);
+}
+EXPORT_SYMBOL_GPL(mlx4_srq_invalidate);
+
+void mlx4_srq_remove(struct mlx4_dev *dev, struct mlx4_srq *srq)
+{
+ struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table;
+
+ spin_lock_irq(&srq_table->lock);
+ radix_tree_delete(&dev->srq_table_tree, srq->srqn);
+ spin_unlock_irq(&srq_table->lock);
+}
+EXPORT_SYMBOL_GPL(mlx4_srq_remove);
+
+void mlx4_srq_free(struct mlx4_dev *dev, struct mlx4_srq *srq)
+{
+ struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table;
+
+ if (atomic_dec_and_test(&srq->refcount))
+ complete(&srq->free);
+ wait_for_completion(&srq->free);
+
+ mlx4_table_put(dev, &srq_table->table, srq->srqn);
+ mlx4_bitmap_free(&srq_table->bitmap, srq->srqn);
+}
+EXPORT_SYMBOL_GPL(mlx4_srq_free);
+
+int mlx4_srq_arm(struct mlx4_dev *dev, struct mlx4_srq *srq, int limit_watermark)
+{
+ return mlx4_ARM_SRQ(dev, srq->srqn, limit_watermark);
+}
+EXPORT_SYMBOL_GPL(mlx4_srq_arm);
+
+int mlx4_srq_query(struct mlx4_dev *dev, struct mlx4_srq *srq, int *limit_watermark)
+{
+ struct mlx4_cmd_mailbox *mailbox;
+ struct mlx4_srq_context *srq_context;
+ int err;
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+
+ srq_context = mailbox->buf;
+
+ err = mlx4_QUERY_SRQ(dev, mailbox, srq->srqn);
+ if (err)
+ goto err_out;
+ *limit_watermark = be16_to_cpu(srq_context->limit_watermark);
+
+err_out:
+ mlx4_free_cmd_mailbox(dev, mailbox);
+ return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_srq_query);
+
+int mlx4_init_srq_table(struct mlx4_dev *dev)
+{
+ struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table;
+ int err;
+
+ spin_lock_init(&srq_table->lock);
+ INIT_RADIX_TREE(&dev->srq_table_tree, GFP_ATOMIC);
+
+ err = mlx4_bitmap_init(&srq_table->bitmap, dev->caps.num_srqs,
+ dev->caps.num_srqs - 1, dev->caps.reserved_srqs, 0);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+void mlx4_cleanup_srq_table(struct mlx4_dev *dev)
+{
+ mlx4_bitmap_cleanup(&mlx4_priv(dev)->srq_table.bitmap);
+}
diff --git a/sys/ofed/drivers/net/mlx4/xrcd.c b/sys/ofed/drivers/net/mlx4/xrcd.c
new file mode 100644
index 0000000..d1bfc11
--- /dev/null
+++ b/sys/ofed/drivers/net/mlx4/xrcd.c
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+#include <linux/errno.h>
+
+#include "mlx4.h"
+
+int mlx4_xrcd_alloc(struct mlx4_dev *dev, u32 *xrcdn)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+
+ *xrcdn = mlx4_bitmap_alloc(&priv->xrcd_bitmap);
+ if (*xrcdn == -1)
+ return -ENOMEM;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_xrcd_alloc);
+
+void mlx4_xrcd_free(struct mlx4_dev *dev, u32 xrcdn)
+{
+ mlx4_bitmap_free(&mlx4_priv(dev)->xrcd_bitmap, xrcdn);
+}
+EXPORT_SYMBOL_GPL(mlx4_xrcd_free);
+
+int __devinit mlx4_init_xrcd_table(struct mlx4_dev *dev)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+
+ return mlx4_bitmap_init(&priv->xrcd_bitmap, (1 << 16),
+ (1 << 16) - 1, dev->caps.reserved_xrcds + 1, 0);
+}
+
+void mlx4_cleanup_xrcd_table(struct mlx4_dev *dev)
+{
+ mlx4_bitmap_cleanup(&mlx4_priv(dev)->xrcd_bitmap);
+}
+
+
diff --git a/sys/ofed/include/asm/atomic-long.h b/sys/ofed/include/asm/atomic-long.h
new file mode 100644
index 0000000..5075ad8
--- /dev/null
+++ b/sys/ofed/include/asm/atomic-long.h
@@ -0,0 +1,79 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _ATOMIC_LONG_H_
+#define _ATOMIC_LONG_H_
+
+#include <sys/cdefs.h>
+#include <sys/types.h>
+#include <machine/atomic.h>
+
+typedef struct {
+ volatile u_long counter;
+} atomic_long_t;
+
+#define atomic_long_add(i, v) atomic_long_add_return((i), (v))
+#define atomic_long_inc_return(v) atomic_long_add_return(1, (v))
+
+static inline long
+atomic_long_add_return(long i, atomic_long_t *v)
+{
+ return i + atomic_fetchadd_long(&v->counter, i);
+}
+
+static inline void
+atomic_long_set(atomic_long_t *v, long i)
+{
+ atomic_store_rel_long(&v->counter, i);
+}
+
+static inline long
+atomic_long_read(atomic_long_t *v)
+{
+ return atomic_load_acq_long(&v->counter);
+}
+
+static inline long
+atomic_long_inc(atomic_long_t *v)
+{
+ return atomic_fetchadd_long(&v->counter, 1) + 1;
+}
+
+static inline long
+atomic_long_dec(atomic_long_t *v)
+{
+ return atomic_fetchadd_long(&v->counter, -1) - 1;
+}
+
+static inline long
+atomic_long_dec_and_test(atomic_long_t *v)
+{
+ long i = atomic_long_add(-1, v);
+ return i == 0 ;
+}
+
+#endif /* _ATOMIC_LONG_H_ */
diff --git a/sys/ofed/include/asm/atomic.h b/sys/ofed/include/asm/atomic.h
new file mode 100644
index 0000000..5c5caa0
--- /dev/null
+++ b/sys/ofed/include/asm/atomic.h
@@ -0,0 +1,85 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _ASM_ATOMIC_H_
+#define _ASM_ATOMIC_H_
+
+#include <sys/cdefs.h>
+#include <sys/types.h>
+#include <machine/atomic.h>
+#include <asm/atomic-long.h>
+
+typedef struct {
+ volatile u_int counter;
+} atomic_t;
+
+#define atomic_add(i, v) atomic_add_return((i), (v))
+#define atomic_sub(i, v) atomic_sub_return((i), (v))
+#define atomic_inc_return(v) atomic_add_return(1, (v))
+#define atomic_add_negative(i, v) (atomic_add_return((i), (v)) < 0)
+#define atomic_sub_and_test(i, v) (atomic_sub_return((i), (v)) == 0)
+#define atomic_dec_and_test(v) (atomic_sub_return(1, (v)) == 0)
+#define atomic_inc_and_test(v) (atomic_add_return(1, (v)) == 0)
+
+static inline int
+atomic_add_return(int i, atomic_t *v)
+{
+ return i + atomic_fetchadd_int(&v->counter, i);
+}
+
+static inline int
+atomic_sub_return(int i, atomic_t *v)
+{
+ return atomic_fetchadd_int(&v->counter, -i) - i;
+}
+
+static inline void
+atomic_set(atomic_t *v, int i)
+{
+ atomic_store_rel_int(&v->counter, i);
+}
+
+static inline int
+atomic_read(atomic_t *v)
+{
+ return atomic_load_acq_int(&v->counter);
+}
+
+static inline int
+atomic_inc(atomic_t *v)
+{
+ return atomic_fetchadd_int(&v->counter, 1) + 1;
+}
+
+static inline int
+atomic_dec(atomic_t *v)
+{
+ return atomic_fetchadd_int(&v->counter, -1) - 1;
+}
+
+#endif /* _ASM_ATOMIC_H_ */
diff --git a/sys/ofed/include/asm/byteorder.h b/sys/ofed/include/asm/byteorder.h
new file mode 100644
index 0000000..341c548
--- /dev/null
+++ b/sys/ofed/include/asm/byteorder.h
@@ -0,0 +1,90 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _ASM_BYTEORDER_H_
+#define _ASM_BYTEORDER_H_
+
+#include <sys/types.h>
+#include <sys/endian.h>
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+#define __LITTLE_ENDIAN
+#else
+#define __BIG_ENDIAN
+#endif
+
+#define cpu_to_le64 htole64
+#define le64_to_cpu le64toh
+#define cpu_to_le32 htole32
+#define le32_to_cpu le32toh
+#define cpu_to_le16 htole16
+#define le16_to_cpu le16toh
+#define cpu_to_be64 htobe64
+#define be64_to_cpu be64toh
+#define cpu_to_be32 htobe32
+#define be32_to_cpu be32toh
+#define cpu_to_be16 htobe16
+#define be16_to_cpu be16toh
+#define __be16_to_cpu be16toh
+
+#define cpu_to_le64p(x) htole64(*((uint64_t *)x))
+#define le64_to_cpup(x) le64toh(*((uint64_t *)x))
+#define cpu_to_le32p(x) htole32(*((uint32_t *)x))
+#define le32_to_cpup(x) le32toh(*((uint32_t *)x))
+#define cpu_to_le16p(x) htole16(*((uint16_t *)x))
+#define le16_to_cpup(x) le16toh(*((uint16_t *)x))
+#define cpu_to_be64p(x) htobe64(*((uint64_t *)x))
+#define be64_to_cpup(x) be64toh(*((uint64_t *)x))
+#define cpu_to_be32p(x) htobe32(*((uint32_t *)x))
+#define be32_to_cpup(x) be32toh(*((uint32_t *)x))
+#define cpu_to_be16p(x) htobe16(*((uint16_t *)x))
+#define be16_to_cpup(x) be16toh(*((uint16_t *)x))
+
+#define cpu_to_le64s(x) do { *((uint64_t *)x) = cpu_to_le64p((x)) } while (0)
+#define le64_to_cpus(x) do { *((uint64_t *)x) = le64_to_cpup((x)) } while (0)
+#define cpu_to_le32s(x) do { *((uint32_t *)x) = cpu_to_le32p((x)) } while (0)
+#define le32_to_cpus(x) do { *((uint32_t *)x) = le32_to_cpup((x)) } while (0)
+#define cpu_to_le16s(x) do { *((uint16_t *)x) = cpu_to_le16p((x)) } while (0)
+#define le16_to_cpus(x) do { *((uint16_t *)x) = le16_to_cpup((x)) } while (0)
+#define cpu_to_be64s(x) do { *((uint64_t *)x) = cpu_to_be64p((x)) } while (0)
+#define be64_to_cpus(x) do { *((uint64_t *)x) = be64_to_cpup((x)) } while (0)
+#define cpu_to_be32s(x) do { *((uint32_t *)x) = cpu_to_be32p((x)) } while (0)
+#define be32_to_cpus(x) do { *((uint32_t *)x) = be32_to_cpup((x)) } while (0)
+#define cpu_to_be16s(x) do { *((uint16_t *)x) = cpu_to_be16p((x)) } while (0)
+#define be16_to_cpus(x) do { *((uint16_t *)x) = be16_to_cpup((x)) } while (0)
+
+#define swab16 bswap16
+#define swab32 bswap32
+#define swab64 bswap64
+
+static inline void
+be16_add_cpu(u16 *var, u16 val)
+{
+ *var = cpu_to_be16(be16_to_cpu(*var) + val);
+}
+
+#endif /* _ASM_BYTEORDER_H_ */
diff --git a/sys/ofed/include/asm/current.h b/sys/ofed/include/asm/current.h
new file mode 100644
index 0000000..33bd120
--- /dev/null
+++ b/sys/ofed/include/asm/current.h
@@ -0,0 +1,32 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _ASM_CURRENT_H_
+#define _ASM_CURRENT_H_
+
+#endif /* _ASM_CURRENT_H_ */
diff --git a/sys/ofed/include/asm/fcntl.h b/sys/ofed/include/asm/fcntl.h
new file mode 100644
index 0000000..a650f5b
--- /dev/null
+++ b/sys/ofed/include/asm/fcntl.h
@@ -0,0 +1,33 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _ASM_FCNTL_H_
+#define _ASM_FCNTL_H_
+
+#include <sys/fcntl.h>
+
+#endif /* _ASM_FCNTL_H_ */
diff --git a/sys/ofed/include/asm/io.h b/sys/ofed/include/asm/io.h
new file mode 100644
index 0000000..7a742d9
--- /dev/null
+++ b/sys/ofed/include/asm/io.h
@@ -0,0 +1,29 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/io.h>
diff --git a/sys/ofed/include/asm/page.h b/sys/ofed/include/asm/page.h
new file mode 100644
index 0000000..da42df7
--- /dev/null
+++ b/sys/ofed/include/asm/page.h
@@ -0,0 +1,29 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/page.h>
diff --git a/sys/ofed/include/asm/pgtable.h b/sys/ofed/include/asm/pgtable.h
new file mode 100644
index 0000000..087f525
--- /dev/null
+++ b/sys/ofed/include/asm/pgtable.h
@@ -0,0 +1,33 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _ASM_PGTABLE_H_
+#define _ASM_PGTABLE_H_
+
+typedef int pgprot_t;
+
+#endif /* _ASM_PGTABLE_H_ */
diff --git a/sys/ofed/include/asm/semaphore.h b/sys/ofed/include/asm/semaphore.h
new file mode 100644
index 0000000..a60ba8c
--- /dev/null
+++ b/sys/ofed/include/asm/semaphore.h
@@ -0,0 +1,34 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _ASM_SEMAPHORE_H_
+#define _ASM_SEMAPHORE_H_
+
+#include <linux/semaphore.h>
+
+#endif /* _ASM_SEMAPHORE_H_ */
diff --git a/sys/ofed/include/asm/system.h b/sys/ofed/include/asm/system.h
new file mode 100644
index 0000000..e5d814e
--- /dev/null
+++ b/sys/ofed/include/asm/system.h
@@ -0,0 +1,27 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
diff --git a/sys/ofed/include/asm/types.h b/sys/ofed/include/asm/types.h
new file mode 100644
index 0000000..70dd2be
--- /dev/null
+++ b/sys/ofed/include/asm/types.h
@@ -0,0 +1,67 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _ASM_TYPES_H_
+#define _ASM_TYPES_H_
+
+typedef unsigned short umode_t;
+
+typedef __signed__ char __s8;
+typedef unsigned char __u8;
+
+typedef __signed__ short __s16;
+typedef unsigned short __u16;
+
+typedef __signed__ int __s32;
+typedef unsigned int __u32;
+
+#if defined(__GNUC__) // && !defined(__STRICT_ANSI__)
+typedef __signed__ long long __s64;
+typedef unsigned long long __u64;
+#endif
+
+#ifdef _KERNEL
+
+typedef signed char s8;
+typedef unsigned char u8;
+
+typedef signed short s16;
+typedef unsigned short u16;
+
+typedef signed int s32;
+typedef unsigned int u32;
+
+typedef signed long long s64;
+typedef unsigned long long u64;
+
+/* DMA addresses come in generic and 64-bit flavours. */
+typedef vm_paddr_t dma_addr_t;
+typedef vm_paddr_t dma64_addr_t;
+
+#endif /* _KERNEL */
+
+#endif /* _ASM_TYPES_H_ */
diff --git a/sys/ofed/include/asm/uaccess.h b/sys/ofed/include/asm/uaccess.h
new file mode 100644
index 0000000..b7c32fa
--- /dev/null
+++ b/sys/ofed/include/asm/uaccess.h
@@ -0,0 +1,49 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _ASM_UACCESS_H_
+#define _ASM_UACCESS_H_
+
+#include <linux/uaccess.h>
+
+static inline long
+copy_to_user(void *to, const void *from, unsigned long n)
+{
+ if (copyout(from, to, n) != 0)
+ return n;
+ return 0;
+}
+
+static inline long
+copy_from_user(void *to, const void *from, unsigned long n)
+{
+ if (copyin(from, to, n) != 0)
+ return n;
+ return 0;
+}
+
+#endif /* _ASM_UACCESS_H_ */
diff --git a/sys/ofed/include/linux/bitmap.h b/sys/ofed/include/linux/bitmap.h
new file mode 100644
index 0000000..66059ac
--- /dev/null
+++ b/sys/ofed/include/linux/bitmap.h
@@ -0,0 +1,34 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_BITMAP_H_
+#define _LINUX_BITMAP_H_
+
+#include <linux/bitops.h>
+#include <linux/string.h>
+
+#endif /* _LINUX_BITMAP_H_ */
diff --git a/sys/ofed/include/linux/bitops.h b/sys/ofed/include/linux/bitops.h
new file mode 100644
index 0000000..4305a3a
--- /dev/null
+++ b/sys/ofed/include/linux/bitops.h
@@ -0,0 +1,312 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_BITOPS_H_
+#define _LINUX_BITOPS_H_
+
+#ifdef __LP64__
+#define BITS_PER_LONG 64
+#else
+#define BITS_PER_LONG 32
+#endif
+#define BIT_MASK(n) (~0UL >> (BITS_PER_LONG - (n)))
+#define BITS_TO_LONGS(n) howmany((n), BITS_PER_LONG)
+
+static inline int
+__ffs(int mask)
+{
+ return (ffs(mask) - 1);
+}
+
+static inline int
+__fls(int mask)
+{
+ return (fls(mask) - 1);
+}
+
+static inline int
+__ffsl(long mask)
+{
+ return (ffsl(mask) - 1);
+}
+
+static inline int
+__flsl(long mask)
+{
+ return (flsl(mask) - 1);
+}
+
+
+#define ffz(mask) __ffs(~(mask))
+
+static inline unsigned long
+find_first_bit(unsigned long *addr, unsigned long size)
+{
+ long mask;
+ int bit;
+
+ for (bit = 0; size >= BITS_PER_LONG;
+ size -= BITS_PER_LONG, bit += BITS_PER_LONG, addr++) {
+ if (*addr == 0)
+ continue;
+ return (bit + __ffsl(*addr));
+ }
+ if (size) {
+ mask = (*addr) & BIT_MASK(size);
+ if (mask)
+ bit += __ffsl(mask);
+ else
+ bit += size;
+ }
+ return (bit);
+}
+
+static inline unsigned long
+find_first_zero_bit(unsigned long *addr, unsigned long size)
+{
+ long mask;
+ int bit;
+
+ for (bit = 0; size >= BITS_PER_LONG;
+ size -= BITS_PER_LONG, bit += BITS_PER_LONG, addr++) {
+ if (~(*addr) == 0)
+ continue;
+ return (bit + __ffsl(~(*addr)));
+ }
+ if (size) {
+ mask = ~(*addr) & BIT_MASK(size);
+ if (mask)
+ bit += __ffsl(mask);
+ else
+ bit += size;
+ }
+ return (bit);
+}
+
+static inline unsigned long
+find_last_bit(unsigned long *addr, unsigned long size)
+{
+ long mask;
+ int offs;
+ int bit;
+ int pos;
+
+ pos = size / BITS_PER_LONG;
+ offs = size % BITS_PER_LONG;
+ bit = BITS_PER_LONG * pos;
+ addr += pos;
+ if (offs) {
+ mask = (*addr) & BIT_MASK(offs);
+ if (mask)
+ return (bit + __flsl(mask));
+ }
+ while (--pos) {
+ addr--;
+ bit -= BITS_PER_LONG;
+ if (*addr)
+ return (bit + __flsl(mask));
+ }
+ return (size);
+}
+
+static inline unsigned long
+find_next_bit(unsigned long *addr, unsigned long size, unsigned long offset)
+{
+ long mask;
+ int offs;
+ int bit;
+ int pos;
+
+ if (offset >= size)
+ return (size);
+ pos = offset / BITS_PER_LONG;
+ offs = offset % BITS_PER_LONG;
+ bit = BITS_PER_LONG * pos;
+ addr += pos;
+ if (offs) {
+ mask = (*addr) & ~BIT_MASK(offs);
+ if (mask)
+ return (bit + __ffsl(mask));
+ bit += BITS_PER_LONG;
+ addr++;
+ }
+ for (size -= bit; size >= BITS_PER_LONG;
+ size -= BITS_PER_LONG, bit += BITS_PER_LONG, addr++) {
+ if (*addr == 0)
+ continue;
+ return (bit + __ffsl(*addr));
+ }
+ if (size) {
+ mask = (*addr) & BIT_MASK(size);
+ if (mask)
+ bit += __ffsl(mask);
+ else
+ bit += size;
+ }
+ return (bit);
+}
+
+static inline unsigned long
+find_next_zero_bit(unsigned long *addr, unsigned long size,
+ unsigned long offset)
+{
+ long mask;
+ int offs;
+ int bit;
+ int pos;
+
+ if (offset >= size)
+ return (size);
+ pos = offset / BITS_PER_LONG;
+ offs = offset % BITS_PER_LONG;
+ bit = BITS_PER_LONG * pos;
+ addr += pos;
+ if (offs) {
+ mask = ~(*addr) & ~BIT_MASK(offs);
+ if (mask)
+ return (bit + __ffsl(mask));
+ bit += BITS_PER_LONG;
+ addr++;
+ }
+ for (size -= bit; size >= BITS_PER_LONG;
+ size -= BITS_PER_LONG, bit += BITS_PER_LONG, addr++) {
+ if (~(*addr) == 0)
+ continue;
+ return (bit + __ffsl(~(*addr)));
+ }
+ if (size) {
+ mask = ~(*addr) & BIT_MASK(size);
+ if (mask)
+ bit += __ffsl(mask);
+ else
+ bit += size;
+ }
+ return (bit);
+}
+
+static inline void
+bitmap_zero(unsigned long *addr, int size)
+{
+ int len;
+
+ len = BITS_TO_LONGS(size) * sizeof(long);
+ memset(addr, 0, len);
+}
+
+static inline void
+bitmap_fill(unsigned long *addr, int size)
+{
+ int tail;
+ int len;
+
+ len = (size / BITS_PER_LONG) * sizeof(long);
+ memset(addr, 0xff, len);
+ tail = size & (BITS_PER_LONG - 1);
+ if (tail)
+ addr[size / BITS_PER_LONG] = BIT_MASK(tail);
+}
+
+static inline int
+bitmap_full(unsigned long *addr, int size)
+{
+ long mask;
+ int tail;
+ int len;
+ int i;
+
+ len = size / BITS_PER_LONG;
+ for (i = 0; i < len; i++)
+ if (addr[i] != ~0UL)
+ return (0);
+ tail = size & (BITS_PER_LONG - 1);
+ if (tail) {
+ mask = BIT_MASK(tail);
+ if ((addr[i] & mask) != mask)
+ return (0);
+ }
+ return (1);
+}
+
+static inline int
+bitmap_empty(unsigned long *addr, int size)
+{
+ long mask;
+ int tail;
+ int len;
+ int i;
+
+ len = size / BITS_PER_LONG;
+ for (i = 0; i < len; i++)
+ if (addr[i] != 0)
+ return (0);
+ tail = size & (BITS_PER_LONG - 1);
+ if (tail) {
+ mask = BIT_MASK(tail);
+ if ((addr[i] & mask) != 0)
+ return (0);
+ }
+ return (1);
+}
+
+#define NBINT (NBBY * sizeof(int))
+
+#define set_bit(i, a) \
+ atomic_set_int(&((volatile int *)(a))[(i)/NBINT], 1 << (i) % NBINT)
+
+#define clear_bit(i, a) \
+ atomic_clear_int(&((volatile int *)(a))[(i)/NBINT], 1 << (i) % NBINT)
+
+#define test_bit(i, a) \
+ !!(atomic_load_acq_int(&((volatile int *)(a))[(i)/NBINT]) & 1 << ((i) % NBINT))
+
+static inline long
+test_and_clear_bit(long bit, long *var)
+{
+ long val;
+
+ bit = 1 << bit;
+ do {
+ val = *(volatile long *)var;
+ } while (atomic_cmpset_long(var, val, val & ~bit) == 0);
+
+ return !!(val & bit);
+}
+
+static inline long
+test_and_set_bit(long bit, long *var)
+{
+ long val;
+
+ bit = 1 << bit;
+ do {
+ val = *(volatile long *)var;
+ } while (atomic_cmpset_long(var, val, val | bit) == 0);
+
+ return !!(val & bit);
+}
+
+#endif /* _LINUX_BITOPS_H_ */
diff --git a/sys/ofed/include/linux/cdev.h b/sys/ofed/include/linux/cdev.h
new file mode 100644
index 0000000..cc77495
--- /dev/null
+++ b/sys/ofed/include/linux/cdev.h
@@ -0,0 +1,129 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_CDEV_H_
+#define _LINUX_CDEV_H_
+
+#include <linux/kobject.h>
+#include <linux/kdev_t.h>
+#include <linux/list.h>
+
+struct file_operations;
+struct inode;
+struct module;
+
+extern struct cdevsw linuxcdevsw;
+
+struct linux_cdev {
+ struct kobject kobj;
+ struct module *owner;
+ struct cdev *cdev;
+ dev_t dev;
+ const struct file_operations *ops;
+};
+
+static inline void
+cdev_release(struct kobject *kobj)
+{
+ struct linux_cdev *cdev;
+
+ cdev = container_of(kobj, struct linux_cdev, kobj);
+ if (cdev->cdev)
+ destroy_dev(cdev->cdev);
+ kfree(cdev);
+}
+
+static inline void
+cdev_static_release(struct kobject *kobj)
+{
+ struct linux_cdev *cdev;
+
+ cdev = container_of(kobj, struct linux_cdev, kobj);
+ if (cdev->cdev)
+ destroy_dev(cdev->cdev);
+}
+
+static struct kobj_type cdev_ktype = {
+ .release = cdev_release,
+};
+
+static struct kobj_type cdev_static_ktype = {
+ .release = cdev_static_release,
+};
+
+static inline void
+cdev_init(struct linux_cdev *cdev, const struct file_operations *ops)
+{
+
+ kobject_init(&cdev->kobj, &cdev_static_ktype);
+ cdev->ops = ops;
+}
+
+static inline struct linux_cdev *
+cdev_alloc(void)
+{
+ struct linux_cdev *cdev;
+
+ cdev = kzalloc(sizeof(struct linux_cdev), M_WAITOK);
+ if (cdev)
+ kobject_init(&cdev->kobj, &cdev_ktype);
+ return (cdev);
+}
+
+static inline void
+cdev_put(struct linux_cdev *p)
+{
+ kobject_put(&p->kobj);
+}
+
+static inline int
+cdev_add(struct linux_cdev *cdev, dev_t dev, unsigned count)
+{
+ if (count != 1)
+ panic("cdev_add: Unsupported count: %d", count);
+ cdev->cdev = make_dev(&linuxcdevsw, MINOR(dev), 0, 0, 0700,
+ kobject_name(&cdev->kobj));
+ cdev->dev = dev;
+ cdev->cdev->si_drv1 = cdev;
+
+ return (0);
+}
+
+static inline void
+cdev_del(struct linux_cdev *cdev)
+{
+ if (cdev->cdev) {
+ destroy_dev(cdev->cdev);
+ cdev->cdev = NULL;
+ }
+ kobject_put(&cdev->kobj);
+}
+
+#define cdev linux_cdev
+
+#endif /* _LINUX_CDEV_H_ */
diff --git a/sys/ofed/include/linux/compat.h b/sys/ofed/include/linux/compat.h
new file mode 100644
index 0000000..cfb1671
--- /dev/null
+++ b/sys/ofed/include/linux/compat.h
@@ -0,0 +1,33 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_COMPAT_H_
+#define _LINUX_COMPAT_H_
+
+
+#endif /* _LINUX_COMPAT_H_ */
diff --git a/sys/ofed/include/linux/compiler.h b/sys/ofed/include/linux/compiler.h
new file mode 100644
index 0000000..12938ba
--- /dev/null
+++ b/sys/ofed/include/linux/compiler.h
@@ -0,0 +1,65 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_COMPILER_H_
+#define _LINUX_COMPILER_H_
+
+#include <sys/cdefs.h>
+
+#define __user
+#define __kernel
+#define __safe
+#define __force
+#define __nocast
+#define __iomem
+#define __chk_user_ptr(x) 0
+#define __chk_io_ptr(x) 0
+#define __builtin_warning(x, y...) (1)
+#define __acquires(x)
+#define __releases(x)
+#define __acquire(x) 0
+#define __release(x) 0
+#define __cond_lock(x,c) (c)
+#define __bitwise
+#define __devinitdata
+#define __init
+#define __devinit
+#define __devexit
+#define __exit
+#define __stringify(x) #x
+#define __attribute_const__ __attribute__((__const__))
+#undef __always_inline
+#define __always_inline inline
+
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+#define typeof(x) __typeof(x)
+
+#define uninitialized_var(x) x = x
+
+#endif /* _LINUX_COMPILER_H_ */
diff --git a/sys/ofed/include/linux/completion.h b/sys/ofed/include/linux/completion.h
new file mode 100644
index 0000000..59f36b0
--- /dev/null
+++ b/sys/ofed/include/linux/completion.h
@@ -0,0 +1,155 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_COMPLETION_H_
+#define _LINUX_COMPLETION_H_
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sleepqueue.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+
+struct completion {
+ unsigned int done;
+};
+
+#define INIT_COMPLETION(c) ((c).done = 0)
+#define init_completion(c) ((c)->done = 0)
+
+static inline void
+_complete_common(struct completion *c, int all)
+{
+ int wakeup_swapper;
+
+ sleepq_lock(c);
+ c->done++;
+ if (all)
+ wakeup_swapper = sleepq_broadcast(c, SLEEPQ_SLEEP, 0, 0);
+ else
+ wakeup_swapper = sleepq_signal(c, SLEEPQ_SLEEP, 0, 0);
+ sleepq_release(c);
+ if (wakeup_swapper)
+ kick_proc0();
+}
+
+#define complete(c) _complete_common(c, 0)
+#define complete_all(c) _complete_common(c, 1)
+
+/*
+ * Indefinite wait for done != 0 with or without signals.
+ */
+static inline long
+_wait_for_common(struct completion *c, int flags)
+{
+
+ flags |= SLEEPQ_SLEEP;
+ for (;;) {
+ sleepq_lock(c);
+ if (c->done)
+ break;
+ sleepq_add(c, NULL, "completion", flags, 0);
+ if (flags & SLEEPQ_INTERRUPTIBLE) {
+ if (sleepq_wait_sig(c, 0) != 0)
+ return (-ERESTARTSYS);
+ } else
+ sleepq_wait(c, 0);
+ }
+ c->done--;
+ sleepq_release(c);
+
+ return (0);
+}
+
+#define wait_for_completion(c) _wait_for_common(c, 0)
+#define wait_for_completion_interuptible(c) \
+ _wait_for_common(c, SLEEPQ_INTERRUPTIBLE)
+
+static inline long
+_wait_for_timeout_common(struct completion *c, long timeout, int flags)
+{
+ long end;
+
+ end = ticks + timeout;
+ flags |= SLEEPQ_SLEEP;
+ for (;;) {
+ sleepq_lock(c);
+ if (c->done)
+ break;
+ sleepq_add(c, NULL, "completion", flags, 0);
+ sleepq_set_timeout(c, end - ticks);
+ if (flags & SLEEPQ_INTERRUPTIBLE) {
+ if (sleepq_timedwait_sig(c, 0) != 0)
+ return (-ERESTARTSYS);
+ } else
+ sleepq_timedwait(c, 0);
+ }
+ c->done--;
+ sleepq_release(c);
+ timeout = end - ticks;
+
+ return (timeout > 0 ? timeout : 1);
+}
+
+#define wait_for_completion_timeout(c, timeout) \
+ _wait_for_timeout_common(c, timeout, 0)
+#define wait_for_completion_interruptible_timeout(c, timeout) \
+ _wait_for_timeout_common(c, timeout, SLEEPQ_INTERRUPTIBLE)
+
+static inline int
+try_wait_for_completion(struct completion *c)
+{
+ int isdone;
+
+ isdone = 1;
+ sleepq_lock(c);
+ if (c->done)
+ c->done--;
+ else
+ isdone = 0;
+ sleepq_release(c);
+ return (isdone);
+}
+
+static inline int
+completion_done(struct completion *c)
+{
+ int isdone;
+
+ isdone = 1;
+ sleepq_lock(c);
+ if (c->done == 0)
+ isdone = 0;
+ sleepq_release(c);
+ return (isdone);
+}
+
+#endif /* _LINUX_COMPLETION_H_ */
diff --git a/sys/ofed/include/linux/ctype.h b/sys/ofed/include/linux/ctype.h
new file mode 100644
index 0000000..3ed4137
--- /dev/null
+++ b/sys/ofed/include/linux/ctype.h
@@ -0,0 +1,34 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_CTYPE_H_
+#define _LINUX_CTYPE_H_
+
+#include <sys/ctype.h>
+
+#endif /* _LINUX_CTYPE_H_ */
diff --git a/sys/ofed/include/linux/delay.h b/sys/ofed/include/linux/delay.h
new file mode 100644
index 0000000..019ef8a
--- /dev/null
+++ b/sys/ofed/include/linux/delay.h
@@ -0,0 +1,43 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_DELAY_H_
+#define _LINUX_DELAY_H_
+
+#include <linux/jiffies.h>
+
+static inline void
+linux_msleep(int ms)
+{
+ pause("lnxsleep", msecs_to_jiffies(ms));
+}
+
+#undef msleep
+#define msleep linux_msleep
+
+#endif /* _LINUX_DELAY_H_ */
diff --git a/sys/ofed/include/linux/device.h b/sys/ofed/include/linux/device.h
new file mode 100644
index 0000000..cce46ca
--- /dev/null
+++ b/sys/ofed/include/linux/device.h
@@ -0,0 +1,388 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_DEVICE_H_
+#define _LINUX_DEVICE_H_
+
+#include <linux/types.h>
+#include <linux/kobject.h>
+#include <linux/list.h>
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/workqueue.h>
+#include <linux/sysfs.h>
+#include <linux/kdev_t.h>
+#include <asm/atomic.h>
+
+#include <sys/bus.h>
+
+enum irqreturn { IRQ_NONE = 0, IRQ_HANDLED, IRQ_WAKE_THREAD, };
+typedef enum irqreturn irqreturn_t;
+
+struct class {
+ const char *name;
+ struct module *owner;
+ struct kobject kobj;
+ devclass_t bsdclass;
+ void (*class_release)(struct class *class);
+ void (*dev_release)(struct device *dev);
+};
+
+struct device {
+ struct device *parent;
+ struct list_head irqents;
+ device_t bsddev;
+ dev_t devt;
+ struct class *class;
+ void (*release)(struct device *dev);
+ struct kobject kobj;
+ uint64_t *dma_mask;
+ void *driver_data;
+ unsigned int irq;
+ unsigned int msix;
+ unsigned int msix_max;
+};
+
+extern struct device linux_rootdev;
+extern struct kobject class_root;
+
+struct class_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct class *, char *);
+ ssize_t (*store)(struct class *, const char *, size_t);
+};
+#define CLASS_ATTR(_name, _mode, _show, _store) \
+ struct class_attribute class_attr_##_name = \
+ { { #_name, NULL, _mode }, _show, _store }
+
+struct device_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct device *,
+ struct device_attribute *, char *);
+ ssize_t (*store)(struct device *,
+ struct device_attribute *, const char *,
+ size_t);
+};
+
+#define DEVICE_ATTR(_name, _mode, _show, _store) \
+ struct device_attribute dev_attr_##_name = \
+ { { #_name, NULL, _mode }, _show, _store }
+
+#define dev_err(dev, fmt, ...) device_printf((dev)->bsddev, fmt, ##__VA_ARGS__)
+#define dev_warn(dev, fmt, ...) device_printf((dev)->bsddev, fmt, ##__VA_ARGS__)
+#define dev_info(dev, fmt, ...) device_printf((dev)->bsddev, fmt, ##__VA_ARGS__)
+#define dev_printk(lvl, dev, fmt, ...) \
+ device_printf((dev)->bsddev, fmt, ##__VA_ARGS__)
+
+static inline void *
+dev_get_drvdata(struct device *dev)
+{
+
+ return dev->driver_data;
+}
+
+static inline void
+dev_set_drvdata(struct device *dev, void *data)
+{
+
+ dev->driver_data = data;
+}
+
+static inline struct device *
+get_device(struct device *dev)
+{
+
+ if (dev)
+ kobject_get(&dev->kobj);
+
+ return (dev);
+}
+
+static inline char *
+dev_name(const struct device *dev)
+{
+
+ return kobject_name(&dev->kobj);
+}
+
+#define dev_set_name(_dev, _fmt, ...) \
+ kobject_set_name(&(_dev)->kobj, (_fmt), ##__VA_ARGS__)
+
+static inline void
+put_device(struct device *dev)
+{
+
+ if (dev)
+ kobject_put(&dev->kobj);
+}
+
+static inline ssize_t
+class_show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+ struct class_attribute *dattr;
+ ssize_t error;
+
+ dattr = container_of(attr, struct class_attribute, attr);
+ error = -EIO;
+ if (dattr->show)
+ error = dattr->show(container_of(kobj, struct class, kobj),
+ buf);
+ return (error);
+}
+
+static inline ssize_t
+class_store(struct kobject *kobj, struct attribute *attr, const char *buf,
+ size_t count)
+{
+ struct class_attribute *dattr;
+ ssize_t error;
+
+ dattr = container_of(attr, struct class_attribute, attr);
+ error = -EIO;
+ if (dattr->store)
+ error = dattr->store(container_of(kobj, struct class, kobj),
+ buf, count);
+ return (error);
+}
+
+static inline void
+class_release(struct kobject *kobj)
+{
+ struct class *class;
+
+ class = container_of(kobj, struct class, kobj);
+ if (class->class_release)
+ class->class_release(class);
+}
+
+static struct sysfs_ops class_sysfs = {
+ .show = class_show,
+ .store = class_store,
+};
+static struct kobj_type class_ktype = {
+ .release = class_release,
+ .sysfs_ops = &class_sysfs
+};
+
+static inline int
+class_register(struct class *class)
+{
+
+ class->bsdclass = devclass_create(class->name);
+ kobject_init(&class->kobj, &class_ktype);
+ kobject_set_name(&class->kobj, class->name);
+ kobject_add(&class->kobj, &class_root, class->name);
+
+ return (0);
+}
+
+static inline void
+class_unregister(struct class *class)
+{
+
+ kobject_put(&class->kobj);
+}
+
+static inline void
+device_release(struct kobject *kobj)
+{
+ struct device *dev;
+
+ dev = container_of(kobj, struct device, kobj);
+ /* This is the precedence defined by linux. */
+ if (dev->release)
+ dev->release(dev);
+ else if (dev->class && dev->class->dev_release)
+ dev->class->dev_release(dev);
+}
+
+static inline ssize_t
+dev_show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+ struct device_attribute *dattr;
+ ssize_t error;
+
+ dattr = container_of(attr, struct device_attribute, attr);
+ error = -EIO;
+ if (dattr->show)
+ error = dattr->show(container_of(kobj, struct device, kobj),
+ dattr, buf);
+ return (error);
+}
+
+static inline ssize_t
+dev_store(struct kobject *kobj, struct attribute *attr, const char *buf,
+ size_t count)
+{
+ struct device_attribute *dattr;
+ ssize_t error;
+
+ dattr = container_of(attr, struct device_attribute, attr);
+ error = -EIO;
+ if (dattr->store)
+ error = dattr->store(container_of(kobj, struct device, kobj),
+ dattr, buf, count);
+ return (error);
+}
+
+static struct sysfs_ops dev_sysfs = { .show = dev_show, .store = dev_store, };
+static struct kobj_type dev_ktype = {
+ .release = device_release,
+ .sysfs_ops = &dev_sysfs
+};
+
+/*
+ * Devices are registered and created for exporting to sysfs. create
+ * implies register and register assumes the device fields have been
+ * setup appropriately before being called.
+ */
+static inline int
+device_register(struct device *dev)
+{
+ device_t bsddev;
+ int unit;
+
+ bsddev = NULL;
+ if (dev->devt) {
+ unit = MINOR(dev->devt);
+ bsddev = devclass_get_device(dev->class->bsdclass, unit);
+ } else
+ unit = -1;
+ if (bsddev == NULL)
+ bsddev = device_add_child(dev->parent->bsddev,
+ dev->class->kobj.name, unit);
+ if (bsddev) {
+ if (dev->devt == 0)
+ dev->devt = makedev(0, device_get_unit(bsddev));
+ device_set_softc(bsddev, dev);
+ }
+ dev->bsddev = bsddev;
+ kobject_init(&dev->kobj, &dev_ktype);
+ kobject_add(&dev->kobj, &dev->class->kobj, dev_name(dev));
+
+ return (0);
+}
+
+static inline void
+device_unregister(struct device *dev)
+{
+ device_t bsddev;
+
+ bsddev = dev->bsddev;
+ mtx_lock(&Giant);
+ if (bsddev)
+ device_delete_child(device_get_parent(bsddev), bsddev);
+ mtx_unlock(&Giant);
+ put_device(dev);
+}
+
+struct device *device_create(struct class *class, struct device *parent,
+ dev_t devt, void *drvdata, const char *fmt, ...);
+
+static inline void
+device_destroy(struct class *class, dev_t devt)
+{
+ device_t bsddev;
+ int unit;
+
+ unit = MINOR(devt);
+ bsddev = devclass_get_device(class->bsdclass, unit);
+ if (bsddev)
+ device_unregister(device_get_softc(bsddev));
+}
+
+static inline void
+class_kfree(struct class *class)
+{
+
+ kfree(class);
+}
+
+static inline struct class *
+class_create(struct module *owner, const char *name)
+{
+ struct class *class;
+ int error;
+
+ class = kzalloc(sizeof(*class), M_WAITOK);
+ class->owner = owner;
+ class->name= name;
+ class->class_release = class_kfree;
+ error = class_register(class);
+ if (error) {
+ kfree(class);
+ return (NULL);
+ }
+
+ return (class);
+}
+
+static inline void
+class_destroy(struct class *class)
+{
+
+ if (class == NULL)
+ return;
+ class_unregister(class);
+}
+
+static inline int
+device_create_file(struct device *dev, const struct device_attribute *attr)
+{
+
+ if (dev)
+ return sysfs_create_file(&dev->kobj, &attr->attr);
+ return -EINVAL;
+}
+
+static inline void
+device_remove_file(struct device *dev, const struct device_attribute *attr)
+{
+
+ if (dev)
+ sysfs_remove_file(&dev->kobj, &attr->attr);
+}
+
+static inline int
+class_create_file(struct class *class, const struct class_attribute *attr)
+{
+
+ if (class)
+ return sysfs_create_file(&class->kobj, &attr->attr);
+ return -EINVAL;
+}
+
+static inline void
+class_remove_file(struct class *class, const struct class_attribute *attr)
+{
+
+ if (class)
+ sysfs_remove_file(&class->kobj, &attr->attr);
+}
+
+#endif /* _LINUX_DEVICE_H_ */
diff --git a/sys/ofed/include/linux/dma-attrs.h b/sys/ofed/include/linux/dma-attrs.h
new file mode 100644
index 0000000..9e625bd
--- /dev/null
+++ b/sys/ofed/include/linux/dma-attrs.h
@@ -0,0 +1,47 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_DMA_ATTR_H_
+#define _LINUX_DMA_ATTR_H_
+
+enum dma_attr { DMA_ATTR_WRITE_BARRIER, DMA_ATTR_WEAK_ORDERING, DMA_ATTR_MAX, };
+
+#define __DMA_ATTRS_LONGS BITS_TO_LONGS(DMA_ATTR_MAX)
+
+struct dma_attrs {
+ unsigned long flags;
+};
+
+#define DEFINE_DMA_ATTRS(x) struct dma_attrs x = { }
+
+static inline void
+init_dma_attrs(struct dma_attrs *attrs)
+{
+ attrs->flags = 0;
+}
+
+#endif /* _LINUX_DMA_ATTR_H_ */
diff --git a/sys/ofed/include/linux/dma-mapping.h b/sys/ofed/include/linux/dma-mapping.h
new file mode 100644
index 0000000..c653524
--- /dev/null
+++ b/sys/ofed/include/linux/dma-mapping.h
@@ -0,0 +1,263 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_DMA_MAPPING_H_
+#define _LINUX_DMA_MAPPING_H_
+
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/dma-attrs.h>
+#include <linux/scatterlist.h>
+#include <linux/mm.h>
+#include <linux/page.h>
+
+#include <sys/systm.h>
+#include <sys/malloc.h>
+
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+#include <vm/pmap.h>
+
+#include <machine/bus.h>
+#include <machine/pmap.h>
+
+enum dma_data_direction {
+ DMA_BIDIRECTIONAL = 0,
+ DMA_TO_DEVICE = 1,
+ DMA_FROM_DEVICE = 2,
+ DMA_NONE = 3,
+};
+
+struct dma_map_ops {
+ void* (*alloc_coherent)(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, gfp_t gfp);
+ void (*free_coherent)(struct device *dev, size_t size,
+ void *vaddr, dma_addr_t dma_handle);
+ dma_addr_t (*map_page)(struct device *dev, struct page *page,
+ unsigned long offset, size_t size, enum dma_data_direction dir,
+ struct dma_attrs *attrs);
+ void (*unmap_page)(struct device *dev, dma_addr_t dma_handle,
+ size_t size, enum dma_data_direction dir, struct dma_attrs *attrs);
+ int (*map_sg)(struct device *dev, struct scatterlist *sg,
+ int nents, enum dma_data_direction dir, struct dma_attrs *attrs);
+ void (*unmap_sg)(struct device *dev, struct scatterlist *sg, int nents,
+ enum dma_data_direction dir, struct dma_attrs *attrs);
+ void (*sync_single_for_cpu)(struct device *dev, dma_addr_t dma_handle,
+ size_t size, enum dma_data_direction dir);
+ void (*sync_single_for_device)(struct device *dev,
+ dma_addr_t dma_handle, size_t size, enum dma_data_direction dir);
+ void (*sync_single_range_for_cpu)(struct device *dev,
+ dma_addr_t dma_handle, unsigned long offset, size_t size,
+ enum dma_data_direction dir);
+ void (*sync_single_range_for_device)(struct device *dev,
+ dma_addr_t dma_handle, unsigned long offset, size_t size,
+ enum dma_data_direction dir);
+ void (*sync_sg_for_cpu)(struct device *dev, struct scatterlist *sg,
+ int nents, enum dma_data_direction dir);
+ void (*sync_sg_for_device)(struct device *dev, struct scatterlist *sg,
+ int nents, enum dma_data_direction dir);
+ int (*mapping_error)(struct device *dev, dma_addr_t dma_addr);
+ int (*dma_supported)(struct device *dev, u64 mask);
+ int is_phys;
+};
+
+#define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL << (n)) - 1))
+
+static inline int
+dma_supported(struct device *dev, u64 mask)
+{
+
+ /* XXX busdma takes care of this elsewhere. */
+ return (1);
+}
+
+static inline int
+dma_set_mask(struct device *dev, u64 dma_mask)
+{
+
+ if (!dev->dma_mask || !dma_supported(dev, dma_mask))
+ return -EIO;
+
+ *dev->dma_mask = dma_mask;
+ return (0);
+}
+
+static inline int
+dma_set_coherent_mask(struct device *dev, u64 mask)
+{
+
+ if (!dma_supported(dev, mask))
+ return -EIO;
+ /* XXX Currently we don't support a seperate coherent mask. */
+ return 0;
+}
+
+static inline void *
+dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
+ gfp_t flag)
+{
+ vm_paddr_t high;
+ size_t align;
+ void *mem;
+
+ if (dev->dma_mask)
+ high = *dev->dma_mask;
+ else
+ high = BUS_SPACE_MAXADDR_32BIT;
+ align = PAGE_SIZE << get_order(size);
+ mem = (void *)kmem_alloc_contig(kmem_map, size, flag, 0, high, align,
+ 0, VM_MEMATTR_DEFAULT);
+ if (mem)
+ *dma_handle = vtophys(mem);
+ else
+ *dma_handle = 0;
+ return (mem);
+}
+
+static inline void
+dma_free_coherent(struct device *dev, size_t size, void *cpu_addr,
+ dma_addr_t dma_handle)
+{
+
+ kmem_free(kmem_map, (vm_offset_t)cpu_addr, size);
+}
+
+/* XXX This only works with no iommu. */
+static inline dma_addr_t
+dma_map_single_attrs(struct device *dev, void *ptr, size_t size,
+ enum dma_data_direction dir, struct dma_attrs *attrs)
+{
+
+ return vtophys(ptr);
+}
+
+static inline void
+dma_unmap_single_attrs(struct device *dev, dma_addr_t addr, size_t size,
+ enum dma_data_direction dir, struct dma_attrs *attrs)
+{
+}
+
+static inline int
+dma_map_sg_attrs(struct device *dev, struct scatterlist *sgl, int nents,
+ enum dma_data_direction dir, struct dma_attrs *attrs)
+{
+ struct scatterlist *sg;
+ int i;
+
+ for_each_sg(sgl, sg, nents, i)
+ sg_dma_address(sg) = sg_phys(sg);
+
+ return (nents);
+}
+
+static inline void
+dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, int nents,
+ enum dma_data_direction dir, struct dma_attrs *attrs)
+{
+}
+
+static inline dma_addr_t
+dma_map_page(struct device *dev, struct page *page,
+ unsigned long offset, size_t size, enum dma_data_direction direction)
+{
+
+ return VM_PAGE_TO_PHYS(page) + offset;
+}
+
+static inline void
+dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
+ enum dma_data_direction direction)
+{
+}
+
+static inline void
+dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
+ enum dma_data_direction direction)
+{
+}
+
+static inline void
+dma_sync_single(struct device *dev, dma_addr_t addr, size_t size,
+ enum dma_data_direction dir)
+{
+ dma_sync_single_for_cpu(dev, addr, size, dir);
+}
+
+static inline void
+dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle,
+ size_t size, enum dma_data_direction direction)
+{
+}
+
+static inline void
+dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
+ enum dma_data_direction direction)
+{
+}
+
+static inline void
+dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
+ enum dma_data_direction direction)
+{
+}
+
+static inline void
+dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle,
+ unsigned long offset, size_t size, int direction)
+{
+}
+
+static inline void
+dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle,
+ unsigned long offset, size_t size, int direction)
+{
+}
+
+static inline int
+dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+
+ return (0);
+}
+
+#define dma_map_single(d, a, s, r) dma_map_single_attrs(d, a, s, r, NULL)
+#define dma_unmap_single(d, a, s, r) dma_unmap_single_attrs(d, a, s, r, NULL)
+#define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, NULL)
+#define dma_unmap_sg(d, s, n, r) dma_unmap_sg_attrs(d, s, n, r, NULL)
+
+#define DEFINE_DMA_UNMAP_ADDR(name) dma_addr_t name
+#define DEFINE_DMA_UNMAP_LEN(name) __u32 name
+#define dma_unmap_addr(p, name) ((p)->name)
+#define dma_unmap_addr_set(p, name, v) (((p)->name) = (v))
+#define dma_unmap_len(p, name) ((p)->name)
+#define dma_unmap_len_set(p, name, v) (((p)->name) = (v))
+
+extern int uma_align_cache;
+#define dma_get_cache_alignment() uma_align_cache
+
+#endif /* _LINUX_DMA_MAPPING_H_ */
diff --git a/sys/ofed/include/linux/dmapool.h b/sys/ofed/include/linux/dmapool.h
new file mode 100644
index 0000000..3b58164
--- /dev/null
+++ b/sys/ofed/include/linux/dmapool.h
@@ -0,0 +1,85 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_DMAPOOL_H_
+#define _LINUX_DMAPOOL_H_
+
+#include <linux/types.h>
+#include <linux/io.h>
+#include <linux/scatterlist.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+
+struct dma_pool {
+ uma_zone_t pool_zone;
+};
+
+static inline struct dma_pool *
+dma_pool_create(char *name, struct device *dev, size_t size,
+ size_t align, size_t boundary)
+{
+ struct dma_pool *pool;
+
+ pool = kmalloc(sizeof(*pool), GFP_KERNEL);
+ align--;
+ /*
+ * XXX Eventually this could use a seperate allocf to honor boundary
+ * and physical address requirements of the device.
+ */
+ pool->pool_zone = uma_zcreate(name, size, NULL, NULL, NULL, NULL,
+ align, UMA_ZONE_OFFPAGE|UMA_ZONE_HASH);
+
+ return (pool);
+}
+
+static inline void
+dma_pool_destroy(struct dma_pool *pool)
+{
+ uma_zdestroy(pool->pool_zone);
+ kfree(pool);
+}
+
+static inline void *
+dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, dma_addr_t *handle)
+{
+ void *vaddr;
+
+ vaddr = uma_zalloc(pool->pool_zone, mem_flags);
+ if (vaddr)
+ *handle = vtophys(vaddr);
+ return (vaddr);
+}
+
+static inline void
+dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t addr)
+{
+ uma_zfree(pool->pool_zone, vaddr);
+}
+
+
+#endif /* _LINUX_DMAPOOL_H_ */
diff --git a/sys/ofed/include/linux/err.h b/sys/ofed/include/linux/err.h
new file mode 100644
index 0000000..858931d
--- /dev/null
+++ b/sys/ofed/include/linux/err.h
@@ -0,0 +1,60 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_ERR_H_
+#define _LINUX_ERR_H_
+
+#define MAX_ERRNO 4095
+
+#define IS_ERR_VALUE(x) ((x) >= (unsigned long)-MAX_ERRNO)
+
+static inline void *
+ERR_PTR(long error)
+{
+ return (void *)error;
+}
+
+static inline long
+PTR_ERR(const void *ptr)
+{
+ return (long)ptr;
+}
+
+static inline long
+IS_ERR(const void *ptr)
+{
+ return IS_ERR_VALUE((unsigned long)ptr);
+}
+
+static inline void *
+ERR_CAST(void *ptr)
+{
+ return (void *)ptr;
+}
+
+#endif /* _LINUX_ERR_H_ */
diff --git a/sys/ofed/include/linux/errno.h b/sys/ofed/include/linux/errno.h
new file mode 100644
index 0000000..b107c45
--- /dev/null
+++ b/sys/ofed/include/linux/errno.h
@@ -0,0 +1,39 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_ERRNO_H_
+#define _LINUX_ERRNO_H_
+
+#include <sys/errno.h>
+
+#define ECOMM ESTALE
+#define ENODATA ECONNREFUSED
+#define ENOIOCTLCMD ENOIOCTL /* XXX this is negative */
+#define ERESTARTSYS ERESTART /* XXX this is negative */
+
+#endif /* _LINUX_ERRNO_H_ */
diff --git a/sys/ofed/include/linux/ethtool.h b/sys/ofed/include/linux/ethtool.h
new file mode 100644
index 0000000..a267209
--- /dev/null
+++ b/sys/ofed/include/linux/ethtool.h
@@ -0,0 +1,31 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_ETHTOOL_H_
+#define _LINUX_ETHTOOL_H_
+
+#endif /* _LINUX_ETHTOOL_H_ */
diff --git a/sys/ofed/include/linux/file.h b/sys/ofed/include/linux/file.h
new file mode 100644
index 0000000..12858d7
--- /dev/null
+++ b/sys/ofed/include/linux/file.h
@@ -0,0 +1,120 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_FILE_H_
+#define _LINUX_FILE_H_
+
+#include <sys/param.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/refcount.h>
+#include <sys/proc.h>
+
+#include <linux/fs.h>
+
+struct linux_file;
+
+#undef file
+
+extern struct fileops linuxfileops;
+
+static inline struct linux_file *
+linux_fget(unsigned int fd)
+{
+ struct file *file;
+
+ file = fget_unlocked(curthread->td_proc->p_fd, fd);
+ return (struct linux_file *)file->f_data;
+}
+
+static inline void
+fput(struct linux_file *filp)
+{
+ if (filp->_file == NULL) {
+ kfree(filp);
+ return;
+ }
+ if (refcount_release(&filp->_file->f_count)) {
+ _fdrop(filp->_file, curthread);
+ kfree(filp);
+ }
+}
+
+static inline void
+put_unused_fd(unsigned int fd)
+{
+ struct file *file;
+
+ file = fget_unlocked(curthread->td_proc->p_fd, fd);
+ if (file == NULL)
+ return;
+ fdclose(curthread->td_proc->p_fd, file, fd, curthread);
+}
+
+static inline void
+fd_install(unsigned int fd, struct linux_file *filp)
+{
+ struct file *file;
+
+ file = fget_unlocked(curthread->td_proc->p_fd, fd);
+ filp->_file = file;
+ finit(file, filp->f_mode, DTYPE_DEV, filp, &linuxfileops);
+}
+
+static inline int
+get_unused_fd(void)
+{
+ struct file *file;
+ int error;
+ int fd;
+
+ error = falloc(curthread, &file, &fd);
+ if (error)
+ return -error;
+ return fd;
+}
+
+static inline struct linux_file *
+_alloc_file(int mode, const struct file_operations *fops)
+{
+ struct linux_file *filp;
+
+ filp = kzalloc(sizeof(*filp), GFP_KERNEL);
+ if (filp == NULL)
+ return (NULL);
+ filp->f_op = fops;
+ filp->f_mode = mode;
+
+ return filp;
+}
+
+#define alloc_file(mnt, root, mode, fops) _alloc_file((mode), (fops))
+
+#define file linux_file
+#define fget linux_fget
+
+#endif /* _LINUX_FILE_H_ */
diff --git a/sys/ofed/include/linux/fs.h b/sys/ofed/include/linux/fs.h
new file mode 100644
index 0000000..4e667cc
--- /dev/null
+++ b/sys/ofed/include/linux/fs.h
@@ -0,0 +1,182 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_FS_H_
+#define _LINUX_FS_H_
+
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+#include <linux/semaphore.h>
+
+struct module;
+struct kiocb;
+struct iovec;
+struct dentry;
+struct page;
+struct file_lock;
+struct pipe_inode_info;
+struct vm_area_struct;
+struct poll_table_struct;
+struct files_struct;
+
+#define inode vnode
+#define i_cdev v_rdev
+
+#define S_IRUGO (S_IRUSR | S_IRGRP | S_IROTH)
+#define S_IWUGO (S_IWUSR | S_IWGRP | S_IWOTH)
+
+
+typedef struct files_struct *fl_owner_t;
+
+struct dentry {
+ struct inode *d_inode;
+};
+
+struct file_operations;
+
+struct linux_file {
+ struct file *_file;
+ const struct file_operations *f_op;
+ void *private_data;
+ int f_flags;
+ int f_mode; /* Just starting mode. */
+ struct dentry *f_dentry;
+ struct dentry f_dentry_store;
+ struct selinfo f_selinfo;
+ struct sigio *f_sigio;
+};
+
+#define file linux_file
+#define fasync_struct sigio *
+
+#define fasync_helper(fd, filp, on, queue) \
+({ \
+ if ((on)) \
+ *(queue) = &(filp)->f_sigio; \
+ else \
+ *(queue) = NULL; \
+ 0; \
+})
+
+#define kill_fasync(queue, sig, pollstat) \
+do { \
+ if (*(queue) != NULL) \
+ pgsigio(*(queue), (sig), 0); \
+} while (0)
+
+typedef int (*filldir_t)(void *, const char *, int, loff_t, u64, unsigned);
+
+struct file_operations {
+ struct module *owner;
+ ssize_t (*read)(struct file *, char __user *, size_t, loff_t *);
+ ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *);
+ unsigned int (*poll) (struct file *, struct poll_table_struct *);
+ long (*unlocked_ioctl)(struct file *, unsigned int, unsigned long);
+ int (*mmap)(struct file *, struct vm_area_struct *);
+ int (*open)(struct inode *, struct file *);
+ int (*release)(struct inode *, struct file *);
+ int (*fasync)(int, struct file *, int);
+#if 0
+ /* We do not support these methods. Don't permit them to compile. */
+ loff_t (*llseek)(struct file *, loff_t, int);
+ ssize_t (*aio_read)(struct kiocb *, const struct iovec *,
+ unsigned long, loff_t);
+ ssize_t (*aio_write)(struct kiocb *, const struct iovec *,
+ unsigned long, loff_t);
+ int (*readdir)(struct file *, void *, filldir_t);
+ int (*ioctl)(struct inode *, struct file *, unsigned int,
+ unsigned long);
+ long (*compat_ioctl)(struct file *, unsigned int, unsigned long);
+ int (*flush)(struct file *, fl_owner_t id);
+ int (*fsync)(struct file *, struct dentry *, int datasync);
+ int (*aio_fsync)(struct kiocb *, int datasync);
+ int (*lock)(struct file *, int, struct file_lock *);
+ ssize_t (*sendpage)(struct file *, struct page *, int, size_t,
+ loff_t *, int);
+ unsigned long (*get_unmapped_area)(struct file *, unsigned long,
+ unsigned long, unsigned long, unsigned long);
+ int (*check_flags)(int);
+ int (*flock)(struct file *, int, struct file_lock *);
+ ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
+ loff_t *, size_t, unsigned int);
+ ssize_t (*splice_read)(struct file *, loff_t *,
+ struct pipe_inode_info *, size_t, unsigned int);
+ int (*setlease)(struct file *, long, struct file_lock **);
+#endif
+};
+#define fops_get(fops) (fops)
+
+#define FMODE_READ FREAD
+#define FMODE_WRITE FWRITE
+#define FMODE_EXEC FEXEC
+
+static inline int
+register_chrdev_region(dev_t dev, unsigned range, const char *name)
+{
+
+ return 0;
+}
+
+static inline void
+unregister_chrdev_region(dev_t dev, unsigned range)
+{
+
+ return;
+}
+
+static inline dev_t
+iminor(struct inode *inode)
+{
+
+ return dev2unit(inode->v_rdev);
+}
+
+static inline struct inode *
+igrab(struct inode *inode)
+{
+ int error;
+
+ error = vget(inode, 0, curthread);
+ if (error)
+ return (NULL);
+
+ return (inode);
+}
+
+static inline void
+iput(struct inode *inode)
+{
+
+ vrele(inode);
+}
+
+#endif /* _LINUX_FS_H_ */
diff --git a/sys/ofed/include/linux/gfp.h b/sys/ofed/include/linux/gfp.h
new file mode 100644
index 0000000..7f8a24f
--- /dev/null
+++ b/sys/ofed/include/linux/gfp.h
@@ -0,0 +1,122 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_GFP_H_
+#define _LINUX_GFP_H_
+
+#include <sys/systm.h>
+#include <sys/malloc.h>
+
+#include <linux/page.h>
+
+#include <vm/vm_object.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+
+#define __GFP_NOWARN 0
+#define __GFP_HIGHMEM 0
+#define __GFP_ZERO M_ZERO
+
+#define GFP_NOWAIT M_NOWAIT
+#define GFP_ATOMIC (M_NOWAIT | M_USE_RESERVE)
+#define GFP_KERNEL M_WAITOK
+#define GFP_USER M_WAITOK
+#define GFP_HIGHUSER M_WAITOK
+#define GFP_HIGHUSER_MOVABLE M_WAITOK
+#define GFP_IOFS M_NOWAIT
+
+static inline void *
+page_address(struct page *page)
+{
+
+ if (page->object != kmem_object && page->object != kernel_object)
+ return (NULL);
+ return (void *)(VM_MIN_KERNEL_ADDRESS + IDX_TO_OFF(page->pindex));
+}
+
+static inline unsigned long
+_get_page(gfp_t mask)
+{
+
+ return kmem_malloc(kmem_map, PAGE_SIZE, mask);
+}
+
+#define get_zeroed_page(mask) _get_page((mask) | M_ZERO)
+#define alloc_page(mask) virt_to_page(_get_page((mask)))
+#define __get_free_page(mask) _get_page((mask))
+
+static inline void
+free_page(unsigned long page)
+{
+
+ if (page == 0)
+ return;
+ kmem_free(kmem_map, page, PAGE_SIZE);
+}
+
+static inline void
+__free_page(struct page *m)
+{
+
+ if (m->object != kmem_object)
+ panic("__free_page: Freed page %p not allocated via wrappers.",
+ m);
+ kmem_free(kmem_map, (vm_offset_t)page_address(m), PAGE_SIZE);
+}
+
+static inline void
+__free_pages(void *p, unsigned int order)
+{
+ size_t size;
+
+ if (p == 0)
+ return;
+ size = PAGE_SIZE << order;
+ kmem_free(kmem_map, (vm_offset_t)p, size);
+}
+
+/*
+ * Alloc pages allocates directly from the buddy allocator on linux so
+ * order specifies a power of two bucket of pages and the results
+ * are expected to be aligned on the size as well.
+ */
+static inline struct page *
+alloc_pages(gfp_t gfp_mask, unsigned int order)
+{
+ unsigned long page;
+ size_t size;
+
+ size = PAGE_SIZE << order;
+ page = kmem_alloc_contig(kmem_map, size, gfp_mask, 0, -1,
+ size, 0, VM_MEMATTR_DEFAULT);
+ if (page == 0)
+ return (NULL);
+ return (virt_to_page(page));
+}
+
+#endif /* _LINUX_GFP_H_ */
diff --git a/sys/ofed/include/linux/hardirq.h b/sys/ofed/include/linux/hardirq.h
new file mode 100644
index 0000000..4c3aeba
--- /dev/null
+++ b/sys/ofed/include/linux/hardirq.h
@@ -0,0 +1,39 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_HARDIRQ_H_
+#define _LINUX_HARDIRQ_H_
+
+#include <linux/types.h>
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/interrupt.h>
+
+#define synchronize_irq(irq) _intr_drain((irq))
+
+#endif /* _LINUX_HARDIRQ_H_ */
diff --git a/sys/ofed/include/linux/idr.h b/sys/ofed/include/linux/idr.h
new file mode 100644
index 0000000..40b25b6
--- /dev/null
+++ b/sys/ofed/include/linux/idr.h
@@ -0,0 +1,70 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_IDR_H_
+#define _LINUX_IDR_H_
+
+#include <sys/kernel.h>
+
+#define IDR_BITS 5
+#define IDR_SIZE (1 << IDR_BITS)
+#define IDR_MASK (IDR_SIZE - 1)
+
+#define MAX_ID_SHIFT ((sizeof(int) * NBBY) - 1)
+#define MAX_ID_BIT (1U << MAX_ID_SHIFT)
+#define MAX_ID_MASK (MAX_ID_BIT - 1)
+#define MAX_LEVEL (MAX_ID_SHIFT + IDR_BITS - 1) / IDR_BITS
+
+struct idr_layer {
+ unsigned long bitmap;
+ struct idr_layer *ary[IDR_SIZE];
+};
+
+struct idr {
+ struct mtx lock;
+ struct idr_layer *top;
+ struct idr_layer *free;
+ int layers;
+};
+
+#define DEFINE_IDR(name) \
+ struct idr name; \
+ SYSINIT(name##_idr_sysinit, SI_SUB_DRIVERS, SI_ORDER_FIRST, \
+ idr_init, &(name));
+
+void *idr_find(struct idr *idp, int id);
+int idr_pre_get(struct idr *idp, gfp_t gfp_mask);
+int idr_get_new(struct idr *idp, void *ptr, int *id);
+int idr_get_new_above(struct idr *idp, void *ptr, int starting_id, int *id);
+void *idr_replace(struct idr *idp, void *ptr, int id);
+void idr_remove(struct idr *idp, int id);
+void idr_remove_all(struct idr *idp);
+void idr_destroy(struct idr *idp);
+void idr_init(struct idr *idp);
+
+#endif /* _LINUX_IDR_H_ */
diff --git a/sys/ofed/include/linux/if_arp.h b/sys/ofed/include/linux/if_arp.h
new file mode 100644
index 0000000..c82a2c5
--- /dev/null
+++ b/sys/ofed/include/linux/if_arp.h
@@ -0,0 +1,32 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_IF_ARP_H_
+#define _LINUX_IF_ARP_H_
+#include <sys/socket.h>
+#include <net/if_arp.h>
+#endif /* _LINUX_IF_ARP_H_ */
diff --git a/sys/ofed/include/linux/if_ether.h b/sys/ofed/include/linux/if_ether.h
new file mode 100644
index 0000000..9608657
--- /dev/null
+++ b/sys/ofed/include/linux/if_ether.h
@@ -0,0 +1,37 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_IF_ETHER_H_
+#define _LINUX_IF_ETHER_H_
+
+#include <linux/types.h>
+
+#include <net/ethernet.h>
+
+#define ETH_P_8021Q ETHERTYPE_VLAN
+
+#endif /* _LINUX_IF_ETHER_H_ */
diff --git a/sys/ofed/include/linux/if_vlan.h b/sys/ofed/include/linux/if_vlan.h
new file mode 100644
index 0000000..bb7eee0
--- /dev/null
+++ b/sys/ofed/include/linux/if_vlan.h
@@ -0,0 +1,35 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_IF_VLAN_H_
+#define _LINUX_IF_VLAN_H_
+
+#include <net/ethernet.h>
+#include <net/if_vlan_var.h>
+
+#endif /* _LINUX_IF_VLAN_H_ */
diff --git a/sys/ofed/include/linux/in.h b/sys/ofed/include/linux/in.h
new file mode 100644
index 0000000..8fa3dc2
--- /dev/null
+++ b/sys/ofed/include/linux/in.h
@@ -0,0 +1,37 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_IN_H_
+#define _LINUX_IN_H_
+
+#include <netinet/in.h>
+#include <asm/byteorder.h>
+
+#define ipv4_is_zeronet IN_ZERONET
+#define ipv4_is_loopback IN_LOOPBACK
+
+#endif /* _LINUX_IN_H_ */
diff --git a/sys/ofed/include/linux/in6.h b/sys/ofed/include/linux/in6.h
new file mode 100644
index 0000000..2032b61
--- /dev/null
+++ b/sys/ofed/include/linux/in6.h
@@ -0,0 +1,36 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_IN6_H_
+#define _LINUX_IN6_H_
+
+#ifndef KLD_MODULE
+#include "opt_inet6.h"
+#endif
+
+#endif /* _LINUX_IN6_H_ */
diff --git a/sys/ofed/include/linux/inet.h b/sys/ofed/include/linux/inet.h
new file mode 100644
index 0000000..07fcc73
--- /dev/null
+++ b/sys/ofed/include/linux/inet.h
@@ -0,0 +1,31 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_INET_H_
+#define _LINUX_INET_H_
+#endif /* _LINUX_INET_H_ */
diff --git a/sys/ofed/include/linux/inetdevice.h b/sys/ofed/include/linux/inetdevice.h
new file mode 100644
index 0000000..c7fe1d2
--- /dev/null
+++ b/sys/ofed/include/linux/inetdevice.h
@@ -0,0 +1,56 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_INETDEVICE_H_
+#define _LINUX_INETDEVICE_H_
+
+#include <linux/netdevice.h>
+
+static inline struct net_device *
+ip_dev_find(struct net *net, uint32_t addr)
+{
+ struct sockaddr_in sin;
+ struct ifaddr *ifa;
+ struct ifnet *ifp;
+
+ ifp = NULL;
+ memset(&sin, 0, sizeof(sin));
+ sin.sin_addr.s_addr = addr;
+ sin.sin_port = 0;
+ sin.sin_len = sizeof(sin);
+ sin.sin_family = AF_INET;
+ ifa = ifa_ifwithaddr((struct sockaddr *)&sin);
+ if (ifa) {
+ ifp = ifa->ifa_ifp;
+ if_ref(ifp);
+ ifa_free(ifa);
+ }
+ return (ifp);
+}
+
+#endif /* _LINUX_INETDEVICE_H_ */
diff --git a/sys/ofed/include/linux/init.h b/sys/ofed/include/linux/init.h
new file mode 100644
index 0000000..d7c2bb1
--- /dev/null
+++ b/sys/ofed/include/linux/init.h
@@ -0,0 +1,31 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_INIT_H_
+#define _LINUX_INIT_H_
+
+#endif /* _LINUX_INIT_H_ */
diff --git a/sys/ofed/include/linux/interrupt.h b/sys/ofed/include/linux/interrupt.h
new file mode 100644
index 0000000..e35882c
--- /dev/null
+++ b/sys/ofed/include/linux/interrupt.h
@@ -0,0 +1,139 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_INTERRUPT_H_
+#define _LINUX_INTERRUPT_H_
+
+#include <linux/device.h>
+#include <linux/pci.h>
+
+#include <sys/bus.h>
+#include <sys/rman.h>
+
+typedef irqreturn_t (*irq_handler_t)(int, void *);
+
+#define IRQ_RETVAL(x) ((x) != IRQ_NONE)
+
+#define IRQF_SHARED RF_SHAREABLE
+
+struct irq_ent {
+ struct list_head links;
+ struct device *dev;
+ struct resource *res;
+ void *arg;
+ irqreturn_t (*handler)(int, void *);
+ void *tag;
+ int irq;
+};
+
+static inline int
+_irq_rid(struct device *dev, int irq)
+{
+ if (irq == dev->irq)
+ return (0);
+ return irq - dev->msix + 1;
+}
+
+static void
+_irq_handler(void *ent)
+{
+ struct irq_ent *irqe;
+
+ irqe = ent;
+ irqe->handler(irqe->irq, irqe->arg);
+}
+
+static inline struct irq_ent *
+_irq_ent(struct device *dev, int irq)
+{
+ struct irq_ent *irqe;
+
+ list_for_each_entry(irqe, &dev->irqents, links)
+ if (irqe->irq == irq)
+ return (irqe);
+
+ return (NULL);
+}
+
+static inline int
+request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags,
+ const char *name, void *arg)
+{
+ struct resource *res;
+ struct irq_ent *irqe;
+ struct device *dev;
+ int error;
+ int rid;
+
+ dev = _pci_find_irq_dev(irq);
+ if (dev == NULL)
+ return -ENXIO;
+ rid = _irq_rid(dev, irq);
+ res = bus_alloc_resource_any(dev->bsddev, SYS_RES_IRQ, &rid,
+ flags | RF_ACTIVE);
+ if (res == NULL)
+ return (-ENXIO);
+ irqe = kmalloc(sizeof(*irqe), GFP_KERNEL);
+ irqe->dev = dev;
+ irqe->res = res;
+ irqe->arg = arg;
+ irqe->handler = handler;
+ irqe->irq = irq;
+ error = bus_setup_intr(dev->bsddev, res, INTR_TYPE_NET | INTR_MPSAFE,
+ NULL, _irq_handler, irqe, &irqe->tag);
+ if (error) {
+ bus_release_resource(dev->bsddev, SYS_RES_IRQ, rid, irqe->res);
+ kfree(irqe);
+ return (-error);
+ }
+ list_add(&irqe->links, &dev->irqents);
+
+ return 0;
+}
+
+static inline void
+free_irq(unsigned int irq, void *device)
+{
+ struct irq_ent *irqe;
+ struct device *dev;
+ int rid;
+
+ dev = _pci_find_irq_dev(irq);
+ if (dev == NULL)
+ return;
+ rid = _irq_rid(dev, irq);
+ irqe = _irq_ent(dev, irq);
+ if (irqe == NULL)
+ return;
+ bus_teardown_intr(dev->bsddev, irqe->res, irqe->tag);
+ bus_release_resource(dev->bsddev, SYS_RES_IRQ, rid, irqe->res);
+ list_del(&irqe->links);
+ kfree(irqe);
+}
+
+#endif /* _LINUX_INTERRUPT_H_ */
diff --git a/sys/ofed/include/linux/io-mapping.h b/sys/ofed/include/linux/io-mapping.h
new file mode 100644
index 0000000..0753bbc
--- /dev/null
+++ b/sys/ofed/include/linux/io-mapping.h
@@ -0,0 +1,77 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_IO_MAPPING_H_
+#define _LINUX_IO_MAPPING_H_
+
+#include <linux/types.h>
+#include <linux/io.h>
+
+struct io_mapping;
+
+static inline struct io_mapping *
+io_mapping_create_wc(resource_size_t base, unsigned long size)
+{
+
+ return ioremap_wc(base, size);
+}
+
+static inline void
+io_mapping_free(struct io_mapping *mapping)
+{
+
+ iounmap(mapping);
+}
+
+static inline void *
+io_mapping_map_atomic_wc(struct io_mapping *mapping, unsigned long offset)
+{
+
+ return (((char *)mapping) + offset);
+}
+
+static inline void
+io_mapping_unmap_atomic(void *vaddr)
+{
+
+}
+
+static inline void *
+io_mapping_map_wc(struct io_mapping *mapping, unsigned long offset)
+{
+
+ return (((char *) mapping) + offset);
+}
+
+static inline void
+io_mapping_unmap(void *vaddr)
+{
+
+}
+
+#endif /* _LINUX_IO_MAPPING_H_ */
diff --git a/sys/ofed/include/linux/io.h b/sys/ofed/include/linux/io.h
new file mode 100644
index 0000000..5405be7
--- /dev/null
+++ b/sys/ofed/include/linux/io.h
@@ -0,0 +1,125 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_IO_H_
+#define _LINUX_IO_H_
+
+#include <machine/vm.h>
+
+static inline uint32_t
+__raw_readl(const volatile void *addr)
+{
+ return *(const volatile uint32_t *)addr;
+}
+
+static inline void
+__raw_writel(uint32_t b, volatile void *addr)
+{
+ *(volatile uint32_t *)addr = b;
+}
+
+static inline uint64_t
+__raw_readq(const volatile void *addr)
+{
+ return *(const volatile uint64_t *)addr;
+}
+
+static inline void
+__raw_writeq(uint64_t b, volatile void *addr)
+{
+ *(volatile uint64_t *)addr = b;
+}
+
+/*
+ * XXX This is all x86 specific. It should be bus space access.
+ */
+#define mmiowb()
+
+#undef writel
+static inline void
+writel(uint32_t b, void *addr)
+{
+ *(volatile uint32_t *)addr = b;
+}
+
+#undef writeq
+static inline void
+writeq(uint64_t b, void *addr)
+{
+ *(volatile uint64_t *)addr = b;
+}
+
+#undef writeb
+static inline void
+writeb(uint8_t b, void *addr)
+{
+ *(volatile uint8_t *)addr = b;
+}
+
+#undef writew
+static inline void
+writew(uint16_t b, void *addr)
+{
+ *(volatile uint16_t *)addr = b;
+}
+
+void *_ioremap_attr(vm_paddr_t phys_addr, unsigned long size, int attr);
+#define ioremap_nocache(addr, size) \
+ _ioremap_attr((addr), (size), VM_MEMATTR_UNCACHED)
+#define ioremap_wc(addr, size) \
+ _ioremap_attr((addr), (size), VM_MEMATTR_WRITE_COMBINING)
+#define ioremap ioremap_nocache
+void iounmap(void *addr);
+
+#define memset_io(a, b, c) memset((a), (b), (c))
+#define memcpy_fromio(a, b, c) memcpy((a), (b), (c))
+#define memcpy_toio(a, b, c) memcpy((a), (b), (c))
+
+static inline void
+__iowrite64_copy(void *to, void *from, size_t count)
+{
+#ifdef __LP64__
+ uint64_t *src;
+ uint64_t *dst;
+ int i;
+
+ for (i = 0, src = from, dst = to; i < count; i++, src++, dst++)
+ __raw_writeq(*src, dst);
+#else
+ uint32_t *src;
+ uint32_t *dst;
+ int i;
+
+ count *= 2;
+ for (i = 0, src = from, dst = to; i < count; i++, src++, dst++)
+ __raw_writel(*src, dst);
+#endif
+}
+
+
+#endif /* _LINUX_IO_H_ */
diff --git a/sys/ofed/include/linux/ioctl.h b/sys/ofed/include/linux/ioctl.h
new file mode 100644
index 0000000..9e00b7f
--- /dev/null
+++ b/sys/ofed/include/linux/ioctl.h
@@ -0,0 +1,34 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_IOCTL_H_
+#define _LINUX_IOCTL_H_
+
+#include <sys/ioccom.h>
+
+#endif /* _LINUX_IOCTL_H_ */
diff --git a/sys/ofed/include/linux/jhash.h b/sys/ofed/include/linux/jhash.h
new file mode 100644
index 0000000..ff6ff09
--- /dev/null
+++ b/sys/ofed/include/linux/jhash.h
@@ -0,0 +1,143 @@
+#ifndef _LINUX_JHASH_H_
+#define _LINUX_JHASH_H_
+
+/* jhash.h: Jenkins hash support.
+ *
+ * Copyright (C) 1996 Bob Jenkins (bob_jenkins@burtleburtle.net)
+ *
+ * http://burtleburtle.net/bob/hash/
+ *
+ * These are the credits from Bob's sources:
+ *
+ * lookup2.c, by Bob Jenkins, December 1996, Public Domain.
+ * hash(), hash2(), hash3, and mix() are externally useful functions.
+ * Routines to test the hash are included if SELF_TEST is defined.
+ * You can use this free for any purpose. It has no warranty.
+ *
+ * Copyright (C) 2003 David S. Miller (davem@redhat.com)
+ *
+ * I've modified Bob's hash to be useful in the Linux kernel, and
+ * any bugs present are surely my fault. -DaveM
+ */
+
+/* NOTE: Arguments are modified. */
+#define __jhash_mix(a, b, c) \
+{ \
+ a -= b; a -= c; a ^= (c>>13); \
+ b -= c; b -= a; b ^= (a<<8); \
+ c -= a; c -= b; c ^= (b>>13); \
+ a -= b; a -= c; a ^= (c>>12); \
+ b -= c; b -= a; b ^= (a<<16); \
+ c -= a; c -= b; c ^= (b>>5); \
+ a -= b; a -= c; a ^= (c>>3); \
+ b -= c; b -= a; b ^= (a<<10); \
+ c -= a; c -= b; c ^= (b>>15); \
+}
+
+/* The golden ration: an arbitrary value */
+#define JHASH_GOLDEN_RATIO 0x9e3779b9
+
+/* The most generic version, hashes an arbitrary sequence
+ * of bytes. No alignment or length assumptions are made about
+ * the input key.
+ */
+static inline u32 jhash(const void *key, u32 length, u32 initval)
+{
+ u32 a, b, c, len;
+ const u8 *k = key;
+
+ len = length;
+ a = b = JHASH_GOLDEN_RATIO;
+ c = initval;
+
+ while (len >= 12) {
+ a += (k[0] +((u32)k[1]<<8) +((u32)k[2]<<16) +((u32)k[3]<<24));
+ b += (k[4] +((u32)k[5]<<8) +((u32)k[6]<<16) +((u32)k[7]<<24));
+ c += (k[8] +((u32)k[9]<<8) +((u32)k[10]<<16)+((u32)k[11]<<24));
+
+ __jhash_mix(a,b,c);
+
+ k += 12;
+ len -= 12;
+ }
+
+ c += length;
+ switch (len) {
+ case 11: c += ((u32)k[10]<<24);
+ case 10: c += ((u32)k[9]<<16);
+ case 9 : c += ((u32)k[8]<<8);
+ case 8 : b += ((u32)k[7]<<24);
+ case 7 : b += ((u32)k[6]<<16);
+ case 6 : b += ((u32)k[5]<<8);
+ case 5 : b += k[4];
+ case 4 : a += ((u32)k[3]<<24);
+ case 3 : a += ((u32)k[2]<<16);
+ case 2 : a += ((u32)k[1]<<8);
+ case 1 : a += k[0];
+ };
+
+ __jhash_mix(a,b,c);
+
+ return c;
+}
+
+/* A special optimized version that handles 1 or more of u32s.
+ * The length parameter here is the number of u32s in the key.
+ */
+static inline u32 jhash2(const u32 *k, u32 length, u32 initval)
+{
+ u32 a, b, c, len;
+
+ a = b = JHASH_GOLDEN_RATIO;
+ c = initval;
+ len = length;
+
+ while (len >= 3) {
+ a += k[0];
+ b += k[1];
+ c += k[2];
+ __jhash_mix(a, b, c);
+ k += 3; len -= 3;
+ }
+
+ c += length * 4;
+
+ switch (len) {
+ case 2 : b += k[1];
+ case 1 : a += k[0];
+ };
+
+ __jhash_mix(a,b,c);
+
+ return c;
+}
+
+
+/* A special ultra-optimized versions that knows they are hashing exactly
+ * 3, 2 or 1 word(s).
+ *
+ * NOTE: In partilar the "c += length; __jhash_mix(a,b,c);" normally
+ * done at the end is not done here.
+ */
+static inline u32 jhash_3words(u32 a, u32 b, u32 c, u32 initval)
+{
+ a += JHASH_GOLDEN_RATIO;
+ b += JHASH_GOLDEN_RATIO;
+ c += initval;
+
+ __jhash_mix(a, b, c);
+
+ return c;
+}
+
+static inline u32 jhash_2words(u32 a, u32 b, u32 initval)
+{
+ return jhash_3words(a, b, 0, initval);
+}
+
+static inline u32 jhash_1word(u32 a, u32 initval)
+{
+ return jhash_3words(a, 0, 0, initval);
+}
+
+#endif /* _LINUX_JHASH_H_ */
diff --git a/sys/ofed/include/linux/jiffies.h b/sys/ofed/include/linux/jiffies.h
new file mode 100644
index 0000000..7ca6337
--- /dev/null
+++ b/sys/ofed/include/linux/jiffies.h
@@ -0,0 +1,56 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_JIFFIES_H_
+#define _LINUX_JIFFIES_H_
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+
+#include <sys/time.h>
+#include <sys/kernel.h>
+
+static inline int
+msecs_to_jiffies(int msec)
+{
+ struct timeval tv;
+
+ tv.tv_sec = msec / 1000;
+ tv.tv_usec = (msec % 1000) * 1000;
+ return (tvtohz(&tv));
+}
+
+#define jiffies ticks
+
+#define time_after(a, b) ((long)(b) - (long)(a) < 0)
+#define time_before(a, b) time_after(b,a)
+#define time_after_eq(a, b) ((long)(a) - (long)(b) >= 0)
+#define time_before_eq(a, b) time_after_eq(b, a)
+
+#define HZ hz
+
+#endif /* _LINUX_JIFFIES_H_ */
diff --git a/sys/ofed/include/linux/kdev_t.h b/sys/ofed/include/linux/kdev_t.h
new file mode 100644
index 0000000..4b4f43e
--- /dev/null
+++ b/sys/ofed/include/linux/kdev_t.h
@@ -0,0 +1,36 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_KDEV_T_H_
+#define _LINUX_KDEV_T_H_
+
+#define MAJOR(dev) major((dev))
+#define MINOR(dev) minor((dev))
+#define MKDEV(ma, mi) makedev((ma), (mi))
+
+#endif /* _LINUX_KDEV_T_H_ */
diff --git a/sys/ofed/include/linux/kernel.h b/sys/ofed/include/linux/kernel.h
new file mode 100644
index 0000000..f49036e
--- /dev/null
+++ b/sys/ofed/include/linux/kernel.h
@@ -0,0 +1,88 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_KERNEL_H_
+#define _LINUX_KERNEL_H_
+
+#include <sys/systm.h>
+#include <sys/param.h>
+#include <sys/libkern.h>
+#include <sys/stat.h>
+#include <sys/smp.h>
+
+#include <linux/bitops.h>
+#include <linux/compiler.h>
+#include <linux/errno.h>
+#include <linux/stddef.h>
+#include <linux/kthread.h>
+#include <linux/types.h>
+#include <linux/jiffies.h>
+#include <linux/wait.h>
+#include <linux/fs.h>
+#include <linux/notifier.h>
+#include <linux/log2.h>
+#include <asm/byteorder.h>
+
+#define KERN_EMERG "<0>"
+#define KERN_ALERT "<1>"
+#define KERN_CRIT "<2>"
+#define KERN_ERR "<3>"
+#define KERN_WARNING "<4>"
+#define KERN_NOTICE "<5>"
+#define KERN_INFO "<6>"
+#define KERN_DEBUG "<7>"
+
+#define BUG() panic("BUG")
+#define BUG_ON(condition) do { if (condition) BUG(); } while(0)
+#define WARN_ON BUG_ON
+
+#undef ALIGN
+#define ALIGN(x, y) roundup2((x), (y))
+#define DIV_ROUND_UP howmany
+
+#define printk(X...) printf(X)
+#define pr_debug(fmt, ...) printk(KERN_DEBUG # fmt, ##__VA_ARGS__)
+#define udelay(t) DELAY(t)
+
+#define container_of(ptr, type, member) \
+({ \
+ __typeof(((type *)0)->member) *_p = (ptr); \
+ (type *)((char *)_p - offsetof(type, member)); \
+})
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+#define simple_strtoul strtoul
+
+#define min(x, y) (x < y ? x : y)
+#define max(x, y) (x > y ? x : y)
+#define min_t(type, _x, _y) (type)(_x) < (type)(_y) ? (type)(_x) : (_y)
+#define max_t(type, _x, _y) (type)(_x) > (type)(_y) ? (type)(_x) : (_y)
+
+#define num_possible_cpus() mp_ncpus
+
+#endif /* _LINUX_KERNEL_H_ */
diff --git a/sys/ofed/include/linux/kobject.h b/sys/ofed/include/linux/kobject.h
new file mode 100644
index 0000000..5872c05
--- /dev/null
+++ b/sys/ofed/include/linux/kobject.h
@@ -0,0 +1,153 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_KOBJECT_H_
+#define _LINUX_KOBJECT_H_
+
+#include <machine/stdarg.h>
+
+#include <linux/kernel.h>
+#include <linux/kref.h>
+#include <linux/slab.h>
+
+struct kobject;
+struct sysctl_oid;
+
+struct kobj_type {
+ void (*release)(struct kobject *kobj);
+ const struct sysfs_ops *sysfs_ops;
+ struct attribute **default_attrs;
+};
+
+extern struct kobj_type kfree_type;
+
+struct kobject {
+ struct kobject *parent;
+ char *name;
+ struct kref kref;
+ struct kobj_type *ktype;
+ struct list_head entry;
+ struct sysctl_oid *oidp;
+};
+
+static inline void
+kobject_init(struct kobject *kobj, struct kobj_type *ktype)
+{
+
+ kref_init(&kobj->kref);
+ INIT_LIST_HEAD(&kobj->entry);
+ kobj->ktype = ktype;
+ kobj->oidp = NULL;
+}
+
+static inline void kobject_put(struct kobject *kobj);
+void kobject_release(struct kref *kref);
+
+static inline void
+kobject_put(struct kobject *kobj)
+{
+
+ if (kobj)
+ kref_put(&kobj->kref, kobject_release);
+}
+
+static inline struct kobject *
+kobject_get(struct kobject *kobj)
+{
+
+ if (kobj)
+ kref_get(&kobj->kref);
+ return kobj;
+}
+
+static inline int
+kobject_set_name_vargs(struct kobject *kobj, const char *fmt, va_list args)
+{
+ char *old;
+ char *name;
+
+ old = kobj->name;
+
+ if (old && !fmt)
+ return 0;
+
+ name = kzalloc(MAXPATHLEN, GFP_KERNEL);
+ if (!name)
+ return -ENOMEM;
+ vsnprintf(name, MAXPATHLEN, fmt, args);
+ kobj->name = name;
+ kfree(old);
+ for (; *name != '\0'; name++)
+ if (*name == '/')
+ *name = '!';
+ return (0);
+}
+
+int kobject_add(struct kobject *kobj, struct kobject *parent,
+ const char *fmt, ...);
+
+static inline struct kobject *
+kobject_create(void)
+{
+ struct kobject *kobj;
+
+ kobj = kzalloc(sizeof(*kobj), GFP_KERNEL);
+ if (kobj == NULL)
+ return (NULL);
+ kobject_init(kobj, &kfree_type);
+
+ return (kobj);
+}
+
+static inline struct kobject *
+kobject_create_and_add(const char *name, struct kobject *parent)
+{
+ struct kobject *kobj;
+
+ kobj = kobject_create();
+ if (kobj == NULL)
+ return (NULL);
+ if (kobject_add(kobj, parent, "%s", name) == 0)
+ return (kobj);
+ kobject_put(kobj);
+
+ return (NULL);
+}
+
+
+static inline char *
+kobject_name(const struct kobject *kobj)
+{
+
+ return kobj->name;
+}
+
+int kobject_set_name(struct kobject *kobj, const char *fmt, ...);
+int kobject_init_and_add(struct kobject *kobj, struct kobj_type *ktype,
+ struct kobject *parent, const char *fmt, ...);
+
+#endif /* _LINUX_KOBJECT_H_ */
diff --git a/sys/ofed/include/linux/kref.h b/sys/ofed/include/linux/kref.h
new file mode 100644
index 0000000..14346c1
--- /dev/null
+++ b/sys/ofed/include/linux/kref.h
@@ -0,0 +1,62 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_KREF_H_
+#define _LINUX_KREF_H_
+
+#include <sys/refcount.h>
+
+struct kref {
+ volatile u_int count;
+};
+
+static inline void
+kref_init(struct kref *kref)
+{
+
+ refcount_init(&kref->count, 1);
+}
+
+static inline void
+kref_get(struct kref *kref)
+{
+
+ refcount_acquire(&kref->count);
+}
+
+static inline int
+kref_put(struct kref *kref, void (*rel)(struct kref *kref))
+{
+
+ if (refcount_release(&kref->count)) {
+ rel(kref);
+ return 1;
+ }
+ return 0;
+}
+
+#endif /* _KREF_H_ */
diff --git a/sys/ofed/include/linux/kthread.h b/sys/ofed/include/linux/kthread.h
new file mode 100644
index 0000000..e2882958
--- /dev/null
+++ b/sys/ofed/include/linux/kthread.h
@@ -0,0 +1,104 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_KTHREAD_H_
+#define _LINUX_KTHREAD_H_
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/sleepqueue.h>
+
+#include <linux/slab.h>
+#include <linux/sched.h>
+
+static inline void
+_kthread_fn(void *arg)
+{
+ struct task_struct *task;
+
+ task = arg;
+ task_struct_set(curthread, task);
+ if (task->should_stop == 0)
+ task->task_ret = task->task_fn(task->task_data);
+ PROC_LOCK(task->task_thread->td_proc);
+ task->should_stop = TASK_STOPPED;
+ wakeup(task);
+ PROC_UNLOCK(task->task_thread->td_proc);
+ kthread_exit();
+}
+
+static inline struct task_struct *
+_kthread_create(int (*threadfn)(void *data), void *data)
+{
+ struct task_struct *task;
+
+ task = kzalloc(sizeof(*task), GFP_KERNEL);
+ task->task_fn = threadfn;
+ task->task_data = data;
+
+ return (task);
+}
+
+struct task_struct *kthread_create(int (*threadfn)(void *data),
+ void *data,
+ const char namefmt[], ...)
+ __attribute__((format(printf, 3, 4)));
+
+#define kthread_run(fn, data, fmt, ...) \
+({ \
+ struct task_struct *_task; \
+ \
+ _task = _kthread_create((fn), (data)); \
+ if (kthread_add(_kthread_fn, _task, NULL, &_task->task_thread, \
+ 0, 0, fmt, ## __VA_ARGS__)) { \
+ kfree(_task); \
+ _task = NULL; \
+ } else \
+ task_struct_set(_task->task_thread, _task); \
+ _task; \
+})
+
+#define kthread_should_stop() current->should_stop
+
+static inline int
+kthread_stop(struct task_struct *task)
+{
+
+ PROC_LOCK(task->task_thread->td_proc);
+ task->should_stop = TASK_SHOULD_STOP;
+ wake_up_process(task);
+ while (task->should_stop != TASK_STOPPED)
+ msleep(task, &task->task_thread->td_proc->p_mtx, PWAIT,
+ "kstop", hz);
+ PROC_UNLOCK(task->task_thread->td_proc);
+ return task->task_ret;
+}
+
+#endif /* _LINUX_KTHREAD_H_ */
diff --git a/sys/ofed/include/linux/linux_compat.c b/sys/ofed/include/linux/linux_compat.c
new file mode 100644
index 0000000..98ad807
--- /dev/null
+++ b/sys/ofed/include/linux/linux_compat.c
@@ -0,0 +1,695 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/bus.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filio.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/stdarg.h>
+#include <machine/pmap.h>
+
+#include <linux/kobject.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/cdev.h>
+#include <linux/file.h>
+#include <linux/sysfs.h>
+#include <linux/mm.h>
+#include <linux/io.h>
+#include <linux/vmalloc.h>
+
+#include <vm/vm_pager.h>
+
+MALLOC_DEFINE(M_KMALLOC, "linux", "Linux kmalloc compat");
+
+#include <linux/rbtree.h>
+/* Undo Linux compat changes. */
+#undef RB_ROOT
+#undef file
+#undef cdev
+#define RB_ROOT(head) (head)->rbh_root
+#undef LIST_HEAD
+/* From sys/queue.h */
+#define LIST_HEAD(name, type) \
+struct name { \
+ struct type *lh_first; /* first element */ \
+}
+
+struct kobject class_root;
+struct device linux_rootdev;
+struct class miscclass;
+struct list_head pci_drivers;
+struct list_head pci_devices;
+spinlock_t pci_lock;
+
+int
+panic_cmp(struct rb_node *one, struct rb_node *two)
+{
+ panic("no cmp");
+}
+
+RB_GENERATE(linux_root, rb_node, __entry, panic_cmp);
+
+int
+kobject_set_name(struct kobject *kobj, const char *fmt, ...)
+{
+ va_list args;
+ int error;
+
+ va_start(args, fmt);
+ error = kobject_set_name_vargs(kobj, fmt, args);
+ va_end(args);
+
+ return (error);
+}
+
+static inline int
+kobject_add_complete(struct kobject *kobj, struct kobject *parent)
+{
+ struct kobj_type *t;
+ int error;
+
+ kobj->parent = kobject_get(parent);
+ error = sysfs_create_dir(kobj);
+ if (error == 0 && kobj->ktype && kobj->ktype->default_attrs) {
+ struct attribute **attr;
+ t = kobj->ktype;
+
+ for (attr = t->default_attrs; *attr != NULL; attr++) {
+ error = sysfs_create_file(kobj, *attr);
+ if (error)
+ break;
+ }
+ if (error)
+ sysfs_remove_dir(kobj);
+
+ }
+ return (error);
+}
+
+int
+kobject_add(struct kobject *kobj, struct kobject *parent, const char *fmt, ...)
+{
+ va_list args;
+ int error;
+
+ va_start(args, fmt);
+ error = kobject_set_name_vargs(kobj, fmt, args);
+ va_end(args);
+ if (error)
+ return (error);
+
+ return kobject_add_complete(kobj, parent);
+}
+
+void
+kobject_release(struct kref *kref)
+{
+ struct kobject *kobj;
+ char *name;
+
+ kobj = container_of(kref, struct kobject, kref);
+ sysfs_remove_dir(kobj);
+ if (kobj->parent)
+ kobject_put(kobj->parent);
+ kobj->parent = NULL;
+ name = kobj->name;
+ if (kobj->ktype && kobj->ktype->release)
+ kobj->ktype->release(kobj);
+ kfree(name);
+}
+
+static void
+kobject_kfree(struct kobject *kobj)
+{
+
+ kfree(kobj);
+}
+
+struct kobj_type kfree_type = { .release = kobject_kfree };
+
+struct device *
+device_create(struct class *class, struct device *parent, dev_t devt,
+ void *drvdata, const char *fmt, ...)
+{
+ struct device *dev;
+ va_list args;
+
+ dev = kzalloc(sizeof(*dev), M_WAITOK);
+ dev->parent = parent;
+ dev->class = class;
+ dev->devt = devt;
+ dev->driver_data = drvdata;
+ va_start(args, fmt);
+ kobject_set_name_vargs(&dev->kobj, fmt, args);
+ va_end(args);
+ device_register(dev);
+
+ return (dev);
+}
+
+int
+kobject_init_and_add(struct kobject *kobj, struct kobj_type *ktype,
+ struct kobject *parent, const char *fmt, ...)
+{
+ va_list args;
+ int error;
+
+ kobject_init(kobj, ktype);
+ kobj->ktype = ktype;
+ kobj->parent = parent;
+ kobj->name = NULL;
+
+ va_start(args, fmt);
+ error = kobject_set_name_vargs(kobj, fmt, args);
+ va_end(args);
+ if (error)
+ return (error);
+ return kobject_add_complete(kobj, parent);
+}
+
+static void
+linux_file_dtor(void *cdp)
+{
+ struct linux_file *filp;
+
+ filp = cdp;
+ filp->f_op->release(curthread->td_fpop->f_vnode, filp);
+ kfree(filp);
+}
+
+static int
+linux_dev_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
+{
+ struct linux_cdev *ldev;
+ struct linux_file *filp;
+ struct file *file;
+ int error;
+
+ file = curthread->td_fpop;
+ ldev = dev->si_drv1;
+ if (ldev == NULL)
+ return (ENODEV);
+ filp = kzalloc(sizeof(*filp), GFP_KERNEL);
+ filp->f_dentry = &filp->f_dentry_store;
+ filp->f_op = ldev->ops;
+ filp->f_flags = file->f_flag;
+ if (filp->f_op->open) {
+ error = -filp->f_op->open(file->f_vnode, filp);
+ if (error) {
+ kfree(filp);
+ return (error);
+ }
+ }
+ error = devfs_set_cdevpriv(filp, linux_file_dtor);
+ if (error) {
+ filp->f_op->release(file->f_vnode, filp);
+ kfree(filp);
+ return (error);
+ }
+
+ return 0;
+}
+
+static int
+linux_dev_close(struct cdev *dev, int fflag, int devtype, struct thread *td)
+{
+ struct linux_cdev *ldev;
+ struct linux_file *filp;
+ struct file *file;
+ int error;
+
+ file = curthread->td_fpop;
+ ldev = dev->si_drv1;
+ if (ldev == NULL)
+ return (0);
+ if ((error = devfs_get_cdevpriv((void **)&filp)) != 0)
+ return (error);
+ filp->f_flags = file->f_flag;
+ devfs_clear_cdevpriv();
+
+ return (0);
+}
+
+static int
+linux_dev_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag,
+ struct thread *td)
+{
+ struct linux_cdev *ldev;
+ struct linux_file *filp;
+ struct file *file;
+ int error;
+
+ file = curthread->td_fpop;
+ ldev = dev->si_drv1;
+ if (ldev == NULL)
+ return (0);
+ if ((error = devfs_get_cdevpriv((void **)&filp)) != 0)
+ return (error);
+ filp->f_flags = file->f_flag;
+ /*
+ * Linux does not have a generic ioctl copyin/copyout layer. All
+ * linux ioctls must be converted to void ioctls which pass a
+ * pointer to the address of the data. We want the actual user
+ * address so we dereference here.
+ */
+ data = *(void **)data;
+ if (filp->f_op->unlocked_ioctl)
+ error = -filp->f_op->unlocked_ioctl(filp, cmd, (u_long)data);
+ else
+ error = ENOTTY;
+
+ return (error);
+}
+
+static int
+linux_dev_read(struct cdev *dev, struct uio *uio, int ioflag)
+{
+ struct linux_cdev *ldev;
+ struct linux_file *filp;
+ struct file *file;
+ ssize_t bytes;
+ int error;
+
+ file = curthread->td_fpop;
+ ldev = dev->si_drv1;
+ if (ldev == NULL)
+ return (0);
+ if ((error = devfs_get_cdevpriv((void **)&filp)) != 0)
+ return (error);
+ filp->f_flags = file->f_flag;
+ if (uio->uio_iovcnt != 1)
+ panic("linux_dev_read: uio %p iovcnt %d",
+ uio, uio->uio_iovcnt);
+ if (filp->f_op->read) {
+ bytes = filp->f_op->read(filp, uio->uio_iov->iov_base,
+ uio->uio_iov->iov_len, &uio->uio_offset);
+ if (bytes >= 0) {
+ uio->uio_iov->iov_base += bytes;
+ uio->uio_iov->iov_len -= bytes;
+ uio->uio_resid -= bytes;
+ } else
+ error = -bytes;
+ } else
+ error = ENXIO;
+
+ return (error);
+}
+
+static int
+linux_dev_write(struct cdev *dev, struct uio *uio, int ioflag)
+{
+ struct linux_cdev *ldev;
+ struct linux_file *filp;
+ struct file *file;
+ ssize_t bytes;
+ int error;
+
+ file = curthread->td_fpop;
+ ldev = dev->si_drv1;
+ if (ldev == NULL)
+ return (0);
+ if ((error = devfs_get_cdevpriv((void **)&filp)) != 0)
+ return (error);
+ filp->f_flags = file->f_flag;
+ if (uio->uio_iovcnt != 1)
+ panic("linux_dev_write: uio %p iovcnt %d",
+ uio, uio->uio_iovcnt);
+ if (filp->f_op->write) {
+ bytes = filp->f_op->write(filp, uio->uio_iov->iov_base,
+ uio->uio_iov->iov_len, &uio->uio_offset);
+ if (bytes >= 0) {
+ uio->uio_iov->iov_base += bytes;
+ uio->uio_iov->iov_len -= bytes;
+ uio->uio_resid -= bytes;
+ } else
+ error = -bytes;
+ } else
+ error = ENXIO;
+
+ return (error);
+}
+
+static int
+linux_dev_poll(struct cdev *dev, int events, struct thread *td)
+{
+ struct linux_cdev *ldev;
+ struct linux_file *filp;
+ struct file *file;
+ int revents;
+ int error;
+
+ file = curthread->td_fpop;
+ ldev = dev->si_drv1;
+ if (ldev == NULL)
+ return (0);
+ if ((error = devfs_get_cdevpriv((void **)&filp)) != 0)
+ return (error);
+ filp->f_flags = file->f_flag;
+ if (filp->f_op->poll)
+ revents = filp->f_op->poll(filp, NULL) & events;
+ else
+ revents = 0;
+
+ return (revents);
+}
+
+static int
+linux_dev_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr,
+ int nprot, vm_memattr_t *memattr)
+{
+
+ /* XXX memattr not honored. */
+ *paddr = offset;
+ return (0);
+}
+
+static int
+linux_dev_mmap_single(struct cdev *dev, vm_ooffset_t *offset,
+ vm_size_t size, struct vm_object **object, int nprot)
+{
+ struct linux_cdev *ldev;
+ struct linux_file *filp;
+ struct file *file;
+ struct vm_area_struct vma;
+ vm_paddr_t paddr;
+ vm_page_t m;
+ int error;
+
+ file = curthread->td_fpop;
+ ldev = dev->si_drv1;
+ if (ldev == NULL)
+ return (ENODEV);
+ if (size != PAGE_SIZE)
+ return (EINVAL);
+ if ((error = devfs_get_cdevpriv((void **)&filp)) != 0)
+ return (error);
+ filp->f_flags = file->f_flag;
+ vma.vm_start = 0;
+ vma.vm_end = PAGE_SIZE;
+ vma.vm_pgoff = *offset / PAGE_SIZE;
+ vma.vm_pfn = 0;
+ vma.vm_page_prot = 0;
+ if (filp->f_op->mmap) {
+ error = -filp->f_op->mmap(filp, &vma);
+ if (error == 0) {
+ paddr = (vm_paddr_t)vma.vm_pfn << PAGE_SHIFT;
+ *offset = paddr;
+ m = PHYS_TO_VM_PAGE(paddr);
+ *object = vm_pager_allocate(OBJT_DEVICE, dev,
+ PAGE_SIZE, nprot, *offset, curthread->td_ucred);
+ if (*object == NULL)
+ return (EINVAL);
+ if (vma.vm_page_prot != VM_MEMATTR_DEFAULT)
+ pmap_page_set_memattr(m, vma.vm_page_prot);
+ }
+ } else
+ error = ENODEV;
+
+ return (error);
+}
+
+struct cdevsw linuxcdevsw = {
+ .d_version = D_VERSION,
+ .d_flags = D_TRACKCLOSE,
+ .d_open = linux_dev_open,
+ .d_close = linux_dev_close,
+ .d_read = linux_dev_read,
+ .d_write = linux_dev_write,
+ .d_ioctl = linux_dev_ioctl,
+ .d_mmap_single = linux_dev_mmap_single,
+ .d_mmap = linux_dev_mmap,
+ .d_poll = linux_dev_poll,
+};
+
+static int
+linux_file_read(struct file *file, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+ struct linux_file *filp;
+ ssize_t bytes;
+ int error;
+
+ error = 0;
+ filp = (struct linux_file *)file->f_data;
+ filp->f_flags = file->f_flag;
+ if (uio->uio_iovcnt != 1)
+ panic("linux_file_read: uio %p iovcnt %d",
+ uio, uio->uio_iovcnt);
+ if (filp->f_op->read) {
+ bytes = filp->f_op->read(filp, uio->uio_iov->iov_base,
+ uio->uio_iov->iov_len, &uio->uio_offset);
+ if (bytes >= 0) {
+ uio->uio_iov->iov_base += bytes;
+ uio->uio_iov->iov_len -= bytes;
+ uio->uio_resid -= bytes;
+ } else
+ error = -bytes;
+ } else
+ error = ENXIO;
+
+ return (error);
+}
+
+static int
+linux_file_poll(struct file *file, int events, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct linux_file *filp;
+ int revents;
+
+ filp = (struct linux_file *)file->f_data;
+ filp->f_flags = file->f_flag;
+ if (filp->f_op->poll)
+ revents = filp->f_op->poll(filp, NULL) & events;
+ else
+ revents = 0;
+
+ return (0);
+}
+
+static int
+linux_file_close(struct file *file, struct thread *td)
+{
+ struct linux_file *filp;
+ int error;
+
+ filp = (struct linux_file *)file->f_data;
+ filp->f_flags = file->f_flag;
+ error = -filp->f_op->release(NULL, filp);
+ funsetown(&filp->f_sigio);
+ kfree(filp);
+
+ return (error);
+}
+
+static int
+linux_file_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *cred,
+ struct thread *td)
+{
+ struct linux_file *filp;
+ int error;
+
+ filp = (struct linux_file *)fp->f_data;
+ filp->f_flags = fp->f_flag;
+ error = 0;
+
+ switch (cmd) {
+ case FIONBIO:
+ break;
+ case FIOASYNC:
+ if (filp->f_op->fasync == NULL)
+ break;
+ error = filp->f_op->fasync(0, filp, fp->f_flag & FASYNC);
+ break;
+ case FIOSETOWN:
+ error = fsetown(*(int *)data, &filp->f_sigio);
+ if (error == 0)
+ error = filp->f_op->fasync(0, filp,
+ fp->f_flag & FASYNC);
+ break;
+ case FIOGETOWN:
+ *(int *)data = fgetown(&filp->f_sigio);
+ break;
+ default:
+ error = ENOTTY;
+ break;
+ }
+ return (error);
+}
+
+struct fileops linuxfileops = {
+ .fo_read = linux_file_read,
+ .fo_poll = linux_file_poll,
+ .fo_close = linux_file_close,
+ .fo_ioctl = linux_file_ioctl
+};
+
+/*
+ * Hash of vmmap addresses. This is infrequently accessed and does not
+ * need to be particularly large. This is done because we must store the
+ * caller's idea of the map size to properly unmap.
+ */
+struct vmmap {
+ LIST_ENTRY(vmmap) vm_next;
+ void *vm_addr;
+ unsigned long vm_size;
+};
+
+LIST_HEAD(vmmaphd, vmmap);
+#define VMMAP_HASH_SIZE 64
+#define VMMAP_HASH_MASK (VMMAP_HASH_SIZE - 1)
+#define VM_HASH(addr) ((uintptr_t)(addr) >> PAGE_SHIFT) & VMMAP_HASH_MASK
+static struct vmmaphd vmmaphead[VMMAP_HASH_SIZE];
+static struct mtx vmmaplock;
+
+static void
+vmmap_add(void *addr, unsigned long size)
+{
+ struct vmmap *vmmap;
+
+ vmmap = kmalloc(sizeof(*vmmap), GFP_KERNEL);
+ mtx_lock(&vmmaplock);
+ vmmap->vm_size = size;
+ vmmap->vm_addr = addr;
+ LIST_INSERT_HEAD(&vmmaphead[VM_HASH(addr)], vmmap, vm_next);
+ mtx_unlock(&vmmaplock);
+}
+
+static struct vmmap *
+vmmap_remove(void *addr)
+{
+ struct vmmap *vmmap;
+
+ mtx_lock(&vmmaplock);
+ LIST_FOREACH(vmmap, &vmmaphead[VM_HASH(addr)], vm_next)
+ if (vmmap->vm_addr == addr)
+ break;
+ if (vmmap)
+ LIST_REMOVE(vmmap, vm_next);
+ mtx_unlock(&vmmaplock);
+
+ return (vmmap);
+}
+
+void *
+_ioremap_attr(vm_paddr_t phys_addr, unsigned long size, int attr)
+{
+ void *addr;
+
+ addr = pmap_mapdev_attr(phys_addr, size, attr);
+ if (addr == NULL)
+ return (NULL);
+ vmmap_add(addr, size);
+
+ return (addr);
+}
+
+void
+iounmap(void *addr)
+{
+ struct vmmap *vmmap;
+
+ vmmap = vmmap_remove(addr);
+ if (vmmap == NULL)
+ return;
+ pmap_unmapdev((vm_offset_t)addr, vmmap->vm_size);
+ kfree(vmmap);
+}
+
+
+void *
+vmap(struct page **pages, unsigned int count, unsigned long flags, int prot)
+{
+ vm_offset_t off;
+ size_t size;
+
+ size = count * PAGE_SIZE;
+ off = kmem_alloc_nofault(kernel_map, size);
+ if (off == 0)
+ return (NULL);
+ vmmap_add((void *)off, size);
+ pmap_qenter(off, pages, count);
+
+ return ((void *)off);
+}
+
+void
+vunmap(void *addr)
+{
+ struct vmmap *vmmap;
+
+ vmmap = vmmap_remove(addr);
+ if (vmmap == NULL)
+ return;
+ pmap_qremove((vm_offset_t)addr, vmmap->vm_size / PAGE_SIZE);
+ kmem_free(kernel_map, (vm_offset_t)addr, vmmap->vm_size);
+ kfree(vmmap);
+}
+
+static void
+linux_compat_init(void)
+{
+ struct sysctl_oid *rootoid;
+ int i;
+
+ rootoid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(),
+ OID_AUTO, "sys", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "sys");
+ kobject_init(&class_root, &class_ktype);
+ kobject_set_name(&class_root, "class");
+ class_root.oidp = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(rootoid),
+ OID_AUTO, "class", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "class");
+ kobject_init(&linux_rootdev.kobj, &dev_ktype);
+ kobject_set_name(&linux_rootdev.kobj, "device");
+ linux_rootdev.kobj.oidp = SYSCTL_ADD_NODE(NULL,
+ SYSCTL_CHILDREN(rootoid), OID_AUTO, "device", CTLFLAG_RD, NULL,
+ "device");
+ linux_rootdev.bsddev = root_bus;
+ miscclass.name = "misc";
+ class_register(&miscclass);
+ INIT_LIST_HEAD(&pci_drivers);
+ INIT_LIST_HEAD(&pci_devices);
+ spin_lock_init(&pci_lock);
+ mtx_init(&vmmaplock, "IO Map lock", NULL, MTX_DEF);
+ for (i = 0; i < VMMAP_HASH_SIZE; i++)
+ LIST_INIT(&vmmaphead[i]);
+}
+
+SYSINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_init, NULL);
diff --git a/sys/ofed/include/linux/linux_idr.c b/sys/ofed/include/linux/linux_idr.c
new file mode 100644
index 0000000..5cfaff5
--- /dev/null
+++ b/sys/ofed/include/linux/linux_idr.c
@@ -0,0 +1,447 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+
+#include <machine/stdarg.h>
+
+#include <linux/bitops.h>
+#include <linux/kobject.h>
+#include <linux/slab.h>
+#include <linux/idr.h>
+#include <linux/err.h>
+
+/*
+ * IDR Implementation.
+ *
+ * This is quick and dirty and not as re-entrant as the linux version
+ * however it should be fairly fast. It is basically a radix tree with
+ * a builtin bitmap for allocation.
+ */
+MALLOC_DEFINE(M_IDR, "idr", "Linux IDR compat");
+
+static inline int
+idr_max(struct idr *idr)
+{
+ return (1 << (idr->layers * IDR_BITS)) - 1;
+}
+
+static inline int
+idr_pos(int id, int layer)
+{
+ return (id >> (IDR_BITS * layer)) & IDR_MASK;
+}
+
+void
+idr_init(struct idr *idr)
+{
+ bzero(idr, sizeof(*idr));
+ mtx_init(&idr->lock, "idr", NULL, MTX_DEF);
+}
+
+/* Only frees cached pages. */
+void
+idr_destroy(struct idr *idr)
+{
+ struct idr_layer *il, *iln;
+
+ mtx_lock(&idr->lock);
+ for (il = idr->free; il != NULL; il = iln) {
+ iln = il->ary[0];
+ free(il, M_IDR);
+ }
+ mtx_unlock(&idr->lock);
+}
+
+static void
+idr_remove_layer(struct idr_layer *il, int layer)
+{
+ int i;
+
+ if (il == NULL)
+ return;
+ if (layer == 0) {
+ free(il, M_IDR);
+ return;
+ }
+ for (i = 0; i < IDR_SIZE; i++)
+ if (il->ary[i])
+ idr_remove_layer(il->ary[i], layer - 1);
+}
+
+void
+idr_remove_all(struct idr *idr)
+{
+
+ mtx_lock(&idr->lock);
+ idr_remove_layer(idr->top, idr->layers - 1);
+ idr->top = NULL;
+ idr->layers = 0;
+ mtx_unlock(&idr->lock);
+}
+
+void
+idr_remove(struct idr *idr, int id)
+{
+ struct idr_layer *il;
+ int layer;
+ int idx;
+
+ id &= MAX_ID_MASK;
+ mtx_lock(&idr->lock);
+ il = idr->top;
+ layer = idr->layers - 1;
+ if (il == NULL || id > idr_max(idr)) {
+ mtx_unlock(&idr->lock);
+ return;
+ }
+ /*
+ * Walk down the tree to this item setting bitmaps along the way
+ * as we know at least one item will be free along this path.
+ */
+ while (layer && il) {
+ idx = idr_pos(id, layer);
+ il->bitmap |= 1 << idx;
+ il = il->ary[idx];
+ layer--;
+ }
+ idx = id & IDR_MASK;
+ /*
+ * At this point we've set free space bitmaps up the whole tree.
+ * We could make this non-fatal and unwind but linux dumps a stack
+ * and a warning so I don't think it's necessary.
+ */
+ if (il == NULL || (il->bitmap & (1 << idx)) != 0)
+ panic("idr_remove: Item %d not allocated (%p, %p)\n",
+ id, idr, il);
+ il->ary[idx] = NULL;
+ il->bitmap |= 1 << idx;
+ mtx_unlock(&idr->lock);
+ return;
+}
+
+void *
+idr_replace(struct idr *idr, void *ptr, int id)
+{
+ struct idr_layer *il;
+ void *res;
+ int layer;
+ int idx;
+
+ res = ERR_PTR(-EINVAL);
+ id &= MAX_ID_MASK;
+ mtx_lock(&idr->lock);
+ il = idr->top;
+ layer = idr->layers - 1;
+ if (il == NULL || id > idr_max(idr))
+ goto out;
+ while (layer && il) {
+ il = il->ary[idr_pos(id, layer)];
+ layer--;
+ }
+ idx = id & IDR_MASK;
+ /*
+ * Replace still returns an error if the item was not allocated.
+ */
+ if (il != NULL && (il->bitmap & (1 << idx)) != 0) {
+ res = il->ary[idx];
+ il->ary[idx] = ptr;
+ }
+out:
+ mtx_unlock(&idr->lock);
+ return (res);
+}
+
+void *
+idr_find(struct idr *idr, int id)
+{
+ struct idr_layer *il;
+ void *res;
+ int layer;
+
+ res = NULL;
+ id &= MAX_ID_MASK;
+ mtx_lock(&idr->lock);
+ il = idr->top;
+ layer = idr->layers - 1;
+ if (il == NULL || id > idr_max(idr))
+ goto out;
+ while (layer && il) {
+ il = il->ary[idr_pos(id, layer)];
+ layer--;
+ }
+ if (il != NULL)
+ res = il->ary[id & IDR_MASK];
+out:
+ mtx_unlock(&idr->lock);
+ return (res);
+}
+
+int
+idr_pre_get(struct idr *idr, gfp_t gfp_mask)
+{
+ struct idr_layer *il, *iln;
+ struct idr_layer *head;
+ int need;
+
+ mtx_lock(&idr->lock);
+ for (;;) {
+ need = idr->layers + 1;
+ for (il = idr->free; il != NULL; il = il->ary[0])
+ need--;
+ mtx_unlock(&idr->lock);
+ if (need == 0)
+ break;
+ for (head = NULL; need; need--) {
+ iln = malloc(sizeof(*il), M_IDR, M_ZERO | gfp_mask);
+ if (iln == NULL)
+ break;
+ bitmap_fill(&iln->bitmap, IDR_SIZE);
+ if (head != NULL) {
+ il->ary[0] = iln;
+ il = iln;
+ } else
+ head = il = iln;
+ }
+ if (head == NULL)
+ return (0);
+ mtx_lock(&idr->lock);
+ il->ary[0] = idr->free;
+ idr->free = head;
+ }
+ return (1);
+}
+
+static inline struct idr_layer *
+idr_get(struct idr *idr)
+{
+ struct idr_layer *il;
+
+ il = idr->free;
+ if (il) {
+ idr->free = il->ary[0];
+ il->ary[0] = NULL;
+ return (il);
+ }
+ il = malloc(sizeof(*il), M_IDR, M_ZERO | M_NOWAIT);
+ bitmap_fill(&il->bitmap, IDR_SIZE);
+ return (il);
+}
+
+/*
+ * Could be implemented as get_new_above(idr, ptr, 0, idp) but written
+ * first for simplicity sake.
+ */
+int
+idr_get_new(struct idr *idr, void *ptr, int *idp)
+{
+ struct idr_layer *stack[MAX_LEVEL];
+ struct idr_layer *il;
+ int error;
+ int layer;
+ int idx;
+ int id;
+
+ error = -EAGAIN;
+ mtx_lock(&idr->lock);
+ /*
+ * Expand the tree until there is free space.
+ */
+ if (idr->top == NULL || idr->top->bitmap == 0) {
+ if (idr->layers == MAX_LEVEL + 1) {
+ error = -ENOSPC;
+ goto out;
+ }
+ il = idr_get(idr);
+ if (il == NULL)
+ goto out;
+ il->ary[0] = idr->top;
+ if (idr->top)
+ il->bitmap &= ~1;
+ idr->top = il;
+ idr->layers++;
+ }
+ il = idr->top;
+ id = 0;
+ /*
+ * Walk the tree following free bitmaps, record our path.
+ */
+ for (layer = idr->layers - 1;; layer--) {
+ stack[layer] = il;
+ idx = ffsl(il->bitmap);
+ if (idx == 0)
+ panic("idr_get_new: Invalid leaf state (%p, %p)\n",
+ idr, il);
+ idx--;
+ id |= idx << (layer * IDR_BITS);
+ if (layer == 0)
+ break;
+ if (il->ary[idx] == NULL) {
+ il->ary[idx] = idr_get(idr);
+ if (il->ary[idx] == NULL)
+ goto out;
+ }
+ il = il->ary[idx];
+ }
+ /*
+ * Allocate the leaf to the consumer.
+ */
+ il->bitmap &= ~(1 << idx);
+ il->ary[idx] = ptr;
+ *idp = id;
+ /*
+ * Clear bitmaps potentially up to the root.
+ */
+ while (il->bitmap == 0 && ++layer < idr->layers) {
+ il = stack[layer];
+ il->bitmap &= ~(1 << idr_pos(id, layer));
+ }
+ error = 0;
+out:
+ mtx_unlock(&idr->lock);
+#ifdef INVARIANTS
+ if (error == 0 && idr_find(idr, id) != ptr) {
+ panic("idr_get_new: Failed for idr %p, id %d, ptr %p\n",
+ idr, id, ptr);
+ }
+#endif
+ return (error);
+}
+
+int
+idr_get_new_above(struct idr *idr, void *ptr, int starting_id, int *idp)
+{
+ struct idr_layer *stack[MAX_LEVEL];
+ struct idr_layer *il;
+ int error;
+ int layer;
+ int idx, sidx;
+ int id;
+
+ error = -EAGAIN;
+ mtx_lock(&idr->lock);
+ /*
+ * Compute the layers required to support starting_id and the mask
+ * at the top layer.
+ */
+restart:
+ idx = starting_id;
+ layer = 0;
+ while (idx & ~IDR_MASK) {
+ layer++;
+ idx >>= IDR_BITS;
+ }
+ if (layer == MAX_LEVEL + 1) {
+ error = -ENOSPC;
+ goto out;
+ }
+ /*
+ * Expand the tree until there is free space at or beyond starting_id.
+ */
+ while (idr->layers <= layer ||
+ idr->top->bitmap < (1 << idr_pos(starting_id, idr->layers - 1))) {
+ if (idr->layers == MAX_LEVEL + 1) {
+ error = -ENOSPC;
+ goto out;
+ }
+ il = idr_get(idr);
+ if (il == NULL)
+ goto out;
+ il->ary[0] = idr->top;
+ if (idr->top && idr->top->bitmap == 0)
+ il->bitmap &= ~1;
+ idr->top = il;
+ idr->layers++;
+ }
+ il = idr->top;
+ id = 0;
+ /*
+ * Walk the tree following free bitmaps, record our path.
+ */
+ for (layer = idr->layers - 1;; layer--) {
+ stack[layer] = il;
+ sidx = idr_pos(starting_id, layer);
+ /* Returns index numbered from 0 or size if none exists. */
+ idx = find_next_bit(&il->bitmap, IDR_SIZE, sidx);
+ if (idx == IDR_SIZE && sidx == 0)
+ panic("idr_get_new: Invalid leaf state (%p, %p)\n",
+ idr, il);
+ /*
+ * We may have walked a path where there was a free bit but
+ * it was lower than what we wanted. Restart the search with
+ * a larger starting id. id contains the progress we made so
+ * far. Search the leaf one above this level. This may
+ * restart as many as MAX_LEVEL times but that is expected
+ * to be rare.
+ */
+ if (idx == IDR_SIZE) {
+ starting_id = id + (1 << (layer+1 * IDR_BITS));
+ goto restart;
+ }
+ if (idx > sidx)
+ starting_id = 0; /* Search the whole subtree. */
+ id |= idx << (layer * IDR_BITS);
+ if (layer == 0)
+ break;
+ if (il->ary[idx] == NULL) {
+ il->ary[idx] = idr_get(idr);
+ if (il->ary[idx] == NULL)
+ goto out;
+ }
+ il = il->ary[idx];
+ }
+ /*
+ * Allocate the leaf to the consumer.
+ */
+ il->bitmap &= ~(1 << idx);
+ il->ary[idx] = ptr;
+ *idp = id;
+ /*
+ * Clear bitmaps potentially up to the root.
+ */
+ while (il->bitmap == 0 && ++layer < idr->layers) {
+ il = stack[layer];
+ il->bitmap &= ~(1 << idr_pos(id, layer));
+ }
+ error = 0;
+out:
+ mtx_unlock(&idr->lock);
+#ifdef INVARIANTS
+ if (error == 0 && idr_find(idr, id) != ptr) {
+ panic("idr_get_new_above: Failed for idr %p, id %d, ptr %p\n",
+ idr, id, ptr);
+ }
+#endif
+ return (error);
+}
diff --git a/sys/ofed/include/linux/linux_radix.c b/sys/ofed/include/linux/linux_radix.c
new file mode 100644
index 0000000..e642eae
--- /dev/null
+++ b/sys/ofed/include/linux/linux_radix.c
@@ -0,0 +1,170 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/radix-tree.h>
+#include <linux/err.h>
+
+MALLOC_DEFINE(M_RADIX, "radix", "Linux radix compat");
+
+static inline int
+radix_max(struct radix_tree_root *root)
+{
+ return (1 << (root->height * RADIX_TREE_MAP_SHIFT)) - 1;
+}
+
+static inline int
+radix_pos(long id, int height)
+{
+ return (id >> (RADIX_TREE_MAP_SHIFT * height)) & RADIX_TREE_MAP_MASK;
+}
+
+void *
+radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
+{
+ struct radix_tree_node *node;
+ void *item;
+ int height;
+
+ item = NULL;
+ node = root->rnode;
+ height = root->height - 1;
+ if (index > radix_max(root))
+ goto out;
+ while (height && node)
+ node = node->slots[radix_pos(index, height--)];
+ if (node)
+ item = node->slots[radix_pos(index, 0)];
+
+out:
+ return (item);
+}
+
+void *
+radix_tree_delete(struct radix_tree_root *root, unsigned long index)
+{
+ struct radix_tree_node *stack[RADIX_TREE_MAX_HEIGHT];
+ struct radix_tree_node *node;
+ void *item;
+ int height;
+ int idx;
+
+ item = NULL;
+ node = root->rnode;
+ height = root->height - 1;
+ if (index > radix_max(root))
+ goto out;
+ /*
+ * Find the node and record the path in stack.
+ */
+ while (height && node) {
+ stack[height] = node;
+ node = node->slots[radix_pos(index, height--)];
+ }
+ idx = radix_pos(index, 0);
+ if (node)
+ item = node->slots[idx];
+ /*
+ * If we removed something reduce the height of the tree.
+ */
+ if (item)
+ for (;;) {
+ node->slots[idx] = NULL;
+ node->count--;
+ if (node->count > 0)
+ break;
+ free(node, M_RADIX);
+ if (node == root->rnode) {
+ root->rnode = NULL;
+ root->height = 0;
+ break;
+ }
+ height++;
+ node = stack[height];
+ idx = radix_pos(index, height);
+ }
+out:
+ return (item);
+}
+
+int
+radix_tree_insert(struct radix_tree_root *root, unsigned long index, void *item)
+{
+ struct radix_tree_node *node;
+ int height;
+ int idx;
+
+ /*
+ * Expand the tree to fit indexes as big as requested.
+ */
+ while (root->rnode == NULL || radix_max(root) < index) {
+ node = malloc(sizeof(*node), M_RADIX, root->gfp_mask | M_ZERO);
+ if (node == NULL)
+ return (-ENOMEM);
+ node->slots[0] = root->rnode;
+ if (root->rnode)
+ node->count++;
+ root->rnode = node;
+ root->height++;
+ }
+ node = root->rnode;
+ height = root->height - 1;
+ /*
+ * Walk down the tree finding the correct node and allocating any
+ * missing nodes along the way.
+ */
+ while (height) {
+ idx = radix_pos(index, height);
+ if (node->slots[idx] == NULL) {
+ node->slots[idx] = malloc(sizeof(*node), M_RADIX,
+ root->gfp_mask | M_ZERO);
+ if (node->slots[idx] == NULL)
+ return (-ENOMEM);
+ node->count++;
+ }
+ node = node->slots[idx];
+ height--;
+ }
+ /*
+ * Insert and adjust count if the item does not already exist.
+ */
+ idx = radix_pos(index, 0);
+ if (node->slots[idx])
+ return (-EEXIST);
+ node->slots[idx] = item;
+ node->count++;
+
+ return (0);
+}
diff --git a/sys/ofed/include/linux/list.h b/sys/ofed/include/linux/list.h
new file mode 100644
index 0000000..f6f9404
--- /dev/null
+++ b/sys/ofed/include/linux/list.h
@@ -0,0 +1,331 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_LIST_H_
+#define _LINUX_LIST_H_
+
+/*
+ * Since LIST_HEAD conflicts with the linux definition we must include any
+ * FreeBSD header which requires it here so it is resolved with the correct
+ * definition prior to the undef.
+ */
+#include <linux/types.h>
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/queue.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/conf.h>
+#include <sys/socket.h>
+#include <sys/mbuf.h>
+
+#include <net/bpf.h>
+#include <net/if.h>
+#include <net/if_types.h>
+#include <net/if_media.h>
+
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+
+#include <netinet6/in6_var.h>
+#include <netinet6/nd6.h>
+
+#include <vm/vm.h>
+#include <vm/vm_object.h>
+
+#define prefetch(x)
+
+struct list_head {
+ struct list_head *next;
+ struct list_head *prev;
+};
+
+static inline void
+INIT_LIST_HEAD(struct list_head *list)
+{
+
+ list->next = list->prev = list;
+}
+
+static inline int
+list_empty(const struct list_head *head)
+{
+
+ return (head->next == head);
+}
+
+static inline void
+list_del(struct list_head *entry)
+{
+
+ entry->next->prev = entry->prev;
+ entry->prev->next = entry->next;
+}
+
+static inline void
+_list_add(struct list_head *new, struct list_head *prev,
+ struct list_head *next)
+{
+
+ next->prev = new;
+ new->next = next;
+ new->prev = prev;
+ prev->next = new;
+}
+
+static inline void
+list_del_init(struct list_head *entry)
+{
+
+ list_del(entry);
+ INIT_LIST_HEAD(entry);
+}
+
+#define list_entry(ptr, type, field) container_of(ptr, type, field)
+
+#define list_for_each(p, head) \
+ for (p = (head)->next; p != (head); p = p->next)
+
+#define list_for_each_safe(p, n, head) \
+ for (p = (head)->next, n = p->next; p != (head); p = n, n = p->next)
+
+#define list_for_each_entry(p, h, field) \
+ for (p = list_entry((h)->next, typeof(*p), field); &p->field != (h); \
+ p = list_entry(p->field.next, typeof(*p), field))
+
+#define list_for_each_entry_safe(p, n, h, field) \
+ for (p = list_entry((h)->next, typeof(*p), field), \
+ n = list_entry(p->field.next, typeof(*p), field); &p->field != (h);\
+ p = n, n = list_entry(n->field.next, typeof(*n), field))
+
+#define list_for_each_entry_reverse(p, h, field) \
+ for (p = list_entry((h)->prev, typeof(*p), field); &p->field != (h); \
+ p = list_entry(p->field.prev, typeof(*p), field))
+
+#define list_for_each_prev(p, h) for (p = (h)->prev; p != (h); p = p->prev)
+
+static inline void
+list_add(struct list_head *new, struct list_head *head)
+{
+
+ _list_add(new, head, head->next);
+}
+
+static inline void
+list_add_tail(struct list_head *new, struct list_head *head)
+{
+
+ _list_add(new, head->prev, head);
+}
+
+static inline void
+list_move(struct list_head *list, struct list_head *head)
+{
+
+ list_del(list);
+ list_add(list, head);
+}
+
+static inline void
+list_move_tail(struct list_head *entry, struct list_head *head)
+{
+
+ list_del(entry);
+ list_add_tail(entry, head);
+}
+
+static inline void
+_list_splice(const struct list_head *list, struct list_head *prev,
+ struct list_head *next)
+{
+ struct list_head *first;
+ struct list_head *last;
+
+ if (list_empty(list))
+ return;
+ first = list->next;
+ last = list->prev;
+ first->prev = prev;
+ prev->next = first;
+ last->next = next;
+ next->prev = last;
+}
+
+static inline void
+list_splice(const struct list_head *list, struct list_head *head)
+{
+
+ _list_splice(list, head, head->next);
+}
+
+static inline void
+list_splice_tail(struct list_head *list, struct list_head *head)
+{
+
+ _list_splice(list, head->prev, head);
+}
+
+static inline void
+list_splice_init(struct list_head *list, struct list_head *head)
+{
+
+ _list_splice(list, head, head->next);
+ INIT_LIST_HEAD(list);
+}
+
+static inline void
+list_splice_tail_init(struct list_head *list, struct list_head *head)
+{
+
+ _list_splice(list, head->prev, head);
+ INIT_LIST_HEAD(list);
+}
+
+#undef LIST_HEAD
+#define LIST_HEAD(name) struct list_head name = { &(name), &(name) }
+
+
+struct hlist_head {
+ struct hlist_node *first;
+};
+
+struct hlist_node {
+ struct hlist_node *next, **pprev;
+};
+
+#define HLIST_HEAD_INIT { }
+#define HLIST_HEAD(name) struct hlist_head name = HLIST_HEAD_INIT
+#define INIT_HLIST_HEAD(head) (head)->first = NULL
+#define INIT_HLIST_NODE(node) \
+do { \
+ (node)->next = NULL; \
+ (node)->pprev = NULL; \
+} while (0)
+
+static inline int
+hlist_unhashed(const struct hlist_node *h)
+{
+
+ return !h->pprev;
+}
+
+static inline int
+hlist_empty(const struct hlist_head *h)
+{
+
+ return !h->first;
+}
+
+static inline void
+hlist_del(struct hlist_node *n)
+{
+
+ if (n->next)
+ n->next->pprev = n->pprev;
+ *n->pprev = n->next;
+}
+
+static inline void
+hlist_del_init(struct hlist_node *n)
+{
+
+ if (hlist_unhashed(n))
+ return;
+ hlist_del(n);
+ INIT_HLIST_NODE(n);
+}
+
+static inline void
+hlist_add_head(struct hlist_node *n, struct hlist_head *h)
+{
+
+ n->next = h->first;
+ if (h->first)
+ h->first->pprev = &n->next;
+ h->first = n;
+ n->pprev = &h->first;
+}
+
+static inline void
+hlist_add_before(struct hlist_node *n, struct hlist_node *next)
+{
+
+ n->pprev = next->pprev;
+ n->next = next;
+ next->pprev = &n->next;
+ *(n->pprev) = n;
+}
+
+static inline void
+hlist_add_after(struct hlist_node *n, struct hlist_node *next)
+{
+
+ next->next = n->next;
+ n->next = next;
+ next->pprev = &n->next;
+ if (next->next)
+ next->next->pprev = &next->next;
+}
+
+static inline void
+hlist_move_list(struct hlist_head *old, struct hlist_head *new)
+{
+
+ new->first = old->first;
+ if (new->first)
+ new->first->pprev = &new->first;
+ old->first = NULL;
+}
+
+#define hlist_entry(ptr, type, field) container_of(ptr, type, field)
+
+#define hlist_for_each(p, head) \
+ for (p = (head)->first; p; p = p->next)
+
+#define hlist_for_each_safe(p, n, head) \
+ for (p = (head)->first; p && ({ n = p->next; 1; }); p = n)
+
+#define hlist_for_each_entry(tp, p, head, field) \
+ for (p = (head)->first; \
+ p ? (tp = hlist_entry(p, typeof(*tp), field)): NULL; p = p->next)
+
+#define hlist_for_each_entry_continue(tp, p, field) \
+ for (p = (p)->next; \
+ p ? (tp = hlist_entry(p, typeof(*tp), field)): NULL; p = p->next)
+
+#define hlist_for_each_entry_from(tp, p, field) \
+ for (; p ? (tp = hlist_entry(p, typeof(*tp), field)): NULL; p = p->next)
+
+#define hlist_for_each_entry_safe(tp, p, n, head, field) \
+ for (p = (head)->first; p ? \
+ (n = p->next) | (tp = hlist_entry(p, typeof(*tp), field)) : \
+ NULL; p = n)
+
+#endif /* _LINUX_LIST_H_ */
diff --git a/sys/ofed/include/linux/lockdep.h b/sys/ofed/include/linux/lockdep.h
new file mode 100644
index 0000000..8ddb079
--- /dev/null
+++ b/sys/ofed/include/linux/lockdep.h
@@ -0,0 +1,37 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_LOCKDEP_H_
+#define _LINUX_LOCKDEP_H_
+
+struct lock_class_key {
+};
+
+#define lockdep_set_class(lock, key)
+
+#endif /* _LINUX_LOCKDEP_H_ */
diff --git a/sys/ofed/include/linux/log2.h b/sys/ofed/include/linux/log2.h
new file mode 100644
index 0000000..0a8315a
--- /dev/null
+++ b/sys/ofed/include/linux/log2.h
@@ -0,0 +1,60 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_LOG2_H_
+#define _LINUX_LOG2_H_
+
+#include <linux/types.h>
+
+#include <sys/libkern.h>
+
+static inline unsigned long
+roundup_pow_of_two(unsigned long x)
+{
+ return (1UL << flsl(x - 1));
+}
+
+static inline int
+is_power_of_2(unsigned long n)
+{
+ return (n == roundup_pow_of_two(n));
+}
+
+static inline unsigned long
+rounddown_pow_of_two(unsigned long x)
+{
+ return (1UL << (flsl(x) - 1));
+}
+
+static inline unsigned long
+ilog2(unsigned long x)
+{
+ return (flsl(x) - 1);
+}
+
+#endif /* _LINUX_LOG2_H_ */
diff --git a/sys/ofed/include/linux/miscdevice.h b/sys/ofed/include/linux/miscdevice.h
new file mode 100644
index 0000000..e6a4435
--- /dev/null
+++ b/sys/ofed/include/linux/miscdevice.h
@@ -0,0 +1,72 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_MISCDEVICE_H_
+#define _LINUX_MISCDEVICE_H_
+
+#define MISC_DYNAMIC_MINOR -1
+
+#include <linux/device.h>
+#include <linux/cdev.h>
+
+struct miscdevice {
+ const char *name;
+ struct device *this_device;
+ const struct file_operations *fops;
+ struct cdev *cdev;
+ int minor;
+};
+
+extern struct class miscclass;
+
+static inline int
+misc_register(struct miscdevice *misc)
+{
+ misc->this_device = device_create(&miscclass, &linux_rootdev, 0, misc,
+ misc->name);
+ misc->cdev = cdev_alloc();
+ if (misc->cdev == NULL)
+ return -ENOMEM;
+ misc->cdev->owner = THIS_MODULE;
+ misc->cdev->ops = misc->fops;
+ kobject_set_name(&misc->cdev->kobj, misc->name);
+ if (cdev_add(misc->cdev, misc->this_device->devt, 1))
+ return -EINVAL;
+ return (0);
+}
+
+static inline int
+misc_deregister(struct miscdevice *misc)
+{
+ device_destroy(&miscclass, misc->this_device->devt);
+ cdev_del(misc->cdev);
+
+ return (0);
+}
+
+#endif /* _LINUX_MISCDEVICE_H_ */
diff --git a/sys/ofed/include/linux/mlx4/cmd.h b/sys/ofed/include/linux/mlx4/cmd.h
new file mode 100644
index 0000000..60d3036
--- /dev/null
+++ b/sys/ofed/include/linux/mlx4/cmd.h
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_CMD_H
+#define MLX4_CMD_H
+
+#include <linux/dma-mapping.h>
+
+enum {
+ /* initialization and general commands */
+ MLX4_CMD_SYS_EN = 0x1,
+ MLX4_CMD_SYS_DIS = 0x2,
+ MLX4_CMD_MAP_FA = 0xfff,
+ MLX4_CMD_UNMAP_FA = 0xffe,
+ MLX4_CMD_RUN_FW = 0xff6,
+ MLX4_CMD_MOD_STAT_CFG = 0x34,
+ MLX4_CMD_QUERY_DEV_CAP = 0x3,
+ MLX4_CMD_QUERY_FW = 0x4,
+ MLX4_CMD_ENABLE_LAM = 0xff8,
+ MLX4_CMD_DISABLE_LAM = 0xff7,
+ MLX4_CMD_QUERY_DDR = 0x5,
+ MLX4_CMD_QUERY_ADAPTER = 0x6,
+ MLX4_CMD_INIT_HCA = 0x7,
+ MLX4_CMD_CLOSE_HCA = 0x8,
+ MLX4_CMD_INIT_PORT = 0x9,
+ MLX4_CMD_CLOSE_PORT = 0xa,
+ MLX4_CMD_QUERY_HCA = 0xb,
+ MLX4_CMD_QUERY_PORT = 0x43,
+ MLX4_CMD_SENSE_PORT = 0x4d,
+ MLX4_CMD_HW_HEALTH_CHECK = 0x50,
+ MLX4_CMD_SET_PORT = 0xc,
+ MLX4_CMD_SET_NODE = 0x5a,
+ MLX4_CMD_ACCESS_DDR = 0x2e,
+ MLX4_CMD_MAP_ICM = 0xffa,
+ MLX4_CMD_UNMAP_ICM = 0xff9,
+ MLX4_CMD_MAP_ICM_AUX = 0xffc,
+ MLX4_CMD_UNMAP_ICM_AUX = 0xffb,
+ MLX4_CMD_SET_ICM_SIZE = 0xffd,
+
+ /* TPT commands */
+ MLX4_CMD_SW2HW_MPT = 0xd,
+ MLX4_CMD_QUERY_MPT = 0xe,
+ MLX4_CMD_HW2SW_MPT = 0xf,
+ MLX4_CMD_READ_MTT = 0x10,
+ MLX4_CMD_WRITE_MTT = 0x11,
+ MLX4_CMD_SYNC_TPT = 0x2f,
+
+ /* EQ commands */
+ MLX4_CMD_MAP_EQ = 0x12,
+ MLX4_CMD_SW2HW_EQ = 0x13,
+ MLX4_CMD_HW2SW_EQ = 0x14,
+ MLX4_CMD_QUERY_EQ = 0x15,
+
+ /* CQ commands */
+ MLX4_CMD_SW2HW_CQ = 0x16,
+ MLX4_CMD_HW2SW_CQ = 0x17,
+ MLX4_CMD_QUERY_CQ = 0x18,
+ MLX4_CMD_MODIFY_CQ = 0x2c,
+
+ /* SRQ commands */
+ MLX4_CMD_SW2HW_SRQ = 0x35,
+ MLX4_CMD_HW2SW_SRQ = 0x36,
+ MLX4_CMD_QUERY_SRQ = 0x37,
+ MLX4_CMD_ARM_SRQ = 0x40,
+
+ /* QP/EE commands */
+ MLX4_CMD_RST2INIT_QP = 0x19,
+ MLX4_CMD_INIT2RTR_QP = 0x1a,
+ MLX4_CMD_RTR2RTS_QP = 0x1b,
+ MLX4_CMD_RTS2RTS_QP = 0x1c,
+ MLX4_CMD_SQERR2RTS_QP = 0x1d,
+ MLX4_CMD_2ERR_QP = 0x1e,
+ MLX4_CMD_RTS2SQD_QP = 0x1f,
+ MLX4_CMD_SQD2SQD_QP = 0x38,
+ MLX4_CMD_SQD2RTS_QP = 0x20,
+ MLX4_CMD_2RST_QP = 0x21,
+ MLX4_CMD_QUERY_QP = 0x22,
+ MLX4_CMD_INIT2INIT_QP = 0x2d,
+ MLX4_CMD_SUSPEND_QP = 0x32,
+ MLX4_CMD_UNSUSPEND_QP = 0x33,
+ /* special QP and management commands */
+ MLX4_CMD_CONF_SPECIAL_QP = 0x23,
+ MLX4_CMD_MAD_IFC = 0x24,
+
+ /* multicast commands */
+ MLX4_CMD_READ_MCG = 0x25,
+ MLX4_CMD_WRITE_MCG = 0x26,
+ MLX4_CMD_MGID_HASH = 0x27,
+
+ /* miscellaneous commands */
+ MLX4_CMD_DIAG_RPRT = 0x30,
+ MLX4_CMD_NOP = 0x31,
+
+ /* debug commands */
+ MLX4_CMD_QUERY_DEBUG_MSG = 0x2a,
+ MLX4_CMD_SET_DEBUG_MSG = 0x2b,
+
+ /* statistics commands */
+ MLX4_CMD_QUERY_IF_STAT = 0X54,
+ MLX4_CMD_SET_IF_STAT = 0X55,
+};
+
+enum {
+ MLX4_CMD_TIME_CLASS_A = 10000,
+ MLX4_CMD_TIME_CLASS_B = 10000,
+ MLX4_CMD_TIME_CLASS_C = 10000,
+};
+
+enum {
+ MLX4_MAILBOX_SIZE = 4096
+};
+
+enum {
+ /* set port opcode modifiers */
+ MLX4_SET_PORT_GENERAL = 0x0,
+ MLX4_SET_PORT_RQP_CALC = 0x1,
+ MLX4_SET_PORT_MAC_TABLE = 0x2,
+ MLX4_SET_PORT_VLAN_TABLE = 0x3,
+ MLX4_SET_PORT_PRIO_MAP = 0x4,
+ MLX4_SET_PORT_GID_TABLE = 0x5,
+};
+
+struct mlx4_dev;
+
+struct mlx4_cmd_mailbox {
+ void *buf;
+ dma_addr_t dma;
+};
+
+int __mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param,
+ int out_is_imm, u32 in_modifier, u8 op_modifier,
+ u16 op, unsigned long timeout);
+
+/* Invoke a command with no output parameter */
+static inline int mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u32 in_modifier,
+ u8 op_modifier, u16 op, unsigned long timeout)
+{
+ return __mlx4_cmd(dev, in_param, NULL, 0, in_modifier,
+ op_modifier, op, timeout);
+}
+
+/* Invoke a command with an output mailbox */
+static inline int mlx4_cmd_box(struct mlx4_dev *dev, u64 in_param, u64 out_param,
+ u32 in_modifier, u8 op_modifier, u16 op,
+ unsigned long timeout)
+{
+ return __mlx4_cmd(dev, in_param, &out_param, 0, in_modifier,
+ op_modifier, op, timeout);
+}
+
+/*
+ * Invoke a command with an immediate output parameter (and copy the
+ * output into the caller's out_param pointer after the command
+ * executes).
+ */
+static inline int mlx4_cmd_imm(struct mlx4_dev *dev, u64 in_param, u64 *out_param,
+ u32 in_modifier, u8 op_modifier, u16 op,
+ unsigned long timeout)
+{
+ return __mlx4_cmd(dev, in_param, out_param, 1, in_modifier,
+ op_modifier, op, timeout);
+}
+
+struct mlx4_cmd_mailbox *mlx4_alloc_cmd_mailbox(struct mlx4_dev *dev);
+void mlx4_free_cmd_mailbox(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox);
+
+#endif /* MLX4_CMD_H */
diff --git a/sys/ofed/include/linux/mlx4/cq.h b/sys/ofed/include/linux/mlx4/cq.h
new file mode 100644
index 0000000..6f65b2c
--- /dev/null
+++ b/sys/ofed/include/linux/mlx4/cq.h
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_CQ_H
+#define MLX4_CQ_H
+
+#include <linux/types.h>
+
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/doorbell.h>
+
+struct mlx4_cqe {
+ __be32 vlan_my_qpn;
+ __be32 immed_rss_invalid;
+ __be32 g_mlpath_rqpn;
+ __be16 sl_vid;
+ __be16 rlid;
+ __be16 status;
+ u8 ipv6_ext_mask;
+ u8 badfcs_enc;
+ __be32 byte_cnt;
+ __be16 wqe_index;
+ __be16 checksum;
+ u8 reserved[3];
+ u8 owner_sr_opcode;
+};
+
+struct mlx4_err_cqe {
+ __be32 my_qpn;
+ u32 reserved1[5];
+ __be16 wqe_index;
+ u8 vendor_err_syndrome;
+ u8 syndrome;
+ u8 reserved2[3];
+ u8 owner_sr_opcode;
+};
+
+enum {
+ MLX4_CQE_VLAN_PRESENT_MASK = 1 << 29,
+ MLX4_CQE_QPN_MASK = 0xffffff,
+};
+
+enum {
+ MLX4_CQE_OWNER_MASK = 0x80,
+ MLX4_CQE_IS_SEND_MASK = 0x40,
+ MLX4_CQE_OPCODE_MASK = 0x1f
+};
+
+enum {
+ MLX4_CQE_SYNDROME_LOCAL_LENGTH_ERR = 0x01,
+ MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR = 0x02,
+ MLX4_CQE_SYNDROME_LOCAL_PROT_ERR = 0x04,
+ MLX4_CQE_SYNDROME_WR_FLUSH_ERR = 0x05,
+ MLX4_CQE_SYNDROME_MW_BIND_ERR = 0x06,
+ MLX4_CQE_SYNDROME_BAD_RESP_ERR = 0x10,
+ MLX4_CQE_SYNDROME_LOCAL_ACCESS_ERR = 0x11,
+ MLX4_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR = 0x12,
+ MLX4_CQE_SYNDROME_REMOTE_ACCESS_ERR = 0x13,
+ MLX4_CQE_SYNDROME_REMOTE_OP_ERR = 0x14,
+ MLX4_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR = 0x15,
+ MLX4_CQE_SYNDROME_RNR_RETRY_EXC_ERR = 0x16,
+ MLX4_CQE_SYNDROME_REMOTE_ABORTED_ERR = 0x22,
+};
+
+enum {
+ MLX4_CQE_STATUS_IPV4 = 1 << 6,
+ MLX4_CQE_STATUS_IPV4F = 1 << 7,
+ MLX4_CQE_STATUS_IPV6 = 1 << 8,
+ MLX4_CQE_STATUS_IPV4OPT = 1 << 9,
+ MLX4_CQE_STATUS_TCP = 1 << 10,
+ MLX4_CQE_STATUS_UDP = 1 << 11,
+ MLX4_CQE_STATUS_IPOK = 1 << 12,
+};
+
+enum {
+ MLX4_CQE_LLC = 1,
+ MLX4_CQE_SNAP = 1 << 1,
+ MLX4_CQE_BAD_FCS = 1 << 4,
+};
+
+static inline void mlx4_cq_arm(struct mlx4_cq *cq, u32 cmd,
+ void __iomem *uar_page,
+ spinlock_t *doorbell_lock)
+{
+ __be32 doorbell[2];
+ u32 sn;
+ u32 ci;
+
+ sn = cq->arm_sn & 3;
+ ci = cq->cons_index & 0xffffff;
+
+ *cq->arm_db = cpu_to_be32(sn << 28 | cmd | ci);
+
+ /*
+ * Make sure that the doorbell record in host memory is
+ * written before ringing the doorbell via PCI MMIO.
+ */
+ wmb();
+
+ doorbell[0] = cpu_to_be32(sn << 28 | cmd | cq->cqn);
+ doorbell[1] = cpu_to_be32(ci);
+
+ mlx4_write64(doorbell, uar_page + MLX4_CQ_DOORBELL, doorbell_lock);
+}
+
+static inline void mlx4_cq_set_ci(struct mlx4_cq *cq)
+{
+ *cq->set_ci_db = cpu_to_be32(cq->cons_index & 0xffffff);
+}
+
+enum {
+ MLX4_CQ_DB_REQ_NOT_SOL = 1 << 24,
+ MLX4_CQ_DB_REQ_NOT = 2 << 24
+};
+
+int mlx4_cq_modify(struct mlx4_dev *dev, struct mlx4_cq *cq,
+ u16 count, u16 period);
+int mlx4_cq_resize(struct mlx4_dev *dev, struct mlx4_cq *cq,
+ int entries, struct mlx4_mtt *mtt);
+
+#endif /* MLX4_CQ_H */
diff --git a/sys/ofed/include/linux/mlx4/device.h b/sys/ofed/include/linux/mlx4/device.h
new file mode 100644
index 0000000..5272e5f
--- /dev/null
+++ b/sys/ofed/include/linux/mlx4/device.h
@@ -0,0 +1,619 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_DEVICE_H
+#define MLX4_DEVICE_H
+
+#include <linux/pci.h>
+#include <linux/completion.h>
+#include <linux/radix-tree.h>
+
+#include <asm/atomic.h>
+
+#include <linux/mlx4/driver.h>
+
+enum {
+ MLX4_FLAG_MSI_X = 1 << 0,
+ MLX4_FLAG_OLD_PORT_CMDS = 1 << 1,
+};
+
+enum {
+ MLX4_MAX_PORTS = 2
+};
+
+enum {
+ MLX4_BOARD_ID_LEN = 64
+};
+
+enum {
+ MLX4_DEV_CAP_FLAG_RC = 1 << 0,
+ MLX4_DEV_CAP_FLAG_UC = 1 << 1,
+ MLX4_DEV_CAP_FLAG_UD = 1 << 2,
+ MLX4_DEV_CAP_FLAG_XRC = 1 << 3,
+ MLX4_DEV_CAP_FLAG_SRQ = 1 << 6,
+ MLX4_DEV_CAP_FLAG_IPOIB_CSUM = 1 << 7,
+ MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR = 1 << 8,
+ MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR = 1 << 9,
+ MLX4_DEV_CAP_FLAG_DPDP = 1 << 12,
+ MLX4_DEV_CAP_FLAG_RAW_ETY = 1 << 13,
+ MLX4_DEV_CAP_FLAG_BLH = 1 << 15,
+ MLX4_DEV_CAP_FLAG_MEM_WINDOW = 1 << 16,
+ MLX4_DEV_CAP_FLAG_APM = 1 << 17,
+ MLX4_DEV_CAP_FLAG_ATOMIC = 1 << 18,
+ MLX4_DEV_CAP_FLAG_RAW_MCAST = 1 << 19,
+ MLX4_DEV_CAP_FLAG_UD_AV_PORT = 1 << 20,
+ MLX4_DEV_CAP_FLAG_UD_MCAST = 1 << 21,
+ MLX4_DEV_CAP_FLAG_IBOE = 1 << 30,
+ MLX4_DEV_CAP_FLAG_FC_T11 = 1 << 31
+};
+
+enum {
+ MLX4_BMME_FLAG_LOCAL_INV = 1 << 6,
+ MLX4_BMME_FLAG_REMOTE_INV = 1 << 7,
+ MLX4_BMME_FLAG_TYPE_2_WIN = 1 << 9,
+ MLX4_BMME_FLAG_RESERVED_LKEY = 1 << 10,
+ MLX4_BMME_FLAG_FAST_REG_WR = 1 << 11,
+};
+
+enum mlx4_event {
+ MLX4_EVENT_TYPE_COMP = 0x00,
+ MLX4_EVENT_TYPE_PATH_MIG = 0x01,
+ MLX4_EVENT_TYPE_COMM_EST = 0x02,
+ MLX4_EVENT_TYPE_SQ_DRAINED = 0x03,
+ MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE = 0x13,
+ MLX4_EVENT_TYPE_SRQ_LIMIT = 0x14,
+ MLX4_EVENT_TYPE_CQ_ERROR = 0x04,
+ MLX4_EVENT_TYPE_WQ_CATAS_ERROR = 0x05,
+ MLX4_EVENT_TYPE_EEC_CATAS_ERROR = 0x06,
+ MLX4_EVENT_TYPE_PATH_MIG_FAILED = 0x07,
+ MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR = 0x10,
+ MLX4_EVENT_TYPE_WQ_ACCESS_ERROR = 0x11,
+ MLX4_EVENT_TYPE_SRQ_CATAS_ERROR = 0x12,
+ MLX4_EVENT_TYPE_LOCAL_CATAS_ERROR = 0x08,
+ MLX4_EVENT_TYPE_PORT_CHANGE = 0x09,
+ MLX4_EVENT_TYPE_EQ_OVERFLOW = 0x0f,
+ MLX4_EVENT_TYPE_ECC_DETECT = 0x0e,
+ MLX4_EVENT_TYPE_CMD = 0x0a
+};
+
+enum {
+ MLX4_PORT_CHANGE_SUBTYPE_DOWN = 1,
+ MLX4_PORT_CHANGE_SUBTYPE_ACTIVE = 4
+};
+
+enum {
+ MLX4_PERM_LOCAL_READ = 1 << 10,
+ MLX4_PERM_LOCAL_WRITE = 1 << 11,
+ MLX4_PERM_REMOTE_READ = 1 << 12,
+ MLX4_PERM_REMOTE_WRITE = 1 << 13,
+ MLX4_PERM_ATOMIC = 1 << 14
+};
+
+enum {
+ MLX4_OPCODE_NOP = 0x00,
+ MLX4_OPCODE_SEND_INVAL = 0x01,
+ MLX4_OPCODE_RDMA_WRITE = 0x08,
+ MLX4_OPCODE_RDMA_WRITE_IMM = 0x09,
+ MLX4_OPCODE_SEND = 0x0a,
+ MLX4_OPCODE_SEND_IMM = 0x0b,
+ MLX4_OPCODE_LSO = 0x0e,
+ MLX4_OPCODE_BIG_LSO = 0x2e,
+ MLX4_OPCODE_RDMA_READ = 0x10,
+ MLX4_OPCODE_ATOMIC_CS = 0x11,
+ MLX4_OPCODE_ATOMIC_FA = 0x12,
+ MLX4_OPCODE_MASKED_ATOMIC_CS = 0x14,
+ MLX4_OPCODE_MASKED_ATOMIC_FA = 0x15,
+ MLX4_OPCODE_BIND_MW = 0x18,
+ MLX4_OPCODE_FMR = 0x19,
+ MLX4_OPCODE_LOCAL_INVAL = 0x1b,
+ MLX4_OPCODE_CONFIG_CMD = 0x1f,
+
+ MLX4_RECV_OPCODE_RDMA_WRITE_IMM = 0x00,
+ MLX4_RECV_OPCODE_SEND = 0x01,
+ MLX4_RECV_OPCODE_SEND_IMM = 0x02,
+ MLX4_RECV_OPCODE_SEND_INVAL = 0x03,
+
+ MLX4_CQE_OPCODE_ERROR = 0x1e,
+ MLX4_CQE_OPCODE_RESIZE = 0x16,
+};
+
+enum {
+ MLX4_STAT_RATE_OFFSET = 5
+};
+
+enum {
+ MLX4_MTT_FLAG_PRESENT = 1
+};
+
+enum mlx4_qp_region {
+ MLX4_QP_REGION_FW = 0,
+ MLX4_QP_REGION_ETH_ADDR,
+ MLX4_QP_REGION_FC_ADDR,
+ MLX4_NUM_QP_REGION
+};
+
+enum mlx4_port_type {
+ MLX4_PORT_TYPE_NONE = 0,
+ MLX4_PORT_TYPE_IB = 1,
+ MLX4_PORT_TYPE_ETH = 2,
+ MLX4_PORT_TYPE_AUTO = 3
+};
+
+enum mlx4_special_vlan_idx {
+ MLX4_NO_VLAN_IDX = 0,
+ MLX4_VLAN_MISS_IDX,
+ MLX4_VLAN_REGULAR
+};
+#define MLX4_LEAST_ATTACHED_VECTOR 0xffffffff
+
+enum {
+ MLX4_CUNTERS_DISABLED,
+ MLX4_CUNTERS_BASIC,
+ MLX4_CUNTERS_EXT
+};
+
+enum {
+ MAX_FAST_REG_PAGES = 511,
+};
+
+static inline u64 mlx4_fw_ver(u64 major, u64 minor, u64 subminor)
+{
+ return (major << 32) | (minor << 16) | subminor;
+}
+
+struct mlx4_caps {
+ u64 fw_ver;
+ int num_ports;
+ int vl_cap[MLX4_MAX_PORTS + 1];
+ int ib_mtu_cap[MLX4_MAX_PORTS + 1];
+ __be32 ib_port_def_cap[MLX4_MAX_PORTS + 1];
+ u64 def_mac[MLX4_MAX_PORTS + 1];
+ int eth_mtu_cap[MLX4_MAX_PORTS + 1];
+ int gid_table_len[MLX4_MAX_PORTS + 1];
+ int pkey_table_len[MLX4_MAX_PORTS + 1];
+ int trans_type[MLX4_MAX_PORTS + 1];
+ int vendor_oui[MLX4_MAX_PORTS + 1];
+ int wavelength[MLX4_MAX_PORTS + 1];
+ u64 trans_code[MLX4_MAX_PORTS + 1];
+ int local_ca_ack_delay;
+ int num_uars;
+ int bf_reg_size;
+ int bf_regs_per_page;
+ int max_sq_sg;
+ int max_rq_sg;
+ int num_qps;
+ int max_wqes;
+ int max_sq_desc_sz;
+ int max_rq_desc_sz;
+ int max_qp_init_rdma;
+ int max_qp_dest_rdma;
+ int sqp_start;
+ int num_srqs;
+ int max_srq_wqes;
+ int max_srq_sge;
+ int reserved_srqs;
+ int num_cqs;
+ int max_cqes;
+ int reserved_cqs;
+ int num_eqs;
+ int reserved_eqs;
+ int num_comp_vectors;
+ int num_mpts;
+ int num_mtt_segs;
+ int mtts_per_seg;
+ int fmr_reserved_mtts;
+ int reserved_mtts;
+ int reserved_mrws;
+ int reserved_uars;
+ int num_mgms;
+ int num_amgms;
+ int reserved_mcgs;
+ int num_qp_per_mgm;
+ int num_pds;
+ int reserved_pds;
+ int mtt_entry_sz;
+ int reserved_xrcds;
+ int max_xrcds;
+ u32 max_msg_sz;
+ u32 page_size_cap;
+ u64 flags;
+ u32 bmme_flags;
+ u32 reserved_lkey;
+ u16 stat_rate_support;
+ int udp_rss;
+ int loopback_support;
+ u8 port_width_cap[MLX4_MAX_PORTS + 1];
+ int max_gso_sz;
+ int reserved_qps_cnt[MLX4_NUM_QP_REGION];
+ int reserved_qps;
+ int reserved_qps_base[MLX4_NUM_QP_REGION];
+ int log_num_macs;
+ int log_num_vlans;
+ int log_num_prios;
+ enum mlx4_port_type port_type[MLX4_MAX_PORTS + 1];
+ u8 supported_type[MLX4_MAX_PORTS + 1];
+ enum mlx4_port_type port_mask[MLX4_MAX_PORTS + 1];
+ enum mlx4_port_type possible_type[MLX4_MAX_PORTS + 1];
+ u8 counters_mode;
+ u32 max_basic_counters;
+ u32 max_ext_counters;
+ u32 mc_promisc_mode;
+};
+
+struct mlx4_buf_list {
+ void *buf;
+ dma_addr_t map;
+};
+
+struct mlx4_buf {
+ struct mlx4_buf_list direct;
+ struct mlx4_buf_list *page_list;
+ int nbufs;
+ int npages;
+ int page_shift;
+};
+
+struct mlx4_mtt {
+ u32 first_seg;
+ int order;
+ int page_shift;
+};
+
+enum {
+ MLX4_DB_PER_PAGE = PAGE_SIZE / 4
+};
+
+struct mlx4_db_pgdir {
+ struct list_head list;
+ DECLARE_BITMAP(order0, MLX4_DB_PER_PAGE);
+ DECLARE_BITMAP(order1, MLX4_DB_PER_PAGE / 2);
+ unsigned long *bits[2];
+ __be32 *db_page;
+ dma_addr_t db_dma;
+};
+
+struct mlx4_ib_user_db_page;
+
+struct mlx4_db {
+ __be32 *db;
+ union {
+ struct mlx4_db_pgdir *pgdir;
+ struct mlx4_ib_user_db_page *user_page;
+ } u;
+ dma_addr_t dma;
+ int index;
+ int order;
+};
+
+struct mlx4_hwq_resources {
+ struct mlx4_db db;
+ struct mlx4_mtt mtt;
+ struct mlx4_buf buf;
+};
+
+struct mlx4_mr {
+ struct mlx4_mtt mtt;
+ u64 iova;
+ u64 size;
+ u32 key;
+ u32 pd;
+ u32 access;
+ int enabled;
+};
+
+struct mlx4_fmr {
+ struct mlx4_mr mr;
+ struct mlx4_mpt_entry *mpt;
+ __be64 *mtts;
+ dma_addr_t dma_handle;
+ int max_pages;
+ int max_maps;
+ int maps;
+ u8 page_shift;
+};
+
+struct mlx4_uar {
+ unsigned long pfn;
+ int index;
+ struct list_head bf_list;
+ unsigned free_bf_bmap;
+ void __iomem *map;
+ void __iomem *bf_map;
+};
+
+struct mlx4_bf {
+ unsigned long offset;
+ int buf_size;
+ struct mlx4_uar *uar;
+ void __iomem *reg;
+};
+
+struct mlx4_cq {
+ void (*comp) (struct mlx4_cq *);
+ void (*event) (struct mlx4_cq *, enum mlx4_event);
+
+ struct mlx4_uar *uar;
+
+ u32 cons_index;
+
+ __be32 *set_ci_db;
+ __be32 *arm_db;
+ int arm_sn;
+
+ int cqn;
+ unsigned vector;
+
+ atomic_t refcount;
+ struct completion free;
+};
+
+struct mlx4_qp {
+ void (*event) (struct mlx4_qp *, enum mlx4_event);
+
+ int qpn;
+
+ atomic_t refcount;
+ struct completion free;
+};
+
+struct mlx4_srq {
+ void (*event) (struct mlx4_srq *, enum mlx4_event);
+
+ int srqn;
+ int max;
+ int max_gs;
+ int wqe_shift;
+
+ atomic_t refcount;
+ struct completion free;
+};
+
+struct mlx4_av {
+ __be32 port_pd;
+ u8 reserved1;
+ u8 g_slid;
+ __be16 dlid;
+ u8 reserved2;
+ u8 gid_index;
+ u8 stat_rate;
+ u8 hop_limit;
+ __be32 sl_tclass_flowlabel;
+ u8 dgid[16];
+};
+
+struct mlx4_eth_av {
+ __be32 port_pd;
+ u8 reserved1;
+ u8 smac_idx;
+ u16 reserved2;
+ u8 reserved3;
+ u8 gid_index;
+ u8 stat_rate;
+ u8 hop_limit;
+ __be32 sl_tclass_flowlabel;
+ u8 dgid[16];
+ u32 reserved4[2];
+ __be16 vlan;
+ u8 mac[6];
+};
+
+union mlx4_ext_av {
+ struct mlx4_av ib;
+ struct mlx4_eth_av eth;
+};
+
+struct mlx4_counters {
+ __be32 counter_mode;
+ __be32 num_ifc;
+ u32 reserved[2];
+ __be64 rx_frames;
+ __be64 rx_bytes;
+ __be64 tx_frames;
+ __be64 tx_bytes;
+};
+
+struct mlx4_counters_ext {
+ __be32 counter_mode;
+ __be32 num_ifc;
+ u32 reserved[2];
+ __be64 rx_uni_frames;
+ __be64 rx_uni_bytes;
+ __be64 rx_mcast_frames;
+ __be64 rx_mcast_bytes;
+ __be64 rx_bcast_frames;
+ __be64 rx_bcast_bytes;
+ __be64 rx_nobuf_frames;
+ __be64 rx_nobuf_bytes;
+ __be64 rx_err_frames;
+ __be64 rx_err_bytes;
+ __be64 tx_uni_frames;
+ __be64 tx_uni_bytes;
+ __be64 tx_mcast_frames;
+ __be64 tx_mcast_bytes;
+ __be64 tx_bcast_frames;
+ __be64 tx_bcast_bytes;
+ __be64 tx_nobuf_frames;
+ __be64 tx_nobuf_bytes;
+ __be64 tx_err_frames;
+ __be64 tx_err_bytes;
+};
+
+struct mlx4_dev {
+ struct pci_dev *pdev;
+ unsigned long flags;
+ struct mlx4_caps caps;
+ struct radix_tree_root qp_table_tree;
+ struct radix_tree_root srq_table_tree;
+ u32 rev_id;
+ char board_id[MLX4_BOARD_ID_LEN];
+};
+
+struct mlx4_init_port_param {
+ int set_guid0;
+ int set_node_guid;
+ int set_si_guid;
+ u16 mtu;
+ int port_width_cap;
+ u16 vl_cap;
+ u16 max_gid;
+ u16 max_pkey;
+ u64 guid0;
+ u64 node_guid;
+ u64 si_guid;
+};
+
+static inline void mlx4_query_steer_cap(struct mlx4_dev *dev, int *log_mac,
+ int *log_vlan, int *log_prio)
+{
+ *log_mac = dev->caps.log_num_macs;
+ *log_vlan = dev->caps.log_num_vlans;
+ *log_prio = dev->caps.log_num_prios;
+}
+
+#define mlx4_foreach_port(port, dev, type) \
+ for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++) \
+ if ((type) == (dev)->caps.port_mask[(port)])
+
+#define mlx4_foreach_ib_transport_port(port, dev) \
+ for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++) \
+ if (((dev)->caps.port_mask[port] == MLX4_PORT_TYPE_IB) || \
+ ((dev)->caps.flags & MLX4_DEV_CAP_FLAG_IBOE))
+
+int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct,
+ struct mlx4_buf *buf);
+void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf);
+static inline void *mlx4_buf_offset(struct mlx4_buf *buf, int offset)
+{
+ if (buf->direct.buf != NULL)
+ return buf->direct.buf + offset;
+ else
+ return buf->page_list[offset >> PAGE_SHIFT].buf +
+ (offset & (PAGE_SIZE - 1));
+}
+
+int mlx4_pd_alloc(struct mlx4_dev *dev, u32 *pdn);
+void mlx4_pd_free(struct mlx4_dev *dev, u32 pdn);
+
+int mlx4_xrcd_alloc(struct mlx4_dev *dev, u32 *xrcdn);
+void mlx4_xrcd_free(struct mlx4_dev *dev, u32 xrcdn);
+
+int mlx4_uar_alloc(struct mlx4_dev *dev, struct mlx4_uar *uar);
+void mlx4_uar_free(struct mlx4_dev *dev, struct mlx4_uar *uar);
+int mlx4_bf_alloc(struct mlx4_dev *dev, struct mlx4_bf *bf);
+void mlx4_bf_free(struct mlx4_dev *dev, struct mlx4_bf *bf);
+
+int mlx4_mtt_init(struct mlx4_dev *dev, int npages, int page_shift,
+ struct mlx4_mtt *mtt);
+void mlx4_mtt_cleanup(struct mlx4_dev *dev, struct mlx4_mtt *mtt);
+u64 mlx4_mtt_addr(struct mlx4_dev *dev, struct mlx4_mtt *mtt);
+
+int mlx4_mr_reserve_range(struct mlx4_dev *dev, int cnt, int align, u32 *base_mridx);
+void mlx4_mr_release_range(struct mlx4_dev *dev, u32 base_mridx, int cnt);
+int mlx4_mr_alloc_reserved(struct mlx4_dev *dev, u32 mridx, u32 pd,
+ u64 iova, u64 size, u32 access, int npages,
+ int page_shift, struct mlx4_mr *mr);
+int mlx4_mr_alloc(struct mlx4_dev *dev, u32 pd, u64 iova, u64 size, u32 access,
+ int npages, int page_shift, struct mlx4_mr *mr);
+void mlx4_mr_free_reserved(struct mlx4_dev *dev, struct mlx4_mr *mr);
+void mlx4_mr_free(struct mlx4_dev *dev, struct mlx4_mr *mr);
+int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr);
+int mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+ int start_index, int npages, u64 *page_list);
+int mlx4_buf_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+ struct mlx4_buf *buf);
+
+int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, int order);
+void mlx4_db_free(struct mlx4_dev *dev, struct mlx4_db *db);
+
+int mlx4_alloc_hwq_res(struct mlx4_dev *dev, struct mlx4_hwq_resources *wqres,
+ int size, int max_direct);
+void mlx4_free_hwq_res(struct mlx4_dev *mdev, struct mlx4_hwq_resources *wqres,
+ int size);
+
+int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt,
+ struct mlx4_uar *uar, u64 db_rec, struct mlx4_cq *cq,
+ unsigned vector, int collapsed);
+void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq);
+
+int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, int *base);
+void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt);
+
+int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp);
+void mlx4_qp_free(struct mlx4_dev *dev, struct mlx4_qp *qp);
+
+int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, u32 cqn, u16 xrcd,
+ struct mlx4_mtt *mtt, u64 db_rec, struct mlx4_srq *srq);
+void mlx4_srq_free(struct mlx4_dev *dev, struct mlx4_srq *srq);
+int mlx4_srq_arm(struct mlx4_dev *dev, struct mlx4_srq *srq, int limit_watermark);
+int mlx4_srq_query(struct mlx4_dev *dev, struct mlx4_srq *srq, int *limit_watermark);
+
+int mlx4_INIT_PORT(struct mlx4_dev *dev, int port);
+int mlx4_CLOSE_PORT(struct mlx4_dev *dev, int port);
+
+int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
+ int block_mcast_loopback, enum mlx4_mcast_prot prot);
+int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
+ enum mlx4_mcast_prot prot);
+
+int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *index);
+void mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, int index);
+
+int mlx4_find_cached_vlan(struct mlx4_dev *dev, u8 port, u16 vid, int *idx);
+int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index);
+void mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, int index);
+
+int mlx4_map_phys_fmr_fbo(struct mlx4_dev *dev, struct mlx4_fmr *fmr,
+ u64 *page_list, int npages, u64 iova, u32 fbo,
+ u32 len, u32 *lkey, u32 *rkey, int same_key);
+int mlx4_map_phys_fmr(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u64 *page_list,
+ int npages, u64 iova, u32 *lkey, u32 *rkey);
+int mlx4_fmr_alloc_reserved(struct mlx4_dev *dev, u32 mridx, u32 pd,
+ u32 access, int max_pages, int max_maps,
+ u8 page_shift, struct mlx4_fmr *fmr);
+int mlx4_fmr_alloc(struct mlx4_dev *dev, u32 pd, u32 access, int max_pages,
+ int max_maps, u8 page_shift, struct mlx4_fmr *fmr);
+int mlx4_fmr_enable(struct mlx4_dev *dev, struct mlx4_fmr *fmr);
+void mlx4_fmr_unmap(struct mlx4_dev *dev, struct mlx4_fmr *fmr,
+ u32 *lkey, u32 *rkey);
+int mlx4_fmr_free_reserved(struct mlx4_dev *dev, struct mlx4_fmr *fmr);
+int mlx4_fmr_free(struct mlx4_dev *dev, struct mlx4_fmr *fmr);
+int mlx4_SYNC_TPT(struct mlx4_dev *dev);
+int mlx4_query_diag_counters(struct mlx4_dev *mlx4_dev, int array_length,
+ u8 op_modifier, u32 in_offset[], u32 counter_out[]);
+int mlx4_test_interrupts(struct mlx4_dev *dev);
+
+void mlx4_get_fc_t11_settings(struct mlx4_dev *dev, int *enable_pre_t11, int *t11_supported);
+
+int mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx);
+void mlx4_counter_free(struct mlx4_dev *dev, u32 idx);
+
+#endif /* MLX4_DEVICE_H */
diff --git a/sys/ofed/include/linux/mlx4/doorbell.h b/sys/ofed/include/linux/mlx4/doorbell.h
new file mode 100644
index 0000000..f31bba2
--- /dev/null
+++ b/sys/ofed/include/linux/mlx4/doorbell.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_DOORBELL_H
+#define MLX4_DOORBELL_H
+
+#include <linux/types.h>
+#include <linux/io.h>
+
+#define MLX4_SEND_DOORBELL 0x14
+#define MLX4_CQ_DOORBELL 0x20
+
+#if BITS_PER_LONG == 64
+/*
+ * Assume that we can just write a 64-bit doorbell atomically. s390
+ * actually doesn't have writeq() but S/390 systems don't even have
+ * PCI so we won't worry about it.
+ */
+
+#define MLX4_DECLARE_DOORBELL_LOCK(name)
+#define MLX4_INIT_DOORBELL_LOCK(ptr) do { } while (0)
+#define MLX4_GET_DOORBELL_LOCK(ptr) (NULL)
+
+static inline void mlx4_write64(__be32 val[2], void __iomem *dest,
+ spinlock_t *doorbell_lock)
+{
+ __raw_writeq(*(u64 *) val, dest);
+}
+
+#else
+
+/*
+ * Just fall back to a spinlock to protect the doorbell if
+ * BITS_PER_LONG is 32 -- there's no portable way to do atomic 64-bit
+ * MMIO writes.
+ */
+
+#define MLX4_DECLARE_DOORBELL_LOCK(name) spinlock_t name;
+#define MLX4_INIT_DOORBELL_LOCK(ptr) spin_lock_init(ptr)
+#define MLX4_GET_DOORBELL_LOCK(ptr) (ptr)
+
+static inline void mlx4_write64(__be32 val[2], void __iomem *dest,
+ spinlock_t *doorbell_lock)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(doorbell_lock, flags);
+ __raw_writel((__force u32) val[0], dest);
+ __raw_writel((__force u32) val[1], dest + 4);
+ spin_unlock_irqrestore(doorbell_lock, flags);
+}
+
+#endif
+
+#endif /* MLX4_DOORBELL_H */
diff --git a/sys/ofed/include/linux/mlx4/driver.h b/sys/ofed/include/linux/mlx4/driver.h
new file mode 100644
index 0000000..15c8319
--- /dev/null
+++ b/sys/ofed/include/linux/mlx4/driver.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_DRIVER_H
+#define MLX4_DRIVER_H
+
+#include <linux/device.h>
+
+struct mlx4_dev;
+
+enum mlx4_dev_event {
+ MLX4_DEV_EVENT_CATASTROPHIC_ERROR,
+ MLX4_DEV_EVENT_PORT_UP,
+ MLX4_DEV_EVENT_PORT_DOWN,
+ MLX4_DEV_EVENT_PORT_REINIT,
+};
+
+enum mlx4_query_reply {
+ MLX4_QUERY_NOT_MINE = -1,
+ MLX4_QUERY_MINE_NOPORT = 0
+};
+
+enum mlx4_prot {
+ MLX4_PROT_IB,
+ MLX4_PROT_EN,
+};
+
+enum mlx4_mcast_prot {
+ MLX4_MCAST_PROT_IB = 0,
+ MLX4_MCAST_PROT_EN = 1,
+};
+
+struct mlx4_interface {
+ void * (*add) (struct mlx4_dev *dev);
+ void (*remove)(struct mlx4_dev *dev, void *context);
+ void (*event) (struct mlx4_dev *dev, void *context,
+ enum mlx4_dev_event event, int port);
+ void * (*get_prot_dev) (struct mlx4_dev *dev, void *context, u8 port);
+ enum mlx4_prot protocol;
+
+ enum mlx4_query_reply (*query) (void *context, void *);
+ struct list_head list;
+};
+
+int mlx4_register_interface(struct mlx4_interface *intf);
+void mlx4_unregister_interface(struct mlx4_interface *intf);
+void *mlx4_get_prot_dev(struct mlx4_dev *dev, enum mlx4_prot proto, int port);
+
+struct mlx4_dev *mlx4_query_interface(void *, int *port);
+void mlx4_set_iboe_counter(struct mlx4_dev *dev, int index, u8 port);
+int mlx4_get_iboe_counter(struct mlx4_dev *dev, u8 port);
+
+#endif /* MLX4_DRIVER_H */
diff --git a/sys/ofed/include/linux/mlx4/qp.h b/sys/ofed/include/linux/mlx4/qp.h
new file mode 100644
index 0000000..3fe2bc5
--- /dev/null
+++ b/sys/ofed/include/linux/mlx4/qp.h
@@ -0,0 +1,346 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_QP_H
+#define MLX4_QP_H
+
+#include <linux/types.h>
+
+#include <linux/mlx4/device.h>
+
+#define MLX4_INVALID_LKEY 0x100
+
+enum mlx4_qp_optpar {
+ MLX4_QP_OPTPAR_ALT_ADDR_PATH = 1 << 0,
+ MLX4_QP_OPTPAR_RRE = 1 << 1,
+ MLX4_QP_OPTPAR_RAE = 1 << 2,
+ MLX4_QP_OPTPAR_RWE = 1 << 3,
+ MLX4_QP_OPTPAR_PKEY_INDEX = 1 << 4,
+ MLX4_QP_OPTPAR_Q_KEY = 1 << 5,
+ MLX4_QP_OPTPAR_RNR_TIMEOUT = 1 << 6,
+ MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH = 1 << 7,
+ MLX4_QP_OPTPAR_SRA_MAX = 1 << 8,
+ MLX4_QP_OPTPAR_RRA_MAX = 1 << 9,
+ MLX4_QP_OPTPAR_PM_STATE = 1 << 10,
+ MLX4_QP_OPTPAR_RETRY_COUNT = 1 << 12,
+ MLX4_QP_OPTPAR_RNR_RETRY = 1 << 13,
+ MLX4_QP_OPTPAR_ACK_TIMEOUT = 1 << 14,
+ MLX4_QP_OPTPAR_SCHED_QUEUE = 1 << 16,
+ MLX4_QP_OPTPAR_COUNTER_INDEX = 1 << 20
+};
+
+enum mlx4_qp_state {
+ MLX4_QP_STATE_RST = 0,
+ MLX4_QP_STATE_INIT = 1,
+ MLX4_QP_STATE_RTR = 2,
+ MLX4_QP_STATE_RTS = 3,
+ MLX4_QP_STATE_SQER = 4,
+ MLX4_QP_STATE_SQD = 5,
+ MLX4_QP_STATE_ERR = 6,
+ MLX4_QP_STATE_SQ_DRAINING = 7,
+ MLX4_QP_NUM_STATE
+};
+
+enum {
+ MLX4_QP_ST_RC = 0x0,
+ MLX4_QP_ST_UC = 0x1,
+ MLX4_QP_ST_RD = 0x2,
+ MLX4_QP_ST_UD = 0x3,
+ MLX4_QP_ST_XRC = 0x6,
+ MLX4_QP_ST_MLX = 0x7
+};
+
+enum {
+ MLX4_QP_PM_MIGRATED = 0x3,
+ MLX4_QP_PM_ARMED = 0x0,
+ MLX4_QP_PM_REARM = 0x1
+};
+
+enum {
+ /* params1 */
+ MLX4_QP_BIT_SRE = 1 << 15,
+ MLX4_QP_BIT_SWE = 1 << 14,
+ MLX4_QP_BIT_SAE = 1 << 13,
+ /* params2 */
+ MLX4_QP_BIT_RRE = 1 << 15,
+ MLX4_QP_BIT_RWE = 1 << 14,
+ MLX4_QP_BIT_RAE = 1 << 13,
+ MLX4_QP_BIT_RIC = 1 << 4,
+};
+
+struct mlx4_qp_path {
+ u8 fl;
+ u8 reserved1[2];
+ u8 pkey_index;
+ u8 counter_index;
+ u8 grh_mylmc;
+ __be16 rlid;
+ u8 ackto;
+ u8 mgid_index;
+ u8 static_rate;
+ u8 hop_limit;
+ __be32 tclass_flowlabel;
+ u8 rgid[16];
+ u8 sched_queue;
+ u8 vlan_index;
+ u8 reserved3[2];
+ u8 reserved4[2];
+ u8 dmac[6];
+};
+
+struct mlx4_qp_context {
+ __be32 flags;
+ __be32 pd;
+ u8 mtu_msgmax;
+ u8 rq_size_stride;
+ u8 sq_size_stride;
+ u8 rlkey;
+ __be32 usr_page;
+ __be32 local_qpn;
+ __be32 remote_qpn;
+ struct mlx4_qp_path pri_path;
+ struct mlx4_qp_path alt_path;
+ __be32 params1;
+ u32 reserved1;
+ __be32 next_send_psn;
+ __be32 cqn_send;
+ u32 reserved2[2];
+ __be32 last_acked_psn;
+ __be32 ssn;
+ __be32 params2;
+ __be32 rnr_nextrecvpsn;
+ __be32 xrcd;
+ __be32 cqn_recv;
+ __be64 db_rec_addr;
+ __be32 qkey;
+ __be32 srqn;
+ __be32 msn;
+ __be16 rq_wqe_counter;
+ __be16 sq_wqe_counter;
+ u32 reserved3[2];
+ __be32 param3;
+ __be32 nummmcpeers_basemkey;
+ u8 log_page_size;
+ u8 reserved4[2];
+ u8 mtt_base_addr_h;
+ __be32 mtt_base_addr_l;
+ u8 VE;
+ u8 reserved5;
+ __be16 VFT_id_prio;
+ u8 reserved6;
+ u8 exch_size;
+ __be16 exch_base;
+ u8 VFT_hop_cnt;
+ u8 my_fc_id_idx;
+ __be16 reserved7;
+ u32 reserved8[7];
+};
+
+/* Which firmware version adds support for NEC (NoErrorCompletion) bit */
+#define MLX4_FW_VER_WQE_CTRL_NEC mlx4_fw_ver(2, 2, 232)
+
+enum {
+ MLX4_WQE_CTRL_NEC = 1 << 29,
+ MLX4_WQE_CTRL_FENCE = 1 << 6,
+ MLX4_WQE_CTRL_CQ_UPDATE = 3 << 2,
+ MLX4_WQE_CTRL_SOLICITED = 1 << 1,
+ MLX4_WQE_CTRL_IP_CSUM = 1 << 4,
+ MLX4_WQE_CTRL_TCP_UDP_CSUM = 1 << 5,
+ MLX4_WQE_CTRL_INS_VLAN = 1 << 6,
+ MLX4_WQE_CTRL_STRONG_ORDER = 1 << 7,
+ MLX4_WQE_CTRL_FORCE_LOOPBACK = 1 << 0,
+};
+
+struct mlx4_wqe_ctrl_seg {
+ __be32 owner_opcode;
+ __be16 vlan_tag;
+ u8 ins_vlan;
+ u8 fence_size;
+ /*
+ * High 24 bits are SRC remote buffer; low 8 bits are flags:
+ * [7] SO (strong ordering)
+ * [5] TCP/UDP checksum
+ * [4] IP checksum
+ * [3:2] C (generate completion queue entry)
+ * [1] SE (solicited event)
+ */
+ __be32 srcrb_flags;
+ /*
+ * imm is immediate data for send/RDMA write w/ immediate;
+ * also invalidation key for send with invalidate; input
+ * modifier for WQEs on CCQs.
+ */
+ __be32 imm;
+};
+
+enum {
+ MLX4_WQE_MLX_VL15 = 1 << 17,
+ MLX4_WQE_MLX_SLR = 1 << 16,
+ MLX4_WQE_MLX_ICRC = 1 << 4
+};
+
+struct mlx4_wqe_mlx_seg {
+ u8 owner;
+ u8 reserved1[2];
+ u8 opcode;
+ u8 reserved2[3];
+ u8 size;
+ /*
+ * [17] VL15
+ * [16] SLR
+ * [15:12] static rate
+ * [11:8] SL
+ * [4] ICRC
+ * [3:2] C
+ * [0] FL (force loopback)
+ */
+ __be32 flags;
+ __be16 rlid;
+ u16 reserved3;
+};
+
+struct mlx4_wqe_datagram_seg {
+ __be32 av[8];
+ __be32 dqpn;
+ __be32 qkey;
+ __be16 vlan;
+ u8 mac[6];
+};
+
+struct mlx4_wqe_lso_seg {
+ __be32 mss_hdr_size;
+ __be32 header[0];
+};
+
+struct mlx4_wqe_bind_seg {
+ __be32 flags1;
+ __be32 flags2;
+ __be32 new_rkey;
+ __be32 lkey;
+ __be64 addr;
+ __be64 length;
+};
+
+enum {
+ MLX4_WQE_FMR_PERM_LOCAL_READ = 1 << 27,
+ MLX4_WQE_FMR_PERM_LOCAL_WRITE = 1 << 28,
+ MLX4_WQE_FMR_PERM_REMOTE_READ = 1 << 29,
+ MLX4_WQE_FMR_PERM_REMOTE_WRITE = 1 << 30,
+ MLX4_WQE_FMR_PERM_ATOMIC = 1 << 31
+};
+
+struct mlx4_wqe_fmr_seg {
+ __be32 flags;
+ __be32 mem_key;
+ __be64 buf_list;
+ __be64 start_addr;
+ __be64 reg_len;
+ __be32 offset;
+ __be32 page_size;
+ u32 reserved[2];
+};
+
+struct mlx4_wqe_fmr_ext_seg {
+ u8 flags;
+ u8 reserved;
+ __be16 app_mask;
+ __be16 wire_app_tag;
+ __be16 mem_app_tag;
+ __be32 wire_ref_tag_base;
+ __be32 mem_ref_tag_base;
+};
+
+struct mlx4_wqe_local_inval_seg {
+ __be32 flags;
+ u32 reserved1;
+ __be32 mem_key;
+ u32 reserved2[2];
+ __be32 guest_id;
+ __be64 pa;
+};
+
+struct mlx4_wqe_raddr_seg {
+ __be64 raddr;
+ __be32 rkey;
+ u32 reserved;
+};
+
+struct mlx4_wqe_atomic_seg {
+ __be64 swap_add;
+ __be64 compare;
+};
+
+struct mlx4_wqe_masked_atomic_seg {
+ __be64 swap_add;
+ __be64 compare;
+ __be64 swap_add_mask;
+ __be64 compare_mask;
+};
+
+struct mlx4_wqe_data_seg {
+ __be32 byte_count;
+ __be32 lkey;
+ __be64 addr;
+};
+
+enum {
+ MLX4_INLINE_ALIGN = 64,
+ MLX4_INLINE_SEG = 1 << 31,
+};
+
+struct mlx4_wqe_inline_seg {
+ __be32 byte_count;
+};
+
+int mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+ enum mlx4_qp_state cur_state, enum mlx4_qp_state new_state,
+ struct mlx4_qp_context *context, enum mlx4_qp_optpar optpar,
+ int sqd_event, struct mlx4_qp *qp);
+
+int mlx4_qp_query(struct mlx4_dev *dev, struct mlx4_qp *qp,
+ struct mlx4_qp_context *context);
+
+int mlx4_qp_to_ready(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+ struct mlx4_qp_context *context,
+ struct mlx4_qp *qp, enum mlx4_qp_state *qp_state);
+
+static inline struct mlx4_qp *__mlx4_qp_lookup(struct mlx4_dev *dev, u32 qpn)
+{
+ return radix_tree_lookup(&dev->qp_table_tree, qpn & (dev->caps.num_qps - 1));
+}
+
+struct mlx4_qp *mlx4_qp_lookup_lock(struct mlx4_dev *dev, u32 qpn);
+void mlx4_qp_remove(struct mlx4_dev *dev, struct mlx4_qp *qp);
+int mlx4_qp_get_region(struct mlx4_dev *dev, enum mlx4_qp_region region,
+ int *base_qpn, int *cnt);
+
+#endif /* MLX4_QP_H */
diff --git a/sys/ofed/include/linux/mlx4/srq.h b/sys/ofed/include/linux/mlx4/srq.h
new file mode 100644
index 0000000..5e041e5
--- /dev/null
+++ b/sys/ofed/include/linux/mlx4/srq.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_SRQ_H
+#define MLX4_SRQ_H
+
+#include <linux/types.h>
+#include <linux/mlx4/device.h>
+
+struct mlx4_wqe_srq_next_seg {
+ u16 reserved1;
+ __be16 next_wqe_index;
+ u32 reserved2[3];
+};
+
+void mlx4_srq_invalidate(struct mlx4_dev *dev, struct mlx4_srq *srq);
+void mlx4_srq_remove(struct mlx4_dev *dev, struct mlx4_srq *srq);
+
+static inline struct mlx4_srq *__mlx4_srq_lookup(struct mlx4_dev *dev, u32 srqn)
+{
+ return radix_tree_lookup(&dev->srq_table_tree,
+ srqn & (dev->caps.num_srqs - 1));
+}
+
+#endif /* MLX4_SRQ_H */
diff --git a/sys/ofed/include/linux/mm.h b/sys/ofed/include/linux/mm.h
new file mode 100644
index 0000000..13b749b
--- /dev/null
+++ b/sys/ofed/include/linux/mm.h
@@ -0,0 +1,84 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_MM_H_
+#define _LINUX_MM_H_
+
+#include <linux/spinlock.h>
+#include <linux/gfp.h>
+#include <linux/kernel.h>
+
+#define PAGE_ALIGN(x) ALIGN(x, PAGE_SIZE)
+
+struct vm_area_struct {
+ vm_offset_t vm_start;
+ vm_offset_t vm_end;
+ vm_offset_t vm_pgoff;
+ vm_paddr_t vm_pfn; /* PFN For mmap. */
+ vm_memattr_t vm_page_prot;
+};
+
+/*
+ * Compute log2 of the power of two rounded up count of pages
+ * needed for size bytes.
+ */
+static inline int
+get_order(unsigned long size)
+{
+ int order;
+
+ size = (size - 1) >> PAGE_SHIFT;
+ order = 0;
+ while (size) {
+ order++;
+ size >>= 1;
+ }
+ return (order);
+}
+
+static inline void *
+lowmem_page_address(struct page *page)
+{
+
+ return page_address(page);
+}
+
+/*
+ * This only works via mmap ops.
+ */
+static inline int
+io_remap_pfn_range(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long pfn, unsigned long size,
+ vm_memattr_t prot)
+{
+ vma->vm_page_prot = prot;
+ vma->vm_pfn = pfn;
+
+ return (0);
+}
+
+#endif /* _LINUX_MM_H_ */
diff --git a/sys/ofed/include/linux/module.h b/sys/ofed/include/linux/module.h
new file mode 100644
index 0000000..1e3a682
--- /dev/null
+++ b/sys/ofed/include/linux/module.h
@@ -0,0 +1,87 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_MODULE_H_
+#define _LINUX_MODULE_H_
+
+#include <linux/list.h>
+#include <linux/compiler.h>
+#include <linux/kobject.h>
+#include <linux/moduleparam.h>
+#include <linux/slab.h>
+
+#define MODULE_AUTHOR(name)
+#define MODULE_DESCRIPTION(name)
+#define MODULE_LICENSE(name)
+#define MODULE_VERSION(name)
+
+#define THIS_MODULE ((struct module *)0)
+
+#define EXPORT_SYMBOL(name)
+#define EXPORT_SYMBOL_GPL(name)
+
+#include <sys/linker.h>
+
+static inline void
+_module_run(void *arg)
+{
+ void (*fn)(void);
+#ifdef OFED_DEBUG_INIT
+ char name[1024];
+ caddr_t pc;
+ long offset;
+
+ pc = (caddr_t)arg;
+ if (linker_search_symbol_name(pc, name, sizeof(name), &offset) != 0)
+ printf("Running ??? (%p)\n", pc);
+ else
+ printf("Running %s (%p)\n", name, pc);
+#endif
+ fn = arg;
+ DROP_GIANT();
+ fn();
+ PICKUP_GIANT();
+}
+
+#define module_init(fn) \
+ SYSINIT(fn, SI_SUB_RUN_SCHEDULER, SI_ORDER_FIRST, _module_run, (fn))
+
+/*
+ * XXX This is a freebsdism designed to work around not having a module
+ * load order resolver built in.
+ */
+#define module_init_order(fn, order) \
+ SYSINIT(fn, SI_SUB_RUN_SCHEDULER, (order), _module_run, (fn))
+
+#define module_exit(fn) \
+ SYSUNINIT(fn, SI_SUB_RUN_SCHEDULER, SI_ORDER_FIRST, _module_run, (fn))
+
+#define module_get(module)
+#define module_put(module)
+#define try_module_get(module) 1
+
+#endif /* _LINUX_MODULE_H_ */
diff --git a/sys/ofed/include/linux/moduleparam.h b/sys/ofed/include/linux/moduleparam.h
new file mode 100644
index 0000000..2c541a6
--- /dev/null
+++ b/sys/ofed/include/linux/moduleparam.h
@@ -0,0 +1,226 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_MODULEPARAM_H_
+#define _LINUX_MODULEPARAM_H_
+
+#include <linux/types.h>
+
+/*
+ * These are presently not hooked up to anything. In linux the parameters
+ * can be set when modules are loaded. On FreeBSD these could be mapped
+ * to kenv in the future.
+ */
+struct kernel_param;
+
+typedef int (*param_set_fn)(const char *val, struct kernel_param *kp);
+typedef int (*param_get_fn)(char *buffer, struct kernel_param *kp);
+
+struct kernel_param {
+ const char *name;
+ u16 perm;
+ u16 flags;
+ param_set_fn set;
+ param_get_fn get;
+ union {
+ void *arg;
+ struct kparam_string *str;
+ struct kparam_array *arr;
+ } un;
+};
+
+#define KPARAM_ISBOOL 2
+
+struct kparam_string {
+ unsigned int maxlen;
+ char *string;
+};
+
+struct kparam_array
+{
+ unsigned int max;
+ unsigned int *num;
+ param_set_fn set;
+ param_get_fn get;
+ unsigned int elemsize;
+ void *elem;
+};
+
+static inline void
+param_sysinit(struct kernel_param *param)
+{
+}
+
+#define module_param_call(name, set, get, arg, perm) \
+ static struct kernel_param __param_##name = \
+ { #name, perm, 0, set, get, { arg } }; \
+ SYSINIT(name##_param_sysinit, SI_SUB_DRIVERS, SI_ORDER_FIRST, \
+ param_sysinit, &__param_##name);
+
+#define module_param_named(name, var, type, mode) \
+ module_param_call(name, param_set_##type, param_get_##type, &var, mode)
+
+#define module_param(var, type, mode) \
+ module_param_named(var, var, type, mode)
+
+#define MODULE_PARM_DESC(name, desc)
+
+static inline int
+param_set_byte(const char *val, struct kernel_param *kp)
+{
+
+ return 0;
+}
+
+static inline int
+param_get_byte(char *buffer, struct kernel_param *kp)
+{
+
+ return 0;
+}
+
+
+static inline int
+param_set_short(const char *val, struct kernel_param *kp)
+{
+
+ return 0;
+}
+
+static inline int
+param_get_short(char *buffer, struct kernel_param *kp)
+{
+
+ return 0;
+}
+
+
+static inline int
+param_set_ushort(const char *val, struct kernel_param *kp)
+{
+
+ return 0;
+}
+
+static inline int
+param_get_ushort(char *buffer, struct kernel_param *kp)
+{
+
+ return 0;
+}
+
+
+static inline int
+param_set_int(const char *val, struct kernel_param *kp)
+{
+
+ return 0;
+}
+
+static inline int
+param_get_int(char *buffer, struct kernel_param *kp)
+{
+
+ return 0;
+}
+
+
+static inline int
+param_set_uint(const char *val, struct kernel_param *kp)
+{
+
+ return 0;
+}
+
+static inline int
+param_get_uint(char *buffer, struct kernel_param *kp)
+{
+
+ return 0;
+}
+
+
+static inline int
+param_set_long(const char *val, struct kernel_param *kp)
+{
+
+ return 0;
+}
+
+static inline int
+param_get_long(char *buffer, struct kernel_param *kp)
+{
+
+ return 0;
+}
+
+
+static inline int
+param_set_ulong(const char *val, struct kernel_param *kp)
+{
+
+ return 0;
+}
+
+static inline int
+param_get_ulong(char *buffer, struct kernel_param *kp)
+{
+
+ return 0;
+}
+
+
+static inline int
+param_set_charp(const char *val, struct kernel_param *kp)
+{
+
+ return 0;
+}
+
+static inline int
+param_get_charp(char *buffer, struct kernel_param *kp)
+{
+
+ return 0;
+}
+
+
+static inline int
+param_set_bool(const char *val, struct kernel_param *kp)
+{
+
+ return 0;
+}
+
+static inline int
+param_get_bool(char *buffer, struct kernel_param *kp)
+{
+
+ return 0;
+}
+
+#endif /* _LINUX_MODULEPARAM_H_ */
diff --git a/sys/ofed/include/linux/mount.h b/sys/ofed/include/linux/mount.h
new file mode 100644
index 0000000..33db94e
--- /dev/null
+++ b/sys/ofed/include/linux/mount.h
@@ -0,0 +1,33 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_MOUNT_H_
+#define _LINUX_MOUNT_H_
+
+
+#endif /* _LINUX_MOUNT_H_ */
diff --git a/sys/ofed/include/linux/mutex.h b/sys/ofed/include/linux/mutex.h
new file mode 100644
index 0000000..ef65816
--- /dev/null
+++ b/sys/ofed/include/linux/mutex.h
@@ -0,0 +1,61 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_MUTEX_H_
+#define _LINUX_MUTEX_H_
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/sx.h>
+
+#include <linux/spinlock.h>
+
+typedef struct mutex {
+ struct sx sx;
+} mutex_t;
+
+#define mutex_lock(_m) sx_xlock(&(_m)->sx)
+#define mutex_lock_nested(_m, _s) mutex_lock(_m)
+#define mutex_lock_interruptible(_m) ({ mutex_lock((_m)); 0; })
+#define mutex_unlock(_m) sx_xunlock(&(_m)->sx)
+#define mutex_trylock(_m) !!sx_try_xlock(&(_m)->sx)
+
+#define DEFINE_MUTEX(lock) \
+ mutex_t lock; \
+ SX_SYSINIT_FLAGS(lock, &(lock).sx, "lnxmtx", SX_NOWITNESS)
+
+static inline void
+linux_mutex_init(mutex_t *m)
+{
+
+ memset(&m->sx, 0, sizeof(m->sx));
+ sx_init_flags(&m->sx, "lnxmtx", SX_NOWITNESS);
+}
+
+#define mutex_init linux_mutex_init
+
+#endif /* _LINUX_MUTEX_H_ */
diff --git a/sys/ofed/include/linux/net.h b/sys/ofed/include/linux/net.h
new file mode 100644
index 0000000..6e2aff3
--- /dev/null
+++ b/sys/ofed/include/linux/net.h
@@ -0,0 +1,73 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_NET_H_
+#define _LINUX_NET_H_
+
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+
+static inline int
+sock_create_kern(int family, int type, int proto, struct socket **res)
+{
+ return -socreate(family, res, type, proto, curthread->td_ucred,
+ curthread);
+}
+
+static inline int
+sock_getname(struct socket *so, struct sockaddr *addr, int *sockaddr_len,
+ int peer)
+{
+ struct sockaddr **nam;
+ int error;
+
+ nam = NULL;
+ if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0)
+ return (-ENOTCONN);
+
+ if (peer)
+ error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, nam);
+ else
+ error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, nam);
+ if (error)
+ return (-error);
+ *addr = **nam;
+ *sockaddr_len = addr->sa_len;
+
+ free(*nam, M_SONAME);
+ return (0);
+}
+
+static inline void
+sock_release(struct socket *so)
+{
+ soclose(so);
+}
+
+#endif /* _LINUX_NET_H_ */
diff --git a/sys/ofed/include/linux/netdevice.h b/sys/ofed/include/linux/netdevice.h
new file mode 100644
index 0000000..b02a9dd
--- /dev/null
+++ b/sys/ofed/include/linux/netdevice.h
@@ -0,0 +1,159 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_NETDEVICE_H_
+#define _LINUX_NETDEVICE_H_
+
+#include <linux/types.h>
+
+#include <sys/socket.h>
+
+#include <net/if_types.h>
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_dl.h>
+
+#include <linux/completion.h>
+#include <linux/device.h>
+#include <linux/ethtool.h>
+#include <linux/workqueue.h>
+#include <linux/net.h>
+#include <linux/notifier.h>
+
+struct net {
+};
+
+extern struct net init_net;
+
+#define MAX_ADDR_LEN 20
+
+#define net_device ifnet
+
+#define dev_get_by_index(n, idx) ifnet_byindex_ref((idx))
+#define dev_hold(d) if_ref((d))
+#define dev_put(d) if_rele((d))
+
+#define netif_running(dev) !!((dev)->if_drv_flags & IFF_DRV_RUNNING)
+#define netif_oper_up(dev) !!((dev)->if_flags & IFF_UP)
+#define netif_carrier_ok(dev) netif_running(dev)
+
+static inline void *
+netdev_priv(const struct net_device *dev)
+{
+ return (dev->if_softc);
+}
+
+static inline void
+_handle_ifnet_link_event(void *arg, struct ifnet *ifp, int linkstate)
+{
+ struct notifier_block *nb;
+
+ nb = arg;
+ if (linkstate == LINK_STATE_UP)
+ nb->notifier_call(nb, NETDEV_UP, ifp);
+ else
+ nb->notifier_call(nb, NETDEV_DOWN, ifp);
+}
+
+static inline void
+_handle_ifnet_arrival_event(void *arg, struct ifnet *ifp)
+{
+ struct notifier_block *nb;
+
+ nb = arg;
+ nb->notifier_call(nb, NETDEV_REGISTER, ifp);
+}
+
+static inline void
+_handle_ifnet_departure_event(void *arg, struct ifnet *ifp)
+{
+ struct notifier_block *nb;
+
+ nb = arg;
+ nb->notifier_call(nb, NETDEV_UNREGISTER, ifp);
+}
+
+static inline int
+register_netdevice_notifier(struct notifier_block *nb)
+{
+
+ nb->tags[NETDEV_UP] = EVENTHANDLER_REGISTER(
+ ifnet_link_event, _handle_ifnet_link_event, nb, 0);
+ nb->tags[NETDEV_REGISTER] = EVENTHANDLER_REGISTER(
+ ifnet_arrival_event, _handle_ifnet_arrival_event, nb, 0);
+ nb->tags[NETDEV_UNREGISTER] = EVENTHANDLER_REGISTER(
+ ifnet_departure_event, _handle_ifnet_departure_event, nb, 0);
+ return (0);
+}
+
+static inline int
+unregister_netdevice_notifier(struct notifier_block *nb)
+{
+
+ EVENTHANDLER_DEREGISTER(ifnet_link_event, nb->tags[NETDEV_UP]);
+ EVENTHANDLER_DEREGISTER(ifnet_arrival_event, nb->tags[NETDEV_REGISTER]);
+ EVENTHANDLER_DEREGISTER(ifnet_departure_event,
+ nb->tags[NETDEV_UNREGISTER]);
+ return (0);
+}
+
+#define rtnl_lock()
+#define rtnl_unlock()
+
+static inline int
+dev_mc_delete(struct net_device *dev, void *addr, int alen, int all)
+{
+ struct sockaddr_dl sdl;
+
+ if (alen > sizeof(sdl.sdl_data))
+ return (-EINVAL);
+ memset(&sdl, 0, sizeof(sdl));
+ sdl.sdl_len = sizeof(sdl);
+ sdl.sdl_family = AF_LINK;
+ sdl.sdl_alen = alen;
+ memcpy(&sdl.sdl_data, addr, alen);
+
+ return -if_delmulti(dev, (struct sockaddr *)&sdl);
+}
+
+static inline int
+dev_mc_add(struct net_device *dev, void *addr, int alen, int newonly)
+{
+ struct sockaddr_dl sdl;
+
+ if (alen > sizeof(sdl.sdl_data))
+ return (-EINVAL);
+ memset(&sdl, 0, sizeof(sdl));
+ sdl.sdl_len = sizeof(sdl);
+ sdl.sdl_family = AF_LINK;
+ sdl.sdl_alen = alen;
+ memcpy(&sdl.sdl_data, addr, alen);
+
+ return -if_addmulti(dev, (struct sockaddr *)&sdl, NULL);
+}
+
+#endif /* _LINUX_NETDEVICE_H_ */
diff --git a/sys/ofed/include/linux/notifier.h b/sys/ofed/include/linux/notifier.h
new file mode 100644
index 0000000..eeef8e7
--- /dev/null
+++ b/sys/ofed/include/linux/notifier.h
@@ -0,0 +1,54 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_NOTIFIER_H_
+#define _LINUX_NOTIFIER_H_
+
+#include <sys/eventhandler.h>
+
+/*
+ * Max number of FreeBSD events to map to Linux events per notify type.
+ */
+#define NOTIFY_DONE 0
+#define _NOTIFY_COUNT 5
+
+struct notifier_block {
+ int (*notifier_call)(struct notifier_block *, unsigned long, void *);
+ struct notifier_block *next;
+ int priority;
+ eventhandler_tag tags[_NOTIFY_COUNT];
+};
+
+/* Values must be less than NOTIFY_COUNT */
+#define NETDEV_UP 0x0001
+#define NETDEV_DOWN 0x0002
+#define NETDEV_REGISTER 0x0003
+#define NETDEV_UNREGISTER 0x0004
+
+
+#endif /* _LINUX_NOTIFIER_H_ */
diff --git a/sys/ofed/include/linux/page.h b/sys/ofed/include/linux/page.h
new file mode 100644
index 0000000..0c9052c
--- /dev/null
+++ b/sys/ofed/include/linux/page.h
@@ -0,0 +1,49 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_PAGE_H_
+#define _LINUX_PAGE_H_
+
+#include <linux/types.h>
+
+#include <sys/param.h>
+
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+
+#define page vm_page
+
+#define virt_to_page(x) PHYS_TO_VM_PAGE(vtophys((x)))
+
+#define clear_page(page) memset((page), 0, PAGE_SIZE)
+#define pgprot_noncached(prot) VM_MEMATTR_UNCACHED
+#define pgprot_writecombine(prot) VM_MEMATTR_WRITE_COMBINING
+
+#undef PAGE_MASK
+#define PAGE_MASK (~(PAGE_SIZE-1))
+
+#endif /* _LINUX_PAGE_H_ */
diff --git a/sys/ofed/include/linux/pci.h b/sys/ofed/include/linux/pci.h
new file mode 100644
index 0000000..e4fb5f5
--- /dev/null
+++ b/sys/ofed/include/linux/pci.h
@@ -0,0 +1,580 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_PCI_H_
+#define _LINUX_PCI_H_
+
+#define CONFIG_PCI_MSI
+
+#include <linux/types.h>
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/pciio.h>
+#include <sys/rman.h>
+#include <dev/pci/pcivar.h>
+#include <dev/pci/pcireg.h>
+#include <dev/pci/pci_private.h>
+
+#include <machine/resource.h>
+
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/dmapool.h>
+#include <linux/dma-mapping.h>
+#include <linux/compiler.h>
+#include <linux/errno.h>
+#include <asm/atomic.h>
+#include <linux/device.h>
+
+struct pci_device_id {
+ uint32_t vendor;
+ uint32_t device;
+ uint32_t subvendor;
+ uint32_t subdevice;
+ uint32_t class_mask;
+ uintptr_t driver_data;
+};
+
+#define MODULE_DEVICE_TABLE(bus, table)
+#define PCI_ANY_ID (-1)
+#define PCI_VENDOR_ID_MELLANOX 0x15b3
+#define PCI_VENDOR_ID_TOPSPIN 0x1867
+#define PCI_DEVICE_ID_MELLANOX_TAVOR 0x5a44
+#define PCI_DEVICE_ID_MELLANOX_TAVOR_BRIDGE 0x5a46
+#define PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT 0x6278
+#define PCI_DEVICE_ID_MELLANOX_ARBEL 0x6282
+#define PCI_DEVICE_ID_MELLANOX_SINAI_OLD 0x5e8c
+#define PCI_DEVICE_ID_MELLANOX_SINAI 0x6274
+
+
+#define PCI_VDEVICE(vendor, device) \
+ PCI_VENDOR_ID_##vendor, (device), PCI_ANY_ID, PCI_ANY_ID, 0, 0
+#define PCI_DEVICE(vendor, device) \
+ (vendor), (device), PCI_ANY_ID, PCI_ANY_ID, 0, 0
+
+#define to_pci_dev(n) container_of(n, struct pci_dev, dev)
+
+#define PCI_VENDOR_ID PCIR_DEVVENDOR
+#define PCI_COMMAND PCIR_COMMAND
+#define PCI_EXP_DEVCTL PCIR_EXPRESS_DEVICE_CTL
+#define PCI_EXP_LNKCTL PCIR_EXPRESS_LINK_CTL
+
+#define IORESOURCE_MEM SYS_RES_MEMORY
+#define IORESOURCE_IO SYS_RES_IOPORT
+#define IORESOURCE_IRQ SYS_RES_IRQ
+
+struct pci_dev;
+
+struct pci_driver {
+ struct list_head links;
+ char *name;
+ struct pci_device_id *id_table;
+ int (*probe)(struct pci_dev *dev, const struct pci_device_id *id);
+ void (*remove)(struct pci_dev *dev);
+ driver_t driver;
+ devclass_t bsdclass;
+};
+
+extern struct list_head pci_drivers;
+extern struct list_head pci_devices;
+extern spinlock_t pci_lock;
+
+#define __devexit_p(x) x
+
+struct pci_dev {
+ struct device dev;
+ struct list_head links;
+ struct pci_driver *pdrv;
+ uint64_t dma_mask;
+ uint16_t device;
+ uint16_t vendor;
+ unsigned int irq;
+};
+
+static inline struct resource_list_entry *
+_pci_get_rle(struct pci_dev *pdev, int type, int rid)
+{
+ struct pci_devinfo *dinfo;
+ struct resource_list *rl;
+
+ dinfo = device_get_ivars(pdev->dev.bsddev);
+ rl = &dinfo->resources;
+ return resource_list_find(rl, type, rid);
+}
+
+static inline struct resource_list_entry *
+_pci_get_bar(struct pci_dev *pdev, int bar)
+{
+ struct resource_list_entry *rle;
+
+ bar = PCIR_BAR(bar);
+ if ((rle = _pci_get_rle(pdev, SYS_RES_MEMORY, bar)) == NULL)
+ rle = _pci_get_rle(pdev, SYS_RES_IOPORT, bar);
+ return (rle);
+}
+
+static inline struct device *
+_pci_find_irq_dev(unsigned int irq)
+{
+ struct pci_dev *pdev;
+
+ spin_lock(&pci_lock);
+ list_for_each_entry(pdev, &pci_devices, links) {
+ if (irq == pdev->dev.irq)
+ break;
+ if (irq >= pdev->dev.msix && irq < pdev->dev.msix_max)
+ break;
+ }
+ spin_unlock(&pci_lock);
+ if (pdev)
+ return &pdev->dev;
+ return (NULL);
+}
+
+static inline unsigned long
+pci_resource_start(struct pci_dev *pdev, int bar)
+{
+ struct resource_list_entry *rle;
+
+ if ((rle = _pci_get_bar(pdev, bar)) == NULL)
+ return (0);
+ return rle->start;
+}
+
+static inline unsigned long
+pci_resource_len(struct pci_dev *pdev, int bar)
+{
+ struct resource_list_entry *rle;
+
+ if ((rle = _pci_get_bar(pdev, bar)) == NULL)
+ return (0);
+ return rle->count;
+}
+
+/*
+ * All drivers just seem to want to inspect the type not flags.
+ */
+static inline int
+pci_resource_flags(struct pci_dev *pdev, int bar)
+{
+ struct resource_list_entry *rle;
+
+ if ((rle = _pci_get_bar(pdev, bar)) == NULL)
+ return (0);
+ return rle->type;
+}
+
+static inline const char *
+pci_name(struct pci_dev *d)
+{
+
+ return device_get_desc(d->dev.bsddev);
+}
+
+static inline void *
+pci_get_drvdata(struct pci_dev *pdev)
+{
+
+ return dev_get_drvdata(&pdev->dev);
+}
+
+static inline void
+pci_set_drvdata(struct pci_dev *pdev, void *data)
+{
+
+ dev_set_drvdata(&pdev->dev, data);
+}
+
+static inline int
+pci_enable_device(struct pci_dev *pdev)
+{
+
+ pci_enable_io(pdev->dev.bsddev, SYS_RES_IOPORT);
+ pci_enable_io(pdev->dev.bsddev, SYS_RES_MEMORY);
+ return (0);
+}
+
+static inline void
+pci_disable_device(struct pci_dev *pdev)
+{
+}
+
+static inline int
+pci_set_master(struct pci_dev *pdev)
+{
+
+ pci_enable_busmaster(pdev->dev.bsddev);
+ return (0);
+}
+
+static inline int
+pci_request_region(struct pci_dev *pdev, int bar, const char *res_name)
+{
+ int rid;
+ int type;
+
+ type = pci_resource_flags(pdev, bar);
+ if (type == 0)
+ return (-ENODEV);
+ rid = PCIR_BAR(bar);
+ if (bus_alloc_resource_any(pdev->dev.bsddev, type, &rid,
+ RF_ACTIVE) == NULL)
+ return (-EINVAL);
+ return (0);
+}
+
+static inline void
+pci_release_region(struct pci_dev *pdev, int bar)
+{
+ struct resource_list_entry *rle;
+
+ if ((rle = _pci_get_bar(pdev, bar)) == NULL)
+ return;
+ bus_release_resource(pdev->dev.bsddev, rle->type, rle->rid, rle->res);
+}
+
+static inline void
+pci_release_regions(struct pci_dev *pdev)
+{
+ int i;
+
+ for (i = 0; i <= PCIR_MAX_BAR_0; i++)
+ pci_release_region(pdev, i);
+}
+
+static inline int
+pci_request_regions(struct pci_dev *pdev, const char *res_name)
+{
+ int error;
+ int i;
+
+ for (i = 0; i <= PCIR_MAX_BAR_0; i++) {
+ error = pci_request_region(pdev, i, res_name);
+ if (error && error != -ENODEV) {
+ pci_release_regions(pdev);
+ return (error);
+ }
+ }
+ return (0);
+}
+
+static inline void
+pci_disable_msix(struct pci_dev *pdev)
+{
+
+ pci_release_msi(pdev->dev.bsddev);
+}
+
+#define PCI_CAP_ID_EXP PCIY_EXPRESS
+#define PCI_CAP_ID_PCIX PCIY_PCIX
+
+static inline int
+pci_find_capability(struct pci_dev *pdev, int capid)
+{
+ int reg;
+
+ if (pci_find_extcap(pdev->dev.bsddev, capid, &reg))
+ return (0);
+ return (reg);
+}
+
+static inline int
+pci_read_config_byte(struct pci_dev *pdev, int where, u8 *val)
+{
+
+ *val = (u8)pci_read_config(pdev->dev.bsddev, where, 1);
+ return (0);
+}
+
+static inline int
+pci_read_config_word(struct pci_dev *pdev, int where, u16 *val)
+{
+
+ *val = (u16)pci_read_config(pdev->dev.bsddev, where, 2);
+ return (0);
+}
+
+static inline int
+pci_read_config_dword(struct pci_dev *pdev, int where, u32 *val)
+{
+
+ *val = (u32)pci_read_config(pdev->dev.bsddev, where, 4);
+ return (0);
+}
+
+static inline int
+pci_write_config_byte(struct pci_dev *pdev, int where, u8 val)
+{
+
+ pci_write_config(pdev->dev.bsddev, where, val, 1);
+ return (0);
+}
+
+static inline int
+pci_write_config_word(struct pci_dev *pdev, int where, u16 val)
+{
+
+ pci_write_config(pdev->dev.bsddev, where, val, 2);
+ return (0);
+}
+
+static inline int
+pci_write_config_dword(struct pci_dev *pdev, int where, u32 val)
+{
+
+ pci_write_config(pdev->dev.bsddev, where, val, 4);
+ return (0);
+}
+
+static struct pci_driver *
+linux_pci_find(device_t dev, struct pci_device_id **idp)
+{
+ struct pci_device_id *id;
+ struct pci_driver *pdrv;
+ uint16_t vendor;
+ uint16_t device;
+
+ vendor = pci_get_vendor(dev);
+ device = pci_get_device(dev);
+
+ spin_lock(&pci_lock);
+ list_for_each_entry(pdrv, &pci_drivers, links) {
+ for (id = pdrv->id_table; id->vendor != 0; id++) {
+ if (vendor == id->vendor && device == id->device) {
+ *idp = id;
+ spin_unlock(&pci_lock);
+ return (pdrv);
+ }
+ }
+ }
+ spin_unlock(&pci_lock);
+ return (NULL);
+}
+
+static inline int
+linux_pci_probe(device_t dev)
+{
+ struct pci_device_id *id;
+ struct pci_driver *pdrv;
+
+ if ((pdrv = linux_pci_find(dev, &id)) == NULL)
+ return (ENXIO);
+ if (device_get_driver(dev) != &pdrv->driver)
+ return (ENXIO);
+ device_set_desc(dev, pdrv->name);
+ return (0);
+}
+
+static inline int
+linux_pci_attach(device_t dev)
+{
+ struct resource_list_entry *rle;
+ struct pci_dev *pdev;
+ struct pci_driver *pdrv;
+ struct pci_device_id *id;
+ int error;
+
+ pdrv = linux_pci_find(dev, &id);
+ pdev = device_get_softc(dev);
+ pdev->dev.parent = &linux_rootdev;
+ pdev->dev.bsddev = dev;
+ INIT_LIST_HEAD(&pdev->dev.irqents);
+ pdev->device = id->device;
+ pdev->vendor = id->vendor;
+ pdev->dev.dma_mask = &pdev->dma_mask;
+ pdev->pdrv = pdrv;
+ kobject_init(&pdev->dev.kobj, &dev_ktype);
+ kobject_set_name(&pdev->dev.kobj, device_get_nameunit(dev));
+ kobject_add(&pdev->dev.kobj, &linux_rootdev.kobj,
+ kobject_name(&pdev->dev.kobj));
+ rle = _pci_get_rle(pdev, SYS_RES_IRQ, 0);
+ if (rle)
+ pdev->dev.irq = rle->start;
+ else
+ pdev->dev.irq = 0;
+ pdev->irq = pdev->dev.irq;
+ mtx_unlock(&Giant);
+ spin_lock(&pci_lock);
+ list_add(&pdev->links, &pci_devices);
+ spin_unlock(&pci_lock);
+ error = pdrv->probe(pdev, id);
+ mtx_lock(&Giant);
+ if (error) {
+ spin_lock(&pci_lock);
+ list_del(&pdev->links);
+ spin_unlock(&pci_lock);
+ put_device(&pdev->dev);
+ return (-error);
+ }
+ return (0);
+}
+
+static inline int
+linux_pci_detach(device_t dev)
+{
+ struct pci_dev *pdev;
+
+ pdev = device_get_softc(dev);
+ mtx_unlock(&Giant);
+ pdev->pdrv->remove(pdev);
+ mtx_lock(&Giant);
+ spin_lock(&pci_lock);
+ list_del(&pdev->links);
+ spin_unlock(&pci_lock);
+ put_device(&pdev->dev);
+
+ return (0);
+}
+
+static device_method_t pci_methods[] = {
+ DEVMETHOD(device_probe, linux_pci_probe),
+ DEVMETHOD(device_attach, linux_pci_attach),
+ DEVMETHOD(device_detach, linux_pci_detach),
+ {0, 0}
+};
+
+static inline int
+pci_register_driver(struct pci_driver *pdrv)
+{
+ devclass_t bus;
+ int error;
+
+ spin_lock(&pci_lock);
+ list_add(&pdrv->links, &pci_drivers);
+ spin_unlock(&pci_lock);
+ bus = devclass_find("pci");
+ pdrv->driver.name = pdrv->name;
+ pdrv->driver.methods = pci_methods;
+ pdrv->driver.size = sizeof(struct pci_dev);
+ mtx_lock(&Giant);
+ error = devclass_add_driver(bus, &pdrv->driver, BUS_PASS_DEFAULT,
+ &pdrv->bsdclass);
+ mtx_unlock(&Giant);
+ if (error)
+ return (-error);
+ return (0);
+}
+
+static inline void
+pci_unregister_driver(struct pci_driver *pdrv)
+{
+ devclass_t bus;
+
+ list_del(&pdrv->links);
+ bus = devclass_find("pci");
+ mtx_lock(&Giant);
+ devclass_delete_driver(bus, &pdrv->driver);
+ mtx_unlock(&Giant);
+}
+
+struct msix_entry {
+ int entry;
+ int vector;
+};
+
+/*
+ * Enable msix, positive errors indicate actual number of available
+ * vectors. Negative errors are failures.
+ */
+static inline int
+pci_enable_msix(struct pci_dev *pdev, struct msix_entry *entries, int nreq)
+{
+ struct resource_list_entry *rle;
+ int error;
+ int avail;
+ int i;
+
+ avail = pci_msix_count(pdev->dev.bsddev);
+ if (avail < nreq) {
+ if (avail == 0)
+ return -EINVAL;
+ return avail;
+ }
+ avail = nreq;
+ if ((error = -pci_alloc_msix(pdev->dev.bsddev, &avail)) != 0)
+ return error;
+ rle = _pci_get_rle(pdev, SYS_RES_IRQ, 1);
+ pdev->dev.msix = rle->start;
+ pdev->dev.msix_max = rle->start + avail;
+ for (i = 0; i < nreq; i++)
+ entries[i].vector = pdev->dev.msix + i;
+ return (0);
+}
+
+/* XXX This should not be necessary. */
+#define pcix_set_mmrbc(d, v) 0
+#define pcix_get_max_mmrbc(d) 0
+#define pcie_set_readrq(d, v) 0
+
+#define PCI_DMA_BIDIRECTIONAL 0
+#define PCI_DMA_TODEVICE 1
+#define PCI_DMA_FROMDEVICE 2
+#define PCI_DMA_NONE 3
+
+#define pci_pool dma_pool
+#define pci_pool_destroy dma_pool_destroy
+#define pci_pool_alloc dma_pool_alloc
+#define pci_pool_free dma_pool_free
+#define pci_pool_create(_name, _pdev, _size, _align, _alloc) \
+ dma_pool_create(_name, &(_pdev)->dev, _size, _align, _alloc)
+#define pci_free_consistent(_hwdev, _size, _vaddr, _dma_handle) \
+ dma_free_coherent((_hwdev) == NULL ? NULL : &(_hwdev)->dev, \
+ _size, _vaddr, _dma_handle)
+#define pci_map_sg(_hwdev, _sg, _nents, _dir) \
+ dma_map_sg((_hwdev) == NULL ? NULL : &(_hwdev->dev), \
+ _sg, _nents, (enum dma_data_direction)_dir)
+#define pci_map_single(_hwdev, _ptr, _size, _dir) \
+ dma_map_single((_hwdev) == NULL ? NULL : &(_hwdev->dev), \
+ (_ptr), (_size), (enum dma_data_direction)_dir)
+#define pci_unmap_single(_hwdev, _addr, _size, _dir) \
+ dma_unmap_single((_hwdev) == NULL ? NULL : &(_hwdev)->dev, \
+ _addr, _size, (enum dma_data_direction)_dir)
+#define pci_unmap_sg(_hwdev, _sg, _nents, _dir) \
+ dma_unmap_sg((_hwdev) == NULL ? NULL : &(_hwdev)->dev, \
+ _sg, _nents, (enum dma_data_direction)_dir)
+#define pci_map_page(_hwdev, _page, _offset, _size, _dir) \
+ dma_map_page((_hwdev) == NULL ? NULL : &(_hwdev)->dev, _page,\
+ _offset, _size, (enum dma_data_direction)_dir)
+#define pci_unmap_page(_hwdev, _dma_address, _size, _dir) \
+ dma_unmap_page((_hwdev) == NULL ? NULL : &(_hwdev)->dev, \
+ _dma_address, _size, (enum dma_data_direction)_dir)
+#define pci_set_dma_mask(_pdev, mask) dma_set_mask(&(_pdev)->dev, (mask))
+#define pci_dma_mapping_error(_pdev, _dma_addr) \
+ dma_mapping_error(&(_pdev)->dev, _dma_addr)
+#define pci_set_consistent_dma_mask(_pdev, _mask) \
+ dma_set_coherent_mask(&(_pdev)->dev, (_mask))
+#define DECLARE_PCI_UNMAP_ADDR(x) DEFINE_DMA_UNMAP_ADDR(x);
+#define DECLARE_PCI_UNMAP_LEN(x) DEFINE_DMA_UNMAP_LEN(x);
+#define pci_unmap_addr dma_unmap_addr
+#define pci_unmap_addr_set dma_unmap_addr_set
+#define pci_unmap_len dma_unmap_len
+#define pci_unmap_len_set dma_unmap_len_set
+
+
+#endif /* _LINUX_PCI_H_ */
diff --git a/sys/ofed/include/linux/poll.h b/sys/ofed/include/linux/poll.h
new file mode 100644
index 0000000..5b7f34e
--- /dev/null
+++ b/sys/ofed/include/linux/poll.h
@@ -0,0 +1,44 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_POLL_H_
+#define _LINUX_POLL_H_
+
+#include <sys/poll.h>
+#include <sys/fcntl.h>
+
+typedef struct poll_table_struct {
+} poll_table;
+
+static inline void
+poll_wait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p)
+{
+ selrecord(curthread, &filp->f_selinfo);
+}
+
+#endif /* _LINUX_POLL_H_ */
diff --git a/sys/ofed/include/linux/radix-tree.h b/sys/ofed/include/linux/radix-tree.h
new file mode 100644
index 0000000..a02a90f
--- /dev/null
+++ b/sys/ofed/include/linux/radix-tree.h
@@ -0,0 +1,60 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_RADIX_TREE_H_
+#define _LINUX_RADIX_TREE_H_
+
+#define RADIX_TREE_MAP_SHIFT 6
+#define RADIX_TREE_MAP_SIZE (1 << RADIX_TREE_MAP_SHIFT)
+#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE - 1)
+#define RADIX_TREE_MAX_HEIGHT \
+ DIV_ROUND_UP((sizeof(long) * NBBY), RADIX_TREE_MAP_SHIFT)
+
+struct radix_tree_node {
+ void *slots[RADIX_TREE_MAP_SIZE];
+ int count;
+};
+
+struct radix_tree_root {
+ struct radix_tree_node *rnode;
+ gfp_t gfp_mask;
+ int height;
+};
+
+#define RADIX_TREE_INIT(mask) \
+ { .rnode = NULL, .gfp_mask = mask, .height = 0 };
+#define INIT_RADIX_TREE(root, mask) \
+ { (root)->rnode = NULL; (root)->gfp_mask = mask; (root)->height = 0; }
+#define RADIX_TREE(name, mask) \
+ struct radix_tree_root name = RADIX_TREE_INIT(mask)
+
+void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
+void *radix_tree_delete(struct radix_tree_root *, unsigned long);
+int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
+
+#endif /* _LINUX_RADIX_TREE_H_ */
diff --git a/sys/ofed/include/linux/random.h b/sys/ofed/include/linux/random.h
new file mode 100644
index 0000000..84a24c8
--- /dev/null
+++ b/sys/ofed/include/linux/random.h
@@ -0,0 +1,40 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_RANDOM_H_
+#define _LINUX_RANDOM_H_
+
+#include <sys/random.h>
+
+static inline void
+get_random_bytes(void *buf, int nbytes)
+{
+ read_random(buf, nbytes);
+}
+
+#endif /* _LINUX_RANDOM_H_ */
diff --git a/sys/ofed/include/linux/rbtree.h b/sys/ofed/include/linux/rbtree.h
new file mode 100644
index 0000000..ea9afc3
--- /dev/null
+++ b/sys/ofed/include/linux/rbtree.h
@@ -0,0 +1,111 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_RBTREE_H_
+#define _LINUX_RBTREE_H_
+
+#include <sys/stddef.h>
+#include <sys/tree.h>
+
+struct rb_node {
+ RB_ENTRY(rb_node) __entry;
+};
+#define rb_left __entry.rbe_left
+#define rb_right __entry.rbe_right
+
+/*
+ * We provide a false structure that has the same bit pattern as tree.h
+ * presents so it matches the member names expected by linux.
+ */
+struct rb_root {
+ struct rb_node *rb_node;
+};
+
+/*
+ * In linux all of the comparisons are done by the caller.
+ */
+int panic_cmp(struct rb_node *one, struct rb_node *two);
+
+RB_HEAD(linux_root, rb_node);
+RB_PROTOTYPE(linux_root, rb_node, __entry, panic_cmp);
+
+#define rb_parent(r) RB_PARENT(r, __entry)
+#define rb_color(r) RB_COLOR(r, __entry)
+#define rb_is_red(r) (rb_color(r) == RB_RED)
+#define rb_is_black(r) (rb_color(r) == RB_BLACK)
+#define rb_set_parent(r, p) rb_parent((r)) = (p)
+#define rb_set_color(r, c) rb_color((r)) = (c)
+#define rb_entry(ptr, type, member) container_of(ptr, type, member)
+
+#define RB_EMPTY_ROOT(root) RB_EMPTY((struct linux_root *)root)
+#define RB_EMPTY_NODE(node) (rb_parent(node) == node)
+#define RB_CLEAR_NODE(node) (rb_set_parent(node, node))
+
+#define rb_insert_color(node, root) \
+ linux_root_RB_INSERT_COLOR((struct linux_root *)(root), (node))
+#define rb_erase(node, root) \
+ linux_root_RB_REMOVE((struct linux_root *)(root), (node))
+#define rb_next(node) RB_NEXT(linux_root, NULL, (node))
+#define rb_prev(node) RB_PREV(linux_root, NULL, (node))
+#define rb_first(root) RB_MIN(linux_root, (struct linux_root *)(root))
+#define rb_last(root) RB_MAX(linux_root, (struct linux_root *)(root))
+
+static inline void
+rb_link_node(struct rb_node *node, struct rb_node *parent,
+ struct rb_node **rb_link)
+{
+ rb_set_parent(node, parent);
+ rb_set_color(node, RB_RED);
+ node->__entry.rbe_left = node->__entry.rbe_right = NULL;
+ *rb_link = node;
+}
+
+static inline void
+rb_replace_node(struct rb_node *victim, struct rb_node *new,
+ struct rb_root *root)
+{
+ struct rb_node *p;
+
+ p = rb_parent(victim);
+ if (p) {
+ if (p->rb_left == victim)
+ p->rb_left = new;
+ else
+ p->rb_right = new;
+ } else
+ root->rb_node = new;
+ if (victim->rb_left)
+ rb_set_parent(victim->rb_left, new);
+ if (victim->rb_right)
+ rb_set_parent(victim->rb_right, new);
+ *new = *victim;
+}
+
+#undef RB_ROOT
+#define RB_ROOT (struct rb_root) { NULL }
+
+#endif /* _LINUX_RBTREE_H_ */
diff --git a/sys/ofed/include/linux/rtnetlink.h b/sys/ofed/include/linux/rtnetlink.h
new file mode 100644
index 0000000..e5d814e
--- /dev/null
+++ b/sys/ofed/include/linux/rtnetlink.h
@@ -0,0 +1,27 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
diff --git a/sys/ofed/include/linux/rwlock.h b/sys/ofed/include/linux/rwlock.h
new file mode 100644
index 0000000..0162455
--- /dev/null
+++ b/sys/ofed/include/linux/rwlock.h
@@ -0,0 +1,63 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_RWLOCK_H_
+#define _LINUX_RWLOCK_H_
+
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+
+typedef struct {
+ struct rwlock rw;
+} rwlock_t;
+
+#define read_lock(_l) rw_rlock(&(_l)->rw)
+#define write_lock(_l) rw_wlock(&(_l)->rw)
+#define read_unlock(_l) rw_runlock(&(_l)->rw)
+#define write_unlock(_l) rw_wunlock(&(_l)->rw)
+#define read_lock_irq(lock) read_lock((lock))
+#define read_unlock_irq(lock) read_unlock((lock))
+#define write_lock_irq(lock) write_lock((lock))
+#define write_unlock_irq(lock) write_unlock((lock))
+#define read_lock_irqsave(lock, flags) \
+ do {(flags) = 0; read_lock(lock); } while (0)
+#define write_lock_irqsave(lock, flags) \
+ do {(flags) = 0; write_lock(lock); } while (0)
+#define read_unlock_irqrestore(lock, flags) \
+ do { read_unlock(lock); } while (0)
+#define write_unlock_irqrestore(lock, flags) \
+ do { write_unlock(lock); } while (0)
+
+static inline void
+rwlock_init(rwlock_t *lock)
+{
+
+ memset(&lock->rw, 0, sizeof(lock->rw));
+ rw_init_flags(&lock->rw, "lnxrw", RW_NOWITNESS);
+}
+
+#endif /* _LINUX_RWLOCK_H_ */
diff --git a/sys/ofed/include/linux/rwsem.h b/sys/ofed/include/linux/rwsem.h
new file mode 100644
index 0000000..f87c9d9
--- /dev/null
+++ b/sys/ofed/include/linux/rwsem.h
@@ -0,0 +1,56 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_RWSEM_H_
+#define _LINUX_RWSEM_H_
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/sx.h>
+
+struct rw_semaphore {
+ struct sx sx;
+};
+
+#define down_write(_rw) sx_xlock(&(_rw)->sx)
+#define up_write(_rw) sx_xunlock(&(_rw)->sx)
+#define down_read(_rw) sx_slock(&(_rw)->sx)
+#define up_read(_rw) sx_sunlock(&(_rw)->sx)
+#define down_read_trylock(_rw) !!sx_try_slock(&(_rw)->sx)
+#define down_write_trylock(_rw) !!sx_try_xlock(&(_rw)->sx)
+#define downgrade_write(_rw) sx_downgrade(&(_rw)->sx)
+#define down_read_nested(_rw, _sc) down_read(_rw)
+
+static inline void
+init_rwsem(struct rw_semaphore *rw)
+{
+
+ memset(&rw->sx, 0, sizeof(rw->sx));
+ sx_init_flags(&rw->sx, "lnxrwsem", SX_NOWITNESS);
+}
+
+#endif /* _LINUX_RWSEM_H_ */
diff --git a/sys/ofed/include/linux/scatterlist.h b/sys/ofed/include/linux/scatterlist.h
new file mode 100644
index 0000000..611ad56
--- /dev/null
+++ b/sys/ofed/include/linux/scatterlist.h
@@ -0,0 +1,98 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_SCATTERLIST_H_
+#define _LINUX_SCATTERLIST_H_
+
+#include <linux/string.h>
+#include <linux/page.h>
+
+struct scatterlist {
+ union {
+ struct page *page;
+ struct scatterlist *sg;
+ } sl_un;
+ unsigned long address;
+ unsigned long offset;
+ uint32_t length;
+ uint32_t flags;
+};
+
+#define sg_dma_address(sg) (sg)->address
+#define sg_dma_len(sg) (sg)->length
+#define sg_page(sg) (sg)->sl_un.page
+#define sg_scatternext(sg) (sg)->sl_un.sg
+
+#define SG_END 0x01
+#define SG_CHAIN 0x02
+
+static inline void
+sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len,
+ unsigned int offset)
+{
+ sg_page(sg) = page;
+ sg_dma_len(sg) = len;
+ sg->offset = offset;
+ if (offset > PAGE_SIZE)
+ panic("sg_set_page: Invalid offset %d\n", offset);
+}
+
+static inline void
+sg_set_buf(struct scatterlist *sg, const void *buf, unsigned int buflen)
+{
+ sg_set_page(sg, virt_to_page(buf), buflen,
+ ((uintptr_t)buf) & ~PAGE_MASK);
+}
+
+static inline void
+sg_init_table(struct scatterlist *sg, unsigned int nents)
+{
+ bzero(sg, sizeof(*sg) * nents);
+ sg[nents - 1].flags = SG_END;
+}
+
+static inline struct scatterlist *
+sg_next(struct scatterlist *sg)
+{
+ if (sg->flags & SG_END)
+ return (NULL);
+ sg++;
+ if (sg->flags & SG_CHAIN)
+ sg = sg_scatternext(sg);
+ return (sg);
+}
+
+static inline vm_paddr_t
+sg_phys(struct scatterlist *sg)
+{
+ return sg_page(sg)->phys_addr + sg->offset;
+}
+
+#define for_each_sg(sglist, sg, sgmax, _itr) \
+ for (_itr = 0, sg = (sglist); _itr < (sgmax); _itr++, sg = sg_next(sg))
+
+#endif /* _LINUX_SCATTERLIST_H_ */
diff --git a/sys/ofed/include/linux/sched.h b/sys/ofed/include/linux/sched.h
new file mode 100644
index 0000000..414b0ac
--- /dev/null
+++ b/sys/ofed/include/linux/sched.h
@@ -0,0 +1,109 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_SCHED_H_
+#define _LINUX_SCHED_H_
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/sched.h>
+#include <sys/sleepqueue.h>
+
+#define MAX_SCHEDULE_TIMEOUT LONG_MAX
+
+#define TASK_RUNNING 0
+#define TASK_INTERRUPTIBLE 1
+#define TASK_UNINTERRUPTIBLE 2
+#define TASK_DEAD 64
+#define TASK_WAKEKILL 128
+#define TASK_WAKING 256
+
+#define TASK_SHOULD_STOP 1
+#define TASK_STOPPED 2
+
+/*
+ * A task_struct is only provided for those tasks created with kthread.
+ * Using these routines with threads not started via kthread will cause
+ * panics because no task_struct is allocated and td_retval[1] is
+ * overwritten by syscalls which kernel threads will not make use of.
+ */
+struct task_struct {
+ struct thread *task_thread;
+ int (*task_fn)(void *data);
+ void *task_data;
+ int task_ret;
+ int state;
+ int should_stop;
+};
+
+#define current ((struct task_struct *)curthread->td_retval[1])
+#define task_struct_get(x) (struct task_struct *)(x)->td_retval[1]
+#define task_struct_set(x, y) (x)->td_retval[1] = (register_t)(y)
+
+#define set_current_state(x) \
+ atomic_store_rel_int((volatile int *)&current->state, (x))
+#define __set_current_state(x) current->state = (x)
+
+
+#define schedule() \
+do { \
+ void *c; \
+ \
+ if (cold) \
+ break; \
+ c = curthread; \
+ sleepq_lock(c); \
+ if (current->state == TASK_INTERRUPTIBLE || \
+ current->state == TASK_UNINTERRUPTIBLE) { \
+ sleepq_add(c, NULL, "task", SLEEPQ_SLEEP, 0); \
+ sleepq_wait(c, 0); \
+ } else { \
+ sleepq_release(c); \
+ sched_relinquish(curthread); \
+ } \
+} while (0)
+
+#define wake_up_process(x) \
+do { \
+ int wakeup_swapper; \
+ void *c; \
+ \
+ c = (x)->task_thread; \
+ sleepq_lock(c); \
+ (x)->state = TASK_RUNNING; \
+ wakeup_swapper = sleepq_signal(c, SLEEPQ_SLEEP, 0, 0); \
+ sleepq_release(c); \
+ if (wakeup_swapper) \
+ kick_proc0(); \
+} while (0)
+
+#define cond_resched() if (!cold) sched_relinquish(curthread)
+
+#define sched_yield() sched_relinquish(curthread)
+
+#endif /* _LINUX_SCHED_H_ */
diff --git a/sys/ofed/include/linux/semaphore.h b/sys/ofed/include/linux/semaphore.h
new file mode 100644
index 0000000..4b9fd56
--- /dev/null
+++ b/sys/ofed/include/linux/semaphore.h
@@ -0,0 +1,66 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_SEMAPHORE_H_
+#define _LINUX_SEMAPHORE_H_
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/sema.h>
+
+/*
+ * XXX BSD semaphores are disused and slow. They also do not provide a
+ * sema_wait_sig method. This must be resolved eventually.
+ */
+struct semaphore {
+ struct sema sema;
+};
+
+#define down(_sem) sema_wait(&(_sem)->sema)
+#define down_interruptible(_sem) sema_wait(&(_sem)->sema), 0
+#define down_trylock(_sem) !sema_trywait(&(_sem)->sema)
+#define up(_sem) sema_post(&(_sem)->sema)
+
+static inline void
+linux_sema_init(struct semaphore *sem, int val)
+{
+
+ memset(&sem->sema, 0, sizeof(sem->sema));
+ sema_init(&sem->sema, val, "lnxsema");
+}
+
+static inline void
+init_MUTEX(struct semaphore *sem)
+{
+
+ memset(&sem->sema, 0, sizeof(sem->sema));
+ sema_init(&sem->sema, 1, "lnxsema");
+}
+
+#define sema_init linux_sema_init
+
+#endif /* _LINUX_SEMAPHORE_H_ */
diff --git a/sys/ofed/include/linux/slab.h b/sys/ofed/include/linux/slab.h
new file mode 100644
index 0000000..5e7e608
--- /dev/null
+++ b/sys/ofed/include/linux/slab.h
@@ -0,0 +1,102 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_SLAB_H_
+#define _LINUX_SLAB_H_
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <vm/uma.h>
+
+#include <linux/types.h>
+#include <linux/gfp.h>
+
+MALLOC_DECLARE(M_KMALLOC);
+
+#define kmalloc(size, flags) malloc((size), M_KMALLOC, (flags))
+#define kzalloc(size, flags) kmalloc((size), (flags) | M_ZERO)
+#define kfree(ptr) free(__DECONST(void *, (ptr)), M_KMALLOC)
+#define krealloc(ptr, size, flags) realloc((ptr), (size), M_KMALLOC, (flags))
+#define kcalloc(n, size, flags) kmalloc((n) * (size), flags | M_ZERO)
+
+struct kmem_cache {
+ uma_zone_t cache_zone;
+ void (*cache_ctor)(void *);
+};
+
+#define SLAB_HWCACHE_ALIGN 0x0001
+
+static inline int
+kmem_ctor(void *mem, int size, void *arg, int flags)
+{
+ void (*ctor)(void *);
+
+ ctor = arg;
+ ctor(mem);
+
+ return (0);
+}
+
+static inline struct kmem_cache *
+kmem_cache_create(char *name, size_t size, size_t align, u_long flags,
+ void (*ctor)(void *))
+{
+ struct kmem_cache *c;
+
+ c = malloc(sizeof(*c), M_KMALLOC, M_WAITOK);
+ if (align)
+ align--;
+ if (flags & SLAB_HWCACHE_ALIGN)
+ align = UMA_ALIGN_CACHE;
+ c->cache_zone = uma_zcreate(name, size, ctor ? kmem_ctor : NULL,
+ NULL, NULL, NULL, align, 0);
+ c->cache_ctor = ctor;
+
+ return c;
+}
+
+static inline void *
+kmem_cache_alloc(struct kmem_cache *c, int flags)
+{
+ return uma_zalloc_arg(c->cache_zone, c->cache_ctor, flags);
+}
+
+static inline void
+kmem_cache_free(struct kmem_cache *c, void *m)
+{
+ uma_zfree(c->cache_zone, m);
+}
+
+static inline void
+kmem_cache_destroy(struct kmem_cache *c)
+{
+ uma_zdestroy(c->cache_zone);
+ free(c, M_KMALLOC);
+}
+
+#endif /* _LINUX_SLAB_H_ */
diff --git a/sys/ofed/include/linux/socket.h b/sys/ofed/include/linux/socket.h
new file mode 100644
index 0000000..e14c982
--- /dev/null
+++ b/sys/ofed/include/linux/socket.h
@@ -0,0 +1,66 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_SOCKET_H_
+#define _LINUX_SOCKET_H_
+
+#include <sys/socket.h>
+
+#ifdef notyet
+static inline int
+memcpy_toiovec(struct iovec *v, unsigned char *kdata, int len)
+{
+ struct uio uio;
+ int error;
+
+ uio.uio_iov = v;
+ uio.uio_iovcnt = -1;
+ uio.uio_offset = 0;
+ uio.uio_resid = len;
+ uio.uio_segflag = UIO_USERSPACE;
+ uio.uio_rw = UIO_READ;
+ error = -uiomove(kdata, len, &uio);
+ return (error);
+}
+
+static inline int
+memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len)
+{
+ struct uio uio;
+ int error;
+
+ uio.uio_iov = v;
+ uio.uio_iovcnt = -1;
+ uio.uio_offset = 0;
+ uio.uio_resid = len;
+ uio.uio_segflag = UIO_USERSPACE;
+ uio.uio_rw = UIO_WRITE;
+ error = -uiomove(kdata, len, &uio);
+}
+#endif
+
+#endif /* _LINUX_SOCKET_H_ */
diff --git a/sys/ofed/include/linux/spinlock.h b/sys/ofed/include/linux/spinlock.h
new file mode 100644
index 0000000..4b972f4
--- /dev/null
+++ b/sys/ofed/include/linux/spinlock.h
@@ -0,0 +1,68 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_SPINLOCK_H_
+#define _LINUX_SPINLOCK_H_
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/lockdep.h>
+#include <linux/rwlock.h>
+
+typedef struct {
+ struct mtx m;
+} spinlock_t;
+
+#define spin_lock(_l) mtx_lock(&(_l)->m)
+#define spin_unlock(_l) mtx_unlock(&(_l)->m)
+#define spin_trylock(_l) mtx_trylock(&(_l)->m)
+#define spin_lock_nested(_l, _n) mtx_lock_flags(&(_l)->m, MTX_DUPOK)
+#define spin_lock_irq(lock) spin_lock(lock)
+#define spin_unlock_irq(lock) spin_unlock(lock)
+#define spin_lock_irqsave(lock, flags) \
+ do {(flags) = 0; spin_lock(lock); } while (0)
+#define spin_unlock_irqrestore(lock, flags) \
+ do { spin_unlock(lock); } while (0)
+
+static inline void
+spin_lock_init(spinlock_t *lock)
+{
+
+ memset(&lock->m, 0, sizeof(lock->m));
+ mtx_init(&lock->m, "lnxspin", NULL, MTX_DEF | MTX_NOWITNESS);
+}
+
+#define DEFINE_SPINLOCK(lock) \
+ spinlock_t lock; \
+ MTX_SYSINIT(lock, &(lock).m, "lnxspin", MTX_DEF)
+
+#endif /* _LINUX_SPINLOCK_H_ */
diff --git a/sys/ofed/include/linux/stddef.h b/sys/ofed/include/linux/stddef.h
new file mode 100644
index 0000000..22bf938
--- /dev/null
+++ b/sys/ofed/include/linux/stddef.h
@@ -0,0 +1,34 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_STDDEF_H_
+#define _LINUX_STDDEF_H_
+
+#include <sys/stddef.h>
+
+#endif /* _LINUX_STDDEF_H_ */
diff --git a/sys/ofed/include/linux/string.h b/sys/ofed/include/linux/string.h
new file mode 100644
index 0000000..b14a5c6
--- /dev/null
+++ b/sys/ofed/include/linux/string.h
@@ -0,0 +1,49 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_STRING_H_
+#define _LINUX_STRING_H_
+
+#include <linux/types.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+
+#include <sys/libkern.h>
+
+static inline void *
+kmemdup(const void *src, size_t len, gfp_t gfp)
+{
+ void *dst;
+
+ dst = kmalloc(len, gfp);
+ if (dst)
+ memcpy(dst, src, len);
+ return (dst);
+}
+
+#endif /* _LINUX_STRING_H_ */
diff --git a/sys/ofed/include/linux/sysfs.h b/sys/ofed/include/linux/sysfs.h
new file mode 100644
index 0000000..698f75e
--- /dev/null
+++ b/sys/ofed/include/linux/sysfs.h
@@ -0,0 +1,182 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_SYSFS_H_
+#define _LINUX_SYSFS_H_
+
+#include <sys/sysctl.h>
+
+struct attribute {
+ const char *name;
+ struct module *owner;
+ mode_t mode;
+};
+
+struct sysfs_ops {
+ ssize_t (*show)(struct kobject *, struct attribute *, char *);
+ ssize_t (*store)(struct kobject *, struct attribute *, const char *,
+ size_t);
+};
+
+struct attribute_group {
+ const char *name;
+ mode_t (*is_visible)(struct kobject *,
+ struct attribute *, int);
+ struct attribute **attrs;
+};
+
+#define __ATTR(_name, _mode, _show, _store) { \
+ .attr = { .name = __stringify(_name), .mode = _mode }, \
+ .show = _show, .store = _store, \
+}
+
+#define __ATTR_RO(_name) { \
+ .attr = { .name = __stringify(_name), .mode = 0444 }, \
+ .show = _name##_show, \
+}
+
+#define __ATTR_NULL { .attr = { .name = NULL } }
+
+/*
+ * Handle our generic '\0' terminated 'C' string.
+ * Two cases:
+ * a variable string: point arg1 at it, arg2 is max length.
+ * a constant string: point arg1 at it, arg2 is zero.
+ */
+
+static inline int
+sysctl_handle_attr(SYSCTL_HANDLER_ARGS)
+{
+ struct kobject *kobj;
+ struct attribute *attr;
+ const struct sysfs_ops *ops;
+ void *buf;
+ int error;
+ ssize_t len;
+
+ kobj = arg1;
+ attr = (struct attribute *)arg2;
+ buf = (void *)get_zeroed_page(GFP_KERNEL);
+ len = 1; /* Copy out a NULL byte at least. */
+ if (kobj->ktype == NULL || kobj->ktype->sysfs_ops == NULL)
+ return (ENODEV);
+ ops = kobj->ktype->sysfs_ops;
+ if (buf == NULL)
+ return (ENOMEM);
+ if (ops->show) {
+ len = ops->show(kobj, attr, buf);
+ /*
+ * It's valid not to have a 'show' so we just return 1 byte
+ * of NULL.
+ */
+ if (len < 0) {
+ error = -len;
+ len = 1;
+ if (error != EIO)
+ goto out;
+ }
+ }
+ error = SYSCTL_OUT(req, buf, len);
+ if (error || !req->newptr || ops->store == NULL)
+ goto out;
+ error = SYSCTL_IN(req, buf, PAGE_SIZE);
+ if (error)
+ goto out;
+ len = ops->store(kobj, attr, buf, req->newlen);
+ if (len < 0)
+ error = -len;
+out:
+ free_page((unsigned long)buf);
+
+ return (error);
+}
+
+static inline int
+sysfs_create_file(struct kobject *kobj, const struct attribute *attr)
+{
+
+ sysctl_add_oid(NULL, SYSCTL_CHILDREN(kobj->oidp), OID_AUTO,
+ attr->name, CTLTYPE_STRING|CTLFLAG_RW|CTLFLAG_MPSAFE, kobj,
+ (uintptr_t)attr, sysctl_handle_attr, "A", "");
+
+ return (0);
+}
+
+static inline void
+sysfs_remove_file(struct kobject *kobj, const struct attribute *attr)
+{
+
+ if (kobj->oidp)
+ sysctl_remove_name(kobj->oidp, attr->name, 1, 1);
+}
+
+static inline void
+sysfs_remove_group(struct kobject *kobj, const struct attribute_group *grp)
+{
+
+ if (kobj->oidp)
+ sysctl_remove_name(kobj->oidp, grp->name, 1, 1);
+}
+
+static inline int
+sysfs_create_group(struct kobject *kobj, const struct attribute_group *grp)
+{
+ struct attribute **attr;
+ struct sysctl_oid *oidp;
+
+ oidp = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(kobj->oidp),
+ OID_AUTO, grp->name, CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, grp->name);
+ for (attr = grp->attrs; *attr != NULL; attr++) {
+ sysctl_add_oid(NULL, SYSCTL_CHILDREN(oidp), OID_AUTO,
+ (*attr)->name, CTLTYPE_STRING|CTLFLAG_RW|CTLFLAG_MPSAFE,
+ kobj, (uintptr_t)*attr, sysctl_handle_attr, "A", "");
+ }
+
+ return (0);
+}
+
+static inline int
+sysfs_create_dir(struct kobject *kobj)
+{
+
+ kobj->oidp = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(kobj->parent->oidp),
+ OID_AUTO, kobj->name, CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, kobj->name);
+
+ return (0);
+}
+
+static inline void
+sysfs_remove_dir(struct kobject *kobj)
+{
+
+ if (kobj->oidp == NULL)
+ return;
+ sysctl_remove_oid(kobj->oidp, 1, 1);
+}
+
+#endif /* _LINUX_SYSFS_H_ */
diff --git a/sys/ofed/include/linux/timer.h b/sys/ofed/include/linux/timer.h
new file mode 100644
index 0000000..ed4ed4a
--- /dev/null
+++ b/sys/ofed/include/linux/timer.h
@@ -0,0 +1,87 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_TIMER_H_
+#define _LINUX_TIMER_H_
+
+#include <linux/types.h>
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/callout.h>
+
+struct timer_list {
+ struct callout timer_callout;
+ void (*function)(unsigned long);
+ unsigned long data;
+};
+
+#define expires timer_callout.c_time
+
+static inline void
+_timer_fn(void *context)
+{
+ struct timer_list *timer;
+
+ timer = context;
+ timer->function(timer->data);
+}
+
+#define setup_timer(timer, func, dat) \
+do { \
+ (timer)->function = (func); \
+ (timer)->data = (dat); \
+ callout_init(&(timer)->timer_callout, CALLOUT_MPSAFE); \
+} while (0)
+
+#define init_timer(timer) \
+do { \
+ (timer)->function = NULL; \
+ (timer)->data = 0; \
+ callout_init(&(timer)->timer_callout, CALLOUT_MPSAFE); \
+} while (0)
+
+#define mod_timer(timer, expire) \
+ callout_reset(&(timer)->timer_callout, (expire) - jiffies, \
+ _timer_fn, (timer))
+
+#define add_timer(timer) \
+ callout_reset(&(timer)->timer_callout, \
+ (timer)->timer_callout.c_time - jiffies, _timer_fn, (timer))
+
+#define del_timer(timer) callout_stop(&(timer)->timer_callout)
+#define del_timer_sync(timer) callout_drain(&(timer)->timer_callout)
+
+#define timer_pending(timer) callout_pending(&(timer)->timer_callout)
+
+static inline unsigned long
+round_jiffies(unsigned long j)
+{
+ return roundup(j, hz);
+}
+
+#endif /* _LINUX_TIMER_H_ */
diff --git a/sys/ofed/include/linux/types.h b/sys/ofed/include/linux/types.h
new file mode 100644
index 0000000..496d6f9
--- /dev/null
+++ b/sys/ofed/include/linux/types.h
@@ -0,0 +1,55 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_TYPES_H_
+#define _LINUX_TYPES_H_
+
+#include <sys/cdefs.h>
+#include <sys/types.h>
+#include <linux/compiler.h>
+#include <asm/types.h>
+
+typedef __u16 __le16;
+typedef __u16 __be16;
+typedef __u32 __le32;
+typedef __u32 __be32;
+typedef __u64 __le64;
+typedef __u64 __be64;
+typedef _Bool bool;
+#define true TRUE
+#define false FALSE
+
+typedef unsigned long kernel_ulong_t;
+typedef unsigned int uint;
+typedef unsigned gfp_t;
+typedef uint64_t loff_t;
+typedef vm_paddr_t resource_size_t;
+
+#define DECLARE_BITMAP(n, bits) \
+ unsigned long n[howmany(bits, sizeof(long) * 8)]
+
+#endif /* _LINUX_TYPES_H_ */
diff --git a/sys/ofed/include/linux/uaccess.h b/sys/ofed/include/linux/uaccess.h
new file mode 100644
index 0000000..9015b1e
--- /dev/null
+++ b/sys/ofed/include/linux/uaccess.h
@@ -0,0 +1,34 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_UACCESS_H_
+#define _LINUX_UACCESS_H_
+
+#define get_user(_x, _p) -copyin((_p), &(_x), sizeof(*(_p)))
+#define put_user(_x, _p) -copyout(&(_x), (_p), sizeof(*(_p)))
+
+#endif /* _LINUX_UACCESS_H_ */
diff --git a/sys/ofed/include/linux/vmalloc.h b/sys/ofed/include/linux/vmalloc.h
new file mode 100644
index 0000000..4a94a5c
--- /dev/null
+++ b/sys/ofed/include/linux/vmalloc.h
@@ -0,0 +1,41 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_VMALLOC_H_
+#define _LINUX_VMALLOC_H_
+
+#include <asm/page.h>
+
+#define VM_MAP 0x0000
+#define PAGE_KERNEL 0x0000
+
+void *vmap(struct page **pages, unsigned int count, unsigned long flags,
+ int prot);
+void vunmap(void *addr);
+
+#endif /* _LINUX_VMALLOC_H_ */
diff --git a/sys/ofed/include/linux/wait.h b/sys/ofed/include/linux/wait.h
new file mode 100644
index 0000000..b02014e
--- /dev/null
+++ b/sys/ofed/include/linux/wait.h
@@ -0,0 +1,112 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_WAIT_H_
+#define _LINUX_WAIT_H_
+
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sleepqueue.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+
+struct __wait_queue_head {
+ unsigned int wchan;
+};
+typedef struct __wait_queue_head wait_queue_head_t;
+
+#define init_waitqueue_head(x)
+
+static inline void
+__wake_up(struct __wait_queue_head *q, int all)
+{
+ int wakeup_swapper;
+ void *c;
+
+ c = &q->wchan;
+ sleepq_lock(c);
+ if (all)
+ wakeup_swapper = sleepq_broadcast(c, SLEEPQ_SLEEP, 0, 0);
+ else
+ wakeup_swapper = sleepq_signal(c, SLEEPQ_SLEEP, 0, 0);
+ sleepq_release(c);
+ if (wakeup_swapper)
+ kick_proc0();
+}
+
+#define wake_up(q) __wake_up(q, 0)
+#define wake_up_nr(q, nr) __wake_up(q, 1)
+#define wake_up_all(q) __wake_up(q, 1)
+#define wake_up_interruptible(q) __wake_up(q, 0)
+#define wake_up_interruptible_nr(q, nr) __wake_up(q, 1)
+#define wake_up_interruptible_all(q, nr) __wake_up(q, 1)
+
+#define wait_event(q, cond) \
+do { \
+ void *c = &(q).wchan; \
+ if (!(cond)) { \
+ for (;;) { \
+ sleepq_lock(c); \
+ if (cond) { \
+ sleepq_release(c); \
+ break; \
+ } \
+ sleepq_add(c, NULL, "completion", SLEEPQ_SLEEP, 0); \
+ sleepq_wait(c, 0); \
+ } \
+ } \
+} while (0)
+
+#define wait_event_interruptible(q, cond) \
+({ \
+ void *c = &(q).wchan; \
+ int _error; \
+ \
+ _error = 0; \
+ if (!(cond)) { \
+ for (; _error == 0;) { \
+ sleepq_lock(c); \
+ if (cond) { \
+ sleepq_release(c); \
+ break; \
+ } \
+ sleepq_add(c, NULL, "completion", \
+ SLEEPQ_SLEEP | SLEEPQ_INTERRUPTIBLE, 0); \
+ if (sleepq_wait_sig(c, 0)) \
+ _error = -ERESTARTSYS; \
+ } \
+ } \
+ -_error; \
+})
+
+#define DEFINE_WAIT(x)
+
+#endif /* _LINUX_WAIT_H_ */
diff --git a/sys/ofed/include/linux/workqueue.h b/sys/ofed/include/linux/workqueue.h
new file mode 100644
index 0000000..6b48f9c
--- /dev/null
+++ b/sys/ofed/include/linux/workqueue.h
@@ -0,0 +1,191 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LINUX_WORKQUEUE_H_
+#define _LINUX_WORKQUEUE_H_
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/slab.h>
+
+#include <sys/taskqueue.h>
+
+struct workqueue_struct {
+ struct taskqueue *taskqueue;
+};
+
+struct work_struct {
+ struct task work_task;
+ struct taskqueue *taskqueue;
+ void (*fn)(struct work_struct *);
+};
+
+struct delayed_work {
+ struct work_struct work;
+ struct callout timer;
+};
+
+static inline struct delayed_work *
+to_delayed_work(struct work_struct *work)
+{
+
+ return container_of(work, struct delayed_work, work);
+}
+
+
+static inline void
+_work_fn(void *context, int pending)
+{
+ struct work_struct *work;
+
+ work = context;
+ work->fn(work);
+}
+
+#define INIT_WORK(work, func) \
+do { \
+ (work)->fn = (func); \
+ (work)->taskqueue = NULL; \
+ TASK_INIT(&(work)->work_task, 0, _work_fn, (work)); \
+} while (0)
+
+#define INIT_DELAYED_WORK(_work, func) \
+do { \
+ INIT_WORK(&(_work)->work, func); \
+ callout_init(&(_work)->timer, CALLOUT_MPSAFE); \
+} while (0)
+
+#define INIT_DELAYED_WORK_DEFERRABLE INIT_DELAYED_WORK
+
+#define schedule_work(work) \
+do { \
+ (work)->taskqueue = taskqueue_thread; \
+ taskqueue_enqueue(taskqueue_thread, &(work)->work_task); \
+} while (0)
+
+#define flush_scheduled_work() flush_taskqueue(taskqueue_thread)
+
+#define queue_work(q, work) \
+do { \
+ (work)->taskqueue = (q)->taskqueue; \
+ taskqueue_enqueue((q)->taskqueue, &(work)->work_task); \
+} while (0)
+
+static inline void
+_delayed_work_fn(void *arg)
+{
+ struct delayed_work *work;
+
+ work = arg;
+ taskqueue_enqueue(work->work.taskqueue, &work->work.work_task);
+}
+
+static inline int
+queue_delayed_work(struct workqueue_struct *wq, struct delayed_work *work,
+ unsigned long delay)
+{
+ int pending;
+
+ pending = work->work.work_task.ta_pending;
+ work->work.taskqueue = wq->taskqueue;
+ if (delay != 0)
+ callout_reset(&work->timer, delay, _delayed_work_fn, work);
+ else
+ _delayed_work_fn((void *)work);
+
+ return (!pending);
+}
+
+static inline struct workqueue_struct *
+_create_workqueue_common(char *name, int cpus)
+{
+ struct workqueue_struct *wq;
+
+ wq = kmalloc(sizeof(*wq), M_WAITOK);
+ wq->taskqueue = taskqueue_create((name), M_WAITOK,
+ taskqueue_thread_enqueue, &wq->taskqueue);
+ taskqueue_start_threads(&wq->taskqueue, cpus, PWAIT, (name));
+
+ return (wq);
+}
+
+
+#define create_singlethread_workqueue(name) \
+ _create_workqueue_common(name, 1)
+
+#define create_workqueue(name) \
+ _create_workqueue_common(name, MAXCPU)
+
+static inline void
+destroy_workqueue(struct workqueue_struct *wq)
+{
+ taskqueue_free(wq->taskqueue);
+ kfree(wq);
+}
+
+#define flush_workqueue(wq) flush_taskqueue((wq)->taskqueue)
+
+static inline void
+_flush_fn(void *context, int pending)
+{
+}
+
+static inline void
+flush_taskqueue(struct taskqueue *tq)
+{
+ struct task flushtask;
+
+ TASK_INIT(&flushtask, 0, _flush_fn, NULL);
+ taskqueue_enqueue(tq, &flushtask);
+ taskqueue_drain(tq, &flushtask);
+}
+
+static inline int
+cancel_work_sync(struct work_struct *work)
+{
+ if (work->taskqueue &&
+ taskqueue_cancel(work->taskqueue, &work->work_task, NULL))
+ taskqueue_drain(work->taskqueue, &work->work_task);
+ return 0;
+}
+
+/*
+ * This may leave work running on another CPU as it does on Linux.
+ */
+static inline int
+cancel_delayed_work(struct delayed_work *work)
+{
+
+ callout_stop(&work->timer);
+ if (work->work.taskqueue &&
+ taskqueue_cancel(work->work.taskqueue, &work->work.work_task, NULL))
+ taskqueue_drain(work->work.taskqueue, &work->work.work_task);
+ return 0;
+}
+
+#endif /* _LINUX_WORKQUEUE_H_ */
diff --git a/sys/ofed/include/net/addrconf.h b/sys/ofed/include/net/addrconf.h
new file mode 100644
index 0000000..e5d814e
--- /dev/null
+++ b/sys/ofed/include/net/addrconf.h
@@ -0,0 +1,27 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
diff --git a/sys/ofed/include/net/arp.h b/sys/ofed/include/net/arp.h
new file mode 100644
index 0000000..e5d814e
--- /dev/null
+++ b/sys/ofed/include/net/arp.h
@@ -0,0 +1,27 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
diff --git a/sys/ofed/include/net/ip.h b/sys/ofed/include/net/ip.h
new file mode 100644
index 0000000..8b29d62
--- /dev/null
+++ b/sys/ofed/include/net/ip.h
@@ -0,0 +1,77 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_NET_IP_H_
+#define _LINUX_NET_IP_H_
+
+#include <sys/types.h>
+#include <sys/socket.h>
+
+#include <net/if_types.h>
+#include <net/if.h>
+#include <net/if_var.h>
+
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+
+static inline void inet_get_local_port_range(int *low, int *high)
+{
+ *low = V_ipport_firstauto;
+ *high = V_ipport_lastauto;
+}
+
+static inline void
+ip_ib_mc_map(uint32_t addr, const unsigned char *bcast, char *buf)
+{
+ unsigned char scope;
+
+ addr = ntohl(addr);
+ scope = bcast[5] & 0xF;
+ buf[0] = 0;
+ buf[1] = 0xff;
+ buf[2] = 0xff;
+ buf[3] = 0xff;
+ buf[4] = 0xff;
+ buf[5] = 0x10 | scope;
+ buf[6] = 0x40;
+ buf[7] = 0x1b;
+ buf[8] = bcast[8];
+ buf[9] = bcast[9];
+ buf[10] = 0;
+ buf[11] = 0;
+ buf[12] = 0;
+ buf[13] = 0;
+ buf[14] = 0;
+ buf[15] = 0;
+ buf[16] = (addr >> 24) & 0x0f;
+ buf[17] = (addr >> 16) & 0xff;
+ buf[18] = (addr >> 8) & 0xff;
+ buf[19] = addr & 0xff;
+}
+
+#endif /* _LINUX_NET_IP_H_ */
diff --git a/sys/ofed/include/net/ip6_route.h b/sys/ofed/include/net/ip6_route.h
new file mode 100644
index 0000000..e5d814e
--- /dev/null
+++ b/sys/ofed/include/net/ip6_route.h
@@ -0,0 +1,27 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
diff --git a/sys/ofed/include/net/ipv6.h b/sys/ofed/include/net/ipv6.h
new file mode 100644
index 0000000..6f02555
--- /dev/null
+++ b/sys/ofed/include/net/ipv6.h
@@ -0,0 +1,62 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_NET_IPV6_H_
+#define _LINUX_NET_IPV6_H_
+
+#ifndef KLD_MODULE
+#include "opt_inet6.h"
+#endif
+
+#define ipv6_addr_loopback IN6_IS_ADDR_LOOPBACK
+#define ipv6_addr_copy(dst, src) \
+ memcpy((dst), (src), sizeof(struct in6_addr))
+
+#ifdef INET6
+static inline void
+ipv6_ib_mc_map(const struct in6_addr *addr, const unsigned char *broadcast,
+ char *buf)
+{
+ unsigned char scope;
+
+ scope = broadcast[5] & 0xF;
+ buf[0] = 0;
+ buf[1] = 0xff;
+ buf[2] = 0xff;
+ buf[3] = 0xff;
+ buf[4] = 0xff;
+ buf[5] = 0x10 | scope;
+ buf[6] = 0x60;
+ buf[7] = 0x1b;
+ buf[8] = broadcast[8];
+ buf[9] = broadcast[9];
+ memcpy(&buf[10], &addr->s6_addr[6], 10);
+}
+#endif
+
+#endif /* _LINUX_NET_IPV6_H_ */
diff --git a/sys/ofed/include/net/neighbour.h b/sys/ofed/include/net/neighbour.h
new file mode 100644
index 0000000..e5d814e
--- /dev/null
+++ b/sys/ofed/include/net/neighbour.h
@@ -0,0 +1,27 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
diff --git a/sys/ofed/include/net/netevent.h b/sys/ofed/include/net/netevent.h
new file mode 100644
index 0000000..db5b50e
--- /dev/null
+++ b/sys/ofed/include/net/netevent.h
@@ -0,0 +1,71 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_NET_NETEVENT_H_
+#define _LINUX_NET_NETEVENT_H_
+
+#include <netinet/if_ether.h>
+
+enum netevent_notif_type {
+ NETEVENT_NEIGH_UPDATE = 0,
+#if 0 /* Unsupported events. */
+ NETEVENT_PMTU_UPDATE,
+ NETEVENT_REDIRECT,
+#endif
+};
+
+struct llentry;
+
+static inline void
+_handle_arp_update_event(void *arg, struct llentry *lle)
+{
+ struct notifier_block *nb;
+
+ nb = arg;
+ nb->notifier_call(nb, NETEVENT_NEIGH_UPDATE, lle);
+}
+
+static inline int
+register_netevent_notifier(struct notifier_block *nb)
+{
+ nb->tags[NETEVENT_NEIGH_UPDATE] = EVENTHANDLER_REGISTER(
+ arp_update_event, _handle_arp_update_event, nb, 0);
+ return (0);
+}
+
+static inline int
+unregister_netevent_notifier(struct notifier_block *nb)
+{
+
+ EVENTHANDLER_DEREGISTER(arp_update_event,
+ nb->tags[NETEVENT_NEIGH_UPDATE]);
+
+ return (0);
+}
+
+#endif /* _LINUX_NET_NETEVENT_H_ */
diff --git a/sys/ofed/include/net/tcp.h b/sys/ofed/include/net/tcp.h
new file mode 100644
index 0000000..75da3f8
--- /dev/null
+++ b/sys/ofed/include/net/tcp.h
@@ -0,0 +1,38 @@
+/*-
+ * Copyright (c) 2010 Isilon Systems, Inc.
+ * Copyright (c) 2010 iX Systems, Inc.
+ * Copyright (c) 2010 Panasas, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_NET_TCP_H_
+#define _LINUX_NET_TCP_H_
+
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/kref.h>
+
+#include <net/ip.h>
+
+#endif /* _LINUX_NET_TCP_H_ */
diff --git a/sys/ofed/include/rdma/Kbuild b/sys/ofed/include/rdma/Kbuild
new file mode 100644
index 0000000..e7c0432
--- /dev/null
+++ b/sys/ofed/include/rdma/Kbuild
@@ -0,0 +1 @@
+header-y += ib_user_mad.h
diff --git a/sys/ofed/include/rdma/ib_addr.h b/sys/ofed/include/rdma/ib_addr.h
new file mode 100644
index 0000000..61b0a7c
--- /dev/null
+++ b/sys/ofed/include/rdma/ib_addr.h
@@ -0,0 +1,312 @@
+/*
+ * Copyright (c) 2005 Voltaire Inc. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(IB_ADDR_H)
+#define IB_ADDR_H
+
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/if_arp.h>
+#include <linux/netdevice.h>
+#include <linux/socket.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_pack.h>
+#include <linux/ethtool.h>
+#include <linux/if_vlan.h>
+
+struct rdma_addr_client {
+ atomic_t refcount;
+ struct completion comp;
+};
+
+/**
+ * rdma_addr_register_client - Register an address client.
+ */
+void rdma_addr_register_client(struct rdma_addr_client *client);
+
+/**
+ * rdma_addr_unregister_client - Deregister an address client.
+ * @client: Client object to deregister.
+ */
+void rdma_addr_unregister_client(struct rdma_addr_client *client);
+
+struct rdma_dev_addr {
+ unsigned char src_dev_addr[MAX_ADDR_LEN];
+ unsigned char dst_dev_addr[MAX_ADDR_LEN];
+ unsigned char broadcast[MAX_ADDR_LEN];
+ unsigned short dev_type;
+ int bound_dev_if;
+ enum rdma_transport_type transport;
+};
+
+/**
+ * rdma_translate_ip - Translate a local IP address to an RDMA hardware
+ * address.
+ */
+int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr);
+
+/**
+ * rdma_resolve_ip - Resolve source and destination IP addresses to
+ * RDMA hardware addresses.
+ * @client: Address client associated with request.
+ * @src_addr: An optional source address to use in the resolution. If a
+ * source address is not provided, a usable address will be returned via
+ * the callback.
+ * @dst_addr: The destination address to resolve.
+ * @addr: A reference to a data location that will receive the resolved
+ * addresses. The data location must remain valid until the callback has
+ * been invoked.
+ * @timeout_ms: Amount of time to wait for the address resolution to complete.
+ * @callback: Call invoked once address resolution has completed, timed out,
+ * or been canceled. A status of 0 indicates success.
+ * @context: User-specified context associated with the call.
+ */
+int rdma_resolve_ip(struct rdma_addr_client *client,
+ struct sockaddr *src_addr, struct sockaddr *dst_addr,
+ struct rdma_dev_addr *addr, int timeout_ms,
+ void (*callback)(int status, struct sockaddr *src_addr,
+ struct rdma_dev_addr *addr, void *context),
+ void *context);
+
+void rdma_addr_cancel(struct rdma_dev_addr *addr);
+
+int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev,
+ const unsigned char *dst_dev_addr);
+
+static inline int ip_addr_size(struct sockaddr *addr)
+{
+ return addr->sa_family == AF_INET6 ?
+ sizeof(struct sockaddr_in6) : sizeof(struct sockaddr_in);
+}
+
+static inline u16 ib_addr_get_pkey(struct rdma_dev_addr *dev_addr)
+{
+ return ((u16)dev_addr->broadcast[8] << 8) | (u16)dev_addr->broadcast[9];
+}
+
+static inline void ib_addr_set_pkey(struct rdma_dev_addr *dev_addr, u16 pkey)
+{
+ dev_addr->broadcast[8] = pkey >> 8;
+ dev_addr->broadcast[9] = (unsigned char) pkey;
+}
+
+static inline void ib_addr_get_mgid(struct rdma_dev_addr *dev_addr,
+ union ib_gid *gid)
+{
+ memcpy(gid, dev_addr->broadcast + 4, sizeof *gid);
+}
+
+static inline int rdma_addr_gid_offset(struct rdma_dev_addr *dev_addr)
+{
+ return dev_addr->dev_type == ARPHRD_INFINIBAND ? 4 : 0;
+}
+
+static inline void iboe_mac_vlan_to_ll(union ib_gid *gid, u8 *mac, u16 vid)
+{
+ memset(gid->raw, 0, 16);
+ *((u32 *)gid->raw) = cpu_to_be32(0xfe800000);
+ if (vid < 0x1000) {
+ gid->raw[12] = vid & 0xff;
+ gid->raw[11] = vid >> 8;
+ } else {
+ gid->raw[12] = 0xfe;
+ gid->raw[11] = 0xff;
+ }
+
+ memcpy(gid->raw + 13, mac + 3, 3);
+ memcpy(gid->raw + 8, mac, 3);
+ gid->raw[8] ^= 2;
+}
+
+static inline u16 rdma_vlan_dev_vlan_id(const struct net_device *dev)
+{
+#ifdef __linux__
+ return dev->priv_flags & IFF_802_1Q_VLAN ?
+ vlan_dev_vlan_id(dev) : 0xffff;
+#else
+ uint16_t tag;
+
+ if (VLAN_TAG(__DECONST(struct ifnet *, dev), &tag) != 0)
+ return 0xffff;
+ return tag;
+#endif
+}
+
+static inline void iboe_addr_get_sgid(struct rdma_dev_addr *dev_addr,
+ union ib_gid *gid)
+{
+ struct net_device *dev;
+ u16 vid = 0xffff;
+
+ dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if);
+ if (dev) {
+ vid = rdma_vlan_dev_vlan_id(dev);
+ dev_put(dev);
+ }
+
+ iboe_mac_vlan_to_ll(gid, dev_addr->src_dev_addr, vid);
+}
+
+static inline void rdma_addr_get_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid)
+{
+ if (dev_addr->transport == RDMA_TRANSPORT_IB &&
+ dev_addr->dev_type != ARPHRD_INFINIBAND)
+ iboe_addr_get_sgid(dev_addr, gid);
+ else
+ memcpy(gid, dev_addr->src_dev_addr +
+ rdma_addr_gid_offset(dev_addr), sizeof *gid);
+}
+
+static inline void rdma_addr_set_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid)
+{
+ memcpy(dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), gid, sizeof *gid);
+}
+
+static inline void rdma_addr_get_dgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid)
+{
+ memcpy(gid, dev_addr->dst_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof *gid);
+}
+
+static inline void rdma_addr_set_dgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid)
+{
+ memcpy(dev_addr->dst_dev_addr + rdma_addr_gid_offset(dev_addr), gid, sizeof *gid);
+}
+
+static inline enum ib_mtu iboe_get_mtu(int mtu)
+{
+ /*
+ * reduce IB headers from effective IBoE MTU. 28 stands for
+ * atomic header which is the biggest possible header after BTH
+ */
+ mtu = mtu - IB_GRH_BYTES - IB_BTH_BYTES - 28;
+
+ if (mtu >= ib_mtu_enum_to_int(IB_MTU_4096))
+ return IB_MTU_4096;
+ else if (mtu >= ib_mtu_enum_to_int(IB_MTU_2048))
+ return IB_MTU_2048;
+ else if (mtu >= ib_mtu_enum_to_int(IB_MTU_1024))
+ return IB_MTU_1024;
+ else if (mtu >= ib_mtu_enum_to_int(IB_MTU_512))
+ return IB_MTU_512;
+ else if (mtu >= ib_mtu_enum_to_int(IB_MTU_256))
+ return IB_MTU_256;
+ else
+ return 0;
+}
+
+#ifdef __linux__
+static inline int iboe_get_rate(struct net_device *dev)
+{
+ struct ethtool_cmd cmd;
+
+ if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings ||
+ dev->ethtool_ops->get_settings(dev, &cmd))
+ return IB_RATE_PORT_CURRENT;
+
+ if (cmd.speed >= 40000)
+ return IB_RATE_40_GBPS;
+ else if (cmd.speed >= 30000)
+ return IB_RATE_30_GBPS;
+ else if (cmd.speed >= 20000)
+ return IB_RATE_20_GBPS;
+ else if (cmd.speed >= 10000)
+ return IB_RATE_10_GBPS;
+ else
+ return IB_RATE_PORT_CURRENT;
+}
+#else
+static inline int iboe_get_rate(struct net_device *dev)
+{
+ if (dev->if_baudrate >= IF_Gbps(40ULL))
+ return IB_RATE_40_GBPS;
+ else if (dev->if_baudrate >= IF_Gbps(30ULL))
+ return IB_RATE_30_GBPS;
+ else if (dev->if_baudrate >= IF_Gbps(20ULL))
+ return IB_RATE_20_GBPS;
+ else if (dev->if_baudrate >= IF_Gbps(10ULL))
+ return IB_RATE_10_GBPS;
+ else
+ return IB_RATE_PORT_CURRENT;
+}
+#endif
+
+static inline int rdma_link_local_addr(struct in6_addr *addr)
+{
+ if (addr->s6_addr32[0] == cpu_to_be32(0xfe800000) &&
+ addr->s6_addr32[1] == 0)
+ return 1;
+
+ return 0;
+}
+
+static inline void rdma_get_ll_mac(struct in6_addr *addr, u8 *mac)
+{
+ memcpy(mac, &addr->s6_addr[8], 3);
+ memcpy(mac + 3, &addr->s6_addr[13], 3);
+ mac[0] ^= 2;
+}
+
+static inline int rdma_is_multicast_addr(struct in6_addr *addr)
+{
+ return addr->s6_addr[0] == 0xff;
+}
+
+static inline void rdma_get_mcast_mac(struct in6_addr *addr, u8 *mac)
+{
+ int i;
+
+ mac[0] = 0x33;
+ mac[1] = 0x33;
+ for (i = 2; i < 6; ++i)
+ mac[i] = addr->s6_addr[i + 10];
+}
+
+static inline u16 rdma_get_vlan_id(union ib_gid *dgid)
+{
+ u16 vid;
+
+ vid = dgid->raw[11] << 8 | dgid->raw[12];
+ return vid < 0x1000 ? vid : 0xffff;
+}
+
+static inline struct net_device *rdma_vlan_dev_real_dev(const struct net_device *dev)
+{
+#ifdef __linux__
+ return dev->priv_flags & IFF_802_1Q_VLAN ?
+ vlan_dev_real_dev(dev) : 0;
+#else
+ return VLAN_TRUNKDEV(__DECONST(struct ifnet *, dev));
+#endif
+}
+
+#endif /* IB_ADDR_H */
diff --git a/sys/ofed/include/rdma/ib_cache.h b/sys/ofed/include/rdma/ib_cache.h
new file mode 100644
index 0000000..00a2b8e
--- /dev/null
+++ b/sys/ofed/include/rdma/ib_cache.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IB_CACHE_H
+#define _IB_CACHE_H
+
+#include <rdma/ib_verbs.h>
+
+/**
+ * ib_get_cached_gid - Returns a cached GID table entry
+ * @device: The device to query.
+ * @port_num: The port number of the device to query.
+ * @index: The index into the cached GID table to query.
+ * @gid: The GID value found at the specified index.
+ *
+ * ib_get_cached_gid() fetches the specified GID table entry stored in
+ * the local software cache.
+ */
+int ib_get_cached_gid(struct ib_device *device,
+ u8 port_num,
+ int index,
+ union ib_gid *gid);
+
+/**
+ * ib_find_cached_gid - Returns the port number and GID table index where
+ * a specified GID value occurs.
+ * @device: The device to query.
+ * @gid: The GID value to search for.
+ * @port_num: The port number of the device where the GID value was found.
+ * @index: The index into the cached GID table where the GID was found. This
+ * parameter may be NULL.
+ *
+ * ib_find_cached_gid() searches for the specified GID value in
+ * the local software cache.
+ */
+int ib_find_cached_gid(struct ib_device *device,
+ union ib_gid *gid,
+ u8 *port_num,
+ u16 *index);
+
+/**
+ * ib_get_cached_pkey - Returns a cached PKey table entry
+ * @device: The device to query.
+ * @port_num: The port number of the device to query.
+ * @index: The index into the cached PKey table to query.
+ * @pkey: The PKey value found at the specified index.
+ *
+ * ib_get_cached_pkey() fetches the specified PKey table entry stored in
+ * the local software cache.
+ */
+int ib_get_cached_pkey(struct ib_device *device_handle,
+ u8 port_num,
+ int index,
+ u16 *pkey);
+
+/**
+ * ib_find_cached_pkey - Returns the PKey table index where a specified
+ * PKey value occurs.
+ * @device: The device to query.
+ * @port_num: The port number of the device to search for the PKey.
+ * @pkey: The PKey value to search for.
+ * @index: The index into the cached PKey table where the PKey was found.
+ *
+ * ib_find_cached_pkey() searches the specified PKey table in
+ * the local software cache.
+ */
+int ib_find_cached_pkey(struct ib_device *device,
+ u8 port_num,
+ u16 pkey,
+ u16 *index);
+
+/**
+ * ib_get_cached_lmc - Returns a cached lmc table entry
+ * @device: The device to query.
+ * @port_num: The port number of the device to query.
+ * @lmc: The lmc value for the specified port for that device.
+ *
+ * ib_get_cached_lmc() fetches the specified lmc table entry stored in
+ * the local software cache.
+ */
+int ib_get_cached_lmc(struct ib_device *device,
+ u8 port_num,
+ u8 *lmc);
+
+#endif /* _IB_CACHE_H */
diff --git a/sys/ofed/include/rdma/ib_cm.h b/sys/ofed/include/rdma/ib_cm.h
new file mode 100644
index 0000000..9388583
--- /dev/null
+++ b/sys/ofed/include/rdma/ib_cm.h
@@ -0,0 +1,589 @@
+/*
+ * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if !defined(IB_CM_H)
+#define IB_CM_H
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_sa.h>
+
+enum ib_cm_state {
+ IB_CM_IDLE,
+ IB_CM_LISTEN,
+ IB_CM_REQ_SENT,
+ IB_CM_REQ_RCVD,
+ IB_CM_MRA_REQ_SENT,
+ IB_CM_MRA_REQ_RCVD,
+ IB_CM_REP_SENT,
+ IB_CM_REP_RCVD,
+ IB_CM_MRA_REP_SENT,
+ IB_CM_MRA_REP_RCVD,
+ IB_CM_ESTABLISHED,
+ IB_CM_DREQ_SENT,
+ IB_CM_DREQ_RCVD,
+ IB_CM_TIMEWAIT,
+ IB_CM_SIDR_REQ_SENT,
+ IB_CM_SIDR_REQ_RCVD
+};
+
+enum ib_cm_lap_state {
+ IB_CM_LAP_UNINIT,
+ IB_CM_LAP_IDLE,
+ IB_CM_LAP_SENT,
+ IB_CM_LAP_RCVD,
+ IB_CM_MRA_LAP_SENT,
+ IB_CM_MRA_LAP_RCVD,
+};
+
+enum ib_cm_event_type {
+ IB_CM_REQ_ERROR,
+ IB_CM_REQ_RECEIVED,
+ IB_CM_REP_ERROR,
+ IB_CM_REP_RECEIVED,
+ IB_CM_RTU_RECEIVED,
+ IB_CM_USER_ESTABLISHED,
+ IB_CM_DREQ_ERROR,
+ IB_CM_DREQ_RECEIVED,
+ IB_CM_DREP_RECEIVED,
+ IB_CM_TIMEWAIT_EXIT,
+ IB_CM_MRA_RECEIVED,
+ IB_CM_REJ_RECEIVED,
+ IB_CM_LAP_ERROR,
+ IB_CM_LAP_RECEIVED,
+ IB_CM_APR_RECEIVED,
+ IB_CM_SIDR_REQ_ERROR,
+ IB_CM_SIDR_REQ_RECEIVED,
+ IB_CM_SIDR_REP_RECEIVED
+};
+
+enum ib_cm_data_size {
+ IB_CM_REQ_PRIVATE_DATA_SIZE = 92,
+ IB_CM_MRA_PRIVATE_DATA_SIZE = 222,
+ IB_CM_REJ_PRIVATE_DATA_SIZE = 148,
+ IB_CM_REP_PRIVATE_DATA_SIZE = 196,
+ IB_CM_RTU_PRIVATE_DATA_SIZE = 224,
+ IB_CM_DREQ_PRIVATE_DATA_SIZE = 220,
+ IB_CM_DREP_PRIVATE_DATA_SIZE = 224,
+ IB_CM_REJ_ARI_LENGTH = 72,
+ IB_CM_LAP_PRIVATE_DATA_SIZE = 168,
+ IB_CM_APR_PRIVATE_DATA_SIZE = 148,
+ IB_CM_APR_INFO_LENGTH = 72,
+ IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE = 216,
+ IB_CM_SIDR_REP_PRIVATE_DATA_SIZE = 136,
+ IB_CM_SIDR_REP_INFO_LENGTH = 72,
+ IB_CM_COMPARE_SIZE = 64
+};
+
+struct ib_cm_id;
+
+struct ib_cm_req_event_param {
+ struct ib_cm_id *listen_id;
+ u8 port;
+
+ struct ib_sa_path_rec *primary_path;
+ struct ib_sa_path_rec *alternate_path;
+
+ __be64 remote_ca_guid;
+ u32 remote_qkey;
+ u32 remote_qpn;
+ enum ib_qp_type qp_type;
+
+ u32 starting_psn;
+ u8 responder_resources;
+ u8 initiator_depth;
+ unsigned int local_cm_response_timeout:5;
+ unsigned int flow_control:1;
+ unsigned int remote_cm_response_timeout:5;
+ unsigned int retry_count:3;
+ unsigned int rnr_retry_count:3;
+ unsigned int srq:1;
+};
+
+struct ib_cm_rep_event_param {
+ __be64 remote_ca_guid;
+ u32 remote_qkey;
+ u32 remote_qpn;
+ u32 starting_psn;
+ u8 responder_resources;
+ u8 initiator_depth;
+ unsigned int target_ack_delay:5;
+ unsigned int failover_accepted:2;
+ unsigned int flow_control:1;
+ unsigned int rnr_retry_count:3;
+ unsigned int srq:1;
+};
+
+enum ib_cm_rej_reason {
+ IB_CM_REJ_NO_QP = 1,
+ IB_CM_REJ_NO_EEC = 2,
+ IB_CM_REJ_NO_RESOURCES = 3,
+ IB_CM_REJ_TIMEOUT = 4,
+ IB_CM_REJ_UNSUPPORTED = 5,
+ IB_CM_REJ_INVALID_COMM_ID = 6,
+ IB_CM_REJ_INVALID_COMM_INSTANCE = 7,
+ IB_CM_REJ_INVALID_SERVICE_ID = 8,
+ IB_CM_REJ_INVALID_TRANSPORT_TYPE = 9,
+ IB_CM_REJ_STALE_CONN = 10,
+ IB_CM_REJ_RDC_NOT_EXIST = 11,
+ IB_CM_REJ_INVALID_GID = 12,
+ IB_CM_REJ_INVALID_LID = 13,
+ IB_CM_REJ_INVALID_SL = 14,
+ IB_CM_REJ_INVALID_TRAFFIC_CLASS = 15,
+ IB_CM_REJ_INVALID_HOP_LIMIT = 16,
+ IB_CM_REJ_INVALID_PACKET_RATE = 17,
+ IB_CM_REJ_INVALID_ALT_GID = 18,
+ IB_CM_REJ_INVALID_ALT_LID = 19,
+ IB_CM_REJ_INVALID_ALT_SL = 20,
+ IB_CM_REJ_INVALID_ALT_TRAFFIC_CLASS = 21,
+ IB_CM_REJ_INVALID_ALT_HOP_LIMIT = 22,
+ IB_CM_REJ_INVALID_ALT_PACKET_RATE = 23,
+ IB_CM_REJ_PORT_CM_REDIRECT = 24,
+ IB_CM_REJ_PORT_REDIRECT = 25,
+ IB_CM_REJ_INVALID_MTU = 26,
+ IB_CM_REJ_INSUFFICIENT_RESP_RESOURCES = 27,
+ IB_CM_REJ_CONSUMER_DEFINED = 28,
+ IB_CM_REJ_INVALID_RNR_RETRY = 29,
+ IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID = 30,
+ IB_CM_REJ_INVALID_CLASS_VERSION = 31,
+ IB_CM_REJ_INVALID_FLOW_LABEL = 32,
+ IB_CM_REJ_INVALID_ALT_FLOW_LABEL = 33
+};
+
+struct ib_cm_rej_event_param {
+ enum ib_cm_rej_reason reason;
+ void *ari;
+ u8 ari_length;
+};
+
+struct ib_cm_mra_event_param {
+ u8 service_timeout;
+};
+
+struct ib_cm_lap_event_param {
+ struct ib_sa_path_rec *alternate_path;
+};
+
+enum ib_cm_apr_status {
+ IB_CM_APR_SUCCESS,
+ IB_CM_APR_INVALID_COMM_ID,
+ IB_CM_APR_UNSUPPORTED,
+ IB_CM_APR_REJECT,
+ IB_CM_APR_REDIRECT,
+ IB_CM_APR_IS_CURRENT,
+ IB_CM_APR_INVALID_QPN_EECN,
+ IB_CM_APR_INVALID_LID,
+ IB_CM_APR_INVALID_GID,
+ IB_CM_APR_INVALID_FLOW_LABEL,
+ IB_CM_APR_INVALID_TCLASS,
+ IB_CM_APR_INVALID_HOP_LIMIT,
+ IB_CM_APR_INVALID_PACKET_RATE,
+ IB_CM_APR_INVALID_SL
+};
+
+struct ib_cm_apr_event_param {
+ enum ib_cm_apr_status ap_status;
+ void *apr_info;
+ u8 info_len;
+};
+
+struct ib_cm_sidr_req_event_param {
+ struct ib_cm_id *listen_id;
+ u8 port;
+ u16 pkey;
+};
+
+enum ib_cm_sidr_status {
+ IB_SIDR_SUCCESS,
+ IB_SIDR_UNSUPPORTED,
+ IB_SIDR_REJECT,
+ IB_SIDR_NO_QP,
+ IB_SIDR_REDIRECT,
+ IB_SIDR_UNSUPPORTED_VERSION
+};
+
+struct ib_cm_sidr_rep_event_param {
+ enum ib_cm_sidr_status status;
+ u32 qkey;
+ u32 qpn;
+ void *info;
+ u8 info_len;
+};
+
+struct ib_cm_event {
+ enum ib_cm_event_type event;
+ union {
+ struct ib_cm_req_event_param req_rcvd;
+ struct ib_cm_rep_event_param rep_rcvd;
+ /* No data for RTU received events. */
+ struct ib_cm_rej_event_param rej_rcvd;
+ struct ib_cm_mra_event_param mra_rcvd;
+ struct ib_cm_lap_event_param lap_rcvd;
+ struct ib_cm_apr_event_param apr_rcvd;
+ /* No data for DREQ/DREP received events. */
+ struct ib_cm_sidr_req_event_param sidr_req_rcvd;
+ struct ib_cm_sidr_rep_event_param sidr_rep_rcvd;
+ enum ib_wc_status send_status;
+ } param;
+
+ void *private_data;
+};
+
+/**
+ * ib_cm_handler - User-defined callback to process communication events.
+ * @cm_id: Communication identifier associated with the reported event.
+ * @event: Information about the communication event.
+ *
+ * IB_CM_REQ_RECEIVED and IB_CM_SIDR_REQ_RECEIVED communication events
+ * generated as a result of listen requests result in the allocation of a
+ * new @cm_id. The new @cm_id is returned to the user through this callback.
+ * Clients are responsible for destroying the new @cm_id. For peer-to-peer
+ * IB_CM_REQ_RECEIVED and all other events, the returned @cm_id corresponds
+ * to a user's existing communication identifier.
+ *
+ * Users may not call ib_destroy_cm_id while in the context of this callback;
+ * however, returning a non-zero value instructs the communication manager to
+ * destroy the @cm_id after the callback completes.
+ */
+typedef int (*ib_cm_handler)(struct ib_cm_id *cm_id,
+ struct ib_cm_event *event);
+
+struct ib_cm_id {
+ ib_cm_handler cm_handler;
+ void *context;
+ struct ib_device *device;
+ __be64 service_id;
+ __be64 service_mask;
+ enum ib_cm_state state; /* internal CM/debug use */
+ enum ib_cm_lap_state lap_state; /* internal CM/debug use */
+ __be32 local_id;
+ __be32 remote_id;
+ u32 remote_cm_qpn; /* 1 unless redirected */
+};
+
+/**
+ * ib_create_cm_id - Allocate a communication identifier.
+ * @device: Device associated with the cm_id. All related communication will
+ * be associated with the specified device.
+ * @cm_handler: Callback invoked to notify the user of CM events.
+ * @context: User specified context associated with the communication
+ * identifier.
+ *
+ * Communication identifiers are used to track connection states, service
+ * ID resolution requests, and listen requests.
+ */
+struct ib_cm_id *ib_create_cm_id(struct ib_device *device,
+ ib_cm_handler cm_handler,
+ void *context);
+
+/**
+ * ib_destroy_cm_id - Destroy a connection identifier.
+ * @cm_id: Connection identifier to destroy.
+ *
+ * This call blocks until the connection identifier is destroyed.
+ */
+void ib_destroy_cm_id(struct ib_cm_id *cm_id);
+
+#define IB_SERVICE_ID_AGN_MASK cpu_to_be64(0xFF00000000000000ULL)
+#define IB_CM_ASSIGN_SERVICE_ID cpu_to_be64(0x0200000000000000ULL)
+#define IB_CMA_SERVICE_ID cpu_to_be64(0x0000000001000000ULL)
+#define IB_CMA_SERVICE_ID_MASK cpu_to_be64(0xFFFFFFFFFF000000ULL)
+#define IB_SDP_SERVICE_ID cpu_to_be64(0x0000000000010000ULL)
+#define IB_SDP_SERVICE_ID_MASK cpu_to_be64(0xFFFFFFFFFFFF0000ULL)
+
+struct ib_cm_compare_data {
+ u8 data[IB_CM_COMPARE_SIZE];
+ u8 mask[IB_CM_COMPARE_SIZE];
+};
+
+/**
+ * ib_cm_listen - Initiates listening on the specified service ID for
+ * connection and service ID resolution requests.
+ * @cm_id: Connection identifier associated with the listen request.
+ * @service_id: Service identifier matched against incoming connection
+ * and service ID resolution requests. The service ID should be specified
+ * network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will
+ * assign a service ID to the caller.
+ * @service_mask: Mask applied to service ID used to listen across a
+ * range of service IDs. If set to 0, the service ID is matched
+ * exactly. This parameter is ignored if %service_id is set to
+ * IB_CM_ASSIGN_SERVICE_ID.
+ * @compare_data: This parameter is optional. It specifies data that must
+ * appear in the private data of a connection request for the specified
+ * listen request.
+ */
+int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
+ struct ib_cm_compare_data *compare_data);
+
+struct ib_cm_req_param {
+ struct ib_sa_path_rec *primary_path;
+ struct ib_sa_path_rec *alternate_path;
+ __be64 service_id;
+ u32 qp_num;
+ enum ib_qp_type qp_type;
+ u32 starting_psn;
+ const void *private_data;
+ u8 private_data_len;
+ u8 peer_to_peer;
+ u8 responder_resources;
+ u8 initiator_depth;
+ u8 remote_cm_response_timeout;
+ u8 flow_control;
+ u8 local_cm_response_timeout;
+ u8 retry_count;
+ u8 rnr_retry_count;
+ u8 max_cm_retries;
+ u8 srq;
+};
+
+/**
+ * ib_send_cm_req - Sends a connection request to the remote node.
+ * @cm_id: Connection identifier that will be associated with the
+ * connection request.
+ * @param: Connection request information needed to establish the
+ * connection.
+ */
+int ib_send_cm_req(struct ib_cm_id *cm_id,
+ struct ib_cm_req_param *param);
+
+struct ib_cm_rep_param {
+ u32 qp_num;
+ u32 starting_psn;
+ const void *private_data;
+ u8 private_data_len;
+ u8 responder_resources;
+ u8 initiator_depth;
+ u8 failover_accepted;
+ u8 flow_control;
+ u8 rnr_retry_count;
+ u8 srq;
+};
+
+/**
+ * ib_send_cm_rep - Sends a connection reply in response to a connection
+ * request.
+ * @cm_id: Connection identifier that will be associated with the
+ * connection request.
+ * @param: Connection reply information needed to establish the
+ * connection.
+ */
+int ib_send_cm_rep(struct ib_cm_id *cm_id,
+ struct ib_cm_rep_param *param);
+
+/**
+ * ib_send_cm_rtu - Sends a connection ready to use message in response
+ * to a connection reply message.
+ * @cm_id: Connection identifier associated with the connection request.
+ * @private_data: Optional user-defined private data sent with the
+ * ready to use message.
+ * @private_data_len: Size of the private data buffer, in bytes.
+ */
+int ib_send_cm_rtu(struct ib_cm_id *cm_id,
+ const void *private_data,
+ u8 private_data_len);
+
+/**
+ * ib_send_cm_dreq - Sends a disconnection request for an existing
+ * connection.
+ * @cm_id: Connection identifier associated with the connection being
+ * released.
+ * @private_data: Optional user-defined private data sent with the
+ * disconnection request message.
+ * @private_data_len: Size of the private data buffer, in bytes.
+ */
+int ib_send_cm_dreq(struct ib_cm_id *cm_id,
+ const void *private_data,
+ u8 private_data_len);
+
+/**
+ * ib_send_cm_drep - Sends a disconnection reply to a disconnection request.
+ * @cm_id: Connection identifier associated with the connection being
+ * released.
+ * @private_data: Optional user-defined private data sent with the
+ * disconnection reply message.
+ * @private_data_len: Size of the private data buffer, in bytes.
+ *
+ * If the cm_id is in the correct state, the CM will transition the connection
+ * to the timewait state, even if an error occurs sending the DREP message.
+ */
+int ib_send_cm_drep(struct ib_cm_id *cm_id,
+ const void *private_data,
+ u8 private_data_len);
+
+/**
+ * ib_cm_notify - Notifies the CM of an event reported to the consumer.
+ * @cm_id: Connection identifier to transition to established.
+ * @event: Type of event.
+ *
+ * This routine should be invoked by users to notify the CM of relevant
+ * communication events. Events that should be reported to the CM and
+ * when to report them are:
+ *
+ * IB_EVENT_COMM_EST - Used when a message is received on a connected
+ * QP before an RTU has been received.
+ * IB_EVENT_PATH_MIG - Notifies the CM that the connection has failed over
+ * to the alternate path.
+ */
+int ib_cm_notify(struct ib_cm_id *cm_id, enum ib_event_type event);
+
+/**
+ * ib_send_cm_rej - Sends a connection rejection message to the
+ * remote node.
+ * @cm_id: Connection identifier associated with the connection being
+ * rejected.
+ * @reason: Reason for the connection request rejection.
+ * @ari: Optional additional rejection information.
+ * @ari_length: Size of the additional rejection information, in bytes.
+ * @private_data: Optional user-defined private data sent with the
+ * rejection message.
+ * @private_data_len: Size of the private data buffer, in bytes.
+ */
+int ib_send_cm_rej(struct ib_cm_id *cm_id,
+ enum ib_cm_rej_reason reason,
+ void *ari,
+ u8 ari_length,
+ const void *private_data,
+ u8 private_data_len);
+
+#define IB_CM_MRA_FLAG_DELAY 0x80 /* Send MRA only after a duplicate msg */
+
+/**
+ * ib_send_cm_mra - Sends a message receipt acknowledgement to a connection
+ * message.
+ * @cm_id: Connection identifier associated with the connection message.
+ * @service_timeout: The lower 5-bits specify the maximum time required for
+ * the sender to reply to to the connection message. The upper 3-bits
+ * specify additional control flags.
+ * @private_data: Optional user-defined private data sent with the
+ * message receipt acknowledgement.
+ * @private_data_len: Size of the private data buffer, in bytes.
+ */
+int ib_send_cm_mra(struct ib_cm_id *cm_id,
+ u8 service_timeout,
+ const void *private_data,
+ u8 private_data_len);
+
+/**
+ * ib_send_cm_lap - Sends a load alternate path request.
+ * @cm_id: Connection identifier associated with the load alternate path
+ * message.
+ * @alternate_path: A path record that identifies the alternate path to
+ * load.
+ * @private_data: Optional user-defined private data sent with the
+ * load alternate path message.
+ * @private_data_len: Size of the private data buffer, in bytes.
+ */
+int ib_send_cm_lap(struct ib_cm_id *cm_id,
+ struct ib_sa_path_rec *alternate_path,
+ const void *private_data,
+ u8 private_data_len);
+
+/**
+ * ib_cm_init_qp_attr - Initializes the QP attributes for use in transitioning
+ * to a specified QP state.
+ * @cm_id: Communication identifier associated with the QP attributes to
+ * initialize.
+ * @qp_attr: On input, specifies the desired QP state. On output, the
+ * mandatory and desired optional attributes will be set in order to
+ * modify the QP to the specified state.
+ * @qp_attr_mask: The QP attribute mask that may be used to transition the
+ * QP to the specified state.
+ *
+ * Users must set the @qp_attr->qp_state to the desired QP state. This call
+ * will set all required attributes for the given transition, along with
+ * known optional attributes. Users may override the attributes returned from
+ * this call before calling ib_modify_qp.
+ */
+int ib_cm_init_qp_attr(struct ib_cm_id *cm_id,
+ struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask);
+
+/**
+ * ib_send_cm_apr - Sends an alternate path response message in response to
+ * a load alternate path request.
+ * @cm_id: Connection identifier associated with the alternate path response.
+ * @status: Reply status sent with the alternate path response.
+ * @info: Optional additional information sent with the alternate path
+ * response.
+ * @info_length: Size of the additional information, in bytes.
+ * @private_data: Optional user-defined private data sent with the
+ * alternate path response message.
+ * @private_data_len: Size of the private data buffer, in bytes.
+ */
+int ib_send_cm_apr(struct ib_cm_id *cm_id,
+ enum ib_cm_apr_status status,
+ void *info,
+ u8 info_length,
+ const void *private_data,
+ u8 private_data_len);
+
+struct ib_cm_sidr_req_param {
+ struct ib_sa_path_rec *path;
+ __be64 service_id;
+ int timeout_ms;
+ const void *private_data;
+ u8 private_data_len;
+ u8 max_cm_retries;
+};
+
+/**
+ * ib_send_cm_sidr_req - Sends a service ID resolution request to the
+ * remote node.
+ * @cm_id: Communication identifier that will be associated with the
+ * service ID resolution request.
+ * @param: Service ID resolution request information.
+ */
+int ib_send_cm_sidr_req(struct ib_cm_id *cm_id,
+ struct ib_cm_sidr_req_param *param);
+
+struct ib_cm_sidr_rep_param {
+ u32 qp_num;
+ u32 qkey;
+ enum ib_cm_sidr_status status;
+ const void *info;
+ u8 info_length;
+ const void *private_data;
+ u8 private_data_len;
+};
+
+/**
+ * ib_send_cm_sidr_rep - Sends a service ID resolution reply to the
+ * remote node.
+ * @cm_id: Communication identifier associated with the received service ID
+ * resolution request.
+ * @param: Service ID resolution reply information.
+ */
+int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id,
+ struct ib_cm_sidr_rep_param *param);
+
+#endif /* IB_CM_H */
diff --git a/sys/ofed/include/rdma/ib_fmr_pool.h b/sys/ofed/include/rdma/ib_fmr_pool.h
new file mode 100644
index 0000000..f62b842
--- /dev/null
+++ b/sys/ofed/include/rdma/ib_fmr_pool.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(IB_FMR_POOL_H)
+#define IB_FMR_POOL_H
+
+#include <rdma/ib_verbs.h>
+
+struct ib_fmr_pool;
+
+/**
+ * struct ib_fmr_pool_param - Parameters for creating FMR pool
+ * @max_pages_per_fmr:Maximum number of pages per map request.
+ * @page_shift: Log2 of sizeof "pages" mapped by this fmr
+ * @access:Access flags for FMRs in pool.
+ * @pool_size:Number of FMRs to allocate for pool.
+ * @dirty_watermark:Flush is triggered when @dirty_watermark dirty
+ * FMRs are present.
+ * @flush_function:Callback called when unmapped FMRs are flushed and
+ * more FMRs are possibly available for mapping
+ * @flush_arg:Context passed to user's flush function.
+ * @cache:If set, FMRs may be reused after unmapping for identical map
+ * requests.
+ */
+struct ib_fmr_pool_param {
+ int max_pages_per_fmr;
+ int page_shift;
+ enum ib_access_flags access;
+ int pool_size;
+ int dirty_watermark;
+ void (*flush_function)(struct ib_fmr_pool *pool,
+ void *arg);
+ void *flush_arg;
+ unsigned cache:1;
+};
+
+struct ib_pool_fmr {
+ struct ib_fmr *fmr;
+ struct ib_fmr_pool *pool;
+ struct list_head list;
+ struct hlist_node cache_node;
+ int ref_count;
+ int remap_count;
+ u64 io_virtual_address;
+ int page_list_len;
+ u64 page_list[0];
+};
+
+struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd,
+ struct ib_fmr_pool_param *params);
+
+void ib_destroy_fmr_pool(struct ib_fmr_pool *pool);
+
+int ib_flush_fmr_pool(struct ib_fmr_pool *pool);
+
+struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle,
+ u64 *page_list,
+ int list_len,
+ u64 io_virtual_address);
+
+int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr);
+
+#endif /* IB_FMR_POOL_H */
diff --git a/sys/ofed/include/rdma/ib_mad.h b/sys/ofed/include/rdma/ib_mad.h
new file mode 100644
index 0000000..d3b9401
--- /dev/null
+++ b/sys/ofed/include/rdma/ib_mad.h
@@ -0,0 +1,655 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004-2006 Voltaire Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(IB_MAD_H)
+#define IB_MAD_H
+
+#include <linux/list.h>
+
+#include <rdma/ib_verbs.h>
+
+/* Management base version */
+#define IB_MGMT_BASE_VERSION 1
+
+/* Management classes */
+#define IB_MGMT_CLASS_SUBN_LID_ROUTED 0x01
+#define IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE 0x81
+#define IB_MGMT_CLASS_SUBN_ADM 0x03
+#define IB_MGMT_CLASS_PERF_MGMT 0x04
+#define IB_MGMT_CLASS_BM 0x05
+#define IB_MGMT_CLASS_DEVICE_MGMT 0x06
+#define IB_MGMT_CLASS_CM 0x07
+#define IB_MGMT_CLASS_SNMP 0x08
+#define IB_MGMT_CLASS_DEVICE_ADM 0x10
+#define IB_MGMT_CLASS_BOOT_MGMT 0x11
+#define IB_MGMT_CLASS_BIS 0x12
+#define IB_MGMT_CLASS_CONG_MGMT 0x21
+#define IB_MGMT_CLASS_VENDOR_RANGE2_START 0x30
+#define IB_MGMT_CLASS_VENDOR_RANGE2_END 0x4F
+
+#define IB_OPENIB_OUI (0x001405)
+
+/* Management methods */
+#define IB_MGMT_METHOD_GET 0x01
+#define IB_MGMT_METHOD_SET 0x02
+#define IB_MGMT_METHOD_GET_RESP 0x81
+#define IB_MGMT_METHOD_SEND 0x03
+#define IB_MGMT_METHOD_TRAP 0x05
+#define IB_MGMT_METHOD_REPORT 0x06
+#define IB_MGMT_METHOD_REPORT_RESP 0x86
+#define IB_MGMT_METHOD_TRAP_REPRESS 0x07
+
+#define IB_MGMT_METHOD_RESP 0x80
+#define IB_BM_ATTR_MOD_RESP cpu_to_be32(1)
+
+#define IB_MGMT_MAX_METHODS 128
+
+/* RMPP information */
+#define IB_MGMT_RMPP_VERSION 1
+
+#define IB_MGMT_RMPP_TYPE_DATA 1
+#define IB_MGMT_RMPP_TYPE_ACK 2
+#define IB_MGMT_RMPP_TYPE_STOP 3
+#define IB_MGMT_RMPP_TYPE_ABORT 4
+
+#define IB_MGMT_RMPP_FLAG_ACTIVE 1
+#define IB_MGMT_RMPP_FLAG_FIRST (1<<1)
+#define IB_MGMT_RMPP_FLAG_LAST (1<<2)
+
+#define IB_MGMT_RMPP_NO_RESPTIME 0x1F
+
+#define IB_MGMT_RMPP_STATUS_SUCCESS 0
+#define IB_MGMT_RMPP_STATUS_RESX 1
+#define IB_MGMT_RMPP_STATUS_ABORT_MIN 118
+#define IB_MGMT_RMPP_STATUS_T2L 118
+#define IB_MGMT_RMPP_STATUS_BAD_LEN 119
+#define IB_MGMT_RMPP_STATUS_BAD_SEG 120
+#define IB_MGMT_RMPP_STATUS_BADT 121
+#define IB_MGMT_RMPP_STATUS_W2S 122
+#define IB_MGMT_RMPP_STATUS_S2B 123
+#define IB_MGMT_RMPP_STATUS_BAD_STATUS 124
+#define IB_MGMT_RMPP_STATUS_UNV 125
+#define IB_MGMT_RMPP_STATUS_TMR 126
+#define IB_MGMT_RMPP_STATUS_UNSPEC 127
+#define IB_MGMT_RMPP_STATUS_ABORT_MAX 127
+
+#define IB_QP0 0
+#define IB_QP1 cpu_to_be32(1)
+#define IB_QP1_QKEY 0x80010000
+#define IB_QP_SET_QKEY 0x80000000
+
+#define IB_DEFAULT_PKEY_PARTIAL 0x7FFF
+#define IB_DEFAULT_PKEY_FULL 0xFFFF
+
+enum {
+ IB_MGMT_MAD_HDR = 24,
+ IB_MGMT_MAD_DATA = 232,
+ IB_MGMT_RMPP_HDR = 36,
+ IB_MGMT_RMPP_DATA = 220,
+ IB_MGMT_VENDOR_HDR = 40,
+ IB_MGMT_VENDOR_DATA = 216,
+ IB_MGMT_SA_HDR = 56,
+ IB_MGMT_SA_DATA = 200,
+ IB_MGMT_DEVICE_HDR = 64,
+ IB_MGMT_DEVICE_DATA = 192,
+};
+
+struct ib_mad_hdr {
+ u8 base_version;
+ u8 mgmt_class;
+ u8 class_version;
+ u8 method;
+ __be16 status;
+ __be16 class_specific;
+ __be64 tid;
+ __be16 attr_id;
+ __be16 resv;
+ __be32 attr_mod;
+};
+
+struct ib_rmpp_hdr {
+ u8 rmpp_version;
+ u8 rmpp_type;
+ u8 rmpp_rtime_flags;
+ u8 rmpp_status;
+ __be32 seg_num;
+ __be32 paylen_newwin;
+};
+
+typedef u64 __bitwise ib_sa_comp_mask;
+
+#define IB_SA_COMP_MASK(n) ((__force ib_sa_comp_mask) cpu_to_be64(1ull << n))
+
+/*
+ * ib_sa_hdr and ib_sa_mad structures must be packed because they have
+ * 64-bit fields that are only 32-bit aligned. 64-bit architectures will
+ * lay them out wrong otherwise. (And unfortunately they are sent on
+ * the wire so we can't change the layout)
+ */
+struct ib_sa_hdr {
+ __be64 sm_key;
+ __be16 attr_offset;
+ __be16 reserved;
+ ib_sa_comp_mask comp_mask;
+} __attribute__ ((packed));
+
+struct ib_mad {
+ struct ib_mad_hdr mad_hdr;
+ u8 data[IB_MGMT_MAD_DATA];
+};
+
+struct ib_rmpp_mad {
+ struct ib_mad_hdr mad_hdr;
+ struct ib_rmpp_hdr rmpp_hdr;
+ u8 data[IB_MGMT_RMPP_DATA];
+};
+
+struct ib_sa_mad {
+ struct ib_mad_hdr mad_hdr;
+ struct ib_rmpp_hdr rmpp_hdr;
+ struct ib_sa_hdr sa_hdr;
+ u8 data[IB_MGMT_SA_DATA];
+} __attribute__ ((packed));
+
+struct ib_vendor_mad {
+ struct ib_mad_hdr mad_hdr;
+ struct ib_rmpp_hdr rmpp_hdr;
+ u8 reserved;
+ u8 oui[3];
+ u8 data[IB_MGMT_VENDOR_DATA];
+};
+
+struct ib_class_port_info {
+ u8 base_version;
+ u8 class_version;
+ __be16 capability_mask;
+ u8 reserved[3];
+ u8 resp_time_value;
+ u8 redirect_gid[16];
+ __be32 redirect_tcslfl;
+ __be16 redirect_lid;
+ __be16 redirect_pkey;
+ __be32 redirect_qp;
+ __be32 redirect_qkey;
+ u8 trap_gid[16];
+ __be32 trap_tcslfl;
+ __be16 trap_lid;
+ __be16 trap_pkey;
+ __be32 trap_hlqp;
+ __be32 trap_qkey;
+};
+
+/**
+ * ib_mad_send_buf - MAD data buffer and work request for sends.
+ * @next: A pointer used to chain together MADs for posting.
+ * @mad: References an allocated MAD data buffer for MADs that do not have
+ * RMPP active. For MADs using RMPP, references the common and management
+ * class specific headers.
+ * @mad_agent: MAD agent that allocated the buffer.
+ * @ah: The address handle to use when sending the MAD.
+ * @context: User-controlled context fields.
+ * @hdr_len: Indicates the size of the data header of the MAD. This length
+ * includes the common MAD, RMPP, and class specific headers.
+ * @data_len: Indicates the total size of user-transferred data.
+ * @seg_count: The number of RMPP segments allocated for this send.
+ * @seg_size: Size of each RMPP segment.
+ * @timeout_ms: Time to wait for a response.
+ * @retries: Number of times to retry a request for a response. For MADs
+ * using RMPP, this applies per window. On completion, returns the number
+ * of retries needed to complete the transfer.
+ *
+ * Users are responsible for initializing the MAD buffer itself, with the
+ * exception of any RMPP header. Additional segment buffer space allocated
+ * beyond data_len is padding.
+ */
+struct ib_mad_send_buf {
+ struct ib_mad_send_buf *next;
+ void *mad;
+ struct ib_mad_agent *mad_agent;
+ struct ib_ah *ah;
+ void *context[2];
+ int hdr_len;
+ int data_len;
+ int seg_count;
+ int seg_size;
+ int timeout_ms;
+ int retries;
+};
+
+/**
+ * ib_response_mad - Returns if the specified MAD has been generated in
+ * response to a sent request or trap.
+ */
+int ib_response_mad(struct ib_mad *mad);
+
+/**
+ * ib_get_rmpp_resptime - Returns the RMPP response time.
+ * @rmpp_hdr: An RMPP header.
+ */
+static inline u8 ib_get_rmpp_resptime(struct ib_rmpp_hdr *rmpp_hdr)
+{
+ return rmpp_hdr->rmpp_rtime_flags >> 3;
+}
+
+/**
+ * ib_get_rmpp_flags - Returns the RMPP flags.
+ * @rmpp_hdr: An RMPP header.
+ */
+static inline u8 ib_get_rmpp_flags(struct ib_rmpp_hdr *rmpp_hdr)
+{
+ return rmpp_hdr->rmpp_rtime_flags & 0x7;
+}
+
+/**
+ * ib_set_rmpp_resptime - Sets the response time in an RMPP header.
+ * @rmpp_hdr: An RMPP header.
+ * @rtime: The response time to set.
+ */
+static inline void ib_set_rmpp_resptime(struct ib_rmpp_hdr *rmpp_hdr, u8 rtime)
+{
+ rmpp_hdr->rmpp_rtime_flags = ib_get_rmpp_flags(rmpp_hdr) | (rtime << 3);
+}
+
+/**
+ * ib_set_rmpp_flags - Sets the flags in an RMPP header.
+ * @rmpp_hdr: An RMPP header.
+ * @flags: The flags to set.
+ */
+static inline void ib_set_rmpp_flags(struct ib_rmpp_hdr *rmpp_hdr, u8 flags)
+{
+ rmpp_hdr->rmpp_rtime_flags = (rmpp_hdr->rmpp_rtime_flags & 0xF8) |
+ (flags & 0x7);
+}
+
+struct ib_mad_agent;
+struct ib_mad_send_wc;
+struct ib_mad_recv_wc;
+
+/**
+ * ib_mad_send_handler - callback handler for a sent MAD.
+ * @mad_agent: MAD agent that sent the MAD.
+ * @mad_send_wc: Send work completion information on the sent MAD.
+ */
+typedef void (*ib_mad_send_handler)(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_wc *mad_send_wc);
+
+/**
+ * ib_mad_snoop_handler - Callback handler for snooping sent MADs.
+ * @mad_agent: MAD agent that snooped the MAD.
+ * @send_wr: Work request information on the sent MAD.
+ * @mad_send_wc: Work completion information on the sent MAD. Valid
+ * only for snooping that occurs on a send completion.
+ *
+ * Clients snooping MADs should not modify data referenced by the @send_wr
+ * or @mad_send_wc.
+ */
+typedef void (*ib_mad_snoop_handler)(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_buf *send_buf,
+ struct ib_mad_send_wc *mad_send_wc);
+
+/**
+ * ib_mad_recv_handler - callback handler for a received MAD.
+ * @mad_agent: MAD agent requesting the received MAD.
+ * @mad_recv_wc: Received work completion information on the received MAD.
+ *
+ * MADs received in response to a send request operation will be handed to
+ * the user before the send operation completes. All data buffers given
+ * to registered agents through this routine are owned by the receiving
+ * client, except for snooping agents. Clients snooping MADs should not
+ * modify the data referenced by @mad_recv_wc.
+ */
+typedef void (*ib_mad_recv_handler)(struct ib_mad_agent *mad_agent,
+ struct ib_mad_recv_wc *mad_recv_wc);
+
+/**
+ * ib_mad_agent - Used to track MAD registration with the access layer.
+ * @device: Reference to device registration is on.
+ * @qp: Reference to QP used for sending and receiving MADs.
+ * @mr: Memory region for system memory usable for DMA.
+ * @recv_handler: Callback handler for a received MAD.
+ * @send_handler: Callback handler for a sent MAD.
+ * @snoop_handler: Callback handler for snooped sent MADs.
+ * @context: User-specified context associated with this registration.
+ * @hi_tid: Access layer assigned transaction ID for this client.
+ * Unsolicited MADs sent by this client will have the upper 32-bits
+ * of their TID set to this value.
+ * @port_num: Port number on which QP is registered
+ * @rmpp_version: If set, indicates the RMPP version used by this agent.
+ */
+struct ib_mad_agent {
+ struct ib_device *device;
+ struct ib_qp *qp;
+ struct ib_mr *mr;
+ ib_mad_recv_handler recv_handler;
+ ib_mad_send_handler send_handler;
+ ib_mad_snoop_handler snoop_handler;
+ void *context;
+ u32 hi_tid;
+ u8 port_num;
+ u8 rmpp_version;
+};
+
+/**
+ * ib_mad_send_wc - MAD send completion information.
+ * @send_buf: Send MAD data buffer associated with the send MAD request.
+ * @status: Completion status.
+ * @vendor_err: Optional vendor error information returned with a failed
+ * request.
+ */
+struct ib_mad_send_wc {
+ struct ib_mad_send_buf *send_buf;
+ enum ib_wc_status status;
+ u32 vendor_err;
+};
+
+/**
+ * ib_mad_recv_buf - received MAD buffer information.
+ * @list: Reference to next data buffer for a received RMPP MAD.
+ * @grh: References a data buffer containing the global route header.
+ * The data refereced by this buffer is only valid if the GRH is
+ * valid.
+ * @mad: References the start of the received MAD.
+ */
+struct ib_mad_recv_buf {
+ struct list_head list;
+ struct ib_grh *grh;
+ struct ib_mad *mad;
+};
+
+/**
+ * ib_mad_recv_wc - received MAD information.
+ * @wc: Completion information for the received data.
+ * @recv_buf: Specifies the location of the received data buffer(s).
+ * @rmpp_list: Specifies a list of RMPP reassembled received MAD buffers.
+ * @mad_len: The length of the received MAD, without duplicated headers.
+ *
+ * For received response, the wr_id contains a pointer to the ib_mad_send_buf
+ * for the corresponding send request.
+ */
+struct ib_mad_recv_wc {
+ struct ib_wc *wc;
+ struct ib_mad_recv_buf recv_buf;
+ struct list_head rmpp_list;
+ int mad_len;
+};
+
+/**
+ * ib_mad_reg_req - MAD registration request
+ * @mgmt_class: Indicates which management class of MADs should be receive
+ * by the caller. This field is only required if the user wishes to
+ * receive unsolicited MADs, otherwise it should be 0.
+ * @mgmt_class_version: Indicates which version of MADs for the given
+ * management class to receive.
+ * @oui: Indicates IEEE OUI when mgmt_class is a vendor class
+ * in the range from 0x30 to 0x4f. Otherwise not used.
+ * @method_mask: The caller will receive unsolicited MADs for any method
+ * where @method_mask = 1.
+ */
+struct ib_mad_reg_req {
+ u8 mgmt_class;
+ u8 mgmt_class_version;
+ u8 oui[3];
+ DECLARE_BITMAP(method_mask, IB_MGMT_MAX_METHODS);
+};
+
+/**
+ * ib_register_mad_agent - Register to send/receive MADs.
+ * @device: The device to register with.
+ * @port_num: The port on the specified device to use.
+ * @qp_type: Specifies which QP to access. Must be either
+ * IB_QPT_SMI or IB_QPT_GSI.
+ * @mad_reg_req: Specifies which unsolicited MADs should be received
+ * by the caller. This parameter may be NULL if the caller only
+ * wishes to receive solicited responses.
+ * @rmpp_version: If set, indicates that the client will send
+ * and receive MADs that contain the RMPP header for the given version.
+ * If set to 0, indicates that RMPP is not used by this client.
+ * @send_handler: The completion callback routine invoked after a send
+ * request has completed.
+ * @recv_handler: The completion callback routine invoked for a received
+ * MAD.
+ * @context: User specified context associated with the registration.
+ */
+struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
+ u8 port_num,
+ enum ib_qp_type qp_type,
+ struct ib_mad_reg_req *mad_reg_req,
+ u8 rmpp_version,
+ ib_mad_send_handler send_handler,
+ ib_mad_recv_handler recv_handler,
+ void *context);
+
+enum ib_mad_snoop_flags {
+ /*IB_MAD_SNOOP_POSTED_SENDS = 1,*/
+ /*IB_MAD_SNOOP_RMPP_SENDS = (1<<1),*/
+ IB_MAD_SNOOP_SEND_COMPLETIONS = (1<<2),
+ /*IB_MAD_SNOOP_RMPP_SEND_COMPLETIONS = (1<<3),*/
+ IB_MAD_SNOOP_RECVS = (1<<4)
+ /*IB_MAD_SNOOP_RMPP_RECVS = (1<<5),*/
+ /*IB_MAD_SNOOP_REDIRECTED_QPS = (1<<6)*/
+};
+
+/**
+ * ib_register_mad_snoop - Register to snoop sent and received MADs.
+ * @device: The device to register with.
+ * @port_num: The port on the specified device to use.
+ * @qp_type: Specifies which QP traffic to snoop. Must be either
+ * IB_QPT_SMI or IB_QPT_GSI.
+ * @mad_snoop_flags: Specifies information where snooping occurs.
+ * @send_handler: The callback routine invoked for a snooped send.
+ * @recv_handler: The callback routine invoked for a snooped receive.
+ * @context: User specified context associated with the registration.
+ */
+struct ib_mad_agent *ib_register_mad_snoop(struct ib_device *device,
+ u8 port_num,
+ enum ib_qp_type qp_type,
+ int mad_snoop_flags,
+ ib_mad_snoop_handler snoop_handler,
+ ib_mad_recv_handler recv_handler,
+ void *context);
+
+/**
+ * ib_unregister_mad_agent - Unregisters a client from using MAD services.
+ * @mad_agent: Corresponding MAD registration request to deregister.
+ *
+ * After invoking this routine, MAD services are no longer usable by the
+ * client on the associated QP.
+ */
+int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent);
+
+/**
+ * ib_post_send_mad - Posts MAD(s) to the send queue of the QP associated
+ * with the registered client.
+ * @send_buf: Specifies the information needed to send the MAD(s).
+ * @bad_send_buf: Specifies the MAD on which an error was encountered. This
+ * parameter is optional if only a single MAD is posted.
+ *
+ * Sent MADs are not guaranteed to complete in the order that they were posted.
+ *
+ * If the MAD requires RMPP, the data buffer should contain a single copy
+ * of the common MAD, RMPP, and class specific headers, followed by the class
+ * defined data. If the class defined data would not divide evenly into
+ * RMPP segments, then space must be allocated at the end of the referenced
+ * buffer for any required padding. To indicate the amount of class defined
+ * data being transferred, the paylen_newwin field in the RMPP header should
+ * be set to the size of the class specific header plus the amount of class
+ * defined data being transferred. The paylen_newwin field should be
+ * specified in network-byte order.
+ */
+int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
+ struct ib_mad_send_buf **bad_send_buf);
+
+
+/**
+ * ib_free_recv_mad - Returns data buffers used to receive a MAD.
+ * @mad_recv_wc: Work completion information for a received MAD.
+ *
+ * Clients receiving MADs through their ib_mad_recv_handler must call this
+ * routine to return the work completion buffers to the access layer.
+ */
+void ib_free_recv_mad(struct ib_mad_recv_wc *mad_recv_wc);
+
+/**
+ * ib_cancel_mad - Cancels an outstanding send MAD operation.
+ * @mad_agent: Specifies the registration associated with sent MAD.
+ * @send_buf: Indicates the MAD to cancel.
+ *
+ * MADs will be returned to the user through the corresponding
+ * ib_mad_send_handler.
+ */
+void ib_cancel_mad(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_buf *send_buf);
+
+/**
+ * ib_modify_mad - Modifies an outstanding send MAD operation.
+ * @mad_agent: Specifies the registration associated with sent MAD.
+ * @send_buf: Indicates the MAD to modify.
+ * @timeout_ms: New timeout value for sent MAD.
+ *
+ * This call will reset the timeout value for a sent MAD to the specified
+ * value.
+ */
+int ib_modify_mad(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_buf *send_buf, u32 timeout_ms);
+
+/**
+ * ib_redirect_mad_qp - Registers a QP for MAD services.
+ * @qp: Reference to a QP that requires MAD services.
+ * @rmpp_version: If set, indicates that the client will send
+ * and receive MADs that contain the RMPP header for the given version.
+ * If set to 0, indicates that RMPP is not used by this client.
+ * @send_handler: The completion callback routine invoked after a send
+ * request has completed.
+ * @recv_handler: The completion callback routine invoked for a received
+ * MAD.
+ * @context: User specified context associated with the registration.
+ *
+ * Use of this call allows clients to use MAD services, such as RMPP,
+ * on user-owned QPs. After calling this routine, users may send
+ * MADs on the specified QP by calling ib_mad_post_send.
+ */
+struct ib_mad_agent *ib_redirect_mad_qp(struct ib_qp *qp,
+ u8 rmpp_version,
+ ib_mad_send_handler send_handler,
+ ib_mad_recv_handler recv_handler,
+ void *context);
+
+/**
+ * ib_process_mad_wc - Processes a work completion associated with a
+ * MAD sent or received on a redirected QP.
+ * @mad_agent: Specifies the registered MAD service using the redirected QP.
+ * @wc: References a work completion associated with a sent or received
+ * MAD segment.
+ *
+ * This routine is used to complete or continue processing on a MAD request.
+ * If the work completion is associated with a send operation, calling
+ * this routine is required to continue an RMPP transfer or to wait for a
+ * corresponding response, if it is a request. If the work completion is
+ * associated with a receive operation, calling this routine is required to
+ * process an inbound or outbound RMPP transfer, or to match a response MAD
+ * with its corresponding request.
+ */
+int ib_process_mad_wc(struct ib_mad_agent *mad_agent,
+ struct ib_wc *wc);
+
+/**
+ * ib_create_send_mad - Allocate and initialize a data buffer and work request
+ * for sending a MAD.
+ * @mad_agent: Specifies the registered MAD service to associate with the MAD.
+ * @remote_qpn: Specifies the QPN of the receiving node.
+ * @pkey_index: Specifies which PKey the MAD will be sent using. This field
+ * is valid only if the remote_qpn is QP 1.
+ * @rmpp_active: Indicates if the send will enable RMPP.
+ * @hdr_len: Indicates the size of the data header of the MAD. This length
+ * should include the common MAD header, RMPP header, plus any class
+ * specific header.
+ * @data_len: Indicates the size of any user-transferred data. The call will
+ * automatically adjust the allocated buffer size to account for any
+ * additional padding that may be necessary.
+ * @gfp_mask: GFP mask used for the memory allocation.
+ *
+ * This routine allocates a MAD for sending. The returned MAD send buffer
+ * will reference a data buffer usable for sending a MAD, along
+ * with an initialized work request structure. Users may modify the returned
+ * MAD data buffer before posting the send.
+ *
+ * The returned MAD header, class specific headers, and any padding will be
+ * cleared. Users are responsible for initializing the common MAD header,
+ * any class specific header, and MAD data area.
+ * If @rmpp_active is set, the RMPP header will be initialized for sending.
+ */
+struct ib_mad_send_buf *ib_create_send_mad(struct ib_mad_agent *mad_agent,
+ u32 remote_qpn, u16 pkey_index,
+ int rmpp_active,
+ int hdr_len, int data_len,
+ gfp_t gfp_mask);
+
+/**
+ * ib_is_mad_class_rmpp - returns whether given management class
+ * supports RMPP.
+ * @mgmt_class: management class
+ *
+ * This routine returns whether the management class supports RMPP.
+ */
+int ib_is_mad_class_rmpp(u8 mgmt_class);
+
+/**
+ * ib_get_mad_data_offset - returns the data offset for a given
+ * management class.
+ * @mgmt_class: management class
+ *
+ * This routine returns the data offset in the MAD for the management
+ * class requested.
+ */
+int ib_get_mad_data_offset(u8 mgmt_class);
+
+/**
+ * ib_get_rmpp_segment - returns the data buffer for a given RMPP segment.
+ * @send_buf: Previously allocated send data buffer.
+ * @seg_num: number of segment to return
+ *
+ * This routine returns a pointer to the data buffer of an RMPP MAD.
+ * Users must provide synchronization to @send_buf around this call.
+ */
+void *ib_get_rmpp_segment(struct ib_mad_send_buf *send_buf, int seg_num);
+
+/**
+ * ib_free_send_mad - Returns data buffers used to send a MAD.
+ * @send_buf: Previously allocated send data buffer.
+ */
+void ib_free_send_mad(struct ib_mad_send_buf *send_buf);
+
+#endif /* IB_MAD_H */
diff --git a/sys/ofed/include/rdma/ib_marshall.h b/sys/ofed/include/rdma/ib_marshall.h
new file mode 100644
index 0000000..db03720
--- /dev/null
+++ b/sys/ofed/include/rdma/ib_marshall.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2005-2006 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(IB_USER_MARSHALL_H)
+#define IB_USER_MARSHALL_H
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_sa.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_user_sa.h>
+
+void ib_copy_qp_attr_to_user(struct ib_uverbs_qp_attr *dst,
+ struct ib_qp_attr *src);
+
+void ib_copy_ah_attr_to_user(struct ib_uverbs_ah_attr *dst,
+ struct ib_ah_attr *src);
+
+void ib_copy_path_rec_to_user(struct ib_user_path_rec *dst,
+ struct ib_sa_path_rec *src);
+
+void ib_copy_path_rec_from_user(struct ib_sa_path_rec *dst,
+ struct ib_user_path_rec *src);
+
+#endif /* IB_USER_MARSHALL_H */
diff --git a/sys/ofed/include/rdma/ib_pack.h b/sys/ofed/include/rdma/ib_pack.h
new file mode 100644
index 0000000..af615a4
--- /dev/null
+++ b/sys/ofed/include/rdma/ib_pack.h
@@ -0,0 +1,269 @@
+/*
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IB_PACK_H
+#define IB_PACK_H
+
+#include <rdma/ib_verbs.h>
+
+enum {
+ IB_LRH_BYTES = 8,
+ IB_ETH_BYTES = 14,
+ IB_VLAN_BYTES = 4,
+ IB_GRH_BYTES = 40,
+ IB_BTH_BYTES = 12,
+ IB_DETH_BYTES = 8
+};
+
+struct ib_field {
+ size_t struct_offset_bytes;
+ size_t struct_size_bytes;
+ int offset_words;
+ int offset_bits;
+ int size_bits;
+ char *field_name;
+};
+
+#define RESERVED \
+ .field_name = "reserved"
+
+/*
+ * This macro cleans up the definitions of constants for BTH opcodes.
+ * It is used to define constants such as IB_OPCODE_UD_SEND_ONLY,
+ * which becomes IB_OPCODE_UD + IB_OPCODE_SEND_ONLY, and this gives
+ * the correct value.
+ *
+ * In short, user code should use the constants defined using the
+ * macro rather than worrying about adding together other constants.
+*/
+#define IB_OPCODE(transport, op) \
+ IB_OPCODE_ ## transport ## _ ## op = \
+ IB_OPCODE_ ## transport + IB_OPCODE_ ## op
+
+enum {
+ /* transport types -- just used to define real constants */
+ IB_OPCODE_RC = 0x00,
+ IB_OPCODE_UC = 0x20,
+ IB_OPCODE_RD = 0x40,
+ IB_OPCODE_UD = 0x60,
+
+ /* operations -- just used to define real constants */
+ IB_OPCODE_SEND_FIRST = 0x00,
+ IB_OPCODE_SEND_MIDDLE = 0x01,
+ IB_OPCODE_SEND_LAST = 0x02,
+ IB_OPCODE_SEND_LAST_WITH_IMMEDIATE = 0x03,
+ IB_OPCODE_SEND_ONLY = 0x04,
+ IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE = 0x05,
+ IB_OPCODE_RDMA_WRITE_FIRST = 0x06,
+ IB_OPCODE_RDMA_WRITE_MIDDLE = 0x07,
+ IB_OPCODE_RDMA_WRITE_LAST = 0x08,
+ IB_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE = 0x09,
+ IB_OPCODE_RDMA_WRITE_ONLY = 0x0a,
+ IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE = 0x0b,
+ IB_OPCODE_RDMA_READ_REQUEST = 0x0c,
+ IB_OPCODE_RDMA_READ_RESPONSE_FIRST = 0x0d,
+ IB_OPCODE_RDMA_READ_RESPONSE_MIDDLE = 0x0e,
+ IB_OPCODE_RDMA_READ_RESPONSE_LAST = 0x0f,
+ IB_OPCODE_RDMA_READ_RESPONSE_ONLY = 0x10,
+ IB_OPCODE_ACKNOWLEDGE = 0x11,
+ IB_OPCODE_ATOMIC_ACKNOWLEDGE = 0x12,
+ IB_OPCODE_COMPARE_SWAP = 0x13,
+ IB_OPCODE_FETCH_ADD = 0x14,
+
+ /* real constants follow -- see comment about above IB_OPCODE()
+ macro for more details */
+
+ /* RC */
+ IB_OPCODE(RC, SEND_FIRST),
+ IB_OPCODE(RC, SEND_MIDDLE),
+ IB_OPCODE(RC, SEND_LAST),
+ IB_OPCODE(RC, SEND_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(RC, SEND_ONLY),
+ IB_OPCODE(RC, SEND_ONLY_WITH_IMMEDIATE),
+ IB_OPCODE(RC, RDMA_WRITE_FIRST),
+ IB_OPCODE(RC, RDMA_WRITE_MIDDLE),
+ IB_OPCODE(RC, RDMA_WRITE_LAST),
+ IB_OPCODE(RC, RDMA_WRITE_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(RC, RDMA_WRITE_ONLY),
+ IB_OPCODE(RC, RDMA_WRITE_ONLY_WITH_IMMEDIATE),
+ IB_OPCODE(RC, RDMA_READ_REQUEST),
+ IB_OPCODE(RC, RDMA_READ_RESPONSE_FIRST),
+ IB_OPCODE(RC, RDMA_READ_RESPONSE_MIDDLE),
+ IB_OPCODE(RC, RDMA_READ_RESPONSE_LAST),
+ IB_OPCODE(RC, RDMA_READ_RESPONSE_ONLY),
+ IB_OPCODE(RC, ACKNOWLEDGE),
+ IB_OPCODE(RC, ATOMIC_ACKNOWLEDGE),
+ IB_OPCODE(RC, COMPARE_SWAP),
+ IB_OPCODE(RC, FETCH_ADD),
+
+ /* UC */
+ IB_OPCODE(UC, SEND_FIRST),
+ IB_OPCODE(UC, SEND_MIDDLE),
+ IB_OPCODE(UC, SEND_LAST),
+ IB_OPCODE(UC, SEND_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(UC, SEND_ONLY),
+ IB_OPCODE(UC, SEND_ONLY_WITH_IMMEDIATE),
+ IB_OPCODE(UC, RDMA_WRITE_FIRST),
+ IB_OPCODE(UC, RDMA_WRITE_MIDDLE),
+ IB_OPCODE(UC, RDMA_WRITE_LAST),
+ IB_OPCODE(UC, RDMA_WRITE_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(UC, RDMA_WRITE_ONLY),
+ IB_OPCODE(UC, RDMA_WRITE_ONLY_WITH_IMMEDIATE),
+
+ /* RD */
+ IB_OPCODE(RD, SEND_FIRST),
+ IB_OPCODE(RD, SEND_MIDDLE),
+ IB_OPCODE(RD, SEND_LAST),
+ IB_OPCODE(RD, SEND_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(RD, SEND_ONLY),
+ IB_OPCODE(RD, SEND_ONLY_WITH_IMMEDIATE),
+ IB_OPCODE(RD, RDMA_WRITE_FIRST),
+ IB_OPCODE(RD, RDMA_WRITE_MIDDLE),
+ IB_OPCODE(RD, RDMA_WRITE_LAST),
+ IB_OPCODE(RD, RDMA_WRITE_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(RD, RDMA_WRITE_ONLY),
+ IB_OPCODE(RD, RDMA_WRITE_ONLY_WITH_IMMEDIATE),
+ IB_OPCODE(RD, RDMA_READ_REQUEST),
+ IB_OPCODE(RD, RDMA_READ_RESPONSE_FIRST),
+ IB_OPCODE(RD, RDMA_READ_RESPONSE_MIDDLE),
+ IB_OPCODE(RD, RDMA_READ_RESPONSE_LAST),
+ IB_OPCODE(RD, RDMA_READ_RESPONSE_ONLY),
+ IB_OPCODE(RD, ACKNOWLEDGE),
+ IB_OPCODE(RD, ATOMIC_ACKNOWLEDGE),
+ IB_OPCODE(RD, COMPARE_SWAP),
+ IB_OPCODE(RD, FETCH_ADD),
+
+ /* UD */
+ IB_OPCODE(UD, SEND_ONLY),
+ IB_OPCODE(UD, SEND_ONLY_WITH_IMMEDIATE)
+};
+
+enum {
+ IB_LNH_RAW = 0,
+ IB_LNH_IP = 1,
+ IB_LNH_IBA_LOCAL = 2,
+ IB_LNH_IBA_GLOBAL = 3
+};
+
+struct ib_unpacked_lrh {
+ u8 virtual_lane;
+ u8 link_version;
+ u8 service_level;
+ u8 link_next_header;
+ __be16 destination_lid;
+ __be16 packet_length;
+ __be16 source_lid;
+};
+
+struct ib_unpacked_grh {
+ u8 ip_version;
+ u8 traffic_class;
+ __be32 flow_label;
+ __be16 payload_length;
+ u8 next_header;
+ u8 hop_limit;
+ union ib_gid source_gid;
+ union ib_gid destination_gid;
+};
+
+struct ib_unpacked_bth {
+ u8 opcode;
+ u8 solicited_event;
+ u8 mig_req;
+ u8 pad_count;
+ u8 transport_header_version;
+ __be16 pkey;
+ __be32 destination_qpn;
+ u8 ack_req;
+ __be32 psn;
+};
+
+struct ib_unpacked_deth {
+ __be32 qkey;
+ __be32 source_qpn;
+};
+
+struct ib_unpacked_eth {
+ u8 dmac_h[4];
+ u8 dmac_l[2];
+ u8 smac_h[2];
+ u8 smac_l[4];
+ __be16 type;
+};
+
+struct ib_unpacked_vlan {
+ __be16 tag;
+ __be16 type;
+};
+
+struct ib_ud_header {
+ int lrh_present;
+ struct ib_unpacked_lrh lrh;
+ int eth_present;
+ struct ib_unpacked_eth eth;
+ int vlan_present;
+ struct ib_unpacked_vlan vlan;
+ int grh_present;
+ struct ib_unpacked_grh grh;
+ struct ib_unpacked_bth bth;
+ struct ib_unpacked_deth deth;
+ int immediate_present;
+ __be32 immediate_data;
+};
+
+void ib_pack(const struct ib_field *desc,
+ int desc_len,
+ void *structure,
+ void *buf);
+
+void ib_unpack(const struct ib_field *desc,
+ int desc_len,
+ void *buf,
+ void *structure);
+
+void ib_ud_header_init(int payload_bytes,
+ int lrh_present,
+ int eth_present,
+ int vlan_present,
+ int grh_present,
+ int immediate_present,
+ struct ib_ud_header *header);
+
+int ib_ud_header_pack(struct ib_ud_header *header,
+ void *buf);
+
+int ib_ud_header_unpack(void *buf,
+ struct ib_ud_header *header);
+int ib_lrh_header_pack(struct ib_unpacked_lrh *lrh, void *buf);
+int ib_lrh_header_unpack(void *buf, struct ib_unpacked_lrh *lrh);
+
+#endif /* IB_PACK_H */
diff --git a/sys/ofed/include/rdma/ib_sa.h b/sys/ofed/include/rdma/ib_sa.h
new file mode 100644
index 0000000..5a8f2ce
--- /dev/null
+++ b/sys/ofed/include/rdma/ib_sa.h
@@ -0,0 +1,559 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2006 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IB_SA_H
+#define IB_SA_H
+
+#include <linux/completion.h>
+#include <linux/compiler.h>
+
+#include <asm/atomic.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_mad.h>
+
+enum {
+ IB_SA_CLASS_VERSION = 2, /* IB spec version 1.1/1.2 */
+
+ IB_SA_METHOD_GET_TABLE = 0x12,
+ IB_SA_METHOD_GET_TABLE_RESP = 0x92,
+ IB_SA_METHOD_DELETE = 0x15,
+ IB_SA_METHOD_DELETE_RESP = 0x95,
+ IB_SA_METHOD_GET_MULTI = 0x14,
+ IB_SA_METHOD_GET_MULTI_RESP = 0x94,
+ IB_SA_METHOD_GET_TRACE_TBL = 0x13
+};
+
+enum {
+ IB_SA_ATTR_CLASS_PORTINFO = 0x01,
+ IB_SA_ATTR_NOTICE = 0x02,
+ IB_SA_ATTR_INFORM_INFO = 0x03,
+ IB_SA_ATTR_NODE_REC = 0x11,
+ IB_SA_ATTR_PORT_INFO_REC = 0x12,
+ IB_SA_ATTR_SL2VL_REC = 0x13,
+ IB_SA_ATTR_SWITCH_REC = 0x14,
+ IB_SA_ATTR_LINEAR_FDB_REC = 0x15,
+ IB_SA_ATTR_RANDOM_FDB_REC = 0x16,
+ IB_SA_ATTR_MCAST_FDB_REC = 0x17,
+ IB_SA_ATTR_SM_INFO_REC = 0x18,
+ IB_SA_ATTR_LINK_REC = 0x20,
+ IB_SA_ATTR_GUID_INFO_REC = 0x30,
+ IB_SA_ATTR_SERVICE_REC = 0x31,
+ IB_SA_ATTR_PARTITION_REC = 0x33,
+ IB_SA_ATTR_PATH_REC = 0x35,
+ IB_SA_ATTR_VL_ARB_REC = 0x36,
+ IB_SA_ATTR_MC_MEMBER_REC = 0x38,
+ IB_SA_ATTR_TRACE_REC = 0x39,
+ IB_SA_ATTR_MULTI_PATH_REC = 0x3a,
+ IB_SA_ATTR_SERVICE_ASSOC_REC = 0x3b,
+ IB_SA_ATTR_INFORM_INFO_REC = 0xf3
+};
+
+enum ib_sa_selector {
+ IB_SA_GT = 0,
+ IB_SA_LT = 1,
+ IB_SA_EQ = 2,
+ /*
+ * The meaning of "best" depends on the attribute: for
+ * example, for MTU best will return the largest available
+ * MTU, while for packet life time, best will return the
+ * smallest available life time.
+ */
+ IB_SA_BEST = 3
+};
+
+/*
+ * Structures for SA records are named "struct ib_sa_xxx_rec." No
+ * attempt is made to pack structures to match the physical layout of
+ * SA records in SA MADs; all packing and unpacking is handled by the
+ * SA query code.
+ *
+ * For a record with structure ib_sa_xxx_rec, the naming convention
+ * for the component mask value for field yyy is IB_SA_XXX_REC_YYY (we
+ * never use different abbreviations or otherwise change the spelling
+ * of xxx/yyy between ib_sa_xxx_rec.yyy and IB_SA_XXX_REC_YYY).
+ *
+ * Reserved rows are indicated with comments to help maintainability.
+ */
+
+#define IB_SA_PATH_REC_SERVICE_ID (IB_SA_COMP_MASK( 0) |\
+ IB_SA_COMP_MASK( 1))
+#define IB_SA_PATH_REC_DGID IB_SA_COMP_MASK( 2)
+#define IB_SA_PATH_REC_SGID IB_SA_COMP_MASK( 3)
+#define IB_SA_PATH_REC_DLID IB_SA_COMP_MASK( 4)
+#define IB_SA_PATH_REC_SLID IB_SA_COMP_MASK( 5)
+#define IB_SA_PATH_REC_RAW_TRAFFIC IB_SA_COMP_MASK( 6)
+/* reserved: 7 */
+#define IB_SA_PATH_REC_FLOW_LABEL IB_SA_COMP_MASK( 8)
+#define IB_SA_PATH_REC_HOP_LIMIT IB_SA_COMP_MASK( 9)
+#define IB_SA_PATH_REC_TRAFFIC_CLASS IB_SA_COMP_MASK(10)
+#define IB_SA_PATH_REC_REVERSIBLE IB_SA_COMP_MASK(11)
+#define IB_SA_PATH_REC_NUMB_PATH IB_SA_COMP_MASK(12)
+#define IB_SA_PATH_REC_PKEY IB_SA_COMP_MASK(13)
+#define IB_SA_PATH_REC_QOS_CLASS IB_SA_COMP_MASK(14)
+#define IB_SA_PATH_REC_SL IB_SA_COMP_MASK(15)
+#define IB_SA_PATH_REC_MTU_SELECTOR IB_SA_COMP_MASK(16)
+#define IB_SA_PATH_REC_MTU IB_SA_COMP_MASK(17)
+#define IB_SA_PATH_REC_RATE_SELECTOR IB_SA_COMP_MASK(18)
+#define IB_SA_PATH_REC_RATE IB_SA_COMP_MASK(19)
+#define IB_SA_PATH_REC_PACKET_LIFE_TIME_SELECTOR IB_SA_COMP_MASK(20)
+#define IB_SA_PATH_REC_PACKET_LIFE_TIME IB_SA_COMP_MASK(21)
+#define IB_SA_PATH_REC_PREFERENCE IB_SA_COMP_MASK(22)
+
+struct ib_sa_path_rec {
+ __be64 service_id;
+ union ib_gid dgid;
+ union ib_gid sgid;
+ __be16 dlid;
+ __be16 slid;
+ int raw_traffic;
+ /* reserved */
+ __be32 flow_label;
+ u8 hop_limit;
+ u8 traffic_class;
+ int reversible;
+ u8 numb_path;
+ __be16 pkey;
+ __be16 qos_class;
+ u8 sl;
+ u8 mtu_selector;
+ u8 mtu;
+ u8 rate_selector;
+ u8 rate;
+ u8 packet_life_time_selector;
+ u8 packet_life_time;
+ u8 preference;
+};
+
+#define IB_SA_MCMEMBER_REC_MGID IB_SA_COMP_MASK( 0)
+#define IB_SA_MCMEMBER_REC_PORT_GID IB_SA_COMP_MASK( 1)
+#define IB_SA_MCMEMBER_REC_QKEY IB_SA_COMP_MASK( 2)
+#define IB_SA_MCMEMBER_REC_MLID IB_SA_COMP_MASK( 3)
+#define IB_SA_MCMEMBER_REC_MTU_SELECTOR IB_SA_COMP_MASK( 4)
+#define IB_SA_MCMEMBER_REC_MTU IB_SA_COMP_MASK( 5)
+#define IB_SA_MCMEMBER_REC_TRAFFIC_CLASS IB_SA_COMP_MASK( 6)
+#define IB_SA_MCMEMBER_REC_PKEY IB_SA_COMP_MASK( 7)
+#define IB_SA_MCMEMBER_REC_RATE_SELECTOR IB_SA_COMP_MASK( 8)
+#define IB_SA_MCMEMBER_REC_RATE IB_SA_COMP_MASK( 9)
+#define IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR IB_SA_COMP_MASK(10)
+#define IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME IB_SA_COMP_MASK(11)
+#define IB_SA_MCMEMBER_REC_SL IB_SA_COMP_MASK(12)
+#define IB_SA_MCMEMBER_REC_FLOW_LABEL IB_SA_COMP_MASK(13)
+#define IB_SA_MCMEMBER_REC_HOP_LIMIT IB_SA_COMP_MASK(14)
+#define IB_SA_MCMEMBER_REC_SCOPE IB_SA_COMP_MASK(15)
+#define IB_SA_MCMEMBER_REC_JOIN_STATE IB_SA_COMP_MASK(16)
+#define IB_SA_MCMEMBER_REC_PROXY_JOIN IB_SA_COMP_MASK(17)
+
+struct ib_sa_mcmember_rec {
+ union ib_gid mgid;
+ union ib_gid port_gid;
+ __be32 qkey;
+ __be16 mlid;
+ u8 mtu_selector;
+ u8 mtu;
+ u8 traffic_class;
+ __be16 pkey;
+ u8 rate_selector;
+ u8 rate;
+ u8 packet_life_time_selector;
+ u8 packet_life_time;
+ u8 sl;
+ __be32 flow_label;
+ u8 hop_limit;
+ u8 scope;
+ u8 join_state;
+ int proxy_join;
+};
+
+/* Service Record Component Mask Sec 15.2.5.14 Ver 1.1 */
+#define IB_SA_SERVICE_REC_SERVICE_ID IB_SA_COMP_MASK( 0)
+#define IB_SA_SERVICE_REC_SERVICE_GID IB_SA_COMP_MASK( 1)
+#define IB_SA_SERVICE_REC_SERVICE_PKEY IB_SA_COMP_MASK( 2)
+/* reserved: 3 */
+#define IB_SA_SERVICE_REC_SERVICE_LEASE IB_SA_COMP_MASK( 4)
+#define IB_SA_SERVICE_REC_SERVICE_KEY IB_SA_COMP_MASK( 5)
+#define IB_SA_SERVICE_REC_SERVICE_NAME IB_SA_COMP_MASK( 6)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_0 IB_SA_COMP_MASK( 7)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_1 IB_SA_COMP_MASK( 8)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_2 IB_SA_COMP_MASK( 9)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_3 IB_SA_COMP_MASK(10)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_4 IB_SA_COMP_MASK(11)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_5 IB_SA_COMP_MASK(12)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_6 IB_SA_COMP_MASK(13)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_7 IB_SA_COMP_MASK(14)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_8 IB_SA_COMP_MASK(15)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_9 IB_SA_COMP_MASK(16)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_10 IB_SA_COMP_MASK(17)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_11 IB_SA_COMP_MASK(18)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_12 IB_SA_COMP_MASK(19)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_13 IB_SA_COMP_MASK(20)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_14 IB_SA_COMP_MASK(21)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_15 IB_SA_COMP_MASK(22)
+#define IB_SA_SERVICE_REC_SERVICE_DATA16_0 IB_SA_COMP_MASK(23)
+#define IB_SA_SERVICE_REC_SERVICE_DATA16_1 IB_SA_COMP_MASK(24)
+#define IB_SA_SERVICE_REC_SERVICE_DATA16_2 IB_SA_COMP_MASK(25)
+#define IB_SA_SERVICE_REC_SERVICE_DATA16_3 IB_SA_COMP_MASK(26)
+#define IB_SA_SERVICE_REC_SERVICE_DATA16_4 IB_SA_COMP_MASK(27)
+#define IB_SA_SERVICE_REC_SERVICE_DATA16_5 IB_SA_COMP_MASK(28)
+#define IB_SA_SERVICE_REC_SERVICE_DATA16_6 IB_SA_COMP_MASK(29)
+#define IB_SA_SERVICE_REC_SERVICE_DATA16_7 IB_SA_COMP_MASK(30)
+#define IB_SA_SERVICE_REC_SERVICE_DATA32_0 IB_SA_COMP_MASK(31)
+#define IB_SA_SERVICE_REC_SERVICE_DATA32_1 IB_SA_COMP_MASK(32)
+#define IB_SA_SERVICE_REC_SERVICE_DATA32_2 IB_SA_COMP_MASK(33)
+#define IB_SA_SERVICE_REC_SERVICE_DATA32_3 IB_SA_COMP_MASK(34)
+#define IB_SA_SERVICE_REC_SERVICE_DATA64_0 IB_SA_COMP_MASK(35)
+#define IB_SA_SERVICE_REC_SERVICE_DATA64_1 IB_SA_COMP_MASK(36)
+
+#define IB_DEFAULT_SERVICE_LEASE 0xFFFFFFFF
+
+struct ib_sa_service_rec {
+ u64 id;
+ union ib_gid gid;
+ __be16 pkey;
+ /* reserved */
+ u32 lease;
+ u8 key[16];
+ u8 name[64];
+ u8 data8[16];
+ u16 data16[8];
+ u32 data32[4];
+ u64 data64[2];
+};
+
+enum {
+ IB_SA_EVENT_TYPE_FATAL = 0x0,
+ IB_SA_EVENT_TYPE_URGENT = 0x1,
+ IB_SA_EVENT_TYPE_SECURITY = 0x2,
+ IB_SA_EVENT_TYPE_SM = 0x3,
+ IB_SA_EVENT_TYPE_INFO = 0x4,
+ IB_SA_EVENT_TYPE_EMPTY = 0x7F,
+ IB_SA_EVENT_TYPE_ALL = 0xFFFF
+};
+
+enum {
+ IB_SA_EVENT_PRODUCER_TYPE_CA = 0x1,
+ IB_SA_EVENT_PRODUCER_TYPE_SWITCH = 0x2,
+ IB_SA_EVENT_PRODUCER_TYPE_ROUTER = 0x3,
+ IB_SA_EVENT_PRODUCER_TYPE_CLASS_MANAGER = 0x4,
+ IB_SA_EVENT_PRODUCER_TYPE_ALL = 0xFFFFFF
+};
+
+enum {
+ IB_SA_SM_TRAP_GID_IN_SERVICE = 64,
+ IB_SA_SM_TRAP_GID_OUT_OF_SERVICE = 65,
+ IB_SA_SM_TRAP_CREATE_MC_GROUP = 66,
+ IB_SA_SM_TRAP_DELETE_MC_GROUP = 67,
+ IB_SA_SM_TRAP_PORT_CHANGE_STATE = 128,
+ IB_SA_SM_TRAP_LINK_INTEGRITY = 129,
+ IB_SA_SM_TRAP_EXCESSIVE_BUFFER_OVERRUN = 130,
+ IB_SA_SM_TRAP_FLOW_CONTROL_UPDATE_EXPIRED = 131,
+ IB_SA_SM_TRAP_BAD_M_KEY = 256,
+ IB_SA_SM_TRAP_BAD_P_KEY = 257,
+ IB_SA_SM_TRAP_BAD_Q_KEY = 258,
+ IB_SA_SM_TRAP_SWITCH_BAD_P_KEY = 259,
+ IB_SA_SM_TRAP_ALL = 0xFFFF
+};
+
+struct ib_sa_inform {
+ union ib_gid gid;
+ __be16 lid_range_begin;
+ __be16 lid_range_end;
+ u8 is_generic;
+ u8 subscribe;
+ __be16 type;
+ union {
+ struct {
+ __be16 trap_num;
+ __be32 qpn;
+ u8 resp_time;
+ __be32 producer_type;
+ } generic;
+ struct {
+ __be16 device_id;
+ __be32 qpn;
+ u8 resp_time;
+ __be32 vendor_id;
+ } vendor;
+ } trap;
+};
+
+struct ib_sa_notice {
+ u8 is_generic;
+ u8 type;
+ union {
+ struct {
+ __be32 producer_type;
+ __be16 trap_num;
+ } generic;
+ struct {
+ __be32 vendor_id;
+ __be16 device_id;
+ } vendor;
+ } trap;
+ __be16 issuer_lid;
+ __be16 notice_count;
+ u8 notice_toggle;
+ /*
+ * Align data 16 bits off 64 bit field to match InformInfo definition.
+ * Data contained within this field will then align properly.
+ * See IB spec 1.2, sections 13.4.8.2 and 14.2.5.1.
+ */
+ u8 reserved[5];
+ u8 data_details[54];
+ union ib_gid issuer_gid;
+};
+
+/*
+ * SM notice data details for:
+ *
+ * IB_SA_SM_TRAP_GID_IN_SERVICE = 64
+ * IB_SA_SM_TRAP_GID_OUT_OF_SERVICE = 65
+ * IB_SA_SM_TRAP_CREATE_MC_GROUP = 66
+ * IB_SA_SM_TRAP_DELETE_MC_GROUP = 67
+ */
+struct ib_sa_notice_data_gid {
+ u8 reserved[6];
+ u8 gid[16];
+ u8 padding[32];
+};
+
+/*
+ * SM notice data details for:
+ *
+ * IB_SA_SM_TRAP_PORT_CHANGE_STATE = 128
+ */
+struct ib_sa_notice_data_port_change {
+ __be16 lid;
+ u8 padding[52];
+};
+
+/*
+ * SM notice data details for:
+ *
+ * IB_SA_SM_TRAP_LINK_INTEGRITY = 129
+ * IB_SA_SM_TRAP_EXCESSIVE_BUFFER_OVERRUN = 130
+ * IB_SA_SM_TRAP_FLOW_CONTROL_UPDATE_EXPIRED = 131
+ */
+struct ib_sa_notice_data_port_error {
+ u8 reserved[2];
+ __be16 lid;
+ u8 port_num;
+ u8 padding[49];
+};
+
+struct ib_sa_client {
+ atomic_t users;
+ struct completion comp;
+};
+
+/**
+ * ib_sa_register_client - Register an SA client.
+ */
+void ib_sa_register_client(struct ib_sa_client *client);
+
+/**
+ * ib_sa_unregister_client - Deregister an SA client.
+ * @client: Client object to deregister.
+ */
+void ib_sa_unregister_client(struct ib_sa_client *client);
+
+struct ib_sa_query;
+
+void ib_sa_cancel_query(int id, struct ib_sa_query *query);
+
+int ib_sa_path_rec_get(struct ib_sa_client *client,
+ struct ib_device *device, u8 port_num,
+ struct ib_sa_path_rec *rec,
+ ib_sa_comp_mask comp_mask,
+ int timeout_ms, gfp_t gfp_mask,
+ void (*callback)(int status,
+ struct ib_sa_path_rec *resp,
+ void *context),
+ void *context,
+ struct ib_sa_query **query);
+
+int ib_sa_service_rec_query(struct ib_sa_client *client,
+ struct ib_device *device, u8 port_num,
+ u8 method,
+ struct ib_sa_service_rec *rec,
+ ib_sa_comp_mask comp_mask,
+ int timeout_ms, gfp_t gfp_mask,
+ void (*callback)(int status,
+ struct ib_sa_service_rec *resp,
+ void *context),
+ void *context,
+ struct ib_sa_query **sa_query);
+
+struct ib_sa_multicast {
+ struct ib_sa_mcmember_rec rec;
+ ib_sa_comp_mask comp_mask;
+ int (*callback)(int status,
+ struct ib_sa_multicast *multicast);
+ void *context;
+};
+
+/**
+ * ib_sa_join_multicast - Initiates a join request to the specified multicast
+ * group.
+ * @client: SA client
+ * @device: Device associated with the multicast group.
+ * @port_num: Port on the specified device to associate with the multicast
+ * group.
+ * @rec: SA multicast member record specifying group attributes.
+ * @comp_mask: Component mask indicating which group attributes of %rec are
+ * valid.
+ * @gfp_mask: GFP mask for memory allocations.
+ * @callback: User callback invoked once the join operation completes.
+ * @context: User specified context stored with the ib_sa_multicast structure.
+ *
+ * This call initiates a multicast join request with the SA for the specified
+ * multicast group. If the join operation is started successfully, it returns
+ * an ib_sa_multicast structure that is used to track the multicast operation.
+ * Users must free this structure by calling ib_free_multicast, even if the
+ * join operation later fails. (The callback status is non-zero.)
+ *
+ * If the join operation fails; status will be non-zero, with the following
+ * failures possible:
+ * -ETIMEDOUT: The request timed out.
+ * -EIO: An error occurred sending the query.
+ * -EINVAL: The MCMemberRecord values differed from the existing group's.
+ * -ENETRESET: Indicates that an fatal error has occurred on the multicast
+ * group, and the user must rejoin the group to continue using it.
+ */
+struct ib_sa_multicast *ib_sa_join_multicast(struct ib_sa_client *client,
+ struct ib_device *device, u8 port_num,
+ struct ib_sa_mcmember_rec *rec,
+ ib_sa_comp_mask comp_mask, gfp_t gfp_mask,
+ int (*callback)(int status,
+ struct ib_sa_multicast
+ *multicast),
+ void *context);
+
+/**
+ * ib_free_multicast - Frees the multicast tracking structure, and releases
+ * any reference on the multicast group.
+ * @multicast: Multicast tracking structure allocated by ib_join_multicast.
+ *
+ * This call blocks until the multicast identifier is destroyed. It may
+ * not be called from within the multicast callback; however, returning a non-
+ * zero value from the callback will result in destroying the multicast
+ * tracking structure.
+ */
+void ib_sa_free_multicast(struct ib_sa_multicast *multicast);
+
+/**
+ * ib_get_mcmember_rec - Looks up a multicast member record by its MGID and
+ * returns it if found.
+ * @device: Device associated with the multicast group.
+ * @port_num: Port on the specified device to associate with the multicast
+ * group.
+ * @mgid: MGID of multicast group.
+ * @rec: Location to copy SA multicast member record.
+ */
+int ib_sa_get_mcmember_rec(struct ib_device *device, u8 port_num,
+ union ib_gid *mgid, struct ib_sa_mcmember_rec *rec);
+
+/**
+ * ib_init_ah_from_mcmember - Initialize address handle attributes based on
+ * an SA multicast member record.
+ */
+int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num,
+ struct ib_sa_mcmember_rec *rec,
+ struct ib_ah_attr *ah_attr);
+
+/**
+ * ib_init_ah_from_path - Initialize address handle attributes based on an SA
+ * path record.
+ */
+int ib_init_ah_from_path(struct ib_device *device, u8 port_num,
+ struct ib_sa_path_rec *rec,
+ struct ib_ah_attr *ah_attr);
+
+/**
+ * ib_sa_unpack_path - Convert a path record from MAD format to struct
+ * ib_sa_path_rec.
+ */
+void ib_sa_unpack_path(void *attribute, struct ib_sa_path_rec *rec);
+
+struct ib_inform_info {
+ void *context;
+ int (*callback)(int status,
+ struct ib_inform_info *info,
+ struct ib_sa_notice *notice);
+ u16 trap_number;
+};
+
+/**
+ * ib_sa_register_inform_info - Registers to receive notice events.
+ * @device: Device associated with the registration.
+ * @port_num: Port on the specified device to associate with the registration.
+ * @trap_number: InformInfo trap number to register for.
+ * @gfp_mask: GFP mask for memory allocations.
+ * @callback: User callback invoked once the registration completes and to
+ * report noticed events.
+ * @context: User specified context stored with the ib_inform_reg structure.
+ *
+ * This call initiates a registration request with the SA for the specified
+ * trap number. If the operation is started successfully, it returns
+ * an ib_inform_info structure that is used to track the registration operation.
+ * Users must free this structure by calling ib_unregister_inform_info,
+ * even if the operation later fails. (The callback status is non-zero.)
+ *
+ * If the registration fails; status will be non-zero. If the registration
+ * succeeds, the callback status will be zero, but the notice parameter will
+ * be NULL. If the notice parameter is not NULL, a trap or notice is being
+ * reported to the user.
+ *
+ * A status of -ENETRESET indicates that an error occurred which requires
+ * reregisteration.
+ */
+struct ib_inform_info *
+ib_sa_register_inform_info(struct ib_sa_client *client,
+ struct ib_device *device, u8 port_num,
+ u16 trap_number, gfp_t gfp_mask,
+ int (*callback)(int status,
+ struct ib_inform_info *info,
+ struct ib_sa_notice *notice),
+ void *context);
+
+/**
+ * ib_sa_unregister_inform_info - Releases an InformInfo registration.
+ * @info: InformInfo registration tracking structure.
+ *
+ * This call blocks until the registration request is destroyed. It may
+ * not be called from within the registration callback.
+ */
+void ib_sa_unregister_inform_info(struct ib_inform_info *info);
+
+#endif /* IB_SA_H */
diff --git a/sys/ofed/include/rdma/ib_smi.h b/sys/ofed/include/rdma/ib_smi.h
new file mode 100644
index 0000000..98b9086
--- /dev/null
+++ b/sys/ofed/include/rdma/ib_smi.h
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(IB_SMI_H)
+#define IB_SMI_H
+
+#include <rdma/ib_mad.h>
+
+#define IB_SMP_DATA_SIZE 64
+#define IB_SMP_MAX_PATH_HOPS 64
+
+struct ib_smp {
+ u8 base_version;
+ u8 mgmt_class;
+ u8 class_version;
+ u8 method;
+ __be16 status;
+ u8 hop_ptr;
+ u8 hop_cnt;
+ __be64 tid;
+ __be16 attr_id;
+ __be16 resv;
+ __be32 attr_mod;
+ __be64 mkey;
+ __be16 dr_slid;
+ __be16 dr_dlid;
+ u8 reserved[28];
+ u8 data[IB_SMP_DATA_SIZE];
+ u8 initial_path[IB_SMP_MAX_PATH_HOPS];
+ u8 return_path[IB_SMP_MAX_PATH_HOPS];
+} __attribute__ ((packed));
+
+#define IB_SMP_DIRECTION cpu_to_be16(0x8000)
+
+/* Subnet management attributes */
+#define IB_SMP_ATTR_NOTICE cpu_to_be16(0x0002)
+#define IB_SMP_ATTR_NODE_DESC cpu_to_be16(0x0010)
+#define IB_SMP_ATTR_NODE_INFO cpu_to_be16(0x0011)
+#define IB_SMP_ATTR_SWITCH_INFO cpu_to_be16(0x0012)
+#define IB_SMP_ATTR_GUID_INFO cpu_to_be16(0x0014)
+#define IB_SMP_ATTR_PORT_INFO cpu_to_be16(0x0015)
+#define IB_SMP_ATTR_PKEY_TABLE cpu_to_be16(0x0016)
+#define IB_SMP_ATTR_SL_TO_VL_TABLE cpu_to_be16(0x0017)
+#define IB_SMP_ATTR_VL_ARB_TABLE cpu_to_be16(0x0018)
+#define IB_SMP_ATTR_LINEAR_FORWARD_TABLE cpu_to_be16(0x0019)
+#define IB_SMP_ATTR_RANDOM_FORWARD_TABLE cpu_to_be16(0x001A)
+#define IB_SMP_ATTR_MCAST_FORWARD_TABLE cpu_to_be16(0x001B)
+#define IB_SMP_ATTR_SM_INFO cpu_to_be16(0x0020)
+#define IB_SMP_ATTR_VENDOR_DIAG cpu_to_be16(0x0030)
+#define IB_SMP_ATTR_LED_INFO cpu_to_be16(0x0031)
+#define IB_SMP_ATTR_VENDOR_MASK cpu_to_be16(0xFF00)
+
+struct ib_port_info {
+ __be64 mkey;
+ __be64 gid_prefix;
+ __be16 lid;
+ __be16 sm_lid;
+ __be32 cap_mask;
+ __be16 diag_code;
+ __be16 mkey_lease_period;
+ u8 local_port_num;
+ u8 link_width_enabled;
+ u8 link_width_supported;
+ u8 link_width_active;
+ u8 linkspeed_portstate; /* 4 bits, 4 bits */
+ u8 portphysstate_linkdown; /* 4 bits, 4 bits */
+ u8 mkeyprot_resv_lmc; /* 2 bits, 3, 3 */
+ u8 linkspeedactive_enabled; /* 4 bits, 4 bits */
+ u8 neighbormtu_mastersmsl; /* 4 bits, 4 bits */
+ u8 vlcap_inittype; /* 4 bits, 4 bits */
+ u8 vl_high_limit;
+ u8 vl_arb_high_cap;
+ u8 vl_arb_low_cap;
+ u8 inittypereply_mtucap; /* 4 bits, 4 bits */
+ u8 vlstallcnt_hoqlife; /* 3 bits, 5 bits */
+ u8 operationalvl_pei_peo_fpi_fpo; /* 4 bits, 1, 1, 1, 1 */
+ __be16 mkey_violations;
+ __be16 pkey_violations;
+ __be16 qkey_violations;
+ u8 guid_cap;
+ u8 clientrereg_resv_subnetto; /* 1 bit, 2 bits, 5 */
+ u8 resv_resptimevalue; /* 3 bits, 5 bits */
+ u8 localphyerrors_overrunerrors; /* 4 bits, 4 bits */
+ __be16 max_credit_hint;
+ u8 resv;
+ u8 link_roundtrip_latency[3];
+};
+
+static inline u8
+ib_get_smp_direction(struct ib_smp *smp)
+{
+ return ((smp->status & IB_SMP_DIRECTION) == IB_SMP_DIRECTION);
+}
+
+#endif /* IB_SMI_H */
diff --git a/sys/ofed/include/rdma/ib_umem.h b/sys/ofed/include/rdma/ib_umem.h
new file mode 100644
index 0000000..afa09f9
--- /dev/null
+++ b/sys/ofed/include/rdma/ib_umem.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2007 Cisco Systems. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IB_UMEM_H
+#define IB_UMEM_H
+
+#include <linux/list.h>
+#include <linux/scatterlist.h>
+#include <linux/workqueue.h>
+#include <linux/dma-attrs.h>
+
+struct ib_ucontext;
+
+struct ib_umem {
+ struct ib_ucontext *context;
+ size_t length;
+ int offset;
+ int page_size;
+ int writable;
+ int hugetlb;
+ struct list_head chunk_list;
+#ifdef __linux__
+ struct work_struct work;
+ struct mm_struct *mm;
+#else
+ unsigned long start;
+#endif
+ unsigned long diff;
+};
+
+struct ib_umem_chunk {
+ struct list_head list;
+ int nents;
+ int nmap;
+ struct dma_attrs attrs;
+ struct scatterlist page_list[0];
+};
+
+struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
+ size_t size, int access, int dmasync);
+void ib_umem_release(struct ib_umem *umem);
+int ib_umem_page_count(struct ib_umem *umem);
+
+#endif /* IB_UMEM_H */
diff --git a/sys/ofed/include/rdma/ib_user_cm.h b/sys/ofed/include/rdma/ib_user_cm.h
new file mode 100644
index 0000000..bd3d380
--- /dev/null
+++ b/sys/ofed/include/rdma/ib_user_cm.h
@@ -0,0 +1,324 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IB_USER_CM_H
+#define IB_USER_CM_H
+
+#include <rdma/ib_user_sa.h>
+
+#define IB_USER_CM_ABI_VERSION 5
+
+enum {
+ IB_USER_CM_CMD_CREATE_ID,
+ IB_USER_CM_CMD_DESTROY_ID,
+ IB_USER_CM_CMD_ATTR_ID,
+
+ IB_USER_CM_CMD_LISTEN,
+ IB_USER_CM_CMD_NOTIFY,
+
+ IB_USER_CM_CMD_SEND_REQ,
+ IB_USER_CM_CMD_SEND_REP,
+ IB_USER_CM_CMD_SEND_RTU,
+ IB_USER_CM_CMD_SEND_DREQ,
+ IB_USER_CM_CMD_SEND_DREP,
+ IB_USER_CM_CMD_SEND_REJ,
+ IB_USER_CM_CMD_SEND_MRA,
+ IB_USER_CM_CMD_SEND_LAP,
+ IB_USER_CM_CMD_SEND_APR,
+ IB_USER_CM_CMD_SEND_SIDR_REQ,
+ IB_USER_CM_CMD_SEND_SIDR_REP,
+
+ IB_USER_CM_CMD_EVENT,
+ IB_USER_CM_CMD_INIT_QP_ATTR,
+};
+/*
+ * command ABI structures.
+ */
+struct ib_ucm_cmd_hdr {
+ __u32 cmd;
+ __u16 in;
+ __u16 out;
+};
+
+struct ib_ucm_create_id {
+ __u64 uid;
+ __u64 response;
+};
+
+struct ib_ucm_create_id_resp {
+ __u32 id;
+};
+
+struct ib_ucm_destroy_id {
+ __u64 response;
+ __u32 id;
+ __u32 reserved;
+};
+
+struct ib_ucm_destroy_id_resp {
+ __u32 events_reported;
+};
+
+struct ib_ucm_attr_id {
+ __u64 response;
+ __u32 id;
+ __u32 reserved;
+};
+
+struct ib_ucm_attr_id_resp {
+ __be64 service_id;
+ __be64 service_mask;
+ __be32 local_id;
+ __be32 remote_id;
+};
+
+struct ib_ucm_init_qp_attr {
+ __u64 response;
+ __u32 id;
+ __u32 qp_state;
+};
+
+struct ib_ucm_listen {
+ __be64 service_id;
+ __be64 service_mask;
+ __u32 id;
+ __u32 reserved;
+};
+
+struct ib_ucm_notify {
+ __u32 id;
+ __u32 event;
+};
+
+struct ib_ucm_private_data {
+ __u64 data;
+ __u32 id;
+ __u8 len;
+ __u8 reserved[3];
+};
+
+struct ib_ucm_req {
+ __u32 id;
+ __u32 qpn;
+ __u32 qp_type;
+ __u32 psn;
+ __be64 sid;
+ __u64 data;
+ __u64 primary_path;
+ __u64 alternate_path;
+ __u8 len;
+ __u8 peer_to_peer;
+ __u8 responder_resources;
+ __u8 initiator_depth;
+ __u8 remote_cm_response_timeout;
+ __u8 flow_control;
+ __u8 local_cm_response_timeout;
+ __u8 retry_count;
+ __u8 rnr_retry_count;
+ __u8 max_cm_retries;
+ __u8 srq;
+ __u8 reserved[5];
+};
+
+struct ib_ucm_rep {
+ __u64 uid;
+ __u64 data;
+ __u32 id;
+ __u32 qpn;
+ __u32 psn;
+ __u8 len;
+ __u8 responder_resources;
+ __u8 initiator_depth;
+ __u8 target_ack_delay;
+ __u8 failover_accepted;
+ __u8 flow_control;
+ __u8 rnr_retry_count;
+ __u8 srq;
+ __u8 reserved[4];
+};
+
+struct ib_ucm_info {
+ __u32 id;
+ __u32 status;
+ __u64 info;
+ __u64 data;
+ __u8 info_len;
+ __u8 data_len;
+ __u8 reserved[6];
+};
+
+struct ib_ucm_mra {
+ __u64 data;
+ __u32 id;
+ __u8 len;
+ __u8 timeout;
+ __u8 reserved[2];
+};
+
+struct ib_ucm_lap {
+ __u64 path;
+ __u64 data;
+ __u32 id;
+ __u8 len;
+ __u8 reserved[3];
+};
+
+struct ib_ucm_sidr_req {
+ __u32 id;
+ __u32 timeout;
+ __be64 sid;
+ __u64 data;
+ __u64 path;
+ __u16 reserved_pkey;
+ __u8 len;
+ __u8 max_cm_retries;
+ __u8 reserved[4];
+};
+
+struct ib_ucm_sidr_rep {
+ __u32 id;
+ __u32 qpn;
+ __u32 qkey;
+ __u32 status;
+ __u64 info;
+ __u64 data;
+ __u8 info_len;
+ __u8 data_len;
+ __u8 reserved[6];
+};
+/*
+ * event notification ABI structures.
+ */
+struct ib_ucm_event_get {
+ __u64 response;
+ __u64 data;
+ __u64 info;
+ __u8 data_len;
+ __u8 info_len;
+ __u8 reserved[6];
+};
+
+struct ib_ucm_req_event_resp {
+ struct ib_user_path_rec primary_path;
+ struct ib_user_path_rec alternate_path;
+ __be64 remote_ca_guid;
+ __u32 remote_qkey;
+ __u32 remote_qpn;
+ __u32 qp_type;
+ __u32 starting_psn;
+ __u8 responder_resources;
+ __u8 initiator_depth;
+ __u8 local_cm_response_timeout;
+ __u8 flow_control;
+ __u8 remote_cm_response_timeout;
+ __u8 retry_count;
+ __u8 rnr_retry_count;
+ __u8 srq;
+ __u8 port;
+ __u8 reserved[7];
+};
+
+struct ib_ucm_rep_event_resp {
+ __be64 remote_ca_guid;
+ __u32 remote_qkey;
+ __u32 remote_qpn;
+ __u32 starting_psn;
+ __u8 responder_resources;
+ __u8 initiator_depth;
+ __u8 target_ack_delay;
+ __u8 failover_accepted;
+ __u8 flow_control;
+ __u8 rnr_retry_count;
+ __u8 srq;
+ __u8 reserved[5];
+};
+
+struct ib_ucm_rej_event_resp {
+ __u32 reason;
+ /* ari in ib_ucm_event_get info field. */
+};
+
+struct ib_ucm_mra_event_resp {
+ __u8 timeout;
+ __u8 reserved[3];
+};
+
+struct ib_ucm_lap_event_resp {
+ struct ib_user_path_rec path;
+};
+
+struct ib_ucm_apr_event_resp {
+ __u32 status;
+ /* apr info in ib_ucm_event_get info field. */
+};
+
+struct ib_ucm_sidr_req_event_resp {
+ __u16 pkey;
+ __u8 port;
+ __u8 reserved;
+};
+
+struct ib_ucm_sidr_rep_event_resp {
+ __u32 status;
+ __u32 qkey;
+ __u32 qpn;
+ /* info in ib_ucm_event_get info field. */
+};
+
+#define IB_UCM_PRES_DATA 0x01
+#define IB_UCM_PRES_INFO 0x02
+#define IB_UCM_PRES_PRIMARY 0x04
+#define IB_UCM_PRES_ALTERNATE 0x08
+
+struct ib_ucm_event_resp {
+ __u64 uid;
+ __u32 id;
+ __u32 event;
+ __u32 present;
+ __u32 reserved;
+ union {
+ struct ib_ucm_req_event_resp req_resp;
+ struct ib_ucm_rep_event_resp rep_resp;
+ struct ib_ucm_rej_event_resp rej_resp;
+ struct ib_ucm_mra_event_resp mra_resp;
+ struct ib_ucm_lap_event_resp lap_resp;
+ struct ib_ucm_apr_event_resp apr_resp;
+
+ struct ib_ucm_sidr_req_event_resp sidr_req_resp;
+ struct ib_ucm_sidr_rep_event_resp sidr_rep_resp;
+
+ __u32 send_status;
+ } u;
+};
+
+#endif /* IB_USER_CM_H */
diff --git a/sys/ofed/include/rdma/ib_user_mad.h b/sys/ofed/include/rdma/ib_user_mad.h
new file mode 100644
index 0000000..3595ae2
--- /dev/null
+++ b/sys/ofed/include/rdma/ib_user_mad.h
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IB_USER_MAD_H
+#define IB_USER_MAD_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define IB_USER_MAD_ABI_VERSION 5
+
+/*
+ * Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ */
+
+/**
+ * ib_user_mad_hdr_old - Old version of MAD packet header without pkey_index
+ * @id - ID of agent MAD received with/to be sent with
+ * @status - 0 on successful receive, ETIMEDOUT if no response
+ * received (transaction ID in data[] will be set to TID of original
+ * request) (ignored on send)
+ * @timeout_ms - Milliseconds to wait for response (unset on receive)
+ * @retries - Number of automatic retries to attempt
+ * @qpn - Remote QP number received from/to be sent to
+ * @qkey - Remote Q_Key to be sent with (unset on receive)
+ * @lid - Remote lid received from/to be sent to
+ * @sl - Service level received with/to be sent with
+ * @path_bits - Local path bits received with/to be sent with
+ * @grh_present - If set, GRH was received/should be sent
+ * @gid_index - Local GID index to send with (unset on receive)
+ * @hop_limit - Hop limit in GRH
+ * @traffic_class - Traffic class in GRH
+ * @gid - Remote GID in GRH
+ * @flow_label - Flow label in GRH
+ */
+struct ib_user_mad_hdr_old {
+ __u32 id;
+ __u32 status;
+ __u32 timeout_ms;
+ __u32 retries;
+ __u32 length;
+ __be32 qpn;
+ __be32 qkey;
+ __be16 lid;
+ __u8 sl;
+ __u8 path_bits;
+ __u8 grh_present;
+ __u8 gid_index;
+ __u8 hop_limit;
+ __u8 traffic_class;
+ __u8 gid[16];
+ __be32 flow_label;
+};
+
+/**
+ * ib_user_mad_hdr - MAD packet header
+ * This layout allows specifying/receiving the P_Key index. To use
+ * this capability, an application must call the
+ * IB_USER_MAD_ENABLE_PKEY ioctl on the user MAD file handle before
+ * any other actions with the file handle.
+ * @id - ID of agent MAD received with/to be sent with
+ * @status - 0 on successful receive, ETIMEDOUT if no response
+ * received (transaction ID in data[] will be set to TID of original
+ * request) (ignored on send)
+ * @timeout_ms - Milliseconds to wait for response (unset on receive)
+ * @retries - Number of automatic retries to attempt
+ * @qpn - Remote QP number received from/to be sent to
+ * @qkey - Remote Q_Key to be sent with (unset on receive)
+ * @lid - Remote lid received from/to be sent to
+ * @sl - Service level received with/to be sent with
+ * @path_bits - Local path bits received with/to be sent with
+ * @grh_present - If set, GRH was received/should be sent
+ * @gid_index - Local GID index to send with (unset on receive)
+ * @hop_limit - Hop limit in GRH
+ * @traffic_class - Traffic class in GRH
+ * @gid - Remote GID in GRH
+ * @flow_label - Flow label in GRH
+ * @pkey_index - P_Key index
+ */
+struct ib_user_mad_hdr {
+ __u32 id;
+ __u32 status;
+ __u32 timeout_ms;
+ __u32 retries;
+ __u32 length;
+ __be32 qpn;
+ __be32 qkey;
+ __be16 lid;
+ __u8 sl;
+ __u8 path_bits;
+ __u8 grh_present;
+ __u8 gid_index;
+ __u8 hop_limit;
+ __u8 traffic_class;
+ __u8 gid[16];
+ __be32 flow_label;
+ __u16 pkey_index;
+ __u8 reserved[6];
+};
+
+/**
+ * ib_user_mad - MAD packet
+ * @hdr - MAD packet header
+ * @data - Contents of MAD
+ *
+ */
+struct ib_user_mad {
+ struct ib_user_mad_hdr hdr;
+ __u64 data[0];
+};
+
+/*
+ * Earlier versions of this interface definition declared the
+ * method_mask[] member as an array of __u32 but treated it as a
+ * bitmap made up of longs in the kernel. This ambiguity meant that
+ * 32-bit big-endian applications that can run on both 32-bit and
+ * 64-bit kernels had no consistent ABI to rely on, and 64-bit
+ * big-endian applications that treated method_mask as being made up
+ * of 32-bit words would have their bitmap misinterpreted.
+ *
+ * To clear up this confusion, we change the declaration of
+ * method_mask[] to use unsigned long and handle the conversion from
+ * 32-bit userspace to 64-bit kernel for big-endian systems in the
+ * compat_ioctl method. Unfortunately, to keep the structure layout
+ * the same, we need the method_mask[] array to be aligned only to 4
+ * bytes even when long is 64 bits, which forces us into this ugly
+ * typedef.
+ */
+typedef unsigned long __attribute__((aligned(4))) packed_ulong;
+#define IB_USER_MAD_LONGS_PER_METHOD_MASK (128 / (8 * sizeof (long)))
+
+/**
+ * ib_user_mad_reg_req - MAD registration request
+ * @id - Set by the kernel; used to identify agent in future requests.
+ * @qpn - Queue pair number; must be 0 or 1.
+ * @method_mask - The caller will receive unsolicited MADs for any method
+ * where @method_mask = 1.
+ * @mgmt_class - Indicates which management class of MADs should be receive
+ * by the caller. This field is only required if the user wishes to
+ * receive unsolicited MADs, otherwise it should be 0.
+ * @mgmt_class_version - Indicates which version of MADs for the given
+ * management class to receive.
+ * @oui: Indicates IEEE OUI when mgmt_class is a vendor class
+ * in the range from 0x30 to 0x4f. Otherwise not used.
+ * @rmpp_version: If set, indicates the RMPP version used.
+ *
+ */
+struct ib_user_mad_reg_req {
+ __u32 id;
+ packed_ulong method_mask[IB_USER_MAD_LONGS_PER_METHOD_MASK];
+ __u8 qpn;
+ __u8 mgmt_class;
+ __u8 mgmt_class_version;
+ __u8 oui[3];
+ __u8 rmpp_version;
+};
+
+#define IB_IOCTL_MAGIC 0x1b
+
+#define IB_USER_MAD_REGISTER_AGENT _IO(IB_IOCTL_MAGIC, 1)
+
+#define IB_USER_MAD_UNREGISTER_AGENT _IO(IB_IOCTL_MAGIC, 2)
+
+#define IB_USER_MAD_ENABLE_PKEY _IO(IB_IOCTL_MAGIC, 3)
+
+#endif /* IB_USER_MAD_H */
diff --git a/sys/ofed/include/rdma/ib_user_sa.h b/sys/ofed/include/rdma/ib_user_sa.h
new file mode 100644
index 0000000..cfc7c9b
--- /dev/null
+++ b/sys/ofed/include/rdma/ib_user_sa.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IB_USER_SA_H
+#define IB_USER_SA_H
+
+#include <linux/types.h>
+
+enum {
+ IB_PATH_GMP = 1,
+ IB_PATH_PRIMARY = (1<<1),
+ IB_PATH_ALTERNATE = (1<<2),
+ IB_PATH_OUTBOUND = (1<<3),
+ IB_PATH_INBOUND = (1<<4),
+ IB_PATH_INBOUND_REVERSE = (1<<5),
+ IB_PATH_BIDIRECTIONAL = IB_PATH_OUTBOUND | IB_PATH_INBOUND_REVERSE
+};
+
+struct ib_path_rec_data {
+ __u32 flags;
+ __u32 reserved;
+ __u32 path_rec[16];
+};
+
+struct ib_user_path_rec {
+ __u8 dgid[16];
+ __u8 sgid[16];
+ __be16 dlid;
+ __be16 slid;
+ __u32 raw_traffic;
+ __be32 flow_label;
+ __u32 reversible;
+ __u32 mtu;
+ __be16 pkey;
+ __u8 hop_limit;
+ __u8 traffic_class;
+ __u8 numb_path;
+ __u8 sl;
+ __u8 mtu_selector;
+ __u8 rate_selector;
+ __u8 rate;
+ __u8 packet_life_time_selector;
+ __u8 packet_life_time;
+ __u8 preference;
+};
+
+#endif /* IB_USER_SA_H */
diff --git a/sys/ofed/include/rdma/ib_user_verbs.h b/sys/ofed/include/rdma/ib_user_verbs.h
new file mode 100644
index 0000000..b2721c7
--- /dev/null
+++ b/sys/ofed/include/rdma/ib_user_verbs.h
@@ -0,0 +1,801 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 PathScale, Inc. All rights reserved.
+ * Copyright (c) 2006 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IB_USER_VERBS_H
+#define IB_USER_VERBS_H
+
+#include <linux/types.h>
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define IB_USER_VERBS_ABI_VERSION 6
+
+enum {
+ IB_USER_VERBS_CMD_GET_CONTEXT,
+ IB_USER_VERBS_CMD_QUERY_DEVICE,
+ IB_USER_VERBS_CMD_QUERY_PORT,
+ IB_USER_VERBS_CMD_ALLOC_PD,
+ IB_USER_VERBS_CMD_DEALLOC_PD,
+ IB_USER_VERBS_CMD_CREATE_AH,
+ IB_USER_VERBS_CMD_MODIFY_AH,
+ IB_USER_VERBS_CMD_QUERY_AH,
+ IB_USER_VERBS_CMD_DESTROY_AH,
+ IB_USER_VERBS_CMD_REG_MR,
+ IB_USER_VERBS_CMD_REG_SMR,
+ IB_USER_VERBS_CMD_REREG_MR,
+ IB_USER_VERBS_CMD_QUERY_MR,
+ IB_USER_VERBS_CMD_DEREG_MR,
+ IB_USER_VERBS_CMD_ALLOC_MW,
+ IB_USER_VERBS_CMD_BIND_MW,
+ IB_USER_VERBS_CMD_DEALLOC_MW,
+ IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL,
+ IB_USER_VERBS_CMD_CREATE_CQ,
+ IB_USER_VERBS_CMD_RESIZE_CQ,
+ IB_USER_VERBS_CMD_DESTROY_CQ,
+ IB_USER_VERBS_CMD_POLL_CQ,
+ IB_USER_VERBS_CMD_PEEK_CQ,
+ IB_USER_VERBS_CMD_REQ_NOTIFY_CQ,
+ IB_USER_VERBS_CMD_CREATE_QP,
+ IB_USER_VERBS_CMD_QUERY_QP,
+ IB_USER_VERBS_CMD_MODIFY_QP,
+ IB_USER_VERBS_CMD_DESTROY_QP,
+ IB_USER_VERBS_CMD_POST_SEND,
+ IB_USER_VERBS_CMD_POST_RECV,
+ IB_USER_VERBS_CMD_ATTACH_MCAST,
+ IB_USER_VERBS_CMD_DETACH_MCAST,
+ IB_USER_VERBS_CMD_CREATE_SRQ,
+ IB_USER_VERBS_CMD_MODIFY_SRQ,
+ IB_USER_VERBS_CMD_QUERY_SRQ,
+ IB_USER_VERBS_CMD_DESTROY_SRQ,
+ IB_USER_VERBS_CMD_POST_SRQ_RECV,
+ IB_USER_VERBS_CMD_CREATE_XRC_SRQ,
+ IB_USER_VERBS_CMD_OPEN_XRC_DOMAIN,
+ IB_USER_VERBS_CMD_CLOSE_XRC_DOMAIN,
+ IB_USER_VERBS_CMD_CREATE_XRC_RCV_QP,
+ IB_USER_VERBS_CMD_MODIFY_XRC_RCV_QP,
+ IB_USER_VERBS_CMD_QUERY_XRC_RCV_QP,
+ IB_USER_VERBS_CMD_REG_XRC_RCV_QP,
+ IB_USER_VERBS_CMD_UNREG_XRC_RCV_QP,
+};
+
+/*
+ * Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ * Specifically:
+ * - Do not use pointer types -- pass pointers in __u64 instead.
+ * - Make sure that any structure larger than 4 bytes is padded to a
+ * multiple of 8 bytes. Otherwise the structure size will be
+ * different between 32-bit and 64-bit architectures.
+ */
+
+struct ib_uverbs_async_event_desc {
+ __u64 element;
+ __u32 event_type; /* enum ib_event_type */
+ __u32 reserved;
+};
+
+struct ib_uverbs_comp_event_desc {
+ __u64 cq_handle;
+};
+
+/*
+ * All commands from userspace should start with a __u32 command field
+ * followed by __u16 in_words and out_words fields (which give the
+ * length of the command block and response buffer if any in 32-bit
+ * words). The kernel driver will read these fields first and read
+ * the rest of the command struct based on these value.
+ */
+
+struct ib_uverbs_cmd_hdr {
+ __u32 command;
+ __u16 in_words;
+ __u16 out_words;
+};
+
+struct ib_uverbs_get_context {
+ __u64 response;
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_get_context_resp {
+ __u32 async_fd;
+ __u32 num_comp_vectors;
+};
+
+struct ib_uverbs_query_device {
+ __u64 response;
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_query_device_resp {
+ __u64 fw_ver;
+ __be64 node_guid;
+ __be64 sys_image_guid;
+ __u64 max_mr_size;
+ __u64 page_size_cap;
+ __u32 vendor_id;
+ __u32 vendor_part_id;
+ __u32 hw_ver;
+ __u32 max_qp;
+ __u32 max_qp_wr;
+ __u32 device_cap_flags;
+ __u32 max_sge;
+ __u32 max_sge_rd;
+ __u32 max_cq;
+ __u32 max_cqe;
+ __u32 max_mr;
+ __u32 max_pd;
+ __u32 max_qp_rd_atom;
+ __u32 max_ee_rd_atom;
+ __u32 max_res_rd_atom;
+ __u32 max_qp_init_rd_atom;
+ __u32 max_ee_init_rd_atom;
+ __u32 atomic_cap;
+ __u32 max_ee;
+ __u32 max_rdd;
+ __u32 max_mw;
+ __u32 max_raw_ipv6_qp;
+ __u32 max_raw_ethy_qp;
+ __u32 max_mcast_grp;
+ __u32 max_mcast_qp_attach;
+ __u32 max_total_mcast_qp_attach;
+ __u32 max_ah;
+ __u32 max_fmr;
+ __u32 max_map_per_fmr;
+ __u32 max_srq;
+ __u32 max_srq_wr;
+ __u32 max_srq_sge;
+ __u16 max_pkeys;
+ __u8 local_ca_ack_delay;
+ __u8 phys_port_cnt;
+ __u8 reserved[4];
+};
+
+struct ib_uverbs_query_port {
+ __u64 response;
+ __u8 port_num;
+ __u8 reserved[7];
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_query_port_resp {
+ __u32 port_cap_flags;
+ __u32 max_msg_sz;
+ __u32 bad_pkey_cntr;
+ __u32 qkey_viol_cntr;
+ __u32 gid_tbl_len;
+ __u16 pkey_tbl_len;
+ __u16 lid;
+ __u16 sm_lid;
+ __u8 state;
+ __u8 max_mtu;
+ __u8 active_mtu;
+ __u8 lmc;
+ __u8 max_vl_num;
+ __u8 sm_sl;
+ __u8 subnet_timeout;
+ __u8 init_type_reply;
+ __u8 active_width;
+ __u8 active_speed;
+ __u8 phys_state;
+ __u8 link_layer;
+ __u8 reserved[2];
+};
+
+struct ib_uverbs_alloc_pd {
+ __u64 response;
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_alloc_pd_resp {
+ __u32 pd_handle;
+};
+
+struct ib_uverbs_dealloc_pd {
+ __u32 pd_handle;
+};
+
+struct ib_uverbs_reg_mr {
+ __u64 response;
+ __u64 start;
+ __u64 length;
+ __u64 hca_va;
+ __u32 pd_handle;
+ __u32 access_flags;
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_reg_mr_resp {
+ __u32 mr_handle;
+ __u32 lkey;
+ __u32 rkey;
+};
+
+struct ib_uverbs_dereg_mr {
+ __u32 mr_handle;
+};
+
+struct ib_uverbs_create_comp_channel {
+ __u64 response;
+};
+
+struct ib_uverbs_create_comp_channel_resp {
+ __u32 fd;
+};
+
+struct ib_uverbs_create_cq {
+ __u64 response;
+ __u64 user_handle;
+ __u32 cqe;
+ __u32 comp_vector;
+ __s32 comp_channel;
+ __u32 reserved;
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_create_cq_resp {
+ __u32 cq_handle;
+ __u32 cqe;
+};
+
+struct ib_uverbs_resize_cq {
+ __u64 response;
+ __u32 cq_handle;
+ __u32 cqe;
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_resize_cq_resp {
+ __u32 cqe;
+ __u32 reserved;
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_poll_cq {
+ __u64 response;
+ __u32 cq_handle;
+ __u32 ne;
+};
+
+struct ib_uverbs_wc {
+ __u64 wr_id;
+ __u32 status;
+ __u32 opcode;
+ __u32 vendor_err;
+ __u32 byte_len;
+ union {
+ __u32 imm_data;
+ __u32 invalidate_rkey;
+ } ex;
+ __u32 qp_num;
+ __u32 src_qp;
+ __u32 wc_flags;
+ __u16 pkey_index;
+ __u16 slid;
+ __u8 sl;
+ __u8 dlid_path_bits;
+ __u8 port_num;
+ __u8 reserved;
+};
+
+struct ib_uverbs_poll_cq_resp {
+ __u32 count;
+ __u32 reserved;
+ struct ib_uverbs_wc wc[0];
+};
+
+struct ib_uverbs_req_notify_cq {
+ __u32 cq_handle;
+ __u32 solicited_only;
+};
+
+struct ib_uverbs_destroy_cq {
+ __u64 response;
+ __u32 cq_handle;
+ __u32 reserved;
+};
+
+struct ib_uverbs_destroy_cq_resp {
+ __u32 comp_events_reported;
+ __u32 async_events_reported;
+};
+
+struct ib_uverbs_global_route {
+ __u8 dgid[16];
+ __u32 flow_label;
+ __u8 sgid_index;
+ __u8 hop_limit;
+ __u8 traffic_class;
+ __u8 reserved;
+};
+
+struct ib_uverbs_ah_attr {
+ struct ib_uverbs_global_route grh;
+ __u16 dlid;
+ __u8 sl;
+ __u8 src_path_bits;
+ __u8 static_rate;
+ __u8 is_global;
+ __u8 port_num;
+ __u8 reserved;
+};
+
+struct ib_uverbs_qp_attr {
+ __u32 qp_attr_mask;
+ __u32 qp_state;
+ __u32 cur_qp_state;
+ __u32 path_mtu;
+ __u32 path_mig_state;
+ __u32 qkey;
+ __u32 rq_psn;
+ __u32 sq_psn;
+ __u32 dest_qp_num;
+ __u32 qp_access_flags;
+
+ struct ib_uverbs_ah_attr ah_attr;
+ struct ib_uverbs_ah_attr alt_ah_attr;
+
+ /* ib_qp_cap */
+ __u32 max_send_wr;
+ __u32 max_recv_wr;
+ __u32 max_send_sge;
+ __u32 max_recv_sge;
+ __u32 max_inline_data;
+
+ __u16 pkey_index;
+ __u16 alt_pkey_index;
+ __u8 en_sqd_async_notify;
+ __u8 sq_draining;
+ __u8 max_rd_atomic;
+ __u8 max_dest_rd_atomic;
+ __u8 min_rnr_timer;
+ __u8 port_num;
+ __u8 timeout;
+ __u8 retry_cnt;
+ __u8 rnr_retry;
+ __u8 alt_port_num;
+ __u8 alt_timeout;
+ __u8 reserved[5];
+};
+
+struct ib_uverbs_create_qp {
+ __u64 response;
+ __u64 user_handle;
+ __u32 pd_handle;
+ __u32 send_cq_handle;
+ __u32 recv_cq_handle;
+ __u32 srq_handle;
+ __u32 max_send_wr;
+ __u32 max_recv_wr;
+ __u32 max_send_sge;
+ __u32 max_recv_sge;
+ __u32 max_inline_data;
+ __u8 sq_sig_all;
+ __u8 qp_type;
+ __u8 is_srq;
+ __u8 reserved;
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_create_qp_resp {
+ __u32 qp_handle;
+ __u32 qpn;
+ __u32 max_send_wr;
+ __u32 max_recv_wr;
+ __u32 max_send_sge;
+ __u32 max_recv_sge;
+ __u32 max_inline_data;
+ __u32 reserved;
+};
+
+/*
+ * This struct needs to remain a multiple of 8 bytes to keep the
+ * alignment of the modify QP parameters.
+ */
+struct ib_uverbs_qp_dest {
+ __u8 dgid[16];
+ __u32 flow_label;
+ __u16 dlid;
+ __u16 reserved;
+ __u8 sgid_index;
+ __u8 hop_limit;
+ __u8 traffic_class;
+ __u8 sl;
+ __u8 src_path_bits;
+ __u8 static_rate;
+ __u8 is_global;
+ __u8 port_num;
+};
+
+struct ib_uverbs_query_qp {
+ __u64 response;
+ __u32 qp_handle;
+ __u32 attr_mask;
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_query_qp_resp {
+ struct ib_uverbs_qp_dest dest;
+ struct ib_uverbs_qp_dest alt_dest;
+ __u32 max_send_wr;
+ __u32 max_recv_wr;
+ __u32 max_send_sge;
+ __u32 max_recv_sge;
+ __u32 max_inline_data;
+ __u32 qkey;
+ __u32 rq_psn;
+ __u32 sq_psn;
+ __u32 dest_qp_num;
+ __u32 qp_access_flags;
+ __u16 pkey_index;
+ __u16 alt_pkey_index;
+ __u8 qp_state;
+ __u8 cur_qp_state;
+ __u8 path_mtu;
+ __u8 path_mig_state;
+ __u8 sq_draining;
+ __u8 max_rd_atomic;
+ __u8 max_dest_rd_atomic;
+ __u8 min_rnr_timer;
+ __u8 port_num;
+ __u8 timeout;
+ __u8 retry_cnt;
+ __u8 rnr_retry;
+ __u8 alt_port_num;
+ __u8 alt_timeout;
+ __u8 sq_sig_all;
+ __u8 reserved[5];
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_modify_qp {
+ struct ib_uverbs_qp_dest dest;
+ struct ib_uverbs_qp_dest alt_dest;
+ __u32 qp_handle;
+ __u32 attr_mask;
+ __u32 qkey;
+ __u32 rq_psn;
+ __u32 sq_psn;
+ __u32 dest_qp_num;
+ __u32 qp_access_flags;
+ __u16 pkey_index;
+ __u16 alt_pkey_index;
+ __u8 qp_state;
+ __u8 cur_qp_state;
+ __u8 path_mtu;
+ __u8 path_mig_state;
+ __u8 en_sqd_async_notify;
+ __u8 max_rd_atomic;
+ __u8 max_dest_rd_atomic;
+ __u8 min_rnr_timer;
+ __u8 port_num;
+ __u8 timeout;
+ __u8 retry_cnt;
+ __u8 rnr_retry;
+ __u8 alt_port_num;
+ __u8 alt_timeout;
+ __u8 reserved[2];
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_modify_qp_resp {
+};
+
+struct ib_uverbs_destroy_qp {
+ __u64 response;
+ __u32 qp_handle;
+ __u32 reserved;
+};
+
+struct ib_uverbs_destroy_qp_resp {
+ __u32 events_reported;
+};
+
+/*
+ * The ib_uverbs_sge structure isn't used anywhere, since we assume
+ * the ib_sge structure is packed the same way on 32-bit and 64-bit
+ * architectures in both kernel and user space. It's just here to
+ * document the ABI.
+ */
+struct ib_uverbs_sge {
+ __u64 addr;
+ __u32 length;
+ __u32 lkey;
+};
+
+struct ib_uverbs_send_wr {
+ __u64 wr_id;
+ __u32 num_sge;
+ __u32 opcode;
+ __u32 send_flags;
+ union {
+ __u32 imm_data;
+ __u32 invalidate_rkey;
+ } ex;
+ union {
+ struct {
+ __u64 remote_addr;
+ __u32 rkey;
+ __u32 reserved;
+ } rdma;
+ struct {
+ __u64 remote_addr;
+ __u64 compare_add;
+ __u64 swap;
+ __u32 rkey;
+ __u32 reserved;
+ } atomic;
+ struct {
+ __u32 ah;
+ __u32 remote_qpn;
+ __u32 remote_qkey;
+ __u32 reserved;
+ } ud;
+ } wr;
+};
+
+struct ib_uverbs_post_send {
+ __u64 response;
+ __u32 qp_handle;
+ __u32 wr_count;
+ __u32 sge_count;
+ __u32 wqe_size;
+ struct ib_uverbs_send_wr send_wr[0];
+};
+
+struct ib_uverbs_post_send_resp {
+ __u32 bad_wr;
+};
+
+struct ib_uverbs_recv_wr {
+ __u64 wr_id;
+ __u32 num_sge;
+ __u32 reserved;
+};
+
+struct ib_uverbs_post_recv {
+ __u64 response;
+ __u32 qp_handle;
+ __u32 wr_count;
+ __u32 sge_count;
+ __u32 wqe_size;
+ struct ib_uverbs_recv_wr recv_wr[0];
+};
+
+struct ib_uverbs_post_recv_resp {
+ __u32 bad_wr;
+};
+
+struct ib_uverbs_post_srq_recv {
+ __u64 response;
+ __u32 srq_handle;
+ __u32 wr_count;
+ __u32 sge_count;
+ __u32 wqe_size;
+ struct ib_uverbs_recv_wr recv[0];
+};
+
+struct ib_uverbs_post_srq_recv_resp {
+ __u32 bad_wr;
+};
+
+struct ib_uverbs_create_ah {
+ __u64 response;
+ __u64 user_handle;
+ __u32 pd_handle;
+ __u32 reserved;
+ struct ib_uverbs_ah_attr attr;
+};
+
+struct ib_uverbs_create_ah_resp {
+ __u32 ah_handle;
+};
+
+struct ib_uverbs_destroy_ah {
+ __u32 ah_handle;
+};
+
+struct ib_uverbs_attach_mcast {
+ __u8 gid[16];
+ __u32 qp_handle;
+ __u16 mlid;
+ __u16 reserved;
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_detach_mcast {
+ __u8 gid[16];
+ __u32 qp_handle;
+ __u16 mlid;
+ __u16 reserved;
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_create_srq {
+ __u64 response;
+ __u64 user_handle;
+ __u32 pd_handle;
+ __u32 max_wr;
+ __u32 max_sge;
+ __u32 srq_limit;
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_create_xrc_srq {
+ __u64 response;
+ __u64 user_handle;
+ __u32 pd_handle;
+ __u32 max_wr;
+ __u32 max_sge;
+ __u32 srq_limit;
+ __u32 xrcd_handle;
+ __u32 xrc_cq;
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_create_srq_resp {
+ __u32 srq_handle;
+ __u32 max_wr;
+ __u32 max_sge;
+ __u32 reserved;
+};
+
+struct ib_uverbs_modify_srq {
+ __u32 srq_handle;
+ __u32 attr_mask;
+ __u32 max_wr;
+ __u32 srq_limit;
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_query_srq {
+ __u64 response;
+ __u32 srq_handle;
+ __u32 reserved;
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_query_srq_resp {
+ __u32 max_wr;
+ __u32 max_sge;
+ __u32 srq_limit;
+ __u32 reserved;
+};
+
+struct ib_uverbs_destroy_srq {
+ __u64 response;
+ __u32 srq_handle;
+ __u32 reserved;
+};
+
+struct ib_uverbs_destroy_srq_resp {
+ __u32 events_reported;
+};
+
+struct ib_uverbs_open_xrc_domain {
+ __u64 response;
+ __u32 fd;
+ __u32 oflags;
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_open_xrc_domain_resp {
+ __u32 xrcd_handle;
+};
+
+struct ib_uverbs_close_xrc_domain {
+ __u64 response;
+ __u32 xrcd_handle;
+ __u32 reserved;
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_create_xrc_rcv_qp {
+ __u64 response;
+ __u64 user_handle;
+ __u32 xrc_domain_handle;
+ __u32 max_send_wr;
+ __u32 max_recv_wr;
+ __u32 max_send_sge;
+ __u32 max_recv_sge;
+ __u32 max_inline_data;
+ __u8 sq_sig_all;
+ __u8 qp_type;
+ __u8 reserved[6];
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_create_xrc_rcv_qp_resp {
+ __u32 qpn;
+ __u32 reserved;
+};
+
+struct ib_uverbs_modify_xrc_rcv_qp {
+ __u32 xrc_domain_handle;
+ __u32 qp_num;
+ struct ib_uverbs_qp_dest dest;
+ struct ib_uverbs_qp_dest alt_dest;
+ __u32 attr_mask;
+ __u32 qkey;
+ __u32 rq_psn;
+ __u32 sq_psn;
+ __u32 dest_qp_num;
+ __u32 qp_access_flags;
+ __u16 pkey_index;
+ __u16 alt_pkey_index;
+ __u8 qp_state;
+ __u8 cur_qp_state;
+ __u8 path_mtu;
+ __u8 path_mig_state;
+ __u8 en_sqd_async_notify;
+ __u8 max_rd_atomic;
+ __u8 max_dest_rd_atomic;
+ __u8 min_rnr_timer;
+ __u8 port_num;
+ __u8 timeout;
+ __u8 retry_cnt;
+ __u8 rnr_retry;
+ __u8 alt_port_num;
+ __u8 alt_timeout;
+ __u8 reserved[6];
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_query_xrc_rcv_qp {
+ __u64 response;
+ __u32 xrc_domain_handle;
+ __u32 qp_num;
+ __u32 attr_mask;
+ __u32 reserved;
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_reg_xrc_rcv_qp {
+ __u32 xrc_domain_handle;
+ __u32 qp_num;
+ __u64 driver_data[0];
+};
+
+struct ib_uverbs_unreg_xrc_rcv_qp {
+ __u32 xrc_domain_handle;
+ __u32 qp_num;
+ __u64 driver_data[0];
+};
+
+
+#endif /* IB_USER_VERBS_H */
diff --git a/sys/ofed/include/rdma/ib_verbs.h b/sys/ofed/include/rdma/ib_verbs.h
new file mode 100644
index 0000000..f5b054a
--- /dev/null
+++ b/sys/ofed/include/rdma/ib_verbs.h
@@ -0,0 +1,2170 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(IB_VERBS_H)
+#define IB_VERBS_H
+
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/rwsem.h>
+#include <linux/scatterlist.h>
+
+#include <asm/atomic.h>
+#include <asm/uaccess.h>
+#include <linux/rbtree.h>
+#include <linux/mutex.h>
+
+union ib_gid {
+ u8 raw[16];
+ struct {
+ __be64 subnet_prefix;
+ __be64 interface_id;
+ } global;
+};
+
+enum rdma_node_type {
+ /* IB values map to NodeInfo:NodeType. */
+ RDMA_NODE_IB_CA = 1,
+ RDMA_NODE_IB_SWITCH,
+ RDMA_NODE_IB_ROUTER,
+ RDMA_NODE_RNIC
+};
+
+enum rdma_transport_type {
+ RDMA_TRANSPORT_IB,
+ RDMA_TRANSPORT_IWARP
+};
+
+enum rdma_transport_type
+rdma_node_get_transport(enum rdma_node_type node_type) __attribute_const__;
+
+enum rdma_link_layer {
+ IB_LINK_LAYER_UNSPECIFIED,
+ IB_LINK_LAYER_INFINIBAND,
+ IB_LINK_LAYER_ETHERNET,
+};
+
+enum ib_device_cap_flags {
+ IB_DEVICE_RESIZE_MAX_WR = 1,
+ IB_DEVICE_BAD_PKEY_CNTR = (1<<1),
+ IB_DEVICE_BAD_QKEY_CNTR = (1<<2),
+ IB_DEVICE_RAW_MULTI = (1<<3),
+ IB_DEVICE_AUTO_PATH_MIG = (1<<4),
+ IB_DEVICE_CHANGE_PHY_PORT = (1<<5),
+ IB_DEVICE_UD_AV_PORT_ENFORCE = (1<<6),
+ IB_DEVICE_CURR_QP_STATE_MOD = (1<<7),
+ IB_DEVICE_SHUTDOWN_PORT = (1<<8),
+ IB_DEVICE_INIT_TYPE = (1<<9),
+ IB_DEVICE_PORT_ACTIVE_EVENT = (1<<10),
+ IB_DEVICE_SYS_IMAGE_GUID = (1<<11),
+ IB_DEVICE_RC_RNR_NAK_GEN = (1<<12),
+ IB_DEVICE_SRQ_RESIZE = (1<<13),
+ IB_DEVICE_N_NOTIFY_CQ = (1<<14),
+ IB_DEVICE_LOCAL_DMA_LKEY = (1<<15),
+ IB_DEVICE_RESERVED = (1<<16), /* old SEND_W_INV */
+ IB_DEVICE_MEM_WINDOW = (1<<17),
+ /*
+ * Devices should set IB_DEVICE_UD_IP_SUM if they support
+ * insertion of UDP and TCP checksum on outgoing UD IPoIB
+ * messages and can verify the validity of checksum for
+ * incoming messages. Setting this flag implies that the
+ * IPoIB driver may set NETIF_F_IP_CSUM for datagram mode.
+ */
+ IB_DEVICE_UD_IP_CSUM = (1<<18),
+ IB_DEVICE_UD_TSO = (1<<19),
+ IB_DEVICE_XRC = (1<<20),
+ IB_DEVICE_MEM_MGT_EXTENSIONS = (1<<21),
+ IB_DEVICE_BLOCK_MULTICAST_LOOPBACK = (1<<22),
+};
+
+enum ib_atomic_cap {
+ IB_ATOMIC_NONE,
+ IB_ATOMIC_HCA,
+ IB_ATOMIC_GLOB
+};
+
+struct ib_device_attr {
+ u64 fw_ver;
+ __be64 sys_image_guid;
+ u64 max_mr_size;
+ u64 page_size_cap;
+ u32 vendor_id;
+ u32 vendor_part_id;
+ u32 hw_ver;
+ int max_qp;
+ int max_qp_wr;
+ int device_cap_flags;
+ int max_sge;
+ int max_sge_rd;
+ int max_cq;
+ int max_cqe;
+ int max_mr;
+ int max_pd;
+ int max_qp_rd_atom;
+ int max_ee_rd_atom;
+ int max_res_rd_atom;
+ int max_qp_init_rd_atom;
+ int max_ee_init_rd_atom;
+ enum ib_atomic_cap atomic_cap;
+ enum ib_atomic_cap masked_atomic_cap;
+ int max_ee;
+ int max_rdd;
+ int max_mw;
+ int max_raw_ipv6_qp;
+ int max_raw_ethy_qp;
+ int max_mcast_grp;
+ int max_mcast_qp_attach;
+ int max_total_mcast_qp_attach;
+ int max_ah;
+ int max_fmr;
+ int max_map_per_fmr;
+ int max_srq;
+ int max_srq_wr;
+ int max_srq_sge;
+ unsigned int max_fast_reg_page_list_len;
+ u16 max_pkeys;
+ u8 local_ca_ack_delay;
+};
+
+enum ib_mtu {
+ IB_MTU_256 = 1,
+ IB_MTU_512 = 2,
+ IB_MTU_1024 = 3,
+ IB_MTU_2048 = 4,
+ IB_MTU_4096 = 5
+};
+
+static inline int ib_mtu_enum_to_int(enum ib_mtu mtu)
+{
+ switch (mtu) {
+ case IB_MTU_256: return 256;
+ case IB_MTU_512: return 512;
+ case IB_MTU_1024: return 1024;
+ case IB_MTU_2048: return 2048;
+ case IB_MTU_4096: return 4096;
+ default: return -1;
+ }
+}
+
+enum ib_port_state {
+ IB_PORT_NOP = 0,
+ IB_PORT_DOWN = 1,
+ IB_PORT_INIT = 2,
+ IB_PORT_ARMED = 3,
+ IB_PORT_ACTIVE = 4,
+ IB_PORT_ACTIVE_DEFER = 5
+};
+
+enum ib_port_cap_flags {
+ IB_PORT_SM = 1 << 1,
+ IB_PORT_NOTICE_SUP = 1 << 2,
+ IB_PORT_TRAP_SUP = 1 << 3,
+ IB_PORT_OPT_IPD_SUP = 1 << 4,
+ IB_PORT_AUTO_MIGR_SUP = 1 << 5,
+ IB_PORT_SL_MAP_SUP = 1 << 6,
+ IB_PORT_MKEY_NVRAM = 1 << 7,
+ IB_PORT_PKEY_NVRAM = 1 << 8,
+ IB_PORT_LED_INFO_SUP = 1 << 9,
+ IB_PORT_SM_DISABLED = 1 << 10,
+ IB_PORT_SYS_IMAGE_GUID_SUP = 1 << 11,
+ IB_PORT_PKEY_SW_EXT_PORT_TRAP_SUP = 1 << 12,
+ IB_PORT_CM_SUP = 1 << 16,
+ IB_PORT_SNMP_TUNNEL_SUP = 1 << 17,
+ IB_PORT_REINIT_SUP = 1 << 18,
+ IB_PORT_DEVICE_MGMT_SUP = 1 << 19,
+ IB_PORT_VENDOR_CLASS_SUP = 1 << 20,
+ IB_PORT_DR_NOTICE_SUP = 1 << 21,
+ IB_PORT_CAP_MASK_NOTICE_SUP = 1 << 22,
+ IB_PORT_BOOT_MGMT_SUP = 1 << 23,
+ IB_PORT_LINK_LATENCY_SUP = 1 << 24,
+ IB_PORT_CLIENT_REG_SUP = 1 << 25
+};
+
+enum ib_port_width {
+ IB_WIDTH_1X = 1,
+ IB_WIDTH_4X = 2,
+ IB_WIDTH_8X = 4,
+ IB_WIDTH_12X = 8
+};
+
+static inline int ib_width_enum_to_int(enum ib_port_width width)
+{
+ switch (width) {
+ case IB_WIDTH_1X: return 1;
+ case IB_WIDTH_4X: return 4;
+ case IB_WIDTH_8X: return 8;
+ case IB_WIDTH_12X: return 12;
+ default: return -1;
+ }
+}
+
+struct ib_protocol_stats {
+ /* TBD... */
+};
+
+struct iw_protocol_stats {
+ u64 ipInReceives;
+ u64 ipInHdrErrors;
+ u64 ipInTooBigErrors;
+ u64 ipInNoRoutes;
+ u64 ipInAddrErrors;
+ u64 ipInUnknownProtos;
+ u64 ipInTruncatedPkts;
+ u64 ipInDiscards;
+ u64 ipInDelivers;
+ u64 ipOutForwDatagrams;
+ u64 ipOutRequests;
+ u64 ipOutDiscards;
+ u64 ipOutNoRoutes;
+ u64 ipReasmTimeout;
+ u64 ipReasmReqds;
+ u64 ipReasmOKs;
+ u64 ipReasmFails;
+ u64 ipFragOKs;
+ u64 ipFragFails;
+ u64 ipFragCreates;
+ u64 ipInMcastPkts;
+ u64 ipOutMcastPkts;
+ u64 ipInBcastPkts;
+ u64 ipOutBcastPkts;
+
+ u64 tcpRtoAlgorithm;
+ u64 tcpRtoMin;
+ u64 tcpRtoMax;
+ u64 tcpMaxConn;
+ u64 tcpActiveOpens;
+ u64 tcpPassiveOpens;
+ u64 tcpAttemptFails;
+ u64 tcpEstabResets;
+ u64 tcpCurrEstab;
+ u64 tcpInSegs;
+ u64 tcpOutSegs;
+ u64 tcpRetransSegs;
+ u64 tcpInErrs;
+ u64 tcpOutRsts;
+};
+
+union rdma_protocol_stats {
+ struct ib_protocol_stats ib;
+ struct iw_protocol_stats iw;
+};
+
+struct ib_port_attr {
+ enum ib_port_state state;
+ enum ib_mtu max_mtu;
+ enum ib_mtu active_mtu;
+ int gid_tbl_len;
+ u32 port_cap_flags;
+ u32 max_msg_sz;
+ u32 bad_pkey_cntr;
+ u32 qkey_viol_cntr;
+ u16 pkey_tbl_len;
+ u16 lid;
+ u16 sm_lid;
+ u8 lmc;
+ u8 max_vl_num;
+ u8 sm_sl;
+ u8 subnet_timeout;
+ u8 init_type_reply;
+ u8 active_width;
+ u8 active_speed;
+ u8 phys_state;
+ enum rdma_link_layer link_layer;
+};
+
+enum ib_device_modify_flags {
+ IB_DEVICE_MODIFY_SYS_IMAGE_GUID = 1 << 0,
+ IB_DEVICE_MODIFY_NODE_DESC = 1 << 1
+};
+
+struct ib_device_modify {
+ u64 sys_image_guid;
+ char node_desc[64];
+};
+
+enum ib_port_modify_flags {
+ IB_PORT_SHUTDOWN = 1,
+ IB_PORT_INIT_TYPE = (1<<2),
+ IB_PORT_RESET_QKEY_CNTR = (1<<3)
+};
+
+struct ib_port_modify {
+ u32 set_port_cap_mask;
+ u32 clr_port_cap_mask;
+ u8 init_type;
+};
+
+enum ib_event_type {
+ IB_EVENT_CQ_ERR,
+ IB_EVENT_QP_FATAL,
+ IB_EVENT_QP_REQ_ERR,
+ IB_EVENT_QP_ACCESS_ERR,
+ IB_EVENT_COMM_EST,
+ IB_EVENT_SQ_DRAINED,
+ IB_EVENT_PATH_MIG,
+ IB_EVENT_PATH_MIG_ERR,
+ IB_EVENT_DEVICE_FATAL,
+ IB_EVENT_PORT_ACTIVE,
+ IB_EVENT_PORT_ERR,
+ IB_EVENT_LID_CHANGE,
+ IB_EVENT_PKEY_CHANGE,
+ IB_EVENT_SM_CHANGE,
+ IB_EVENT_SRQ_ERR,
+ IB_EVENT_SRQ_LIMIT_REACHED,
+ IB_EVENT_QP_LAST_WQE_REACHED,
+ IB_EVENT_CLIENT_REREGISTER,
+ IB_EVENT_GID_CHANGE,
+};
+
+enum ib_event_flags {
+ IB_XRC_QP_EVENT_FLAG = 0x80000000,
+};
+
+struct ib_event {
+ struct ib_device *device;
+ union {
+ struct ib_cq *cq;
+ struct ib_qp *qp;
+ struct ib_srq *srq;
+ u8 port_num;
+ u32 xrc_qp_num;
+ } element;
+ enum ib_event_type event;
+};
+
+struct ib_event_handler {
+ struct ib_device *device;
+ void (*handler)(struct ib_event_handler *, struct ib_event *);
+ struct list_head list;
+};
+
+#define INIT_IB_EVENT_HANDLER(_ptr, _device, _handler) \
+ do { \
+ (_ptr)->device = _device; \
+ (_ptr)->handler = _handler; \
+ INIT_LIST_HEAD(&(_ptr)->list); \
+ } while (0)
+
+struct ib_global_route {
+ union ib_gid dgid;
+ u32 flow_label;
+ u8 sgid_index;
+ u8 hop_limit;
+ u8 traffic_class;
+};
+
+struct ib_grh {
+ __be32 version_tclass_flow;
+ __be16 paylen;
+ u8 next_hdr;
+ u8 hop_limit;
+ union ib_gid sgid;
+ union ib_gid dgid;
+};
+
+enum {
+ IB_MULTICAST_QPN = 0xffffff
+};
+
+#define IB_LID_PERMISSIVE cpu_to_be16(0xFFFF)
+
+enum ib_ah_flags {
+ IB_AH_GRH = 1
+};
+
+enum ib_rate {
+ IB_RATE_PORT_CURRENT = 0,
+ IB_RATE_2_5_GBPS = 2,
+ IB_RATE_5_GBPS = 5,
+ IB_RATE_10_GBPS = 3,
+ IB_RATE_20_GBPS = 6,
+ IB_RATE_30_GBPS = 4,
+ IB_RATE_40_GBPS = 7,
+ IB_RATE_60_GBPS = 8,
+ IB_RATE_80_GBPS = 9,
+ IB_RATE_120_GBPS = 10
+};
+
+/**
+ * ib_rate_to_mult - Convert the IB rate enum to a multiple of the
+ * base rate of 2.5 Gbit/sec. For example, IB_RATE_5_GBPS will be
+ * converted to 2, since 5 Gbit/sec is 2 * 2.5 Gbit/sec.
+ * @rate: rate to convert.
+ */
+int ib_rate_to_mult(enum ib_rate rate) __attribute_const__;
+
+/**
+ * mult_to_ib_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate
+ * enum.
+ * @mult: multiple to convert.
+ */
+enum ib_rate mult_to_ib_rate(int mult) __attribute_const__;
+
+struct ib_ah_attr {
+ struct ib_global_route grh;
+ u16 dlid;
+ u8 sl;
+ u8 src_path_bits;
+ u8 static_rate;
+ u8 ah_flags;
+ u8 port_num;
+};
+
+enum ib_wc_status {
+ IB_WC_SUCCESS,
+ IB_WC_LOC_LEN_ERR,
+ IB_WC_LOC_QP_OP_ERR,
+ IB_WC_LOC_EEC_OP_ERR,
+ IB_WC_LOC_PROT_ERR,
+ IB_WC_WR_FLUSH_ERR,
+ IB_WC_MW_BIND_ERR,
+ IB_WC_BAD_RESP_ERR,
+ IB_WC_LOC_ACCESS_ERR,
+ IB_WC_REM_INV_REQ_ERR,
+ IB_WC_REM_ACCESS_ERR,
+ IB_WC_REM_OP_ERR,
+ IB_WC_RETRY_EXC_ERR,
+ IB_WC_RNR_RETRY_EXC_ERR,
+ IB_WC_LOC_RDD_VIOL_ERR,
+ IB_WC_REM_INV_RD_REQ_ERR,
+ IB_WC_REM_ABORT_ERR,
+ IB_WC_INV_EECN_ERR,
+ IB_WC_INV_EEC_STATE_ERR,
+ IB_WC_FATAL_ERR,
+ IB_WC_RESP_TIMEOUT_ERR,
+ IB_WC_GENERAL_ERR
+};
+
+enum ib_wc_opcode {
+ IB_WC_SEND,
+ IB_WC_RDMA_WRITE,
+ IB_WC_RDMA_READ,
+ IB_WC_COMP_SWAP,
+ IB_WC_FETCH_ADD,
+ IB_WC_BIND_MW,
+ IB_WC_LSO,
+ IB_WC_LOCAL_INV,
+ IB_WC_FAST_REG_MR,
+ IB_WC_MASKED_COMP_SWAP,
+ IB_WC_MASKED_FETCH_ADD,
+/*
+ * Set value of IB_WC_RECV so consumers can test if a completion is a
+ * receive by testing (opcode & IB_WC_RECV).
+ */
+ IB_WC_RECV = 1 << 7,
+ IB_WC_RECV_RDMA_WITH_IMM
+};
+
+enum ib_wc_flags {
+ IB_WC_GRH = 1,
+ IB_WC_WITH_IMM = (1<<1),
+ IB_WC_WITH_INVALIDATE = (1<<2),
+};
+
+struct ib_wc {
+ u64 wr_id;
+ enum ib_wc_status status;
+ enum ib_wc_opcode opcode;
+ u32 vendor_err;
+ u32 byte_len;
+ struct ib_qp *qp;
+ union {
+ __be32 imm_data;
+ u32 invalidate_rkey;
+ } ex;
+ u32 src_qp;
+ int wc_flags;
+ u16 pkey_index;
+ u16 slid;
+ u8 sl;
+ u8 dlid_path_bits;
+ u8 port_num; /* valid only for DR SMPs on switches */
+ int csum_ok;
+};
+
+enum ib_cq_notify_flags {
+ IB_CQ_SOLICITED = 1 << 0,
+ IB_CQ_NEXT_COMP = 1 << 1,
+ IB_CQ_SOLICITED_MASK = IB_CQ_SOLICITED | IB_CQ_NEXT_COMP,
+ IB_CQ_REPORT_MISSED_EVENTS = 1 << 2,
+};
+
+enum ib_srq_attr_mask {
+ IB_SRQ_MAX_WR = 1 << 0,
+ IB_SRQ_LIMIT = 1 << 1,
+};
+
+struct ib_srq_attr {
+ u32 max_wr;
+ u32 max_sge;
+ u32 srq_limit;
+};
+
+struct ib_srq_init_attr {
+ void (*event_handler)(struct ib_event *, void *);
+ void *srq_context;
+ struct ib_srq_attr attr;
+};
+
+struct ib_qp_cap {
+ u32 max_send_wr;
+ u32 max_recv_wr;
+ u32 max_send_sge;
+ u32 max_recv_sge;
+ u32 max_inline_data;
+};
+
+enum ib_sig_type {
+ IB_SIGNAL_ALL_WR,
+ IB_SIGNAL_REQ_WR
+};
+
+enum ib_qp_type {
+ /*
+ * IB_QPT_SMI and IB_QPT_GSI have to be the first two entries
+ * here (and in that order) since the MAD layer uses them as
+ * indices into a 2-entry table.
+ */
+ IB_QPT_SMI,
+ IB_QPT_GSI,
+
+ IB_QPT_RC,
+ IB_QPT_UC,
+ IB_QPT_UD,
+ IB_QPT_XRC,
+ IB_QPT_RAW_IPV6,
+ IB_QPT_RAW_ETY,
+ IB_QPT_RAW_ETH
+};
+
+enum ib_qp_create_flags {
+ IB_QP_CREATE_IPOIB_UD_LSO = 1 << 0,
+ IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK = 1 << 1,
+};
+
+struct ib_qp_init_attr {
+ void (*event_handler)(struct ib_event *, void *);
+ void *qp_context;
+ struct ib_cq *send_cq;
+ struct ib_cq *recv_cq;
+ struct ib_srq *srq;
+ struct ib_qp_cap cap;
+ enum ib_sig_type sq_sig_type;
+ enum ib_qp_type qp_type;
+ enum ib_qp_create_flags create_flags;
+ struct ib_xrcd *xrc_domain; /* XRC qp's only */
+ u8 port_num; /* special QP types only */
+};
+
+enum ib_rnr_timeout {
+ IB_RNR_TIMER_655_36 = 0,
+ IB_RNR_TIMER_000_01 = 1,
+ IB_RNR_TIMER_000_02 = 2,
+ IB_RNR_TIMER_000_03 = 3,
+ IB_RNR_TIMER_000_04 = 4,
+ IB_RNR_TIMER_000_06 = 5,
+ IB_RNR_TIMER_000_08 = 6,
+ IB_RNR_TIMER_000_12 = 7,
+ IB_RNR_TIMER_000_16 = 8,
+ IB_RNR_TIMER_000_24 = 9,
+ IB_RNR_TIMER_000_32 = 10,
+ IB_RNR_TIMER_000_48 = 11,
+ IB_RNR_TIMER_000_64 = 12,
+ IB_RNR_TIMER_000_96 = 13,
+ IB_RNR_TIMER_001_28 = 14,
+ IB_RNR_TIMER_001_92 = 15,
+ IB_RNR_TIMER_002_56 = 16,
+ IB_RNR_TIMER_003_84 = 17,
+ IB_RNR_TIMER_005_12 = 18,
+ IB_RNR_TIMER_007_68 = 19,
+ IB_RNR_TIMER_010_24 = 20,
+ IB_RNR_TIMER_015_36 = 21,
+ IB_RNR_TIMER_020_48 = 22,
+ IB_RNR_TIMER_030_72 = 23,
+ IB_RNR_TIMER_040_96 = 24,
+ IB_RNR_TIMER_061_44 = 25,
+ IB_RNR_TIMER_081_92 = 26,
+ IB_RNR_TIMER_122_88 = 27,
+ IB_RNR_TIMER_163_84 = 28,
+ IB_RNR_TIMER_245_76 = 29,
+ IB_RNR_TIMER_327_68 = 30,
+ IB_RNR_TIMER_491_52 = 31
+};
+
+enum ib_qp_attr_mask {
+ IB_QP_STATE = 1,
+ IB_QP_CUR_STATE = (1<<1),
+ IB_QP_EN_SQD_ASYNC_NOTIFY = (1<<2),
+ IB_QP_ACCESS_FLAGS = (1<<3),
+ IB_QP_PKEY_INDEX = (1<<4),
+ IB_QP_PORT = (1<<5),
+ IB_QP_QKEY = (1<<6),
+ IB_QP_AV = (1<<7),
+ IB_QP_PATH_MTU = (1<<8),
+ IB_QP_TIMEOUT = (1<<9),
+ IB_QP_RETRY_CNT = (1<<10),
+ IB_QP_RNR_RETRY = (1<<11),
+ IB_QP_RQ_PSN = (1<<12),
+ IB_QP_MAX_QP_RD_ATOMIC = (1<<13),
+ IB_QP_ALT_PATH = (1<<14),
+ IB_QP_MIN_RNR_TIMER = (1<<15),
+ IB_QP_SQ_PSN = (1<<16),
+ IB_QP_MAX_DEST_RD_ATOMIC = (1<<17),
+ IB_QP_PATH_MIG_STATE = (1<<18),
+ IB_QP_CAP = (1<<19),
+ IB_QP_DEST_QPN = (1<<20)
+};
+
+enum ib_qp_state {
+ IB_QPS_RESET,
+ IB_QPS_INIT,
+ IB_QPS_RTR,
+ IB_QPS_RTS,
+ IB_QPS_SQD,
+ IB_QPS_SQE,
+ IB_QPS_ERR
+};
+
+enum ib_mig_state {
+ IB_MIG_MIGRATED,
+ IB_MIG_REARM,
+ IB_MIG_ARMED
+};
+
+struct ib_qp_attr {
+ enum ib_qp_state qp_state;
+ enum ib_qp_state cur_qp_state;
+ enum ib_mtu path_mtu;
+ enum ib_mig_state path_mig_state;
+ u32 qkey;
+ u32 rq_psn;
+ u32 sq_psn;
+ u32 dest_qp_num;
+ int qp_access_flags;
+ struct ib_qp_cap cap;
+ struct ib_ah_attr ah_attr;
+ struct ib_ah_attr alt_ah_attr;
+ u16 pkey_index;
+ u16 alt_pkey_index;
+ u8 en_sqd_async_notify;
+ u8 sq_draining;
+ u8 max_rd_atomic;
+ u8 max_dest_rd_atomic;
+ u8 min_rnr_timer;
+ u8 port_num;
+ u8 timeout;
+ u8 retry_cnt;
+ u8 rnr_retry;
+ u8 alt_port_num;
+ u8 alt_timeout;
+};
+
+enum ib_wr_opcode {
+ IB_WR_RDMA_WRITE,
+ IB_WR_RDMA_WRITE_WITH_IMM,
+ IB_WR_SEND,
+ IB_WR_SEND_WITH_IMM,
+ IB_WR_RDMA_READ,
+ IB_WR_ATOMIC_CMP_AND_SWP,
+ IB_WR_ATOMIC_FETCH_AND_ADD,
+ IB_WR_LSO,
+ IB_WR_BIG_LSO,
+ IB_WR_SEND_WITH_INV,
+ IB_WR_RDMA_READ_WITH_INV,
+ IB_WR_LOCAL_INV,
+ IB_WR_FAST_REG_MR,
+ IB_WR_MASKED_ATOMIC_CMP_AND_SWP,
+ IB_WR_MASKED_ATOMIC_FETCH_AND_ADD,
+};
+
+enum ib_send_flags {
+ IB_SEND_FENCE = 1,
+ IB_SEND_SIGNALED = (1<<1),
+ IB_SEND_SOLICITED = (1<<2),
+ IB_SEND_INLINE = (1<<3),
+ IB_SEND_IP_CSUM = (1<<4)
+};
+
+struct ib_sge {
+ u64 addr;
+ u32 length;
+ u32 lkey;
+};
+
+struct ib_fast_reg_page_list {
+ struct ib_device *device;
+ u64 *page_list;
+ unsigned int max_page_list_len;
+};
+
+struct ib_send_wr {
+ struct ib_send_wr *next;
+ u64 wr_id;
+ struct ib_sge *sg_list;
+ int num_sge;
+ enum ib_wr_opcode opcode;
+ int send_flags;
+ union {
+ __be32 imm_data;
+ u32 invalidate_rkey;
+ } ex;
+ union {
+ struct {
+ u64 remote_addr;
+ u32 rkey;
+ } rdma;
+ struct {
+ u64 remote_addr;
+ u64 compare_add;
+ u64 swap;
+ u64 compare_add_mask;
+ u64 swap_mask;
+ u32 rkey;
+ } atomic;
+ struct {
+ struct ib_ah *ah;
+ void *header;
+ int hlen;
+ int mss;
+ u32 remote_qpn;
+ u32 remote_qkey;
+ u16 pkey_index; /* valid for GSI only */
+ u8 port_num; /* valid for DR SMPs on switch only */
+ } ud;
+ struct {
+ u64 iova_start;
+ struct ib_fast_reg_page_list *page_list;
+ unsigned int page_shift;
+ unsigned int page_list_len;
+ u32 length;
+ int access_flags;
+ u32 rkey;
+ } fast_reg;
+ struct {
+ struct ib_unpacked_lrh *lrh;
+ u32 eth_type;
+ u8 static_rate;
+ } raw_ety;
+ } wr;
+ u32 xrc_remote_srq_num; /* valid for XRC sends only */
+};
+
+struct ib_recv_wr {
+ struct ib_recv_wr *next;
+ u64 wr_id;
+ struct ib_sge *sg_list;
+ int num_sge;
+};
+
+enum ib_access_flags {
+ IB_ACCESS_LOCAL_WRITE = 1,
+ IB_ACCESS_REMOTE_WRITE = (1<<1),
+ IB_ACCESS_REMOTE_READ = (1<<2),
+ IB_ACCESS_REMOTE_ATOMIC = (1<<3),
+ IB_ACCESS_MW_BIND = (1<<4)
+};
+
+struct ib_phys_buf {
+ u64 addr;
+ u64 size;
+};
+
+struct ib_mr_attr {
+ struct ib_pd *pd;
+ u64 device_virt_addr;
+ u64 size;
+ int mr_access_flags;
+ u32 lkey;
+ u32 rkey;
+};
+
+enum ib_mr_rereg_flags {
+ IB_MR_REREG_TRANS = 1,
+ IB_MR_REREG_PD = (1<<1),
+ IB_MR_REREG_ACCESS = (1<<2)
+};
+
+struct ib_mw_bind {
+ struct ib_mr *mr;
+ u64 wr_id;
+ u64 addr;
+ u32 length;
+ int send_flags;
+ int mw_access_flags;
+};
+
+struct ib_fmr_attr {
+ int max_pages;
+ int max_maps;
+ u8 page_shift;
+};
+
+struct ib_ucontext {
+ struct ib_device *device;
+ struct list_head pd_list;
+ struct list_head mr_list;
+ struct list_head mw_list;
+ struct list_head cq_list;
+ struct list_head qp_list;
+ struct list_head srq_list;
+ struct list_head ah_list;
+ struct list_head xrc_domain_list;
+ int closing;
+};
+
+struct ib_uobject {
+ u64 user_handle; /* handle given to us by userspace */
+ struct ib_ucontext *context; /* associated user context */
+ void *object; /* containing object */
+ struct list_head list; /* link to context's list */
+ int id; /* index into kernel idr */
+ struct kref ref;
+ struct rw_semaphore mutex; /* protects .live */
+ int live;
+};
+
+struct ib_udata {
+ void __user *inbuf;
+ void __user *outbuf;
+ size_t inlen;
+ size_t outlen;
+};
+
+struct ib_uxrc_rcv_object {
+ struct list_head list; /* link to context's list */
+ u32 qp_num;
+ u32 domain_handle;
+};
+
+struct ib_pd {
+ struct ib_device *device;
+ struct ib_uobject *uobject;
+ atomic_t usecnt; /* count all resources */
+};
+
+struct ib_xrcd {
+ struct ib_device *device;
+ struct ib_uobject *uobject;
+ struct inode *inode;
+ struct rb_node node;
+ atomic_t usecnt; /* count all resources */
+};
+
+
+struct ib_ah {
+ struct ib_device *device;
+ struct ib_pd *pd;
+ struct ib_uobject *uobject;
+};
+
+typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context);
+
+struct ib_cq {
+ struct ib_device *device;
+ struct ib_uobject *uobject;
+ ib_comp_handler comp_handler;
+ void (*event_handler)(struct ib_event *, void *);
+ void *cq_context;
+ int cqe;
+ atomic_t usecnt; /* count number of work queues */
+};
+
+struct ib_srq {
+ struct ib_device *device;
+ struct ib_pd *pd;
+ struct ib_cq *xrc_cq;
+ struct ib_xrcd *xrcd;
+ struct ib_uobject *uobject;
+ void (*event_handler)(struct ib_event *, void *);
+ void *srq_context;
+ atomic_t usecnt;
+ u32 xrc_srq_num;
+};
+
+struct ib_qp {
+ struct ib_device *device;
+ struct ib_pd *pd;
+ struct ib_cq *send_cq;
+ struct ib_cq *recv_cq;
+ struct ib_srq *srq;
+ struct ib_uobject *uobject;
+ void (*event_handler)(struct ib_event *, void *);
+ void *qp_context;
+ u32 qp_num;
+ enum ib_qp_type qp_type;
+ struct ib_xrcd *xrcd; /* XRC QPs only */
+};
+
+struct ib_mr {
+ struct ib_device *device;
+ struct ib_pd *pd;
+ struct ib_uobject *uobject;
+ u32 lkey;
+ u32 rkey;
+ atomic_t usecnt; /* count number of MWs */
+};
+
+struct ib_mw {
+ struct ib_device *device;
+ struct ib_pd *pd;
+ struct ib_uobject *uobject;
+ u32 rkey;
+};
+
+struct ib_fmr {
+ struct ib_device *device;
+ struct ib_pd *pd;
+ struct list_head list;
+ u32 lkey;
+ u32 rkey;
+};
+
+struct ib_mad;
+struct ib_grh;
+
+enum ib_process_mad_flags {
+ IB_MAD_IGNORE_MKEY = 1,
+ IB_MAD_IGNORE_BKEY = 2,
+ IB_MAD_IGNORE_ALL = IB_MAD_IGNORE_MKEY | IB_MAD_IGNORE_BKEY
+};
+
+enum ib_mad_result {
+ IB_MAD_RESULT_FAILURE = 0, /* (!SUCCESS is the important flag) */
+ IB_MAD_RESULT_SUCCESS = 1 << 0, /* MAD was successfully processed */
+ IB_MAD_RESULT_REPLY = 1 << 1, /* Reply packet needs to be sent */
+ IB_MAD_RESULT_CONSUMED = 1 << 2 /* Packet consumed: stop processing */
+};
+
+#define IB_DEVICE_NAME_MAX 64
+
+struct ib_cache {
+ rwlock_t lock;
+ struct ib_event_handler event_handler;
+ struct ib_pkey_cache **pkey_cache;
+ struct ib_gid_cache **gid_cache;
+ u8 *lmc_cache;
+};
+
+struct ib_dma_mapping_ops {
+ int (*mapping_error)(struct ib_device *dev,
+ u64 dma_addr);
+ u64 (*map_single)(struct ib_device *dev,
+ void *ptr, size_t size,
+ enum dma_data_direction direction);
+ void (*unmap_single)(struct ib_device *dev,
+ u64 addr, size_t size,
+ enum dma_data_direction direction);
+ u64 (*map_page)(struct ib_device *dev,
+ struct page *page, unsigned long offset,
+ size_t size,
+ enum dma_data_direction direction);
+ void (*unmap_page)(struct ib_device *dev,
+ u64 addr, size_t size,
+ enum dma_data_direction direction);
+ int (*map_sg)(struct ib_device *dev,
+ struct scatterlist *sg, int nents,
+ enum dma_data_direction direction);
+ void (*unmap_sg)(struct ib_device *dev,
+ struct scatterlist *sg, int nents,
+ enum dma_data_direction direction);
+ u64 (*dma_address)(struct ib_device *dev,
+ struct scatterlist *sg);
+ unsigned int (*dma_len)(struct ib_device *dev,
+ struct scatterlist *sg);
+ void (*sync_single_for_cpu)(struct ib_device *dev,
+ u64 dma_handle,
+ size_t size,
+ enum dma_data_direction dir);
+ void (*sync_single_for_device)(struct ib_device *dev,
+ u64 dma_handle,
+ size_t size,
+ enum dma_data_direction dir);
+ void *(*alloc_coherent)(struct ib_device *dev,
+ size_t size,
+ u64 *dma_handle,
+ gfp_t flag);
+ void (*free_coherent)(struct ib_device *dev,
+ size_t size, void *cpu_addr,
+ u64 dma_handle);
+};
+
+struct iw_cm_verbs;
+
+struct ib_device {
+ struct device *dma_device;
+
+ char name[IB_DEVICE_NAME_MAX];
+
+ struct list_head event_handler_list;
+ spinlock_t event_handler_lock;
+
+ struct list_head core_list;
+ struct list_head client_data_list;
+ spinlock_t client_data_lock;
+
+ struct ib_cache cache;
+ int *pkey_tbl_len;
+ int *gid_tbl_len;
+
+ int num_comp_vectors;
+
+ struct iw_cm_verbs *iwcm;
+
+ int (*get_protocol_stats)(struct ib_device *device,
+ union rdma_protocol_stats *stats);
+ int (*query_device)(struct ib_device *device,
+ struct ib_device_attr *device_attr);
+ int (*query_port)(struct ib_device *device,
+ u8 port_num,
+ struct ib_port_attr *port_attr);
+ enum rdma_link_layer (*get_link_layer)(struct ib_device *device,
+ u8 port_num);
+ int (*query_gid)(struct ib_device *device,
+ u8 port_num, int index,
+ union ib_gid *gid);
+ int (*query_pkey)(struct ib_device *device,
+ u8 port_num, u16 index, u16 *pkey);
+ int (*modify_device)(struct ib_device *device,
+ int device_modify_mask,
+ struct ib_device_modify *device_modify);
+ int (*modify_port)(struct ib_device *device,
+ u8 port_num, int port_modify_mask,
+ struct ib_port_modify *port_modify);
+ struct ib_ucontext * (*alloc_ucontext)(struct ib_device *device,
+ struct ib_udata *udata);
+ int (*dealloc_ucontext)(struct ib_ucontext *context);
+ int (*mmap)(struct ib_ucontext *context,
+ struct vm_area_struct *vma);
+ struct ib_pd * (*alloc_pd)(struct ib_device *device,
+ struct ib_ucontext *context,
+ struct ib_udata *udata);
+ int (*dealloc_pd)(struct ib_pd *pd);
+ struct ib_ah * (*create_ah)(struct ib_pd *pd,
+ struct ib_ah_attr *ah_attr);
+ int (*modify_ah)(struct ib_ah *ah,
+ struct ib_ah_attr *ah_attr);
+ int (*query_ah)(struct ib_ah *ah,
+ struct ib_ah_attr *ah_attr);
+ int (*destroy_ah)(struct ib_ah *ah);
+ struct ib_srq * (*create_srq)(struct ib_pd *pd,
+ struct ib_srq_init_attr *srq_init_attr,
+ struct ib_udata *udata);
+ int (*modify_srq)(struct ib_srq *srq,
+ struct ib_srq_attr *srq_attr,
+ enum ib_srq_attr_mask srq_attr_mask,
+ struct ib_udata *udata);
+ int (*query_srq)(struct ib_srq *srq,
+ struct ib_srq_attr *srq_attr);
+ int (*destroy_srq)(struct ib_srq *srq);
+ int (*post_srq_recv)(struct ib_srq *srq,
+ struct ib_recv_wr *recv_wr,
+ struct ib_recv_wr **bad_recv_wr);
+ struct ib_qp * (*create_qp)(struct ib_pd *pd,
+ struct ib_qp_init_attr *qp_init_attr,
+ struct ib_udata *udata);
+ int (*modify_qp)(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr,
+ int qp_attr_mask,
+ struct ib_udata *udata);
+ int (*query_qp)(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr,
+ int qp_attr_mask,
+ struct ib_qp_init_attr *qp_init_attr);
+ int (*destroy_qp)(struct ib_qp *qp);
+ int (*post_send)(struct ib_qp *qp,
+ struct ib_send_wr *send_wr,
+ struct ib_send_wr **bad_send_wr);
+ int (*post_recv)(struct ib_qp *qp,
+ struct ib_recv_wr *recv_wr,
+ struct ib_recv_wr **bad_recv_wr);
+ struct ib_cq * (*create_cq)(struct ib_device *device, int cqe,
+ int comp_vector,
+ struct ib_ucontext *context,
+ struct ib_udata *udata);
+ int (*modify_cq)(struct ib_cq *cq, u16 cq_count,
+ u16 cq_period);
+ int (*destroy_cq)(struct ib_cq *cq);
+ int (*resize_cq)(struct ib_cq *cq, int cqe,
+ struct ib_udata *udata);
+ int (*poll_cq)(struct ib_cq *cq, int num_entries,
+ struct ib_wc *wc);
+ int (*peek_cq)(struct ib_cq *cq, int wc_cnt);
+ int (*req_notify_cq)(struct ib_cq *cq,
+ enum ib_cq_notify_flags flags);
+ int (*req_ncomp_notif)(struct ib_cq *cq,
+ int wc_cnt);
+ struct ib_mr * (*get_dma_mr)(struct ib_pd *pd,
+ int mr_access_flags);
+ struct ib_mr * (*reg_phys_mr)(struct ib_pd *pd,
+ struct ib_phys_buf *phys_buf_array,
+ int num_phys_buf,
+ int mr_access_flags,
+ u64 *iova_start);
+ struct ib_mr * (*reg_user_mr)(struct ib_pd *pd,
+ u64 start, u64 length,
+ u64 virt_addr,
+ int mr_access_flags,
+ struct ib_udata *udata);
+ int (*query_mr)(struct ib_mr *mr,
+ struct ib_mr_attr *mr_attr);
+ int (*dereg_mr)(struct ib_mr *mr);
+ struct ib_mr * (*alloc_fast_reg_mr)(struct ib_pd *pd,
+ int max_page_list_len);
+ struct ib_fast_reg_page_list * (*alloc_fast_reg_page_list)(struct ib_device *device,
+ int page_list_len);
+ void (*free_fast_reg_page_list)(struct ib_fast_reg_page_list *page_list);
+ int (*rereg_phys_mr)(struct ib_mr *mr,
+ int mr_rereg_mask,
+ struct ib_pd *pd,
+ struct ib_phys_buf *phys_buf_array,
+ int num_phys_buf,
+ int mr_access_flags,
+ u64 *iova_start);
+ struct ib_mw * (*alloc_mw)(struct ib_pd *pd);
+ int (*bind_mw)(struct ib_qp *qp,
+ struct ib_mw *mw,
+ struct ib_mw_bind *mw_bind);
+ int (*dealloc_mw)(struct ib_mw *mw);
+ struct ib_fmr * (*alloc_fmr)(struct ib_pd *pd,
+ int mr_access_flags,
+ struct ib_fmr_attr *fmr_attr);
+ int (*map_phys_fmr)(struct ib_fmr *fmr,
+ u64 *page_list, int list_len,
+ u64 iova);
+ int (*unmap_fmr)(struct list_head *fmr_list);
+ int (*dealloc_fmr)(struct ib_fmr *fmr);
+ int (*attach_mcast)(struct ib_qp *qp,
+ union ib_gid *gid,
+ u16 lid);
+ int (*detach_mcast)(struct ib_qp *qp,
+ union ib_gid *gid,
+ u16 lid);
+ int (*process_mad)(struct ib_device *device,
+ int process_mad_flags,
+ u8 port_num,
+ struct ib_wc *in_wc,
+ struct ib_grh *in_grh,
+ struct ib_mad *in_mad,
+ struct ib_mad *out_mad);
+ struct ib_srq * (*create_xrc_srq)(struct ib_pd *pd,
+ struct ib_cq *xrc_cq,
+ struct ib_xrcd *xrcd,
+ struct ib_srq_init_attr *srq_init_attr,
+ struct ib_udata *udata);
+ struct ib_xrcd * (*alloc_xrcd)(struct ib_device *device,
+ struct ib_ucontext *context,
+ struct ib_udata *udata);
+ int (*dealloc_xrcd)(struct ib_xrcd *xrcd);
+ int (*create_xrc_rcv_qp)(struct ib_qp_init_attr *init_attr,
+ u32 *qp_num);
+ int (*modify_xrc_rcv_qp)(struct ib_xrcd *xrcd,
+ u32 qp_num,
+ struct ib_qp_attr *attr,
+ int attr_mask);
+ int (*query_xrc_rcv_qp)(struct ib_xrcd *xrcd,
+ u32 qp_num,
+ struct ib_qp_attr *attr,
+ int attr_mask,
+ struct ib_qp_init_attr *init_attr);
+ int (*reg_xrc_rcv_qp)(struct ib_xrcd *xrcd,
+ void *context,
+ u32 qp_num);
+ int (*unreg_xrc_rcv_qp)(struct ib_xrcd *xrcd,
+ void *context,
+ u32 qp_num);
+
+ struct ib_dma_mapping_ops *dma_ops;
+
+ struct module *owner;
+ struct device dev;
+ struct kobject *ports_parent;
+ struct list_head port_list;
+
+ enum {
+ IB_DEV_UNINITIALIZED,
+ IB_DEV_REGISTERED,
+ IB_DEV_UNREGISTERED
+ } reg_state;
+
+ u64 uverbs_cmd_mask;
+ int uverbs_abi_ver;
+
+ char node_desc[64];
+ __be64 node_guid;
+ u32 local_dma_lkey;
+ u8 node_type;
+ u8 phys_port_cnt;
+ struct rb_root ib_uverbs_xrcd_table;
+ struct mutex xrcd_table_mutex;
+};
+
+struct ib_client {
+ char *name;
+ void (*add) (struct ib_device *);
+ void (*remove)(struct ib_device *);
+
+ struct list_head list;
+};
+
+struct ib_device *ib_alloc_device(size_t size);
+void ib_dealloc_device(struct ib_device *device);
+
+int ib_register_device (struct ib_device *device);
+void ib_unregister_device(struct ib_device *device);
+
+int ib_register_client (struct ib_client *client);
+void ib_unregister_client(struct ib_client *client);
+
+void *ib_get_client_data(struct ib_device *device, struct ib_client *client);
+void ib_set_client_data(struct ib_device *device, struct ib_client *client,
+ void *data);
+
+static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t len)
+{
+ return copy_from_user(dest, udata->inbuf, len) ? -EFAULT : 0;
+}
+
+static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len)
+{
+ return copy_to_user(udata->outbuf, src, len) ? -EFAULT : 0;
+}
+
+/**
+ * ib_sysfs_create_port_files - iterate over port sysfs directories
+ * @device: the IB device
+ * @create: a function to create sysfs files in each port directory
+ */
+int ib_sysfs_create_port_files(struct ib_device *device,
+ int (*create)(struct ib_device *dev, u8 port_num,
+ struct kobject *kobj));
+
+/**
+ * ib_modify_qp_is_ok - Check that the supplied attribute mask
+ * contains all required attributes and no attributes not allowed for
+ * the given QP state transition.
+ * @cur_state: Current QP state
+ * @next_state: Next QP state
+ * @type: QP type
+ * @mask: Mask of supplied QP attributes
+ *
+ * This function is a helper function that a low-level driver's
+ * modify_qp method can use to validate the consumer's input. It
+ * checks that cur_state and next_state are valid QP states, that a
+ * transition from cur_state to next_state is allowed by the IB spec,
+ * and that the attribute mask supplied is allowed for the transition.
+ */
+int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state,
+ enum ib_qp_type type, enum ib_qp_attr_mask mask);
+
+int ib_register_event_handler (struct ib_event_handler *event_handler);
+int ib_unregister_event_handler(struct ib_event_handler *event_handler);
+void ib_dispatch_event(struct ib_event *event);
+
+int ib_query_device(struct ib_device *device,
+ struct ib_device_attr *device_attr);
+
+int ib_query_port(struct ib_device *device,
+ u8 port_num, struct ib_port_attr *port_attr);
+
+enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device,
+ u8 port_num);
+
+int ib_query_gid(struct ib_device *device,
+ u8 port_num, int index, union ib_gid *gid);
+
+int ib_query_pkey(struct ib_device *device,
+ u8 port_num, u16 index, u16 *pkey);
+
+int ib_modify_device(struct ib_device *device,
+ int device_modify_mask,
+ struct ib_device_modify *device_modify);
+
+int ib_modify_port(struct ib_device *device,
+ u8 port_num, int port_modify_mask,
+ struct ib_port_modify *port_modify);
+
+int ib_find_gid(struct ib_device *device, union ib_gid *gid,
+ u8 *port_num, u16 *index);
+
+int ib_find_pkey(struct ib_device *device,
+ u8 port_num, u16 pkey, u16 *index);
+
+/**
+ * ib_alloc_pd - Allocates an unused protection domain.
+ * @device: The device on which to allocate the protection domain.
+ *
+ * A protection domain object provides an association between QPs, shared
+ * receive queues, address handles, memory regions, and memory windows.
+ */
+struct ib_pd *ib_alloc_pd(struct ib_device *device);
+
+/**
+ * ib_dealloc_pd - Deallocates a protection domain.
+ * @pd: The protection domain to deallocate.
+ */
+int ib_dealloc_pd(struct ib_pd *pd);
+
+/**
+ * ib_create_ah - Creates an address handle for the given address vector.
+ * @pd: The protection domain associated with the address handle.
+ * @ah_attr: The attributes of the address vector.
+ *
+ * The address handle is used to reference a local or global destination
+ * in all UD QP post sends.
+ */
+struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr);
+
+/**
+ * ib_init_ah_from_wc - Initializes address handle attributes from a
+ * work completion.
+ * @device: Device on which the received message arrived.
+ * @port_num: Port on which the received message arrived.
+ * @wc: Work completion associated with the received message.
+ * @grh: References the received global route header. This parameter is
+ * ignored unless the work completion indicates that the GRH is valid.
+ * @ah_attr: Returned attributes that can be used when creating an address
+ * handle for replying to the message.
+ */
+int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc,
+ struct ib_grh *grh, struct ib_ah_attr *ah_attr);
+
+/**
+ * ib_create_ah_from_wc - Creates an address handle associated with the
+ * sender of the specified work completion.
+ * @pd: The protection domain associated with the address handle.
+ * @wc: Work completion information associated with a received message.
+ * @grh: References the received global route header. This parameter is
+ * ignored unless the work completion indicates that the GRH is valid.
+ * @port_num: The outbound port number to associate with the address.
+ *
+ * The address handle is used to reference a local or global destination
+ * in all UD QP post sends.
+ */
+struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, struct ib_wc *wc,
+ struct ib_grh *grh, u8 port_num);
+
+/**
+ * ib_modify_ah - Modifies the address vector associated with an address
+ * handle.
+ * @ah: The address handle to modify.
+ * @ah_attr: The new address vector attributes to associate with the
+ * address handle.
+ */
+int ib_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr);
+
+/**
+ * ib_query_ah - Queries the address vector associated with an address
+ * handle.
+ * @ah: The address handle to query.
+ * @ah_attr: The address vector attributes associated with the address
+ * handle.
+ */
+int ib_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr);
+
+/**
+ * ib_destroy_ah - Destroys an address handle.
+ * @ah: The address handle to destroy.
+ */
+int ib_destroy_ah(struct ib_ah *ah);
+
+/**
+ * ib_create_xrc_srq - Creates an XRC SRQ associated with the specified
+ * protection domain, cq, and xrc domain.
+ * @pd: The protection domain associated with the SRQ.
+ * @xrc_cq: The cq to be associated with the XRC SRQ.
+ * @xrcd: The XRC domain to be associated with the XRC SRQ.
+ * @srq_init_attr: A list of initial attributes required to create the
+ * XRC SRQ. If XRC SRQ creation succeeds, then the attributes are updated
+ * to the actual capabilities of the created XRC SRQ.
+ *
+ * srq_attr->max_wr and srq_attr->max_sge are read the determine the
+ * requested size of the XRC SRQ, and set to the actual values allocated
+ * on return. If ib_create_xrc_srq() succeeds, then max_wr and max_sge
+ * will always be at least as large as the requested values.
+ */
+struct ib_srq *ib_create_xrc_srq(struct ib_pd *pd,
+ struct ib_cq *xrc_cq,
+ struct ib_xrcd *xrcd,
+ struct ib_srq_init_attr *srq_init_attr);
+
+/**
+ * ib_create_srq - Creates an SRQ associated with the specified
+ * protection domain.
+ * @pd: The protection domain associated with the SRQ.
+ * @srq_init_attr: A list of initial attributes required to create the
+ * SRQ. If SRQ creation succeeds, then the attributes are updated to
+ * the actual capabilities of the created SRQ.
+ *
+ * srq_attr->max_wr and srq_attr->max_sge are read the determine the
+ * requested size of the SRQ, and set to the actual values allocated
+ * on return. If ib_create_srq() succeeds, then max_wr and max_sge
+ * will always be at least as large as the requested values.
+ */
+struct ib_srq *ib_create_srq(struct ib_pd *pd,
+ struct ib_srq_init_attr *srq_init_attr);
+
+/**
+ * ib_modify_srq - Modifies the attributes for the specified SRQ.
+ * @srq: The SRQ to modify.
+ * @srq_attr: On input, specifies the SRQ attributes to modify. On output,
+ * the current values of selected SRQ attributes are returned.
+ * @srq_attr_mask: A bit-mask used to specify which attributes of the SRQ
+ * are being modified.
+ *
+ * The mask may contain IB_SRQ_MAX_WR to resize the SRQ and/or
+ * IB_SRQ_LIMIT to set the SRQ's limit and request notification when
+ * the number of receives queued drops below the limit.
+ */
+int ib_modify_srq(struct ib_srq *srq,
+ struct ib_srq_attr *srq_attr,
+ enum ib_srq_attr_mask srq_attr_mask);
+
+/**
+ * ib_query_srq - Returns the attribute list and current values for the
+ * specified SRQ.
+ * @srq: The SRQ to query.
+ * @srq_attr: The attributes of the specified SRQ.
+ */
+int ib_query_srq(struct ib_srq *srq,
+ struct ib_srq_attr *srq_attr);
+
+/**
+ * ib_destroy_srq - Destroys the specified SRQ.
+ * @srq: The SRQ to destroy.
+ */
+int ib_destroy_srq(struct ib_srq *srq);
+
+/**
+ * ib_post_srq_recv - Posts a list of work requests to the specified SRQ.
+ * @srq: The SRQ to post the work request on.
+ * @recv_wr: A list of work requests to post on the receive queue.
+ * @bad_recv_wr: On an immediate failure, this parameter will reference
+ * the work request that failed to be posted on the QP.
+ */
+static inline int ib_post_srq_recv(struct ib_srq *srq,
+ struct ib_recv_wr *recv_wr,
+ struct ib_recv_wr **bad_recv_wr)
+{
+ return srq->device->post_srq_recv(srq, recv_wr, bad_recv_wr);
+}
+
+/**
+ * ib_create_qp - Creates a QP associated with the specified protection
+ * domain.
+ * @pd: The protection domain associated with the QP.
+ * @qp_init_attr: A list of initial attributes required to create the
+ * QP. If QP creation succeeds, then the attributes are updated to
+ * the actual capabilities of the created QP.
+ */
+struct ib_qp *ib_create_qp(struct ib_pd *pd,
+ struct ib_qp_init_attr *qp_init_attr);
+
+/**
+ * ib_modify_qp - Modifies the attributes for the specified QP and then
+ * transitions the QP to the given state.
+ * @qp: The QP to modify.
+ * @qp_attr: On input, specifies the QP attributes to modify. On output,
+ * the current values of selected QP attributes are returned.
+ * @qp_attr_mask: A bit-mask used to specify which attributes of the QP
+ * are being modified.
+ */
+int ib_modify_qp(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr,
+ int qp_attr_mask);
+
+/**
+ * ib_query_qp - Returns the attribute list and current values for the
+ * specified QP.
+ * @qp: The QP to query.
+ * @qp_attr: The attributes of the specified QP.
+ * @qp_attr_mask: A bit-mask used to select specific attributes to query.
+ * @qp_init_attr: Additional attributes of the selected QP.
+ *
+ * The qp_attr_mask may be used to limit the query to gathering only the
+ * selected attributes.
+ */
+int ib_query_qp(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr,
+ int qp_attr_mask,
+ struct ib_qp_init_attr *qp_init_attr);
+
+/**
+ * ib_destroy_qp - Destroys the specified QP.
+ * @qp: The QP to destroy.
+ */
+int ib_destroy_qp(struct ib_qp *qp);
+
+/**
+ * ib_post_send - Posts a list of work requests to the send queue of
+ * the specified QP.
+ * @qp: The QP to post the work request on.
+ * @send_wr: A list of work requests to post on the send queue.
+ * @bad_send_wr: On an immediate failure, this parameter will reference
+ * the work request that failed to be posted on the QP.
+ */
+static inline int ib_post_send(struct ib_qp *qp,
+ struct ib_send_wr *send_wr,
+ struct ib_send_wr **bad_send_wr)
+{
+ return qp->device->post_send(qp, send_wr, bad_send_wr);
+}
+
+/**
+ * ib_post_recv - Posts a list of work requests to the receive queue of
+ * the specified QP.
+ * @qp: The QP to post the work request on.
+ * @recv_wr: A list of work requests to post on the receive queue.
+ * @bad_recv_wr: On an immediate failure, this parameter will reference
+ * the work request that failed to be posted on the QP.
+ */
+static inline int ib_post_recv(struct ib_qp *qp,
+ struct ib_recv_wr *recv_wr,
+ struct ib_recv_wr **bad_recv_wr)
+{
+ return qp->device->post_recv(qp, recv_wr, bad_recv_wr);
+}
+
+/*
+ * IB_CQ_VECTOR_LEAST_ATTACHED: The constant specifies that
+ * the CQ will be attached to the completion vector that has
+ * the least number of CQs already attached to it.
+ */
+#define IB_CQ_VECTOR_LEAST_ATTACHED 0xffffffff
+
+/**
+ * ib_create_cq - Creates a CQ on the specified device.
+ * @device: The device on which to create the CQ.
+ * @comp_handler: A user-specified callback that is invoked when a
+ * completion event occurs on the CQ.
+ * @event_handler: A user-specified callback that is invoked when an
+ * asynchronous event not associated with a completion occurs on the CQ.
+ * @cq_context: Context associated with the CQ returned to the user via
+ * the associated completion and event handlers.
+ * @cqe: The minimum size of the CQ.
+ * @comp_vector - Completion vector used to signal completion events.
+ * Must be >= 0 and < context->num_comp_vectors
+ * or IB_CQ_VECTOR_LEAST_ATTACHED.
+ *
+ * Users can examine the cq structure to determine the actual CQ size.
+ */
+struct ib_cq *ib_create_cq(struct ib_device *device,
+ ib_comp_handler comp_handler,
+ void (*event_handler)(struct ib_event *, void *),
+ void *cq_context, int cqe, int comp_vector);
+
+/**
+ * ib_resize_cq - Modifies the capacity of the CQ.
+ * @cq: The CQ to resize.
+ * @cqe: The minimum size of the CQ.
+ *
+ * Users can examine the cq structure to determine the actual CQ size.
+ */
+int ib_resize_cq(struct ib_cq *cq, int cqe);
+
+/**
+ * ib_modify_cq - Modifies moderation params of the CQ
+ * @cq: The CQ to modify.
+ * @cq_count: number of CQEs that will trigger an event
+ * @cq_period: max period of time in usec before triggering an event
+ *
+ */
+int ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period);
+
+/**
+ * ib_destroy_cq - Destroys the specified CQ.
+ * @cq: The CQ to destroy.
+ */
+int ib_destroy_cq(struct ib_cq *cq);
+
+/**
+ * ib_poll_cq - poll a CQ for completion(s)
+ * @cq:the CQ being polled
+ * @num_entries:maximum number of completions to return
+ * @wc:array of at least @num_entries &struct ib_wc where completions
+ * will be returned
+ *
+ * Poll a CQ for (possibly multiple) completions. If the return value
+ * is < 0, an error occurred. If the return value is >= 0, it is the
+ * number of completions returned. If the return value is
+ * non-negative and < num_entries, then the CQ was emptied.
+ */
+static inline int ib_poll_cq(struct ib_cq *cq, int num_entries,
+ struct ib_wc *wc)
+{
+ return cq->device->poll_cq(cq, num_entries, wc);
+}
+
+/**
+ * ib_peek_cq - Returns the number of unreaped completions currently
+ * on the specified CQ.
+ * @cq: The CQ to peek.
+ * @wc_cnt: A minimum number of unreaped completions to check for.
+ *
+ * If the number of unreaped completions is greater than or equal to wc_cnt,
+ * this function returns wc_cnt, otherwise, it returns the actual number of
+ * unreaped completions.
+ */
+int ib_peek_cq(struct ib_cq *cq, int wc_cnt);
+
+/**
+ * ib_req_notify_cq - Request completion notification on a CQ.
+ * @cq: The CQ to generate an event for.
+ * @flags:
+ * Must contain exactly one of %IB_CQ_SOLICITED or %IB_CQ_NEXT_COMP
+ * to request an event on the next solicited event or next work
+ * completion at any type, respectively. %IB_CQ_REPORT_MISSED_EVENTS
+ * may also be |ed in to request a hint about missed events, as
+ * described below.
+ *
+ * Return Value:
+ * < 0 means an error occurred while requesting notification
+ * == 0 means notification was requested successfully, and if
+ * IB_CQ_REPORT_MISSED_EVENTS was passed in, then no events
+ * were missed and it is safe to wait for another event. In
+ * this case is it guaranteed that any work completions added
+ * to the CQ since the last CQ poll will trigger a completion
+ * notification event.
+ * > 0 is only returned if IB_CQ_REPORT_MISSED_EVENTS was passed
+ * in. It means that the consumer must poll the CQ again to
+ * make sure it is empty to avoid missing an event because of a
+ * race between requesting notification and an entry being
+ * added to the CQ. This return value means it is possible
+ * (but not guaranteed) that a work completion has been added
+ * to the CQ since the last poll without triggering a
+ * completion notification event.
+ */
+static inline int ib_req_notify_cq(struct ib_cq *cq,
+ enum ib_cq_notify_flags flags)
+{
+ return cq->device->req_notify_cq(cq, flags);
+}
+
+/**
+ * ib_req_ncomp_notif - Request completion notification when there are
+ * at least the specified number of unreaped completions on the CQ.
+ * @cq: The CQ to generate an event for.
+ * @wc_cnt: The number of unreaped completions that should be on the
+ * CQ before an event is generated.
+ */
+static inline int ib_req_ncomp_notif(struct ib_cq *cq, int wc_cnt)
+{
+ return cq->device->req_ncomp_notif ?
+ cq->device->req_ncomp_notif(cq, wc_cnt) :
+ -ENOSYS;
+}
+
+/**
+ * ib_get_dma_mr - Returns a memory region for system memory that is
+ * usable for DMA.
+ * @pd: The protection domain associated with the memory region.
+ * @mr_access_flags: Specifies the memory access rights.
+ *
+ * Note that the ib_dma_*() functions defined below must be used
+ * to create/destroy addresses used with the Lkey or Rkey returned
+ * by ib_get_dma_mr().
+ */
+struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags);
+
+/**
+ * ib_dma_mapping_error - check a DMA addr for error
+ * @dev: The device for which the dma_addr was created
+ * @dma_addr: The DMA address to check
+ */
+static inline int ib_dma_mapping_error(struct ib_device *dev, u64 dma_addr)
+{
+ if (dev->dma_ops)
+ return dev->dma_ops->mapping_error(dev, dma_addr);
+ return dma_mapping_error(dev->dma_device, dma_addr);
+}
+
+/**
+ * ib_dma_map_single - Map a kernel virtual address to DMA address
+ * @dev: The device for which the dma_addr is to be created
+ * @cpu_addr: The kernel virtual address
+ * @size: The size of the region in bytes
+ * @direction: The direction of the DMA
+ */
+static inline u64 ib_dma_map_single(struct ib_device *dev,
+ void *cpu_addr, size_t size,
+ enum dma_data_direction direction)
+{
+ if (dev->dma_ops)
+ return dev->dma_ops->map_single(dev, cpu_addr, size, direction);
+ return dma_map_single(dev->dma_device, cpu_addr, size, direction);
+}
+
+/**
+ * ib_dma_unmap_single - Destroy a mapping created by ib_dma_map_single()
+ * @dev: The device for which the DMA address was created
+ * @addr: The DMA address
+ * @size: The size of the region in bytes
+ * @direction: The direction of the DMA
+ */
+static inline void ib_dma_unmap_single(struct ib_device *dev,
+ u64 addr, size_t size,
+ enum dma_data_direction direction)
+{
+ if (dev->dma_ops)
+ dev->dma_ops->unmap_single(dev, addr, size, direction);
+ else
+ dma_unmap_single(dev->dma_device, addr, size, direction);
+}
+
+static inline u64 ib_dma_map_single_attrs(struct ib_device *dev,
+ void *cpu_addr, size_t size,
+ enum dma_data_direction direction,
+ struct dma_attrs *attrs)
+{
+ return dma_map_single_attrs(dev->dma_device, cpu_addr, size,
+ direction, attrs);
+}
+
+static inline void ib_dma_unmap_single_attrs(struct ib_device *dev,
+ u64 addr, size_t size,
+ enum dma_data_direction direction,
+ struct dma_attrs *attrs)
+{
+ return dma_unmap_single_attrs(dev->dma_device, addr, size,
+ direction, attrs);
+}
+
+/**
+ * ib_dma_map_page - Map a physical page to DMA address
+ * @dev: The device for which the dma_addr is to be created
+ * @page: The page to be mapped
+ * @offset: The offset within the page
+ * @size: The size of the region in bytes
+ * @direction: The direction of the DMA
+ */
+static inline u64 ib_dma_map_page(struct ib_device *dev,
+ struct page *page,
+ unsigned long offset,
+ size_t size,
+ enum dma_data_direction direction)
+{
+ if (dev->dma_ops)
+ return dev->dma_ops->map_page(dev, page, offset, size, direction);
+ return dma_map_page(dev->dma_device, page, offset, size, direction);
+}
+
+/**
+ * ib_dma_unmap_page - Destroy a mapping created by ib_dma_map_page()
+ * @dev: The device for which the DMA address was created
+ * @addr: The DMA address
+ * @size: The size of the region in bytes
+ * @direction: The direction of the DMA
+ */
+static inline void ib_dma_unmap_page(struct ib_device *dev,
+ u64 addr, size_t size,
+ enum dma_data_direction direction)
+{
+ if (dev->dma_ops)
+ dev->dma_ops->unmap_page(dev, addr, size, direction);
+ else
+ dma_unmap_page(dev->dma_device, addr, size, direction);
+}
+
+/**
+ * ib_dma_map_sg - Map a scatter/gather list to DMA addresses
+ * @dev: The device for which the DMA addresses are to be created
+ * @sg: The array of scatter/gather entries
+ * @nents: The number of scatter/gather entries
+ * @direction: The direction of the DMA
+ */
+static inline int ib_dma_map_sg(struct ib_device *dev,
+ struct scatterlist *sg, int nents,
+ enum dma_data_direction direction)
+{
+ if (dev->dma_ops)
+ return dev->dma_ops->map_sg(dev, sg, nents, direction);
+ return dma_map_sg(dev->dma_device, sg, nents, direction);
+}
+
+/**
+ * ib_dma_unmap_sg - Unmap a scatter/gather list of DMA addresses
+ * @dev: The device for which the DMA addresses were created
+ * @sg: The array of scatter/gather entries
+ * @nents: The number of scatter/gather entries
+ * @direction: The direction of the DMA
+ */
+static inline void ib_dma_unmap_sg(struct ib_device *dev,
+ struct scatterlist *sg, int nents,
+ enum dma_data_direction direction)
+{
+ if (dev->dma_ops)
+ dev->dma_ops->unmap_sg(dev, sg, nents, direction);
+ else
+ dma_unmap_sg(dev->dma_device, sg, nents, direction);
+}
+
+static inline int ib_dma_map_sg_attrs(struct ib_device *dev,
+ struct scatterlist *sg, int nents,
+ enum dma_data_direction direction,
+ struct dma_attrs *attrs)
+{
+ return dma_map_sg_attrs(dev->dma_device, sg, nents, direction, attrs);
+}
+
+static inline void ib_dma_unmap_sg_attrs(struct ib_device *dev,
+ struct scatterlist *sg, int nents,
+ enum dma_data_direction direction,
+ struct dma_attrs *attrs)
+{
+ dma_unmap_sg_attrs(dev->dma_device, sg, nents, direction, attrs);
+}
+/**
+ * ib_sg_dma_address - Return the DMA address from a scatter/gather entry
+ * @dev: The device for which the DMA addresses were created
+ * @sg: The scatter/gather entry
+ */
+static inline u64 ib_sg_dma_address(struct ib_device *dev,
+ struct scatterlist *sg)
+{
+ if (dev->dma_ops)
+ return dev->dma_ops->dma_address(dev, sg);
+ return sg_dma_address(sg);
+}
+
+/**
+ * ib_sg_dma_len - Return the DMA length from a scatter/gather entry
+ * @dev: The device for which the DMA addresses were created
+ * @sg: The scatter/gather entry
+ */
+static inline unsigned int ib_sg_dma_len(struct ib_device *dev,
+ struct scatterlist *sg)
+{
+ if (dev->dma_ops)
+ return dev->dma_ops->dma_len(dev, sg);
+ return sg_dma_len(sg);
+}
+
+/**
+ * ib_dma_sync_single_for_cpu - Prepare DMA region to be accessed by CPU
+ * @dev: The device for which the DMA address was created
+ * @addr: The DMA address
+ * @size: The size of the region in bytes
+ * @dir: The direction of the DMA
+ */
+static inline void ib_dma_sync_single_for_cpu(struct ib_device *dev,
+ u64 addr,
+ size_t size,
+ enum dma_data_direction dir)
+{
+ if (dev->dma_ops)
+ dev->dma_ops->sync_single_for_cpu(dev, addr, size, dir);
+ else
+ dma_sync_single_for_cpu(dev->dma_device, addr, size, dir);
+}
+
+/**
+ * ib_dma_sync_single_for_device - Prepare DMA region to be accessed by device
+ * @dev: The device for which the DMA address was created
+ * @addr: The DMA address
+ * @size: The size of the region in bytes
+ * @dir: The direction of the DMA
+ */
+static inline void ib_dma_sync_single_for_device(struct ib_device *dev,
+ u64 addr,
+ size_t size,
+ enum dma_data_direction dir)
+{
+ if (dev->dma_ops)
+ dev->dma_ops->sync_single_for_device(dev, addr, size, dir);
+ else
+ dma_sync_single_for_device(dev->dma_device, addr, size, dir);
+}
+
+/**
+ * ib_dma_alloc_coherent - Allocate memory and map it for DMA
+ * @dev: The device for which the DMA address is requested
+ * @size: The size of the region to allocate in bytes
+ * @dma_handle: A pointer for returning the DMA address of the region
+ * @flag: memory allocator flags
+ */
+static inline void *ib_dma_alloc_coherent(struct ib_device *dev,
+ size_t size,
+ u64 *dma_handle,
+ gfp_t flag)
+{
+ if (dev->dma_ops)
+ return dev->dma_ops->alloc_coherent(dev, size, dma_handle, flag);
+ else {
+ dma_addr_t handle;
+ void *ret;
+
+ ret = dma_alloc_coherent(dev->dma_device, size, &handle, flag);
+ *dma_handle = handle;
+ return ret;
+ }
+}
+
+/**
+ * ib_dma_free_coherent - Free memory allocated by ib_dma_alloc_coherent()
+ * @dev: The device for which the DMA addresses were allocated
+ * @size: The size of the region
+ * @cpu_addr: the address returned by ib_dma_alloc_coherent()
+ * @dma_handle: the DMA address returned by ib_dma_alloc_coherent()
+ */
+static inline void ib_dma_free_coherent(struct ib_device *dev,
+ size_t size, void *cpu_addr,
+ u64 dma_handle)
+{
+ if (dev->dma_ops)
+ dev->dma_ops->free_coherent(dev, size, cpu_addr, dma_handle);
+ else
+ dma_free_coherent(dev->dma_device, size, cpu_addr, dma_handle);
+}
+
+/**
+ * ib_reg_phys_mr - Prepares a virtually addressed memory region for use
+ * by an HCA.
+ * @pd: The protection domain associated assigned to the registered region.
+ * @phys_buf_array: Specifies a list of physical buffers to use in the
+ * memory region.
+ * @num_phys_buf: Specifies the size of the phys_buf_array.
+ * @mr_access_flags: Specifies the memory access rights.
+ * @iova_start: The offset of the region's starting I/O virtual address.
+ */
+struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd,
+ struct ib_phys_buf *phys_buf_array,
+ int num_phys_buf,
+ int mr_access_flags,
+ u64 *iova_start);
+
+/**
+ * ib_rereg_phys_mr - Modifies the attributes of an existing memory region.
+ * Conceptually, this call performs the functions deregister memory region
+ * followed by register physical memory region. Where possible,
+ * resources are reused instead of deallocated and reallocated.
+ * @mr: The memory region to modify.
+ * @mr_rereg_mask: A bit-mask used to indicate which of the following
+ * properties of the memory region are being modified.
+ * @pd: If %IB_MR_REREG_PD is set in mr_rereg_mask, this field specifies
+ * the new protection domain to associated with the memory region,
+ * otherwise, this parameter is ignored.
+ * @phys_buf_array: If %IB_MR_REREG_TRANS is set in mr_rereg_mask, this
+ * field specifies a list of physical buffers to use in the new
+ * translation, otherwise, this parameter is ignored.
+ * @num_phys_buf: If %IB_MR_REREG_TRANS is set in mr_rereg_mask, this
+ * field specifies the size of the phys_buf_array, otherwise, this
+ * parameter is ignored.
+ * @mr_access_flags: If %IB_MR_REREG_ACCESS is set in mr_rereg_mask, this
+ * field specifies the new memory access rights, otherwise, this
+ * parameter is ignored.
+ * @iova_start: The offset of the region's starting I/O virtual address.
+ */
+int ib_rereg_phys_mr(struct ib_mr *mr,
+ int mr_rereg_mask,
+ struct ib_pd *pd,
+ struct ib_phys_buf *phys_buf_array,
+ int num_phys_buf,
+ int mr_access_flags,
+ u64 *iova_start);
+
+/**
+ * ib_query_mr - Retrieves information about a specific memory region.
+ * @mr: The memory region to retrieve information about.
+ * @mr_attr: The attributes of the specified memory region.
+ */
+int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr);
+
+/**
+ * ib_dereg_mr - Deregisters a memory region and removes it from the
+ * HCA translation table.
+ * @mr: The memory region to deregister.
+ */
+int ib_dereg_mr(struct ib_mr *mr);
+
+/**
+ * ib_alloc_fast_reg_mr - Allocates memory region usable with the
+ * IB_WR_FAST_REG_MR send work request.
+ * @pd: The protection domain associated with the region.
+ * @max_page_list_len: requested max physical buffer list length to be
+ * used with fast register work requests for this MR.
+ */
+struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len);
+
+/**
+ * ib_alloc_fast_reg_page_list - Allocates a page list array
+ * @device - ib device pointer.
+ * @page_list_len - size of the page list array to be allocated.
+ *
+ * This allocates and returns a struct ib_fast_reg_page_list * and a
+ * page_list array that is at least page_list_len in size. The actual
+ * size is returned in max_page_list_len. The caller is responsible
+ * for initializing the contents of the page_list array before posting
+ * a send work request with the IB_WC_FAST_REG_MR opcode.
+ *
+ * The page_list array entries must be translated using one of the
+ * ib_dma_*() functions just like the addresses passed to
+ * ib_map_phys_fmr(). Once the ib_post_send() is issued, the struct
+ * ib_fast_reg_page_list must not be modified by the caller until the
+ * IB_WC_FAST_REG_MR work request completes.
+ */
+struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list(
+ struct ib_device *device, int page_list_len);
+
+/**
+ * ib_free_fast_reg_page_list - Deallocates a previously allocated
+ * page list array.
+ * @page_list - struct ib_fast_reg_page_list pointer to be deallocated.
+ */
+void ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list);
+
+/**
+ * ib_update_fast_reg_key - updates the key portion of the fast_reg MR
+ * R_Key and L_Key.
+ * @mr - struct ib_mr pointer to be updated.
+ * @newkey - new key to be used.
+ */
+static inline void ib_update_fast_reg_key(struct ib_mr *mr, u8 newkey)
+{
+ mr->lkey = (mr->lkey & 0xffffff00) | newkey;
+ mr->rkey = (mr->rkey & 0xffffff00) | newkey;
+}
+
+/**
+ * ib_alloc_mw - Allocates a memory window.
+ * @pd: The protection domain associated with the memory window.
+ */
+struct ib_mw *ib_alloc_mw(struct ib_pd *pd);
+
+/**
+ * ib_bind_mw - Posts a work request to the send queue of the specified
+ * QP, which binds the memory window to the given address range and
+ * remote access attributes.
+ * @qp: QP to post the bind work request on.
+ * @mw: The memory window to bind.
+ * @mw_bind: Specifies information about the memory window, including
+ * its address range, remote access rights, and associated memory region.
+ */
+static inline int ib_bind_mw(struct ib_qp *qp,
+ struct ib_mw *mw,
+ struct ib_mw_bind *mw_bind)
+{
+ /* XXX reference counting in corresponding MR? */
+ return mw->device->bind_mw ?
+ mw->device->bind_mw(qp, mw, mw_bind) :
+ -ENOSYS;
+}
+
+/**
+ * ib_dealloc_mw - Deallocates a memory window.
+ * @mw: The memory window to deallocate.
+ */
+int ib_dealloc_mw(struct ib_mw *mw);
+
+/**
+ * ib_alloc_fmr - Allocates a unmapped fast memory region.
+ * @pd: The protection domain associated with the unmapped region.
+ * @mr_access_flags: Specifies the memory access rights.
+ * @fmr_attr: Attributes of the unmapped region.
+ *
+ * A fast memory region must be mapped before it can be used as part of
+ * a work request.
+ */
+struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd,
+ int mr_access_flags,
+ struct ib_fmr_attr *fmr_attr);
+
+/**
+ * ib_map_phys_fmr - Maps a list of physical pages to a fast memory region.
+ * @fmr: The fast memory region to associate with the pages.
+ * @page_list: An array of physical pages to map to the fast memory region.
+ * @list_len: The number of pages in page_list.
+ * @iova: The I/O virtual address to use with the mapped region.
+ */
+static inline int ib_map_phys_fmr(struct ib_fmr *fmr,
+ u64 *page_list, int list_len,
+ u64 iova)
+{
+ return fmr->device->map_phys_fmr(fmr, page_list, list_len, iova);
+}
+
+/**
+ * ib_unmap_fmr - Removes the mapping from a list of fast memory regions.
+ * @fmr_list: A linked list of fast memory regions to unmap.
+ */
+int ib_unmap_fmr(struct list_head *fmr_list);
+
+/**
+ * ib_dealloc_fmr - Deallocates a fast memory region.
+ * @fmr: The fast memory region to deallocate.
+ */
+int ib_dealloc_fmr(struct ib_fmr *fmr);
+
+/**
+ * ib_attach_mcast - Attaches the specified QP to a multicast group.
+ * @qp: QP to attach to the multicast group. The QP must be type
+ * IB_QPT_UD.
+ * @gid: Multicast group GID.
+ * @lid: Multicast group LID in host byte order.
+ *
+ * In order to send and receive multicast packets, subnet
+ * administration must have created the multicast group and configured
+ * the fabric appropriately. The port associated with the specified
+ * QP must also be a member of the multicast group.
+ */
+int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);
+
+/**
+ * ib_detach_mcast - Detaches the specified QP from a multicast group.
+ * @qp: QP to detach from the multicast group.
+ * @gid: Multicast group GID.
+ * @lid: Multicast group LID in host byte order.
+ */
+int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);
+
+
+/**
+ * ib_dealloc_xrcd - Deallocates an extended reliably connected domain.
+ * @xrcd: The xrc domain to deallocate.
+ */
+int ib_dealloc_xrcd(struct ib_xrcd *xrcd);
+
+/**
+ * ib_alloc_xrcd - Allocates an extended reliably connected domain.
+ * @device: The device on which to allocate the xrcd.
+ */
+struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device);
+
+#endif /* IB_VERBS_H */
diff --git a/sys/ofed/include/rdma/iw_cm.h b/sys/ofed/include/rdma/iw_cm.h
new file mode 100644
index 0000000..cbb822e
--- /dev/null
+++ b/sys/ofed/include/rdma/iw_cm.h
@@ -0,0 +1,258 @@
+/*
+ * Copyright (c) 2005 Network Appliance, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef IW_CM_H
+#define IW_CM_H
+
+#include <linux/in.h>
+#include <rdma/ib_cm.h>
+
+struct iw_cm_id;
+
+enum iw_cm_event_type {
+ IW_CM_EVENT_CONNECT_REQUEST = 1, /* connect request received */
+ IW_CM_EVENT_CONNECT_REPLY, /* reply from active connect request */
+ IW_CM_EVENT_ESTABLISHED, /* passive side accept successful */
+ IW_CM_EVENT_DISCONNECT, /* orderly shutdown */
+ IW_CM_EVENT_CLOSE /* close complete */
+};
+
+enum iw_cm_event_status {
+ IW_CM_EVENT_STATUS_OK = 0, /* request successful */
+ IW_CM_EVENT_STATUS_ACCEPTED = 0, /* connect request accepted */
+ IW_CM_EVENT_STATUS_REJECTED, /* connect request rejected */
+ IW_CM_EVENT_STATUS_TIMEOUT, /* the operation timed out */
+ IW_CM_EVENT_STATUS_RESET, /* reset from remote peer */
+ IW_CM_EVENT_STATUS_EINVAL, /* asynchronous failure for bad parm */
+};
+
+struct iw_cm_event {
+ enum iw_cm_event_type event;
+ enum iw_cm_event_status status;
+ struct sockaddr_in local_addr;
+ struct sockaddr_in remote_addr;
+ void *private_data;
+ u8 private_data_len;
+ void *provider_data;
+};
+
+/**
+ * iw_cm_handler - Function to be called by the IW CM when delivering events
+ * to the client.
+ *
+ * @cm_id: The IW CM identifier associated with the event.
+ * @event: Pointer to the event structure.
+ */
+typedef int (*iw_cm_handler)(struct iw_cm_id *cm_id,
+ struct iw_cm_event *event);
+
+/**
+ * iw_event_handler - Function called by the provider when delivering provider
+ * events to the IW CM. Returns either 0 indicating the event was processed
+ * or -errno if the event could not be processed.
+ *
+ * @cm_id: The IW CM identifier associated with the event.
+ * @event: Pointer to the event structure.
+ */
+typedef int (*iw_event_handler)(struct iw_cm_id *cm_id,
+ struct iw_cm_event *event);
+
+struct iw_cm_id {
+ iw_cm_handler cm_handler; /* client callback function */
+ void *context; /* client cb context */
+ struct ib_device *device;
+ struct sockaddr_in local_addr;
+ struct sockaddr_in remote_addr;
+ void *provider_data; /* provider private data */
+ iw_event_handler event_handler; /* cb for provider
+ events */
+ /* Used by provider to add and remove refs on IW cm_id */
+ void (*add_ref)(struct iw_cm_id *);
+ void (*rem_ref)(struct iw_cm_id *);
+};
+
+struct iw_cm_conn_param {
+ const void *private_data;
+ u16 private_data_len;
+ u32 ord;
+ u32 ird;
+ u32 qpn;
+};
+
+struct iw_cm_verbs {
+ void (*add_ref)(struct ib_qp *qp);
+
+ void (*rem_ref)(struct ib_qp *qp);
+
+ struct ib_qp * (*get_qp)(struct ib_device *device,
+ int qpn);
+
+ int (*connect)(struct iw_cm_id *cm_id,
+ struct iw_cm_conn_param *conn_param);
+
+ int (*accept)(struct iw_cm_id *cm_id,
+ struct iw_cm_conn_param *conn_param);
+
+ int (*reject)(struct iw_cm_id *cm_id,
+ const void *pdata, u8 pdata_len);
+
+ int (*create_listen)(struct iw_cm_id *cm_id,
+ int backlog);
+
+ int (*destroy_listen)(struct iw_cm_id *cm_id);
+};
+
+/**
+ * iw_create_cm_id - Create an IW CM identifier.
+ *
+ * @device: The IB device on which to create the IW CM identier.
+ * @event_handler: User callback invoked to report events associated with the
+ * returned IW CM identifier.
+ * @context: User specified context associated with the id.
+ */
+struct iw_cm_id *iw_create_cm_id(struct ib_device *device,
+ iw_cm_handler cm_handler, void *context);
+
+/**
+ * iw_destroy_cm_id - Destroy an IW CM identifier.
+ *
+ * @cm_id: The previously created IW CM identifier to destroy.
+ *
+ * The client can assume that no events will be delivered for the CM ID after
+ * this function returns.
+ */
+void iw_destroy_cm_id(struct iw_cm_id *cm_id);
+
+/**
+ * iw_cm_bind_qp - Unbind the specified IW CM identifier and QP
+ *
+ * @cm_id: The IW CM idenfier to unbind from the QP.
+ * @qp: The QP
+ *
+ * This is called by the provider when destroying the QP to ensure
+ * that any references held by the IWCM are released. It may also
+ * be called by the IWCM when destroying a CM_ID to that any
+ * references held by the provider are released.
+ */
+void iw_cm_unbind_qp(struct iw_cm_id *cm_id, struct ib_qp *qp);
+
+/**
+ * iw_cm_get_qp - Return the ib_qp associated with a QPN
+ *
+ * @ib_device: The IB device
+ * @qpn: The queue pair number
+ */
+struct ib_qp *iw_cm_get_qp(struct ib_device *device, int qpn);
+
+/**
+ * iw_cm_listen - Listen for incoming connection requests on the
+ * specified IW CM id.
+ *
+ * @cm_id: The IW CM identifier.
+ * @backlog: The maximum number of outstanding un-accepted inbound listen
+ * requests to queue.
+ *
+ * The source address and port number are specified in the IW CM identifier
+ * structure.
+ */
+int iw_cm_listen(struct iw_cm_id *cm_id, int backlog);
+
+/**
+ * iw_cm_accept - Called to accept an incoming connect request.
+ *
+ * @cm_id: The IW CM identifier associated with the connection request.
+ * @iw_param: Pointer to a structure containing connection establishment
+ * parameters.
+ *
+ * The specified cm_id will have been provided in the event data for a
+ * CONNECT_REQUEST event. Subsequent events related to this connection will be
+ * delivered to the specified IW CM identifier prior and may occur prior to
+ * the return of this function. If this function returns a non-zero value, the
+ * client can assume that no events will be delivered to the specified IW CM
+ * identifier.
+ */
+int iw_cm_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param);
+
+/**
+ * iw_cm_reject - Reject an incoming connection request.
+ *
+ * @cm_id: Connection identifier associated with the request.
+ * @private_daa: Pointer to data to deliver to the remote peer as part of the
+ * reject message.
+ * @private_data_len: The number of bytes in the private_data parameter.
+ *
+ * The client can assume that no events will be delivered to the specified IW
+ * CM identifier following the return of this function. The private_data
+ * buffer is available for reuse when this function returns.
+ */
+int iw_cm_reject(struct iw_cm_id *cm_id, const void *private_data,
+ u8 private_data_len);
+
+/**
+ * iw_cm_connect - Called to request a connection to a remote peer.
+ *
+ * @cm_id: The IW CM identifier for the connection.
+ * @iw_param: Pointer to a structure containing connection establishment
+ * parameters.
+ *
+ * Events may be delivered to the specified IW CM identifier prior to the
+ * return of this function. If this function returns a non-zero value, the
+ * client can assume that no events will be delivered to the specified IW CM
+ * identifier.
+ */
+int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param);
+
+/**
+ * iw_cm_disconnect - Close the specified connection.
+ *
+ * @cm_id: The IW CM identifier to close.
+ * @abrupt: If 0, the connection will be closed gracefully, otherwise, the
+ * connection will be reset.
+ *
+ * The IW CM identifier is still active until the IW_CM_EVENT_CLOSE event is
+ * delivered.
+ */
+int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt);
+
+/**
+ * iw_cm_init_qp_attr - Called to initialize the attributes of the QP
+ * associated with a IW CM identifier.
+ *
+ * @cm_id: The IW CM identifier associated with the QP
+ * @qp_attr: Pointer to the QP attributes structure.
+ * @qp_attr_mask: Pointer to a bit vector specifying which QP attributes are
+ * valid.
+ */
+int iw_cm_init_qp_attr(struct iw_cm_id *cm_id, struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask);
+
+#endif /* IW_CM_H */
diff --git a/sys/ofed/include/rdma/rdma_cm.h b/sys/ofed/include/rdma/rdma_cm.h
new file mode 100644
index 0000000..c6b2962
--- /dev/null
+++ b/sys/ofed/include/rdma/rdma_cm.h
@@ -0,0 +1,333 @@
+/*
+ * Copyright (c) 2005 Voltaire Inc. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(RDMA_CM_H)
+#define RDMA_CM_H
+
+#include <linux/socket.h>
+#include <linux/in6.h>
+#include <rdma/ib_addr.h>
+#include <rdma/ib_sa.h>
+
+/*
+ * Upon receiving a device removal event, users must destroy the associated
+ * RDMA identifier and release all resources allocated with the device.
+ */
+enum rdma_cm_event_type {
+ RDMA_CM_EVENT_ADDR_RESOLVED,
+ RDMA_CM_EVENT_ADDR_ERROR,
+ RDMA_CM_EVENT_ROUTE_RESOLVED,
+ RDMA_CM_EVENT_ROUTE_ERROR,
+ RDMA_CM_EVENT_CONNECT_REQUEST,
+ RDMA_CM_EVENT_CONNECT_RESPONSE,
+ RDMA_CM_EVENT_CONNECT_ERROR,
+ RDMA_CM_EVENT_UNREACHABLE,
+ RDMA_CM_EVENT_REJECTED,
+ RDMA_CM_EVENT_ESTABLISHED,
+ RDMA_CM_EVENT_DISCONNECTED,
+ RDMA_CM_EVENT_DEVICE_REMOVAL,
+ RDMA_CM_EVENT_MULTICAST_JOIN,
+ RDMA_CM_EVENT_MULTICAST_ERROR,
+ RDMA_CM_EVENT_ADDR_CHANGE,
+ RDMA_CM_EVENT_TIMEWAIT_EXIT
+};
+
+enum rdma_port_space {
+ RDMA_PS_SDP = 0x0001,
+ RDMA_PS_IPOIB = 0x0002,
+ RDMA_PS_TCP = 0x0106,
+ RDMA_PS_UDP = 0x0111,
+ RDMA_PS_SCTP = 0x0183
+};
+
+struct rdma_addr {
+ struct sockaddr_storage src_addr;
+ struct sockaddr_storage dst_addr;
+ struct rdma_dev_addr dev_addr;
+};
+
+struct rdma_route {
+ struct rdma_addr addr;
+ struct ib_sa_path_rec *path_rec;
+ int num_paths;
+};
+
+struct rdma_conn_param {
+ const void *private_data;
+ u8 private_data_len;
+ u8 responder_resources;
+ u8 initiator_depth;
+ u8 flow_control;
+ u8 retry_count; /* ignored when accepting */
+ u8 rnr_retry_count;
+ /* Fields below ignored if a QP is created on the rdma_cm_id. */
+ u8 srq;
+ u32 qp_num;
+};
+
+struct rdma_ud_param {
+ const void *private_data;
+ u8 private_data_len;
+ struct ib_ah_attr ah_attr;
+ u32 qp_num;
+ u32 qkey;
+};
+
+struct rdma_cm_event {
+ enum rdma_cm_event_type event;
+ int status;
+ union {
+ struct rdma_conn_param conn;
+ struct rdma_ud_param ud;
+ } param;
+};
+
+struct rdma_cm_id;
+
+/**
+ * rdma_cm_event_handler - Callback used to report user events.
+ *
+ * Notes: Users may not call rdma_destroy_id from this callback to destroy
+ * the passed in id, or a corresponding listen id. Returning a
+ * non-zero value from the callback will destroy the passed in id.
+ */
+typedef int (*rdma_cm_event_handler)(struct rdma_cm_id *id,
+ struct rdma_cm_event *event);
+
+struct rdma_cm_id {
+ struct ib_device *device;
+ void *context;
+ struct ib_qp *qp;
+ rdma_cm_event_handler event_handler;
+ struct rdma_route route;
+ enum rdma_port_space ps;
+ u8 port_num;
+};
+
+/**
+ * rdma_create_id - Create an RDMA identifier.
+ *
+ * @event_handler: User callback invoked to report events associated with the
+ * returned rdma_id.
+ * @context: User specified context associated with the id.
+ * @ps: RDMA port space.
+ */
+struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler,
+ void *context, enum rdma_port_space ps);
+
+/**
+ * rdma_destroy_id - Destroys an RDMA identifier.
+ *
+ * @id: RDMA identifier.
+ *
+ * Note: calling this function has the effect of canceling in-flight
+ * asynchronous operations associated with the id.
+ */
+void rdma_destroy_id(struct rdma_cm_id *id);
+
+/**
+ * rdma_bind_addr - Bind an RDMA identifier to a source address and
+ * associated RDMA device, if needed.
+ *
+ * @id: RDMA identifier.
+ * @addr: Local address information. Wildcard values are permitted.
+ *
+ * This associates a source address with the RDMA identifier before calling
+ * rdma_listen. If a specific local address is given, the RDMA identifier will
+ * be bound to a local RDMA device.
+ */
+int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr);
+
+/**
+ * rdma_resolve_addr - Resolve destination and optional source addresses
+ * from IP addresses to an RDMA address. If successful, the specified
+ * rdma_cm_id will be bound to a local device.
+ *
+ * @id: RDMA identifier.
+ * @src_addr: Source address information. This parameter may be NULL.
+ * @dst_addr: Destination address information.
+ * @timeout_ms: Time to wait for resolution to complete.
+ */
+int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
+ struct sockaddr *dst_addr, int timeout_ms);
+
+/**
+ * rdma_resolve_route - Resolve the RDMA address bound to the RDMA identifier
+ * into route information needed to establish a connection.
+ *
+ * This is called on the client side of a connection.
+ * Users must have first called rdma_resolve_addr to resolve a dst_addr
+ * into an RDMA address before calling this routine.
+ */
+int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms);
+
+/**
+ * rdma_create_qp - Allocate a QP and associate it with the specified RDMA
+ * identifier.
+ *
+ * QPs allocated to an rdma_cm_id will automatically be transitioned by the CMA
+ * through their states.
+ */
+int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd,
+ struct ib_qp_init_attr *qp_init_attr);
+
+/**
+ * rdma_destroy_qp - Deallocate the QP associated with the specified RDMA
+ * identifier.
+ *
+ * Users must destroy any QP associated with an RDMA identifier before
+ * destroying the RDMA ID.
+ */
+void rdma_destroy_qp(struct rdma_cm_id *id);
+
+/**
+ * rdma_init_qp_attr - Initializes the QP attributes for use in transitioning
+ * to a specified QP state.
+ * @id: Communication identifier associated with the QP attributes to
+ * initialize.
+ * @qp_attr: On input, specifies the desired QP state. On output, the
+ * mandatory and desired optional attributes will be set in order to
+ * modify the QP to the specified state.
+ * @qp_attr_mask: The QP attribute mask that may be used to transition the
+ * QP to the specified state.
+ *
+ * Users must set the @qp_attr->qp_state to the desired QP state. This call
+ * will set all required attributes for the given transition, along with
+ * known optional attributes. Users may override the attributes returned from
+ * this call before calling ib_modify_qp.
+ *
+ * Users that wish to have their QP automatically transitioned through its
+ * states can associate a QP with the rdma_cm_id by calling rdma_create_qp().
+ */
+int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask);
+
+/**
+ * rdma_connect - Initiate an active connection request.
+ * @id: Connection identifier to connect.
+ * @conn_param: Connection information used for connected QPs.
+ *
+ * Users must have resolved a route for the rdma_cm_id to connect with
+ * by having called rdma_resolve_route before calling this routine.
+ *
+ * This call will either connect to a remote QP or obtain remote QP
+ * information for unconnected rdma_cm_id's. The actual operation is
+ * based on the rdma_cm_id's port space.
+ */
+int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param);
+
+/**
+ * rdma_listen - This function is called by the passive side to
+ * listen for incoming connection requests.
+ *
+ * Users must have bound the rdma_cm_id to a local address by calling
+ * rdma_bind_addr before calling this routine.
+ */
+int rdma_listen(struct rdma_cm_id *id, int backlog);
+
+/**
+ * rdma_accept - Called to accept a connection request or response.
+ * @id: Connection identifier associated with the request.
+ * @conn_param: Information needed to establish the connection. This must be
+ * provided if accepting a connection request. If accepting a connection
+ * response, this parameter must be NULL.
+ *
+ * Typically, this routine is only called by the listener to accept a connection
+ * request. It must also be called on the active side of a connection if the
+ * user is performing their own QP transitions.
+ *
+ * In the case of error, a reject message is sent to the remote side and the
+ * state of the qp associated with the id is modified to error, such that any
+ * previously posted receive buffers would be flushed.
+ */
+int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param);
+
+/**
+ * rdma_notify - Notifies the RDMA CM of an asynchronous event that has
+ * occurred on the connection.
+ * @id: Connection identifier to transition to established.
+ * @event: Asynchronous event.
+ *
+ * This routine should be invoked by users to notify the CM of relevant
+ * communication events. Events that should be reported to the CM and
+ * when to report them are:
+ *
+ * IB_EVENT_COMM_EST - Used when a message is received on a connected
+ * QP before an RTU has been received.
+ */
+int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event);
+
+/**
+ * rdma_reject - Called to reject a connection request or response.
+ */
+int rdma_reject(struct rdma_cm_id *id, const void *private_data,
+ u8 private_data_len);
+
+/**
+ * rdma_disconnect - This function disconnects the associated QP and
+ * transitions it into the error state.
+ */
+int rdma_disconnect(struct rdma_cm_id *id);
+
+/**
+ * rdma_join_multicast - Join the multicast group specified by the given
+ * address.
+ * @id: Communication identifier associated with the request.
+ * @addr: Multicast address identifying the group to join.
+ * @context: User-defined context associated with the join request, returned
+ * to the user through the private_data pointer in multicast events.
+ */
+int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
+ void *context);
+
+/**
+ * rdma_leave_multicast - Leave the multicast group specified by the given
+ * address.
+ */
+void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr);
+
+/**
+ * rdma_set_service_type - Set the type of service associated with a
+ * connection identifier.
+ * @id: Communication identifier to associated with service type.
+ * @tos: Type of service.
+ *
+ * The type of service is interpretted as a differentiated service
+ * field (RFC 2474). The service type should be specified before
+ * performing route resolution, as existing communication on the
+ * connection identifier may be unaffected. The type of service
+ * requested may not be supported by the network to all destinations.
+ */
+void rdma_set_service_type(struct rdma_cm_id *id, int tos);
+
+#endif /* RDMA_CM_H */
diff --git a/sys/ofed/include/rdma/rdma_cm_ib.h b/sys/ofed/include/rdma/rdma_cm_ib.h
new file mode 100644
index 0000000..2389c3b
--- /dev/null
+++ b/sys/ofed/include/rdma/rdma_cm_ib.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2006 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(RDMA_CM_IB_H)
+#define RDMA_CM_IB_H
+
+#include <rdma/rdma_cm.h>
+
+/**
+ * rdma_set_ib_paths - Manually sets the path records used to establish a
+ * connection.
+ * @id: Connection identifier associated with the request.
+ * @path_rec: Reference to the path record
+ *
+ * This call permits a user to specify routing information for rdma_cm_id's
+ * bound to Infiniband devices. It is called on the client side of a
+ * connection and replaces the call to rdma_resolve_route.
+ */
+int rdma_set_ib_paths(struct rdma_cm_id *id,
+ struct ib_sa_path_rec *path_rec, int num_paths);
+
+/* Global qkey for UDP QPs and multicast groups. */
+#define RDMA_UDP_QKEY 0x01234567
+
+#endif /* RDMA_CM_IB_H */
diff --git a/sys/ofed/include/rdma/rdma_user_cm.h b/sys/ofed/include/rdma/rdma_user_cm.h
new file mode 100644
index 0000000..1d16502
--- /dev/null
+++ b/sys/ofed/include/rdma/rdma_user_cm.h
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2005-2006 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RDMA_USER_CM_H
+#define RDMA_USER_CM_H
+
+#include <linux/types.h>
+#include <linux/in6.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_user_sa.h>
+
+#define RDMA_USER_CM_ABI_VERSION 4
+
+#define RDMA_MAX_PRIVATE_DATA 256
+
+enum {
+ RDMA_USER_CM_CMD_CREATE_ID,
+ RDMA_USER_CM_CMD_DESTROY_ID,
+ RDMA_USER_CM_CMD_BIND_ADDR,
+ RDMA_USER_CM_CMD_RESOLVE_ADDR,
+ RDMA_USER_CM_CMD_RESOLVE_ROUTE,
+ RDMA_USER_CM_CMD_QUERY_ROUTE,
+ RDMA_USER_CM_CMD_CONNECT,
+ RDMA_USER_CM_CMD_LISTEN,
+ RDMA_USER_CM_CMD_ACCEPT,
+ RDMA_USER_CM_CMD_REJECT,
+ RDMA_USER_CM_CMD_DISCONNECT,
+ RDMA_USER_CM_CMD_INIT_QP_ATTR,
+ RDMA_USER_CM_CMD_GET_EVENT,
+ RDMA_USER_CM_CMD_GET_OPTION,
+ RDMA_USER_CM_CMD_SET_OPTION,
+ RDMA_USER_CM_CMD_NOTIFY,
+ RDMA_USER_CM_CMD_JOIN_MCAST,
+ RDMA_USER_CM_CMD_LEAVE_MCAST,
+ RDMA_USER_CM_CMD_MIGRATE_ID
+};
+
+/*
+ * command ABI structures.
+ */
+struct rdma_ucm_cmd_hdr {
+ __u32 cmd;
+ __u16 in;
+ __u16 out;
+};
+
+struct rdma_ucm_create_id {
+ __u64 uid;
+ __u64 response;
+ __u16 ps;
+ __u8 reserved[6];
+};
+
+struct rdma_ucm_create_id_resp {
+ __u32 id;
+};
+
+struct rdma_ucm_destroy_id {
+ __u64 response;
+ __u32 id;
+ __u32 reserved;
+};
+
+struct rdma_ucm_destroy_id_resp {
+ __u32 events_reported;
+};
+
+struct rdma_ucm_bind_addr {
+ __u64 response;
+ struct sockaddr_in6 addr;
+ __u32 id;
+};
+
+struct rdma_ucm_resolve_addr {
+ struct sockaddr_in6 src_addr;
+ struct sockaddr_in6 dst_addr;
+ __u32 id;
+ __u32 timeout_ms;
+};
+
+struct rdma_ucm_resolve_route {
+ __u32 id;
+ __u32 timeout_ms;
+};
+
+struct rdma_ucm_query_route {
+ __u64 response;
+ __u32 id;
+ __u32 reserved;
+};
+
+struct rdma_ucm_query_route_resp {
+ __u64 node_guid;
+ struct ib_user_path_rec ib_route[2];
+ struct sockaddr_in6 src_addr;
+ struct sockaddr_in6 dst_addr;
+ __u32 num_paths;
+ __u8 port_num;
+ __u8 reserved[3];
+};
+
+struct rdma_ucm_conn_param {
+ __u32 qp_num;
+ __u32 reserved;
+ __u8 private_data[RDMA_MAX_PRIVATE_DATA];
+ __u8 private_data_len;
+ __u8 srq;
+ __u8 responder_resources;
+ __u8 initiator_depth;
+ __u8 flow_control;
+ __u8 retry_count;
+ __u8 rnr_retry_count;
+ __u8 valid;
+};
+
+struct rdma_ucm_ud_param {
+ __u32 qp_num;
+ __u32 qkey;
+ struct ib_uverbs_ah_attr ah_attr;
+ __u8 private_data[RDMA_MAX_PRIVATE_DATA];
+ __u8 private_data_len;
+ __u8 reserved[7];
+};
+
+struct rdma_ucm_connect {
+ struct rdma_ucm_conn_param conn_param;
+ __u32 id;
+ __u32 reserved;
+};
+
+struct rdma_ucm_listen {
+ __u32 id;
+ __u32 backlog;
+};
+
+struct rdma_ucm_accept {
+ __u64 uid;
+ struct rdma_ucm_conn_param conn_param;
+ __u32 id;
+ __u32 reserved;
+};
+
+struct rdma_ucm_reject {
+ __u32 id;
+ __u8 private_data_len;
+ __u8 reserved[3];
+ __u8 private_data[RDMA_MAX_PRIVATE_DATA];
+};
+
+struct rdma_ucm_disconnect {
+ __u32 id;
+};
+
+struct rdma_ucm_init_qp_attr {
+ __u64 response;
+ __u32 id;
+ __u32 qp_state;
+};
+
+struct rdma_ucm_notify {
+ __u32 id;
+ __u32 event;
+};
+
+struct rdma_ucm_join_mcast {
+ __u64 response; /* rdma_ucm_create_id_resp */
+ __u64 uid;
+ struct sockaddr_in6 addr;
+ __u32 id;
+};
+
+struct rdma_ucm_get_event {
+ __u64 response;
+};
+
+struct rdma_ucm_event_resp {
+ __u64 uid;
+ __u32 id;
+ __u32 event;
+ __u32 status;
+ union {
+ struct rdma_ucm_conn_param conn;
+ struct rdma_ucm_ud_param ud;
+ } param;
+};
+
+/* Option levels */
+enum {
+ RDMA_OPTION_ID = 0,
+ RDMA_OPTION_IB = 1
+};
+
+/* Option details */
+enum {
+ RDMA_OPTION_ID_TOS = 0,
+ RDMA_OPTION_IB_PATH = 1
+};
+
+struct rdma_ucm_set_option {
+ __u64 optval;
+ __u32 id;
+ __u32 level;
+ __u32 optname;
+ __u32 optlen;
+};
+
+struct rdma_ucm_migrate_id {
+ __u64 response;
+ __u32 id;
+ __u32 fd;
+};
+
+struct rdma_ucm_migrate_resp {
+ __u32 events_reported;
+};
+
+#endif /* RDMA_USER_CM_H */
diff --git a/sys/ofed/include/rdma/sdp_socket.h b/sys/ofed/include/rdma/sdp_socket.h
new file mode 100644
index 0000000..902dc97
--- /dev/null
+++ b/sys/ofed/include/rdma/sdp_socket.h
@@ -0,0 +1,21 @@
+/* Stuff that should go into include/linux/socket.h */
+
+#ifndef SDP_SOCKET_H
+#define SDP_SOCKET_H
+
+#ifndef AF_INET_SDP
+#define AF_INET_SDP 27
+#define PF_INET_SDP AF_INET_SDP
+#endif
+
+#ifndef SDP_ZCOPY_THRESH
+#define SDP_ZCOPY_THRESH 80
+#endif
+
+#ifndef SDP_LAST_BIND_ERR
+#define SDP_LAST_BIND_ERR 81
+#endif
+
+/* TODO: AF_INET6_SDP ? */
+
+#endif
OpenPOWER on IntegriCloud